! Matcher option. NOTE(review): presumably this turns off the requirement
! that every match be delimited by separator characters, leaving token
! boundaries to the explicit LC()/RC()/NLC()/NRC() contexts used below --
! confirm against the hfst-pmatch documentation.
set need-separators off

! This is a "preserving" tokenizer, intended to preserve all input, even
! garbage. It expects omorfi.hfst, omorfi_additions.txt and proper_names.txt
! to be present. Of the latter two, omorfi_additions.txt is intended to be a
! list of wordforms missing from OmorFi, and proper_names.txt a list of
! missing proper names that probably don't belong in a dictionary.

! We use directed unicode quotation marks in both directions because people
! don't use them correctly in the wild
! Quote-like marks accepted on either side of a token.
define punct_trailing_or_preceding
  "\"" | "'" | "’" | "”" | "“" | "„" | "″";

! Marks accepted directly after a token: the shared quote marks plus
! closing brackets, comma, colon, semicolon, en dash, "»" and ">".
define punct_trailing
  punct_trailing_or_preceding
  | "," | ")" | "]" | "}" | "–" | ":" | ";" | "»" | ">" ;

! Marks accepted directly before a token: the shared quote marks plus
! opening brackets, both dashes, "»" and ">". Note that "»" and ">" are
! members of both the trailing and the preceding set.
define punct_preceding
  punct_trailing_or_preceding
  | "<" | "(" | "[" | "{" | "-" | "–" | "»" | ">" ;

! Characters that can end a sentence.
define punct_sentbound [ "." | "?" | "!" ];

! Horizontal strokes. Each of the first three classes pairs the ASCII "-"
! with one Unicode look-alike, so that either spelling is accepted wherever
! the class is used.
define dash    [ "-" | "–" ];
define hyphen  [ "-" | "‐" ];
define minus   [ "-" | "−" ];
define em_dash [ "—" ];

! Dotted date: one or two digits, ".", one or two digits, ".", and an
! optional final group of one to four digits.
define date Num (Num) "." Num (Num) "." (Num^{1,4});
! Clock time: exactly two digits, ":", exactly two digits.
! NOTE(review): a single-digit first field (e.g. "9:30") is not matched
! here -- confirm that this is intended.
define time Num Num ":" Num Num;
! A date and a time separated by exactly one space.
define date_with_time date " " time;
! Material allowed in front of an unknown token: a hyphen that stands at the
! very start of the input or right after a newline, or one of "@", ".", "#".
! NOTE(review): the role of the inner define(...) wrapper is not visible from
! this file -- see the pmatch documentation before changing it.
define token_preceding_parts define(LC(.#. | "\n") hyphen) | "@" | "." | "#";
! Plain number: optional minus sign, then either an unbroken digit run or
! one to three digits followed by any number of three-digit groups (each
! optionally introduced by ",", "." or a space), then an optional fractional
! part introduced by "." or ",".
define numerical_expr (minus) [[Num+] | [Num^{1,3} [(","|"."|" ") Num^3]*]] (["."|","] Num+);
! Like a plain digit run with optional fraction, plus an optional exponent
! written as lowercase "e" followed by digits (no sign on the exponent).
define scientific_num (minus) [Num+] (["." | ","] Num+) ("e" Num+);
! Ordinal: digits followed by ".", unless what follows is a boundary and then
! an uppercase letter (in which case the "." is left for other rules).
! `boundary` is defined further down in this file.
define numerical_ord Num+ "." NRC(boundary UppercaseAlpha);
! Two numbers joined by a dash, with an optional single space on each side.
define numerical_range numerical_expr [(" ") dash (" ")] numerical_expr;
! Date range: a full date or an abbreviated "NN." start, a dash with optional
! single spaces around it, then a full date.
! NOTE(review): the abbreviated start requires exactly two digits, whereas
! `date` itself accepts one or two -- confirm that this is intended.
define date_range [ date | [ Num Num "." ] ] (" ") dash (" ") date;
! A number with an optional leading "+" or minus and an optional trailing "%".
define percentage_maybe_change (["+" | minus]) numerical_expr ("%");
! Straight or curly double quote used as an inch mark after a number.
define inch_sign "\"" | {”};
! Union of all numeric token shapes defined above.
define numerical [numerical_expr (inch_sign)] | numerical_range | percentage_maybe_change | numerical_ord | scientific_num;
! URL. Parts, in order:
!   - optional scheme prefix: "://" optionally preceded by "http" and/or "s";
!   - optional literal "www.";
!   - one or more alphanumeric labels, each optionally followed by ".";
!   - one to three dot-suffixes of two or three letters (".fi", ".com",
!     ".co.uk", ...);
!   - optional ":" port digits;
!   - optional "/" followed by any run of non-whitespace.
! The suffix group used to be required at least twice, which meant ordinary
! addresses with a single suffix such as "www.google.com" were never matched
! as URLs. Allowing one to three repetitions accepts everything the old
! expression accepted and additionally the single-suffix case.
define url (({http}) ("s") {://}) ({www.}) [[Alpha | Num]+ (".")]+ ["." [Alpha]^{2,3}]^{1,3} (":" Num+) ("/" [\Whitespace]+);
! E-mail address: a run of non-whitespace, "@", another run of
! non-whitespace, ".", and one or more letters.
define email [\Whitespace]+ "@" [\Whitespace]+ "." Alpha+;
! XML-style tag: "<", optional "/", one or more characters that are neither
! ">" nor a newline, then ">". A tag therefore never spans lines.
define xml_tag "<" ("/") [\[">"|"\n"]]+ ">";
! A run of typographic filler used as a paragraph separator: one or more
! "*", one or more "-", or one or more "=". The "+" is written outside the
! quotes in every alternative so that it acts as the repetition operator;
! inside the quotes it would be a literal plus sign and only the exact
! two-character string "=+" would match.
define typographic_paragraph_separator "*"+ | "-"+ | "="+;
! Any single symbol except whitespace. Used by TOP as the most expensive
! fallback alternative.
define any_nonwhitespace_utf8_char ? - Whitespace;

! Vowel alternants used in the clitic patterns below.
define ae [ "a" | "ä" ];
define oe [ "o" | "ö" ];

! One or more clitic particles, optionally followed by "s".
! Only the ones occurring with open classes are listed.
define clitics
  [ [ "k" oe ]                    ! ko / kö
  | [ "h" ae "n" ]                ! han / hän
  | [ "k" [ {aa} | {ää} ] "n" ]   ! kaan / kään
  | {kin}
  | [ "p" ae ]                    ! pa / pä
  ]+ ("s");

! Analysis tags that must not appear in an accepted omorfi analysis.
define disallowed_omorfi_analyses "[SUBCAT=DECIMAL]" | "[UPOS=NUM]";

! The dictionary: the binary transducer omorfi.hfst composed with a filter
! that accepts only non-empty symbol sequences containing neither of the
! disallowed tags, unioned with the entries read from omorfi_additions.txt.
! NOTE(review): the filter assumes each tag is a single symbol on the side of
! omorfi.hfst that the composition sees -- confirm against the transducer.
define omorfi [@bin"omorfi.hfst" .o. [[\disallowed_omorfi_analyses]+] ] | @txt"omorfi_additions.txt" ;
! Proper names read from a plain text list.
define proper_name @txt"proper_names.txt";
! Suffix analyses: one or more lowercase-letter runs, each optionally
! preceded by "-", are passed through `omorfi`, and only results that begin
! with "[POSITION=SUFFIX]" followed by at least one more symbol are kept.
define omorfi_suffixes [[("-") LowercaseAlpha+]+ .o. omorfi] .o. ["[POSITION=SUFFIX]" ?+];
! Punctuation tolerated inside an unknown token: straight or curly
! apostrophe, "-", or a run of one or more "*" (the "+" applies to "*" only).
define internal_punct "'" | "’" | "-" | "*"+;
! One character of an unknown word: anything that is not whitespace, not in
! Punct, not an em dash and not in punct_trailing.
define unknown_word_char [\[Whitespace | Punct | em_dash | punct_trailing]];
! An unknown token: one or more chunks, each being a run of word characters
! optionally followed by internal punctuation and another run of word
! characters.
define unknown_token [unknown_word_char+ (internal_punct unknown_word_char+)]+ ;
! A token boundary: a run of whitespace, or the edge of the input (.#.).
define boundary Whitespace+ | .#. ;

! --- Dictionary words next to punctuation --------------------------------
! Each rule inserts `omorfi` and differs only in the required left and right
! context.

! boundary | WORD | trailing or sentence-final punctuation
define omorfi_with_trailing_punctuation
  LC(boundary)
  Ins(omorfi)
  RC([punct_trailing | punct_sentbound]);

! preceding punctuation | WORD | boundary
define omorfi_with_preceding_punctuation
  LC(punct_preceding)
  Ins(omorfi)
  RC(boundary);

! preceding punctuation | WORD | trailing or sentence-final punctuation
define omorfi_between_punctuation
  LC(punct_preceding)
  Ins(omorfi)
  RC([punct_trailing | punct_sentbound]);

! --- Punctuation marks as tokens of their own ----------------------------

! A trailing mark: not preceded by a boundary, followed by one.
define trailing_punct
  NLC(boundary) punct_trailing RC(boundary);

! A preceding mark: preceded by a boundary, not followed by one.
define preceding_punct
  LC(boundary) punct_preceding NRC(boundary);

! One or more sentence-final marks: not preceded by a boundary, followed by
! one.
define sentbound
  NLC(boundary) punct_sentbound+ RC(boundary);

! A mark with no boundary on either side.
define punct_between
  NLC(boundary) [punct_trailing | punct_preceding] NRC(boundary);

! A colon followed by one or more lowercase letters, optionally preceded by
! a letter (either case) and any number of lowercase letters.
define improper_colon_separated_conjugation
  ( [UppercaseAlpha | LowercaseAlpha] LowercaseAlpha* )
  ":" LowercaseAlpha+;

! Material that may be glued to the end of a token: a hyphen, a
! colon-separated ending as defined above, or clitics in parentheses.
define token_trailing_parts
  hyphen
  | improper_colon_separated_conjugation
  | [ "(" clitics ")" ];


! TOP is the union of every token shape the tokenizer recognises.
! Alternatives without an explicit weight compete on equal terms; the ::N
! suffixes add a weight, and a larger weight makes an alternative less
! preferred, so the weighted alternatives at the bottom act as successive
! fallbacks.
define TOP
  ! First the case with a plain omorfi token, possibly with trailing dash
  ! (or any other token_trailing_parts), with a boundary on both sides.
  ! NOTE(review): the role of the define(...) wrapper is not visible from
  ! this file -- see the pmatch documentation before changing it.
  [ define( LC(boundary) Ins(omorfi) (token_trailing_parts) RC(boundary) ) ] |
  ! word.   (dictionary word followed by punctuation)
  [ omorfi_with_trailing_punctuation ] |
  ! -word   (dictionary word preceded by punctuation)
  [ omorfi_with_preceding_punctuation ] |
  ! "word"  (dictionary word with punctuation on both sides)
  [ omorfi_between_punctuation ] |
  ! A listed proper name, optionally followed by a suffix analysis and/or a
  ! trailing part. No left or right context is required here.
  [ proper_name (Ins(omorfi_suffixes)) (token_trailing_parts) ] |
  ! The trailing punctuation
  [ trailing_punct ] |
  ! The preceding punctuation
  [ preceding_punct ] |
  [ xml_tag ] |
  ! This will insert a newline and should be less preferred than dotted
  ! punctuations coming from omorfi
  [ sentbound EndTag("Boundary=Sentence")]::100 |
  ! Punctuation with no boundary on either side
  [ punct_between ] |
  ! Dates, times and numbers, optionally with a trailing part
  [[date | date_range | date_with_time | time | numerical] (token_trailing_parts)] |
  ! URLs (optionally wrapped in angle brackets) and e-mail addresses
  [("<") url (">") ] | email |
  ! Unknown token: delimited by a boundary or Punct on both sides, with any
  ! number of preceding parts, an optional suffix analysis and any number of
  ! trailing parts.
  [ define( LC([boundary | Punct]) token_preceding_parts* unknown_token (Ins(omorfi_suffixes)) token_trailing_parts* RC([boundary | Punct]) ) ]::500 |
  [ token_trailing_parts ]::600 | ! lone trailing part
  ! A single punctuation symbol on its own
  [ Punct ]::700 |
  [ typographic_paragraph_separator ]::800 |
  ! Last resort: a letter/digit run, or any single non-whitespace character,
  ! so that no input is dropped.
  [ [Alpha | Num]+ | any_nonwhitespace_utf8_char ]::900
;
