o
    iz                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 e Ze ZG dd deZG d	d
 d
eZdd
gZdS )    N)	text_type)Perluniprops)NonbreakingPrefixes)is_cjk)VIRAMASNUKTASc                
       s  e Zd ZdZededZededde	 de
 ZededZededZededde	 de
 ZededZd	Zd
ZdZdZdZdedfZdjeddfZdZdZdZdedfZdedfZdedfZdZdZdZ dZ!dZ"d Z#d!Z$d"Z%d#Z&d$Z'd%jed&d'fZ(d(jed&d'fZ)d)jed&d'fZ*d*eedfZ+d+jedd,fZ,d-Z-d.Z.d/Z/d0Z0d1Z1d2Z2d3Z3d4Z4d5Z5d6Z6d7Z7d8Z8d9Z9d:Z:d;Z;d<Z<d=Z=d>Z>d?Z?d@Z@dAZAdBZBdCZCdDZDdEZEdFZFdGZGdHZHdIZIdJZJdKZKdLZLdMZMdNZNdOZOdPZPdQZQdRZRdSZSdTZTdUZUdVZVdWZWdXZXdYZYdZjed[d\fZZd]jeed^d\fZ[d_jed[d\fZ\d`jed[dafZ]dbjedcdafZ^eZe[e\e]e^gZ_dZjed[d\fZ`ddjed[d\fZad_jed[d\fZbd`jed[defZce`eaebecgZddfZedgZfdhZgdiZhdjZidkZjdlZkg eeeee e!e"e#e$e%e&e(e)e*e+e,e-e.e/e0e1e2e3e4e5e6e7e8e9e:e;e<e=e>e?e@eAeBeCeDeEeFeGeHeIeJeKeLeMeNZle'eOePeQeReSeTeUeVeWg
ZmeReSeTeUeVeWeXeYgZnegeheiejekgZog dmZpd fdpdq	Zqdrds Zrdtdu Zsdvdw Ztdxdy Zudzd{ Zvd|d} Zwd~d ZxdddZy				odddZz  Z{S )MosesTokenizerz
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
     IsNIsAlnumIsScIsSoIsAlphaIsLower)z\s+ )z[\000-\037]r	   )z +r   )z^ r	   )z $r	   ([^{}\s\.'\`\,\-]) \1  ([{alphanum}])\-(?=[{alphanum}])alphanum\1 @-@ )	\.([\.]+)z
 DOTMULTI)DOTMULTI\.([^\.])zDOTDOTMULTI )
DOTMULTI\.DOTDOTMULTIz
([^{}])[,]z\1 , z
[,]([^{}])z , \1z
([{}])[,]$)z^```` )z^"r   )z^`([^`])z` \1)z^'z`  )z
([ ([{<])"\1 `` )z([ ([{<])``r   )z([ ([{<])`([^`])z\1 ` \2)z
([ ([{<])'z\1 ` )\.\.\.z _ELLIPSIS_ )
_ELLIPSIS_r   z([^{numbers}])[,]([^{numbers}]))numbersz\1 , \2z([{numbers}])[,]([^{numbers}])z([^{numbers}])[,]([{numbers}])z([;:@#\$%&{}{}])([{alphanum}])\/([{alphanum}])$1 \@\/\@ $2)z([^.])([.])([\]\)}>"']*) ?$z\1 \2\3)z([?!])r   )z([\]\[\(\){}<>])r   )z\(z-LRB-)z\)z-RRB-)\[z-LSB-)z\]z-RSB-)z\{z-LCB-)z\}z-RCB-)z--z -- )^r   )$r   )"z '' )z([^'])' \1 ' )z([^'])'r&   )z'([sSmMdD]) z '\1 )z'll z 'll )z're z 're )z've z 've )zn't z n't )z'LL z 'LL )z'RE z 'RE )z'VE z 'VE )zN'T z N'T )z ([Cc])annot z
 \1an not )z ([Dd])'ye z \1' ye )z ([Gg])imme z	 \1im me )z ([Gg])onna z	 \1on na )z ([Gg])otta z	 \1ot ta )z ([Ll])emme z	 \1em me )z ([Mm])ore'n z
 \1ore 'n )z '([Tt])is z '\1 is )z '([Tt])was z	 '\1 was )z ([Ww])anna z	 \1an na )z  *r   )z^ *r	   )z *$r	   )&&amp;)z\|&#124;)<&lt;)>&gt;)\'&apos;)z\"&quot;)r"   &#91;)]&#93;z([^{alpha}])[']([^{alpha}]))alphaz\1 ' \2z([^{alpha}{isn}])[']([{alpha}]))r4   isnz([{alpha}])[']([^{alpha}])z([{alpha}])[']([{alpha}])z\1 '\2z([{isn}])[']([s]))r5   z([^{alpha}])[']([{alpha}])z\1' \2)r.   z ' )z\.' ?$z . ' z<\/?\S+\/?>z#<\S+( [a-zA-Z0-9]+\="?[^"]")+ ?\/?>z#<\S+( [a-zA-Z0-9]+\='?[^']')+ ?\/?>'[\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}z/(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+)z"((https?|ftp|rsync)://|www\.)[^ ]*r6   z@[a-zA-Z0-9_]+z#[a-zA-Z0-9_]+enNc                    s  t t   | _dd t|D  _|rJg  _t|d#}|D ]}| }|r:|	ds:| jvr: j
| q"W d    n1 sEw   Y   fdd jD  _ jdv rd} jdv rm|tdtd	7 } jd
v r~|tdtd7 } jdv r|tdtd7 }|tdtd7 }|tdtd7 }  j|7  _  j|7  _d jdf _dj jddf _dj jddf _d S d S )Nc                 S   s   g | ]}|  qS  )strip).0_nbpr8   r8   N/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/sacremoses/tokenize.py
<listcomp>0  s    z+MosesTokenizer.__init__.<locals>.<listcomp>r#c                    s$   g | ]}  |r|d d qS )r   r   )has_numeric_only
rpartition)r:   wselfr8   r<   r=   >  s    )zhjakocjkr	   )rG   rH   Hangul)rE   rH   Han)rF   rH   HiraganaKatakanar   r   r   r   r   r    r!   )superr   __init__langnonbreaking_prefixeswordsNONBREAKING_PREFIXESopenr9   
startswithappendNUMERIC_ONLY_PREFIXESr   joinperlunipropscharsr   r   formatPAD_NOT_ISALNUMAGGRESSIVE_HYPHEN_SPLITINTRATOKEN_SLASHES)rD   rO    custom_nonbreaking_prefixes_filefinline	cjk_chars	__class__rC   r<   rN   *  sN   






zMosesTokenizer.__init__c                 C   sF   t dd|}t d|r!t dd|}t dd|}t d|s|S )Nr   z DOTMULTI\1r   r   zDOTDOTMULTI \1r   )resubsearchrD   textr8   r8   r<   replace_multidots[  s   z MosesTokenizer.replace_multidotsc                 C   s4   t d|rt dd|}t d|st dd|S )Nr   z	DOTMULTI.DOTMULTI.)rd   rf   re   rg   r8   r8   r<   restore_multidotsb  s   z MosesTokenizer.restore_multidotsc                 C   s   t |t | j S N)set
differencer   rg   r8   r8   r<   islowerg  s   zMosesTokenizer.islowerc                 C   s   t t|t| jS rm   )anyrn   intersectionr   rg   r8   r8   r<   
isanyalphaj  s   zMosesTokenizer.isanyalphac                 C   s   t td|S )Nz[\s]+(\#NUMERIC_ONLY\#))boolrd   rf   rg   r8   r8   r<   r@   m  s   zMosesTokenizer.has_numeric_onlyc                 C   s   |  }t|}t|D ]X\}}td|}|rd|d}d|v r&| |sG|| jv r0|| jvsG||d krH||d  rH| 	||d  d rHq|| jv r^|d |k r^td||d  r^q|d ||< qd
|S )Nz	^(\S+)\.$   rk   r   z^[0-9]+z .r   )splitlen	enumeraterd   rf   grouprs   rR   rV   rp   rW   )rD   rh   tokens
num_tokensitokentoken_ends_with_periodprefixr8   r8   r<   handles_nonbreaking_prefixesp  s.   
	




z+MosesTokenizer.handles_nonbreaking_prefixesc                 C   "   | j D ]\}}t|||}q|S rm   )MOSES_ESCAPE_XML_REGEXESrd   re   rD   rh   regexpsubstitutionr8   r8   r<   
escape_xml     zMosesTokenizer.escape_xmlFc                 C   s^   t |}| jD ]\}}t|||}q| |}| jD ]\}}t|||}q|r+|S | S )z
        This is a Python port of the Penn treebank tokenizer adapted by the Moses
        machine translation community.
        )r   MOSES_PENN_REGEXES_1rd   re   r   MOSES_PENN_REGEXES_2rv   )rD   rh   
return_strr   r   r8   r8   r<   penn_tokenize  s   
zMosesTokenizer.penn_tokenizeTc                    s  t   | j| jfD ]\}}t||  q
|r9 fdd|D }t|D ]\}	}
dt|	d } |
| q% 	  	 | j
\}}t||  |rX| j\}}t||  |   | j| j| jfD ]\}}t||  qe| jdkr| jD ]\}}t||  qyn!| jdv r| jD ]\}}t||  qn| j\}}t||  |   | j\}}t|| 	  | j\}}t||  |rt|D ]\}	}
dt|	d } ||
 q|   |r|   |r S   S )a  
        Python port of the Moses tokenizer.

            :param tokens: A single string, i.e. sentence text.
            :type tokens: str
            :param aggressive_dash_splits: Option to trigger dash split rules .
            :type aggressive_dash_splits: bool
        c                    s*   g | ]}t | t jD ]}| qqS r8   )rd   finditer
IGNORECASEry   )r:   protected_patternmatchrh   r8   r<   r=     s    z+MosesTokenizer.tokenize.<locals>.<listcomp>THISISPROTECTED   r7   )frit)r   DEDUPLICATE_SPACE
ASCII_JUNKrd   re   rx   strzfillreplacer9   r[   r\   ri   COMMA_SEPARATE_1COMMA_SEPARATE_2COMMA_SEPARATE_3rO   ENGLISH_SPECIFIC_APOSTROPHEFR_IT_SPECIFIC_APOSTROPHENON_SPECIFIC_APOSTROPHEr   TRAILING_DOT_APOSTROPHErl   r   rv   )rD   rh   aggressive_dash_splitsr   escapeprotected_patternsr   r   protected_tokensr|   r}   substituitionr8   r   r<   tokenize  s\   











zMosesTokenizer.tokenize)r7   N)F)FFTN)|__name__
__module____qualname____doc__r   rW   rX   rY   r
   r   r   r   r   r   r   r   r   r   	MID_STRIP
LEFT_STRIPRIGHT_STRIPrZ   r[   r\    REPLACE_DOT_WITH_LITERALSTRING_1 REPLACE_DOT_WITH_LITERALSTRING_2 REPLACE_DOT_WITH_LITERALSTRING_3r   r   r   DIRECTIONAL_QUOTE_1DIRECTIONAL_QUOTE_2DIRECTIONAL_QUOTE_3DIRECTIONAL_QUOTE_4DIRECTIONAL_QUOTE_5DIRECTIONAL_QUOTE_6DIRECTIONAL_QUOTE_7DIRECTIONAL_QUOTE_8REPLACE_ELLIPSISRESTORE_ELLIPSISCOMMA_1COMMA_2COMMA_3SYMBOLSr]   FINAL_PERIODPAD_QUESTION_EXCLAMATION_MARKPAD_PARENTHESISCONVERT_PARENTHESIS_1CONVERT_PARENTHESIS_2CONVERT_PARENTHESIS_3CONVERT_PARENTHESIS_4CONVERT_PARENTHESIS_5CONVERT_PARENTHESIS_6PAD_DOUBLE_DASHESPAD_START_OF_STRPAD_END_OF_STRCONVERT_DOUBLE_TO_SINGLE_QUOTESHANDLES_SINGLE_QUOTES
APOSTROPHECONTRACTION_1CONTRACTION_2CONTRACTION_3CONTRACTION_4CONTRACTION_5CONTRACTION_6CONTRACTION_7CONTRACTION_8CONTRACTION_9CONTRACTION_10CONTRACTION_11CONTRACTION_12CONTRACTION_13CONTRACTION_14CONTRACTION_15CONTRACTION_16CONTRACTION_17CONTRACTION_18CONTRACTION_19CLEAN_EXTRA_SPACE_1CLEAN_EXTRA_SPACE_2CLEAN_EXTRA_SPACE_3ESCAPE_AMPERSANDESCAPE_PIPEESCAPE_LEFT_ANGLE_BRACKETESCAPE_RIGHT_ANGLE_BRACKETESCAPE_SINGLE_QUOTEESCAPE_DOUBLE_QUOTEESCAPE_LEFT_SQUARE_BRACKETESCAPE_RIGHT_SQUARE_BRACKETEN_SPECIFIC_1EN_SPECIFIC_2EN_SPECIFIC_3EN_SPECIFIC_4EN_SPECIFIC_5r   FR_IT_SPECIFIC_1FR_IT_SPECIFIC_2FR_IT_SPECIFIC_3FR_IT_SPECIFIC_4r   r   r   BASIC_PROTECTED_PATTERN_1BASIC_PROTECTED_PATTERN_2BASIC_PROTECTED_PATTERN_3BASIC_PROTECTED_PATTERN_4BASIC_PROTECTED_PATTERN_5r   r   r   BASIC_PROTECTED_PATTERNSWEB_PROTECTED_PATTERNSrN   ri   rl   rp   rs   r@   r   r   r   r   __classcell__r8   r8   rb   r<   r      s   ""

	
 !"#$%&'()*+,-./01261)
r   c                       s  e Zd ZdZededZededZ	ededZ
dZeddfZd	Zd
ZdZdZdZdZdZdZdZdZdZeeeeeeeeeeegZg dZg dZg dZdedeedeedeZd# fdd	Z dd Z!d$dd Z"d$d!d"Z#  Z$S )%MosesDetokenizerz
    This is a Python port of the Moses Detokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl

    r	   r   r   r   )z \@\-\@ -z {2,}r   )r)   |)r+   r*   )r-   r,   )r0   r%   )r/   ')r1   [)r3   r2   )r(   r'   )z&bar;r   )z&bra;r  )z&ket;r2   )6NnAa   Ä   ässaSsau   ssäu   Ssästau   stäStau   StähunHunhynHynhanrJ      hänu   Hänu   hönu   HönunUnynYnanAnu   änu   Änu   önu   ÖnseenSeenllaLlau   lläu   LlältaLtau   ltäu   LtälleLleksiKsikseKsettaTtaineIne)nisimmennensa)	rG   u   kör  r  pau   päkaanu   käänkinz^({})({})?({})$r   r7   c                    s   t t|   || _d S rm   )rM   r   rN   rO   )rD   rO   rb   r8   r<   rN     s   
zMosesDetokenizer.__init__c                 C   r   rm   )MOSES_UNESCAPE_XML_REGEXESrd   re   r   r8   r8   r<   unescape_xml  r   zMosesDetokenizer.unescape_xmlTc                 C   s  d d|}t|}| j\}}t|||}|r| |}dddddd}d}d}	| }tt	|D ]\}
}t
|d ra| jdkra|
dkrXt
||
d  d rX|	|7 }	n|	|| 7 }	d}q5td	| j d
 |ru|	|| 7 }	d}q5td|r| jdkrtd|r|	d7 }	|	|7 }	d}q5| jdkr|
dkrtd | j|r|	|7 }	d}q5| jdkr|
dkrtd|d rtd|d rtd|r|	|7 }	d}q5| jdv r|
t|d krtd | j|rtd | j||
d  r|	|| 7 }	d}q5| jdkrE|
t|d krEtd | j|rEtd||
d  rEtd||
d  tjrE|	|| ||
d   7 }	t|d d}q5td|r|}td|rWd}||d||< | jdkrn|dkrnd||< | jdkr}|d kr}d||< || d dkr| jdkr|d!kr|
dkrtd"||
d  r|	|7 }	d}q5|	|| 7 }	d}||  d7  < q5|	|7 }	d}||  d7  < q5| jd#krtd$||
d  rt| j|r|	|| 7 }	d}q5|	|| 7 }	d}q5| j\}}t|||	}	|	 }	|r|	S |	 S )%z
        Python port of the Moses detokenizer.
        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :return: str
        z {} r   r   )r   r%   z```z''r	   rG   ru   z^[u   \(\[\{\¿\¡]+$z^[\,\.\?\!\:\;\\\%\}\]\)]+$r   z^[\?\!\:\;\\\%]$r7   z^['][{}]csz^[0-9]+$z^[.,]$)r   r   ga   z[{}][']$z^[{}]r   u   ^[-–]$z^li$|^mail.*Nu   ^[\'\"„“`]+$u   ^[„“”]+$r%   u   „u   “r   z[s]$fiz:$)rZ   rW   r   r\   rd   re   r1  rv   rx   iterr   rO   rf   r   r   rw   r   nextgetFINNISH_REGEX	ONE_SPACEr9   )rD   rz   r   unescaperh   r   r   quote_countsprepend_spacedetokenized_textr|   r}   normalized_quor8   r8   r<   r     s   










zMosesDetokenizer.tokenizec                 C   s   |  |||S )z&Duck-typing the abstract *tokenize()*.)r   )rD   rz   r   r>  r8   r8   r<   
detokenize8  s   zMosesDetokenizer.detokenize)r7   )TT)%r   r   r   r   r   rW   rX   rY   r   r   r   r\   rd   compiler=  UNESCAPE_FACTOR_SEPARATORUNESCAPE_LEFT_ANGLE_BRACKETUNESCAPE_RIGHT_ANGLE_BRACKETUNESCAPE_DOUBLE_QUOTEUNESCAPE_SINGLE_QUOTE UNESCAPE_SYNTAX_NONTERMINAL_LEFT!UNESCAPE_SYNTAX_NONTERMINAL_RIGHTUNESCAPE_AMPERSAND UNESCAPE_FACTOR_SEPARATOR_LEGACY'UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY(UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACYr0  FINNISH_MORPHSET_1FINNISH_MORPHSET_2FINNISH_MORPHSET_3rZ   r<  rN   r1  r   rC  r   r8   r8   rb   r<   r     sV    9
 r   )rd   sixr   sacremoses.corpusr   r   sacremoses.utilr   sacremoses.indicr   r   rX   rP   objectr   r   __all__r8   r8   r8   r<   <module>   s"         #