o
    i                     @   s:   d dl Z d dlZd dlmZ d dlmZ G dd dZdS )    N)	text_type)chainc                   @   s   e Zd ZdZg dZddgZg dZg dZg dZdgZ	d	d
gZ
dgZdgZg dZ						dddZdd Zdd Zdd ZdS )MosesPunctNormalizerz
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
    )
)z\r )z\(z ()z\)z) z + )z\) ([.!:?;,])z)\g<1>)z\( ()z \)))z(\d) %z\g<1>%)z ::)z ;;)`')''z " ))u   „"u   “r   u   ”r   )u   –-)u   —z - r   )   ´r   )u   ([a-zA-Z])‘([a-zA-Z])\g<1>'\g<2>)u   ([a-zA-Z])’([a-zA-Z])r   )u   ‘r   )u   ‚r   u   ’r   )r   r   )u   ´´r   u   …z...))u    « r   )u   « r   )   «r   )u    » r   )u    »r   )   »r   )
)u    %%)u   nº u   nº )u    :r
   )u    ºCu    ºC)u    cmz cm)u    \??)u    \!!)u    ;r   )u   , z, r   )z"([,.]+)z\g<1>")z,"z",)z(\.+)"(\s*[^<])z"\g<1>\g<2>)
   (\d) (\d)z\g<1>,\g<2>)r   z\g<1>.\g<2>)$)u   ，,)u   。\s*. )u   、r   r   r   )u   ∶r
   )u   ：r
   )u   ？r   )u   《r   )u   》r   )u   ）r	   )u   ！r   )u   （r   )u   ；r   )u   」r   )u   「r   )u   ０0)u   １1)u   ２2)u   ３3)u   ４4)u   ５5)u   ６6)u   ７7)u   ８8)u   ９9)u   ．\s*r   )u   ～~r   r   )u   ━r   )u   〈<)u   〉>)u   【[)u   】])u   ％r   enTFc                 C   s   | j | j| j| jg| _|r| jd| j |r.|dkr#| j| j n|dv r.| j| j	 |rC|dv r<| j| j
 n| j| j tt| j | _|| _|| _dS )ah  
        :param language: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers
        :type norm_numbers: bool
           r.   )deesfr)r0   r1   czcsr2   N)EXTRA_WHITESPACENORMALIZE_UNICODEFRENCH_QUOTESHANDLE_PSEUDO_SPACESsubstitutionsinsertNORMALIZE_UNICODE_IF_NOT_PENNappendEN_QUOTATION_FOLLOWED_BY_COMMA$DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMADE_ES_CZ_CS_FROTHERlistr   pre_replace_unicode_punctpost_remove_control_chars)selflangpennnorm_quote_commasnorm_numbersrB   rC    rI   O/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/sacremoses/normalize.py__init__   s&   
zMosesPunctNormalizer.__init__c                 C   sJ   | j r| |}| jD ]\}}t||t|}q| jr!| |}| S )z?
        Returns a string with normalized punctuation.
        )	rB   replace_unicode_punctr9   resubr   rC   remove_control_charsstriprD   textregexpsubstitutionrI   rI   rJ   	normalize   s   

zMosesPunctNormalizer.normalizec                 C   s&   | j D ]\}}t||t|}q|S )N)REPLACE_UNICODE_PUNCTUATIONrM   rN   r   rQ   rI   rI   rJ   rL      s   z*MosesPunctNormalizer.replace_unicode_punctc                 C   s   t dd|S )Nz\p{C}r   )regexrN   )rD   rR   rI   rI   rJ   rO      s   z)MosesPunctNormalizer.remove_control_charsN)r.   TTTFF)__name__
__module____qualname____doc__r5   r;   r6   r7   r8   r=   r>   r?   r@   rV   rK   rU   rL   rO   rI   rI   rI   rJ   r      s4    	)
.r   )rM   rW   sixr   	itertoolsr   r   rI   rI   rI   rJ   <module>   s
   