o
    iH                     @   s   d dl mZ d dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlZejd  d	k rOd dlZd dlZejZeed
 e ZG dd deZG dd deZddgZdS )    )print_functionN)defaultdictCounter)partial)chain)	text_type)Perluniprops)parallelize_preprocessgrouper   zTYou should really be using Python3!!! Tick tock, tick tock, https://pythonclock.org/c                       s   e Zd ZdZededZededZ	ededZ
d$ fdd	Zd%d
dZ						d&ddZ						d&ddZ						d&ddZ						d&ddZd'ddZd(ddZedd Zdd Zdd Zd d! Zd"d# Z  ZS ))MosesTruecaserz
    This is a Python port of the Moses Truecaser from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
     Lowercase_LetterUppercase_LetterNutf8c                    sn   t t|   td| j| j| j| _	td| _
h d| _h d| _|| _|| _|r5| || _dS dS )a.  
        :param load_from:
        :type load_from:

        :param is_asr: A flag to indicate that model is for ASR. ASR input has
            no case, make sure it is lowercase, and make sure known are cased
            eg. 'i' to be uppercased even if i is known.
        :type is_asr: bool
        z[{}{}{}]z(<.*(?<=>))(.*)((?=</)[^>]*>)   .!:?   &#91;&#93;&apos;&quot;"'([N)superr   __init__recompileformatr   r   Titlecase_LetterSKIP_LETTERS_REGEXXML_SPLIT_REGXSENT_ENDDELAYED_SENT_STARTencodingis_asr_load_modelmodel)self	load_fromr*   r)   	__class__ N/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/sacremoses/truecase.pyr    -   s   

zMosesTruecaser.__init__Fc                 C   s   d}g }t |D ]L\}}td|rq|| jv rq|s#|| jv r#d}q| j|s,d}qd}|s3d}n|rD|d  r>d}n|dkrDd}d}|dkrT|| ||f q|S )z
        This function checks through each tokens in a sentence and returns the
        appropriate weight of each surface token form.
        T(<\S[^>]*>)Fr      g?)		enumerater!   searchr(   r'   r%   islowerappendlower)r-   tokenspossibly_use_first_tokenis_first_wordtruecase_weightsitokencurrent_word_weightr1   r1   r2   learn_truecase_weightsT   s4   
z%MosesTruecaser.learn_truecase_weightsr4   c                 C   sh   t t}t| j|d}tt||||d }|D ]\}	}
}||	 |
  |7  < q|r/| || | |S )aN  
        :param document_iterator: The input document, each outer list is a sentence,
                          the inner list is the list of tokens for each sentence.
        :type document_iterator: iter(list(str))

        :param possibly_use_first_token: When True, on the basis that the first
            word of a sentence is always capitalized; if this option is provided then:
            a) if a sentence-initial token is *not* capitalized, then it is counted, and
            b) if a capitalized sentence-initial token is the only token of the segment,
               then it is counted, but with only 10% of the weight of a normal token.
        :type possibly_use_first_token: bool

        :returns: A dictionary of the best, known objects as values from `_casing_to_model()`
        :rtype: {'best': dict, 'known': Counter}
        )r;   progress_bar)r   r   r   rA   r   r	   _save_model_from_casing_casing_to_model)r-   document_iteratorsave_tor;   	processesrC   casingtrain_truecasertoken_weightslowercase_tokensurface_tokenweightr1   r1   r2   _train   s   
zMosesTruecaser._trainc                 C   s"   d| _ | j|||||d| _ | j S )z\
        Default duck-type of _train(), accepts list(list(str)) as input documents.
        NrB   )r,   rO   )r-   	documentsrG   r;   rH   rC   r1   r1   r2   train   s   zMosesTruecaser.trainc                 C   sb   t || jd}dd | D }W d   n1 sw   Y  d| _| j|||||d| _| jS )zj
        Duck-type of _train(), accepts a filename to read as a `iter(list(str))`
        object.
        r)   c                 s       | ]}|  V  qd S Nsplit.0liner1   r1   r2   	<genexpr>       
z1MosesTruecaser.train_from_file.<locals>.<genexpr>NrB   )openr)   	readlinesr,   rO   )r-   filenamerG   r;   rH   rC   finrF   r1   r1   r2   train_from_file   s   zMosesTruecaser.train_from_filec                 C   s4   dd |  D }d| _| j|||||d| _| jS )zm
        Duck-type of _train(), accepts a file object to read as a `iter(list(str))`
        object.
        c                 s   rS   rT   rU   rW   r1   r1   r2   rZ      r[   z8MosesTruecaser.train_from_file_object.<locals>.<genexpr>NrB   )r]   r,   rO   )r-   file_objectrG   r;   rH   rC   rF   r1   r1   r2   train_from_file_object   s   z%MosesTruecaser.train_from_file_objectc                 C   s"  t d}t| dsJ |d}g }| |}t|D ]m\}}	td|	r*||	 q|	dks3|	dr9||	 qtd|	 \}	}
| j	rJ|	
 }	| jd |	
 d}| jd	 |	d}|re|re|}	n|rn|rk|n|	}	n|rr|}	|	|
 }	||	 |	| jv }|	| jv rd
}q|rd|S |S )a  
        Truecase a single sentence / line of text.

        :param text: A single string, i.e. sentence text.
        :type text: str

        :param use_known: Use the known case if a word is a known word but not the first word.
        :type use_known: bool
        zV
Use Truecaser.train() to train a model.
Or use Truecaser('modefile') to load a model.r,   Tr3   |z^([^\|]+)(.*)bestNknownF )strhasattr	split_xmlr5   r!   r6   r8   
startswithgroupsr*   r9   r,   getr'   r(   join)r-   text
return_str	use_knowncheck_model_messager<   truecased_tokensr:   r>   r?   other_factors	best_case
known_caser1   r1   r2   truecase   sB   






zMosesTruecaser.truecaseTc                 c   sb    t || jd}|D ]}| | }|rd|n|V  qW d    d S 1 s*w   Y  d S )NrR   rf   )r\   r)   rv   striprm   )r-   r^   ro   r_   rY   rr   r1   r1   r2   truecase_file>  s   "zMosesTruecaser.truecase_filec                 C   s(  |   } g }| rtd| }td| }td| }|ra| \}}td| rWt|dkrWtd|d rW|d  |7  < td|}|rV|d  |d	7  < |d
}n||d  |} n'|rq||d	 |d
} n|r||d	 |d
} ntd| |d   |d< | s|S )a
  
        Python port of split_xml function in Moses' truecaser:
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecaser.perl

        :param line: Input string, should be tokenized, separated by space.
        :type line: str
        z^\s*(<\S[^>]*>)(.*)$z^\s*([^\s<>]+)(.*)$z^\s*(\S+)(.*)$z^\Sr   z\|$z^(\|+)(.*)$r4      rf   zERROR: huh? {})	rw   r!   r6   rk   lengroupr8   	Exceptionr#   )rY   r:   has_xml
is_non_xmlxml_cognatespotential_xml	line_next	is_factorr1   r1   r2   ri   E  sF   	

&zMosesTruecaser.split_xmlc           	      C   sl   i }t  }|D ]&}||  }|d d ||< | js-|dd D ]\}}||  d7  < q q|||d}|S )zg

        :returns: A tuple of the (best, known) objects.
        :rtype: tuple(dict, Counter)
        r   r4   N)rd   re   rI   )r   most_commonr*   )	r-   rI   rd   re   token_lowerr:   r?   countr,   r1   r1   r2   rE   x  s   zMosesTruecaser._casing_to_modelc                 C   s   |  | jd | d S )NrI   )rD   r,   )r-   r^   r1   r1   r2   
save_model  s   zMosesTruecaser.save_modelc           
      C   s   t |d| jdI}|D ]=}t||  }g }t||  D ]\}\}}|dkr1d|||}	nd|||}	||	 qtd	|d|d qW d	   d	S 1 sTw   Y  d	S )
a"  
        Outputs the truecaser model file in the same output format as
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl

        :param casing: The dictionary of tokens counter from `train()`.
        :type casing: default(Counter)
        wrR   r   z
{} ({}/{})z{} ({})rf   
)endfileN)
r\   r)   sumvaluesr5   r   r#   r8   printrm   )
r-   rI   r^   foutr?   total_token_counttokens_countsr>   r   	out_tokenr1   r1   r2   rD     s   "z&MosesTruecaser._save_model_from_casingc                 C   s   t t}t|| jd1}|D ]&}|  }t|dD ]\}}|dd d}t|||  |< qqW d   n1 s?w   Y  | 	|S )z
        Loads pre-trained truecasing file.

        :returns: A dictionary of the best, known objects as values from `_casing_to_model()`
        :rtype: {'best': dict, 'known': Counter}
        rR   rz   /r   z()N)
r   r   r\   r)   rw   rV   r
   intr9   rE   )r-   r^   rI   r_   rY   r?   r   r1   r1   r2   r+     s   
zMosesTruecaser._load_model)NNr   )F)NFr4   FFF)T)__name__
__module____qualname____doc__r   rm   perlunipropscharsr   r   r$   r    rA   rO   rQ   r`   rb   rv   rx   staticmethodri   rE   r   rD   r+   __classcell__r1   r1   r/   r2   r   !   sF    
'1
-




H
2r   c                       s&   e Zd Z fddZdddZ  ZS )MosesDetruecaserc                    s0   t t|   h d| _h d| _h d| _d S )Nr   r   >#   al-.+el-.+aasatbebyinisofontoandanyforhisitsnotoffthewasfromlastthanthiswerewillwithaftertheirwhichduringagainstbecausebetween)r   r   r    r'   r(   ALWAYS_LOWERr-   r/   r1   r2   r      s   

zMosesDetruecaser.__init__Fc                    s   g }d}|  D ](}|r|dd  |dd  n|}|| | jv r)d}q| jvr0d}q|r< fdd|D }|rCd|S |S )z
        Detruecase the translated files from a model that learnt from truecased
        tokens.

        :param text: A single string, i.e. sentence text.
        :type text: str
        TNr4   Fc                    s6   g | ]}| j v r|n|d d  |dd   qS )Nr4   )r   upper)rX   r?   r   r1   r2   
<listcomp>  s    (z/MosesDetruecaser.detruecase.<locals>.<listcomp>rf   )rV   r   r8   r'   r(   rm   )r-   rn   is_headlinero   cased_tokenssentence_startr?   r1   r   r2   
detruecase  s   	$



zMosesDetruecaser.detruecaser   )r   r   r   r    r   r   r1   r1   r/   r2   r     s    6r   )
__future__r   r!   collectionsr   r   	functoolsr   	itertoolsr   sixr   sacremoses.corpusr   sacremoses.utilr	   r
   sysversion_infoiowarningsr\   warnrg   r   objectr   r   __all__r1   r1   r1   r2   <module>   s2      T