o
    i#                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlmZm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlZd dlZejd  d	k rSd dlZejZeed
 eddgdZejdedejdddddejdddddejdddddejddddd d!e d"d# Zeej d$d  d%k rej!nej"Z"e" d&d' Z#d(d) Z$d*d+ Z%e&d,ejd-d.ddd/d0ejd1d2ddd3d0ejd4d5d6d7ejd8d9d:d7e$d;d< Z'e&d=ejd>d2ddd?d0e$d@dA Z(e&dBejdCddddDd0ejdEdFdddGd0ejdHd5dddId0ejdJd9dddKd0e$dLdM Z)e&dNejdOdPddQdRejdSd.dddTd0ejdUd5dddVd0e$dWdX Z*e&dYejdOdPddZdRejdSd.dddTd0ejdUd5ddd[d0e$d\d] Z+e&d^ejd_d.ddd`d0e$dadb Z,dS )c    N)deepcopypartialupdate_wrapper)MosesTokenizerMosesDetokenizer)MosesTruecaserMosesDetruecaser)MosesPunctNormalizerparallelize_preprocess   zTYou should really be using Python3!!! Tick tock, tick tock, https://pythonclock.org/z-hz--help)help_option_namesT)chaincontext_settingsz
--languagez-lenz+Use language specific rules when tokenizing)defaulthelpz--processesz-j   zNo. of processes.z
--encodingz-eutf8zSpecify encoding of file.z--quietz-qFzDisable progress bar.)is_flagr   r   c                 C   s   d S N )languageencoding	processesquietr   r   I/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/sacremoses/cli.pycli#   s   r   .   c                 K   s~   t jd|d.}|}| D ]}|t|fi |}q|r%|D ]}t | qW d    d S W d    d S 1 s8w   Y  d S )Nstdinr   )clickget_text_streamlistecho)
processorsr   kwargsfiniteratorprocitemr   r   r   process_pipeline5   s   "r.   c                    s    fdd}t | fi |S )z\Helper decorator to rewrite a function so that
    it returns another function from it.
    c                     s    fdd}t |fi | S )Nc                    s    | fi |S r   r   )streamr)   fr   r   	processorF   s   z.processor.<locals>.new_func.<locals>.processorr   )r)   r2   r0   r   r   new_funcE   s   zprocessor.<locals>.new_funcr   )r1   r)   r3   r   r0   r   r2   @   s   r2   c                 c   sD    |dkr| D ]}||V  qd S t || || dD ]}|V  qd S )Nr   )progress_barr   )r+   funcr   r   lineoutliner   r   r   parallel_or_notN   s   

r8   tokenizez--aggressive-dash-splitsz-azTriggers dash split rules.)r   r   r   z--xml-escapez-xz"Escape special characters for XML.z--protected-patternsz-pzXSpecify file with patters to be protected in tokenisation. Special values: :basic: :web:)r   z--custom-nb-prefixesz-czjSpecify a custom non-breaking prefixes file, add prefixes to the default ones from the specified language.c                 C   s   t ||d}|r7|dkr|j}n'|dkr|j}nt|dd}	dd |	 D }W d    n1 s2w   Y  t|jd|||d	}
t| |
||S )
N)lang custom_nonbreaking_prefixes_filez:basic:z:web:r   r#   c                 S   s   g | ]}|  qS r   )strip).0patternr   r   r   
<listcomp>   s    z!tokenize_file.<locals>.<listcomp>T)
return_straggressive_dash_splitsescapeprotected_patterns)r   BASIC_PROTECTED_PATTERNSWEB_PROTECTED_PATTERNSopen	readlinesr   r9   r8   )r+   r   r   r   
xml_escaperA   rC   custom_nb_prefixesmosesr*   moses_tokenizer   r   r   tokenize_file^   s&   $rL   
detokenizez--xml-unescapez$Unescape special characters for XML.c                 C   s4   t |d}t|jd|d}ttttj| |||S )N)r:   T)r@   unescape)r   r   rM   r8   r&   mapstrsplit)r+   r   r   r   xml_unescaperJ   moses_detokenizer   r   r   detokenize_file   s
   
rT   	normalizez--normalize-quote-commasz Normalize quotations and commas.z--normalize-numbersz-dzNormalize number.z--replace-unicode-punctsz2Replace unicode punctuations BEFORE normalization.z--remove-control-charsz.Remove control characters AFTER normalization.c           
      C   s*   t |||||d}t|j}	t| |	||S )N)norm_quote_commasnorm_numberspre_replace_unicode_punctpost_remove_control_chars)r   r   rU   r8   )
r+   r   r   r   normalize_quote_commasnormalize_numbersreplace_unicode_punctsremove_control_charsrJ   moses_normalizer   r   r   normalize_file   s   $
r_   ztrain-truecasez--modelfilez-mzFilename to save the modelfile.)requiredr   z--is-asrz)A flag to indicate that model is for ASR.z--possibly-use-first-tokenz*Use the first token as part of truecasing.c           	      C   s,   t |d}|j| ||| d}|| d S )Nis_asrpossibly_use_first_tokenr   r4   )r	   train
save_model)	r+   r   r   r   	modelfilerb   rd   rJ   modelr   r   r   train_truecaser   s   
ri   truecasez$Filename to save/load the modelfile.z1Use the first token as part of truecase training.c                 C   sd   t j|st| }t|d}|j|||| d}	|| t||d}
t|
jdd}t	| |||S )Nra   rc   )	load_fromrb   T)r@   )
ospathisfiler   r	   re   rf   r   rj   r8   )r+   r   r   r   rg   rb   rd   iterator_copy	truecaserrh   rJ   moses_truecaser   r   r   truecase_file  s   

rr   
detruecasez--is-headlinezWhether the file are headlines.c                 C   s$   t  }t|jd|d}t| |||S )NT)r@   is_headline)r
   r   rs   r8   )r+   r   r   r   rt   rJ   moses_detruecaser   r   r   detruecase_file@  s
   
rv   )-rl   copyr   	functoolsr   r   r$   sacremoses.tokenizer   r   sacremoses.truecaser	   r
   sacremoses.normalizer   sacremoses.utilr   syswarningsversion_infoiorF   warnrP   dictCONTEXT_SETTINGSgroupoptionversion_optionr   int__version__rQ   resultcallbackresult_callbackr.   r2   r8   commandrL   rT   r_   ri   rr   rv   r   r   r   r   <module>   s,  	
$

&
