o
    i                      @   sP   d dl mZ d dlZd dlmZmZ d dlmZ d dlm	Z	 G dd de
ZdS )    )print_functionN)Counterdefaultdict)reduce)pairwisec                   @   sL   e Zd ZdZdd ZdddZdd Zd	d
 Zdd Zdd Z	dddZ
dS )SubwordTokenizerzd
    This is a Python port of the Subword NMT from
    https://github.com/rsennrich/subword-nmt
    c                 C   s.   |  || _|  \| _| _t| j| _d S N)get_vocabularyvocabget_pair_statisticsstatsindicescopydeepcopy	big_stats)selffilename r   N/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/sacremoses/subwords.py__init__   s   zSubwordTokenizer.__init__Fc                 C   s   t  }t|,}|r#|D ]}| d\}}||  t|7  < qn	||   W d    n1 s6w   Y  t dd | D }| S )N c                 S   s0   i | ]\}}t |d d |d d f |qS )Nu   )tuple).0kvr   r   r   
<dictcomp>$   s   0 z3SubwordTokenizer.get_vocabulary.<locals>.<dictcomp>)	r   openstripsplitintupdatereaditemsmost_common)r   r   is_dictr
   finlinewordcountr   r   r   r	      s   
	zSubwordTokenizer.get_vocabularyc                 C   sp   t  }tdd }t| jD ]%\}\}}t|D ]\}}|||f  |7  < |||f |  d7  < qq||fS )z5Count frequency of all symbol pairs, and create indexc                   S   s   t  S r   )r   r   r   r   r   <lambda>-   s    z6SubwordTokenizer.get_pair_statistics.<locals>.<lambda>   )r   r   	enumerater
   r   )r   r   r   ir(   freqprevcurrr   r   r   r   (   s   z$SubwordTokenizer.get_pair_statisticsc                    sD   |\ d |dd fdd}t||dd |d fS )	z
        From https://stackoverflow.com/a/40367074/610569
            >>> modify_token(('s', 'h', 'e', 'r', 'l', 'o', 'c', 'k'), ('h', 'e'))
            ('S', 'he', 'r', 'l', 'o', 'c', 'k')
         \z\\c                    s0   | d  kr|kr| d d f S | |f S )Nr   r   )accefirstpair_strsecondr   r   r*   ?   s   
z/SubwordTokenizer.modify_token.<locals>.<lambda>r+   Nr   )joinreplacer   )r   tokenpairfr   r5   r   modify_token6   s
   zSubwordTokenizer.modify_tokenc                 C   sd   g }| j |  D ]&\}}|dk rq	| j| \}}| ||}||f| j|< |||||f q	|S )zJReplace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'r+   )r   r#   r
   r>   append)r   r<   changesjr.   r(   new_wordr   r   r   replace_pairE   s   zSubwordTokenizer.replace_pairc              	   C   s&  d| j |< t | j|< |\}}|| }|D ]\}}}}	d}
	 z|||
}
W n	 ty/   Y nyw |
t|d k r||
d  |kr|
r`||
d |
d  }| j |  |	8  < | j| |  d8  < |
t|d k r||
d  |ks|
t|d ks||
d  |kr||
d |
d  }| j |  |	8  < | j| |  d8  < |
d7 }
n|
d7 }
qd}
	 z|||
}
W n	 ty   Y nTw |
r||
d |
d  }| j |  |	7  < | j| |  d7  < |
t|d k r||
d  |kr||
|
d  }| j |  |	7  < | j| |  d7  < |
d7 }
qqdS )z
        Minimally update the indices and frequency of symbol pairs
        if we merge a pair of symbols, only pairs that overlap with occurrences
        of this pair are affected, and need to be updated.
        r   Tr+         N)r   r   r   index
ValueErrorlen)r   r<   changedr6   r8   new_pairrA   r(   old_wordr.   r-   r/   nexr   r   r   update_pair_statisticsQ   s^   
 

-$z'SubwordTokenizer.update_pair_statisticsrD   r+   Nc                 C   st   t | j d }t|D ]*}| j|}|D ]\}}	| |}
| ||
 d| j|< | j| |k r6  d S qqd S )N
   r   )maxr   valuesranger$   rC   rM   )r   num_symbolsmin_freqjumpr%   	thresholdr-   most_freq_tokensr;   r)   r@   r   r   r   learn   s   

zSubwordTokenizer.learn)F)rD   r+   N)__name__
__module____qualname____doc__r   r	   r   r>   rC   rM   rW   r   r   r   r   r      s    
[r   )
__future__r   r   collectionsr   r   	functoolsr   sacremoses.utilr   objectr   r   r   r   r   <module>   s   