o
    i/                     @   s`   d Z ddlZddlmZ ddlmZmZmZmZ ddl	m
Z
 G dd deZG dd	 d	e
ZdS )
a  
Lexical translation model that considers word order.

IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.

The EM algorithm used in Model 2 is:

:E step: In the training data, collect counts, weighted by prior
         probabilities.

         - (a) count how many times a source language word is translated
               into a target language word
         - (b) count how many times a particular position in the source
               sentence is aligned to a particular position in the target
               sentence

:M step: Estimate new probabilities based on the counts from the E step

Notations
---------

:i: Position in the source sentence
     Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
     Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
    Ndefaultdict)AlignedSent	AlignmentIBMModel	IBMModel1)Countsc                       sb   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Z  ZS )	IBMModel2u`  
    Lexical translation model that considers word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm2 = IBMModel2(bitext, 5)

    >>> print(round(ibm2.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm2.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm2.translation_table['buch'][None], 3))
    0.0
    >>> print(round(ibm2.translation_table['ja'][None], 3))
    0.0

    >>> print(round(ibm2.alignment_table[1][1][2][2], 3))
    0.939
    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
    1.0

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    Nc                    sr   t  | |du rt|d| }|j| _| | n
|d | _|d | _td|D ]}| | q*| | dS )a  
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model and an alignment model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        N   translation_tablealignment_tabler   )	super__init__r   r   set_uniform_probabilitiesr   rangetrain	align_all)selfsentence_aligned_corpus
iterationsprobability_tablesibm1n	__class__ N/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/translate/ibm2.pyr   c   s   

zIBMModel2.__init__c           	      C   s   t  }|D ]N}t|j}t|j}||f|vrS|||f d|d  }|tjk r4tdt	| d  t
d|d D ]}t
d|d D ]}|| j| | | |< qDq;qd S )N   zA source sentence is too long (z& words). Results may be less accurate.r   )setlenmotswordsaddr   MIN_PROBwarningswarnstrr   r   )	r   r   l_m_combinationsaligned_sentencelminitial_probijr   r   r   r      s*   


z#IBMModel2.set_uniform_probabilitiesc              
   C   s   t  }|D ]X}d g|j }dg|j }t|j}t|j}| ||}td|d D ]2}	||	 }
td|d D ]$}|| }| ||	||}|||
  }||||
 ||||	|| q7q*q| 	| | 
| d S )NUNUSEDr   r   )Model2Countsr    r!   r   prob_all_alignmentsr   prob_alignment_pointupdate_lexical_translationupdate_alignment*maximize_lexical_translation_probabilities maximize_alignment_probabilities)r   parallel_corpuscountsr(   src_sentencetrg_sentencer)   r*   total_countr-   tr,   scountnormalized_countr   r   r   r      s&   


zIBMModel2.trainc                 C   s   t j}|j D ]=\}}| D ]4\}}| D ]+\}}|D ]$}	|j| | | |	 |j| | |	  }
t|
|| j| | | |	< qqqqd S N)r   r#   	alignmentitemsalignment_for_any_imaxr   )r   r7   r#   r,   j_sr-   src_sentence_lengthsr)   trg_sentence_lengthsr*   estimater   r   r   r5      s   z*IBMModel2.maximize_alignment_probabilitiesc                 C   sX   t t}tdt|D ]}|| }tdt|D ]}||  | ||||7  < qq|S )a  
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        r   r   )r   floatr   r   r1   )r   r8   r9   alignment_prob_for_tr-   r;   r,   r   r   r   r0      s   
zIBMModel2.prob_all_alignmentsc           	      C   sL   t |d }t |d }|| }|| }| j| | | j| | | |  S )zz
        Probability that position j in ``trg_sentence`` is aligned to
        position i in the ``src_sentence``
        r   )r   r   r   )	r   r,   r-   r8   r9   r)   r*   r<   r;   r   r   r   r1      s
   $zIBMModel2.prob_alignment_pointc           	      C   s   d}t |jd }t |jd }t|jD ]'\}}|dkrq|j| }|j| }|| j| | | j| | | |  9 }qt|tj	S )zc
        Probability of target sentence and an alignment given the
        source sentence
        g      ?r   r   )
r   r8   r9   	enumerater@   r   r   rC   r   r#   )	r   alignment_infoprobr)   r*   r-   r,   trg_wordsrc_wordr   r   r   prob_t_a_given_s   s   

zIBMModel2.prob_t_a_given_sc                 C   s   |D ]}|  | qd S r?   )align)r   r6   sentence_pairr   r   r   r      s   zIBMModel2.align_allc                 C   s   g }t |j}t |j}t|jD ]O\}}| j| d | jd |d  | |  }t|tj}d}t|jD ]"\}	}
| j| |
 | j|	d  |d  | |  }||krX|}|	}q6|	||f qt
||_dS )a  
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        Nr   r   )r   r    r!   rJ   r   r   rC   r   r#   appendr   r@   )r   rQ   best_alignmentr)   r*   r-   rM   	best_probbest_alignment_pointr,   rN   
align_probr   r   r   rP     s*   

zIBMModel2.alignr?   )__name__
__module____qualname____doc__r   r   r   r5   r0   r1   rO   r   rP   __classcell__r   r   r   r   r	   ;   s    ')r	   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r/   zo
    Data object to store counts of various parameters during training.
    Includes counts for alignment.
    c                    s*   t    tdd | _tdd | _d S )Nc                   S      t dd S )Nc                   S   r\   )Nc                   S      t tS r?   r   rH   r   r   r   r   <lambda>3      zKModel2Counts.__init__.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>r   r   r   r   r   r_   3      9Model2Counts.__init__.<locals>.<lambda>.<locals>.<lambda>r   r   r   r   r   r_   3  ra   z'Model2Counts.__init__.<locals>.<lambda>c                   S   r\   )Nc                   S   r]   r?   r^   r   r   r   r   r_   6  r`   rb   r   r   r   r   r   r_   6  ra   )r   r   r   r@   rB   )r   r   r   r   r   0  s   

zModel2Counts.__init__c                 C   s,   | j | |  |7  < | j|  |7  < d S r?   )	t_given_sany_t_given_s)r   r=   r<   r;   r   r   r   r2   9  s   z'Model2Counts.update_lexical_translationc                 C   s<   | j | | | |  |7  < | j| | |  |7  < d S r?   )r@   rB   )r   r=   r,   r-   r)   r*   r   r   r   r3   =  s   zModel2Counts.update_alignment)rW   rX   rY   rZ   r   r2   r3   r[   r   r   r   r   r/   *  s
    	r/   )rZ   r$   collectionsr   nltk.translater   r   r   r   nltk.translate.ibm_modelr   r	   r/   r   r   r   r   <module>   s   , p