o
    i                     @   sd   d dl mZ d dlmZmZmZ d dlmZmZ d dl	m
Z
mZ G dd deZG dd deZd	S )
    )CorpusReader)StreamBackedCorpusViewconcatread_alignedsent_block)RegexpTokenizerWhitespaceTokenizer)AlignedSent	Alignmentc                   @   sL   e Zd ZdZde edddedfddZdd
dZdddZ	dddZ
d	S )AlignedCorpusReaderz
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    /
T)gapslatin1c                 C   s,   t | ||| || _|| _|| _|| _dS )a  
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        N)r   __init___sep_word_tokenizer_sent_tokenizer_alignedsent_block_reader)selfrootfileidssepword_tokenizersent_tokenizeralignedsent_block_readerencoding r   U/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/corpus/reader/aligned.pyr      s
   
zAlignedCorpusReader.__init__Nc                       t  fdd |dD S )z~
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        c              
      *   g | ]\}}t ||d d  j j jqS )FAlignedSentCorpusViewr   r   r   .0fileidencr   r   r   
<listcomp>9       
z-AlignedCorpusReader.words.<locals>.<listcomp>Tr   abspathsr   r   r   r&   r   words2   s
   


zAlignedCorpusReader.wordsc                    r   )z
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        c              
      s*   g | ]\}}t ||d d j j jqS )FTr    r"   r&   r   r   r'   O   r(   z-AlignedCorpusReader.sents.<locals>.<listcomp>Tr)   r+   r   r&   r   sentsG   s
   


zAlignedCorpusReader.sentsc                    r   )zp
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        c              
      r   )Tr    r"   r&   r   r   r'   c   r(   z5AlignedCorpusReader.aligned_sents.<locals>.<listcomp>Tr)   r+   r   r&   r   aligned_sents]   s
   


z!AlignedCorpusReader.aligned_sents)N)__name__
__module____qualname____doc__r   r   r   r   r,   r-   r.   r   r   r   r   r
      s    	



r
   c                   @   s    e Zd ZdZdd Zdd ZdS )r!   z
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    c                 C   s2   || _ || _|| _|| _|| _tj| ||d d S )N)r   )_aligned_group_by_sentr   r   r   r   r   )r   corpus_filer   alignedgroup_by_sentr   r   r   r   r   r   r   y   s   
zAlignedSentCorpusView.__init__c                    sd    fdd  |D } jr"td|d |d< t| g}|S  jr,|d g}|S |d }|S )Nc                    s*   g | ]} j |D ]} j|q
qS r   )r   tokenizer   )r#   alignedsent_strsent_strr&   r   r   r'      s    

z4AlignedSentCorpusView.read_block.<locals>.<listcomp>    r   )r   r3   r	   
fromstringjoinr   r4   )r   streamblockr   r&   r   
read_block   s   


z AlignedSentCorpusView.read_blockN)r/   r0   r1   r2   r   rA   r   r   r   r   r!   r   s    r!   N)nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   nltk.tokenizer   r   nltk.translater   r	   r
   r!   r   r   r   r   <module>   s   `