o
    i6                     @   s   d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	 dd Z
G dd	 d	e	ZG d
d dZG dd dZG dd deZdS )z9
A reader for corpora whose documents are in MTE format.
    N)reduce)TaggedCorpusReaderconcat)XMLCorpusViewc                 C   s   |  ||S N)findall)rootpathns r   Q/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/corpus/reader/mte.pyxpath   s   r   c                   @   s$   e Zd ZdZdddZdddZdS )	MTECorpusViewz0
    Class for lazy viewing the MTE Corpus.
    Nc                 C   s   t | ||| d S r   )r   __init__)selffileidtagspecelt_handlerr   r   r   r      s   zMTECorpusView.__init__c              	   C   s   t tdd t| |||S )Nc                 S      | d uS r   r   xr   r   r   <lambda>       z*MTECorpusView.read_block.<locals>.<lambda>)listfilterr   
read_block)r   streamr   r   r   r   r   r      s   zMTECorpusView.read_blockr   )NN)__name__
__module____qualname____doc__r   r   r   r   r   r   r      s    
r   c                   @   s   e Zd ZdZdddZdZdZdZdZd	Z	d
d Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0S )1MTEFileReaderz
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    zhttps://www.tei-c.org/ns/1.0z%https://www.w3.org/XML/1998/namespace)teixmlz{https://www.tei-c.org/ns/1.0}z'{https://www.w3.org/XML/1998/namespace}zTEI/text/body/div/div/p/s/(w|c)zTEI/text/body/div/div/p/szTEI/text/body/div/div/pc                 C   s
   || _ d S r   )_MTEFileReader__file_path)r   	file_pathr   r   r   r   3   s   
zMTEFileReader.__init__c                 C   s   |j S r   )textclseltcontextr   r   r   	_word_elt6   s   zMTEFileReader._word_eltc                        fddt |d jD S )Nc                       g | ]}  |d qS r   )r+   .0wr(   r   r   
<listcomp><       z+MTEFileReader._sent_elt.<locals>.<listcomp>*r   r
   r'   r   r1   r   	_sent_elt:      zMTEFileReader._sent_eltc                    r,   )Nc                    r-   r   )r6   r/   sr1   r   r   r2   @   r3   z+MTEFileReader._para_elt.<locals>.<listcomp>r4   r5   r'   r   r1   r   	_para_elt>   r7   zMTEFileReader._para_eltc                 C   s   d|j vr
|jdfS | jdkr| jdkr|j|j d fS | jdkr1| jdkr1|jt|j d fS tdtdd| j d }|	|j d r`| jdkrU|j|j d fS |jt|j d fS d S )	Nana msd	universal^-.z.*$)
attribr&   _MTEFileReader__tags_MTEFileReader__tagsetMTETagConvertermsd_to_universalrecompilesubmatch)r(   r)   r*   tagsr   r   r   _tagged_word_eltB   s   


zMTEFileReader._tagged_word_eltc                    *   t tdd  fddt|d jD S )Nc                 S   r   r   r   r   r   r   r   r   \   r   z0MTEFileReader._tagged_sent_elt.<locals>.<lambda>c                    r-   r   )rL   r.   r1   r   r   r2   ]   r3   z2MTEFileReader._tagged_sent_elt.<locals>.<listcomp>r4   r   r   r   r
   r'   r   r1   r   _tagged_sent_eltX      zMTEFileReader._tagged_sent_eltc                    rM   )Nc                 S   r   r   r   r   r   r   r   r   e   r   z0MTEFileReader._tagged_para_elt.<locals>.<lambda>c                    r-   r   )rO   r8   r1   r   r   r2   f   r3   z2MTEFileReader._tagged_para_elt.<locals>.<listcomp>r4   rN   r'   r   r1   r   _tagged_para_elta   rP   zMTEFileReader._tagged_para_eltc                 C   s$   d|j vr
|jdfS |j|j d fS )Nlemmar<   )rB   r&   r'   r   r   r   _lemma_word_eltj   s   

zMTEFileReader._lemma_word_eltc                    r,   )Nc                    r-   r   )rS   r.   r1   r   r   r2   s   r3   z1MTEFileReader._lemma_sent_elt.<locals>.<listcomp>r4   r5   r'   r   r1   r   _lemma_sent_eltq   r7   zMTEFileReader._lemma_sent_eltc                    r,   )Nc                    r-   r   )rT   r8   r1   r   r   r2   w   r3   z1MTEFileReader._lemma_para_elt.<locals>.<listcomp>r4   r5   r'   r   r1   r   _lemma_para_eltu   r7   zMTEFileReader._lemma_para_eltc                 C      t | jtjtjS r   )r   r$   r!   	word_pathr+   r   r   r   r   wordsy      zMTEFileReader.wordsc                 C   rV   r   )r   r$   r!   	sent_pathr6   rX   r   r   r   sents~   rZ   zMTEFileReader.sentsc                 C   rV   r   )r   r$   r!   	para_pathr:   rX   r   r   r   paras   rZ   zMTEFileReader.parasc                 C   rV   r   )r   r$   r!   rW   rS   rX   r   r   r   lemma_words   rZ   zMTEFileReader.lemma_wordsc                 C      |t _|t _t| jt jt jS r   )r!   rD   rC   r   r$   rW   rL   r   tagsetrK   r   r   r   tagged_words   
   zMTEFileReader.tagged_wordsc                 C   rV   r   )r   r$   r!   r[   rT   rX   r   r   r   lemma_sents   rZ   zMTEFileReader.lemma_sentsc                 C   r`   r   )r!   rD   rC   r   r$   r[   rO   ra   r   r   r   tagged_sents   rd   zMTEFileReader.tagged_sentsc                 C   rV   r   )r   r$   r!   r]   rU   rX   r   r   r   lemma_paras   rZ   zMTEFileReader.lemma_parasc                 C   r`   r   )r!   rD   rC   r   r$   r]   rQ   ra   r   r   r   tagged_paras   rd   zMTEFileReader.tagged_parasN)r   r   r   r    r
   tag_nsxml_nsrW   r[   r]   r   classmethodr+   r6   r:   rL   rO   rQ   rS   rT   rU   rY   r\   r^   r_   rc   re   rf   rg   rh   r   r   r   r   r!   "   sL    








r!   c                   @   s:   e Zd ZdZdddddddd	d
ddddZedd ZdS )rE   zu
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    ADJADPADVCONJDETNOUNNUMPRTPRONVERBrA   X)ASRCDNMQPVrA   r@   c                 C   s4   | d dks
| d n| d }|t jvrd}t j| S )z
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book

        Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so
        r   #   r@   )rE   mapping_msd_universal)tag	indicatorr   r   r   rF      s   

z MTETagConverter.msd_to_universalN)r   r   r   r    r   staticmethodrF   r   r   r   r   rE      s"    rE   c                   @   s|   e Zd ZdZdddZdd Zddd	Zdd
dZdddZdddZ	dddZ
dddZdddZdddZdddZdS )MTECorpusReaderz
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    Nutf8c                 C   s   t | ||| d| _dS )a.  
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        z00README.txtN)r   r   _readme)r   r   fileidsencodingr   r   r   r      s   
zMTECorpusReader.__init__c                    sP   |d u r j }nt|tr|g}t fdd|}tdd |}|s&td |S )Nc                    s
   |  j v S r   )_fileidsr   rX   r   r   r      s   
 z+MTECorpusReader.__fileids.<locals>.<lambda>c                 S   s   | dvS )N)zoana-bg.xmlzoana-mk.xmlr   r   r   r   r   r      r   z$No valid multext-east file specified)r   
isinstancestrr   printr   r   r   rX   r   	__fileids   s   
zMTECorpusReader.__fileidsc                       t  fdd |D S )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        c                    $   g | ]}t tj j| qS r   )r!   osr	   join_rootrY   r/   frX   r   r   r2          z)MTECorpusReader.words.<locals>.<listcomp>r   _MTECorpusReader__fileidsr   r   rX   r   rY      s
   
zMTECorpusReader.wordsc                    r   )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        c                    r   r   )r!   r   r	   r   r   r\   r   rX   r   r   r2   
  r   z)MTECorpusReader.sents.<locals>.<listcomp>r   r   r   rX   r   r\     
   
zMTECorpusReader.sentsc                    r   )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        c                    r   r   )r!   r   r	   r   r   r^   r   rX   r   r   r2     r   z)MTECorpusReader.paras.<locals>.<listcomp>r   r   r   rX   r   r^     r   zMTECorpusReader.parasc                    r   )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        c                    r   r   )r!   r   r	   r   r   r_   r   rX   r   r   r2   &  r   z/MTECorpusReader.lemma_words.<locals>.<listcomp>r   r   r   rX   r   r_     r   zMTECorpusReader.lemma_wordsr=   r<   c                    <   dksdkrt  fdd |D S td dS )a;  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        r>   r=   c                    (   g | ]}t tj j|qS r   )r!   r   r	   r   r   rc   r   r   rK   rb   r   r   r2   9      z0MTECorpusReader.tagged_words.<locals>.<listcomp>Unknown tagset specified.Nr   r   r   r   r   rb   rK   r   r   r   rc   ,     	zMTECorpusReader.tagged_wordsc                    r   )aB  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        c                    r   r   )r!   r   r	   r   r   re   r   rX   r   r   r2   L  r   z/MTECorpusReader.lemma_sents.<locals>.<listcomp>r   r   r   rX   r   re   C  
   
zMTECorpusReader.lemma_sentsc                    r   )aH  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
                 each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        r>   r=   c                    r   r   )r!   r   r	   r   r   rf   r   r   r   r   r2   _  r   z0MTECorpusReader.tagged_sents.<locals>.<listcomp>r   Nr   r   r   r   r   rf   R  r   zMTECorpusReader.tagged_sentsc                    r   )am  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        c                    r   r   )r!   r   r	   r   r   rg   r   rX   r   r   r2   r  r   z/MTECorpusReader.lemma_paras.<locals>.<listcomp>r   r   r   rX   r   rg   i  r   zMTECorpusReader.lemma_parasc                    r   )a  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        r>   r=   c                    r   r   )r!   r   r	   r   r   rh   r   r   r   r   r2     r   z0MTECorpusReader.tagged_paras.<locals>.<listcomp>r   Nr   r   r   r   r   rh   x  s   	zMTECorpusReader.tagged_paras)NNr   r   )Nr=   r<   )r   r   r   r    r   r   rY   r\   r^   r_   rc   re   rf   rg   rh   r   r   r   r   r      s    








r   )r    r   rG   	functoolsr   nltk.corpus.readerr   r   nltk.corpus.reader.xmldocsr   r   r   r!   rE   r   r   r   r   r   <module>   s     %