o
    i$                     @   sb   d Z ddlmZ ddlmZmZmZ G dd deZdddZG d	d
 d
e	Z
G dd deZdS )zACorpus reader for the XML version of the British National Corpus.    )concat)ElementTreeXMLCorpusReaderXMLCorpusViewc                   @   sT   e Zd ZdZdddZdddZdd	d
ZdddZdddZdddZ	dd Z
dS )BNCCorpusReadera7  Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    Tc                 C   s   t | || || _d S N)r   __init___lazy)selfrootfileidslazy r   Q/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/corpus/reader/bnc.pyr      s   
zBNCCorpusReader.__init__NFc                 C      |  |dd||S )aT  
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        FN_viewsr
   r   strip_spacestemr   r   r   words#   s   
zBNCCorpusReader.wordsc                 C   s   |rdnd}|  |d|||S )a   
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        c5posFr   r
   r   r   r   r   tagr   r   r   tagged_words/   s   zBNCCorpusReader.tagged_wordsc                 C   r   )a  
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        TNr   r   r   r   r   sents?   s   zBNCCorpusReader.sentsc                 C   s    |rdnd}| j |d|||dS )a  
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        r   r   T)sentr   r   r   r   r   r   r   r   tagged_sentsL   s   
zBNCCorpusReader.tagged_sentsc                    s4   | j rtn| j t fdd| |D S )zPA helper function that instantiates BNCWordViews or the list of words/sentences.c                    s   g | ]
} |qS r   r   ).0fileidfr   r   r   r   r   r   
<listcomp>a   s    z*BNCCorpusReader._views.<locals>.<listcomp>)r	   BNCWordView_wordsr   abspaths)r
   r   r   r   r   r   r   r!   r   r   ]   s   zBNCCorpusReader._viewsc              	   C   s   g }t | }|dD ]U}g }	t|D ]9}
|
j}|sd}|s#|r'| }|r/|
d|}|dkr;||
df}n|dkrJ||
d|
df}|	| q|r^|t	|j
d |	 q||	 qd|vsjJ |S )a  
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.//s hwr   r   nN)r   parsegetrootfindall_all_xmlwords_intextstripgetappendBNCSentenceattribextend)r
   r    bracket_sentr   r   r   resultxmldocxmlsentr   xmlwordwordr   r   r   r%   g   s,   zBNCCorpusReader._words)T)NTF)NFTF)NFFTF)__name__
__module____qualname____doc__r   r   r   r   r   r   r%   r   r   r   r   r      s    






r   Nc                 C   s:   |d u rg }| D ]}|j dv r|| qt|| q|S )N)cw)r   r1   r-   )eltr6   childr   r   r   r-      s   
r-   c                   @   s   e Zd ZdZdd ZdS )r2   z
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    c                 C   s   || _ t| | d S r   )numlistr   )r
   rC   itemsr   r   r   r      s   zBNCSentence.__init__N)r;   r<   r=   r>   r   r   r   r   r   r2      s    r2   c                   @   sB   e Zd ZdZh dZ	 dd Zdd Zdd Zd	d
 Zdd Z	dS )r$   zN
    A stream backed corpus view specialized for use with the BNC corpus.
    >   pbgapaligneventpauseshiftvocalunclearc                 C   s|   |rd}nd}|| _ || _|| _|| _d| _d| _d| _d| _t	| || | 
  | | jd| j |   ddi| _dS )aG  
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.*/sz.*/s/(.*/)?(c|w)Nz.*/teiHeader$r   r   )_sent_tag_strip_space_stemtitleauthoreditorrespsr   r   _open
read_block_streamhandle_headerclose_tag_context)r
   r    r   r   r   r   tagspecr   r   r   r      s    zBNCWordView.__init__c                 C   s   | d}|rddd |D | _| d}|r$ddd |D | _| d}|r6ddd |D | _| d	}|rJd
dd |D | _d S d S )NztitleStmt/title
c                 s       | ]}|j  V  qd S r   r.   r/   )r   rR   r   r   r   	<genexpr>       z,BNCWordView.handle_header.<locals>.<genexpr>ztitleStmt/authorc                 s   r^   r   r_   )r   rS   r   r   r   r`      ra   ztitleStmt/editorc                 s   r^   r   r_   )r   rT   r   r   r   r`      ra   ztitleStmt/respStmtz

c                 s   s$    | ]}d  dd |D V  qdS )r]   c                 s   r^   r   r_   )r   resp_eltr   r   r   r`      ra   z6BNCWordView.handle_header.<locals>.<genexpr>.<genexpr>N)join)r   respr   r   r   r`      s    
)r,   rc   rR   rS   rT   rU   )r
   rA   contexttitlesauthorseditorsrU   r   r   r   rY      s   




zBNCWordView.handle_headerc                 C   s   | j r| |S | |S r   )rN   handle_senthandle_word)r
   rA   re   r   r   r   
handle_elt   s   

zBNCWordView.handle_eltc                 C   st   |j }|sd}| js| jr| }| jr|d|}| jdkr(||df}|S | jdkr8||d|df}|S )Nr'   r(   r   r   )r.   rP   rQ   r/   r0   rO   )r
   rA   r:   r   r   r   rj      s   

zBNCWordView.handle_wordc                    sv   g }|D ].}|j dv r| fdd|D 7 }q|j dv r%| | q|j  jvr2td|j  qt|jd |S )N)mwhicorrtruncc                    s   g | ]}  |qS r   )rj   )r   r@   r
   r   r   r#     s    z+BNCWordView.handle_sent.<locals>.<listcomp>)r@   r?   zUnexpected element %sr)   )r   r1   rj   tags_to_ignore
ValueErrorr2   r3   )r
   rA   r   rB   r   rp   r   ri      s   

zBNCWordView.handle_sentN)
r;   r<   r=   r>   rq   r   rY   rk   rj   ri   r   r   r   r   r$      s    
 r$   r   )r>   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   r   r   r-   rD   r2   r$   r   r   r   r   <module>   s   
