o
    i.                     @   s   d dl T d dlmZ edZedZedZedZedZ	edZ
ed	ZG d
d deZG dd deeZdS )    )*)XMLCorpusReaderz<p(?: [^>]*){0,1}>(.*?)</p>z<s(?: [^>]*){0,1}>(.*?)</s>z#<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>z!<[wc](?: [^>]*){0,1}>(.*?)</[wc]>ztype="(.*?)"zana="(.*?)"ztext id="(.*?)"c                   @   s0   e Zd Z			d
ddZdZdd Zdd	 ZdS )TEICorpusViewNr   c                 C   s,   || _ || _|| _|| _tj| ||d d S )N)startpos)_tagged_textids_group_by_sent_group_by_paraStreamBackedCorpusView__init__)selfcorpus_filetaggedgroup_by_sentgroup_by_paratagsethead_lentextids r   T/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/corpus/reader/pl196x.pyr      s
   
zTEICorpusView.__init__i   c                 C   sv  | | j}t|}|d|dks|ddkr;| }t|dkr&n||7 }|d|dks|ddks|dd}t|}| j	ru|D ])}|| j	vrt|
|d }||d  
dtd }|d | ||| d   }qKg }t|D ]<}	g }
t|	D ]$}| jst|}ntt| jt|}| jr|
| q|
| q| jr||
 q|||
 q||S )Nz<text idz</text>r   
    )	readlines	_pagesizeconcatcountreadlinelenreplaceTEXTIDfindallr   findPARASENTr   WORDlistmap
_parse_tag
TAGGEDWORDr   appendextendr	   )r   streamblocktmpr   tidbegendoutputpara_strparasent_strsentr   r   r   
read_block,   sN   

zTEICorpusView.read_blockc                 C   sB   |\}}| drt|d}||fS t|d}||fS )Nwr   )
startswithANAsearchgroupTYPE)r   tag_word_tupletagwordr   r   r   r(   S   s   
zTEICorpusView._parse_tag)Nr   N)__name__
__module____qualname__r   r   r7   r(   r   r   r   r   r      s    
'r   c                   @   s   e Zd ZdZdd Zdd Zdd Zdd	d
Zdd ZdddZ	dddZ
dddZdddZdddZdddZdddZdddZdS ) Pl196xCorpusReaderi
  c                 O   sD   d|v r
|d | _ nd | _ tj| g|R   t| | |   d S )Ntextid_file)r   r   r   CategorizedCorpusReader_init_textids)r   argskwargsr   r   r   r   _   s   zPl196xCorpusReader.__init__c                 C   s   t t| _t t| _| jd urVt| j8}|D ],}| }|dd\}}||  vr4t	d| j|f || j
D ]}| || q:qW d    d S 1 sOw   Y  d S d S )N r   z(In text_id mapping file %s: %s not found)defaultdictr&   _f2t_t2fr   openstripsplitfileids
ValueError
_delimiter_add_textids)r   fplinefile_idtext_idstext_idr   r   r   rG   j   s&   


"z Pl196xCorpusReader._init_textidsc                 C   s$   | j | | | j| | d S N)rL   r*   rM   )r   rW   rY   r   r   r   rT   z   s   zPl196xCorpusReader._add_textidsNc                    s   d }t ttdd |||fdkrtd|d ur|d fS |d ur) |d fS |d urZt|tr5|g}t fdd|D g }t }|D ]}t	 j
| t	|@ ||< qF||fS d S )Nc                 S   s   | d u S rZ   r   )accessorr   r   r   <lambda>   s    z-Pl196xCorpusReader._resolve.<locals>.<lambda>r   z6Specify exactly one of: fileids, categories or textidsc                 3       | ]} j | V  qd S rZ   )rM   ).0tr   r   r   	<genexpr>       z.Pl196xCorpusReader._resolve.<locals>.<genexpr>)r   r&   filterrR   rQ   
isinstancestrsumdictsetrL   )r   rQ   
categoriesr   r.   filestdictfr   r`   r   _resolve~   s6   

zPl196xCorpusReader._resolvec                 C   s   |S rZ   r   )r   r?   r   r   r   
decode_tag   s   zPl196xCorpusReader.decode_tagc                    sN     ||\}}|du rt jS t|tr|g}tt fdd|D g S )an  
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        Nc                 3   r]   rZ   )rL   )r^   dr`   r   r   ra      rb   z-Pl196xCorpusReader.textids.<locals>.<genexpr>)rm   sortedrM   rd   re   rf   r   rQ   ri   _r   r`   r   r      s   

zPl196xCorpusReader.textidsc                    d     ||\}|d u r j}nt|tr|g}r't fdd|D S t fdd|D S )Nc              
      ,   g | ]}t  |d d d  j| dqS )Fr   r   r   abspathr   r^   fileidr   r   r   r   
<listcomp>       	z,Pl196xCorpusReader.words.<locals>.<listcomp>c              	      &   g | ]}t  |d d d  jdqS )Fr   rv   rx   r`   r   r   r{      s    rm   _fileidsrd   re   r   r   rQ   ri   r   r   rz   r   words   s    
	
zPl196xCorpusReader.wordsc                    rs   )Nc              
      s,   g | ]}t  |d dd  j| dqS FTru   rv   rx   rz   r   r   r{      r|   z,Pl196xCorpusReader.sents.<locals>.<listcomp>c              	      s&   g | ]}t  |d dd  jdqS FTr~   rv   rx   r`   r   r   r{          r   r   r   rz   r   sents       
	
zPl196xCorpusReader.sentsc                    rs   )Nc              
      ,   g | ]}t  |d dd j| dqS r   rv   rx   rz   r   r   r{      r|   z,Pl196xCorpusReader.paras.<locals>.<listcomp>c              	      &   g | ]}t  |d dd jdqS r   rv   rx   r`   r   r   r{     r   r   r   r   rz   r   paras   r   zPl196xCorpusReader.parasc                    rs   )Nc              
      r   TFru   rv   rx   rz   r   r   r{     r|   z3Pl196xCorpusReader.tagged_words.<locals>.<listcomp>c              	      r   TFr~   rv   rx   r`   r   r   r{   *  r   r   r   r   rz   r   tagged_words  r   zPl196xCorpusReader.tagged_wordsc                    rs   )Nc              
      s,   g | ]}t  |d d d j| dqS r   rv   rx   rz   r   r   r{   ;  r|   z3Pl196xCorpusReader.tagged_sents.<locals>.<listcomp>c              	      s&   g | ]}t  |d d d jdqS r   rv   rx   r`   r   r   r{   I  r   r   r   r   rz   r   tagged_sents2  r   zPl196xCorpusReader.tagged_sentsc                    rs   )Nc              
      rt   )Tru   rv   rx   rz   r   r   r{   Z  r|   z3Pl196xCorpusReader.tagged_paras.<locals>.<listcomp>c              	      r}   )Tr~   rv   rx   r`   r   r   r{   h  r   r   r   r   rz   r   tagged_parasQ  r   zPl196xCorpusReader.tagged_parasc                 C   s4   |  ||\}}t|dkrt| |d S td)Nr   r   zExpected a single file)rm   r   r   xml	TypeErrorrq   r   r   r   r   p  s   zPl196xCorpusReader.xmlrZ   )NN)NNN)rA   rB   rC   r   r   rG   rT   rm   rn   r   r   r   r   r   r   r   r   r   r   r   r   rD   \   s    
 


#



rD   N)nltk.corpus.reader.apinltk.corpus.reader.xmldocsr   recompiler#   r$   r)   r%   r=   r:   r    r
   r   rF   rD   r   r   r   r   <module>   s   






E