o
    i                     @   sv   d dl Z d dlmZmZ d dlmZmZmZ d dlm	Z	 dd Z
G dd deZd	d
 Zdd Zedkr9e  dS dS )    N)CorpusReaderSyntaxCorpusReader)FileSystemPathPointerfind_corpus_fileidsread_blankline_block)DependencyGraphc                 C   s   d dd | D S )N/c                 s   s$    | ]}|d  dkr|d  V  qdS )r   EOSN .0mr
   r
   R/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/corpus/reader/knbc.py	<genexpr>   s   " z<lambda>.<locals>.<genexpr>joinmorphsr
   r
   r   <lambda>   s    r   c                   @   s@   e Zd ZdZdefddZdd Zdd Zdd
dZdd Z	d	S )KNBCorpusReadera  
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    utf8c                 C   s   t | ||| || _dS )z
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        N)r   __init__
morphs2str)selfrootfileidsencodingr   r
   r
   r   r   7   s   
zKNBCorpusReader.__init__c                 C   s   t |S N)r   )r   streamr
   r
   r   _read_block@   s   zKNBCorpusReader._read_blockc                 C   s>   g }|  D ]}td|s| d}||d  q|S )NEOS|\*|\#|\+ r   )
splitlinesrematchstripsplitappend)r   treslinecellsr
   r
   r   _wordD   s   zKNBCorpusReader._wordNc              	   C   sP   g }|  D ]}td|s%| d}||d d|dd  f q|S )Nr    r!   r      )r"   r#   r$   r%   r&   r'   r   )r   r(   tagsetr)   r*   r+   r
   r
   r   _tagO   s    zKNBCorpusReader._tagc           
      C   s*  t  }d}| D ]s}|d dv rV| dd}td|d }|d us'J |j| }|||dg d t	|d}|d	krG||_
n
|j| d
 | |d7 }q	|d dkr|| d}|d d|dd  f}	|j|d  d |	 q	| jr|j D ]}| |d |d< q| S )Nr   z*+r!      z([\-0-9]*)([ADIP])r-      )addressrelworddeps#r4   )r   r"   r%   r&   r#   r$   nodesupdategroupintr   r'   r   r   valuestree)
r   r(   dgir*   r+   r   node
dep_parentmorphr
   r
   r   _parseZ   s.   

zKNBCorpusReader._parser   )
__name__
__module____qualname____doc___morphs2str_defaultr   r   r,   r/   rC   r
   r
   r
   r   r      s    	
r   c                  C   s   dd l } ddlm} | jd}dd tt|dD }dd }|d	tt||d
dd}t	|
 d d  t	d| d d  t	ddd | d d D  dd |_t	ddd | d d D  t	ddd | dd D  d S )Nr   LazyCorpusLoaderzcorpora/knbc/corpus1c                 S   s   g | ]
}t d |r|qS )z\d\-\d\-[\d]+\-[\d]+)r#   search)r   fr
   r
   r   
<listcomp>   s    
zdemo.<locals>.<listcomp>z.*c                 S   s2   |  d}|d t|d t|d t|d fS )N-r   r-   r1   r0   )r&   r;   )xr+   r
   r
   r   _knbc_fileids_sort   s   
(z demo.<locals>._knbc_fileids_sortknbc/corpus1)keyeuc-jpr   
    d   z

c                 s   s    | ]}t |V  qd S r   )strr   r=   r
   r
   r   r          zdemo.<locals>.<genexpr>r1   c                 S   s   d dd | D dS )Nr   c                 s   s:    | ]}|d  dkrd |d  |d dd V  qdS )r   r	   z{}({})r-   r!   r1   Nformatr&   r   r
   r
   r   r      s    0z)demo.<locals>.<lambda>.<locals>.<genexpr>zutf-8)r   encoder   r
   r
   r   r      s   
 zdemo.<locals>.<lambda>c                 s   s    | ]}d | V  qdS )z%sNr
   rY   r
   r
   r   r      rZ   
c                 s   s$    | ]}d  dd |D V  qdS )r!   c                 s   s.    | ]}d  |d |d dd V  qdS )z{}/{}r   r-   r!   r1   Nr[   )r   wr
   r
   r   r      s   , z!demo.<locals>.<genexpr>.<genexpr>Nr   )r   sentr
   r
   r   r      s
    
)nltknltk.corpus.utilrJ   datafindr   r   r   sortedprintr   r   wordsparsed_sentsr   tagged_sents)ra   rJ   r   r   rP   knbcr
   r
   r   demo   s.   
$
$
rk   c                  C   s   ddl m}  | dtddd}t| d tsJ t| d d ts&J t| d ts1J t|	 d d ts>J d S )Nr   rI   rQ   z.*/KN.*rS   rT   )
rb   rJ   r   
isinstancerg   rX   sentstagged_wordstupleri   )rJ   rj   r
   r
   r   test   s   rp   __main__)r#   nltk.corpus.reader.apir   r   nltk.corpus.reader.utilr   r   r   
nltk.parser   rH   r   rk   rp   rD   r
   r
   r
   r   <module>   s   	l)
