o
    i~0                     @   s  d Z ddlZddlZddlmZ ddlmZmZ zddl	m
Z
 W n	 ey)   Y nw ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd deZG dd deZdd Zdd Zdd Zd#ddZdd Zdd ZG dd deZd$d d!Z	 e d"kred ed dS dS )%z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScorefind)word_tokenize)Treec                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )NEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    Nc                 C   s   t j| || j|d d S )N)trainclassifier_builder
classifier)r   __init___classifier_builder)selfr   r    r   R/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/chunk/named_entity.pyr   $   s   
zNEChunkParserTagger.__init__c                 C   s   t j|ddddS )Niis      )	algorithmgaussian_prior_sigmatrace)r   r   r   r   r   r   r   r   ,   s   z'NEChunkParserTagger._classifier_builderc                 C   sD   z| j }W |S  ty!   ddlm} t|d| _ | j }Y |S w )Nr   )wordszen-basic)_en_wordlistAttributeErrornltk.corpusr   set)r   wlr   r   r   r   _english_wordlist5   s   z%NEChunkParserTagger._english_wordlistc                 C   sj  || d }t || d }|dkr!d  }}d  }}	d  }
 }}na|dkrJ||d  d  }d }t ||d  d }d }	||d  d }d  }
}n8||d  d  }||d  d  }t ||d  d }t ||d  d }	||d  }||d  }t|}
|t|d krd  }}d  }}nI|t|d kr||d  d  }||d  d  }d }d }n(||d  d  }||d  d  }||d  d  }||d  d  }i dddt|dt|d|d d	  d
|dd   d|d|d||  v d|d|d|d|d|d|  d| d| d| d|
 d| }|S )Nr   r   r   biasTshapewordlenprefix3   suffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextpos+zpos+prevtagzshape+prevtag)simplify_poslowerr$   lenr"   )r   tokensindexhistoryr+   r*   r/   prevprevwordr-   prevprevpos	prevshaper,   prevprevtagr0   nextnextwordr.   nextnextposfeaturesr   r   r   _feature_detector?   s   

	
z%NEChunkParserTagger._feature_detector)NN)__name__
__module____qualname____doc__r   r   r"   r?   r   r   r   r   r      s    
	
r   c                   @   s<   e Zd ZdZdd Zdd Zdd Zdd	 Zed
d Z	dS )NEChunkParser2
    Expected input: list of pos-tagged words
    c                 C   s   |  | d S N)_trainr   r   r   r   r      s   zNEChunkParser.__init__c                 C   s   | j |}| |}|S )z8
        Each token should be a pos-tagged word
        )_taggertag_tagged_to_parse)r   r5   taggedtreer   r   r   parse   s   
zNEChunkParser.parsec                    s"    fdd|D }t |d _d S )Nc                    s   g | ]}  |qS r   )_parse_to_tagged).0sr   r   r   
<listcomp>       z(NEChunkParser._train.<locals>.<listcomp>)r   )r   rH   )r   corpusr   rQ   r   rG      s   zNEChunkParser._trainc                 C   s   t dg }|D ]P\}}|dkr|| q|dr(|t |dd |g q|drW|rJt|d t rJ|d  |dd krJ|d | q|t |dd |g q|S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        SOB-r   NI-)r   append
startswith
isinstancelabel)r   tagged_tokenssenttokrI   r   r   r   rJ      s   


*zNEChunkParser._tagged_to_parsec                 C   s   g }| D ]=}t |tr:t|dkrtd q||d d|  f |dd D ]}||d|  f q*q||df q|S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencerW   r   NrX   rV   )r\   r   r4   printrZ   r]   )r_   tokschildr`   r   r   r   rN      s   
zNEChunkParser._parse_to_taggedN)
r@   rA   rB   rC   r   rM   rG   rJ   staticmethodrN   r   r   r   r   rD   z   s    rD   c                 C   sX   t d| t jr
dS t d| t jrdS t d| t jr*|  r"dS |  r(dS dS d	S )
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$upcasedowncase	mixedcaseother)rematchUNICODEistitleislower)r+   r   r   r   r$      s   r$   c                 C   s   |  drdS | dd S )NV-r   )r[   split)rP   r   r   r   r2      s   
r2   c                 C   s   |   }dd t|D }tdg }| D ]+}t|tr6|t| g  |D ]}|d |t|f q'q||t|f q|S )Nc                 s   s    | ]\}}|V  qd S rF   r   )rO   r+   r*   r   r   r   	<genexpr>   s    zpostag_tree.<locals>.<genexpr>rU   rY   )leavesr   r   r\   rZ   r]   next)rL   r   tag_iternewtreerc   subchildr   r   r   postag_tree   s   

ry   binaryTc                 c   sd    | D ],}t |D ]$\}}}|dr|rq
|D ]}|dr-tt j|||E d H  qq
qd S )Nbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmt
skip_bnewsrootdirsfilesfr   r   r   load_ace_data   s   
r   c                 c   s   t dtj| d   | d }g }t|}t| }W d    n1 s*w   Y  |dD ]2}|	dj
}|dD ]$}|ddkrKqAt|	d	j
}	t|	d
j
d }
||	|
|f qAq4t| }| }W d    n1 szw   Y  tdd|}dd }td||}tdd|}tdd|}tdd|}dd |D }|dkrd}tdg }t|D ]+\}	}
}|	|k r|}	|
|	krq|t|||	  |td||	|
   |
}q|t||d   |V  d S |dkrHd}tdg }t|D ]/\}	}
}|	|k r|}	|
|	krq|t|||	  |t|||	|
   |
}q|t||d   |V  d S td)Nz  - r   z.tmx.rdc.xmlzdocument/entityentity_typeentity_mentionTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c                 S   s   d|   |   d  S )N    )endstart)mr   r   r   subfunc   s   zload_ace_file.<locals>.subfuncz[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" c                 S   s   h | ]\}}}|qS r   r   )rO   rP   etypr   r   r   	<setcomp>  rS   z load_ace_file.<locals>.<setcomp>rz   r   rU   NE
multiclasszbad fmt value)ra   r|   r   rr   openETrM   getrootfindallr	   textgetintrZ   readrk   subr   sortedextendr
   
ValueError)textfiler   annfileentitiesinfilexmlentityr   mentionrP   r   r   r   entity_typesirb   r   r   r   r      sj   









r   c                 C   s   t | } t |}d}t| |D ]B\\}}\}}||  kr#dkrBn n|sAtd|dd|dd|  tdddd d}qd}td|dd|dd|  qd S )	NFrV   z  15r   z  {:15} {:15} {2}z...T)rD   rN   zipra   format)correctguessedellipsiswctgtr   r   r   
cmp_chunks.  s   

 r   c                   @   s*   e Zd ZdZd
ddZdd Zdd Zd	S )Maxent_NE_ChunkerrE   r   c                 C   s0   ddl m} || _|d| d| _|   d S )Nr   r   z+chunkers/maxent_ne_chunker_tab/english_ace_/)	nltk.datar	   _fmt_tab_dirload_params)r   r   r	   r   r   r   r   E  s   zMaxent_NE_Chunker.__init__c                 C   sF   ddl m}m} || j\}}}}t||||d|}t|d| _d S )Nr   )BinaryMaxentFeatureEncodingload_maxent_params)alwayson_features)r   )nltk.classify.maxentr   r   r   r   r   rH   )r   r   r   wgtmpglabaonmcr   r   r   r   L  s   zMaxent_NE_Chunker.load_paramsc           	      C   sV   ddl m} | jj}|j}|j}|j}|j}|j}| j	}|||||d| dd d S )Nr   )save_maxent_paramsz/tmp/english_ace_r   )tab_dir)
r   r   rH   _classifier	_encoding_weights_mapping_labels	_alwaysonr   )	r   r   classifecgr   r   r   r   r   r   r   r   save_paramsU  s   zMaxent_NE_Chunker.save_paramsNr   )r@   rA   rB   rC   r   r   r   r   r   r   r   r   @  s
    
	r   r   c                 C   s   t | }|  |S rF   )r   r   )r   chunkerr   r   r   build_modelb  s   r   __main__)rz   Tr   )!rC   r|   rk   	xml.etreer   r   nltk.tagr   r   nltk.classifyr   ImportErrornltk.chunk.apir   nltk.chunk.utilr   r   r	   nltk.tokenizer
   	nltk.treer   r   rD   r$   r2   ry   r   r   r   r   r   r@   r   r   r   r   <module>   s<   [;

I
"*