o
    i                     @   st   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZ dZG dd deZdS )	    N)PIPE)_java_optionsconfig_javafind_jarjava)CoreNLPParser)
TokenizerIz1https://nlp.stanford.edu/software/tokenizer.shtmlc                   @   sF   e Zd ZdZdZ					dddZed	d
 Zdd ZdddZ	dS )StanfordTokenizeraF  
    Interface to the Stanford Tokenizer

    >>> from nltk.tokenize.stanford import StanfordTokenizer
    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    >>> StanfordTokenizer().tokenize(s) # doctest: +SKIP
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> s = "The colour of the wall is blue."
    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP
    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    zstanford-postagger.jarNutf8F-mx1000mc                 C   sf   t jtdtdd t| j|ddt|d| _|| _|| _	|d u r"i n|}d
dd	 | D | _d S )
Nzz
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'   )
stacklevel)STANFORD_POSTAGGER )env_vars
searchpathurlverbose,c                 s   s"    | ]\}}| d | V  qdS )=Nr   ).0keyvalr   r   Q/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/nltk/tokenize/stanford.py	<genexpr>E   s     z-StanfordTokenizer.__init__.<locals>.<genexpr>)warningswarnstrDeprecationWarningr   _JAR_stanford_url_stanford_jar	_encodingjava_optionsjoinitems_options_cmd)selfpath_to_jarencodingoptionsr   r#   r   r   r   __init__%   s&   	
	zStanfordTokenizer.__init__c                 C   s   |   S )N)
splitlines)sr   r   r   _parse_tokenized_outputG   s   z)StanfordTokenizer._parse_tokenized_outputc                 C   s   dg}|  | ||S )zW
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        z%edu.stanford.nlp.process.PTBTokenizer)r.   _execute)r'   r-   cmdr   r   r   tokenizeK   s   zStanfordTokenizer.tokenizec           
      C   s   | j }|d|g | j}|r|d| jg dt}t| j|d tjddd3}t	|t
r7|r7||}|| |  ||j t|| jttd\}}	||}W d    n1 s`w   Y  t|j t|dd |S )	Nz-charsetz-options )r*   r   wbF)modedelete)	classpathstdoutstderr)r"   extendr&   r$   r   r   r#   tempfileNamedTemporaryFile
isinstancer   encodewriteflushappendnamer   r!   r   decodeosunlink)
r'   r0   input_r   r)   r&   default_options
input_filer7   r8   r   r   r   r/   R   s*   




zStanfordTokenizer._execute)Nr
   NFr   )F)
__name__
__module____qualname____doc__r   r+   staticmethodr.   r1   r/   r   r   r   r   r	      s    
"
r	   )jsonrC   r:   r   
subprocessr   nltk.internalsr   r   r   r   nltk.parse.corenlpr   nltk.tokenize.apir   r    r	   r   r   r   r   <module>   s   