U
    e#                     @   st   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZ dZG dd deZdS )	    N)PIPE)_java_optionsconfig_javafind_jarjava)CoreNLPParser)
TokenizerIz1https://nlp.stanford.edu/software/tokenizer.shtmlc                   @   s<   e Zd ZdZdZdddZed	d
 Zdd ZdddZ	dS )StanfordTokenizeraF  
    Interface to the Stanford Tokenizer

    >>> from nltk.tokenize.stanford import StanfordTokenizer
    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    >>> StanfordTokenizer().tokenize(s) # doctest: +SKIP
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> s = "The colour of the wall is blue."
    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP
    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    zstanford-postagger.jarNutf8F-mx1000mc                 C   sf   t jtdtdd t| j|ddt|d| _|| _|| _	|d krDi n|}d
dd	 | D | _d S )
Nzz
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'   )
stacklevel)ZSTANFORD_POSTAGGER )Zenv_varsZ
searchpathurlverbose,c                 s   s    | ]\}}| d | V  qdS )=Nr   ).0keyvalr   r   W/var/www/html/assets/scripts/venv/lib/python3.8/site-packages/nltk/tokenize/stanford.py	<genexpr>E   s     z-StanfordTokenizer.__init__.<locals>.<genexpr>)warningswarnstrDeprecationWarningr   _JAR_stanford_url_stanford_jar	_encodingjava_optionsjoinitems_options_cmd)selfZpath_to_jarencodingoptionsr   r    r   r   r   __init__%   s&    	
	zStanfordTokenizer.__init__c                 C   s   |   S )N)
splitlines)sr   r   r   _parse_tokenized_outputG   s    z)StanfordTokenizer._parse_tokenized_outputc                 C   s   dg}|  | ||S )zW
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        z%edu.stanford.nlp.process.PTBTokenizer)r*   _execute)r$   r)   cmdr   r   r   tokenizeK   s    zStanfordTokenizer.tokenizec           
   	   C   s   | j }|d|g | j}|r.|d| jg dt}t| j|d tjddd\}t	|t
rn|rn||}|| |  ||j t|| jttd\}}	||}W 5 Q R X t|j t|dd |S )	Nz-charsetz-options )r&   r   wbF)modedelete)Z	classpathstdoutstderr)r   extendr#   r!   r   r   r    tempfileNamedTemporaryFile
isinstancer   encodewriteflushappendnamer   r   r   decodeosunlink)
r$   r,   input_r   r%   r#   Zdefault_optionsZ
input_filer2   r3   r   r   r   r+   R   s.    


   
zStanfordTokenizer._execute)Nr
   NFr   )F)
__name__
__module____qualname____doc__r   r'   staticmethodr*   r-   r+   r   r   r   r   r	      s        
"
r	   )jsonr>   r5   r   
subprocessr   Znltk.internalsr   r   r   r   Znltk.parse.corenlpr   Znltk.tokenize.apir   r   r	   r   r   r   r   <module>	   s   