U
    ex                     @   sr   d Z ddlmZmZ ddlZddlmZmZmZm	Z	m
Z
mZmZmZ dd Zejje  ddZG d	d
 d
ZdS )zC
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
    )ListTupleN)LegalitySyllableTokenizerStanfordSegmenterSyllableTokenizerTreebankWordTokenizerTweetTokenizerpunktsent_tokenizeword_tokenizec                  C   s<   z t  } | d | d W dS  tk
r6   Y dS X d S )NarzhTF)r   default_configLookupError)seg r   ]/var/www/html/assets/scripts/venv/lib/python3.8/site-packages/nltk/test/unit/test_tokenize.pyload_stanford_segmenter   s    

r   z/NLTK was unable to find stanford-segmenter.jar.)reasonc                   @   s  e Zd Zdd Zejddddddd	d
dgdddd	d
dgffddddd	ddgdddd	dgffdddd	dddddddddgddd	dddddddddgffdddd	dddd d!d"g	ddd	dddd d!d"g	ffd#ddd	d$ddd d!d"g	ddd	d%d&ddd d!d"g
ffd'ddd	d(ddd d!d"g	ddd	d)d*d+d,d-d.ddd d!d"gffd/d0d1d2dd	d3d4d"gd0d1d2dd	d5d"gffd6dd7d8d	d9d"gdd7d8d	d:d-d;d"gffgeee	e e	e f d<d=d>Z
d?d@ ZdAdB ZdCdD ZedEdF ZedGdH ZdIdJ ZdKdL ZdMdN ZdOdP ZdQdR ZdSdT ZdUdV ZdWdX ZdYdZ Zd[d\ Zd]d^ Zd_d` ZejdadbdcddgddfdgZdhdi Zejdjdkdld"dmgfdndod"d"dpgfdqdqgfdrdrgfdsdtdpgfdudodvgfdwdodxdpgfdydzdpgfd{d|dpgfd}d~dddddgfdd~dddddgfdddgfgee	e dddZdeS )TestTokenizec              
   C   sB   t ddd}d}||}ddddddd	d
ddg
}||ks>tdS )zW
        Test TweetTokenizer using words with special and accented characters.
        T)strip_handles
reduce_lenuA   @myke: Let's test these words: resumé España München français:zLet'stestZthesewordsu   resuméu   Españau   Münchenu	   françaisNr   tokenizeAssertionError)self	tokenizerZs9tokensexpectedr   r   r   test_tweet_tokenizer&   s    
z!TestTokenize.test_tweet_tokenizerztest_input, expectedsz#My text 0106404243030 is great textZMytextZ01064042430Z30isZgreatZ0106404243030zMy ticket id is 1234543124123ticketidZ12345431241Z23Z1234543124123z<@remy: This is waaaaayyyy too much for you!!!!!! 01064042430r   ThisZwaaayyyZtooZmuchforyou!z*My number is 06-46124080, except it's not.numberz06-46124080,exceptzit'snot.z+My number is 601-984-4813, except it's not.z601-984-4813z601-984-Z4813z/My number is (393)  928 -3010, except it's not.(393)  928 -3010(393)Z928-Z3010z1The product identification number is 48103284512.TheproductZidentificationZ
48103284512Z48103284512z(My favourite substraction is 240 - 1353.Z	favouriteZsubstractionz
240 - 1353Z240Z1353)
test_input	expectedsc                 C   s@   t ddg|D ],\}}tdd|d}||}||kstqdS )a  
        Test `match_phone_numbers` in TweetTokenizer.

        Note that TweetTokenizer is also passed the following for these tests:
            * strip_handles=True
            * reduce_len=True

        :param test_input: The input string to tokenize using TweetTokenizer.
        :type test_input: str
        :param expecteds: A 2-tuple of tokenized sentences. The first of the two
            tokenized is the expected output of tokenization with `match_phone_numbers=True`.
            The second of the two tokenized lists is the expected output of tokenization
            with `match_phone_numbers=False`.
        :type expecteds: Tuple[List[str], List[str]]
        TF)r   r   match_phone_numbersN)zipr   r   r   )r   r8   r9   r:   r!   r   Z	predictedr   r   r   test_tweet_tokenizer_expanded<   s     E
z*TestTokenize.test_tweet_tokenizer_expandedc                 C   s*   t  }|d}|dddddgks&tdS )3
        Test SyllableTokenizer tokenizer.
        ZjustificationZjustificaZtionNr   r   r   )r   r   r    r   r   r   +test_sonority_sequencing_syllable_tokenizer	  s    
z8TestTokenize.test_sonority_sequencing_syllable_tokenizerc                 C   s*   t  }dd }||}||gks&tdS )r=   9i'  NrA   )r   r   r#   r    r   r   r   test_syllable_tokenizer_numbers  s    
z,TestTokenize.test_syllable_tokenizer_numbersc                 C   s<   ddl m} d}t| }||}|dddgks8tdS )z;
        Test LegalitySyllableTokenizer tokenizer.
        r   )r   Z	wonderfulZwonZderZfulN)Znltk.corpusr   r   r   r   )r   r   Z	test_wordr   r    r   r   r   *test_legality_principle_syllable_tokenizer  s
    
z7TestTokenize.test_legality_principle_syllable_tokenizerc                 C   sN   t  }|d d}|| }| ddddddd	d
ddddgksJtdS )zN
        Test the Stanford Word Segmenter for Arabic (default config)
        r   un   يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلاتu   يبحثu   علمu   الحاسوبu   استخدامu   الحوسبةu   بu   جميعu
   اشكالu   هاu   لu   حلu   المشكلاتNr   r   segmentsplitr   r   r   sentZsegmented_sentr   r   r   test_stanford_segmenter_arabic%  s$    
z+TestTokenize.test_stanford_segmenter_arabicc                 C   sB   t  }|d d}|| }| ddddddgks>td	S )
zO
        Test the Stanford Word Segmenter for Chinese (default config)
        r   u$   这是斯坦福中文分词器测试u   这u   是u	   斯坦福u   中文u	   分词器u   测试NrF   rI   r   r   r   test_stanford_segmenter_chinese=  s
    
z,TestTokenize.test_stanford_segmenter_chinesec                 C   sP   t  }d}dg}||}||ks&td}ddddg}||}||ksLtdS )zT
        Test a string that resembles a phone number but contains a newline
        r0   z(393)
928 -3010r1   r2   r3   z	928 -3010Nr   )r   r   test1r!   resulttest2r   r   r   test_phone_tokenizerH  s    

z!TestTokenize.test_phone_tokenizerc           
      C   s  t  }d}dg}||}||ks&td}dg}||}||ksFtd}dddddd	d
ddddddddddddddddg}||}||kstd}dddg}||}||kstd}dddd d!d!g}||}||kstd"}	d#dd$dd#g}||	}||ks
td%S )&zX
        Test a string that contains Emoji ZWJ Sequences and skin tone modifier
        u   👨‍👩‍👧‍👧u   👨🏿u   🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽u   🤔u   🙈meu   asír,   seu   😌Zdsu   💕u   👭u   👙hellou   👩🏾‍🎓emojiu   👨‍👩‍👦‍👦howZareu   😊r)   todayu   🙅🏽u   🇦🇵🇵🇱🇪u   🇦🇵u   🇵🇱u   🇪u   Hi 🇨🇦, 😍!!ZHiu   🇨🇦u   😍r*   u   <3 🇨🇦 🤝 🇵🇱 <3z<3u   🤝Nr   )
r   r   rM   r!   rN   rO   test3test4test5test6r   r   r   test_emoji_tokenizer[  s`    






z!TestTokenize.test_emoji_tokenizerc                 C   s6   d}dddddddd	dd
dddg}t ||ks2tdS )zA
        Test padding of asterisk for word tokenization.
        z1This is a, *weird sentence with *asterisks in it.r'   r$   ar,   *ZweirdsentencewithZ	asterisksinitr/   Nr   r   r   r#   r!   r   r   r   test_pad_asterisk  s     zTestTokenize.test_pad_asteriskc                 C   s<   d}dddddddd	d
dddddddg}t ||ks8tdS )z@
        Test padding of dotdot* for word tokenization.
        zPWhy did dotdot.. not get tokenized but dotdotdot... did? How about manydots.....ZWhyZdiddotdotz..r.   getZ	tokenizedbutZ	dotdotdotz...?ZHowZaboutZmanydotsz.....Nrb   rc   r   r   r   test_pad_dotdot  s&    zTestTokenize.test_pad_dotdotc              &   C   s  t dd}d}dddg}||}||ks.td}dd	d
ddddddddddddddddddddddddd dg}||}||kstd!}d"d#d$d#d%d#d&d#d'd#d(d#d)d#d*d#d+d#d,d#d-d#d.d#d/d#d0d#d1d#d2d#d3d#d4d#d5d#g&}||}||kstd6}d/d"d1d"d2d"d3d"d4d"d5d"g}||}||ks(td7}d/d#d1d#d2d#d3d#d4d#d5d#d#d#d0d#d#d0d#d8d#d9d#d:d#g}||}||ksztd;}	d<d=d.d>g}||	}||kstd?}
d@dAdBdAd.dAdCdAg}||
}||kstdDS )EzW
        Test remove_handle() from casual.py with specially crafted edge cases
        T)r   z-@twitter hello @twi_tter_. hi @12345 @123newsrS   r/   hiu]   @n`@n~@n(@n)@n-@n=@n+@n\@n|@n[@n]@n{@n}@n;@n:@n'@n"@n/@n?@n.@n,@n<@n>@n @n
@n ñ@n.ü@n.ç@n.`~r1   r3   r4   =+\|[]{};r   '"/rh   r,   <>   ñ   ü   çzKa@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@nr\   z@njzALZ147rC   0_r*   @#$%&r]   z@n!a @n#a @n$a @n%a @n&a @n*azD@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@nz@n_z@n7z@njz^@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandleZpqrstuvwxyz1234Zendofhandlez^@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcdepz@abcdez@abcdefghijklmno5Nr   )r   r   rM   r!   rN   rO   rW   rX   rY   rZ   Ztest7r   r   r   test_remove_handle  s   



(





zTestTokenize.test_remove_handlec                 C   s   t  }d}dddddddd	d
ddddddddddddddg}t||}||ksVtd}ddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/g}t||}||kstd0}ddddddd d!d"d#d$d%d&d'd(d)d*d1d2d3d4d5d6d7g}t||}||kstd8S )9zC
        Test TreebankWordTokenizer.span_tokenize function
        zNGood muffins cost $3.88
in New (York).  Please (buy) me
two of them.
(Thanks).)r      )      )      )      )r      )      )      )       )r   $   )r   %   )r   &   )(   .   )/   0   )r   3   )r   4   )5   7   )8   ;   )<   >   )?   D   )E   F   )r   L   )r   M   )r   N   zmThe DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues)r      )r      )   
   )   r   )r      )      )r   r   )r   r   )r   *   )r   +   ),   r   )r   2   )r   9   ):   @   )A   r   )r   J   )K   r   )r   U   )V   \   )]   _   )`   f   )g   m   zqThe DUP is similar to the "religious right" in the United States and takes a ``hardline'' stance on social issues)r   O   )r   W   )r   Y   )Z   r   )a   c   )d   j   )k   q   N)r   listZspan_tokenizer   )r   r   rM   r!   rN   rO   rW   r   r   r   test_treebank_span_tokenizerd  s    z)TestTokenize.test_treebank_span_tokenizerc                 C   s\   d}ddddddddd	d
dddddg}t ||ks6td}dddddg}t ||ksXtdS )z-
        Test word_tokenize function
        z0The 'v', I've been fooled but I'll seek revenge.r5   rv   vr,   Iz'veZbeenZfooledrg   z'llseekZrevenger/   z'v' 're'z'reNrb   )r   r^   r!   r   r   r   test_word_tokenize  s*    zTestTokenize.test_word_tokenizec                 C   sZ   dddgfddddgfddddd	gfg}|D ](\}}d
d t |D }||ks,tq,d S )NZ12)r   r7   )r7   NZ123)r7   3)r   Nr   )r   r   )r   Nc                 S   s   g | ]}|qS r   r   ).0xr   r   r   
<listcomp>  s     z5TestTokenize.test_punkt_pair_iter.<locals>.<listcomp>)r	   
_pair_iterr   )r   Z
test_casesr8   Zexpected_outputZactual_outputr   r   r   test_punkt_pair_iter  s    
z!TestTokenize.test_punkt_pair_iterc                 C   s   t g }t|}t| d S N)iterr	   r   r   )r   ra   genr   r   r   5test_punkt_pair_iter_handles_stop_iteration_exception  s    
zBTestTokenize.test_punkt_pair_iter_handles_stop_iteration_exceptionc                 C   s0   t  }G dd d}| |_t|d d S )Nc                   @   s   e Zd Zdd ZdS )zkTestTokenize.test_punkt_tokenize_words_handles_stop_iteration_exception.<locals>.TestPunktTokenizeWordsMockc                 S   s   t g S r   )r   )r   sr   r   r   r     s    zyTestTokenize.test_punkt_tokenize_words_handles_stop_iteration_exception.<locals>.TestPunktTokenizeWordsMock.word_tokenizeN)__name__
__module____qualname__r   r   r   r   r   TestPunktTokenizeWordsMock  s   r   r   )r	   ZPunktBaseClass
_lang_varsr   Z_tokenize_words)r   objr   r   r   r   :test_punkt_tokenize_words_handles_stop_iteration_exception  s    zGTestTokenize.test_punkt_tokenize_words_handles_stop_iteration_exceptionc                 C   sD   G dd dt j}t j| d}d}dddg}|||ks@td S )Nc                   @   s   e Zd ZdZdS )zNTestTokenize.test_punkt_tokenize_custom_lang_vars.<locals>.BengaliLanguageVars)r/   rh   r*   u   ।Nr   r   r   Zsent_end_charsr   r   r   r   BengaliLanguageVars  s   r   )	lang_varsc  উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।u  উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।u+  অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন।u/  এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।)r	   PunktLanguageVarsPunktSentenceTokenizerr   r   )r   r   r   	sentencesr!   r   r   r   $test_punkt_tokenize_custom_lang_vars  s    z1TestTokenize.test_punkt_tokenize_custom_lang_varsc                 C   s(   t  }d}dg}|||ks$td S )Nr   )r	   r   r   r   )r   r   r   r!   r   r   r   'test_punkt_tokenize_no_custom_lang_vars  s
    z4TestTokenize.test_punkt_tokenize_no_custom_lang_varsz%input_text,n_sents,n_splits,lang_vars)z4Subject: Some subject. Attachments: Some attachments      )z4Subject: Some subject! Attachments: Some attachmentsr   r   )z4This is just a normal sentence, just like any other.r   r   Nc                 C   sJ   t  }|d kr||_t|||ks,ttt|||ksFtd S r   )r	   r   r   lenr   r   r   Zdebug_decisions)r   Z
input_textn_sentsn_splitsr   r   r   r   r   punkt_debug_decisions   s
    z"TestTokenize.punkt_debug_decisionsc                 C   s*   G dd dt j}| jddd| d d S )Nc                   @   s   e Zd ZdZdS )zGTestTokenize.test_punkt_debug_decisions_custom_end.<locals>.ExtLangVars)r/   rh   r*   ^Nr   r   r   r   r   ExtLangVars<  s   r  z4Subject: Some subject^ Attachments: Some attachmentsr   r   )r   r   r   )r	   r   r   )r   r  r   r   r   %test_punkt_debug_decisions_custom_end9  s    z2TestTokenize.test_punkt_debug_decisions_custom_endzsentences, expectedzthis is a test. . new sentence.zthis is a test.znew sentence.zThis. . . ThatzThis.ZThatzThis..... ThatzThis... ThatzThis.. . ThatzThis.. .zThis. .. Thatz.. ThatzThis. ,. Thatz,.zThis!!! ThatzThis!!!z
This! ThatzThis!z+1. This is R .
2. This is A .
3. That's allz1.zThis is R .z2.zThis is A .z3.z
That's allz+1. This is R .	2. This is A .	3. That's allzHello.	TherezHello.ZThere)r   r!   c                 C   s   t ||kstd S r   )r
   r   )r   r   r!   r   r   r   test_sent_tokenizeH  s    zTestTokenize.test_sent_tokenize)N) r   r   r   r"   pytestmarkZparametrizestrr   r   r<   rB   rD   rE   check_stanford_segmenterrK   rL   rP   r[   rd   ri   r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   %   s  &#  4 	


A a




r   )__doc__typingr   r   r  Znltk.tokenizer   r   r   r   r   r	   r
   r   r   r  Zskipifr  r   r   r   r   r   <module>   s   (
