"""
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
"""
from typing import List, Tuple

import pytest

from nltk.tokenize import (
    LegalitySyllableTokenizer,
    StanfordSegmenter,
    SyllableTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    punkt,
    sent_tokenize,
    word_tokenize,
)


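# Probe for the Stanford segmenter: default_config() raises LookupError
# when stanford-segmenter.jar cannot be found.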
def load_stanford_segmenter():
    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
        return True
    except LookupError:
        return False


check_stanford_segmenter = pytest.mark.skipif(
    not load_stanford_segmenter(),
    reason="NLTK was unable to find stanford-segmenter.jar.",
)


class TestTokenize:
    def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """

        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
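        # strip_handles removes the leading "@myke" handle, so only the ":"
        # after it remains in the expected tokens.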
        s9 = "@myke: Let's test these words: resumÃ© EspaÃ±a MÃ¼nchen franÃ§ais"
        tokens = tokenizer.tokenize(s9)
        expected = [
            ":",
            "Let's",
            "test",
            "these",
            "words",
            ":",
            "resumÃ©",
            "EspaÃ±a",
            "MÃ¼nchen",
            "franÃ§ais",
        ]
        assert tokens == expected

    @pytest.mark.parametrize(
        "test_input, expecteds",
        [
            (
                "My text 0106404243030 is great text",
                (
                    ["My", "text", "01064042430", "30", "is", "great", "text"],
                    ["My", "text", "0106404243030", "is", "great", "text"],
                ),
            ),
            (
                "My ticket id is 1234543124123",
                (
                    ["My", "ticket", "id", "is", "12345431241", "23"],
                    ["My", "ticket", "id", "is", "1234543124123"],
                ),
            ),
            (
                "@remy: This is waaaaayyyy too much for you!!!!!! 01064042430",
                (
                    [
                        ":",
                        "This",
                        "is",
                        "waaayyy",
                        "too",
                        "much",
                        "for",
                        "you",
                        "!",
                        "!",
                        "!",
                        "01064042430",
                    ],
                    [
                        ":",
                        "This",
                        "is",
                        "waaayyy",
                        "too",
                        "much",
                        "for",
                        "you",
                        "!",
                        "!",
                        "!",
                        "01064042430",
                    ],
                ),
            ),
            # Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085,
            # showing the TweetTokenizer performance for `match_phone_numbers=True` and
            # `match_phone_numbers=False`.
            (
                # Some phone numbers are always tokenized, even with `match_phone_numbers=False`
                "My number is 06-46124080, except it's not.",
                (
                    [
                        "My",
                        "number",
                        "is",
                        "06-46124080",
                        ",",
                        "except",
                        "it's",
                        "not",
                        ".",
                    ],
                    [
                        "My",
                        "number",
                        "is",
                        "06-46124080",
                        ",",
                        "except",
                        "it's",
                        "not",
                        ".",
                    ],
                ),
            ),
            (
                # Phone number here is only tokenized correctly if `match_phone_numbers=True`
                "My number is 601-984-4813, except it's not.",
                (
                    [
                        "My",
                        "number",
                        "is",
                        "601-984-4813",
                        ",",
                        "except",
                        "it's",
                        "not",
                        ".",
                    ],
                    [
                        "My",
                        "number",
                        "is",
                        "601-984-",
                        "4813",
                        ",",
                        "except",
                        "it's",
                        "not",
                        ".",
                    ],
                ),
            ),
            (
                # Phone number here is only tokenized correctly if `match_phone_numbers=True`
                "My number is (393)  928 -3010, except it's not.",
                (
                    [
                        "My",
                        "number",
                        "is",
                        "(393)  928 -3010",
                        ",",
                        "except",
                        "it's",
                        "not",
                        ".",
                    ],
                    [
                        "My",
                        "number",
                        "is",
                        "(",
                        "393",
                        ")",
                        "928",
                        "-",
                        "3010",
                        ",",
                        "except",
                        "it's",
                        "not",
                        ".",
                    ],
                ),
            ),
            (
                # A long number is tokenized correctly only if `match_phone_numbers=False`
                "The product identification number is 48103284512.",
                (
                    [
                        "The",
                        "product",
                        "identification",
                        "number",
                        "is",
                        "4810328451",
                        "2",
                        ".",
                    ],
                    [
                        "The",
                        "product",
                        "identification",
                        "number",
                        "is",
                        "48103284512",
                        ".",
                    ],
                ),
            ),
            (
                # `match_phone_numbers=True` can have some unforeseen side effects
                "My favourite substraction is 240 - 1353.",
                (
                    ["My", "favourite", "substraction", "is", "240 - 1353", "."],
                    ["My", "favourite", "substraction", "is", "240", "-", "1353", "."],
                ),
            ),
        ],
    )
    def test_tweet_tokenizer_expanded(
        self, test_input: str, expecteds: Tuple[List[str], List[str]]
    ):
        """
        Test `match_phone_numbers` in TweetTokenizer.

        Note that TweetTokenizer is also passed the following for these tests:
            * strip_handles=True
            * reduce_len=True

        :param test_input: The input string to tokenize using TweetTokenizer.
        :type test_input: str
        :param expecteds: A 2-tuple of expected token lists. The first list is the
            expected output of tokenization with `match_phone_numbers=True`.
            The second list is the expected output of tokenization with
            `match_phone_numbers=False`.
        :type expecteds: Tuple[List[str], List[str]]
        """
        for match_phone_numbers, expected in zip([True, False], expecteds):
            tokenizer = TweetTokenizer(
                strip_handles=True,
                reduce_len=True,
                match_phone_numbers=match_phone_numbers,
            )
            predicted = tokenizer.tokenize(test_input)
            assert predicted == expected

    def test_sonority_sequencing_syllable_tokenizer(self):
        """
        Test the sonority sequencing SyllableTokenizer.
        """
        tokenizer = SyllableTokenizer()
        tokens = tokenizer.tokenize("justification")
        assert tokens == ["jus", "ti", "fi", "ca", "tion"]

    def test_syllable_tokenizer_numbers(self):
        """
        Test SyllableTokenizer on a long string of digits.
        """
        tokenizer = SyllableTokenizer()
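        # A long, digit-only string should come back unchanged as a single token.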
        text = "9" * 10000
        tokens = tokenizer.tokenize(text)
        assert tokens == [text]

    def test_legality_principle_syllable_tokenizer(self):
        """
        Test the LegalitySyllableTokenizer.
        """
        from nltk.corpus import words

        test_word = "wonderful"
        tokenizer = LegalitySyllableTokenizer(words.words())
        tokens = tokenizer.tokenize(test_word)
        assert tokens == ["won", "der", "ful"]

    @check_stanford_segmenter
    def test_stanford_segmenter_arabic(self):
        """
        Test the Stanford Word Segmenter for Arabic (default config)
        """
        seg = StanfordSegmenter()
        seg.default_config("ar")
        sent = "ÙŠØ¨Ø­Ø« Ø¹Ù„Ù… Ø§Ù„Ø­Ø§Ø³ÙˆØ¨ Ø§Ø³ØªØ®Ø¯Ø§Ù… Ø§Ù„Ø­ÙˆØ³Ø¨Ø© Ø¨Ø¬Ù…ÙŠØ¹ Ø§Ø´ÙƒØ§Ù„Ù‡Ø§ Ù„Ø­Ù„ Ø§Ù„Ù…Ø´ÙƒÙ„Ø§Øª"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == [
            "ÙŠØ¨Ø­Ø«",
            "Ø¹Ù„Ù…",
            "Ø§Ù„Ø­Ø§Ø³ÙˆØ¨",
            "Ø§Ø³ØªØ®Ø¯Ø§Ù…",
            "Ø§Ù„Ø­ÙˆØ³Ø¨Ø©",
            "Ø¨",
            "Ø¬Ù…ÙŠØ¹",
            "Ø§Ø´ÙƒØ§Ù„",
            "Ù‡Ø§",
            "Ù„",
            "Ø­Ù„",
            "Ø§Ù„Ù…Ø´ÙƒÙ„Ø§Øª",
        ]

    @check_stanford_segmenter
    def test_stanford_segmenter_chinese(self):
        """
        Test the Stanford Word Segmenter for Chinese (default config)
        """
        seg = StanfordSegmenter()
        seg.default_config("zh")
        sent = "è¿™æ˜¯æ–¯å¦ç¦ä¸­æ–‡åˆ†è¯å™¨æµ‹è¯•"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]

    def test_phone_tokenizer(self):
        """
        Test a string that resembles a phone number but contains a newline
        """

        # Should be recognized as a phone number, albeit one with multiple spaces
        tokenizer = TweetTokenizer()
        test1 = "(393)  928 -3010"
        expected = ["(393)  928 -3010"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # Due to newline, first three elements aren't part of a phone number;
        # fourth is
        test2 = "(393)\n928 -3010"
        expected = ["(", "393", ")", "928 -3010"]
        result = tokenizer.tokenize(test2)
        assert result == expected

    def test_emoji_tokenizer(self):
        """
        Test strings that contain Emoji ZWJ Sequences and skin tone modifiers
        """
        tokenizer = TweetTokenizer()

        # An Emoji ZWJ Sequence joins several code points into a single emoji and should not be split.
        test1 = "👨‍👩‍👧‍👧"
        expected = ["👨‍👩‍👧‍👧"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # An emoji with a skin tone modifier: the two code points form a single emoji and should not be split.
        test2 = "👨🏿"
        expected = ["👨🏿"]
        result = tokenizer.tokenize(test2)
        assert result == expected

        # A string containing both skin tone modifiers and ZWJ Sequences
        test3 = "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
        expected = [
            "ðŸ¤”",
            "ðŸ™ˆ",
            "me",
            "asÃ­",
            ",",
            "se",
            "ðŸ˜Œ",
            "ds",
            "ðŸ’•",
            "ðŸ‘­",
            "ðŸ‘™",
            "hello",
            "ðŸ‘©ðŸ¾\u200dðŸŽ“",
            "emoji",
            "hello",
            "ðŸ‘¨\u200dðŸ‘©\u200dðŸ‘¦\u200dðŸ‘¦",
            "how",
            "are",
            "ðŸ˜Š",
            "you",
            "today",
            "ðŸ™…ðŸ½",
            "ðŸ™…ðŸ½",
        ]
        result = tokenizer.tokenize(test3)
        assert result == expected

        # emoji flag sequences, including enclosed letter pairs
        # Expected behavior from #3034
        test4 = "ðŸ‡¦ðŸ‡µðŸ‡µðŸ‡±ðŸ‡ª"
        expected = ["ðŸ‡¦ðŸ‡µ", "ðŸ‡µðŸ‡±", "ðŸ‡ª"]
        result = tokenizer.tokenize(test4)
        assert result == expected

        test5 = "Hi ðŸ‡¨ðŸ‡¦, ðŸ˜!!"
        expected = ["Hi", "ðŸ‡¨ðŸ‡¦", ",", "ðŸ˜", "!", "!"]
        result = tokenizer.tokenize(test5)
        assert result == expected

        test6 = "&lt;3 ðŸ‡¨ðŸ‡¦ ðŸ¤ ðŸ‡µðŸ‡± &lt;3"
        expected = ["&lt;3", "ðŸ‡¨ðŸ‡¦", "ðŸ¤", "ðŸ‡µðŸ‡±", "&lt;3"]
        result = tokenizer.tokenize(test6)
        assert result == expected

    def test_pad_asterisk(self):
        """
        Test padding of asterisk for word tokenization.
        """
        text = "This is a, *weird sentence with *asterisks in it."
        expected = [
            "This",
            "is",
            "a",
            ",",
            "*",
            "weird",
            "sentence",
            "with",
            "*",
            "asterisks",
            "in",
            "it",
            ".",
        ]
        assert word_tokenize(text) == expected

    def test_pad_dotdot(self):
        """
        Test padding of dotdot* for word tokenization.
        """
        text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
        expected = [
            "Why",
            "did",
            "dotdot",
            "..",
            "not",
            "get",
            "tokenized",
            "but",
            "dotdotdot",
            "...",
            "did",
            "?",
            "How",
            "about",
            "manydots",
            ".....",
        ]
        assert word_tokenize(text) == expected

    def test_remove_handle(self):
        """
        Test remove_handle() from casual.py with specially crafted edge cases
        """

        tokenizer = TweetTokenizer(strip_handles=True)

        # Simple example. Handles with just numbers should be allowed
        test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
        expected = ["hello", ".", "hi"]
        result = tokenizer.tokenize(test1)
        assert result == expected

        # Handles are allowed to follow any of the following characters
        test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n&lt;@n&gt;@n @n\n@n Ã±@n.Ã¼@n.Ã§@n."
        expected = [
            "`",
            "~",
            "(",
            ")",
            "-",
            "=",
            "+",
            "\\",
            "|",
            "[",
            "]",
            "{",
            "}",
            ";",
            ":",
            "'",
            '"',
            "/",
            "?",
            ".",
            ",",
            "&lt;",
            "&gt;",
            "Ã±",
            ".",
            "Ã¼",
            ".",
            "Ã§",
            ".",
        ]
        result = tokenizer.tokenize(test2)
        assert result == expected

        # Handles are NOT allowed to follow any of the following characters
        test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &amp;@n *@n"
        expected = [
            "a",
            "@n",
            "j",
            "@n",
            "z",
            "@n",
            "A",
            "@n",
            "L",
            "@n",
            "Z",
            "@n",
            "1",
            "@n",
            "4",
            "@n",
            "7",
            "@n",
            "9",
            "@n",
            "0",
            "@n",
            "_",
            "@n",
            "!",
            "@n",
            "@",
            "@n",
            "#",
            "@n",
            "$",
            "@n",
            "%",
            "@n",
            "&amp;",
            "@n",
            "*",
            "@n",
        ]
        result = tokenizer.tokenize(test3)
        assert result == expected

        # Handles are allowed to precede the following characters
        test4 = "@n!a @n#a @n$a @n%a @n&amp;a @n*a"
        expected = ["!", "a", "#", "a", "$", "a", "%", "a", "&amp;", "a", "*", "a"]
        result = tokenizer.tokenize(test4)
        assert result == expected

        # Tests interactions with special symbols and multiple @
        test5 = "@n!@n @n#@n @n$@n @n%@n @n&amp;@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
        expected = [
            "!",
            "@n",
            "#",
            "@n",
            "$",
            "@n",
            "%",
            "@n",
            "&amp;",
            "@n",
            "*",
            "@n",
            "@n",
            "@n",
            "@",
            "@n",
            "@n",
            "@",
            "@n",
            "@n_",
            "@n",
            "@n7",
            "@n",
            "@nj",
            "@n",
        ]
        result = tokenizer.tokenize(test5)
        assert result == expected

        # Tests that handles can have a max length of 15
        test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle"
        expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"]
        result = tokenizer.tokenize(test6)
        assert result == expected

        # Edge case where an @ comes directly after a long handle
        test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde"
        expected = [
            "p",
            "@abcde",
            "@abcdefghijklmno",
            "@abcde",
            "_",
            "@abcde",
            "5",
            "@abcde",
        ]
        result = tokenizer.tokenize(test7)
        assert result == expected

    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """

        tokenizer = TreebankWordTokenizer()
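        # span_tokenize yields (start, end) character offsets into the input
        # string, one pair per token.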

        # Test case in the docstring
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [
            (0, 4),
            (5, 12),
            (13, 17),
            (18, 19),
            (19, 23),
            (24, 26),
            (27, 30),
            (31, 32),
            (32, 36),
            (36, 37),
            (37, 38),
            (40, 46),
            (47, 48),
            (48, 51),
            (51, 52),
            (53, 55),
            (56, 59),
            (60, 62),
            (63, 68),
            (69, 70),
            (70, 76),
            (76, 77),
            (77, 78),
        ]
        result = list(tokenizer.span_tokenize(test1))
        assert result == expected

        # Test case with double quotation
        test2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues'
        expected = [
            (0, 3),
            (4, 7),
            (8, 10),
            (11, 18),
            (19, 21),
            (22, 25),
            (26, 27),
            (27, 36),
            (37, 42),
            (42, 43),
            (44, 46),
            (47, 50),
            (51, 57),
            (58, 64),
            (65, 68),
            (69, 74),
            (75, 76),
            (77, 85),
            (86, 92),
            (93, 95),
            (96, 102),
            (103, 109),
        ]
        result = list(tokenizer.span_tokenize(test2))
        assert result == expected

        # Test case with double quotation as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [
            (0, 3),
            (4, 7),
            (8, 10),
            (11, 18),
            (19, 21),
            (22, 25),
            (26, 27),
            (27, 36),
            (37, 42),
            (42, 43),
            (44, 46),
            (47, 50),
            (51, 57),
            (58, 64),
            (65, 68),
            (69, 74),
            (75, 76),
            (77, 79),
            (79, 87),
            (87, 89),
            (90, 96),
            (97, 99),
            (100, 106),
            (107, 113),
        ]
        result = list(tokenizer.span_tokenize(test3))
        assert result == expected

    def test_word_tokenize(self):
        """
        Test word_tokenize function
        """

        sentence = "The 'v', I've been fooled but I'll seek revenge."
        expected = [
            "The",
            "'",
            "v",
            "'",
            ",",
            "I",
            "'ve",
            "been",
            "fooled",
            "but",
            "I",
            "'ll",
            "seek",
            "revenge",
            ".",
        ]
        assert word_tokenize(sentence) == expected

        sentence = "'v' 're'"
        expected = ["'", "v", "'", "'re", "'"]
        assert word_tokenize(sentence) == expected

    def test_punkt_pair_iter(self):

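        # punkt._pair_iter yields each element paired with its successor,
        # pairing the final element with None.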
        test_cases = [
            ("12", [("1", "2"), ("2", None)]),
            ("123", [("1", "2"), ("2", "3"), ("3", None)]),
            ("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]),
        ]

        for test_input, expected_output in test_cases:
            actual_output = list(punkt._pair_iter(test_input))

            assert actual_output == expected_output

    def test_punkt_pair_iter_handles_stop_iteration_exception(self):
        # test input to trigger StopIteration from next()
        it = iter([])
        # call method under test and produce a generator
        gen = punkt._pair_iter(it)
        # unpack generator, ensure that no error is raised
        list(gen)

    def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
        obj = punkt.PunktBaseClass()

        class TestPunktTokenizeWordsMock:
            def word_tokenize(self, s):
                return iter([])

        obj._lang_vars = TestPunktTokenizeWordsMock()
        # unpack generator, ensure that no error is raised
        list(obj._tokenize_words("test"))

    def test_punkt_tokenize_custom_lang_vars(self):

        # Create LangVars including a full stop end character as used in Bengali
        class BengaliLanguageVars(punkt.PunktLanguageVars):
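            # "\u0964" is the danda ("।"), the sentence-final punctuation mark
            # used in Bengali and other Indic scripts.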
            sent_end_chars = (".", "?", "!", "\u0964")

        obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())

        # We now expect these sentences to be split up into the individual sentences
        sentences = "à¦‰à¦ªà¦°à¦¾à¦·à§à¦Ÿà§à¦°à¦ªà¦¤à¦¿ à¦¶à§à¦°à§€ à¦à¦® à¦­à§‡à¦™à§à¦•à¦¾à¦‡à§Ÿà¦¾ à¦¨à¦¾à¦‡à¦¡à§ à¦¸à§‹à¦®à¦¬à¦¾à¦° à¦†à¦‡ à¦†à¦‡ à¦Ÿà¦¿ à¦¦à¦¿à¦²à§à¦²à¦¿à¦° à¦¹à§€à¦°à¦• à¦œà§Ÿà¦¨à§à¦¤à§€ à¦‰à¦¦à¦¯à¦¾à¦ªà¦¨à§‡à¦° à¦‰à¦¦à§à¦¬à§‹à¦§à¦¨ à¦•à¦°à§‡à¦›à§‡à¦¨à¥¤ à¦…à¦¨à¦²à¦¾à¦‡à¦¨à§‡à¦° à¦®à¦¾à¦§à§à¦¯à¦®à§‡ à¦à¦‡ à¦…à¦¨à§à¦·à§à¦&nbsp;à¦¾à¦¨à§‡ à¦•à§‡à¦¨à§à¦¦à§à¦°à§€à§Ÿ à¦®à¦¾à¦¨à¦¬ à¦¸à¦®à§à¦ªà¦¦ à¦‰à¦¨à§à¦¨à§Ÿà¦¨à¦®à¦¨à§à¦¤à§à¦°à§€ à¦¶à§à¦°à§€ à¦°à¦®à§‡à¦¶ à¦ªà§‹à¦–à¦°à¦¿à§Ÿà¦¾à¦² â€˜à¦¨à¦¿à¦¶à¦¾à¦™à§à¦•â€™  à¦‰à¦ªà¦¸à§à¦¥à¦¿à¦¤ à¦›à¦¿à¦²à§‡à¦¨à¥¤ à¦à¦‡ à¦‰à¦ªà¦²à¦•à§à¦·à§à¦¯à§‡ à¦‰à¦ªà¦°à¦¾à¦·à§à¦Ÿà§à¦°à¦ªà¦¤à¦¿ à¦¹à§€à¦°à¦•à¦œà§Ÿà¦¨à§à¦¤à§€à¦° à¦²à§‹à¦—à§‹ à¦à¦¬à¦‚ à§¨à§¦à§©à§¦-à¦à¦° à¦œà¦¨à§à¦¯ à¦ªà§à¦°à¦¤à¦¿à¦·à§à¦&nbsp;à¦¾à¦¨à§‡à¦° à¦²à¦•à§à¦·à§à¦¯ à¦“ à¦ªà¦°à¦¿à¦•à¦²à§à¦ªà¦¨à¦¾à¦° à¦¨à¦¥à¦¿ à¦ªà§à¦°à¦•à¦¾à¦¶ à¦•à¦°à§‡à¦›à§‡à¦¨à¥¤"
        expected = [
            "à¦‰à¦ªà¦°à¦¾à¦·à§à¦Ÿà§à¦°à¦ªà¦¤à¦¿ à¦¶à§à¦°à§€ à¦à¦® à¦­à§‡à¦™à§à¦•à¦¾à¦‡à§Ÿà¦¾ à¦¨à¦¾à¦‡à¦¡à§ à¦¸à§‹à¦®à¦¬à¦¾à¦° à¦†à¦‡ à¦†à¦‡ à¦Ÿà¦¿ à¦¦à¦¿à¦²à§à¦²à¦¿à¦° à¦¹à§€à¦°à¦• à¦œà§Ÿà¦¨à§à¦¤à§€ à¦‰à¦¦à¦¯à¦¾à¦ªà¦¨à§‡à¦° à¦‰à¦¦à§à¦¬à§‹à¦§à¦¨ à¦•à¦°à§‡à¦›à§‡à¦¨à¥¤",
            "à¦…à¦¨à¦²à¦¾à¦‡à¦¨à§‡à¦° à¦®à¦¾à¦§à§à¦¯à¦®à§‡ à¦à¦‡ à¦…à¦¨à§à¦·à§à¦&nbsp;à¦¾à¦¨à§‡ à¦•à§‡à¦¨à§à¦¦à§à¦°à§€à§Ÿ à¦®à¦¾à¦¨à¦¬ à¦¸à¦®à§à¦ªà¦¦ à¦‰à¦¨à§à¦¨à§Ÿà¦¨à¦®à¦¨à§à¦¤à§à¦°à§€ à¦¶à§à¦°à§€ à¦°à¦®à§‡à¦¶ à¦ªà§‹à¦–à¦°à¦¿à§Ÿà¦¾à¦² â€˜à¦¨à¦¿à¦¶à¦¾à¦™à§à¦•â€™  à¦‰à¦ªà¦¸à§à¦¥à¦¿à¦¤ à¦›à¦¿à¦²à§‡à¦¨à¥¤",
            "à¦à¦‡ à¦‰à¦ªà¦²à¦•à§à¦·à§à¦¯à§‡ à¦‰à¦ªà¦°à¦¾à¦·à§à¦Ÿà§à¦°à¦ªà¦¤à¦¿ à¦¹à§€à¦°à¦•à¦œà§Ÿà¦¨à§à¦¤à§€à¦° à¦²à§‹à¦—à§‹ à¦à¦¬à¦‚ à§¨à§¦à§©à§¦-à¦à¦° à¦œà¦¨à§à¦¯ à¦ªà§à¦°à¦¤à¦¿à¦·à§à¦&nbsp;à¦¾à¦¨à§‡à¦° à¦²à¦•à§à¦·à§à¦¯ à¦“ à¦ªà¦°à¦¿à¦•à¦²à§à¦ªà¦¨à¦¾à¦° à¦¨à¦¥à¦¿ à¦ªà§à¦°à¦•à¦¾à¦¶ à¦•à¦°à§‡à¦›à§‡à¦¨à¥¤",
        ]

        assert obj.tokenize(sentences) == expected

    def test_punkt_tokenize_no_custom_lang_vars(self):

        obj = punkt.PunktSentenceTokenizer()

        # We expect these sentences to not be split properly, as the Bengali full stop '।' is not included in the default language vars
        sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        expected = [
            "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’  উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
        ]

        assert obj.tokenize(sentences) == expected

    @pytest.mark.parametrize(
        "input_text,n_sents,n_splits,lang_vars",
        [
            # Test debug_decisions on a text with two sentences, split by a dot.
            ("Subject: Some subject. Attachments: Some attachments", 2, 1),
            # The sentence should be split into two sections,
            # with one split and hence one decision.
            # Test debug_decisions on a text with two sentences, split by an exclamation mark.
            ("Subject: Some subject! Attachments: Some attachments", 2, 1),
            # The sentence should be split into two sections,
            # with one split and hence one decision.
            # Test debug_decisions on a text with one sentence,
            # which is not split.
            ("This is just a normal sentence, just like any other.", 1, 0)
            # Hence one sentence and no splits.
        ],
    )
    def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None):
        tokenizer = punkt.PunktSentenceTokenizer()
        if lang_vars is not None:
            tokenizer._lang_vars = lang_vars

        assert len(tokenizer.tokenize(input_text)) == n_sents
        assert len(list(tokenizer.debug_decisions(input_text))) == n_splits

    def test_punkt_debug_decisions_custom_end(self):
        # Test debug_decisions on a text with two sentences,
        # split by a custom end character, based on Issue #2519
        class ExtLangVars(punkt.PunktLanguageVars):
            sent_end_chars = (".", "?", "!", "^")

        self.punkt_debug_decisions(
            "Subject: Some subject^ Attachments: Some attachments",
            n_sents=2,
            n_splits=1,
            lang_vars=ExtLangVars(),
        )
        # The sentence should be split into two sections,
        # with one split and hence one decision.

    @pytest.mark.parametrize(
        "sentences, expected",
        [
            (
                "this is a test. . new sentence.",
                ["this is a test.", ".", "new sentence."],
            ),
            ("This. . . That", ["This.", ".", ".", "That"]),
            ("This..... That", ["This..... That"]),
            ("This... That", ["This... That"]),
            ("This.. . That", ["This.. .", "That"]),
            ("This. .. That", ["This.", ".. That"]),
            ("This. ,. That", ["This.", ",.", "That"]),
            ("This!!! That", ["This!!!", "That"]),
            ("This! That", ["This!", "That"]),
            (
                "1. This is R .\n2. This is A .\n3. That's all",
                ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
            ),
            (
                "1. This is R .\t2. This is A .\t3. That's all",
                ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
            ),
            ("Hello.\tThere", ["Hello.", "There"]),
        ],
    )
    def test_sent_tokenize(self, sentences: str, expected: List[str]):
        assert sent_tokenize(sentences) == expected