Sentence tokenizer#

This tutorial is available as an IPython notebook at Malaya/example/tokenizer-sentence.

[1]:
%%time
import malaya
CPU times: user 6.15 s, sys: 1.22 s, total: 7.36 s
Wall time: 8.81 s

Sentence tokenizer#

We consider prefixes, suffixes, sentence starters, acronyms, websites, emails, digits (including text preceding digits), titles, time and month expressions when splitting a string into multiple sentences.

class SentenceTokenizer:
    """
    Rule-based sentence tokenizer.

    Splits a raw string into a list of sentences using heuristics for
    prefixes, suffixes, starters, acronyms, websites, emails, digits,
    titles, time and month (see the tutorial text above the class).
    NOTE(review): the heuristics themselves are implemented elsewhere in
    the library; only the constructor stub is visible here.
    """

    def __init__(self):
        # No configuration is taken at construction time in this stub.
        pass
[2]:
s_tokenizer = malaya.tokenizer.SentenceTokenizer()

Tokenize#

def tokenize(self, string: str, minimum_length: int = 5):
    """
    Tokenize string into multiple strings (sentences).

    Parameters
    ----------
    string : str
        Raw text to split into sentences.
    minimum_length: int, optional (default=5)
        minimum number of characters for a chunk to be treated as a
        sentence; shorter fragments are presumably merged or dropped —
        TODO confirm against the implementation, which is not shown here.

    Returns
    -------
    result: List[str]
        List of sentences extracted from ``string``.
    """
[3]:
s = """
no. 1 polis bertemu dengan suspek di ladang getah. polis tembak pui pui pui bertubi tubi
"""
s_tokenizer.tokenize(s)
[3]:
['no.1 polis bertemu dengan suspek di ladang getah.',
 'polis tembak pui pui pui bertubi tubi.']
[4]:
s = """
email saya di husein.zol01@gmail.com, nanti jom berkopi
"""
s_tokenizer.tokenize(s)
[4]:
['email saya di husein.zol01@gmail.com, nanti jom berkopi.']
[5]:
s = """
ke. 2 cerita nya begini. saya berjalan jalan ditepi muara jumpa anak dara.
"""
s_tokenizer.tokenize(s)
[5]:
['ke.2 cerita nya begini.',
 'saya berjalan jalan ditepi muara jumpa anak dara.']
[6]:
s = """
ke. 2 cerita nya begini. aku jumpa ybhg. dr. syed tadi, sakai gila
"""
s_tokenizer.tokenize(s)
[6]:
['ke.2 cerita nya begini.', 'aku jumpa ybhg. dr. syed tadi, sakai gila.']