Sentence tokenizer
Contents
Sentence tokenizer#
This tutorial is available as an IPython notebook at Malaya/example/tokenizer-sentence.
[1]:
%%time
import malaya
CPU times: user 6.15 s, sys: 1.22 s, total: 7.36 s
Wall time: 8.81 s
Sentence tokenizer#
We consider prefixes, suffixes, sentence starters, acronyms, websites, emails, digits, text preceding digits, titles, times and months when deciding where to split a string into multiple sentences.
class SentenceTokenizer:
    """Rule-based sentence splitter.

    The splitting heuristics (prefixes, suffixes, acronyms, emails, digits,
    titles, etc.) are applied in :meth:`tokenize`; the constructor takes no
    arguments and stores no state.
    """

    def __init__(self):
        # No configuration required — all heuristics live in `tokenize`.
        pass
[2]:
# Instantiate the rule-based sentence tokenizer.
s_tokenizer = malaya.tokenizer.SentenceTokenizer()
Tokenize#
def tokenize(self, string, minimum_length=5):
    """
    Split ``string`` into a list of sentence strings.

    Parameters
    ----------
    string : str
        Text to split into sentences.
    minimum_length : int, optional (default=5)
        Minimum length, in characters, for a candidate segment to be
        accepted as a sentence on its own (implementation not shown here —
        presumably shorter fragments are merged or dropped; confirm against
        the library source).

    Returns
    -------
    result: List[str]
    """
[3]:
s = """
no. 1 polis bertemu dengan suspek di ladang getah. polis tembak pui pui pui bertubi tubi
"""
s_tokenizer.tokenize(s)
[3]:
['no.1 polis bertemu dengan suspek di ladang getah.',
'polis tembak pui pui pui bertubi tubi.']
[4]:
s = """
email saya di husein.zol01@gmail.com, nanti jom berkopi
"""
s_tokenizer.tokenize(s)
[4]:
['email saya di husein.zol01@gmail.com, nanti jom berkopi.']
[5]:
s = """
ke. 2 cerita nya begini. saya berjalan jalan ditepi muara jumpa anak dara.
"""
s_tokenizer.tokenize(s)
[5]:
['ke.2 cerita nya begini.',
'saya berjalan jalan ditepi muara jumpa anak dara.']
[6]:
s = """
ke. 2 cerita nya begini. aku jumpa ybhg. dr. syed tadi, sakai gila
"""
s_tokenizer.tokenize(s)
[6]:
['ke.2 cerita nya begini.', 'aku jumpa ybhg. dr. syed tadi, sakai gila.']