Word and sentence tokenizer
Contents
Word and sentence tokenizer#
This tutorial is available as an IPython notebook at Malaya/example/tokenizer.
[1]:
%%time
import malaya
CPU times: user 5.84 s, sys: 971 ms, total: 6.81 s
Wall time: 7.56 s
[2]:
string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'
string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'
string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'
string4 = 'pada 10/4, kementerian mengumumkan, 1/100'
string5 = 'Husein Zolkepli (911223-06-2305), dapat tempat ke-12 lumba lari hari ni'
string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'
string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'
string8 = 'online & desktop: regexr.com or download the desktop version for Mac'
string9 = 'belajaq unity di google.us.edi?34535/534534?dfg=g&fg unity'
string10 = 'Gambar ni membantu. Gambar tutorial >>. facebook. com/story. story_fbid=10206183032200965&id=1418962070'
Load word tokenizer#
class Tokenizer:
def __init__(self, **kwargs):
"""
Load Tokenizer object.
Check supported regex pattern at
https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85
Parameters
----------
emojis: bool, optional (default=False)
True to keep emojis.
urls: bool, optional (default=True)
True to keep urls.
urls_improved: bool, optional (default=True)
True to keep urls, better version.
tags: bool, optional (default=True)
True to keep tags: <tag>.
emails: bool, optional (default=True)
True to keep emails.
users: bool, optional (default=True)
True to keep users handles: @cbaziotis.
hashtags: bool, optional (default=True)
True to keep hashtags.
phones: bool, optional (default=True)
True to keep phones.
percents: bool, optional (default=True)
True to keep percents.
money: bool, optional (default=True)
True to keep money expressions.
date: bool, optional (default=True)
True to keep date expressions.
time: bool, optional (default=True)
True to keep time expressions.
time_pukul: bool, optional (default=True)
True to keep time `pukul` expressions.
acronyms: bool, optional (default=True)
True to keep acronyms.
emoticons: bool, optional (default=True)
True to keep emoticons.
censored: bool, optional (default=True)
True to keep censored words: f**k.
emphasis: bool, optional (default=True)
True to keep words with emphasis: *very* good.
numbers: bool, optional (default=True)
True to keep numbers.
temperature: bool, optional (default=True)
True to keep temperatures.
distance: bool, optional (default=True)
True to keep distances.
volume: bool, optional (default=True)
True to keep volumes.
duration: bool, optional (default=True)
True to keep durations.
weight: bool, optional (default=True)
True to keep weights.
hypen: bool, optional (default=True)
True to keep hyphens.
ic: bool, optional (default=True)
True to keep Malaysian IC.
title: bool, optional (default=True)
True to keep title with dot, Dr. ayam -> ['Dr.', 'ayam']
"""
[3]:
tokenizer = malaya.tokenizer.Tokenizer()
[4]:
tokenizer.tokenize(string1)
[4]:
['xjdi',
'ke',
',',
'y',
'u',
'xsuke',
'makan',
'HUSEIN',
'kt',
'situ',
'tmpt',
',',
'i',
'hate',
'it',
'.',
'pelikle',
',',
'pada']
[5]:
tokenizer.tokenize(string2)
[5]:
['i',
'mmg2',
'xske',
'mknn',
'HUSEIN',
'kampng',
'tmpat',
',',
'i',
'love',
'them',
'.',
'pelikle',
'saye']
[6]:
tokenizer.tokenize(string3)
[6]:
['perdana',
'menteri',
'ke11',
'sgt',
'suka',
'makn',
'ayam',
',',
'harganya',
'cuma',
'rm15.50']
[7]:
tokenizer.tokenize(string4)
[7]:
['pada',
'10',
'/',
'4',
',',
'kementerian',
'mengumumkan',
',',
'1',
'/',
'100']
[8]:
tokenizer.tokenize(string5)
[8]:
['Husein',
'Zolkepli',
'(',
'911223-06-2305',
')',
',',
'dapat',
'tempat',
'ke-12',
'lumba',
'lari',
'hari',
'ni']
[9]:
tokenizer.tokenize(string6)
[9]:
['Husein',
'Zolkepli',
'(',
'2011',
'-',
'2019',
')',
'adalah',
'ketua',
'kampng',
'di',
'kedah',
'sekolah',
'King',
'Edward',
'ke-IV']
[10]:
tokenizer.tokenize(string7)
[10]:
['2jam',
'30 minit',
'aku',
'tunggu',
'kau',
',',
'60.1 kg',
'kau',
'ni',
',',
'suhu',
'harini',
'31.2c',
',',
'aku',
'dahaga',
'minum',
'600ml']
[11]:
tokenizer.tokenize(string8)
[11]:
['online',
'&',
'desktop',
':',
'regexr.com',
'or',
'download',
'the',
'desktop',
'version',
'for',
'Mac']
[12]:
tokenizer.tokenize(string9)
[12]:
['belajaq', 'unity', 'di', 'google.us.edi?34535/534534?dfg=g&fg', 'unity']
url#
[13]:
tokenizer.tokenize('website saya http://huseinhouse.com')
[13]:
['website', 'saya', 'http://huseinhouse.com']
[14]:
tokenizer.tokenize('website saya huseinhouse.com')
[14]:
['website', 'saya', 'huseinhouse.com']
[15]:
tokenizer.tokenize('website saya huseinhouse.com/pelik?a=1')
[15]:
['website', 'saya', 'huseinhouse.com/pelik?a=1']
tags#
[16]:
tokenizer.tokenize('panggil saya <husein>')
[16]:
['panggil', 'saya', '<husein>']
[17]:
tokenizer.tokenize('panggil saya <husein >')
[17]:
['panggil', 'saya', '<', 'husein', '>']
emails#
[18]:
tokenizer.tokenize('email saya husein@rumah.com')
[18]:
['email', 'saya', 'husein@rumah.com']
[19]:
tokenizer.tokenize('email saya husein@rumah.com.my')
[19]:
['email', 'saya', 'husein@rumah.com.my']
users#
[20]:
tokenizer.tokenize('twitter saya @husein123zolkepli')
[20]:
['twitter', 'saya', '@husein123zolkepli']
[21]:
tokenizer.tokenize('twitter saya @ husein123zolkepli')
[21]:
['twitter', 'saya', '@', 'husein123zolkepli']
hashtags#
[22]:
tokenizer.tokenize('panggil saya #huseincomel')
[22]:
['panggil', 'saya', '#huseincomel']
[23]:
tokenizer.tokenize('panggil saya # huseincomel')
[23]:
['panggil', 'saya', '#', 'huseincomel']
phones#
[24]:
tokenizer.tokenize('call sye di 013-1234567')
[24]:
['call', 'sye', 'di', '013-1234567']
[25]:
tokenizer.tokenize('call sye di 013- 1234567')
[25]:
['call', 'sye', 'di', '013', '-', '1234567']
percents#
[26]:
tokenizer.tokenize('saya sokong 100%')
[26]:
['saya', 'sokong', '100%']
[27]:
tokenizer.tokenize('saya sokong 100 %')
[27]:
['saya', 'sokong', '100', '%']
money#
[28]:
tokenizer.tokenize('saya tinggal rm100')
[28]:
['saya', 'tinggal', 'rm100']
[29]:
tokenizer.tokenize('saya tinggal rm100k')
[29]:
['saya', 'tinggal', 'rm100k']
[30]:
tokenizer.tokenize('saya tinggal rm100M')
[30]:
['saya', 'tinggal', 'rm100M']
[31]:
tokenizer.tokenize('saya tinggal rm100.123M')
[31]:
['saya', 'tinggal', 'rm100.123M']
[32]:
tokenizer.tokenize('saya tinggal 40 sen')
[32]:
['saya', 'tinggal', '40 sen']
[33]:
tokenizer.tokenize('saya tinggal 21 ringgit 50 sen')
[33]:
['saya', 'tinggal', '21 ringgit', '50 sen']
[73]:
tokenizer.tokenize('saya tinggal 21 juta ringgit')
[73]:
['saya', 'tinggal', '21 juta ringgit']
[74]:
tokenizer.tokenize('saya tinggal rm 2ribu')
[74]:
['saya', 'tinggal', 'rm 2ribu']
[75]:
tokenizer.tokenize('saya tinggal rm2 juta')
[75]:
['saya', 'tinggal', 'rm2 juta']
date#
[34]:
tokenizer.tokenize('tarikh perjumpaan 10/11/2011')
[34]:
['tarikh', 'perjumpaan', '10/11/2011']
[35]:
tokenizer.tokenize('tarikh perjumpaan 10-11-2011')
[35]:
['tarikh', 'perjumpaan', '10-11-2011']
[36]:
tokenizer.tokenize('tarikh perjumpaan 12 mei 2011')
[36]:
['tarikh', 'perjumpaan', '12 mei 2011']
[37]:
tokenizer.tokenize('tarikh perjumpaan mei 12 2011')
[37]:
['tarikh', 'perjumpaan', 'mei 12 2011']
time#
[38]:
tokenizer.tokenize('jumpa 3 am')
[38]:
['jumpa', '3 am']
[39]:
tokenizer.tokenize('jumpa 3.30am')
[39]:
['jumpa', '3.30am']
[40]:
tokenizer.tokenize('jumpa 22:00')
[40]:
['jumpa', '22:00']
[41]:
tokenizer.tokenize('jumpa pukul 2')
[41]:
['jumpa', 'pukul 2']
[42]:
tokenizer.tokenize('jumpa pukul 2.30')
[42]:
['jumpa', 'pukul 2.30']
[43]:
tokenizer.tokenize('jumpa 2.30 pagi')
[43]:
['jumpa', '2.30 pagi']
[44]:
tokenizer.tokenize('jumpa 2.30 ptg')
[44]:
['jumpa', '2.30 ptg']
[45]:
tokenizer.tokenize('jumpa 2.30 malam')
[45]:
['jumpa', '2.30 malam']
[46]:
tokenizer.tokenize('jumpa 2.30 tngahari')
[46]:
['jumpa', '2.30 tngahari']
[47]:
tokenizer.tokenize('jumpa 2:30:00 tngahari')
[47]:
['jumpa', '2:30:00 tngahari']
[48]:
tokenizer.tokenize('jumpa pukul 2:30:00 tngahari')
[48]:
['jumpa', 'pukul 2:30:00', 'tngahari']
temperature#
[52]:
tokenizer.tokenize('sejuk harini, 31.1c')
[52]:
['sejuk', 'harini', ',', '31.1c']
[53]:
tokenizer.tokenize('sejuk harini, 31.1C')
[53]:
['sejuk', 'harini', ',', '31.1C']
distance#
[54]:
tokenizer.tokenize('nak sampai lagi 31km')
[54]:
['nak', 'sampai', 'lagi', '31km']
[55]:
tokenizer.tokenize('nak sampai lagi 31 km')
[55]:
['nak', 'sampai', 'lagi', '31 km']
volume#
[56]:
tokenizer.tokenize('botol ni 400ml')
[56]:
['botol', 'ni', '400ml']
[57]:
tokenizer.tokenize('botol ni 400 l')
[57]:
['botol', 'ni', '400 l']
duration#
[58]:
tokenizer.tokenize('aku dah tunggu kau 2jam kut')
[58]:
['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']
[59]:
tokenizer.tokenize('aku dah tunggu kau 2 jam kut')
[59]:
['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']
[60]:
tokenizer.tokenize('lagi 10 minit 3 jam')
[60]:
['lagi', '10 minit', '3 jam']
weight#
[61]:
tokenizer.tokenize('berat kau 60 kg')
[61]:
['berat', 'kau', '60 kg']
[62]:
tokenizer.tokenize('berat kau 60kg')
[62]:
['berat', 'kau', '60kg']
hyphen#
[63]:
tokenizer.tokenize('sememang-memangnya kau sakai')
[63]:
['sememang-memangnya', 'kau', 'sakai']
[64]:
tokenizer.tokenize('sememang- memangnya kau sakai')
[64]:
['sememang', '-', 'memangnya', 'kau', 'sakai']
IC#
[65]:
tokenizer.tokenize('sememang-memangnya kau sakai, 911223-06-2305')
[65]:
['sememang-memangnya', 'kau', 'sakai', ',', '911223-06-2305']
Sentence tokenizer#
We consider prefixes, suffixes, sentence starters, acronyms, websites, emails, digits, text before digits, and time and month expressions when splitting text into multiple sentences.
class SentenceTokenizer:
def __init__(self):
pass
[66]:
s = """
no.1 polis bertemu dengan suspek di ladang getah. polis tembak pui pui pui bertubi tubi
"""
[67]:
s_tokenizer = malaya.tokenizer.SentenceTokenizer()
[68]:
s_tokenizer.tokenize(s)
[68]:
['no.1 polis bertemu dengan suspek di ladang getah.',
'polis tembak pui pui pui bertubi tubi.']
[69]:
s = """
email saya di husein.zol01@gmail.com, nanti jom berkopi
"""
[70]:
s_tokenizer.tokenize(s)
[70]:
['email saya di husein.zol01@gmail.com, nanti jom berkopi.']
[71]:
s = """
ke. 2 cerita nya begini. saya berjalan jalan ditepi muara jumpa anak dara.
"""
[72]:
s_tokenizer.tokenize(s)
[72]:
['ke.2 cerita nya begini.',
'saya berjalan jalan ditepi muara jumpa anak dara.']