Word and sentence tokenizer

This tutorial is available as an IPython notebook at Malaya/example/tokenizer.

[1]:
%%time
import malaya
CPU times: user 5.43 s, sys: 1.13 s, total: 6.56 s
Wall time: 7.88 s
[2]:
string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'
string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'
string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'
string4 = 'pada 10/4, kementerian mengumumkan, 1/100'
string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'
string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'
string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'

Load word tokenizer

class TOKENIZER:
    def __init__(self, lowercase: bool = False, **kwargs):
        """
        Load Tokenizer object.

        Word-level tokenizer that keeps domain-specific expressions
        (urls, emails, money, dates, units, etc.) as single tokens,
        controlled by the boolean keyword arguments below.
        Check supported regex patterns at https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85

        Parameters
        ----------
        lowercase: bool, optional (default=False)
            lowercase tokens.
        emojis: bool, optional (default=True)
            True to keep emojis.
        urls: bool, optional (default=True)
            True to keep urls.
        tags: bool, optional (default=True)
            True to keep tags: <tag>.
        emails: bool, optional (default=True)
            True to keep emails.
        users: bool, optional (default=True)
            True to keep users handles: @cbaziotis.
        hashtags: bool, optional (default=True)
            True to keep hashtags.
        phones: bool, optional (default=True)
            True to keep phones.
        percents: bool, optional (default=True)
            True to keep percents.
        money: bool, optional (default=True)
            True to keep money expressions.
        date: bool, optional (default=True)
            True to keep date expressions.
        time: bool, optional (default=True)
            True to keep time expressions.
        acronyms: bool, optional (default=True)
            True to keep acronyms.
        emoticons: bool, optional (default=True)
            True to keep emoticons.
        censored: bool, optional (default=True)
            True to keep censored words: f**k.
        emphasis: bool, optional (default=True)
            True to keep words with emphasis: *very* good.
        numbers: bool, optional (default=True)
            True to keep numbers.
        temperature: bool, optional (default=True)
            True to keep temperatures.
        distance: bool, optional (default=True)
            True to keep distances.
        volume: bool, optional (default=True)
            True to keep volumes.
        duration: bool, optional (default=True)
            True to keep durations.
        weight: bool, optional (default=True)
            True to keep weights.
        hypen: bool, optional (default=True)
            True to keep hyphens (note: the parameter is spelled ``hypen`` in the API).
        """
[3]:
tokenizer = malaya.preprocessing.TOKENIZER()
[4]:
tokenizer.tokenize(string1)
[4]:
['xjdi',
 'ke',
 ',',
 'y',
 'u',
 'xsuke',
 'makan',
 'HUSEIN',
 'kt',
 'situ',
 'tmpt',
 ',',
 'i',
 'hate',
 'it',
 '.',
 'pelikle',
 ',',
 'pada']
[5]:
tokenizer.tokenize(string2)
[5]:
['i',
 'mmg2',
 'xske',
 'mknn',
 'HUSEIN',
 'kampng',
 'tmpat',
 ',',
 'i',
 'love',
 'them',
 '.',
 'pelikle',
 'saye']
[6]:
tokenizer.tokenize(string3)
[6]:
['perdana',
 'menteri',
 'ke11',
 'sgt',
 'suka',
 'makn',
 'ayam',
 ',',
 'harganya',
 'cuma',
 'rm15.50']
[7]:
tokenizer.tokenize(string4)
[7]:
['pada',
 '10',
 '/',
 '4',
 ',',
 'kementerian',
 'mengumumkan',
 ',',
 '1',
 '/',
 '100']
[8]:
tokenizer.tokenize(string6)
[8]:
['Husein',
 'Zolkepli',
 '(',
 '2011',
 '-',
 '2019',
 ')',
 'adalah',
 'ketua',
 'kampng',
 'di',
 'kedah',
 'sekolah',
 'King',
 'Edward',
 'ke-IV']
[9]:
tokenizer.tokenize(string7)
[9]:
['2jam',
 '30 minit',
 'aku',
 'tunggu',
 'kau',
 ',',
 '60.1 kg',
 'kau',
 'ni',
 ',',
 'suhu',
 'harini',
 '31.2c,',
 'aku',
 'dahaga',
 'minum',
 '600ml']

url

[10]:
tokenizer.tokenize('website saya http://huseinhouse.com')
[10]:
['website', 'saya', 'http://huseinhouse.com']
[11]:
tokenizer.tokenize('website saya huseinhouse.com')
[11]:
['website', 'saya', 'huseinhouse.com']

tags

[12]:
tokenizer.tokenize('panggil saya <husein>')
[12]:
['panggil', 'saya', '<husein>']
[13]:
tokenizer.tokenize('panggil saya <husein >')
[13]:
['panggil', 'saya', '<', 'husein', '>']

emails

[14]:
tokenizer.tokenize('email saya husein@rumah.com')
[14]:
['email', 'saya', 'husein@rumah.com']
[15]:
tokenizer.tokenize('email saya husein@rumah.com.my')
[15]:
['email', 'saya', 'husein@rumah.com.my']

users

[16]:
tokenizer.tokenize('twitter saya @husein123zolkepli')
[16]:
['twitter', 'saya', '@husein123zolkepli']
[17]:
tokenizer.tokenize('twitter saya @ husein123zolkepli')
[17]:
['twitter', 'saya', '@', 'husein123zolkepli']

hashtags

[18]:
tokenizer.tokenize('panggil saya #huseincomel')
[18]:
['panggil', 'saya', '#huseincomel']
[19]:
tokenizer.tokenize('panggil saya # huseincomel')
[19]:
['panggil', 'saya', '#', 'huseincomel']

phones

[20]:
tokenizer.tokenize('call sye di 013-1234567')
[20]:
['call', 'sye', 'di', '013-1234567']
[21]:
tokenizer.tokenize('call sye di 013- 1234567')
[21]:
['call', 'sye', 'di', '013', '-', '1234567']

percents

[22]:
tokenizer.tokenize('saya sokong 100%')
[22]:
['saya', 'sokong', '100%']
[23]:
tokenizer.tokenize('saya sokong 100 %')
[23]:
['saya', 'sokong', '100', '%']

money

[24]:
tokenizer.tokenize('saya tinggal rm100')
[24]:
['saya', 'tinggal', 'rm100']
[25]:
tokenizer.tokenize('saya tinggal rm100k')
[25]:
['saya', 'tinggal', 'rm100k']
[26]:
tokenizer.tokenize('saya tinggal rm100M')
[26]:
['saya', 'tinggal', 'rm100M']
[27]:
tokenizer.tokenize('saya tinggal rm100.123M')
[27]:
['saya', 'tinggal', 'rm100.123M']
[28]:
tokenizer.tokenize('saya tinggal 40 sen')
[28]:
['saya', 'tinggal', '40 sen']
[29]:
tokenizer.tokenize('saya tinggal 21 ringgit 50 sen')
[29]:
['saya', 'tinggal', '21 ringgit', '50 sen']

date

[30]:
tokenizer.tokenize('tarikh perjumpaan 10/11/2011')
[30]:
['tarikh', 'perjumpaan', '10/11/2011']
[31]:
tokenizer.tokenize('tarikh perjumpaan 10-11-2011')
[31]:
['tarikh', 'perjumpaan', '10-11-2011']
[32]:
tokenizer.tokenize('tarikh perjumpaan 12 mei 2011')
[32]:
['tarikh', 'perjumpaan', '12 mei 2011']
[33]:
tokenizer.tokenize('tarikh perjumpaan mei 12 2011')
[33]:
['tarikh', 'perjumpaan', 'mei 12 2011']

time

[34]:
tokenizer.tokenize('jumpa 3 am')
[34]:
['jumpa', '3 am']
[35]:
tokenizer.tokenize('jumpa 22:00')
[35]:
['jumpa', '22:00']

censored

[36]:
tokenizer.tokenize('f**k lah')
[36]:
['f**k', 'lah']

emphasis

[37]:
tokenizer.tokenize('*damn* good weih')
[37]:
['*damn*', 'good', 'weih']

numbers

[38]:
tokenizer.tokenize('no saya 123')
[38]:
['no', 'saya', '123']

temperature

[39]:
tokenizer.tokenize('sejuk harini, 31.1c')
[39]:
['sejuk', 'harini', ',', '31.1c']
[40]:
tokenizer.tokenize('sejuk harini, 31.1C')
[40]:
['sejuk', 'harini', ',', '31.1C']

distance

[41]:
tokenizer.tokenize('nak sampai lagi 31km')
[41]:
['nak', 'sampai', 'lagi', '31km']
[42]:
tokenizer.tokenize('nak sampai lagi 31 km')
[42]:
['nak', 'sampai', 'lagi', '31 km']

volume

[43]:
tokenizer.tokenize('botol ni 400ml')
[43]:
['botol', 'ni', '400ml']
[44]:
tokenizer.tokenize('botol ni 400 l')
[44]:
['botol', 'ni', '400 l']

duration

[45]:
tokenizer.tokenize('aku dah tunggu kau 2jam kut')
[45]:
['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']
[46]:
tokenizer.tokenize('aku dah tunggu kau 2 jam kut')
[46]:
['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']
[47]:
tokenizer.tokenize('lagi 10 minit 3 jam')
[47]:
['lagi', '10 minit', '3 jam']

weight

[48]:
tokenizer.tokenize('berat kau 60 kg')
[48]:
['berat', 'kau', '60 kg']
[49]:
tokenizer.tokenize('berat kau 60kg')
[49]:
['berat', 'kau', '60kg']

hyphen

[50]:
tokenizer.tokenize('sememang-memangnya kau sakai')
[50]:
['sememang-memangnya', 'kau', 'sakai']
[51]:
tokenizer.tokenize('sememang- memangnya kau sakai')
[51]:
['sememang', '-', 'memangnya', 'kau', 'sakai']

Sentence tokenizer

We consider prefixes, suffixes, sentence starters, acronyms, websites, emails, digits, text preceding digits, times and months when splitting text into multiple sentences.

def split_into_sentences(text: str, minimum_length: int = 5):
    """
    Sentence tokenizer.

    Splits the input into sentences while respecting prefixes, suffixes,
    sentence starters, acronyms, websites, emails, digits, times and
    months as non-boundary contexts.

    Parameters
    ----------
    text: str
        string to split into sentences.
    minimum_length: int, optional (default=5)
        minimum number of characters for a chunk to be treated as a
        sentence; shorter chunks are merged, default 5 characters.

    Returns
    -------
    result: List[str]
    """
[52]:
s = """
no.1 polis bertemu dengan suspek di ladang getah. polis tembak pui pui pui bertubi tubi
"""
[53]:
malaya.text.function.split_into_sentences(s)
[53]:
['no.1 polis bertemu dengan suspek di ladang getah.',
 'polis tembak pui pui pui bertubi tubi.']
[56]:
s = """
email saya di husein.zol01@gmail.com, nanti jom berkopi
"""
[57]:
malaya.text.function.split_into_sentences(s)
[57]:
['email saya di husein.zol01@gmail.com, nanti jom berkopi.']
[60]:
s = """
ke. 2 cerita nya begini. saya berjalan jalan ditepi muara jumpa anak dara.
"""
[61]:
malaya.text.function.split_into_sentences(s)
[61]:
['ke.2 cerita nya begini.',
 'saya berjalan jalan ditepi muara jumpa anak dara.']