{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Word tokenizer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
<div class=\"alert alert-info\">\n", "\n", "This tutorial is available as an IPython notebook at [Malaya/example/tokenizer-word](https://github.com/huseinzol05/Malaya/tree/master/example/tokenizer-word).\n", " \n", "</div>
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 6.01 s, sys: 1.2 s, total: 7.21 s\n", "Wall time: 8.42 s\n" ] } ], "source": [ "%%time\n", "import malaya" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'\n", "string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'\n", "string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'\n", "string4 = 'pada 10/4, kementerian mengumumkan, 1/100'\n", "string5 = 'Husein Zolkepli (911223-06-2305), dapat tempat ke-12 lumba lari hari ni'\n", "string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'\n", "string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'\n", "string8 = 'online & desktop: regexr.com or download the desktop version for Mac'\n", "string9 = 'belajaq unity di google.us.edi?34535/534534?dfg=g&fg unity'\n", "string10 = 'Gambar ni membantu. Gambar tutorial >>. facebook. com/story. 
story_fbid=10206183032200965&id=1418962070'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load word tokenizer\n", "\n", "```python\n", "class Tokenizer:\n", " def __init__(self, **kwargs):\n", " \"\"\"\n", " Load Tokenizer object.\n", " Check supported regex patterns at \n", " https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85\n", "\n", " Parameters\n", " ----------\n", " emojis: bool, optional (default=False)\n", " True to keep emojis.\n", " urls: bool, optional (default=True)\n", " True to keep urls.\n", " urls_improved: bool, optional (default=True)\n", " True to keep urls, better version.\n", " tags: bool, optional (default=True)\n", " True to keep tags: <tag>.\n", " emails: bool, optional (default=True)\n", " True to keep emails.\n", " users: bool, optional (default=True)\n", " True to keep user handles: @cbaziotis.\n", " hashtags: bool, optional (default=True)\n", " True to keep hashtags.\n", " phones: bool, optional (default=True)\n", " True to keep phones.\n", " percents: bool, optional (default=True)\n", " True to keep percents.\n", " money: bool, optional (default=True)\n", " True to keep money expressions.\n", " date: bool, optional (default=True)\n", " True to keep date expressions.\n", " time: bool, optional (default=True)\n", " True to keep time expressions.\n", " time_pukul: bool, optional (default=True)\n", " True to keep time `pukul` expressions.\n", " acronyms: bool, optional (default=True)\n", " True to keep acronyms.\n", " emoticons: bool, optional (default=True)\n", " True to keep emoticons.\n", " censored: bool, optional (default=True)\n", " True to keep censored words: f**k.\n", " emphasis: bool, optional (default=True)\n", " True to keep words with emphasis: *very* good.\n", " numbers: bool, optional (default=True)\n", " True to keep numbers.\n", " temperature: bool, optional (default=True)\n", " True to keep temperatures.\n", " distance: bool, optional (default=True)\n", " True to keep distances.\n", " 
volume: bool, optional (default=True)\n", " True to keep volumes.\n", " duration: bool, optional (default=True)\n", " True to keep durations.\n", " weight: bool, optional (default=True)\n", " True to keep weights.\n", " hypen: bool, optional (default=True)\n", " True to keep hyphenated words.\n", " ic: bool, optional (default=True)\n", " True to keep Malaysian IC numbers.\n", " title: bool, optional (default=True)\n", " True to keep titles with a trailing dot, e.g. Dr. ayam -> ['Dr.', 'ayam']\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "tokenizer = malaya.tokenizer.Tokenizer()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tokenize\n", "\n", "```python\n", "def tokenize(self, string: str, lowercase: bool = False):\n", " \"\"\"\n", " Tokenize string into words.\n", "\n", " Parameters\n", " ----------\n", " string : str\n", " lowercase: bool, optional (default=False)\n", "\n", " Returns\n", " -------\n", " result: List[str]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['xjdi',\n", " 'ke',\n", " ',',\n", " 'y',\n", " 'u',\n", " 'xsuke',\n", " 'makan',\n", " 'HUSEIN',\n", " 'kt',\n", " 'situ',\n", " 'tmpt',\n", " ',',\n", " 'i',\n", " 'hate',\n", " 'it',\n", " '.',\n", " 'pelikle',\n", " ',',\n", " 'pada']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['i',\n", " 'mmg2',\n", " 'xske',\n", " 'mknn',\n", " 'HUSEIN',\n", " 'kampng',\n", " 'tmpat',\n", " ',',\n", " 'i',\n", " 'love',\n", " 'them',\n", " '.',\n", " 'pelikle',\n", " 'saye']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string2)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "['perdana',\n", " 'menteri',\n", " 'ke11',\n", " 'sgt',\n", " 'suka',\n", " 'makn',\n", " 'ayam',\n", " ',',\n", " 'harganya',\n", " 'cuma',\n", " 'rm15.50']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string3)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['pada',\n", " '10',\n", " '/',\n", " '4',\n", " ',',\n", " 'kementerian',\n", " 'mengumumkan',\n", " ',',\n", " '1',\n", " '/',\n", " '100']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string4)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Husein',\n", " 'Zolkepli',\n", " '(',\n", " '911223-06-2305',\n", " ')',\n", " ',',\n", " 'dapat',\n", " 'tempat',\n", " 'ke-12',\n", " 'lumba',\n", " 'lari',\n", " 'hari',\n", " 'ni']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Husein',\n", " 'Zolkepli',\n", " '(',\n", " '2011',\n", " '-',\n", " '2019',\n", " ')',\n", " 'adalah',\n", " 'ketua',\n", " 'kampng',\n", " 'di',\n", " 'kedah',\n", " 'sekolah',\n", " 'King',\n", " 'Edward',\n", " 'ke-IV']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string6)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['2jam',\n", " '30 minit',\n", " 'aku',\n", " 'tunggu',\n", " 'kau',\n", " ',',\n", " '60.1 kg',\n", " 'kau',\n", " 'ni',\n", " ',',\n", " 'suhu',\n", " 'harini',\n", " '31.2c',\n", " ',',\n", " 'aku',\n", " 'dahaga',\n", " 'minum',\n", " '600ml']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } 
], "source": [ "tokenizer.tokenize(string7)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['online',\n", " '&',\n", " 'desktop',\n", " ':',\n", " 'regexr.com',\n", " 'or',\n", " 'download',\n", " 'the',\n", " 'desktop',\n", " 'version',\n", " 'for',\n", " 'Mac']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string8)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['belajaq', 'unity', 'di', 'google.us.edi?34535/534534?dfg=g&fg', 'unity']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize(string9)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### url" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['website', 'saya', 'http://huseinhouse.com']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('website saya http://huseinhouse.com')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['website', 'saya', 'huseinhouse.com']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('website saya huseinhouse.com')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['website', 'saya', 'huseinhouse.com/pelik?a=1']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('website saya huseinhouse.com/pelik?a=1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### tags" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['panggil', 'saya', '']" ] }, "execution_count": 
16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('panggil saya ')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['panggil', 'saya', '<', 'husein', '>']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('panggil saya ')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### emails" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['email', 'saya', 'husein@rumah.com']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('email saya husein@rumah.com')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['email', 'saya', 'husein@rumah.com.my']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('email saya husein@rumah.com.my')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### users" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['twitter', 'saya', '@husein123zolkepli']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('twitter saya @husein123zolkepli')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['twitter', 'saya', '@', 'husein123zolkepli']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('twitter saya @ husein123zolkepli')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### hashtags" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['panggil', 'saya', '#huseincomel']" ] }, "execution_count": 22, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('panggil saya #huseincomel')" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['panggil', 'saya', '#', 'huseincomel']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('panggil saya # huseincomel')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### phones" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['call', 'sye', 'di', '013-1234567']" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('call sye di 013-1234567')" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['call', 'sye', 'di', '013', '-', '1234567']" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('call sye di 013- 1234567')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### percents" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'sokong', '100%']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya sokong 100%')" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'sokong', '100', '%']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya sokong 100 %')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### money" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', 'rm100']" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"tokenizer.tokenize('saya tinggal rm100')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', 'rm100k']" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya tinggal rm100k')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', 'rm100M']" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya tinggal rm100M')" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', 'rm100.123M']" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya tinggal rm100.123M')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', '40 sen']" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya tinggal 40 sen')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', '21 ringgit', '50 sen']" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya tinggal 21 ringgit 50 sen')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', '21 juta ringgit']" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya tinggal 21 juta ringgit')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', 'rm 2ribu']" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"tokenizer.tokenize('saya tinggal rm 2ribu')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['saya', 'tinggal', 'rm2 juta']" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('saya tinggal rm2 juta')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### date" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['tarikh', 'perjumpaan', '10/11/2011']" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('tarikh perjumpaan 10/11/2011')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['tarikh', 'perjumpaan', '10-11-2011']" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('tarikh perjumpaan 10-11-2011')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['tarikh', 'perjumpaan', '12 mei 2011']" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('tarikh perjumpaan 12 mei 2011')" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['tarikh', 'perjumpaan', 'mei 12 2011']" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('tarikh perjumpaan mei 12 2011')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### time" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', '3 am']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa 3 am')" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "['jumpa', '3.30am']" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa 3.30am')" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', '22:00']" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa 22:00')" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', 'pukul 2']" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa pukul 2')" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', 'pukul 2.30']" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa pukul 2.30')" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', '2.30 pagi']" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa 2.30 pagi')" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', '2.30 ptg']" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa 2.30 ptg')" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', '2.30 malam']" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa 2.30 malam')" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', '2.30 tngahari']" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"tokenizer.tokenize('jumpa 2.30 tngahari')" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', '2:30:00 tngahari']" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa 2:30:00 tngahari')" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['jumpa', 'pukul 2:30:00', 'tngahari']" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('jumpa pukul 2:30:00 tngahari')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### censored" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['f**k', 'lah']" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('f**k lah')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### emphasis" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['*damn*', 'good', 'weih']" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('*damn* good weih')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### numbers" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['no', 'saya', '123']" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('no saya 123')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### temperature" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['sejuk', 'harini', ',', '31.1c']" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('sejuk harini, 31.1c')" ] }, { 
"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['sejuk', 'harini', ',', '31.1C']" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('sejuk harini, 31.1C')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### distance" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['nak', 'sampai', 'lagi', '31km']" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('nak sampai lagi 31km')" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['nak', 'sampai', 'lagi', '31 km']" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('nak sampai lagi 31 km')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### volume" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['botol', 'ni', '400ml']" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('botol ni 400ml')" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['botol', 'ni', '400 l']" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('botol ni 400 l')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### duration" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('aku dah tunggu kau 2jam kut')" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('aku dah tunggu kau 2 jam kut')" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['lagi', '10 minit', '3 jam']" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('lagi 10 minit 3 jam')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### weight" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['berat', 'kau', '60 kg']" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('berat kau 60 kg')" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['berat', 'kau', '60kg']" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('berat kau 60kg')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### hypen" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['sememang-memangnya', 'kau', 'sakai']" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('sememang-memangnya kau sakai')" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['sememang', '-', 'memangnya', 'kau', 'sakai']" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('sememang- memangnya kau sakai')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### IC" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['sememang-memangnya', 'kau', 
'sakai', ',', '911223-06-2305']" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('sememang-memangnya kau sakai, 911223-06-2305')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### titles" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['dr.', 'syed', 'sakai', 'gile', '.']" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('dr. syed sakai gile.')" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['dr.', 'phd.', 'syed', 'sakai', 'gile', '.']" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('dr. phd. syed sakai gile.')" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ybhg.', 'dr.', 'syed', '.']" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.tokenize('ybhg. dr. syed.')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }