{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Kesalahan Tatabahasa with Tagging" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This tutorial is available as an IPython notebook at [Malaya/example/tatabahasa-tagging](https://github.com/huseinzol05/Malaya/tree/master/example/tatabahasa-tagging).\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This module is only trained on standard language structure, so it is not safe to use it for local language structure.\n", " \n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" ] } ], "source": [ "import malaya\n", "from pprint import pprint" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### List available HuggingFace models" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'mesolitica/finetune-tatabahasa-t5-tiny-standard-bahasa-cased': {'Size (MB)': 139,\n", " 'exactly-match': 0.7665198237,\n", " 'f1': 0.970908229,\n", " 'exactly-match-tags': 0.87404885863,\n", " 'f1-tags': 0.9878723587,\n", " 'Suggested length': 256},\n", " 'mesolitica/finetune-tatabahasa-t5-small-standard-bahasa-cased': {'Size (MB)': 242,\n", " 'exactly-match': 0.8145774929,\n", " 'f1': 0.9781278,\n", " 'exactly-match-tags': 0.89447336,\n", " 'f1-tags': 0.990597377,\n", " 'Suggested length': 256},\n", " 'mesolitica/finetune-tatabahasa-t5-base-standard-bahasa-cased': {'Size (MB)': 892,\n", " 'exactly-match': 0.760913095,\n", " 'f1': 0.970136249,\n", " 'exactly-match-tags': 0.865839,\n", " 'f1-tags': 0.9868999035,\n", " 'Suggested length': 256}}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.tatabahasa.available_huggingface" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Supported kesalahan tatabahasa\n", "\n", "For full description, check out https://tatabahasabm.tripod.com/tata/salahtata.htm" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[{'class': 0, 'Description': 'PAD', 
'salah': '', 'betul': ''},\n", " {'class': 1, 'Description': 'kesambungan subwords', 'salah': '', 'betul': ''},\n", " {'class': 2, 'Description': 'tiada kesalahan', 'salah': '', 'betul': ''},\n", " {'class': 3,\n", " 'Description': 'kesalahan frasa nama, Perkara yang diterangkan mesti mendahului \"penerang\"',\n", " 'salah': 'Cili sos',\n", " 'betul': 'sos cili'},\n", " {'class': 4,\n", " 'Description': 'kesalahan kata jamak',\n", " 'salah': 'mereka-mereka',\n", " 'betul': 'mereka'},\n", " {'class': 5,\n", " 'Description': 'kesalahan kata penguat',\n", " 'salah': 'sangat tinggi sekali',\n", " 'betul': 'sangat tinggi'},\n", " {'class': 6,\n", " 'Description': 'kata adjektif dan imbuhan \"ter\" tanpa penguat.',\n", " 'salah': 'Sani mendapat markah yang tertinggi sekali.',\n", " 'betul': 'Sani mendapat markah yang tertinggi.'},\n", " {'class': 7,\n", " 'Description': 'kesalahan kata hubung',\n", " 'salah': 'Sally sedang membaca bila saya tiba di rumahnya.',\n", " 'betul': 'Sally sedang membaca apabila saya tiba di rumahnya.'},\n", " {'class': 8,\n", " 'Description': 'kesalahan kata bilangan',\n", " 'salah': 'Beribu peniaga tidak membayar cukai pendapatan.',\n", " 'betul': 'Beribu-ribu peniaga tidak membayar cukai pendapatan'},\n", " {'class': 9,\n", " 'Description': 'kesalahan kata sendi',\n", " 'salah': 'Umar telah berpindah daripada sekolah ini bulan lalu.',\n", " 'betul': 'Umar telah berpindah dari sekolah ini bulan lalu.'},\n", " {'class': 10,\n", " 'Description': 'kesalahan penjodoh bilangan',\n", " 'salah': 'Setiap orang pelajar',\n", " 'betul': 'Setiap pelajar.'},\n", " {'class': 11,\n", " 'Description': 'kesalahan kata ganti diri',\n", " 'salah': 'Pencuri itu telah ditangkap. Beliau dibawa ke balai polis.',\n", " 'betul': 'Pencuri itu telah ditangkap. 
Dia dibawa ke balai polis.'},\n", " {'class': 12,\n", " 'Description': 'kesalahan ayat pasif',\n", " 'salah': 'Cerpen itu telah dikarang oleh saya.',\n", " 'betul': 'Cerpen itu telah saya karang.'},\n", " {'class': 13,\n", " 'Description': 'kesalahan kata tanya',\n", " 'salah': 'Kamu berasal dari manakah ?',\n", " 'betul': 'Kamu berasal dari mana ?'},\n", " {'class': 14,\n", " 'Description': 'kesalahan tanda baca',\n", " 'salah': 'Kamu berasal dari manakah .',\n", " 'betul': 'Kamu berasal dari mana ?'},\n", " {'class': 15,\n", " 'Description': 'kesalahan kata kerja tak transitif',\n", " 'salah': 'Dia kata kepada saya',\n", " 'betul': 'Dia berkata kepada saya'},\n", " {'class': 16,\n", " 'Description': 'kesalahan kata kerja transitif',\n", " 'salah': 'Dia suka baca buku',\n", " 'betul': 'Dia suka membaca buku'},\n", " {'class': 17,\n", " 'Description': 'penggunaan kata yang tidak tepat',\n", " 'salah': 'Tembuk Besar negeri Cina dibina oleh Shih Huang Ti.',\n", " 'betul': 'Tembok Besar negeri Cina dibina oleh Shih Huang Ti'}]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.tatabahasa.describe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Right now we are only able to predict up to 15 different kesalahan tatabahasa; hopefully we can scale this up in the future."
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load HuggingFace model\n", "\n", "```python\n", "def huggingface(\n", " model: str = 'mesolitica/finetune-tatabahasa-t5-small-standard-bahasa-cased',\n", " force_check: bool = True,\n", " **kwargs,\n", "):\n", " \"\"\"\n", " Load HuggingFace model to fix kesalahan tatabahasa.\n", "\n", " Parameters\n", " ----------\n", " model: str, optional (default='mesolitica/finetune-tatabahasa-t5-small-standard-bahasa-cased')\n", " Check available models at `malaya.tatabahasa.available_huggingface`.\n", " force_check: bool, optional (default=True)\n", " Force check model one of malaya model.\n", " Set to False if you have your own huggingface model.\n", "\n", " Returns\n", " -------\n", " result: malaya.torch_model.huggingface.Tatabahasa\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n", "You are using the default legacy behaviour of the . If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n" ] } ], "source": [ "model = malaya.tatabahasa.huggingface()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Predict\n", "\n", "```python\n", "def generate(\n", " self,\n", " strings: List[str],\n", " **kwargs,\n", "):\n", " \"\"\"\n", " Fix kesalahan tatatabahasa.\n", "\n", " Parameters\n", " ----------\n", " strings : List[str]\n", " **kwargs: vector arguments pass to huggingface `generate` method.\n", " Read more at https://huggingface.co/docs/transformers/main_classes/text_generation\n", " Fix kesalahan tatabahasa supported all decoding methods except beam.\n", "\n", " Returns\n", " -------\n", " result: List[Tuple[str, int]]\n", " \"\"\"\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A randomly picked string from the Bahasa Melayu Wikipedia." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# https://ms.wikipedia.org/wiki/Bola_sepak\n", "string = 'Pada amnya, hanya penjaga gol sahaja yang dibenarkan menyentuh bola dengan tangan di dalam kawasan golnya'" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Pada', 2),\n", " ('amnya', 2),\n", " (',', 2),\n", " ('hanya', 2),\n", " ('penjaga', 2),\n", " ('gol', 2),\n", " ('sahaja', 2),\n", " ('yang', 2),\n", " ('dibenarkan', 2),\n", " ('menyentuh', 2),\n", " ('bola', 2),\n", " ('dengan', 2),\n", " ('tangan', 2),\n", " ('di', 2),\n", " ('dalam', 2),\n", " ('kawasan', 2),\n", " ('golnya', 2)]]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.generate([string], max_length = 256)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now assume we have a kesalahan frasa nama, where `penjaga gol` becomes `gol penjaga`."
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# https://ms.wikipedia.org/wiki/Bola_sepak\n", "string = 'Pada amnya, hanya gol penjaga sahaja yang dibenarkan menyentuh bola dengan tangan di dalam kawasan golnya'" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[[('Pada', 2),\n", " ('amnya', 2),\n", " (',', 2),\n", " ('hanya', 2),\n", " ('penjaga', 3),\n", " ('gol', 3),\n", " ('sahaja', 2),\n", " ('yang', 2),\n", " ('dibenarkan', 2),\n", " ('menyentuh', 2),\n", " ('bola', 2),\n", " ('dengan', 2),\n", " ('tangan', 2),\n", " ('di', 2),\n", " ('dalam', 2),\n", " ('kawasan', 2),\n", " ('golnya', 2)]]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Sani', 2),\n", " ('mendapat', 2),\n", " ('markah', 2),\n", " ('yang', 2),\n", " ('tertinggi', 6),\n", " ('.', 2)],\n", " [('Hassan', 2),\n", " ('ialah', 2),\n", " ('peserta', 2),\n", " ('yang', 2),\n", " ('termuda', 6),\n", " ('dalam', 2),\n", " ('pertandingan', 2),\n", " ('itu', 2),\n", " ('.', 2)]]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Sani mendapat markah yang tertinggi sekali.'\n", "string1 = 'Hassan ialah peserta yang termuda sekali dalam pertandingan itu.'\n", "model.generate([string, string1], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Dia', 2), ('berkata', 15), ('kepada', 2), ('saya', 2), ('.', 2)]]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Dia kata kepada saya.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ "100" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pickle\n", "\n", "with open('tests/dataset-tatabahasa.pkl', 'rb') as fopen:\n", " test_set = pickle.load(fopen)\n", " \n", "len(test_set)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def get_xy(row):\n", " x, y, tag = [], [], []\n", "\n", " for i in range(len(row[0])):\n", " t = [row[0][i][0]]\n", " y.extend(t)\n", " t = [row[1][i][0]]\n", " x.extend(t)\n", " tag.extend([row[1][i][1]] * len(t))\n", "\n", " return ' '.join(x), ' '.join(y), tag" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('Dirk Jan Klaas \" Klaas-Jan \" Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bola sepak Belanda yang bermain seperti posisi penyerang !',\n", " 'Dirk Jan Klaas \" Klaas-Jan \" Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bola sepak Belanda yang bermain di posisi penyerang .',\n", " [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 2, 2, 14])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x, y, t = get_xy(test_set[0])\n", "x, y, t" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Dirk', 2),\n", " ('Jan', 2),\n", " ('Klaas', 2),\n", " ('\"', 2),\n", " ('Klaas-Jan', 2),\n", " ('\"', 2),\n", " ('Huntelaar', 2),\n", " ('(', 2),\n", " ('lahir', 2),\n", " ('12', 2),\n", " ('Ogos', 2),\n", " ('1983', 2),\n", " (')', 2),\n", " ('merupakan', 2),\n", " ('pemain', 2),\n", " ('bola', 2),\n", " ('sepak', 2),\n", " ('Belanda', 2),\n", " ('yang', 2),\n", " ('bermain', 2),\n", " ('di', 9),\n", " ('posisi', 2),\n", " ('penyerang', 2),\n", " ('.', 14)]]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.generate([x], max_length = 256)" ] }, { "cell_type": 
"code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('Pada tahun 2002 , kedua-dua gol beliau menduduki tempat ke-6 dalam 100 Greatest Sporting Moments oleh saluran Channel 4 UK .',\n", " 'Pada tahun 2002 , kedua-dua gol ini menduduki tempat ke-6 dalam 100 Greatest Sporting Moments oleh saluran Channel 4 UK .',\n", " [2, 2, 2, 2, 2, 2, 11, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x, y, t = get_xy(test_set[-1])\n", "x, y, t" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Pada', 2),\n", " ('tahun', 2),\n", " ('2002', 2),\n", " (',', 2),\n", " ('kedua-dua', 2),\n", " ('gol', 2),\n", " ('beliau', 11),\n", " ('menduduki', 2),\n", " ('tempat', 2),\n", " ('ke-6', 2),\n", " ('dalam', 2),\n", " ('100', 2),\n", " ('Greatest', 2),\n", " ('Sporting', 2),\n", " ('Moments', 2),\n", " ('oleh', 2),\n", " ('saluran', 2),\n", " ('Channel', 2),\n", " ('4', 2),\n", " ('UK', 2),\n", " ('.', 2)]]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.generate([x], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('Gol inilah yang bergelar Goal of the Century dengan undian Internet 2000 sejak FIFA .',\n", " 'Gol inilah yang bergelar Goal of the Century di undian Internet 2000 oleh FIFA .',\n", " [2, 2, 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, 9, 2, 2])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x, y, t = get_xy(test_set[-2])\n", "x, y, t" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Gol', 2),\n", " ('inilah', 2),\n", " ('yang', 2),\n", " ('bergelar', 2),\n", " ('Goal', 2),\n", " ('of', 2),\n", " ('the', 2),\n", " ('Century', 2),\n", " ('dalam', 9),\n", " 
('undian', 2),\n", " ('Internet', 2),\n", " ('2000', 2),\n", " ('oleh', 9),\n", " ('FIFA', 2),\n", " ('.', 2)]]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.generate([x], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('Beliau mengambil bola dalam kawasan kepul diri lalu pusing dan luru lebih separuh padang sambil menyentuh bola 11 kali , memintas lima pemain England : ( Glenn Hoddle , Peter Reid , Kenny Sansom , Terry Butcher , dan Terry Fenwick ) serta penjaga gawang Peter Shilton .',\n", " 'Beliau mengambil bola di kawasan pasukan diri lalu berpusing-pusing dan meluru lebih separuh padang sambil menyentuh bola 11 kali , memintas lima pemain England : ( Glenn Hoddle , Peter Reid , Kenny Sansom , Terry Butcher , dan Terry Fenwick ) serta penjaga gawang Peter Shilton .',\n", " [2,\n", " 2,\n", " 2,\n", " 9,\n", " 2,\n", " 10,\n", " 2,\n", " 2,\n", " 15,\n", " 2,\n", " 15,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x, y, t = get_xy(test_set[-3])\n", "x, y, t" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Beliau', 2),\n", " ('mengambil', 2),\n", " ('bola', 2),\n", " ('dalam', 2),\n", " ('kawasan', 2),\n", " ('pasukan', 10),\n", " ('berdiri', 15),\n", " ('lalu', 2),\n", " ('berpusing', 15),\n", " ('dan', 2),\n", " ('meluru', 15),\n", " ('lebih', 2),\n", " ('separuh', 2),\n", " ('padang', 2),\n", " ('sambil', 2),\n", " ('menyentuh', 2),\n", " ('bola', 2),\n", " ('11', 2),\n", " ('kali', 
2),\n", " (',', 2),\n", " ('memintas', 2),\n", " ('lima', 2),\n", " ('pemain', 2),\n", " ('England', 2),\n", " (':', 2),\n", " ('(', 2),\n", " ('Glenn', 2),\n", " ('Hoddle', 2),\n", " (',', 2),\n", " ('Peter', 2),\n", " ('Reid', 2),\n", " (',', 2),\n", " ('Kenny', 2),\n", " ('Sansom', 2),\n", " (',', 2),\n", " ('Terry', 2),\n", " ('Butcher', 2),\n", " (',', 2),\n", " ('dan', 2),\n", " ('Terry', 2),\n", " ('Fenwick', 2),\n", " (')', 2),\n", " ('serta', 2),\n", " ('penjaga', 2),\n", " ('gawang', 2),\n", " ('Peter', 2),\n", " ('Shilton', 2),\n", " ('.', 2)]]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.generate([x], max_length = 256)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### More examples\n", "\n", "I just copy pasted from https://ms.wikipedia.org/wiki/Kesalahan_biasa_tatabahasa_Melayu" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Tidak', 2),\n", " ('ada', 2),\n", " ('apa', 2),\n", " ('yang', 2),\n", " ('mereka', 2),\n", " ('risaukan', 2),\n", " ('waktu', 2),\n", " ('itu.', 2)]]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Tidak ada apa yang mereka risaukan waktu itu.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Ayahnya', 2),\n", " ('setuju', 2),\n", " ('dan', 7),\n", " ('melanggar', 2),\n", " ('syarat', 2),\n", " ('yang', 2),\n", " ('dia', 2),\n", " ('sendiri', 2),\n", " ('menetapkan', 2),\n", " ('.', 2)]]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Ayahnya setuju walaupun melanggar syarat yang dia sendiri menetapkan.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"[[('Semuanya', 2), ('dia', 2), ('kenal', 2), ('.', 2)]]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Semuanya dia kenal.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Dia', 2),\n", " ('menjawab', 2),\n", " ('seperti', 2),\n", " ('disuruh-suruh', 2),\n", " ('oleh', 2),\n", " ('kuasa', 2),\n", " ('yang', 2),\n", " ('dia', 2),\n", " ('tidak', 2),\n", " ('tahu', 2),\n", " ('dari', 2),\n", " ('mana', 2),\n", " ('puncanya.', 2)]]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Dia menjawab seperti disuruh-suruh oleh kuasa yang dia tidak tahu dari mana puncanya.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Bola', 2),\n", " ('ini', 2),\n", " ('ditendang', 2),\n", " ('oleh', 2),\n", " ('saya', 2),\n", " ('.', 2)]]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Bola ini ditendang oleh saya.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Makanan', 2),\n", " ('ini', 11),\n", " ('kamu', 2),\n", " ('telah', 2),\n", " ('makan', 2),\n", " ('.', 14)]]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Makanan ini kamu telah makan?'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Segala', 2),\n", " ('perubahan', 2),\n", " ('yang', 2),\n", " ('berlaku', 2),\n", " ('kita', 2),\n", " ('akan', 2),\n", " ('menghadapi', 2),\n", " ('sama-sama', 2),\n", " ('.', 2)]]" ] }, "execution_count": 27, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ "string = 'Segala perubahan yang berlaku kita akan menghadapi sama-sama.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Kakak', 2),\n", " ('saya', 2),\n", " ('sedang', 2),\n", " ('memasak', 2),\n", " ('gulai', 2),\n", " ('nangka.', 2),\n", " ('Dia', 2),\n", " ('menyenduk', 2),\n", " ('seketul', 2),\n", " ('gulai', 3),\n", " ('nangka', 3),\n", " ('dan', 2),\n", " ('menyuruh', 2),\n", " ('saya', 2),\n", " ('merasanya.', 2)]]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Kakak saya sedang memasak gulai nangka. Dia menyenduk seketul nangka gulai dan menyuruh saya merasanya.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Sally', 2),\n", " ('sedang', 2),\n", " ('membaca', 2),\n", " ('bila', 2),\n", " ('saya', 11),\n", " ('tiba', 2),\n", " ('di', 2),\n", " ('rumahnya.', 2)]]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Sally sedang membaca bila saya tiba di rumahnya.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Badannya', 2),\n", " ('besar', 2),\n", " ('dan', 7),\n", " ('kakinya', 2),\n", " ('kecil', 2),\n", " ('.', 2)]]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Badannya besar kecuali kakinya kecil.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Beribu', 2),\n", " ('peniaga', 2),\n", " ('tidak', 2),\n", " ('membayar', 2),\n", " ('cukai', 2),\n", " ('pendapatan.', 2)]]" ] }, 
"execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Beribu peniaga tidak membayar cukai pendapatan.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Setengah', 2),\n", " ('remaja', 2),\n", " ('suka', 2),\n", " ('membuang', 2),\n", " ('masa', 2),\n", " ('di', 2),\n", " ('pasar', 2),\n", " ('raya.', 2)]]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Setengah remaja suka membuang masa di pasar raya.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Umar', 2),\n", " ('telah', 2),\n", " ('berpindah', 2),\n", " ('ke', 9),\n", " ('sekolah', 2),\n", " ('ini', 2),\n", " ('bulan', 2),\n", " ('lalu', 2),\n", " ('.', 2)]]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Umar telah berpindah daripada sekolah ini bulan lalu.'\n", "model.generate([string], max_length = 256)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('Para', 4), ('peserta', 2), ('sedang', 2), ('berbaris', 2), ('.', 2)]]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string = 'Para-para peserta sedang berbaris.'\n", "model.generate([string], max_length = 256)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { 
"delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }