{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Part-of-Speech Recognition" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This tutorial is available as an IPython notebook at [Malaya/example/part-of-speech](https://github.com/huseinzol05/Malaya/tree/master/example/part-of-speech).\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This module is only trained on standard language structure, so it is not safe to use it for local language structure.\n", " \n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.83 s, sys: 3.88 s, total: 6.71 s\n", "Wall time: 1.95 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" ] } ], "source": [ "%%time\n", "import malaya" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Describe supported POS" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'Tag': 'ADJ', 'Description': 'Adjective, kata sifat'},\n", " {'Tag': 'ADP', 'Description': 'Adposition'},\n", " {'Tag': 'ADV', 'Description': 'Adverb, kata keterangan'},\n", " {'Tag': 'ADX', 'Description': 'Auxiliary verb, kata kerja tambahan'},\n", " {'Tag': 'CCONJ', 'Description': 'Coordinating conjuction, kata hubung'},\n", " {'Tag': 'DET', 'Description': 'Determiner, kata penentu'},\n", " {'Tag': 'NOUN', 'Description': ' Noun, kata nama'},\n", " {'Tag': 'NUM', 'Description': 'Number, nombor'},\n", " {'Tag': 'PART', 'Description': 'Particle'},\n", " {'Tag': 'PRON', 'Description': 'Pronoun, kata ganti'},\n", " {'Tag': 'PROPN', 'Description': 'Proper noun, kata ganti nama khas'},\n", " {'Tag': 'SCONJ', 'Description': 'Subordinating conjunction'},\n", " {'Tag': 'SYM', 'Description': 'Symbol'},\n", " {'Tag': 'VERB', 'Description': 'Verb, kata kerja'},\n", " {'Tag': 'X', 'Description': 'Other'}]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.pos.describe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### List available HuggingFace POS models" ] }, { 
"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'mesolitica/pos-t5-tiny-standard-bahasa-cased': {'Size (MB)': 84.7,\n", " 'PART': {'precision': 0.8938547486033519,\n", " 'recall': 0.9411764705882353,\n", " 'f1': 0.9169054441260744,\n", " 'number': 170},\n", " 'CCONJ': {'precision': 0.9713905522288756,\n", " 'recall': 0.9785522788203753,\n", " 'f1': 0.974958263772955,\n", " 'number': 1492},\n", " 'ADJ': {'precision': 0.9192897497982244,\n", " 'recall': 0.88984375,\n", " 'f1': 0.9043271139341008,\n", " 'number': 1280},\n", " 'ADP': {'precision': 0.9770908087220536,\n", " 'recall': 0.9844271412680756,\n", " 'f1': 0.9807452555755645,\n", " 'number': 3596},\n", " 'ADV': {'precision': 0.9478672985781991,\n", " 'recall': 0.9523809523809523,\n", " 'f1': 0.9501187648456056,\n", " 'number': 1260},\n", " 'VERB': {'precision': 0.9654357459379616,\n", " 'recall': 0.9662921348314607,\n", " 'f1': 0.9658637505541599,\n", " 'number': 3382},\n", " 'DET': {'precision': 0.9603854389721628,\n", " 'recall': 0.9542553191489361,\n", " 'f1': 0.9573105656350054,\n", " 'number': 940},\n", " 'NOUN': {'precision': 0.8789933694996986,\n", " 'recall': 0.8976608187134503,\n", " 'f1': 0.8882290239074159,\n", " 'number': 6498},\n", " 'PRON': {'precision': 0.9888991674375578,\n", " 'recall': 0.9861623616236163,\n", " 'f1': 0.9875288683602771,\n", " 'number': 1084},\n", " 'PROPN': {'precision': 0.8842357164223751,\n", " 'recall': 0.8982072318444242,\n", " 'f1': 0.891166716912873,\n", " 'number': 6582},\n", " 'NUM': {'precision': 0.9532391622016562,\n", " 'recall': 0.9688118811881188,\n", " 'f1': 0.9609624355511908,\n", " 'number': 2020},\n", " 'PUNCT': {'precision': 0.9991261796574624,\n", " 'recall': 0.9980796089385475,\n", " 'f1': 0.9986026200873362,\n", " 'number': 5728},\n", " 'AUX': {'precision': 1.0,\n", " 'recall': 0.9852941176470589,\n", " 'f1': 0.9925925925925926,\n", " 'number': 204},\n", " 'SYM': {'precision': 0.8950617283950617,\n", " 
'recall': 0.90625,\n", " 'f1': 0.9006211180124224,\n", " 'number': 160},\n", " 'X': {'precision': 0.4444444444444444,\n", " 'recall': 0.5,\n", " 'f1': 0.47058823529411764,\n", " 'number': 16},\n", " 'overall_precision': 0.9370964022140221,\n", " 'overall_recall': 0.9446123445309775,\n", " 'overall_f1': 0.9408393632416786,\n", " 'overall_accuracy': 0.9579554043839759},\n", " 'mesolitica/pos-t5-small-standard-bahasa-cased': {'Size (MB)': 141,\n", " 'PART': {'precision': 0.950920245398773,\n", " 'recall': 0.9117647058823529,\n", " 'f1': 0.9309309309309309,\n", " 'number': 170},\n", " 'SCONJ': {'precision': 0.9883481836874571,\n", " 'recall': 0.9664879356568364,\n", " 'f1': 0.9772958319213825,\n", " 'number': 1492},\n", " 'ADJ': {'precision': 0.9257425742574258,\n", " 'recall': 0.8765625,\n", " 'f1': 0.9004815409309791,\n", " 'number': 1280},\n", " 'ADP': {'precision': 0.9854219231847491,\n", " 'recall': 0.9774749721913237,\n", " 'f1': 0.9814323607427056,\n", " 'number': 3596},\n", " 'ADV': {'precision': 0.9580306698950767,\n", " 'recall': 0.942063492063492,\n", " 'f1': 0.9499799919967987,\n", " 'number': 1260},\n", " 'VERB': {'precision': 0.9693969396939695,\n", " 'recall': 0.9553518628030752,\n", " 'f1': 0.9623231571109457,\n", " 'number': 3382},\n", " 'DET': {'precision': 0.9666307857911733,\n", " 'recall': 0.9553191489361702,\n", " 'f1': 0.9609416800428037,\n", " 'number': 940},\n", " 'NOUN': {'precision': 0.892811906269791,\n", " 'recall': 0.8678054786088027,\n", " 'f1': 0.880131106602154,\n", " 'number': 6498},\n", " 'PRON': {'precision': 0.9906803355079217,\n", " 'recall': 0.9806273062730627,\n", " 'f1': 0.9856281872971719,\n", " 'number': 1084},\n", " 'PROPN': {'precision': 0.8682452062754212,\n", " 'recall': 0.9080826496505622,\n", " 'f1': 0.8877172137234517,\n", " 'number': 6582},\n", " 'NUM': {'precision': 0.9799899949974987,\n", " 'recall': 0.9698019801980198,\n", " 'f1': 0.9748693704901717,\n", " 'number': 2020},\n", " 'PUNCT': {'precision': 
0.9986033519553073,\n", " 'recall': 0.9986033519553073,\n", " 'f1': 0.9986033519553073,\n", " 'number': 5728},\n", " 'AUX': {'precision': 0.9900990099009901,\n", " 'recall': 0.9803921568627451,\n", " 'f1': 0.9852216748768472,\n", " 'number': 204},\n", " 'SYM': {'precision': 0.9246575342465754,\n", " 'recall': 0.84375,\n", " 'f1': 0.8823529411764707,\n", " 'number': 160},\n", " 'X': {'precision': 1.0, 'recall': 0.25, 'f1': 0.4, 'number': 16},\n", " 'overall_precision': 0.941408302679979,\n", " 'overall_recall': 0.9370859002673486,\n", " 'overall_f1': 0.939242128564355,\n", " 'overall_accuracy': 0.955475245653817}}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.pos.available_huggingface" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. 
Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load HuggingFace model\n", "\n", "```python\n", "def huggingface(\n", " model: str = 'mesolitica/pos-t5-small-standard-bahasa-cased',\n", " force_check: bool = True,\n", " **kwargs,\n", "):\n", " \"\"\"\n", " Load HuggingFace model to Part-of-Speech Recognition.\n", "\n", " Parameters\n", " ----------\n", " model: str, optional (default='mesolitica/pos-t5-small-standard-bahasa-cased')\n", " Check available models at `malaya.pos.available_huggingface`.\n", " force_check: bool, optional (default=True)\n", " Force check model one of malaya model.\n", " Set to False if you have your own huggingface model.\n", "\n", " Returns\n", " -------\n", " result: malaya.torch_model.huggingface.Tagging\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "model = malaya.pos.huggingface()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Predict\n", "\n", "```python\n", "def predict(self, string: str):\n", " \"\"\"\n", " Tag a string.\n", "\n", " Parameters\n", " ----------\n", " string : str\n", "\n", " Returns\n", " -------\n", " result: List[Tuple[str, str]]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. 
Default to no truncation.\n" ] }, { "data": { "text/plain": [ "[('KUALA', 'PROPN'),\n", " ('LUMPUR:', 'PROPN'),\n", " ('Sempena', 'PROPN'),\n", " ('sambutan', 'NOUN'),\n", " ('Aidilfitri', 'PROPN'),\n", " ('minggu', 'NOUN'),\n", " ('depan,', 'ADJ'),\n", " ('Perdana', 'PROPN'),\n", " ('Menteri', 'PROPN'),\n", " ('Tun', 'PROPN'),\n", " ('Dr', 'PROPN'),\n", " ('Mahathir', 'PROPN'),\n", " ('Mohamad', 'PROPN'),\n", " ('dan', 'CCONJ'),\n", " ('Menteri', 'PROPN'),\n", " ('Pengangkutan', 'PROPN'),\n", " ('Anthony', 'PROPN'),\n", " ('Loke', 'PROPN'),\n", " ('Siew', 'PROPN'),\n", " ('Fook', 'PROPN'),\n", " ('menitipkan', 'VERB'),\n", " ('pesanan', 'NOUN'),\n", " ('khas', 'ADJ'),\n", " ('kepada', 'ADP'),\n", " ('orang', 'NOUN'),\n", " ('ramai', 'NOUN'),\n", " ('yang', 'PRON'),\n", " ('mahu', 'ADV'),\n", " ('pulang', 'VERB'),\n", " ('ke', 'ADP'),\n", " ('kampung', 'NOUN'),\n", " ('halaman', 'NOUN'),\n", " ('masing-masing.', 'DET'),\n", " ('Dalam', 'ADP'),\n", " ('video', 'NOUN'),\n", " ('pendek', 'ADJ'),\n", " ('terbitan', 'NOUN'),\n", " ('Jabatan', 'PROPN'),\n", " ('Keselamatan', 'PROPN'),\n", " ('Jalan', 'PROPN'),\n", " ('Raya', 'PROPN'),\n", " ('(JKJR)', 'PUNCT'),\n", " ('itu,', 'DET'),\n", " ('Dr', 'PROPN'),\n", " ('Mahathir', 'PROPN'),\n", " ('menasihati', 'VERB'),\n", " ('mereka', 'PRON'),\n", " ('supaya', 'NOUN'),\n", " ('berhenti', 'VERB'),\n", " ('berehat', 'VERB'),\n", " ('dan', 'CCONJ'),\n", " ('tidur', 'VERB'),\n", " ('sebentar', 'ADV'),\n", " ('sekiranya', 'ADV'),\n", " ('mengantuk', 'VERB'),\n", " ('ketika', 'SCONJ'),\n", " ('memandu.', 'VERB')]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(string)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Group similar tags\n", "\n", "```python\n", "def analyze(self, string: str):\n", " \"\"\"\n", " Analyze a string.\n", "\n", " Parameters\n", " ----------\n", " string : str\n", "\n", " Returns\n", " -------\n", " result: {'words': List[str], 
'tags': [{'text': 'text', 'type': 'location', 'score': 1.0, 'beginOffset': 0, 'endOffset': 1}]}\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'text': ['KUALA', 'LUMPUR:', 'Sempena'],\n", " 'type': 'PROPN',\n", " 'score': 1.0,\n", " 'beginOffset': 0,\n", " 'endOffset': 3},\n", " {'text': ['sambutan'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 3,\n", " 'endOffset': 4},\n", " {'text': ['Aidilfitri'],\n", " 'type': 'PROPN',\n", " 'score': 1.0,\n", " 'beginOffset': 4,\n", " 'endOffset': 5},\n", " {'text': ['minggu'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 5,\n", " 'endOffset': 6},\n", " {'text': ['depan,'],\n", " 'type': 'ADJ',\n", " 'score': 1.0,\n", " 'beginOffset': 6,\n", " 'endOffset': 7},\n", " {'text': ['Perdana', 'Menteri', 'Tun', 'Dr', 'Mahathir', 'Mohamad'],\n", " 'type': 'PROPN',\n", " 'score': 1.0,\n", " 'beginOffset': 7,\n", " 'endOffset': 13},\n", " {'text': ['dan'],\n", " 'type': 'CCONJ',\n", " 'score': 1.0,\n", " 'beginOffset': 13,\n", " 'endOffset': 14},\n", " {'text': ['Menteri', 'Pengangkutan', 'Anthony', 'Loke', 'Siew', 'Fook'],\n", " 'type': 'PROPN',\n", " 'score': 1.0,\n", " 'beginOffset': 14,\n", " 'endOffset': 20},\n", " {'text': ['menitipkan'],\n", " 'type': 'VERB',\n", " 'score': 1.0,\n", " 'beginOffset': 20,\n", " 'endOffset': 21},\n", " {'text': ['pesanan'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 21,\n", " 'endOffset': 22},\n", " {'text': ['khas'],\n", " 'type': 'ADJ',\n", " 'score': 1.0,\n", " 'beginOffset': 22,\n", " 'endOffset': 23},\n", " {'text': ['kepada'],\n", " 'type': 'ADP',\n", " 'score': 1.0,\n", " 'beginOffset': 23,\n", " 'endOffset': 24},\n", " {'text': ['orang', 'ramai'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 24,\n", " 'endOffset': 26},\n", " {'text': ['yang'],\n", " 'type': 'PRON',\n", " 'score': 1.0,\n", " 'beginOffset': 26,\n", " 'endOffset': 
27},\n", " {'text': ['mahu'],\n", " 'type': 'ADV',\n", " 'score': 1.0,\n", " 'beginOffset': 27,\n", " 'endOffset': 28},\n", " {'text': ['pulang'],\n", " 'type': 'VERB',\n", " 'score': 1.0,\n", " 'beginOffset': 28,\n", " 'endOffset': 29},\n", " {'text': ['ke'],\n", " 'type': 'ADP',\n", " 'score': 1.0,\n", " 'beginOffset': 29,\n", " 'endOffset': 30},\n", " {'text': ['kampung', 'halaman'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 30,\n", " 'endOffset': 32},\n", " {'text': ['masing-masing.'],\n", " 'type': 'DET',\n", " 'score': 1.0,\n", " 'beginOffset': 32,\n", " 'endOffset': 33},\n", " {'text': ['Dalam'],\n", " 'type': 'ADP',\n", " 'score': 1.0,\n", " 'beginOffset': 33,\n", " 'endOffset': 34},\n", " {'text': ['video'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 34,\n", " 'endOffset': 35},\n", " {'text': ['pendek'],\n", " 'type': 'ADJ',\n", " 'score': 1.0,\n", " 'beginOffset': 35,\n", " 'endOffset': 36},\n", " {'text': ['terbitan'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 36,\n", " 'endOffset': 37},\n", " {'text': ['Jabatan', 'Keselamatan', 'Jalan', 'Raya'],\n", " 'type': 'PROPN',\n", " 'score': 1.0,\n", " 'beginOffset': 37,\n", " 'endOffset': 41},\n", " {'text': ['(JKJR)'],\n", " 'type': 'PUNCT',\n", " 'score': 1.0,\n", " 'beginOffset': 41,\n", " 'endOffset': 42},\n", " {'text': ['itu,'],\n", " 'type': 'DET',\n", " 'score': 1.0,\n", " 'beginOffset': 42,\n", " 'endOffset': 43},\n", " {'text': ['Dr', 'Mahathir'],\n", " 'type': 'PROPN',\n", " 'score': 1.0,\n", " 'beginOffset': 43,\n", " 'endOffset': 45},\n", " {'text': ['menasihati'],\n", " 'type': 'VERB',\n", " 'score': 1.0,\n", " 'beginOffset': 45,\n", " 'endOffset': 46},\n", " {'text': ['mereka'],\n", " 'type': 'PRON',\n", " 'score': 1.0,\n", " 'beginOffset': 46,\n", " 'endOffset': 47},\n", " {'text': ['supaya'],\n", " 'type': 'NOUN',\n", " 'score': 1.0,\n", " 'beginOffset': 47,\n", " 'endOffset': 48},\n", " {'text': ['berhenti', 'berehat'],\n", " 
'type': 'VERB',\n", " 'score': 1.0,\n", " 'beginOffset': 48,\n", " 'endOffset': 50},\n", " {'text': ['dan'],\n", " 'type': 'CCONJ',\n", " 'score': 1.0,\n", " 'beginOffset': 50,\n", " 'endOffset': 51},\n", " {'text': ['tidur'],\n", " 'type': 'VERB',\n", " 'score': 1.0,\n", " 'beginOffset': 51,\n", " 'endOffset': 52},\n", " {'text': ['sebentar', 'sekiranya'],\n", " 'type': 'ADV',\n", " 'score': 1.0,\n", " 'beginOffset': 52,\n", " 'endOffset': 54},\n", " {'text': ['mengantuk'],\n", " 'type': 'VERB',\n", " 'score': 1.0,\n", " 'beginOffset': 54,\n", " 'endOffset': 55},\n", " {'text': ['ketika'],\n", " 'type': 'SCONJ',\n", " 'score': 1.0,\n", " 'beginOffset': 55,\n", " 'endOffset': 56},\n", " {'text': ['memandu.'],\n", " 'type': 'VERB',\n", " 'score': 1.0,\n", " 'beginOffset': 56,\n", " 'endOffset': 57}]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.analyze(string)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }