{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Language Detection" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This tutorial is available as an IPython notebook at [Malaya/example/language-detection](https://github.com/huseinzol05/Malaya/tree/master/example/language-detection).\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.\n", " \n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.13 s, sys: 2.83 s, total: 5.96 s\n", "Wall time: 2.71 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" ] } ], "source": [ "%%time\n", "import malaya\n", "import fasttext" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### labels supported\n", "\n", "Default labels for language detection module." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "deep-model \n", " precision recall f1-score support\n", "\n", " eng 0.96760 0.97401 0.97080 553739\n", " ind 0.97635 0.96131 0.96877 576059\n", " malay 0.96985 0.98498 0.97736 1800649\n", " manglish 0.98036 0.96569 0.97297 181442\n", " other 0.99641 0.99627 0.99634 1428083\n", " rojak 0.94221 0.84302 0.88986 189678\n", "\n", " accuracy 0.97779 4729650\n", " macro avg 0.97213 0.95421 0.96268 4729650\n", "weighted avg 0.97769 0.97779 0.97760 4729650\n", "\n", "mesolitica/fasttext-language-detection-v1 \n", " precision recall f1-score support\n", "\n", " eng 0.94014 0.96750 0.95362 553739\n", " ind 0.97290 0.97316 0.97303 576059\n", " malay 0.98674 0.95262 0.96938 1800649\n", " manglish 0.96595 0.98417 0.97498 181442\n", " other 0.98454 0.99698 0.99072 1428083\n", " rojak 0.81149 0.91650 0.86080 189678\n", "\n", " accuracy 0.97002 4729650\n", " macro avg 0.94363 0.96515 0.95375 4729650\n", "weighted avg 0.97111 0.97002 0.97028 4729650\n", "\n", "mesolitica/fasttext-language-detection-v2 \n", " precision recall f1-score support\n", 
"\n", " local-english 0.88328 0.87926 0.88127 50429\n", " local-malay 0.93159 0.92648 0.92903 59877\n", " local-mandarin 0.62000 0.95044 0.75045 49820\n", " manglish 0.98494 0.98157 0.98325 49648\n", " other 0.99168 0.92850 0.95905 64350\n", "socialmedia-indonesian 0.97626 0.95390 0.96495 75140\n", " standard-english 0.86918 0.88018 0.87465 49776\n", " standard-indonesian 0.99695 0.99713 0.99704 50148\n", " standard-malay 0.92292 0.94851 0.93554 50049\n", " standard-mandarin 0.90855 0.53587 0.67413 53709\n", "\n", " accuracy 0.89953 552946\n", " macro avg 0.90853 0.89818 0.89494 552946\n", " weighted avg 0.91425 0.89953 0.89893 552946\n", "\n", "mesolitica/fasttext-language-detection-ms-id \n", " precision recall f1-score support\n", "\n", " local-malay 0.95063 0.93858 0.94457 199961\n", " other 0.97145 0.98889 0.98009 125920\n", "socialmedia-indonesian 0.97923 0.96303 0.97106 213486\n", " standard-indonesian 0.99119 0.99610 0.99364 149055\n", " standard-malay 0.93743 0.95669 0.94696 149336\n", "\n", " accuracy 0.96584 837758\n", " macro avg 0.96599 0.96866 0.96727 837758\n", " weighted avg 0.96591 0.96584 0.96582 837758\n", "\n", "mesolitica/fasttext-language-detection-en \n", " precision recall f1-score support\n", "\n", " local-english 0.88991 0.89457 0.89223 149823\n", " manglish 0.98619 0.98479 0.98549 149535\n", " other 0.99439 0.99268 0.99354 140651\n", "standard-english 0.89162 0.88967 0.89064 150703\n", "\n", " accuracy 0.93952 590712\n", " macro avg 0.94053 0.94043 0.94047 590712\n", " weighted avg 0.93960 0.93952 0.93955 590712\n", "\n" ] } ], "source": [ "for k, v in malaya.language_detection.metrics.items():\n", " print(k, v)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Different models support different languages." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## List available language detection models" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'mesolitica/fasttext-language-detection-v1': {'Size (MB)': 353,\n", " 'Quantized Size (MB)': 31.1,\n", " 'dim': 16,\n", " 'Label': {0: 'eng',\n", " 1: 'ind',\n", " 2: 'malay',\n", " 3: 'manglish',\n", " 4: 'other',\n", " 5: 'rojak'}},\n", " 'mesolitica/fasttext-language-detection-v2': {'Size (MB)': 1840,\n", " 'Quantized Size (MB)': 227,\n", " 'dim': 16,\n", " 'Label': {0: 'standard-english',\n", " 1: 'local-english',\n", " 2: 'manglish',\n", " 3: 'standard-indonesian',\n", " 4: 'socialmedia-indonesian',\n", " 5: 'standard-malay',\n", " 6: 'local-malay',\n", " 7: 'standard-mandarin',\n", " 8: 'local-mandarin',\n", " 9: 'other'}},\n", " 'mesolitica/fasttext-language-detection-ms-id': {'Size (MB)': 537,\n", " 'Quantized Size (MB)': 62.5,\n", " 'dim': 16,\n", " 'Label': {0: 'standard-indonesian',\n", " 1: 'socialmedia-indonesian',\n", " 2: 'standard-malay',\n", " 3: 'local-malay',\n", " 4: 'other'}},\n", " 'mesolitica/fasttext-language-detection-bahasa-en': {'Size (MB)': 537,\n", " 'Quantized Size (MB)': 62.5,\n", " 'dim': 16,\n", " 'Label': {0: 'bahasa', 1: 'english', 2: 'other'}},\n", " 'mesolitica/fasttext-language-detection-en': {'Size (MB)': 383,\n", " 'Quantized Size (MB)': 42.3,\n", " 'dim': 16,\n", " 'Label': {0: 'standard-english',\n", " 1: 'local-english',\n", " 2: 'manglish',\n", " 3: 'other'}}}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.language_detection.available_fasttext" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "chinese_text = '今天是6月18号,也是Muiriel的生日!'\n", "english_text = 'i totally love it man'\n", "indon_text = 'menjabat saleh perombakan menjabat periode komisi energi fraksi partai pengurus partai periode periode partai terpilih 
periode menjabat komisi perdagangan investasi persatuan periode'\n", "malay_text = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'\n", "socialmedia_malay_text = 'nti aku tengok dulu tiket dari kl pukul berapa ada nahh'\n", "socialmedia_indon_text = 'saking kangen papanya pas vc anakku nangis'\n", "rojak_text = 'jadi aku tadi bikin ini gengs dan dijual haha salad only k dan haha drinks only k'\n", "manglish_text = 'power lah even shopback come to edmw riao'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load Fast-text model\n", "\n", "Make sure fastText is already installed; if not, simply run,\n", "\n", "```bash\n", "pip install fasttext\n", "```\n", "\n", "```python\n", "def fasttext(quantized: bool = True, **kwargs):\n", "\n", " \"\"\"\n", " Load Fasttext language detection model.\n", " Original size is 353MB, Quantized size 31.1MB.\n", " \n", " Parameters\n", " ----------\n", " quantized: bool, optional (default=True)\n", " if True, load quantized fasttext model. Else, load original fasttext model.\n", "\n", " Returns\n", " -------\n", " result : malaya.model.ml.LanguageDetection class\n", " \"\"\"\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this example, I am going to compare against the pretrained fastText model from Facebook. 
https://fasttext.cc/docs/en/language-identification.html\n", "\n", "Simply download the pretrained model,\n", "\n", "```bash\n", "wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz\n", "```" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# !wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "model = fasttext.load_model('lid.176.ftz')\n", "fast_text = malaya.language_detection.fasttext()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Language detection in Malaya does not try to cover every possible language in the world; it focuses on hyperlocal languages.**" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "([['__label__id']], [array([0.6334154], dtype=float32)])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(['suka makan ayam dan daging'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.50445783,\n", " 'local-malay': 0.0,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0}]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba(['suka makan ayam dan daging'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('__label__ms',), array([0.57101035]))" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(malay_text)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.9099521,\n", " 'local-malay': 0.0,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0}]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba([malay_text])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('__label__id',), array([0.7870034]))" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(socialmedia_malay_text)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.0,\n", " 'local-malay': 0.9976433,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0}]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba([socialmedia_malay_text])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('__label__fr',), array([0.2912012]))" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(socialmedia_indon_text)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 1.00003,\n", " 'standard-malay': 0.0,\n", " 'local-malay': 0.0,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0}]" ] }, "execution_count": 14, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba([socialmedia_indon_text])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('__label__id',), array([0.87948251]))" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(rojak_text)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.0,\n", " 'local-malay': 0.9569701,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0}]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba([rojak_text])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('__label__en',), array([0.89707506]))" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(manglish_text)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.99997073,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.0,\n", " 'local-malay': 0.0,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0}]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba([manglish_text])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('__label__zh',), array([0.97311586]))" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"model.predict(chinese_text)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.0,\n", " 'local-malay': 0.0,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.5823944,\n", " 'other': 0.0}]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba([chinese_text])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.9755073,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.0,\n", " 'local-malay': 0.0,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0},\n", " {'standard-english': 0.0,\n", " 'local-english': 0.0,\n", " 'manglish': 0.0,\n", " 'standard-indonesian': 0.0,\n", " 'socialmedia-indonesian': 0.0,\n", " 'standard-malay': 0.9099521,\n", " 'local-malay': 0.0,\n", " 'standard-mandarin': 0.0,\n", " 'local-mandarin': 0.0,\n", " 'other': 0.0}]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fast_text.predict_proba([indon_text,malay_text])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 2 }