{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Noisy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This tutorial is available as an IPython notebook at [Malaya/example/noisy-translation](https://github.com/huseinzol05/Malaya/tree/master/example/noisy-translation).\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.\n", " \n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "os.environ['CUDA_VISIBLE_DEVICES'] = ''" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", " warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n", "CPU times: user 2.86 s, sys: 3.84 s, total: 6.7 s\n", "Wall time: 1.93 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" ] } ], "source": [ "%%time\n", "\n", "import malaya\n", "import logging\n", "\n", "logging.basicConfig(level=logging.INFO)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### List available HuggingFace models" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'mesolitica/translation-t5-tiny-standard-bahasa-cased': {'Size (MB)': 139,\n", " 'Suggested length': 1536,\n", " 'en-ms chrF2++': 65.91,\n", " 'ms-en chrF2++': 61.3,\n", " 'ind-ms chrF2++': 58.15,\n", " 'jav-ms chrF2++': 49.33,\n", " 'pasar ms-ms chrF2++': 58.46,\n", " 'pasar ms-en chrF2++': 55.76,\n", " 'manglish-ms chrF2++': 51.04,\n", 
" 'manglish-en chrF2++': 52.2,\n", " 'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],\n", " 'to lang': ['en', 'ms']},\n", " 'mesolitica/translation-t5-small-standard-bahasa-cased': {'Size (MB)': 242,\n", " 'Suggested length': 1536,\n", " 'en-ms chrF2++': 67.37,\n", " 'ms-en chrF2++': 63.79,\n", " 'ind-ms chrF2++': 58.09,\n", " 'jav-ms chrF2++': 52.11,\n", " 'pasar ms-ms chrF2++': 62.49,\n", " 'pasar ms-en chrF2++': 60.77,\n", " 'manglish-ms chrF2++': 52.84,\n", " 'manglish-en chrF2++': 53.65,\n", " 'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],\n", " 'to lang': ['en', 'ms']},\n", " 'mesolitica/translation-t5-base-standard-bahasa-cased': {'Size (MB)': 892,\n", " 'Suggested length': 1536,\n", " 'en-ms chrF2++': 67.62,\n", " 'ms-en chrF2++': 64.41,\n", " 'ind-ms chrF2++': 59.25,\n", " 'jav-ms chrF2++': 52.86,\n", " 'pasar ms-ms chrF2++': 62.99,\n", " 'pasar ms-en chrF2++': 62.06,\n", " 'manglish-ms chrF2++': 54.4,\n", " 'manglish-en chrF2++': 54.14,\n", " 'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],\n", " 'to lang': ['en', 'ms']},\n", " 'mesolitica/translation-nanot5-tiny-malaysian-cased': {'Size (MB)': 205,\n", " 'Suggested length': 2048,\n", " 'en-ms chrF2++': 63.61,\n", " 'ms-en chrF2++': 59.55,\n", " 'ind-ms chrF2++': 56.38,\n", " 'jav-ms chrF2++': 47.68,\n", " 'mandarin-ms chrF2++': 36.61,\n", " 'mandarin-en chrF2++': 39.78,\n", " 'pasar ms-ms chrF2++': 58.74,\n", " 'pasar ms-en chrF2++': 54.87,\n", " 'manglish-ms chrF2++': 50.76,\n", " 'manglish-en chrF2++': 53.16,\n", " 'from lang': ['en',\n", " 'ms',\n", " 'ind',\n", " 'jav',\n", " 'bjn',\n", " 'manglish',\n", " 'pasar ms',\n", " 'mandarin',\n", " 'pasar mandarin'],\n", " 'to lang': ['en', 'ms']},\n", " 'mesolitica/translation-nanot5-small-malaysian-cased': {'Size (MB)': 358,\n", " 'Suggested length': 2048,\n", " 'en-ms chrF2++': 66.98,\n", " 'ms-en chrF2++': 63.52,\n", " 'ind-ms chrF2++': 58.1,\n", " 'jav-ms chrF2++': 51.55,\n", " 
'mandarin-ms chrF2++': 46.09,\n", " 'mandarin-en chrF2++': 44.13,\n", " 'pasar ms-ms chrF2++': 63.2,\n", " 'pasar ms-en chrF2++': 59.78,\n", " 'manglish-ms chrF2++': 54.09,\n", " 'manglish-en chrF2++': 55.27,\n", " 'from lang': ['en',\n", " 'ms',\n", " 'ind',\n", " 'jav',\n", " 'bjn',\n", " 'manglish',\n", " 'pasar ms',\n", " 'mandarin',\n", " 'pasar mandarin'],\n", " 'to lang': ['en', 'ms']},\n", " 'mesolitica/translation-nanot5-base-malaysian-cased': {'Size (MB)': 990,\n", " 'Suggested length': 2048,\n", " 'en-ms chrF2++': 67.87,\n", " 'ms-en chrF2++': 64.79,\n", " 'ind-ms chrF2++': 56.98,\n", " 'jav-ms chrF2++': 51.21,\n", " 'mandarin-ms chrF2++': 47.39,\n", " 'mandarin-en chrF2++': 48.78,\n", " 'pasar ms-ms chrF2++': 65.06,\n", " 'pasar ms-en chrF2++': 64.03,\n", " 'manglish-ms chrF2++': 57.91,\n", " 'manglish-en chrF2++': 55.66,\n", " 'from lang': ['en',\n", " 'ms',\n", " 'ind',\n", " 'jav',\n", " 'bjn',\n", " 'manglish',\n", " 'pasar ms',\n", " 'mandarin',\n", " 'pasar mandarin'],\n", " 'to lang': ['en', 'ms']}}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.translation.available_huggingface" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1. tested on FLORES200 pair `dev` set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/flores200-eval\n", "2. tested on noisy test set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/noisy-eval\n", "3. check out NLLB 200 metrics from `malaya.translation.nllb_metrics`.\n", "4. check out Google Translate metrics from `malaya.translation.google_translate_metrics`.\n" ] } ], "source": [ "print(malaya.translation.info)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Improvements of new model\n", "\n", "1. 
Able to translate `[en, ms, ind, jav, bjn, manglish, pasar ms, mandarin, pasar mandarin]`, while the old model is only able to translate `[en, ms, pasar ms]`.\n", "2. No longer requires the `from_lang` part of the prefix.\n", "3. Able to retain text structure as it is." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load Transformer models\n", "\n", "```python\n", "def huggingface(\n", " model: str = 'mesolitica/translation-t5-small-standard-bahasa-cased',\n", " force_check: bool = True,\n", " from_lang: List[str] = None,\n", " to_lang: List[str] = None,\n", " old_model: bool = False,\n", " **kwargs,\n", "):\n", " \"\"\"\n", " Load HuggingFace model to translate.\n", "\n", " Parameters\n", " ----------\n", " model: str, optional (default='mesolitica/translation-t5-small-standard-bahasa-cased')\n", " Check available models at `malaya.translation.available_huggingface`.\n", " force_check: bool, optional (default=True)\n", " Force check that the model is one of the malaya models.\n", " Set to False if you have your own huggingface model.\n", "\n", " Returns\n", " -------\n", " result: malaya.torch_model.huggingface.Translation\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. 
You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] } ], "source": [ "model = malaya.translation.huggingface(model = 'mesolitica/translation-nanot5-small-malaysian-cased')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Translate\n", "\n", "```python\n", "def generate(self, strings: List[str], to_lang: str = 'ms', **kwargs):\n", " \"\"\"\n", " Generate texts from the input.\n", "\n", " Parameters\n", " ----------\n", " strings : List[str]\n", " to_lang: str, optional (default='ms')\n", " target language to translate to.\n", " **kwargs: keyword arguments passed to huggingface `generate` method.\n", " Read more at https://huggingface.co/docs/transformers/main_classes/text_generation\n", "\n", " If you are using `use_ctranslate2`, keyword arguments passed to ctranslate2 `translate_batch` method.\n", " Read more at https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?highlight=translate_batch#ctranslate2.Translator.translate_batch\n", "\n", " Returns\n", " -------\n", " result: List[str]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from pprint import pprint" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Noisy Malay" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "strings = [\n", " 'ak tak paham la',\n", " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n", " \"Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\",\n", " 'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n", " 'Jadi haram jadah😀😃🤭',\n", " 'nak gi mana tuu',\n", " 'Macam nak ambil half day',\n", " \"Bayangkan PH dan menang pru-14. 
Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n", " 'mesolitica boleh buat asr tak',\n", "]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "['Saya tidak faham',\n", " 'Hi guys! Saya perasan semalam dan hari ini ramai yang dapat cookies ni kan. '\n", " 'Jadi hari ini saya ingin berkongsi beberapa post mortem dari batch pertama '\n", " 'kami:',\n", " 'Memanglah. Ini tidak perlu pakar, saya juga tahu. Ini adalah isyarat, bodoh.',\n", " 'Jam 8 di pasar KK memang ramai orang 😂, pandai dia pilih tempat.',\n", " 'Jadi haram jadah 😀😃🤭',\n", " 'Ke mana kamu pergi?',\n", " 'Saya ingin mengambil separuh hari',\n", " 'Bayangkan PH dan menang dalam PRU-14. Kemudian terdapat pelbagai pintu '\n", " 'belakang. Akhirnya, Ismail Sabri naik. Itulah sebabnya saya tidak lagi '\n", " 'peduli tentang politik. Saya bersumpah saya sudah pergi.',\n", " 'Bolehkah mesolitica digunakan untuk membuat asr?']\n", "CPU times: user 34.7 s, sys: 47 ms, total: 34.8 s\n", "Wall time: 2.94 s\n" ] } ], "source": [ "%%time\n", "\n", "pprint(model.generate(strings, to_lang = 'ms', max_length = 1000))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\"I don't understand\",\n", " 'Hi guys! I noticed that many people have received cookies yesterday and '\n", " 'today. So today I want to share some post mortem of our first batch:',\n", " \"Indeed. No need for an expert, I know. 
It's a gesture, stupid.\",\n", " \"At 8 o'clock in the KK market, it's crowded 😂, he's clever in choosing a \"\n", " 'place.',\n", " \"So it's illegal😀😃🤭\",\n", " 'Where are you going?',\n", " 'How to take half a day',\n", " 'Imagine PH and winning the 14th general election. Then there are all sorts '\n", " \"of backgazes. In the end, Ismail Sabri got in. That's why I don't care about \"\n", " \"politics anymore. I swear I'm already fucked up.\",\n", " 'Can the mesolitica make Asr?']\n", "CPU times: user 51.6 s, sys: 89.9 ms, total: 51.7 s\n", "Wall time: 4.41 s\n" ] } ], "source": [ "%%time\n", "\n", "pprint(model.generate(strings, to_lang = 'en', max_length = 1000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Manglish" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "strings = [\n", " 'i know plenty of people who snack on sambal ikan bilis.',\n", " 'I often visualize my own programs algorithm before implemment it.',\n", " 'Am I the only one who used their given name ever since I was a kid?',\n", " 'Gotta be wary of pimples. Oh they bleed bad when cut',\n", " 'Smh the dude literally has a rubbish bin infront of his house',\n", " \"I think I won't be able to catch it within 1 min lol\"\n", "]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Saya kenal ramai orang yang makan sambal ikan bilis.',\n", " 'Saya sering memvisualisasikan algoritma program saya sendiri sebelum '\n", " 'mengimplemmennya.',\n", " 'Adakah saya seorang sahaja yang menggunakan nama mereka sejak saya masih '\n", " 'kecil?',\n", " 'Kena berhati-hati dengan jerawat. 
Oh, mereka berdarah teruk apabila dipotong',\n", " 'Sial, lelaki itu benar-benar mempunyai tong sampah di depan rumahnya.',\n", " 'Saya rasa saya tidak akan dapat menangkapnya dalam masa 1 minit lol']\n", "CPU times: user 6.07 s, sys: 9.2 ms, total: 6.08 s\n", "Wall time: 519 ms\n" ] } ], "source": [ "%%time\n", "\n", "pprint(model.generate(strings, to_lang = 'ms', max_length = 1000))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['I know a lot of people who take snacks on sambal ikan bilis.',\n", " 'I often visualize my own program algorithm before impersonating it.',\n", " 'Am I the only one who has used their given name ever since I was a child?',\n", " 'You need to be cautious of pimples. Oh, they bleed badly when cut.',\n", " 'Oh my, the man is literally in a rubbish bin in front of his house.',\n", " \"I don't think I can catch it within 1 minute, haha\"]\n", "CPU times: user 8.09 s, sys: 4.09 ms, total: 8.1 s\n", "Wall time: 685 ms\n" ] } ], "source": [ "%%time\n", "\n", "pprint(model.generate(strings, to_lang = 'en', max_length = 1000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Local Mandarin" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "strings = [\n", " '某个角度漂亮,但我觉得不是很耐看。',\n", " '就是暂时好看的意思咯?',\n", " 'i think, 有狐狸般的妖媚,确实是第一人选。'\n", "]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Sudut yang cantik, tetapi saya rasa tidak begitu menarik.',\n", " 'Adakah ini bermaksud untuk sementara kelihatan cantik?',\n", " 'Saya rasa, mempunyai gadis-gadis yang sangat cantik dan cantik memang '\n", " 'menjadi pilihan pertama.']\n", "CPU times: user 4.25 s, sys: 2.94 ms, total: 4.26 s\n", "Wall time: 364 ms\n" ] } ], "source": [ "%%time\n", "\n", "pprint(model.generate(strings, to_lang = 'ms', max_length = 1000))" ] }, 
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\"A certain angle is beautiful, but I don't think it's very durable.\",\n", " 'Is it just for now good-looking?',\n", " 'I believe that having a fox-like and cute demeanor is indeed the first '\n", " 'choice.']\n", "CPU times: user 4.5 s, sys: 0 ns, total: 4.5 s\n", "Wall time: 378 ms\n" ] } ], "source": [ "%%time\n", "\n", "pprint(model.generate(strings, to_lang = 'en', max_length = 1000))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }