{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Emotion Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This tutorial is available as an IPython notebook at [Malaya/example/emotion](https://github.com/huseinzol05/Malaya/tree/master/example/emotion).\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This module was trained on both standard and local (including social media) language structures, so it is safe to use for both.\n", " \n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-09-23 18:45:27.864902: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX_VNNI FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "2023-09-23 18:45:27.931441: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", "2023-09-23 18:45:28.381244: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n", "2023-09-23 18:45:28.381275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n", "2023-09-23 18:45:28.381288: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.63 s, sys: 4.04 s, total: 7.67 s\n", "Wall time: 2.94 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n", " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" ] } ], "source": [ "%%time\n", "import malaya" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### labels supported\n", "\n", "Default labels for emotion module." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.emotion.label" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example texts\n", "\n", "Copy pasted from random tweets." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "anger_text = 'babi la company ni, aku dah la penat datang dari jauh'\n", "fear_text = 'takut doh tengok cerita hantu tadi'\n", "happy_text = 'bestnya dapat tidur harini, tak payah pergi kerja'\n", "love_text = 'aku sayang sgt dia dah doh'\n", "sadness_text = 'kecewa tengok kerajaan baru ni, janji ape pun tak dapat'\n", "surprise_text = 'sakit jantung aku, terkejut dengan cerita hantu tadi'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load multinomial model\n", "\n", "```python\n", "def multinomial(**kwargs):\n", " \"\"\"\n", " Load multinomial emotion model.\n", "\n", " Returns\n", " -------\n", " result : malaya.model.ml.Bayes class\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/.local/lib/python3.8/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator ComplementNB from version 0.22.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/home/husein/.local/lib/python3.8/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator TfidfTransformer from version 0.22.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/home/husein/.local/lib/python3.8/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator TfidfVectorizer from version 0.22.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. 
For more info please refer to:\n", "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n" ] } ], "source": [ "model = malaya.emotion.multinomial()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Predict batch of strings\n", "\n", "```python\n", "def predict(self, strings: List[str]):\n", " \"\"\"\n", " classify list of strings.\n", "\n", " Parameters\n", " ----------\n", " strings: List[str]\n", "\n", " Returns\n", " -------\n", " result: List[str]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/husein/dev/malaya/malaya/stem.py:50: FutureWarning: Possible nested set at position 3\n", " or re.findall(_expressions['ic'], word.lower())\n" ] }, { "data": { "text/plain": [ "['fear']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict([anger_text])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['fear', 'fear', 'happy', 'love', 'sadness', 'surprise']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(\n", " [anger_text, fear_text, happy_text, love_text, sadness_text, surprise_text]\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Predict batch of strings with probability\n", "\n", "```python\n", "def predict_proba(self, strings: List[str]):\n", " \"\"\"\n", " classify list of strings and return probability.\n", "\n", " Parameters\n", " ----------\n", " strings: List[str]\n", "\n", " Returns\n", " -------\n", " result: List[dict[str, float]]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'anger': 0.22968519086673891,\n", " 'fear': 0.33425478385257884,\n", " 'happy': 0.11615463884648307,\n", " 
'love': 0.10615954967244598,\n", " 'sadness': 0.10196790232932866,\n", " 'surprise': 0.11177793443242351},\n", " {'anger': 0.11379406005377896,\n", " 'fear': 0.4006934391283133,\n", " 'happy': 0.11389665647702245,\n", " 'love': 0.12481915233837086,\n", " 'sadness': 0.0991261507380643,\n", " 'surprise': 0.14767054126445014},\n", " {'anger': 0.14667998117610198,\n", " 'fear': 0.1422732633232615,\n", " 'happy': 0.29984520430807293,\n", " 'love': 0.1409005078277281,\n", " 'sadness': 0.13374705318404811,\n", " 'surprise': 0.13655399018078768},\n", " {'anger': 0.1590563839629243,\n", " 'fear': 0.14687344690114268,\n", " 'happy': 0.1419948160674701,\n", " 'love': 0.279550441361504,\n", " 'sadness': 0.1285927908584157,\n", " 'surprise': 0.14393212084854254},\n", " {'anger': 0.13425914937312508,\n", " 'fear': 0.12053328146716755,\n", " 'happy': 0.14923350911233682,\n", " 'love': 0.10289492749919464,\n", " 'sadness': 0.36961334597699913,\n", " 'surprise': 0.12346578657117815},\n", " {'anger': 0.06724850384395685,\n", " 'fear': 0.1283628050361525,\n", " 'happy': 0.05801958643852813,\n", " 'love': 0.06666524240157067,\n", " 'sadness': 0.06537667186293224,\n", " 'surprise': 0.6143271904168589}]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict_proba(\n", " [anger_text, fear_text, happy_text, love_text, sadness_text, surprise_text]\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### List available HuggingFace models" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'mesolitica/emotion-analysis-nanot5-small-malaysian-cased': {'Size (MB)': 167,\n", " 'macro precision': 0.97336,\n", " 'macro recall': 0.9737,\n", " 'macro f1-score': 0.97363},\n", " 'mesolitica/emotion-analysis-nanot5-base-malaysian-cased': {'Size (MB)': 439,\n", " 'macro precision': 0.98003,\n", " 'macro recall': 0.98311,\n", " 'macro f1-score': 0.98139}}" ] }, "execution_count": 
8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.emotion.available_huggingface" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load HuggingFace model\n", "\n", "```python\n", "def huggingface(\n", " model: str = 'mesolitica/emotion-analysis-nanot5-small-malaysian-cased',\n", " force_check: bool = True,\n", " **kwargs,\n", "):\n", " \"\"\"\n", " Load HuggingFace model to classify emotion.\n", "\n", " Parameters\n", " ----------\n", " model: str, optional (default='mesolitica/emotion-analysis-nanot5-small-malaysian-cased')\n", " Check available models at `malaya.emotion.available_huggingface`.\n", " force_check: bool, optional (default=True)\n", " Force check model one of malaya model.\n", " Set to False if you have your own huggingface model.\n", "\n", " Returns\n", " -------\n", " result: malaya.torch_model.huggingface.Classification\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "model = malaya.emotion.huggingface()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Predict batch of strings\n", "\n", "```python\n", "def predict(self, strings: List[str]):\n", " \"\"\"\n", " classify list of strings.\n", "\n", " Parameters\n", " ----------\n", " strings: List[str]\n", "\n", " Returns\n", " -------\n", " result: List[str]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['anger', 'fear', 'anger', 'love', 'sadness', 'surprise']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict(\n", " [anger_text, fear_text, happy_text, love_text, sadness_text, surprise_text]\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Predict batch of strings with probability\n", "\n", "```python\n", "def predict_proba(self, strings: List[str]):\n", " \"\"\"\n", " classify list of strings and return 
probability.\n", "\n", " Parameters\n", " ----------\n", " strings : List[str]\n", "\n", " Returns\n", " -------\n", " result: List[dict[str, float]]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'anger': 0.9920779466629028,\n", " 'fear': 0.002742587821558118,\n", " 'happy': 0.0007182527333498001,\n", " 'love': 0.003472566604614258,\n", " 'sadness': 0.0004595498612616211,\n", " 'surprise': 0.0005290955305099487},\n", " {'anger': 0.0013869482791051269,\n", " 'fear': 0.9977095127105713,\n", " 'happy': 8.731099660508335e-05,\n", " 'love': 0.0006927275680936873,\n", " 'sadness': 2.510174635972362e-05,\n", " 'surprise': 9.857082477537915e-05},\n", " {'anger': 0.9649528861045837,\n", " 'fear': 0.0035354183055460453,\n", " 'happy': 0.02452198415994644,\n", " 'love': 0.003478029975667596,\n", " 'sadness': 0.000459152739495039,\n", " 'surprise': 0.003052382031455636},\n", " {'anger': 0.0012408840702846646,\n", " 'fear': 0.0002690576366148889,\n", " 'happy': 8.375391917070374e-05,\n", " 'love': 0.9980649948120117,\n", " 'sadness': 0.00024171061522793025,\n", " 'surprise': 9.9410921393428e-05},\n", " {'anger': 0.0002834223269019276,\n", " 'fear': 0.00013902968203183264,\n", " 'happy': 1.7576363461557776e-05,\n", " 'love': 0.00012455208343453705,\n", " 'sadness': 0.9994227886199951,\n", " 'surprise': 1.2706384040939156e-05},\n", " {'anger': 0.0033617503941059113,\n", " 'fear': 0.00024840401601977646,\n", " 'happy': 9.1005269496236e-05,\n", " 'love': 0.0001304154866375029,\n", " 'sadness': 0.00013015331933274865,\n", " 'surprise': 0.9960381388664246}]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.predict_proba(\n", " [anger_text, fear_text, happy_text, love_text, sadness_text, surprise_text]\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stacking models\n", "\n", "More information, you can read at 
[https://malaya.readthedocs.io/en/latest/Stack.html](https://malaya.readthedocs.io/en/latest/Stack.html)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "multinomial = malaya.emotion.multinomial()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'anger': 0.4773527129219559,\n", " 'fear': 0.030277435484063434,\n", " 'happy': 0.00913391706396899,\n", " 'love': 0.01920016354028784,\n", " 'sadness': 0.00684538940763616,\n", " 'surprise': 0.0076903319510817905}]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.stack.predict_stack([multinomial, model], [anger_text])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'anger': 0.4773527129219559,\n", " 'fear': 0.03027742134692621,\n", " 'happy': 0.00913391706396899,\n", " 'love': 0.01920016740231647,\n", " 'sadness': 0.00684538940763616,\n", " 'surprise': 0.0076903319510817905},\n", " {'anger': 0.0061686367698025315,\n", " 'fear': 0.0040936174462977955,\n", " 'happy': 0.0016195631632911615,\n", " 'love': 0.0035799130708538637,\n", " 'sadness': 0.6077828567403819,\n", " 'surprise': 0.0012525194799234709}]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya.stack.predict_stack([multinomial, model], [anger_text, sadness_text])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", 
"varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }