\n",
"\n",
"This tutorial is available as an IPython notebook at [Malaya/example/gpt2-lm](https://github.com/huseinzol05/Malaya/tree/master/example/gpt2-lm).\n",
" \n",
"
"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ['CUDA_VISIBLE_DEVICES'] = ''"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
" warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
" self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
"/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
" self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
]
}
],
"source": [
"import malaya"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### List available GPT2 models"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'mesolitica/gpt2-117m-bahasa-cased': {'Size (MB)': 454}}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"malaya.language_model.available_gpt2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load GPT2 LM model\n",
"\n",
"```python\n",
"def gpt2(\n",
" model: str = 'mesolitica/gpt2-117m-bahasa-cased',\n",
" force_check: bool = True,\n",
" **kwargs,\n",
"):\n",
" \"\"\"\n",
" Load GPT2 language model.\n",
"\n",
" Parameters\n",
" ----------\n",
" model: str, optional (default='mesolitica/gpt2-117m-bahasa-cased')\n",
" Check available models at `malaya.language_model.available_gpt2`.\n",
" force_check: bool, optional (default=True)\n",
" Force check model one of malaya model.\n",
" Set to False if you have your own huggingface model.\n",
"\n",
" Returns\n",
" -------\n",
" result: malaya.torch_model.gpt2_lm.LM class\n",
" \"\"\"\n",
"```"
]
},
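{
"cell_type": "markdown",
"metadata": {},
"source": [
"As the docstring notes, set `force_check=False` to load your own HuggingFace model instead of an official Malaya one. A minimal sketch, where `your-username/your-gpt2-bahasa` is a hypothetical repo id:\n",
"\n",
"```python\n",
"# hypothetical repo id, replace with your own HuggingFace GPT2 checkpoint\n",
"custom_model = malaya.language_model.gpt2(\n",
"    model='your-username/your-gpt2-bahasa',\n",
"    force_check=False,\n",
")\n",
"```"
]
},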
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n",
"You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc\n"
]
}
],
"source": [
"model = malaya.language_model.gpt2()"
]
},
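{
"cell_type": "markdown",
"metadata": {},
"source": [
"`model.score` returns a log-likelihood style score for the whole string: scores are negative, and a higher (less negative) score means the model finds the text more probable. In the examples below, the correctly spelled `saya suka awak` scores higher than the misspelled `saya suke awak`."
]
},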
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-51.3840389251709"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.score('saya suke awak')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-46.20505905151367"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.score('saya suka awak')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-48.355825901031494"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.score('najib razak')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-52.79338455200195"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.score('najib comel')"
]
}
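,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because higher scores mean more probable text, the scores can be used to rank candidate sentences, for example to pick the more likely spelling. A minimal sketch, reusing `model` from above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# rank candidate sentences by model score and keep the most probable one\n",
"candidates = ['saya suke awak', 'saya suka awak']\n",
"max(candidates, key=model.score)"
]
}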
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}