Reranker
Contents
Reranker#
This tutorial is available as an IPython notebook at Malaya/example/reranker.
This module was trained on both standard and local (including social media) language structures, so it is safe to use for both.
[1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
[2]:
import logging
logging.basicConfig(level=logging.INFO)
[3]:
%%time
import malaya
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
warn("The installed version of bitsandbytes was compiled without GPU support. "
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
CPU times: user 3.22 s, sys: 2.61 s, total: 5.84 s
Wall time: 2.92 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[4]:
string1 = 'Pemuda mogok lapar desak kerajaan prihatin isu iklim'
string2 = 'Perbincangan isu pembalakan perlu babit kerajaan negeri'
string3 = 'kerajaan perlu kisah isu iklim, pemuda mogok lapar'
string4 = 'Kerajaan dicadang tubuh jawatankuasa khas tangani isu alam sekitar'
[5]:
news1 = 'Tun Dr Mahathir Mohamad mengakui pembubaran Parlimen bagi membolehkan pilihan raya diadakan tidak sesuai dilaksanakan pada masa ini berikutan isu COVID-19'
tweet1 = 'DrM sembang pilihan raya tak boleh buat sebab COVID 19'
List available HuggingFace models#
[6]:
malaya.reranker.available_huggingface
[6]:
{'mesolitica/reranker-malaysian-mistral-64M-32k': {'Size (MB)': 95.7,
'Suggested length': 32768},
'mesolitica/reranker-malaysian-mistral-191M-32k': {'Size (MB)': 332,
'Suggested length': 32768},
'mesolitica/reranker-malaysian-mistral-474M-32k': {'Size (MB)': 884,
'Suggested length': 32768}}
Load HuggingFace model#
def huggingface(
model: str = 'mesolitica/reranker-malaysian-mistral-64M-32k',
force_check: bool = True,
**kwargs,
):
"""
Load HuggingFace model for reranking task.
Parameters
----------
model: str, optional (default='mesolitica/reranker-malaysian-mistral-64M-32k')
Check available models at `malaya.reranker.available_huggingface`.
force_check: bool, optional (default=True)
Force check that the model is one of the Malaya models.
Set to False if you have your own huggingface model.
Returns
-------
result: malaya.torch_model.huggingface.Reranker
"""
[8]:
model = malaya.reranker.huggingface()
Sort batch of strings#
def sort(self, left_string: str, right_strings: List[str]):
"""
Sort the strings.
Parameters
----------
left_string: str
reference string.
right_strings: List[str]
query strings, the list of strings to be sorted based on the reference string.
Returns
-------
result: np.array
"""
[14]:
v = model.sort(string1, [string1, string2, string3, string4, tweet1, news1])
v.shape
[14]:
(6,)
[15]:
v
[15]:
array([ 6.898162 , 5.124153 , 4.9817924, 4.0060225, -8.165119 ,
-1.5455011], dtype=float32)