Embedding#

This tutorial is available as an IPython notebook at Malaya/example/embedding.

This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.

[1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
[2]:
import logging

logging.basicConfig(level=logging.INFO)
[3]:
%%time
import malaya
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
  warn("The installed version of bitsandbytes was compiled without GPU support. "
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
CPU times: user 3.27 s, sys: 2.86 s, total: 6.13 s
Wall time: 2.83 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[4]:
string1 = 'Pemuda mogok lapar desak kerajaan prihatin isu iklim'
string2 = 'Perbincangan isu pembalakan perlu babit kerajaan negeri'
string3 = 'kerajaan perlu kisah isu iklim, pemuda mogok lapar'
string4 = 'Kerajaan dicadang tubuh jawatankuasa khas tangani isu alam sekitar'
[5]:
news1 = 'Tun Dr Mahathir Mohamad mengakui pembubaran Parlimen bagi membolehkan pilihan raya diadakan tidak sesuai dilaksanakan pada masa ini berikutan isu COVID-19'
tweet1 = 'DrM sembang pilihan raya tak boleh buat sebab COVID 19'

List available HuggingFace models#

[6]:
malaya.embedding.available_huggingface
[6]:
{'mesolitica/mistral-embedding-191m-8k-contrastive': {'Size (MB)': 334,
  'embedding size': 768,
  'Suggested length': 8192},
 'mesolitica/mistral-embedding-349m-8k-contrastive': {'Size (MB)': 633,
  'embedding size': 768,
  'Suggested length': 8192},
 'mesolitica/embedding-malaysian-mistral-64M-32k': {'Size (MB)': 96.5,
  'embedding size': 768,
  'Suggested length': 20480}}
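
Each entry maps a model name to its download size, embedding dimension, and suggested maximum input length. As a small sketch (the key and field names come straight from the output above), you can read these fields programmatically when picking a model:

info = malaya.embedding.available_huggingface['mesolitica/embedding-malaysian-mistral-64M-32k']
print(info['Size (MB)'], info['embedding size'], info['Suggested length'])
# 96.5 768 20480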
[7]:
print(malaya.embedding.info)

Entire Malaysian embedding benchmark at https://huggingface.co/spaces/mesolitica/malaysian-embedding-leaderboard

Load HuggingFace model#

def huggingface(
    model: str = 'mesolitica/embedding-malaysian-mistral-64M-32k',
    force_check: bool = True,
    **kwargs,
):
    """
    Load HuggingFace model for embedding task.

    Parameters
    ----------
    model: str, optional (default='mesolitica/embedding-malaysian-mistral-64M-32k')
        Check available models at `malaya.embedding.available_huggingface`.
    force_check: bool, optional (default=True)
        Force check that the model is one of the Malaya models.
        Set to False if you have your own HuggingFace model.

    Returns
    -------
    result: malaya.torch_model.huggingface.Embedding
    """
[8]:
model = malaya.embedding.huggingface()
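
This loads the default, mesolitica/embedding-malaysian-mistral-64M-32k. To use another model from the table above, pass its name explicitly; a minimal sketch:

# load the larger contrastive model instead of the default
model_large = malaya.embedding.huggingface(
    model='mesolitica/mistral-embedding-349m-8k-contrastive',
)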

Encode batch of strings#

def encode(self, strings: List[str]):
    """
    Encode strings into embeddings.

    Parameters
    ----------
    strings: List[str]

    Returns
    -------
    result: np.array
    """
[10]:
v = model.encode([string1, string2, string3, string4, tweet1, news1])
v.shape
[10]:
(6, 768)
[11]:
from sklearn.metrics.pairwise import cosine_similarity
[12]:
cosine_similarity(v)
[12]:
array([[1.0000004 , 0.570497  , 0.90419084, 0.6907457 , 0.5040159 ,
        0.35596827],
       [0.570497  , 0.99999976, 0.52848774, 0.75748587, 0.22503856,
        0.20589375],
       [0.90419084, 0.52848774, 1.        , 0.69484305, 0.5023028 ,
        0.4378497 ],
       [0.6907457 , 0.75748587, 0.69484305, 1.        , 0.3340272 ,
        0.28617752],
       [0.5040159 , 0.22503856, 0.5023028 , 0.3340272 , 1.        ,
        0.6531957 ],
       [0.35596827, 0.20589375, 0.4378497 , 0.28617752, 0.6531957 ,
        1.        ]], dtype=float32)
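
Entry (i, j) of this matrix is the cosine similarity between the i-th and j-th encoded strings. string1 and string3, which paraphrase each other, score about 0.904, and tweet1 is closest to its formal-language counterpart news1 at about 0.653. A minimal sketch to list the closest match for each string, using plain numpy on top of the matrix above (the labels list is illustrative):

import numpy as np

sim = cosine_similarity(v)
np.fill_diagonal(sim, -1.0)  # mask self-similarity before taking the max
labels = ['string1', 'string2', 'string3', 'string4', 'tweet1', 'news1']
for i, label in enumerate(labels):
    j = int(sim[i].argmax())
    print(f'{label} -> {labels[j]} ({sim[i, j]:.4f})')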