Transformer#

This tutorial is available as an IPython notebook at Malaya/example/transformer.

[1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
[2]:
%%time
import malaya
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
  warn("The installed version of bitsandbytes was compiled without GPU support. "
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
CPU times: user 3.3 s, sys: 3.61 s, total: 6.9 s
Wall time: 2.64 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))

List available HuggingFace Transformer models#

[3]:
malaya.transformer.available_huggingface
[3]:
{'mesolitica/roberta-base-bahasa-cased': {'Size (MB)': 443},
 'mesolitica/roberta-tiny-bahasa-cased': {'Size (MB)': 66.1},
 'mesolitica/bert-base-standard-bahasa-cased': {'Size (MB)': 443},
 'mesolitica/bert-tiny-standard-bahasa-cased': {'Size (MB)': 66.1},
 'mesolitica/roberta-base-standard-bahasa-cased': {'Size (MB)': 443},
 'mesolitica/roberta-tiny-standard-bahasa-cased': {'Size (MB)': 66.1},
 'mesolitica/electra-base-generator-bahasa-cased': {'Size (MB)': 140},
 'mesolitica/electra-small-generator-bahasa-cased': {'Size (MB)': 19.3},
 'mesolitica/malaysian-debertav2-base': {'Size (MB)': 228}}
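
If you prefer a tabular comparison of the available checkpoints, the dictionary above can be loaded straight into pandas; a minimal sketch, assuming pandas is installed:

import pandas as pd

# the dictionary maps model name -> metadata such as size on disk
df = pd.DataFrame.from_dict(malaya.transformer.available_huggingface, orient = 'index')
df.sort_values('Size (MB)')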
[4]:
strings = ['Kerajaan galakkan rakyat naik public transport tapi parking kat lrt ada 15. Reserved utk staff rapid je dah berpuluh. Park kereta tepi jalan kang kene saman dgn majlis perbandaran. Kereta pulak senang kene curi. Cctv pun tak ada. Naik grab dah 5-10 ringgit tiap hari. Gampang juga',
           'Alaa Tun lek ahhh npe muka masam cmni kn agong kata usaha kerajaan terdahulu sejak selepas merdeka',
           "Orang ramai cakap nurse kerajaan garang. So i tell u this. Most of our local ppl will treat us as hamba abdi and they don't respect us as a nurse",
          'Pemuda mogok lapar desak kerajaan prihatin isu iklim',
          'kerajaan perlu kisah isu iklim, pemuda mogok lapar',
          'Kerajaan dicadang tubuh jawatankuasa khas tangani isu alam sekitar']

Load HuggingFace model#

def huggingface(
    model: str = 'mesolitica/electra-base-generator-bahasa-cased',
    force_check: bool = True,
    **kwargs,
):
    """
    Load transformer model.

    Parameters
    ----------
    model: str, optional (default='mesolitica/electra-base-generator-bahasa-cased')
        Check available models at `malaya.transformer.available_huggingface`.
    force_check: bool, optional (default=True)
        Force check that the model is one of the Malaya models.
        Set to False if you have your own HuggingFace model.
    """
[5]:
model = malaya.transformer.huggingface(model = 'mesolitica/electra-base-generator-bahasa-cased')
[6]:
deberta = malaya.transformer.huggingface(model = 'mesolitica/malaysian-debertav2-base')
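
If you have your own checkpoint on the HuggingFace hub, pass force_check=False as documented above; a minimal sketch, the model id below is only a hypothetical placeholder:

# hypothetical model id, replace with your own checkpoint on the HuggingFace hub
my_model = malaya.transformer.huggingface(
    model = 'your-username/your-bahasa-model',
    force_check = False,
)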

The strings above are random sentences I copied from Twitter, searched using the keyword `kerajaan`.

Vectorization#

Convert a string or a batch of strings into latent space / vector representations.

def vectorize(
    self,
    strings: List[str],
    method: str = 'last',
    method_token: str = 'first',
    t5_head_logits: bool = True,
    **kwargs,
):
    """
    Vectorize string inputs.

    Parameters
    ----------
    strings: List[str]
    method: str, optional (default='last')
        hidden layer pooling supported. Allowed values:

        * ``'last'`` - last layer.
        * ``'first'`` - first layer.
        * ``'mean'`` - average all layers.

        This is only applicable for non-T5 models.
    method_token: str, optional (default='first')
        token pooling supported. Allowed values:

        * ``'last'`` - last token.
        * ``'first'`` - first token.
        * ``'mean'`` - average all tokens.

        Pretrained models are usually trained on the `first` token for classification tasks.
        This is only applicable for non-T5 models.
    t5_head_logits: bool, optional (default=True)
        If True, will take the head logits, else the last token.
        This is only applicable for T5 models.

    Returns
    -------
    result: np.array
    """
[7]:
from sklearn.metrics.pairwise import cosine_similarity
[8]:
v = model.vectorize(strings)
v.shape
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[8]:
(6, 256)
[9]:
cosine_similarity(v)
[9]:
array([[1.0000001 , 0.72213906, 0.70548326, 0.6682125 , 0.64426583,
        0.680184  ],
       [0.72213906, 1.        , 0.6226626 , 0.71866846, 0.699285  ,
        0.710604  ],
       [0.70548326, 0.6226626 , 0.99999994, 0.6309347 , 0.63519984,
        0.6296928 ],
       [0.6682125 , 0.71866846, 0.6309347 , 0.9999999 , 0.9547027 ,
        0.85647124],
       [0.64426583, 0.699285  , 0.63519984, 0.9547027 , 1.0000002 ,
        0.8234203 ],
       [0.680184  , 0.710604  , 0.6296928 , 0.85647124, 0.8234203 ,
        1.0000001 ]], dtype=float32)
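
The pooling behaviour can be changed with the method and method_token parameters documented above; a minimal sketch that averages hidden states over all layers and tokens instead of the defaults:

# mean pooling over layers, then mean pooling over tokens
v_mean = model.vectorize(strings, method = 'mean', method_token = 'mean')
print(v_mean.shape)
cosine_similarity(v_mean)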
[10]:
v = deberta.vectorize(strings)
v.shape
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[10]:
(6, 768)
[11]:
cosine_similarity(v)
[11]:
array([[1.0000004 , 0.9992135 , 0.986294  , 0.97192407, 0.9581215 ,
        0.99513686],
       [0.9992135 , 0.9999999 , 0.9872771 , 0.97267383, 0.96142304,
        0.99459785],
       [0.986294  , 0.9872771 , 1.0000001 , 0.99598175, 0.98882604,
        0.9702525 ],
       [0.97192407, 0.97267383, 0.99598175, 0.9999999 , 0.9930133 ,
        0.95082116],
       [0.9581215 , 0.96142304, 0.98882604, 0.9930133 , 1.        ,
        0.9365195 ],
       [0.99513686, 0.99459785, 0.9702525 , 0.95082116, 0.9365195 ,
        0.9999993 ]], dtype=float32)
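
These vectors can also drive a small semantic search over the sentences; a minimal sketch using the ELECTRA model loaded above, where the query sentence is my own made-up example:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# hypothetical query, vectorized with the same model as the corpus
query = 'isu alam sekitar dan perubahan iklim'
scores = cosine_similarity(model.vectorize([query]), model.vectorize(strings))[0]
# rank the Twitter sentences by similarity to the query, highest first
for i in np.argsort(scores)[::-1]:
    print(round(float(scores[i]), 3), strings[i])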

Attention#

def attention(
    self,
    strings: List[str],
    method: str = 'last',
    method_head: str = 'mean',
    t5_attention: str = 'cross_attentions',
    **kwargs,
):
    """
    Get attention values for string inputs.

    Parameters
    ----------
    strings: List[str]
    method: str, optional (default='last')
        Attention layer supported. Allowed values:

        * ``'last'`` - attention from last layer.
        * ``'first'`` - attention from first layer.
        * ``'mean'`` - average attentions from all layers.
    method_head: str, optional (default='mean')
        attention head pooling supported. Allowed values:

        * ``'last'`` - attention from last head.
        * ``'first'`` - attention from first head.
        * ``'mean'`` - average attentions from all heads.
    t5_attention: str, optional (default='cross_attentions')
        attention type for T5 models. Allowed values:

        * ``'cross_attentions'`` - cross attention.
        * ``'encoder_attentions'`` - encoder attention.
        * ``'decoder_attentions'`` - decoder attention.

        This only applicable for T5 models.

    Returns
    -------
    result : List[List[Tuple[str, float]]]
    """

You can pass a list of strings to get the attention; in this documentation, I just want to use a single sentence.

[12]:
model.attention([strings[1]], method = 'last')
[12]:
[[('Alaa', 0.058868624),
  ('Tun', 0.061252587),
  ('lek', 0.06898942),
  ('ahhh', 0.06439799),
  ('npe', 0.05082519),
  ('muka', 0.07244483),
  ('masam', 0.053202268),
  ('cmni', 0.048232798),
  ('kn', 0.05816199),
  ('agong', 0.06559847),
  ('kata', 0.055140313),
  ('usaha', 0.057437424),
  ('kerajaan', 0.041059937),
  ('terdahulu', 0.044371374),
  ('sejak', 0.069254234),
  ('selepas', 0.06948459),
  ('merdeka', 0.061277922)]]
[13]:
model.attention([strings[1]], method = 'first')
[13]:
[[('Alaa', 0.061838076),
  ('Tun', 0.053071998),
  ('lek', 0.04778199),
  ('ahhh', 0.046944533),
  ('npe', 0.052150372),
  ('muka', 0.05392791),
  ('masam', 0.058074415),
  ('cmni', 0.08068735),
  ('kn', 0.050343554),
  ('agong', 0.054398913),
  ('kata', 0.057019),
  ('usaha', 0.05820992),
  ('kerajaan', 0.06937862),
  ('terdahulu', 0.08067024),
  ('sejak', 0.05798509),
  ('selepas', 0.06437356),
  ('merdeka', 0.053144373)]]
[14]:
model.attention([strings[1]], method = 'mean')
[14]:
[[('Alaa', 0.048754193),
  ('Tun', 0.054038025),
  ('lek', 0.053129513),
  ('ahhh', 0.057060346),
  ('npe', 0.04947073),
  ('muka', 0.060973264),
  ('masam', 0.05763235),
  ('cmni', 0.0723617),
  ('kn', 0.05290027),
  ('agong', 0.053802904),
  ('kata', 0.0701514),
  ('usaha', 0.06137535),
  ('kerajaan', 0.06380818),
  ('terdahulu', 0.06389959),
  ('sejak', 0.05665373),
  ('selepas', 0.052445903),
  ('merdeka', 0.07154253)]]
[19]:
deberta.attention([strings[1]], method = 'last')
[19]:
[[('Alaa', 0.053318705),
  ('Tun', 0.059914347),
  ('lek', 0.060762215),
  ('ahhh', 0.06112733),
  ('npe', 0.06244664),
  ('muka', 0.06293835),
  ('masam', 0.06351074),
  ('cmni', 0.062215753),
  ('kn', 0.062704325),
  ('agong', 0.06305968),
  ('kata', 0.06383533),
  ('usaha', 0.06210411),
  ('kerajaan', 0.05999032),
  ('terdahulu', 0.057203274),
  ('sejak', 0.055250105),
  ('selepas', 0.052908983),
  ('merdeka', 0.036709864)]]
[17]:
roberta = malaya.transformer.huggingface(model = 'mesolitica/roberta-base-standard-bahasa-cased')
[18]:
roberta.attention([strings[1]], method = 'last')
[18]:
[[('Alaa', 0.052424457),
  ('Tun', 0.08523697),
  ('lek', 0.06813958),
  ('ahhh', 0.06153968),
  ('npe', 0.06513652),
  ('muka', 0.059199475),
  ('masam', 0.061626367),
  ('cmni', 0.06737201),
  ('kn', 0.06622732),
  ('agong', 0.052743737),
  ('kata', 0.067238666),
  ('usaha', 0.044102512),
  ('kerajaan', 0.060376044),
  ('terdahulu', 0.04183174),
  ('sejak', 0.04189242),
  ('selepas', 0.039302666),
  ('merdeka', 0.065609865)]]
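
Since attention is returned as a list of (token, weight) pairs per string, it is easy to pull out the most attended tokens; a minimal sketch over the ELECTRA attention above:

# mean-over-layers attention for the second sentence, sorted by weight
attn = model.attention([strings[1]], method = 'mean')[0]
for token, weight in sorted(attn, key = lambda x: x[1], reverse = True)[:5]:
    print(token, round(float(weight), 4))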