Language Detection

This tutorial is available as an IPython notebook at Malaya/example/language-detection.

This module is trained on both standard and local (including social media) language structures, so it is safe to use for both.

[1]:
%%time
import malaya
import fasttext
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_addons/utils/ensure_tf_install.py:68: UserWarning: Tensorflow Addons supports using Python ops for all Tensorflow versions above or equal to 2.2.0 and strictly below 2.4.0 (nightly versions are not supported).
 The versions of TensorFlow you are currently using is 2.4.1 and is not supported.
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version.
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons
  UserWarning,
CPU times: user 5.17 s, sys: 990 ms, total: 6.16 s
Wall time: 6.67 s

List the language labels that can be detected

[2]:
malaya.language_detection.label
[2]:
['eng', 'ind', 'malay', 'manglish', 'other', 'rojak']
[4]:
chinese_text = '今天是6月18号,也是Muiriel的生日!'
english_text = 'i totally love it man'
indon_text = 'menjabat saleh perombakan menjabat periode komisi energi fraksi partai pengurus partai periode periode partai terpilih periode menjabat komisi perdagangan investasi persatuan periode'
malay_text = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'
socialmedia_malay_text = 'nti aku tengok dulu tiket dari kl pukul berapa ada nahh'
socialmedia_indon_text = 'saking kangen papanya pas vc anakku nangis'
rojak_text = 'jadi aku tadi bikin ini gengs dan dijual haha salad only k dan haha drinks only k'
manglish_text = 'power lah even shopback come to edmw riao'

Load Fast-text model

Make sure fast-text is already installed; if not, simply run:

pip install fasttext
def fasttext(quantized: bool = True, **kwargs):

    """
    Load Fasttext language detection model.
    Original size is 353MB, Quantized size 31.1MB.

    Parameters
    ----------
    quantized: bool, optional (default=True)
        if True, load quantized fasttext model. Else, load original fasttext model.

    Returns
    -------
    result : malaya.model.ml.LanguageDetection class
    """

In this example, I am going to compare Malaya's fast-text model with the pretrained fastText language identification model from Facebook: https://fasttext.cc/docs/en/language-identification.html

Simply download the pretrained model:

wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz
[4]:
model = fasttext.load_model('lid.176.ftz')
fast_text = malaya.language_detection.fasttext()


[5]:
model.predict(['តើប្រព័ន្ធប្រតិបត្តិការណាដែលត្រូវគ្នាជាមួយកម្មវិធីធនាគារអេប៊ីអេ។'])
[5]:
([['__label__km']], array([[0.99841499]]))
[6]:
fast_text.predict(['តើប្រព័ន្ធប្រតិបត្តិការណាដែលត្រូវគ្នាជាមួយកម្មវិធីធនាគារអេប៊ីអេ។'])
[6]:
['other']

Language detection in Malaya does not try to cover every possible language in the world; it focuses only on hyperlocal languages.

[7]:
model.predict(['suka makan ayam dan daging'])
[7]:
([['__label__id']], array([[0.6334154]]))
[8]:
fast_text.predict_proba(['suka makan ayam dan daging'])
[8]:
[{'eng': 0.0,
  'ind': 0.0,
  'malay': 0.8817721009254456,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0}]
[9]:
model.predict(malay_text)
[9]:
(('__label__ms',), array([0.57101035]))
[10]:
fast_text.predict_proba([malay_text])
[10]:
[{'eng': 0.0,
  'ind': 0.0,
  'malay': 0.9999504089355469,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0}]
[11]:
model.predict(socialmedia_malay_text)
[11]:
(('__label__id',), array([0.7870034]))
[12]:
fast_text.predict_proba([socialmedia_malay_text])
[12]:
[{'eng': 0.0,
  'ind': 0.0,
  'malay': 0.9996305704116821,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0}]
[13]:
model.predict(socialmedia_indon_text)
[13]:
(('__label__fr',), array([0.2912012]))
[14]:
fast_text.predict_proba([socialmedia_indon_text])
[14]:
[{'eng': 0.0,
  'ind': 1.0000293254852295,
  'malay': 0.0,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0}]
[15]:
model.predict(rojak_text)
[15]:
(('__label__id',), array([0.87948251]))
[16]:
fast_text.predict_proba([rojak_text])
[16]:
[{'eng': 0.0,
  'ind': 0.0,
  'malay': 0.0,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.9994134306907654}]
[17]:
model.predict(manglish_text)
[17]:
(('__label__en',), array([0.89707506]))
[18]:
fast_text.predict_proba([manglish_text])
[18]:
[{'eng': 0.0,
  'ind': 0.0,
  'malay': 0.0,
  'manglish': 1.00004243850708,
  'other': 0.0,
  'rojak': 0.0}]
[19]:
model.predict(chinese_text)
[19]:
(('__label__zh',), array([0.97311586]))
[20]:
fast_text.predict_proba([chinese_text])
[20]:
[{'eng': 0.0,
  'ind': 0.0,
  'malay': 0.0,
  'manglish': 0.0,
  'other': 0.9921814203262329,
  'rojak': 0.0}]
[21]:
fast_text.predict_proba([indon_text,malay_text])
[21]:
[{'eng': 0.0,
  'ind': 1.0000287294387817,
  'malay': 0.0,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0},
 {'eng': 0.0,
  'ind': 0.0,
  'malay': 0.9999504089355469,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0}]

Load Deep learning model

The deep learning model is slightly more accurate than the fast-text model; see the accuracy comparison at https://malaya.readthedocs.io/en/latest/Accuracy.html#language-detection

def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, Quantized size 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.

    Returns
    -------
    result : malaya.model.tf.DeepLang class
    """
[5]:
deep = malaya.language_detection.deep_model()
quantized_deep = malaya.language_detection.deep_model(quantized = True)
[6]:
deep.predict_proba([indon_text])
[6]:
[{'eng': 3.6145184e-06,
  'ind': 0.9998913,
  'malay': 5.4685424e-05,
  'manglish': 5.768742e-09,
  'other': 5.8103424e-06,
  'rojak': 4.4987162e-05}]
[7]:
quantized_deep.predict_proba([indon_text])
[7]:
[{'eng': 3.6145184e-06,
  'ind': 0.9998913,
  'malay': 5.4685424e-05,
  'manglish': 5.768742e-09,
  'other': 5.8103424e-06,
  'rojak': 4.4987162e-05}]
[24]:
deep.predict_proba([malay_text])
[24]:
[{'eng': 9.500837e-11,
  'ind': 0.0004703698,
  'malay': 0.9991295,
  'manglish': 1.602048e-13,
  'other': 1.9133091e-07,
  'rojak': 0.0004000054}]
[8]:
quantized_deep.predict_proba([malay_text])
[8]:
[{'eng': 9.500829e-11,
  'ind': 0.00047036994,
  'malay': 0.99912965,
  'manglish': 1.6020499e-13,
  'other': 1.9133095e-07,
  'rojak': 0.00040000546}]
[25]:
deep.predict_proba([indon_text,malay_text])
[25]:
[{'eng': 3.6145207e-06,
  'ind': 0.9998909,
  'malay': 5.468535e-05,
  'manglish': 5.7687397e-09,
  'other': 5.8103406e-06,
  'rojak': 4.4987148e-05},
 {'eng': 9.500837e-11,
  'ind': 0.0004703698,
  'malay': 0.9991295,
  'manglish': 1.602048e-13,
  'other': 1.9133091e-07,
  'rojak': 0.0004000056}]
[9]:
quantized_deep.predict_proba([indon_text,malay_text])
[9]:
[{'eng': 3.614522e-06,
  'ind': 0.9998913,
  'malay': 5.4685373e-05,
  'manglish': 5.768742e-09,
  'other': 5.8103424e-06,
  'rojak': 4.4987162e-05},
 {'eng': 9.500829e-11,
  'ind': 0.00047036994,
  'malay': 0.99912965,
  'manglish': 1.6020499e-13,
  'other': 1.9133095e-07,
  'rojak': 0.0004000057}]
[26]:
deep.predict_proba([socialmedia_malay_text])
[26]:
[{'eng': 1.4520887e-09,
  'ind': 0.0064318455,
  'malay': 0.9824693,
  'manglish': 2.1923141e-13,
  'other': 1.06363805e-05,
  'rojak': 0.0110881105}]
[10]:
quantized_deep.predict_proba([socialmedia_malay_text])
[10]:
[{'eng': 1.4520903e-09,
  'ind': 0.006431847,
  'malay': 0.98246956,
  'manglish': 2.1923168e-13,
  'other': 1.0636383e-05,
  'rojak': 0.011088113}]
[27]:
deep.predict_proba([socialmedia_indon_text])
[27]:
[{'eng': 4.0632068e-07,
  'ind': 0.9999995,
  'malay': 6.871639e-10,
  'manglish': 7.4285925e-11,
  'other': 1.5928721e-07,
  'rojak': 4.892652e-10}]
[28]:
deep.predict_proba([rojak_text, malay_text])
[28]:
[{'eng': 0.0040922514,
  'ind': 0.02200061,
  'malay': 0.0027574676,
  'manglish': 9.336553e-06,
  'other': 0.00023811469,
  'rojak': 0.97090226},
 {'eng': 9.500837e-11,
  'ind': 0.0004703698,
  'malay': 0.9991295,
  'manglish': 1.602048e-13,
  'other': 1.9133091e-07,
  'rojak': 0.0004000056}]