Rule-based word-level Language Detection#

This tutorial is available as an IPython notebook at Malaya/example/language-detection-words.

[1]:
%%time
import malaya
CPU times: user 2.87 s, sys: 3.85 s, total: 6.73 s
Wall time: 1.97 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))

Install pyenchant#

pyenchant is an optional dependency; full installation steps are at https://pyenchant.github.io/pyenchant/install.html
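
If pyenchant is installed correctly, a quick sanity check like the sketch below should work; `en_US` is only an assumed dictionary name, any installed English dictionary will do.

import enchant

d = enchant.Dict('en_US')   # assumes an en_US dictionary is available on this machine
print(d.check('chicken'))   # True, recognized as an English word
print(d.check('isnin'))     # False, not an English word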

Load model#

def substring_rules(model, **kwargs):
    """
    Detect EN, MS and OTHER languages in a string.

    EN word detection uses `pyenchant` from https://pyenchant.github.io/pyenchant/ and
    the user language detection model.

    MS word detection uses `malaya.dictionary.is_malay` and
    the user language detection model.

    OTHER word detection uses any language detection classification model, such as
    `malaya.language_detection.fasttext`.

    Parameters
    ----------
    model : Callable
        Callable model, must have `predict` method.

    Returns
    -------
    result : malaya.model.rules.LanguageDict class
    """
[2]:
fasttext = malaya.language_detection.fasttext()
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
[3]:
model = malaya.language_detection.substring_rules(model = fasttext)

Predict#

def predict(
    self,
    words: List[str],
    acceptable_ms_label: List[str] = ['malay', 'ind'],
    acceptable_en_label: List[str] = ['eng', 'manglish'],
    use_is_malay: bool = True,
):
    """
    Predict [EN, MS, OTHERS, CAPITAL, NOT_LANG] at word level.
    This method assumes the string has already been tokenized.

    Parameters
    ----------
    words: List[str]
    acceptable_ms_label: List[str], optional (default = ['malay', 'ind'])
        accept labels from language detection model to assume a word is `MS`.
    acceptable_en_label: List[str], optional (default = ['eng', 'manglish'])
        accept labels from language detection model to assume a word is `EN`.
    use_is_malay: bool, optional (default=True)
        if `True`, will predict MS words using `malaya.dictionary.is_malay`,
        else use the language detection model.

    Returns
    -------
    result: List[str]
    """
[4]:
string = 'saya suka chicken and fish pda hari isnin'
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[4]:
[('saya', 'MS'),
 ('suka', 'MS'),
 ('chicken', 'EN'),
 ('and', 'EN'),
 ('fish', 'EN'),
 ('pda', 'MS'),
 ('hari', 'MS'),
 ('isnin', 'MS')]
[5]:
string = 'saya suka chicken and fish pda hari isnin , tarikh 22 mei'
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[5]:
[('saya', 'MS'),
 ('suka', 'MS'),
 ('chicken', 'EN'),
 ('and', 'EN'),
 ('fish', 'EN'),
 ('pda', 'MS'),
 ('hari', 'MS'),
 ('isnin', 'MS'),
 (',', 'NOT_LANG'),
 ('tarikh', 'MS'),
 ('22', 'NOT_LANG'),
 ('mei', 'MS')]
[6]:
string = 'saya suka chicken 🐔 and fish pda hari isnin , tarikh 22 mei'
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[6]:
[('saya', 'MS'),
 ('suka', 'MS'),
 ('chicken', 'EN'),
 ('🐔', 'NOT_LANG'),
 ('and', 'EN'),
 ('fish', 'EN'),
 ('pda', 'MS'),
 ('hari', 'MS'),
 ('isnin', 'MS'),
 (',', 'NOT_LANG'),
 ('tarikh', 'MS'),
 ('22', 'NOT_LANG'),
 ('mei', 'MS')]
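
The optional arguments documented in `predict` above can also be tuned. The sketch below is an untested example that sets `use_is_malay=False`, so MS words are decided by the fasttext model alone instead of the `malaya.dictionary.is_malay` lookup; labels may differ from the runs above.

splitted = 'saya suka chicken and fish pda hari isnin'.split()
list(zip(splitted, model.predict(splitted, use_is_malay=False)))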

Use malaya.preprocessing.Tokenizer#

To get better word tokens!

[7]:
string = 'Terminal 1 KKIA dilengkapi kemudahan 64 kaunter daftar masuk, 12 aero bridge selain mampu menampung 3,200 penumpang dalam satu masa.'
[8]:
tokenizer = malaya.preprocessing.Tokenizer()
tokenized = tokenizer.tokenize(string)
tokenized
[8]:
['Terminal',
 '1',
 'KKIA',
 'dilengkapi',
 'kemudahan',
 '64',
 'kaunter',
 'daftar',
 'masuk',
 ',',
 '12',
 'aero',
 'bridge',
 'selain',
 'mampu',
 'menampung',
 '3,200',
 'penumpang',
 'dalam',
 'satu',
 'masa',
 '.']
[9]:
list(zip(tokenized, model.predict(tokenized)))
[9]:
[('Terminal', 'MS'),
 ('1', 'NOT_LANG'),
 ('KKIA', 'CAPITAL'),
 ('dilengkapi', 'MS'),
 ('kemudahan', 'MS'),
 ('64', 'NOT_LANG'),
 ('kaunter', 'MS'),
 ('daftar', 'MS'),
 ('masuk', 'MS'),
 (',', 'NOT_LANG'),
 ('12', 'NOT_LANG'),
 ('aero', 'OTHERS'),
 ('bridge', 'EN'),
 ('selain', 'MS'),
 ('mampu', 'MS'),
 ('menampung', 'MS'),
 ('3,200', 'NOT_LANG'),
 ('penumpang', 'MS'),
 ('dalam', 'MS'),
 ('satu', 'MS'),
 ('masa', 'MS'),
 ('.', 'NOT_LANG')]

If the string is not properly tokenized, trailing punctuation stays attached to the words and those tokens fall back to OTHERS:

[10]:
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[10]:
[('Terminal', 'MS'),
 ('1', 'NOT_LANG'),
 ('KKIA', 'CAPITAL'),
 ('dilengkapi', 'MS'),
 ('kemudahan', 'MS'),
 ('64', 'NOT_LANG'),
 ('kaunter', 'MS'),
 ('daftar', 'MS'),
 ('masuk,', 'OTHERS'),
 ('12', 'NOT_LANG'),
 ('aero', 'OTHERS'),
 ('bridge', 'EN'),
 ('selain', 'MS'),
 ('mampu', 'MS'),
 ('menampung', 'MS'),
 ('3,200', 'NOT_LANG'),
 ('penumpang', 'MS'),
 ('dalam', 'MS'),
 ('satu', 'MS'),
 ('masa.', 'OTHERS')]

More examples#

Copy-pasted from Twitter.

[11]:
s = "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍"
[12]:
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[12]:
[('just', 'EN'),
 ('attended', 'EN'),
 ('my', 'EN'),
 ("cousin's", 'EN'),
 ('wedding', 'EN'),
 ('.', 'NOT_LANG'),
 ('pelik', 'MS'),
 ('jugak', 'MS'),
 ('dia', 'MS'),
 ('buat', 'MS'),
 ('majlis', 'MS'),
 ('biasa2', 'OTHERS'),
 ('je', 'MS'),
 ('sebab', 'MS'),
 ('her', 'EN'),
 ('lifestyle', 'EN'),
 ('looks', 'EN'),
 ('lavish', 'EN'),
 ('.', 'NOT_LANG'),
 ('then', 'EN'),
 ('i', 'MS'),
 ('found', 'EN'),
 ('out', 'EN'),
 ("they'", 'OTHERS'),
 ('re', 'EN'),
 ('going', 'EN'),
 ('on', 'EN'),
 ('a', 'EN'),
 ('3', 'NOT_LANG'),
 ('weeks', 'EN'),
 ('honeymoon', 'EN'),
 ('.', 'NOT_LANG'),
 ('smart', 'EN'),
 ('decision', 'EN'),
 ('👍', 'NOT_LANG')]
[13]:
s = 'Hello gais, boleh tolong recommend bengkel ketuk yang okay near Wangsa Maju / nearby? Kereta bf i pulak kepek langgar dinding hahahha'
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[13]:
[('Hello', 'EN'),
 ('gais', 'MS'),
 (',', 'NOT_LANG'),
 ('boleh', 'MS'),
 ('tolong', 'MS'),
 ('recommend', 'EN'),
 ('bengkel', 'MS'),
 ('ketuk', 'MS'),
 ('yang', 'MS'),
 ('okay', 'EN'),
 ('near', 'EN'),
 ('Wangsa', 'MS'),
 ('Maju', 'MS'),
 ('/', 'NOT_LANG'),
 ('nearby', 'EN'),
 ('?', 'NOT_LANG'),
 ('Kereta', 'MS'),
 ('bf', 'MS'),
 ('i', 'MS'),
 ('pulak', 'MS'),
 ('kepek', 'MS'),
 ('langgar', 'MS'),
 ('dinding', 'MS'),
 ('hahahha', 'NOT_LANG')]
[14]:
s = 'Me after seeing this video: mm dapnya burger benjo extra mayo'
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[14]:
[('Me', 'EN'),
 ('after', 'EN'),
 ('seeing', 'EN'),
 ('this', 'EN'),
 ('video', 'MS'),
 (':', 'NOT_LANG'),
 ('mm', 'EN'),
 ('dapnya', 'MS'),
 ('burger', 'MS'),
 ('benjo', 'OTHERS'),
 ('extra', 'EN'),
 ('mayo', 'EN')]
[15]:
s = 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[15]:
[('Hi', 'EN'),
 ('guys', 'EN'),
 ('!', 'NOT_LANG'),
 ('I', 'CAPITAL'),
 ('noticed', 'EN'),
 ('semalam', 'MS'),
 ('&', 'NOT_LANG'),
 ('harini', 'MS'),
 ('dah', 'MS'),
 ('ramai', 'MS'),
 ('yang', 'MS'),
 ('dapat', 'MS'),
 ('cookies', 'EN'),
 ('ni', 'MS'),
 ('kan', 'MS'),
 ('.', 'NOT_LANG'),
 ('So', 'MS'),
 ('harini', 'MS'),
 ('i', 'MS'),
 ('nak', 'MS'),
 ('share', 'EN'),
 ('some', 'EN'),
 ('post', 'MS'),
 ('mortem', 'MS'),
 ('of', 'EN'),
 ('our', 'EN'),
 ('first', 'EN'),
 ('batch', 'EN'),
 (':', 'NOT_LANG')]
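
As a small follow-up that is not part of Malaya, the word-level labels can be grouped into contiguous same-language spans, for example with `itertools.groupby`:

from itertools import groupby

labels = model.predict(tokenized)
spans = [
    (label, ' '.join(word for word, _ in group))
    for label, group in groupby(zip(tokenized, labels), key=lambda x: x[1])
]
spans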
[ ]: