Word-level language detection using a rule-based approach
Contents
Word-level language detection using a rule-based approach#
This tutorial is available as an IPython notebook at Malaya/example/language-detection-words.
[1]:
%%time
import malaya
CPU times: user 2.87 s, sys: 3.85 s, total: 6.73 s
Wall time: 1.97 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
Install pyenchant#
pyenchant is optional; full installation steps are available at https://pyenchant.github.io/pyenchant/install.html
Load model#
def substring_rules(model, **kwargs):
"""
detect EN, MS and OTHER languages in a string.
EN word detection uses `pyenchant` from https://pyenchant.github.io/pyenchant/ and
the user-provided language detection model.
MS word detection uses `malaya.dictionary.is_malay` and
the user-provided language detection model.
OTHER word detection uses any language detection classification model, such as
`malaya.language_detection.fasttext`.
Parameters
----------
model : Callable
Callable model, must have `predict` method.
Returns
-------
result : malaya.model.rules.LanguageDict class
"""
[2]:
fasttext = malaya.language_detection.fasttext()
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
[3]:
model = malaya.language_detection.substring_rules(model = fasttext)
Predict#
def predict(
self,
words: List[str],
acceptable_ms_label: List[str] = ['malay', 'ind'],
acceptable_en_label: List[str] = ['eng', 'manglish'],
use_is_malay: bool = True,
):
"""
Predict [EN, MS, OTHERS, CAPITAL, NOT_LANG] on the word level.
This method assumes the string is already tokenized.
Parameters
----------
words: List[str]
acceptable_ms_label: List[str], optional (default = ['malay', 'ind'])
accept labels from language detection model to assume a word is `MS`.
acceptable_en_label: List[str], optional (default = ['eng', 'manglish'])
accept labels from language detection model to assume a word is `EN`.
use_is_malay: bool, optional (default=True)
    if `True`, will predict MS words using `malaya.dictionary.is_malay`,
    else use the language detection model.
Returns
-------
result: List[str]
"""
[4]:
string = 'saya suka chicken and fish pda hari isnin'
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[4]:
[('saya', 'MS'),
('suka', 'MS'),
('chicken', 'EN'),
('and', 'EN'),
('fish', 'EN'),
('pda', 'MS'),
('hari', 'MS'),
('isnin', 'MS')]
[5]:
string = 'saya suka chicken and fish pda hari isnin , tarikh 22 mei'
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[5]:
[('saya', 'MS'),
('suka', 'MS'),
('chicken', 'EN'),
('and', 'EN'),
('fish', 'EN'),
('pda', 'MS'),
('hari', 'MS'),
('isnin', 'MS'),
(',', 'NOT_LANG'),
('tarikh', 'MS'),
('22', 'NOT_LANG'),
('mei', 'MS')]
[6]:
string = 'saya suka chicken 🐔 and fish pda hari isnin , tarikh 22 mei'
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[6]:
[('saya', 'MS'),
('suka', 'MS'),
('chicken', 'EN'),
('🐔', 'NOT_LANG'),
('and', 'EN'),
('fish', 'EN'),
('pda', 'MS'),
('hari', 'MS'),
('isnin', 'MS'),
(',', 'NOT_LANG'),
('tarikh', 'MS'),
('22', 'NOT_LANG'),
('mei', 'MS')]
Use malaya.preprocessing.Tokenizer#
To get better word tokens!
[7]:
string = 'Terminal 1 KKIA dilengkapi kemudahan 64 kaunter daftar masuk, 12 aero bridge selain mampu menampung 3,200 penumpang dalam satu masa.'
[8]:
tokenizer = malaya.preprocessing.Tokenizer()
tokenized = tokenizer.tokenize(string)
tokenized
[8]:
['Terminal',
'1',
'KKIA',
'dilengkapi',
'kemudahan',
'64',
'kaunter',
'daftar',
'masuk',
',',
'12',
'aero',
'bridge',
'selain',
'mampu',
'menampung',
'3,200',
'penumpang',
'dalam',
'satu',
'masa',
'.']
[9]:
list(zip(tokenized, model.predict(tokenized)))
[9]:
[('Terminal', 'MS'),
('1', 'NOT_LANG'),
('KKIA', 'CAPITAL'),
('dilengkapi', 'MS'),
('kemudahan', 'MS'),
('64', 'NOT_LANG'),
('kaunter', 'MS'),
('daftar', 'MS'),
('masuk', 'MS'),
(',', 'NOT_LANG'),
('12', 'NOT_LANG'),
('aero', 'OTHERS'),
('bridge', 'EN'),
('selain', 'MS'),
('mampu', 'MS'),
('menampung', 'MS'),
('3,200', 'NOT_LANG'),
('penumpang', 'MS'),
('dalam', 'MS'),
('satu', 'MS'),
('masa', 'MS'),
('.', 'NOT_LANG')]
If the string is not properly tokenized,
[10]:
splitted = string.split()
list(zip(splitted, model.predict(splitted)))
[10]:
[('Terminal', 'MS'),
('1', 'NOT_LANG'),
('KKIA', 'CAPITAL'),
('dilengkapi', 'MS'),
('kemudahan', 'MS'),
('64', 'NOT_LANG'),
('kaunter', 'MS'),
('daftar', 'MS'),
('masuk,', 'OTHERS'),
('12', 'NOT_LANG'),
('aero', 'OTHERS'),
('bridge', 'EN'),
('selain', 'MS'),
('mampu', 'MS'),
('menampung', 'MS'),
('3,200', 'NOT_LANG'),
('penumpang', 'MS'),
('dalam', 'MS'),
('satu', 'MS'),
('masa.', 'OTHERS')]
More example#
Copy pasted from Twitter.
[11]:
s = "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍"
[12]:
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[12]:
[('just', 'EN'),
('attended', 'EN'),
('my', 'EN'),
("cousin's", 'EN'),
('wedding', 'EN'),
('.', 'NOT_LANG'),
('pelik', 'MS'),
('jugak', 'MS'),
('dia', 'MS'),
('buat', 'MS'),
('majlis', 'MS'),
('biasa2', 'OTHERS'),
('je', 'MS'),
('sebab', 'MS'),
('her', 'EN'),
('lifestyle', 'EN'),
('looks', 'EN'),
('lavish', 'EN'),
('.', 'NOT_LANG'),
('then', 'EN'),
('i', 'MS'),
('found', 'EN'),
('out', 'EN'),
("they'", 'OTHERS'),
('re', 'EN'),
('going', 'EN'),
('on', 'EN'),
('a', 'EN'),
('3', 'NOT_LANG'),
('weeks', 'EN'),
('honeymoon', 'EN'),
('.', 'NOT_LANG'),
('smart', 'EN'),
('decision', 'EN'),
('👍', 'NOT_LANG')]
[13]:
s = 'Hello gais, boleh tolong recommend bengkel ketuk yang okay near Wangsa Maju / nearby? Kereta bf i pulak kepek langgar dinding hahahha'
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[13]:
[('Hello', 'EN'),
('gais', 'MS'),
(',', 'NOT_LANG'),
('boleh', 'MS'),
('tolong', 'MS'),
('recommend', 'EN'),
('bengkel', 'MS'),
('ketuk', 'MS'),
('yang', 'MS'),
('okay', 'EN'),
('near', 'EN'),
('Wangsa', 'MS'),
('Maju', 'MS'),
('/', 'NOT_LANG'),
('nearby', 'EN'),
('?', 'NOT_LANG'),
('Kereta', 'MS'),
('bf', 'MS'),
('i', 'MS'),
('pulak', 'MS'),
('kepek', 'MS'),
('langgar', 'MS'),
('dinding', 'MS'),
('hahahha', 'NOT_LANG')]
[14]:
s = 'Me after seeing this video: mm dapnya burger benjo extra mayo'
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[14]:
[('Me', 'EN'),
('after', 'EN'),
('seeing', 'EN'),
('this', 'EN'),
('video', 'MS'),
(':', 'NOT_LANG'),
('mm', 'EN'),
('dapnya', 'MS'),
('burger', 'MS'),
('benjo', 'OTHERS'),
('extra', 'EN'),
('mayo', 'EN')]
[15]:
s = 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
tokenized = tokenizer.tokenize(s)
list(zip(tokenized, model.predict(tokenized)))
[15]:
[('Hi', 'EN'),
('guys', 'EN'),
('!', 'NOT_LANG'),
('I', 'CAPITAL'),
('noticed', 'EN'),
('semalam', 'MS'),
('&', 'NOT_LANG'),
('harini', 'MS'),
('dah', 'MS'),
('ramai', 'MS'),
('yang', 'MS'),
('dapat', 'MS'),
('cookies', 'EN'),
('ni', 'MS'),
('kan', 'MS'),
('.', 'NOT_LANG'),
('So', 'MS'),
('harini', 'MS'),
('i', 'MS'),
('nak', 'MS'),
('share', 'EN'),
('some', 'EN'),
('post', 'MS'),
('mortem', 'MS'),
('of', 'EN'),
('our', 'EN'),
('first', 'EN'),
('batch', 'EN'),
(':', 'NOT_LANG')]
[ ]: