Rules based Normalizer
Contents
Rules based Normalizer#
This tutorial is available as an IPython notebook at Malaya/example/normalizer.
[1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
[2]:
import logging
logging.basicConfig(level=logging.INFO)
[3]:
%%time
import malaya
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
warn("The installed version of bitsandbytes was compiled without GPU support. "
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmppnxxs_oa
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmppnxxs_oa/_remote_module_non_scriptable.py
CPU times: user 2.81 s, sys: 3.92 s, total: 6.73 s
Wall time: 2.01 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[4]:
string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'
string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'
string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'
string4 = 'pada 10/4, kementerian mengumumkan, 1/100'
string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'
string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'
string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'
string8 = 'awak sangat hot ye 🔥🔥. 🔥🙂'
string9 = 'hanyalah rm2 ribu'
string10 = 'mulakn slh org boleh ,bila geng tuh kena slhkn jgk xboleh trima .. pelik , dia slhkn org bole hri2 crta sakau then bila kna bls balik xdpt jwb ,kata mcm biasa slh (parti sampah) 🤣🤣🤣 jgn mulakn dlu slhkn org kalau xboleh trima bila kna bls balik 🤣🤣🤣'
string11 = 'Pemimpin yg hebat, panahan2 fitnah tu akan dituju kepadanya.. harap DS terus bersabar. Jasa baik DS menjadi asbab di sana kelak mahupun rakyat yg terhutang budi juga..'
string12 = 'berehatlh najib.. sudah2 lh tu.. jgn buat rakyat hilang kepercyaan tu pda system kehakiman negara.. klu btl x slh kenapa x dibuktikan semasa sblm rayuan.. sudah lah tu kami dh letih dengan drama korang. ok'
string13 = 'DSNR satu satunya legasi kpd negara penyambung perjuangan bangsa melayu..jatuhnya beliau dek kerana fitnah dan dengkinya manusia..semoga Allah lindungi Najib Bin Razak dunia dan akhirat..Aamiin'
string14 = 'Muhammad Najib sbb malaysiakini dah daftar.... Klu dia fitnah...tertuduh boleh saman.... Klu berita2 yg x daftar...tu yg susah nak saman...sbb x tahu owner'
Load normalizer#
normalizer can load any spelling correction model, e.g. `malaya.spelling_correction.probability.load` or `malaya.spelling_correction.transformer.load`.
normalizer can load any stemmer model, e.g. `malaya.stem.deep_model`.
def load(
speller: Callable = None,
stemmer: Callable = None,
**kwargs,
):
"""
Load a Normalizer using any spelling correction model.
Parameters
----------
speller: Callable, optional (default=None)
function to correct spelling, must have `correct` or `normalize_elongated` method.
stemmer: Callable, optional (default=None)
function to stem, must have `stem_word` method.
If a stemmer is provided, it will be used to stem kata imbuhan akhir accurately.
Returns
-------
result: malaya.normalizer.rules.Normalizer class
"""
[5]:
lm = malaya.language_model.kenlm(model = 'bahasa-wiki-news')
[6]:
corrector = malaya.spelling_correction.probability.load(language_model = lm)
INFO:malaya_boilerplate.huggingface:downloading frozen huseinzol05/v27-preprocessing/bm_1grams.json
[7]:
stemmer = malaya.stem.huggingface()
INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/stem-lstm-512/model.pt
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[8]:
normalizer = malaya.normalizer.rules.load(corrector, stemmer)
/home/husein/dev/malaya/malaya/normalizer/rules.py:204: FutureWarning: Possible nested set at position 42
k.lower(): re.compile(_expressions[k]) for k, v in _expressions.items()
/home/husein/dev/malaya/malaya/normalizer/rules.py:204: FutureWarning: Possible nested set at position 3
k.lower(): re.compile(_expressions[k]) for k, v in _expressions.items()
normalize#
def normalize(
self,
string: str,
normalize_text: bool = True,
normalize_url: bool = False,
normalize_email: bool = False,
normalize_year: bool = True,
normalize_telephone: bool = True,
normalize_date: bool = True,
normalize_time: bool = True,
normalize_emoji: bool = True,
normalize_elongated: bool = True,
normalize_hingga: bool = True,
normalize_pada_hari_bulan: bool = True,
normalize_fraction: bool = True,
normalize_money: bool = True,
normalize_units: bool = True,
normalize_percent: bool = True,
normalize_ic: bool = True,
normalize_number: bool = True,
normalize_x_kali: bool = True,
normalize_cardinal: bool = True,
normalize_ordinal: bool = True,
normalize_entity: bool = True,
expand_contractions: bool = True,
check_english_func=is_english,
check_malay_func=is_malay,
translator: Callable = None,
language_detection_word: Callable = None,
acceptable_language_detection: List[str] = ['EN', 'CAPITAL', 'NOT_LANG'],
segmenter: Callable = None,
text_scorer: Callable = None,
text_scorer_window: int = 2,
not_a_word_threshold: float = 1e-4,
dateparser_settings={'TIMEZONE': 'GMT+8'},
**kwargs,
):
"""
Normalize a string.
Parameters
----------
string : str
normalize_text: bool, optional (default=True)
if True, will try to replace shortforms with internal corpus.
normalize_url: bool, optional (default=False)
if True, replace `://` with empty and `.` with `dot`.
`https://huseinhouse.com` -> `HTTPS huseinhouse dot com`.
normalize_email: bool, optional (default=False)
if True, replace `@` with `di`, `.` with `dot`.
`husein.zol05@gmail.com` -> `husein dot zol kosong lima di gmail dot com`.
normalize_year: bool, optional (default=True)
if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
if True, `1970-an` -> `sembilan belas tujuh puluhan`.
if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
normalize_telephone: bool, optional (default=True)
if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`
normalize_date: bool, optional (default=True)
if True, `01/12/2001` -> `satu disember dua ribu satu`.
if True, `Jun 2017` -> `satu Jun dua ribu tujuh belas`.
if True, `2017 Jun` -> `satu Jun dua ribu tujuh belas`.
if False, `2017 Jun` -> `01/06/2017`.
if False, `Jun 2017` -> `01/06/2017`.
normalize_time: bool, optional (default=True)
if True, `pukul 2.30` -> `pukul dua tiga puluh minit`.
if False, `pukul 2.30` -> `'02:00:00'`
normalize_emoji: bool, optional (default=True)
if True, `🔥` -> `emoji api`
Load from `malaya.preprocessing.demoji`.
normalize_elongated: bool, optional (default=True)
if True, `betuii` -> `betui`.
normalize_hingga: bool, optional (default=True)
if True, `2011 - 2019` -> `dua ribu sebelas hingga dua ribu sembilan belas`
normalize_pada_hari_bulan: bool, optional (default=True)
if True, `pada 10/4` -> `pada sepuluh hari bulan empat`
normalize_fraction: bool, optional (default=True)
if True, `10 /4` -> `sepuluh per empat`
normalize_money: bool, optional (default=True)
if True, `rm10.4m` -> `sepuluh juta empat ratus ribu ringgit`
normalize_units: bool, optional (default=True)
if True, `61.2 kg` -> `enam puluh satu perpuluhan dua kilogram`
normalize_percent: bool, optional (default=True)
if True, `0.8%` -> `kosong perpuluhan lapan peratus`
normalize_ic: bool, optional (default=True)
if True, `911111-01-1111` -> `sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu`
normalize_number: bool, optional (default=True)
if True, `0123` -> `kosong satu dua tiga`
normalize_x_kali: bool, optional (default=True)
if True, `10x` -> `sepuluh kali`
normalize_cardinal: bool, optional (default=True)
if True, `123` -> `seratus dua puluh tiga`
normalize_ordinal: bool, optional (default=True)
if True, `ke-123` -> `keseratus dua puluh tiga`
normalize_entity: bool, optional (default=True)
normalize entities; this only affects `date`, `datetime`, `time` and `money` pattern strings.
expand_contractions: bool, optional (default=True)
expand english contractions.
check_english_func: Callable, optional (default=malaya.text.function.is_english)
function to check a word in english dictionary, default is malaya.text.function.is_english.
this parameter also will be use for malay text normalization.
check_malay_func: Callable, optional (default=malaya.text.function.is_malay)
function to check a word in malay dictionary, default is malaya.text.function.is_malay.
translator: Callable, optional (default=None)
function to translate EN word to MS word.
language_detection_word: Callable, optional (default=None)
function to detect language for each words to get better translation results.
acceptable_language_detection: List[str], optional (default=['EN', 'CAPITAL', 'NOT_LANG'])
only translate substrings if the results from `language_detection_word` is in `acceptable_language_detection`.
segmenter: Callable, optional (default=None)
function to segmentize word.
If provided, it will expand a word, e.g. `apaitu` -> `apa itu`.
text_scorer: Callable, optional (default=None)
function to validate upper word.
If lower case score is higher or equal than upper case score, will choose lower case.
text_scorer_window: int, optional (default=2)
size of lookback and lookforward to validate upper word.
not_a_word_threshold: float, optional (default=1e-4)
assume a word is not a human word if score lower than `not_a_word_threshold`.
only usable if passed `text_scorer` parameter.
dateparser_settings: Dict, optional (default={'TIMEZONE': 'GMT+8'})
default dateparser setting, check support settings at https://dateparser.readthedocs.io/en/latest/
Returns
-------
result: {'normalize', 'date', 'money'}
"""
To get a better English checker, we recommend using https://pyenchant.github.io/pyenchant/
[9]:
import enchant
d = enchant.Dict('en_US')
is_english = lambda x: d.check(x)
is_english('lifestyle')
[9]:
True
[10]:
string = 'boleh dtg 8pagi esok tak atau minggu depan? 2 oktober 2019 2pm, tlong bayar rm 3.2k sekali tau'
[11]:
normalizer.normalize(string)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule.
[11]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
'date': {'minggu depan': datetime.datetime(2023, 10, 20, 14, 3, 50, 902256),
'8AM esok': datetime.datetime(2023, 10, 14, 8, 0),
'2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0)},
'money': {'rm 3.2k': 'RM3200.0'}}
[12]:
normalizer.normalize(string, normalize_entity = False)
[12]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
'date': {},
'money': {}}
As you can see, the Malaya normalizer converts `minggu depan` to a datetime object, and `rm 3.2k` to `RM3200.0`.
[13]:
print(normalizer.normalize(string1))
print(normalizer.normalize(string2))
print(normalizer.normalize(string3))
print(normalizer.normalize(string4))
print(normalizer.normalize(string5))
print(normalizer.normalize(string6))
print(normalizer.normalize(string7))
print(normalizer.normalize(string8))
print(normalizer.normalize(string9))
print(normalizer.normalize(string10))
print(normalizer.normalize(string11))
print(normalizer.normalize(string12))
print(normalizer.normalize(string13))
print(normalizer.normalize(string14))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya hate itu . peliklah , pada', 'date': {}, 'money': {}}
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya love them . peliklah saya', 'date': {}, 'money': {}}
{'normalize': 'perdana menteri kesebelas sgt suka makan ayam , harganya cuma lima belas ringgit lima puluh sen', 'date': {}, 'money': {'rm15.50': 'RM15.50'}}
{'normalize': 'pada sepuluh hari bulan empat , kementerian mengumumkan , satu per seratus', 'date': {}, 'money': {}}
{'normalize': 'Husein Zolkepli dapat tempat kedua belas lumba lari hari ni', 'date': {}, 'money': {}}
{'normalize': 'Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedah sekolah King Edward keempat', 'date': {}, 'money': {}}
{'normalize': 'dua jam tiga puluh minit aku tunggu kau , enam puluh perpuluhan satu kilogram kau ni , suhu harini tiga puluh satu perpuluhan dua celsius , aku dahaga minum enam ratus milliliter', 'date': {'2jam': datetime.datetime(2023, 10, 13, 12, 3, 51, 358111)}, 'money': {}}
{'normalize': 'awak sangat hot ye , emoji api , emoji api . Emoji api , emoji muka tersenyum sedikit', 'date': {}, 'money': {}}
{'normalize': 'hanyalah dua ribu ringgit', 'date': {}, 'money': {'rm2 ribu': 'RM2000.0'}}
{'normalize': 'mulakan slh org boleh , bila geng tuh kena salahkan jgk tak boleh trima . . pelik , dia salahkan org bole hari-hari cerita sakau then bila kena bilas balik tak dapat jwb , kata mcm biasa slh ( parti sampah ) , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , jgn mulakan dlu salahkan org kalau tak boleh trima bila kena bilas balik , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa', 'date': {}, 'money': {}}
{'normalize': 'Pemimpin yg hebat , panah-panahan fitnah tu akan dituju kepadanya . . harap DS terus bersabar . Jasa baik DS menjadi asbab di sana kelak mahupun rakyat yg terhutang budi juga . .', 'date': {}, 'money': {}}
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok', 'date': {}, 'money': {}}
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin', 'date': {}, 'money': {}}
{'normalize': 'Muhammad Najib sbb malaysiakini dah daftar . . . . Kalau dia fitnah . . . tertuduh boleh saman . . . . Kalau berita-berita yg tak daftar . . tu yg susah nak saman . . sbb tak tahu owner', 'date': {}, 'money': {}}
Use translator#
To use a translator, pass a callable to the `translator` parameter,
print(normalizer.normalize(string1, translator = translator))
[14]:
en_ms_vocab = malaya.translation.word(model = 'mesolitica/word-en-ms')
INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/word-en-ms/dictionary.json
[15]:
translator = lambda x: en_ms_vocab.get(x, x)
[16]:
translator('pain'), translator('aduh')
[16]:
('sakit', 'aduh')
[17]:
print(normalizer.normalize(string1, translator = translator))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya benci ia . peliklah , pada', 'date': {}, 'money': {}}
[18]:
print(normalizer.normalize(string2, translator = translator))
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya cinta mereka . peliklah saya', 'date': {}, 'money': {}}
Use Neural Translation Machine#
The problem with a dictionary-based translator is that if a word does not exist in the dictionary, the translation will not work,
[19]:
translator('love'), translator('them'), translator('pain')
[19]:
('cinta', 'mereka', 'sakit')
[20]:
nmt = malaya.translation.huggingface()
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[21]:
nmt_func = lambda x: nmt.generate([x], to_lang = 'ms', max_length = 256)[0]
[22]:
print(normalizer.normalize(string1, translator = nmt_func))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya benci ia . peliklah , pada', 'date': {}, 'money': {}}
[23]:
print(normalizer.normalize(string2, translator = nmt_func))
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya cinta mereka . peliklah Saya', 'date': {}, 'money': {}}
Use segmenter#
[24]:
print(normalizer.normalize('saya taksuka ayam, tapi saya sukaikan'))
{'normalize': 'saya taksuka ayam , tapi saya sukaikan', 'date': {}, 'money': {}}
[25]:
segmenter = malaya.segmentation.huggingface()
[26]:
segmenter_func = lambda x: segmenter.generate([x], max_length = 128)[0]
[27]:
print(normalizer.normalize('saya taksuka ayam, tapi saya sukaikan', segmenter = segmenter_func))
{'normalize': 'saya tidak suka ayam , tapi saya suka ikan', 'date': {}, 'money': {}}
Use stemmer#
By default the normalizer ignores kata imbuhan akhir, so to stem kata imbuhan akhir, provide the `stemmer` parameter.
[33]:
normalizer_without_stem = malaya.normalize.normalizer(corrector, check_malay_func = None)
normalizer_stem = malaya.normalize.normalizer(corrector, stemmer = stemmer, check_malay_func = None)
[34]:
normalizer_without_stem.normalize(string12)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[34]:
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok',
'date': {},
'money': {}}
[35]:
normalizer_stem.normalize(string12)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[35]:
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok',
'date': {},
'money': {}}
[36]:
normalizer.normalize(string13)
[36]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
'date': {},
'money': {}}
[41]:
normalizer_without_stem.normalize(string13)
[41]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
'date': {},
'money': {}}
[43]:
normalizer_stem.normalize(string13)
[43]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
'date': {},
'money': {}}
[45]:
normalizer_without_stem.normalize('seadil2nya')
[45]:
{'normalize': 'seadilnya', 'date': {}, 'money': {}}
[46]:
normalizer_stem.normalize('seadil2nya')
[46]:
{'normalize': 'seadil-adilnya', 'date': {}, 'money': {}}
Validate uppercase#
A problem with social media text is that people sometimes capitalize kata nama am (common nouns), which causes the normalizer to skip spelling correction for those words. To fix that, pass the `text_scorer` parameter.
[47]:
import math
math.exp(lm.score('hi'))
[47]:
0.00012796330028274245
[48]:
text_scorer = lambda x: lm.score(x)
[49]:
t = 'Konon nak beat the crowd, skali Kedai x bukak ahaha @ Chef Ammar Xpress Souk Cafe https://t.co/QrcBlq6ftV'
normalizer.normalize(t, text_scorer = text_scorer)
[49]:
{'normalize': 'Konon nak beat the crowd , skali kedai tak bukak haha @ Chef Ammar Xpress Souk Cafe https://t.co/QrcBlq6ftV',
'date': {},
'money': {}}
[50]:
t = '8 Emiten Cum Dividen Pekan Ini, Jangan Ketinggalan https://t.co/9BV9OqqJUG'
normalizer.normalize(t, text_scorer = text_scorer)
[50]:
{'normalize': 'lapan emiten cum dividen Pekan Ini , jangan ketinggalan https://t.co/9BV9OqqJUG',
'date': {},
'money': {}}
Validate non human word#
A non-human word like `kasdsahdas` or `kasweadsa` can be a laughing pattern or a cursing pattern, so to validate it we can use any text scorer. If the score is lower than the threshold, spelling correction is skipped for that word.
[51]:
normalizer.normalize('bodo la siallll hasdsadwq', text_scorer = text_scorer)
[51]:
{'normalize': 'bodo la sial hasdsadwq', 'date': {}, 'money': {}}
Skip spelling correction#
Simply pass `None` as the `speller` argument when loading the normalizer. By default it is `None`.
[52]:
normalizer = malaya.normalize.normalizer(corrector)
without_corrector_normalizer = malaya.normalize.normalizer(None)
[53]:
normalizer.normalize(string2, normalize_elongated = False)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[53]:
{'normalize': 'saya memang-memang tak suka makanan HUSEIN kampung tempat , saya love them . pelikla saya',
'date': {},
'money': {}}
[54]:
without_corrector_normalizer.normalize(string2)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[54]:
{'normalize': 'saya memang-memang tak suka mknn HUSEIN kampng tmpat , saya love them . pelikla saya',
'date': {},
'money': {}}
Pass kwargs preprocessing#
Let's say you want to skip normalizing date patterns; you can pass kwargs to the normalizer. Check the word tokenizer kwargs at https://malaya.readthedocs.io/en/latest/load-tokenizer-word.html
[55]:
normalizer = malaya.normalize.normalizer(corrector)
skip_date_normalizer = malaya.normalize.normalizer(corrector, date = False)
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 2558
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3088
self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[56]:
normalizer.normalize('tarikh program tersebut 14 mei')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[56]:
{'normalize': 'tarikh program tersebut empat belas Mei dua ribu dua puluh tiga',
'date': {'14 mei': datetime.datetime(2023, 5, 14, 0, 0)},
'money': {}}
[57]:
skip_date_normalizer.normalize('tarikh program tersebut 14 mei')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[57]:
{'normalize': 'tarikh program tersebut empat belas mei',
'date': {'14 mei': datetime.datetime(2023, 5, 14, 0, 0)},
'money': {}}
Normalize text#
If True,
- replace `xkisah` -> `tak kisah`.
- replace `berehatlh` -> `berehatlah`.
- replace `seadil2nya` -> `seadil-adilnya`.
- apply spelling correction if the `speller` parameter was passed.
- standardize laughing patterns.
- standardize mengeluh patterns.
- normalize titles,
{
'dr': 'Doktor',
'yb': 'Yang Berhormat',
'hj': 'Haji',
'ybm': 'Yang Berhormat Mulia',
'tyt': 'Tuan Yang Terutama',
'yab': 'Yang Berhormat',
'ybm': 'Yang Berhormat Mulia',
'yabhg': 'Yang Amat Berbahagia',
'ybhg': 'Yang Berbahagia',
'miss': 'Cik',
}
Simply `normalizer.normalize(string, normalize_text = True)`, default is `True`.
[58]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[59]:
normalizer.normalize('xkisah')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[59]:
{'normalize': 'tak kisah', 'date': {}, 'money': {}}
[60]:
normalizer.normalize('berehatlh')
[60]:
{'normalize': 'berehatlah', 'date': {}, 'money': {}}
[61]:
normalizer.normalize('seadil2nya')
[61]:
{'normalize': 'seadil-adilnya', 'date': {}, 'money': {}}
[62]:
normalizer.normalize('bukan2')
[62]:
{'normalize': 'bukan-bukan', 'date': {}, 'money': {}}
[63]:
normalizer.normalize('bukan2 wkwkwkw')
[63]:
{'normalize': 'bukan-bukan haha', 'date': {}, 'money': {}}
[64]:
normalizer.normalize('bukan2 haih')
[64]:
{'normalize': 'bukan-bukan aduh', 'date': {}, 'money': {}}
[65]:
normalizer.normalize('dia sakai hhihihu')
[65]:
{'normalize': 'dia sakai haha', 'date': {}, 'money': {}}
[66]:
normalizer.normalize('hais sorrylah')
[66]:
{'normalize': 'aduh maaflah', 'date': {}, 'money': {}}
[67]:
normalizer.normalize('Dr yahaya')
[67]:
{'normalize': 'Doktor yahaya', 'date': {}, 'money': {}}
[68]:
normalizer.normalize('mulakn slh org boleh ,bila geng tuh kena slhkn jgk xboleh trima')
[68]:
{'normalize': 'mulakan slh org boleh , bila geng tuh kena salahkan jgk tak boleh trima',
'date': {},
'money': {}}
[69]:
normalizer.normalize('aah la, bodo btul')
[69]:
{'normalize': 'betul la , bodo btul', 'date': {}, 'money': {}}
Normalize url#
Let's say you have a URL, for example `https://huseinhouse.com`. If `normalize_url` is True, the normalizer will,
- replace `://` with an empty string.
- replace `.` with `dot`.
- replace digits with their string representation.
- capitalize `https`, `http`, and `www`.

Simply `normalizer.normalize(string, normalize_url = True)`, default is `False`.
[70]:
normalizer = malaya.normalize.normalizer()
[71]:
normalizer.normalize('web saya ialah https://huseinhouse.com')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[71]:
{'normalize': 'web saya ialah https://huseinhouse.com',
'date': {},
'money': {}}
[72]:
normalizer.normalize('web saya ialah https://huseinhouse.com', normalize_url = True)
[72]:
{'normalize': 'web saya ialah HTTPS huseinhouse dot com',
'date': {},
'money': {}}
[73]:
normalizer.normalize('web saya ialah https://huseinhouse02934.com', normalize_url = True)
[73]:
{'normalize': 'web saya ialah HTTPS huseinhouse kosong dua sembilan tiga empat dot com',
'date': {},
'money': {}}
Normalize email#
Let's say you have an email address, for example `husein.zol05@gmail.com`. If `normalize_email` is True, the normalizer will,
- replace `://` with an empty string.
- replace `.` with `dot`.
- replace `@` with `di`.
- replace digits with their string representation.

Simply `normalizer.normalize(string, normalize_email = True)`, default is `False`.
[74]:
normalizer = malaya.normalize.normalizer()
[75]:
normalizer.normalize('email saya ialah husein.zol05@gmail.com')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[75]:
{'normalize': 'email saya ialah husein.zol05@gmail.com',
'date': {},
'money': {}}
[76]:
normalizer.normalize('email saya ialah husein.zol05@gmail.com', normalize_email = True)
[76]:
{'normalize': 'email saya ialah husein dot zol kosong lima di gmail dot com',
'date': {},
'money': {}}
Normalize year#
- If True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
- If True, `1970-an` -> `sembilan belas tujuh puluhan`.
- If False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
Simply call `normalizer.normalize(string, normalize_year = True)`; the default is `True`.
[77]:
normalizer = malaya.normalize.normalizer()
[78]:
normalizer.normalize('$400 pada tahun 1998 berbanding lebih $1000')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[78]:
{'normalize': 'empat ratus dollar pada tahun sembilan belas sembilan puluh lapan berbanding lebih seribu dollar',
'date': {},
'money': {'$400 ': '$400', '$1000': '$1000'}}
[79]:
normalizer.normalize('$400 pada 1970-an berbanding lebih $1000')
[79]:
{'normalize': 'empat ratus dollar pada sembilan belas tujuh puluhan berbanding lebih seribu dollar',
'date': {},
'money': {'$400 ': '$400', '$1000': '$1000'}}
[80]:
normalizer.normalize('$400 pada tahun 1970-an berbanding lebih $1000')
[80]:
{'normalize': 'empat ratus dollar pada tahun sembilan belas tujuh puluhan berbanding lebih seribu dollar',
'date': {},
'money': {'$400 ': '$400', '$1000': '$1000'}}
[81]:
normalizer.normalize('$400 pada tahun 1998 berbanding lebih $1000', normalize_year = False)
[81]:
{'normalize': 'empat ratus dollar pada tahun seribu sembilan ratus sembilan puluh lapan berbanding lebih seribu dollar',
'date': {},
'money': {'$400 ': '$400', '$1000': '$1000'}}
Normalize telephone#
if True,
no 012-1234567
->no kosong satu dua, satu dua tiga empat lima enam tujuh
.
Simply normalizer.normalize(string, normalize_telephone = True)
, default is True
.
[82]:
normalizer = malaya.normalize.normalizer()
[83]:
normalizer.normalize('no saya 012-1234567')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[83]:
{'normalize': 'no saya kosong satu dua, satu dua tiga empat lima enam tujuh',
'date': {},
'money': {}}
[84]:
normalizer.normalize('no saya 012-1234567', normalize_telephone = False)
[84]:
{'normalize': 'no saya 012-1234567', 'date': {}, 'money': {}}
Normalize date#
- If True, `01/12/2001` -> `satu disember dua ribu satu`.
- If False, the date string is normalized to the `%d/%m/%Y` format.
Simply call `normalizer.normalize(string, normalize_date = True)`; the default is `True`.
[85]:
normalizer = malaya.normalize.normalizer()
[86]:
normalizer.normalize('saya akan gerak pada 1/11/2021')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[86]:
{'normalize': 'saya akan gerak pada sebelas Januari dua ribu dua puluh satu',
'date': {'1/11/2021': datetime.datetime(2021, 1, 11, 0, 0)},
'money': {}}
[87]:
normalizer.normalize('saya akan gerak pada 1/11/2021', normalize_date = False)
[87]:
{'normalize': 'saya akan gerak pada 11/01/2021',
'date': {'1/11/2021': datetime.datetime(2021, 1, 11, 0, 0)},
'money': {}}
[88]:
normalizer.normalize('1 nov 2019')
[88]:
{'normalize': 'satu November dua ribu sembilan belas',
'date': {'1 nov 2019': datetime.datetime(2019, 11, 1, 0, 0)},
'money': {}}
[89]:
normalizer.normalize('1 nov 2019', normalize_date = False)
[89]:
{'normalize': '01/11/2019',
'date': {'1 nov 2019': datetime.datetime(2019, 11, 1, 0, 0)},
'money': {}}
[90]:
normalizer.normalize('januari 1 1996')
[90]:
{'normalize': 'satu Januari seribu sembilan ratus sembilan puluh enam',
'date': {'januari 1 1996': datetime.datetime(1996, 1, 1, 0, 0)},
'money': {}}
[91]:
normalizer.normalize('januari 1 1996', normalize_date = False)
[91]:
{'normalize': '01/01/1996',
'date': {'januari 1 1996': datetime.datetime(1996, 1, 1, 0, 0)},
'money': {}}
[92]:
normalizer.normalize('januari 2019')
[92]:
{'normalize': 'tiga belas Januari dua ribu sembilan belas',
'date': {'januari 2019': datetime.datetime(2019, 1, 13, 0, 0)},
'money': {}}
[93]:
normalizer.normalize('januari 2019', normalize_date = False)
[93]:
{'normalize': '13/01/2019',
'date': {'januari 2019': datetime.datetime(2019, 1, 13, 0, 0)},
'money': {}}
Normalize time#
- If True, `pukul 2.30` -> `pukul dua tiga puluh minit`.
- If False, `2:01pm` -> `pukul 14.01`.
Simply call `normalizer.normalize(string, normalize_time = True)`; the default is `True`.
[94]:
normalizer = malaya.normalize.normalizer()
[95]:
s = 'Operasi tamat sepenuhnya pada pukul 1.30 tengah hari'
normalizer.normalize(s, normalize_time = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[95]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul satu tiga puluh minit tengah hari',
'date': {'pukul 1:30': datetime.datetime(2023, 10, 13, 1, 30)},
'money': {}}
[96]:
s = 'Operasi tamat sepenuhnya pada pukul 1.30 tengah hari'
normalizer.normalize(s, normalize_time = False)
[96]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul 01.30 tengah hari',
'date': {'pukul 1:30': datetime.datetime(2023, 10, 13, 1, 30)},
'money': {}}
[97]:
s = 'Operasi tamat sepenuhnya pada pukul 1:30:50 tengah hari'
normalizer.normalize(s, normalize_time = True)
[97]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul satu tiga puluh minit lima puluh saat tengah hari',
'date': {'pukul 1:30:50': datetime.datetime(2023, 10, 13, 1, 30, 50)},
'money': {}}
[98]:
s = 'Operasi tamat sepenuhnya pada pukul 1:30:50 tengah hari'
normalizer.normalize(s, normalize_time = False)
[98]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul 01.30:50 tengah hari',
'date': {'pukul 1:30:50': datetime.datetime(2023, 10, 13, 1, 30, 50)},
'money': {}}
[99]:
normalizer.normalize('2:01pm')
[99]:
{'normalize': 'pukul empat belas satu minit',
'date': {'2:01pm': datetime.datetime(2023, 10, 13, 14, 1)},
'money': {}}
[100]:
normalizer.normalize('2:01pm', normalize_time = False)
[100]:
{'normalize': 'pukul 14.01',
'date': {'2:01pm': datetime.datetime(2023, 10, 13, 14, 1)},
'money': {}}
[101]:
normalizer.normalize('2AM')
[101]:
{'normalize': 'pukul dua',
'date': {'2am': datetime.datetime(2023, 10, 13, 2, 0)},
'money': {}}
[102]:
normalizer.normalize('2AM', normalize_time = False)
[102]:
{'normalize': 'pukul 02',
'date': {'2am': datetime.datetime(2023, 10, 13, 2, 0)},
'money': {}}
[103]:
normalizer.normalize('2pm')
[103]:
{'normalize': 'pukul empat belas',
'date': {'2pm': datetime.datetime(2023, 10, 13, 14, 0)},
'money': {}}
[104]:
normalizer.normalize('2pm', normalize_time = False)
[104]:
{'normalize': 'pukul 14',
'date': {'2pm': datetime.datetime(2023, 10, 13, 14, 0)},
'money': {}}
Normalize emoji#
- If True, `🔥` -> `emoji api`.
Simply call `normalizer.normalize(string, normalize_emoji = True)`; the default is `True`.
[105]:
normalizer = malaya.normalize.normalizer()
[109]:
s = 'u are really damn hot 🔥'
normalizer.normalize(s, translator = nmt_func)
[109]:
{'normalize': 'awak adalah betul-betul sial panas , emoji api',
'date': {},
'money': {}}
Normalize elongated#
Any typical elongated word is shortened, e.g., `pppeeddaaaasss` -> `pedas`. For best results, this elongated normalization requires a `speller` to be passed to the normalizer.
Simply call `normalizer.normalize(string, normalize_elongated = True)`; the default is `True`.
[110]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[111]:
normalizer.normalize('saayyyyaa ttttaaak ssssukaaa makaan pedas')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[111]:
{'normalize': 'saya tak suka makan pedas', 'date': {}, 'money': {}}
[112]:
normalizer.normalize('saayyyyaa ttttaaak ssssukaaa makaan pedas', normalize_elongated = False)
[112]:
{'normalize': 'saayyyyaa ttttaaak ssssukaaa makaan pedas',
'date': {},
'money': {}}
Normalize hingga#
If True,
- `2011 - 2019` -> `dua ribu sebelas hingga dua ribu sembilan belas`.
- `2011.01-2019` -> `dua ribu sebelas perpuluhan kosong satu hingga dua ribu sembilan belas`.
Simply call `normalizer.normalize(string, normalize_hingga = True)`; the default is `True`.
[113]:
normalizer = malaya.normalize.normalizer()
[114]:
normalizer.normalize('2011 - 2019', normalize_hingga = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[114]:
{'normalize': 'dua ribu sebelas hingga dua ribu sembilan belas',
'date': {},
'money': {}}
[115]:
normalizer.normalize('2011 - 2019', normalize_hingga = False)
[115]:
{'normalize': 'dua ribu sebelas - dua ribu sembilan belas',
'date': {},
'money': {}}
[116]:
normalizer.normalize('2011 - 2019', normalize_hingga = False, normalize_cardinal = False, normalize_ordinal = False)
[116]:
{'normalize': '2011 - 2019', 'date': {}, 'money': {}}
Normalize pada hari bulan#
If True,
pada 10/4
->pada sepuluh hari bulan empat
.
Simply normalizer.normalize(string, normalize_pada_hari_bulan = True)
, default is True
.
[117]:
normalizer = malaya.normalize.normalizer()
[118]:
normalizer.normalize('pada 10/ 4', normalize_pada_hari_bulan = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[118]:
{'normalize': 'pada sepuluh hari bulan empat', 'date': {}, 'money': {}}
[119]:
normalizer.normalize('pada 10/4', normalize_pada_hari_bulan = False)
[119]:
{'normalize': 'pada sepuluh per empat', 'date': {}, 'money': {}}
Normalize fraction#
If True,
10/4
->sepuluh per empat
.
Simply normalizer.normalize(string, normalize_fraction = True)
, default is True
.
[120]:
normalizer = malaya.normalize.normalizer()
[121]:
normalizer.normalize('10/4', normalize_fraction = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[121]:
{'normalize': 'sepuluh per empat', 'date': {}, 'money': {}}
[122]:
normalizer.normalize('201231.1 / 4', normalize_fraction = True)
[122]:
{'normalize': 'dua ratus satu ribu dua ratus tiga puluh satu perpuluhan satu per empat',
'date': {},
'money': {}}
[123]:
normalizer.normalize('201231.1 / 4', normalize_fraction = False)
[123]:
{'normalize': 'dua ratus satu ribu dua ratus tiga puluh satu perpuluhan satu / empat',
'date': {},
'money': {}}
[124]:
normalizer.normalize('201231.1 / 4', normalize_fraction = False, normalize_cardinal = False,
normalize_ordinal = False)
[124]:
{'normalize': '201231.1 / 4', 'date': {}, 'money': {}}
Normalize money#
If True,
- `RM10.5` -> `sepuluh ringgit lima puluh sen`.
- `rm 10.5 sen` -> `sepuluh ringgit lima puluh sen`.
- `20.2m ringgit` -> `dua puluh juta dua ratus ribu ringgit`.
And so much more!
Simply call `normalizer.normalize(string, normalize_money = True)`; the default is `True`.
[125]:
normalizer = malaya.normalize.normalizer()
[126]:
normalizer.normalize('RM10.5')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[126]:
{'normalize': 'sepuluh ringgit lima puluh sen',
'date': {},
'money': {'rm10.5': 'RM10.5'}}
[127]:
normalizer.normalize('rm 10.5 sen')
[127]:
{'normalize': 'sepuluh ringgit lima puluh sen',
'date': {},
'money': {'rm 10.5': 'RM10.5'}}
[128]:
normalizer.normalize('1015 sen')
[128]:
{'normalize': 'sepuluh ringgit lima belas sen',
'date': {},
'money': {'1015 sen': 'RM10.15'}}
[129]:
normalizer.normalize('rm10.4m')
[129]:
{'normalize': 'sepuluh juta empat ratus ribu ringgit',
'date': {},
'money': {'rm10.4m': 'RM10400000.0'}}
[130]:
normalizer.normalize('$10.4K')
[130]:
{'normalize': 'sepuluh ribu empat ratus dollar',
'date': {},
'money': {'$10.4k': '$10400.0'}}
[131]:
normalizer.normalize('22.5123334k ringgit')
[131]:
{'normalize': 'dua puluh dua ribu lima ratus dua belas ringgit tiga ribu tiga ratus tiga puluh empat sen',
'date': {},
'money': {'22.5123334k ringgit': 'RM22512.3334'}}
[132]:
normalizer.normalize('saya ada 20.2m ringgit')
[132]:
{'normalize': 'saya ada dua puluh juta dua ratus ribu ringgit',
'date': {},
'money': {'20.2m ringgit': 'RM20200000.0'}}
[133]:
normalizer.normalize('22.5123334k ringgit', normalize_money = False)
[133]:
{'normalize': '22.5123334k ringgit',
'date': {},
'money': {'22.5123334k ringgit': 'RM22512.3334'}}
Normalize units#
Able to normalize temperature, distance, volume, duration and weight units.
If True,
- `61.2 kg` -> `enam puluh satu perpuluhan dua kilogram`.
- `61.2km` -> `enam puluh satu perpuluhan dua kilometer`.
And so much more!
Simply call `normalizer.normalize(string, normalize_units = True)`; the default is `True`.
[134]:
normalizer = malaya.normalize.normalizer()
[135]:
normalizer.normalize('61.2 KG')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[135]:
{'normalize': 'enam puluh satu perpuluhan dua kilogram',
'date': {},
'money': {}}
[136]:
normalizer.normalize('61.2km')
[136]:
{'normalize': 'enam puluh satu perpuluhan dua kilometer',
'date': {},
'money': {}}
[137]:
normalizer.normalize('61.2c')
[137]:
{'normalize': 'enam puluh satu perpuluhan dua celsius',
'date': {},
'money': {}}
[138]:
normalizer.normalize('61.2 ml')
[138]:
{'normalize': 'enam puluh satu perpuluhan dua milliliter',
'date': {},
'money': {}}
[139]:
normalizer.normalize('61.2 l')
[139]:
{'normalize': 'enam puluh satu perpuluhan dua liter', 'date': {}, 'money': {}}
[140]:
normalizer.normalize('61.2 jam')
[140]:
{'normalize': 'enam puluh satu perpuluhan dua jam',
'date': {'61:2 jam': datetime.datetime(2023, 10, 13, 12, 9, 48, 124543)},
'money': {}}
[141]:
normalizer.normalize('61.2 hari')
[141]:
{'normalize': 'enam puluh satu perpuluhan dua hari', 'date': {}, 'money': {}}
Normalize percents#
If True,
61.2%
->enam puluh satu perpuluhan dua peratus
.
Simply normalizer.normalize(string, normalize_percent = True)
, default is True
.
[142]:
normalizer = malaya.normalize.normalizer()
[143]:
normalizer.normalize('61.2%')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[143]:
{'normalize': 'enam puluh satu perpuluhan dua peratus',
'date': {},
'money': {}}
[144]:
normalizer.normalize('61.2%', normalize_percent = False)
[144]:
{'normalize': '61.2%', 'date': {}, 'money': {}}
Normalize IC#
If True,
911111-01-1111
->sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu
.
Simply normalizer.normalize(string, normalize_ic = True)
, default is True
.
[145]:
normalizer = malaya.normalize.normalizer()
[146]:
normalizer.normalize('911111-01-1111')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[146]:
{'normalize': 'sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu',
'date': {},
'money': {}}
[147]:
normalizer.normalize('911111-01-1111', normalize_ic = False)
[147]:
{'normalize': '911111-01-1111', 'date': {}, 'money': {}}
Normalize Numbers#
If the number starts with `0`, it will be converted into its digit-by-digit string representation.
- If True, `0123` -> `kosong satu dua tiga`.
Simply call `normalizer.normalize(string, normalize_number = True)`; the default is `True`.
[148]:
normalizer = malaya.normalize.normalizer()
[149]:
normalizer.normalize('01234')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[149]:
{'normalize': 'kosong satu dua tiga empat', 'date': {}, 'money': {}}
[150]:
normalizer.normalize('01234', normalize_number = False)
[150]:
{'normalize': '01234', 'date': {}, 'money': {}}
Normalize x kali#
If the word ends with `x` and the part before it is a number, it will be converted into its string representation.
- If True, `10x` -> `sepuluh kali`.
- If False, `10x` -> `10 kali`.
Simply call `normalizer.normalize(string, normalize_x_kali = True)`; the default is `True`.
[151]:
normalizer = malaya.normalize.normalizer()
[152]:
normalizer.normalize('saya sokong 10x')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[152]:
{'normalize': 'saya sokong sepuluh kali', 'date': {}, 'money': {}}
[153]:
normalizer.normalize('saya sokong 10x', normalize_x_kali = False)
[153]:
{'normalize': 'saya sokong 10 kali', 'date': {}, 'money': {}}
Normalize Cardinals#
Any number will be converted using `malaya.num2word.to_cardinal`.
- If True, `123` -> `seratus dua puluh tiga`.
Simply call `normalizer.normalize(string, normalize_cardinal = True)`; the default is `True`.
[154]:
normalizer = malaya.normalize.normalizer()
[155]:
normalizer.normalize('123')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[155]:
{'normalize': 'seratus dua puluh tiga', 'date': {}, 'money': {}}
[156]:
normalizer.normalize('123.123421231')
[156]:
{'normalize': 'seratus dua puluh tiga perpuluhan satu dua tiga empat dua satu dua tiga satu',
'date': {},
'money': {}}
[157]:
normalizer.normalize('123.123421231', normalize_cardinal = False)
[157]:
{'normalize': '123.123421231', 'date': {}, 'money': {}}
Normalize Ordinals#
Any number will be converted using `malaya.num2word.to_ordinal`.
- If True, `123` -> `keseratus dua puluh tiga`.
- Able to normalize Roman numerals, `ke-XXI` -> `kedua puluh satu`.
Simply call `normalizer.normalize(string, normalize_ordinal = True)`; the default is `True`.
[158]:
normalizer.normalize('123', normalize_cardinal = False)
[158]:
{'normalize': 'keseratus dua puluh tiga', 'date': {}, 'money': {}}
[159]:
normalizer.normalize('123', normalize_cardinal = False, normalize_ordinal = False)
[159]:
{'normalize': '123', 'date': {}, 'money': {}}
[160]:
normalizer.normalize('ke-XXI')
[160]:
{'normalize': 'kedua puluh satu', 'date': {}, 'money': {}}
Normalize entity#
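Normalize entities. This only affects `date`, `datetime`, `time` and `money` pattern strings. When set to False, the `date` and `money` fields of the result are returned empty, as shown in the example below.
Simply call `normalizer.normalize(string, normalize_entity = True)`; the default is `True`.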
[161]:
string = 'boleh dtg 8pagi esok tak atau minggu depan? 2 oktober 2019 2pm, tlong bayar rm 3.2k sekali tau'
[162]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[163]:
normalizer.normalize(string)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[163]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
'date': {'minggu depan': datetime.datetime(2023, 10, 20, 14, 10, 18, 111175),
'8AM esok': datetime.datetime(2023, 10, 14, 8, 0),
'2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0)},
'money': {'rm 3.2k': 'RM3200.0'}}
[164]:
normalizer.normalize(string, normalize_entity = False)
[164]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
'date': {},
'money': {}}
[165]:
normalizer.normalize(string, normalize_date = False, normalize_time = False, normalize_money = False,
normalize_cardinal = False, normalize_ordinal = False)
[165]:
{'normalize': 'boleh dtg pukul 08 esok tak atau minggu depan ? 02/10/2019 pukul 14 , tolong bayar rm 3.2k sekali tau',
'date': {'minggu depan': datetime.datetime(2023, 10, 20, 14, 10, 18, 796023),
'8AM esok': datetime.datetime(2023, 10, 14, 8, 0),
'2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0)},
'money': {'rm 3.2k': 'RM3200.0'}}