Rules based Normalizer#

This tutorial is available as an IPython notebook at Malaya/example/normalizer.

[1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
[2]:
import logging

logging.basicConfig(level=logging.INFO)
[3]:
%%time
import malaya
CPU times: user 4.01 s, sys: 2.41 s, total: 6.42 s
Wall time: 3.57 s
[4]:
string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'
string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'
string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'
string4 = 'pada 10/4, kementerian mengumumkan, 1/100'
string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'
string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'
string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'
string8 = 'awak sangat hot ye 🔥🔥. 🔥🙂'
string9 = 'hanyalah rm2 ribu'
string10 = 'mulakn slh org boleh ,bila geng tuh kena slhkn jgk xboleh trima .. pelik , dia slhkn org bole hri2 crta sakau then bila kna bls balik xdpt jwb ,kata mcm biasa slh (parti sampah) 🤣🤣🤣 jgn mulakn dlu slhkn org kalau xboleh trima bila kna bls balik 🤣🤣🤣'
string11 = 'Pemimpin yg hebat, panahan2 fitnah tu akan dituju kepadanya.. harap DS terus bersabar. Jasa baik DS menjadi asbab di sana kelak mahupun rakyat yg terhutang budi juga..'
string12 = 'berehatlh najib.. sudah2 lh tu.. jgn buat rakyat hilang kepercyaan tu pda system kehakiman negara.. klu btl x slh kenapa x dibuktikan semasa sblm rayuan.. sudah lah tu kami dh letih dengan drama korang. ok'
string13 = 'DSNR satu satunya legasi kpd negara penyambung perjuangan bangsa melayu..jatuhnya beliau dek kerana fitnah dan dengkinya manusia..semoga Allah lindungi Najib Bin Razak dunia dan akhirat..Aamiin'
string14 = 'Muhammad Najib sbb malaysiakini dah daftar.... Klu dia fitnah...tertuduh boleh saman.... Klu berita2 yg x daftar...tu yg susah nak saman...sbb x tahu owner'

Load normalizer#

  1. normalizer can load any spelling correction model, eg, malaya.spelling_correction.probability.load, or malaya.spelling_correction.transformer.load.

  2. normalizer can load any stemmer model, eg, malaya.stem.deep_model.

def load(
    speller: Callable = None,
    stemmer: Callable = None,
    **kwargs,
):
    """
    Load a rules based Normalizer, optionally backed by a spelling correction
    model and a stemmer model.

    Parameters
    ----------
    speller: Callable, optional (default=None)
        object to correct spelling, must have `correct` or `normalize_elongated` method.
        If None, spelling correction is skipped.
    stemmer: Callable, optional (default=None)
        object to stem, must have `stem_word` method.
        If provided, it is used to stem kata imbuhan akhir accurately.
    **kwargs:
        extra keyword arguments; this tutorial passes word tokenizer options
        such as `date=False` through here.

    Returns
    -------
    result: malaya.normalizer.rules.Normalizer class
    """
[178]:
lm = malaya.language_model.kenlm(model = 'bahasa-wiki-news')
[177]:
corrector = malaya.spelling_correction.probability.load(language_model = lm)
[176]:
stemmer = malaya.stem.deep_model('noisy')
[8]:
normalizer = malaya.normalizer.rules.load(corrector, stemmer)

normalize#

def normalize(
    self,
    string: str,
    normalize_text: bool = True,
    normalize_url: bool = False,
    normalize_email: bool = False,
    normalize_year: bool = True,
    normalize_telephone: bool = True,
    normalize_date: bool = True,
    normalize_time: bool = True,
    normalize_emoji: bool = True,
    normalize_elongated: bool = True,
    normalize_hingga: bool = True,
    normalize_pada_hari_bulan: bool = True,
    normalize_fraction: bool = True,
    normalize_money: bool = True,
    normalize_units: bool = True,
    normalize_percent: bool = True,
    normalize_ic: bool = True,
    normalize_number: bool = True,
    normalize_x_kali: bool = True,
    normalize_cardinal: bool = True,
    normalize_ordinal: bool = True,
    normalize_entity: bool = True,
    expand_contractions: bool = True,
    check_english_func=is_english,
    check_malay_func=is_malay,
    translator: Callable = None,
    language_detection_word: Callable = None,
    acceptable_language_detection: List[str] = ['EN', 'CAPITAL', 'NOT_LANG'],
    segmenter: Callable = None,
    text_scorer: Callable = None,
    text_scorer_window: int = 2,
    not_a_word_threshold: float = 1e-4,
    dateparser_settings={'TIMEZONE': 'GMT+8'},
    **kwargs,
):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    normalize_text: bool, optional (default=True)
        if True, will try to replace shortforms with internal corpus.
    normalize_url: bool, optional (default=False)
        if True, replace `://` with an empty string and `.` with `dot`.
        `https://huseinhouse.com` -> `HTTPS huseinhouse dot com`.
    normalize_email: bool, optional (default=False)
        if True, replace `@` with `di`, `.` with `dot`.
        `husein.zol05@gmail.com` -> `husein dot zol kosong lima di gmail dot com`.
    normalize_year: bool, optional (default=True)
        if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
        if True, `1970-an` -> `sembilan belas tujuh puluhan`.
        if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
    normalize_telephone: bool, optional (default=True)
        if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`
    normalize_date: bool, optional (default=True)
        if True, `01/12/2001` -> `satu disember dua ribu satu`.
        if True, `Jun 2017` -> `satu Jun dua ribu tujuh belas`.
        if True, `2017 Jun` -> `satu Jun dua ribu tujuh belas`.
        if False, `2017 Jun` -> `01/06/2017`.
        if False, `Jun 2017` -> `01/06/2017`.
    normalize_time: bool, optional (default=True)
        if True, `pukul 2.30` -> `pukul dua tiga puluh minit`.
        if False, `pukul 2.30` -> `'02:00:00'`
    normalize_emoji: bool, optional (default=True)
        if True, `🔥` -> `emoji api`
        Load from `malaya.preprocessing.demoji`.
    normalize_elongated: bool, optional (default=True)
        if True, `betuii` -> `betui`.
    normalize_hingga: bool, optional (default=True)
        if True, `2011 - 2019` -> `dua ribu sebelas hingga dua ribu sembilan belas`
    normalize_pada_hari_bulan: bool, optional (default=True)
        if True, `pada 10/4` -> `pada sepuluh hari bulan empat`
    normalize_fraction: bool, optional (default=True)
        if True, `10 /4` -> `sepuluh per empat`
    normalize_money: bool, optional (default=True)
        if True, `rm10.4m` -> `sepuluh juta empat ratus ribu ringgit`
    normalize_units: bool, optional (default=True)
        if True, `61.2 kg` -> `enam puluh satu perpuluhan dua kilogram`
    normalize_percent: bool, optional (default=True)
        if True, `0.8%` -> `kosong perpuluhan lapan peratus`
    normalize_ic: bool, optional (default=True)
        if True, `911111-01-1111` -> `sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu`
    normalize_number: bool, optional (default=True)
        if True, `0123` -> `kosong satu dua tiga`
    normalize_x_kali: bool, optional (default=True)
        if True, `10x` -> `sepuluh kali`
    normalize_cardinal: bool, optional (default=True)
        if True, `123` -> `seratus dua puluh tiga`
    normalize_ordinal: bool, optional (default=True)
        if True, `ke-123` -> `keseratus dua puluh tiga`
    normalize_entity: bool, optional (default=True)
        normalize entities, only affects `date`, `datetime`, `time` and `money` pattern strings.
    expand_contractions: bool, optional (default=True)
        expand english contractions.
    check_english_func: Callable, optional (default=malaya.text.function.is_english)
        function to check a word in english dictionary, default is malaya.text.function.is_english.
        this parameter will also be used for malay text normalization.
    check_malay_func: Callable, optional (default=malaya.text.function.is_malay)
        function to check a word in malay dictionary, default is malaya.text.function.is_malay.
    translator: Callable, optional (default=None)
        function to translate EN word to MS word.
    language_detection_word: Callable, optional (default=None)
        function to detect language for each word to get better translation results.
    acceptable_language_detection: List[str], optional (default=['EN', 'CAPITAL', 'NOT_LANG'])
        only translate substrings if the result from `language_detection_word` is in `acceptable_language_detection`.
    segmenter: Callable, optional (default=None)
        function to segment a word.
        If provided, it will expand a word, apaitu -> apa itu
    text_scorer: Callable, optional (default=None)
        function to validate upper word.
        If lower case score is higher than or equal to upper case score, will choose lower case.
    text_scorer_window: int, optional (default=2)
        size of lookback and lookforward to validate upper word.
    not_a_word_threshold: float, optional (default=1e-4)
        assume a word is not a human word if score is lower than `not_a_word_threshold`.
        only usable if `text_scorer` parameter is passed.
    dateparser_settings: Dict, optional (default={'TIMEZONE': 'GMT+8'})
        default dateparser settings, check supported settings at https://dateparser.readthedocs.io/en/latest/

    Returns
    -------
    result: Dict with keys {'normalize', 'date', 'money'}
    """

To get a better English checker, we prefer to use https://pyenchant.github.io/pyenchant/

[9]:
import enchant

d = enchant.Dict('en_US')


def is_english(x):
    """Return the result of `d.check(x)` against the `en_US` enchant dictionary."""
    return d.check(x)


is_english('lifestyle')
[9]:
True
[10]:
string = 'boleh dtg 8pagi esok tak atau minggu depan? 2 oktober 2019 2pm, tlong bayar rm 3.2k sekali tau'
[11]:
normalizer.normalize(string)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[11]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {'8AM esok': datetime.datetime(2023, 4, 9, 8, 0),
  '2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0),
  'minggu depan': datetime.datetime(2023, 4, 15, 16, 4, 20, 582890)},
 'money': {'rm 3.2k': 'RM3200.0'}}
[12]:
normalizer.normalize(string, normalize_entity = False)
[12]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {},
 'money': {}}

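As the output of the default call above (normalize_entity = True) shows, the Malaya normalizer also extracts entities: minggu depan is parsed into a datetime object and rm 3.2k into RM3200.0. With normalize_entity = False, date and money are returned empty.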

[13]:
# Normalize every sample string in its original order and print each result.
samples = [
    string1, string2, string3, string4, string5, string6, string7,
    string8, string9, string10, string11, string12, string13, string14,
]
for sample in samples:
    print(normalizer.normalize(sample))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya hate itu . peliklah , pada', 'date': {}, 'money': {}}
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya love them . peliklah saya', 'date': {}, 'money': {}}
{'normalize': 'perdana menteri kesebelas sgt suka makan ayam , harganya cuma lima belas ringgit lima puluh sen', 'date': {}, 'money': {'rm15.50': 'RM15.50'}}
{'normalize': 'pada sepuluh hari bulan empat , kementerian mengumumkan , satu per seratus', 'date': {}, 'money': {}}
{'normalize': 'Husein Zolkepli dapat tempat kedua belas lumba lari hari ni', 'date': {}, 'money': {}}
{'normalize': 'Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedah sekolah King Edward keempat', 'date': {}, 'money': {}}
{'normalize': 'dua jam tiga puluh minit aku tunggu kau , enam puluh perpuluhan satu kilogram kau ni , suhu harini tiga puluh satu perpuluhan dua celsius , aku dahaga minum enam ratus milliliter', 'date': {'2jam': datetime.datetime(2023, 4, 8, 14, 4, 20, 894270)}, 'money': {}}
{'normalize': 'awak sangat hot ye , emoji api , emoji api . Emoji api , emoji muka tersenyum sedikit', 'date': {}, 'money': {}}
{'normalize': 'hanyalah dua ribu ringgit', 'date': {}, 'money': {'rm2 ribu': 'RM2000.0'}}
{'normalize': 'mulakan slh org boleh , bila geng tuh kena salahkan jgk tak boleh trima . . pelik , dia salahkan org bole hari-hari cerita sakau then bila kena bilas balik tak dapat jwb , kata mcm biasa slh ( parti sampah ) , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , jgn mulakan dlu salahkan org kalau tak boleh trima bila kena bilas balik , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa', 'date': {}, 'money': {}}
{'normalize': 'Pemimpin yg hebat , panah-panahan fitnah tu akan dituju kepadanya . . harap DS terus bersabar . Jasa baik DS menjadi asbab di sana kelak mahupun rakyat yg terhutang budi juga . .', 'date': {}, 'money': {}}
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok', 'date': {}, 'money': {}}
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin', 'date': {}, 'money': {}}
{'normalize': 'Muhammad Najib sbb malaysiakini dah daftar . . . . Kalau dia fitnah . . . tertuduh boleh saman . . . . Kalau berita-berita yg tak daftar . . tu yg susah nak saman . . sbb tak tahu owner', 'date': {}, 'money': {}}

Use translator#

To use a translator, pass a callable to the translator parameter,

print(normalizer.normalize(string1, translator = translator))
[175]:
en_ms_vocab = malaya.translation.en_ms.dictionary()
translator = lambda x: en_ms_vocab.get(x, x)
[15]:
translator('pain'), translator('aduh')
[15]:
('kesakitan', 'aduh')
[16]:
print(normalizer.normalize(string1, translator = translator))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya benci ia . peliklah , pada', 'date': {}, 'money': {}}
[17]:
print(normalizer.normalize(string2, translator = translator))
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya love them . peliklah saya', 'date': {}, 'money': {}}

Use Neural Machine Translation#

The problem with a dictionary-based translator is that if a word does not exist in the dictionary, it is returned untranslated,

[18]:
translator('love'), translator('them'), translator('pain')
[18]:
('love', 'them', 'kesakitan')
[174]:
nmt = malaya.translation.en_ms.transformer(model = 'small')
[20]:
nmt_func = lambda x: nmt.greedy_decoder([x])[0]
[21]:
print(normalizer.normalize(string1, translator = nmt_func))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya benci ia . peliklah , pada', 'date': {}, 'money': {}}
[22]:
print(normalizer.normalize(string2, translator = nmt_func))
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya cinta mereka . peliklah saya', 'date': {}, 'money': {}}

Problem with single word translation#

When we translate directly word by word, the translator does not really understand the context, for example,

talk properly lah,

talk -> bercakap, properly -> betul,

so when we combine them, we get bercakap betul lah, which is not a really good translation. To solve this problem, we need to predict the language at word level, group substrings of the same language, and translate each group.

[173]:
nmt = malaya.translation.en_ms.transformer(model = 'noisy-base')
nmt_func = lambda x: nmt.greedy_decoder([x])[0]
[172]:
fasttext = malaya.language_detection.fasttext()
lang_word_model = malaya.language_detection.substring_rules(model = fasttext)
[25]:
language_detection_word_func = lambda x: lang_word_model.predict(x)
[26]:
s = 'u ni, talk properly lah'
[27]:
normalizer.normalize(s, translator = nmt_func,
                    check_english_func = is_english)
[27]:
{'normalize': 'awak ni , bercakap betul lah', 'date': {}, 'money': {}}
[28]:
normalizer.normalize(s, translator = nmt_func, language_detection_word = language_detection_word_func,
                    check_english_func = is_english)
[28]:
{'normalize': 'awak ni , bercakap dengan betul lah', 'date': {}, 'money': {}}
[29]:
s = 'This looks like awak, but in a less formal kind of way, still comel'
[30]:
normalizer.normalize(s, translator = nmt_func,
                    check_english_func = is_english)
[30]:
{'normalize': 'This rupa seperti awak , tetapi dalam a kurang formal jenis dari cara , masih comel',
 'date': {},
 'money': {}}
[31]:
normalizer.normalize(s, translator = nmt_func, language_detection_word = language_detection_word_func,
                    check_english_func = is_english)
[31]:
{'normalize': 'Ini kelihatan seperti awak , tetapi dalam yang kurang formal jenis jalan, masih comel',
 'date': {},
 'money': {}}
[32]:
s = "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍"
[33]:
normalizer.normalize(s, translator = nmt_func,
                    check_english_func = is_english)
[33]:
{'normalize': 'hanya menghadiri saya sepupu perkahwinan . pelik jugak dia buat majlis biasa-biasa je sebab dia gaya hidup rupa mewah . kemudian saya ditemui keluar mereka adalah pergi pada a tiga minggu bulan madu . pintar keputusan , emoji jempol',
 'date': {},
 'money': {}}
[34]:
normalizer.normalize(s, translator = nmt_func, language_detection_word = language_detection_word_func,
                    check_english_func = is_english)
[34]:
{'normalize': 'baru sahaja menghadiri majlis perkahwinan sepupu saya. pelik jugak dia buat majlis biasa-biasa je sebab gaya hidupnya kelihatan mewah. maka saya mengetahui bahawa mereka are teruskan a tiga minggu madu. keputusan pintar, emoji jempol',
 'date': {},
 'money': {}}
[35]:
s = 'Hello gais, boleh tolong recommend bengkel ketuk yang okay near Wangsa Maju / nearby? Kereta bf i pulak kepek langgar dinding hahahha'
[36]:
normalizer.normalize(s, translator = nmt_func,
                    check_english_func = is_english)
[36]:
{'normalize': 'Hello gais , boleh tolong cadang bengkel ketuk yang okay berhampiran Wangsa Maju / berhampiran ? Kereta bf saya pulak kepek langgar dinding haha',
 'date': {},
 'money': {}}
[37]:
normalizer.normalize(s, translator = nmt_func, language_detection_word = language_detection_word_func,
                    check_english_func = is_english)
[37]:
{'normalize': 'Helo gais , boleh tolong cadang bengkel ketuk yang okay berhampiran Wangsa Maju / berhampiran? Kereta bf saya pulak kepek langgar dinding haha',
 'date': {},
 'money': {}}
[38]:
s = 'Me after seeing this video: mm dapnya burger benjo extra mayo'
[39]:
normalizer.normalize(s, translator = nmt_func,
                    check_english_func = is_english)
[39]:
{'normalize': 'Me selepas melihat ini video : mm dapnya burger benjo tambahan mayo',
 'date': {},
 'money': {}}
[40]:
normalizer.normalize(s, translator = nmt_func, language_detection_word = language_detection_word_func,
                    check_english_func = is_english)
[40]:
{'normalize': 'Saya selepas melihat ini video : mm dapnya burger benjo mayo tambahan',
 'date': {},
 'money': {}}
[41]:
s = 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
[42]:
normalizer.normalize(s, translator = nmt_func,
                    check_english_func = is_english)
[42]:
{'normalize': 'Hi lelaki ! Saya disedari semalam & harini dah ramai yang dapat kuki ni kan . Jadi harini saya nak kongsi beberapa pos mortem dari kami pertama kelompok :',
 'date': {'semalam': datetime.datetime(2023, 4, 7, 16, 5, 9, 850822)},
 'money': {}}
[43]:
normalizer.normalize(s, translator = nmt_func, language_detection_word = language_detection_word_func,
                    check_english_func = is_english)
[43]:
{'normalize': 'Hai kawan! Saya disedari semalam & harini dah ramai yang dapat kuki ni kan . Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami :',
 'date': {'semalam': datetime.datetime(2023, 4, 7, 16, 5, 10, 65937)},
 'money': {}}

Use segmenter#

[44]:
print(normalizer.normalize('saya taksuka ayam, tapi saya sukaikan'))
{'normalize': 'saya taksuka ayam , tapi saya sukaikan', 'date': {}, 'money': {}}
[171]:
segmenter = malaya.segmentation.transformer(model = 'small')
[46]:
segmenter_func = lambda x: segmenter.greedy_decoder([x])[0]
[47]:
print(normalizer.normalize('saya taksuka ayam, tapi saya sukaikan', segmenter = segmenter_func))
{'normalize': 'saya tidak suka ayam , tapi saya suka ikan', 'date': {}, 'money': {}}

Use stemmer#

By default the normalizer ignores kata imbuhan akhir, so to stem kata imbuhan akhir, provide the stemmer parameter when loading the normalizer. We can use a better stemmer model such as malaya.stem.deep_model(model = 'noisy').

[48]:
# Load through the same entry point as `normalizer`, but without a stemmer.
normalizer_without_stem = malaya.normalizer.rules.load(corrector)
[49]:
normalizer.normalize(string12)
[49]:
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok',
 'date': {},
 'money': {}}
[50]:
# The stemmer is chosen at load time, so no `stemmer` is passed here; this is the stemmer-less baseline.
normalizer_without_stem.normalize(string12)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[50]:
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok',
 'date': {},
 'money': {}}
[51]:
normalizer.normalize(string13)
[51]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
 'date': {},
 'money': {}}
[52]:
normalizer_without_stem.normalize(string13)
[52]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
 'date': {},
 'money': {}}
[53]:
normalizer.normalize('seadil2nya')
[53]:
{'normalize': 'seadil-adilnya', 'date': {}, 'money': {}}
[54]:
normalizer_without_stem.normalize('seadil2nya')
[54]:
{'normalize': 'seadilnya', 'date': {}, 'money': {}}

Validate uppercase#

A problem with social media text is that people sometimes capitalize kata nama am (common nouns), so the normalizer skips spelling correction for those words. To fix that, pass the text_scorer parameter.

[55]:
import math
math.exp(lm.score('hi'))
[55]:
0.00012796330028274245
[56]:
text_scorer = lambda x: lm.score(x)
[57]:
t = 'Konon nak beat the crowd, skali Kedai x bukak ahaha @ Chef Ammar Xpress Souk Cafe https://t.co/QrcBlq6ftV'
normalizer.normalize(t, text_scorer = text_scorer)
[57]:
{'normalize': 'Konon nak beat the crowd , skali kedai tak bukak haha @ Chef Ammar Xpress Souk Cafe https://t.co/QrcBlq6ftV',
 'date': {},
 'money': {}}
[58]:
t = '8 Emiten Cum Dividen Pekan Ini, Jangan Ketinggalan https://t.co/9BV9OqqJUG'
normalizer.normalize(t, text_scorer = text_scorer)
[58]:
{'normalize': 'lapan emiten cum dividen Pekan Ini , jangan ketinggalan https://t.co/9BV9OqqJUG',
 'date': {},
 'money': {}}

Validate non human word#

A non-human word like kasdsahdas or kasweadsa can be a laughing pattern or a cursing pattern, so to validate it we can use any text scorer. If the score is lower than the threshold, spelling correction is skipped for that word.

[59]:
normalizer.normalize('bodo la siallll hasdsadwq', text_scorer = text_scorer)
[59]:
{'normalize': 'bodo la sial hasdsadwq', 'date': {}, 'money': {}}

Skip spelling correction#

Simply pass None as the speller when loading, e.g. normalizer = malaya.normalizer.rules.load(None). By default it is None.

[60]:
# Same loader twice: once with the spelling corrector, once with speller=None to skip correction.
normalizer = malaya.normalizer.rules.load(corrector)
without_corrector_normalizer = malaya.normalizer.rules.load(None)
[61]:
normalizer.normalize(string2, normalize_elongated = False)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[61]:
{'normalize': 'saya memang-memang tak suka makanan HUSEIN kampung tempat , saya love them . pelikla saya',
 'date': {},
 'money': {}}
[62]:
without_corrector_normalizer.normalize(string2)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[62]:
{'normalize': 'saya memang-memang tak suka mknn HUSEIN kampng tmpat , saya love them . pelikla saya',
 'date': {},
 'money': {}}

Pass kwargs preprocessing#

Let's say you want to skip normalizing date patterns; you can pass kwargs to the normalizer. Check the word tokenizer kwargs at https://malaya.readthedocs.io/en/latest/load-tokenizer-word.html

[63]:
# `date = False` is forwarded through `**kwargs` of the loader.
normalizer = malaya.normalizer.rules.load(corrector)
skip_date_normalizer = malaya.normalizer.rules.load(corrector, date = False)
[64]:
normalizer.normalize('tarikh program tersebut 14 mei')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[64]:
{'normalize': 'tarikh program tersebut empat belas Mei dua ribu dua puluh tiga',
 'date': {'14 mei': datetime.datetime(2023, 5, 14, 0, 0)},
 'money': {}}
[65]:
skip_date_normalizer.normalize('tarikh program tersebut 14 mei')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[65]:
{'normalize': 'tarikh program tersebut empat belas mei',
 'date': {'14 mei': datetime.datetime(2023, 5, 14, 0, 0)},
 'money': {}}

Normalize text#

If True,

  1. replace xkisah -> tak kisah.

  2. replace berehatlh -> berehatlah.

  3. replace seadil2nya -> seadil-adilnya.

  4. apply spelling correction if passed speller parameter.

  5. standardize laughing pattern.

  6. standardize mengeluh pattern.

  7. normalize title,

{
    'dr': 'Doktor',
    'yb': 'Yang Berhormat',
    'hj': 'Haji',
    'ybm': 'Yang Berhormat Mulia',
    'tyt': 'Tuan Yang Terutama',
    'yab': 'Yang Berhormat',
    'yabhg': 'Yang Amat Berbahagia',
    'ybhg': 'Yang Berbahagia',
    'miss': 'Cik',
}

Simply normalizer.normalize(string, normalize_text = True), default is True.

[66]:
# Reload with both the spelling corrector and the stemmer via the documented loader.
normalizer = malaya.normalizer.rules.load(corrector, stemmer)
[67]:
normalizer.normalize('xkisah')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[67]:
{'normalize': 'tak kisah', 'date': {}, 'money': {}}
[68]:
normalizer.normalize('berehatlh')
[68]:
{'normalize': 'berehatlah', 'date': {}, 'money': {}}
[69]:
normalizer.normalize('seadil2nya')
[69]:
{'normalize': 'seadil-adilnya', 'date': {}, 'money': {}}
[70]:
normalizer.normalize('bukan2')
[70]:
{'normalize': 'bukan-bukan', 'date': {}, 'money': {}}
[71]:
normalizer.normalize('bukan2 wkwkwkw')
[71]:
{'normalize': 'bukan-bukan haha', 'date': {}, 'money': {}}
[72]:
normalizer.normalize('bukan2 haih')
[72]:
{'normalize': 'bukan-bukan aduh', 'date': {}, 'money': {}}
[73]:
normalizer.normalize('dia sakai hhihihu')
[73]:
{'normalize': 'dia sakai haha', 'date': {}, 'money': {}}
[74]:
normalizer.normalize('hais sorrylah')
[74]:
{'normalize': 'aduh maaflah', 'date': {}, 'money': {}}
[75]:
normalizer.normalize('Dr yahaya')
[75]:
{'normalize': 'Doktor yahaya', 'date': {}, 'money': {}}
[76]:
normalizer.normalize('mulakn slh org boleh ,bila geng tuh kena slhkn jgk xboleh trima')
[76]:
{'normalize': 'mulakan slh org boleh , bila geng tuh kena salahkan jgk tak boleh trima',
 'date': {},
 'money': {}}
[77]:
normalizer.normalize('aah la, bodo btul')
[77]:
{'normalize': 'betul la , bodo btul', 'date': {}, 'money': {}}

Normalize url#

Let's say you have a URL, for example https://huseinhouse.com; this parameter is going to,

If True,

  1. replace :// with empty string.

  2. replace . with dot.

  3. replace digits with string representation.

  4. Capitalize https, http, and www.

Simply normalizer.normalize(string, normalize_url = True), default is False.

[78]:
# Rules-only normalizer (no speller, no stemmer) via the documented loader.
normalizer = malaya.normalizer.rules.load()
[79]:
normalizer.normalize('web saya ialah https://huseinhouse.com')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[79]:
{'normalize': 'web saya ialah https://huseinhouse.com',
 'date': {},
 'money': {}}
[80]:
normalizer.normalize('web saya ialah https://huseinhouse.com', normalize_url = True)
[80]:
{'normalize': 'web saya ialah HTTPS huseinhouse dot com',
 'date': {},
 'money': {}}
[81]:
normalizer.normalize('web saya ialah https://huseinhouse02934.com', normalize_url = True)
[81]:
{'normalize': 'web saya ialah HTTPS huseinhouse kosong dua sembilan tiga empat dot com',
 'date': {},
 'money': {}}

Normalize email#

Let's say you have an email address, for example husein.zol05@gmail.com; this parameter is going to,

If True,

  1. replace :// with empty string.

  2. replace . with dot.

  3. replace @ with di.

  4. replace digits with string representation.

Simply normalizer.normalize(string, normalize_email = True), default is False.

[82]:
# Rules-only normalizer (no speller, no stemmer) via the documented loader.
normalizer = malaya.normalizer.rules.load()
[83]:
normalizer.normalize('email saya ialah husein.zol05@gmail.com')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[83]:
{'normalize': 'email saya ialah husein.zol05@gmail.com',
 'date': {},
 'money': {}}
[84]:
normalizer.normalize('email saya ialah husein.zol05@gmail.com', normalize_email = True)
[84]:
{'normalize': 'email saya ialah husein dot zol kosong lima di gmail dot com',
 'date': {},
 'money': {}}

Normalize year#

  1. if True, tahun 1987 -> tahun sembilan belas lapan puluh tujuh.

  2. if True, 1970-an -> sembilan belas tujuh puluhan.

  3. if False, tahun 1987 -> tahun seribu sembilan ratus lapan puluh tujuh.

Simply normalizer.normalize(string, normalize_year = True), default is True.

[85]:
# Rules-only normalizer (no speller, no stemmer) via the documented loader.
normalizer = malaya.normalizer.rules.load()
[86]:
normalizer.normalize('$400 pada tahun 1998 berbanding lebih $1000')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[86]:
{'normalize': 'empat ratus dollar pada tahun sembilan belas sembilan puluh lapan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}
[87]:
normalizer.normalize('$400 pada 1970-an berbanding lebih $1000')
[87]:
{'normalize': 'empat ratus dollar pada sembilan belas tujuh puluhan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}
[88]:
normalizer.normalize('$400 pada tahun 1970-an berbanding lebih $1000')
[88]:
{'normalize': 'empat ratus dollar pada tahun sembilan belas tujuh puluhan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}
[89]:
normalizer.normalize('$400 pada tahun 1998 berbanding lebih $1000', normalize_year = False)
[89]:
{'normalize': 'empat ratus dollar pada tahun seribu sembilan ratus sembilan puluh lapan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}

Normalize telephone#

  1. if True, no 012-1234567 -> no kosong satu dua, satu dua tiga empat lima enam tujuh.

Simply normalizer.normalize(string, normalize_telephone = True), default is True.

[90]:
# Rules-only normalizer (no speller, no stemmer) via the documented loader.
normalizer = malaya.normalizer.rules.load()
[91]:
normalizer.normalize('no saya 012-1234567')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[91]:
{'normalize': 'no saya kosong satu dua, satu dua tiga empat lima enam tujuh',
 'date': {},
 'money': {}}
[92]:
normalizer.normalize('no saya 012-1234567', normalize_telephone = False)
[92]:
{'normalize': 'no saya 012-1234567', 'date': {}, 'money': {}}

Normalize date#

  1. if True, 01/12/2001 -> satu disember dua ribu satu.

  2. if False, normalize date string to %d/%m/%Y.

Simply normalizer.normalize(string, normalize_date = True), default is True.

[93]:
normalizer = malaya.normalize.normalizer()
[94]:
normalizer.normalize('saya akan gerak pada 1/11/2021')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[94]:
{'normalize': 'saya akan gerak pada sebelas Januari dua ribu dua puluh satu',
 'date': {'1/11/2021': datetime.datetime(2021, 1, 11, 0, 0)},
 'money': {}}
[95]:
normalizer.normalize('saya akan gerak pada 1/11/2021', normalize_date = False)
[95]:
{'normalize': 'saya akan gerak pada 11/01/2021',
 'date': {'1/11/2021': datetime.datetime(2021, 1, 11, 0, 0)},
 'money': {}}
[96]:
normalizer.normalize('1 nov 2019')
[96]:
{'normalize': 'satu November dua ribu sembilan belas',
 'date': {'1 nov 2019': datetime.datetime(2019, 11, 1, 0, 0)},
 'money': {}}
[97]:
normalizer.normalize('1 nov 2019', normalize_date = False)
[97]:
{'normalize': '01/11/2019',
 'date': {'1 nov 2019': datetime.datetime(2019, 11, 1, 0, 0)},
 'money': {}}
[98]:
normalizer.normalize('januari 1 1996')
[98]:
{'normalize': 'satu Januari seribu sembilan ratus sembilan puluh enam',
 'date': {'januari 1 1996': datetime.datetime(1996, 1, 1, 0, 0)},
 'money': {}}
[99]:
normalizer.normalize('januari 1 1996', normalize_date = False)
[99]:
{'normalize': '01/01/1996',
 'date': {'januari 1 1996': datetime.datetime(1996, 1, 1, 0, 0)},
 'money': {}}
[100]:
normalizer.normalize('januari 2019')
[100]:
{'normalize': 'lapan Januari dua ribu sembilan belas',
 'date': {'januari 2019': datetime.datetime(2019, 1, 8, 0, 0)},
 'money': {}}
[101]:
normalizer.normalize('januari 2019', normalize_date = False)
[101]:
{'normalize': '08/01/2019',
 'date': {'januari 2019': datetime.datetime(2019, 1, 8, 0, 0)},
 'money': {}}

Normalize time#

  1. if True, pukul 2.30 -> pukul dua tiga puluh minit.

  2. if False, 2:01pm -> pukul 14.01.

Simply normalizer.normalize(string, normalize_time = True), default is True.

[102]:
normalizer = malaya.normalize.normalizer()
[103]:
s = 'Operasi tamat sepenuhnya pada pukul 1.30 tengah hari'
normalizer.normalize(s, normalize_time = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[103]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul satu tiga puluh minit tengah hari',
 'date': {'pukul 1:30': datetime.datetime(2023, 4, 8, 1, 30)},
 'money': {}}
[104]:
s = 'Operasi tamat sepenuhnya pada pukul 1.30 tengah hari'
normalizer.normalize(s, normalize_time = False)
[104]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul 01.30 tengah hari',
 'date': {'pukul 1:30': datetime.datetime(2023, 4, 8, 1, 30)},
 'money': {}}
[105]:
s = 'Operasi tamat sepenuhnya pada pukul 1:30:50 tengah hari'
normalizer.normalize(s, normalize_time = True)
[105]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul satu tiga puluh minit lima puluh saat tengah hari',
 'date': {'pukul 1:30:50': datetime.datetime(2023, 4, 8, 1, 30, 50)},
 'money': {}}
[106]:
s = 'Operasi tamat sepenuhnya pada pukul 1:30:50 tengah hari'
normalizer.normalize(s, normalize_time = False)
[106]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul 01.30:50 tengah hari',
 'date': {'pukul 1:30:50': datetime.datetime(2023, 4, 8, 1, 30, 50)},
 'money': {}}
[107]:
normalizer.normalize('2:01pm')
[107]:
{'normalize': 'pukul empat belas satu minit',
 'date': {'2:01pm': datetime.datetime(2023, 4, 8, 14, 1)},
 'money': {}}
[108]:
normalizer.normalize('2:01pm', normalize_time = False)
[108]:
{'normalize': 'pukul 14.01',
 'date': {'2:01pm': datetime.datetime(2023, 4, 8, 14, 1)},
 'money': {}}
[109]:
normalizer.normalize('2AM')
[109]:
{'normalize': 'pukul dua',
 'date': {'2am': datetime.datetime(2023, 4, 8, 2, 0)},
 'money': {}}
[110]:
normalizer.normalize('2AM', normalize_time = False)
[110]:
{'normalize': 'pukul 02',
 'date': {'2am': datetime.datetime(2023, 4, 8, 2, 0)},
 'money': {}}
[111]:
normalizer.normalize('2pm')
[111]:
{'normalize': 'pukul empat belas',
 'date': {'2pm': datetime.datetime(2023, 4, 8, 14, 0)},
 'money': {}}
[112]:
normalizer.normalize('2pm', normalize_time = False)
[112]:
{'normalize': 'pukul 14',
 'date': {'2pm': datetime.datetime(2023, 4, 8, 14, 0)},
 'money': {}}

Normalize emoji#

  1. if True, 🔥 -> emoji api

Simply normalizer.normalize(string, normalize_emoji = True), default is True.

[113]:
normalizer = malaya.normalize.normalizer()
[114]:
s = 'u are really damn hot 🔥'
normalizer.normalize(s, translator = nmt_func,
                     language_detection_word = language_detection_word_func)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[114]:
{'normalize': 'awak are sungguh panas, emoji api', 'date': {}, 'money': {}}

Normalize elongated#

Any typical elongated word, e.g., pppeeddaaaasss -> pedas. For the best results, elongated normalization requires the speller parameter to be passed when loading the normalizer.

Simply normalizer.normalize(string, normalize_elongated = True), default is True.

[115]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[116]:
normalizer.normalize('saayyyyaa ttttaaak ssssukaaa makaan pedas')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[116]:
{'normalize': 'saya tak suka makan pedas', 'date': {}, 'money': {}}
[117]:
normalizer.normalize('saayyyyaa ttttaaak ssssukaaa makaan pedas', normalize_elongated = False)
[117]:
{'normalize': 'saayyyyaa ttttaaak ssssukaaa makaan pedas',
 'date': {},
 'money': {}}

Normalize hingga#

If True,

  1. 2011 - 2019 -> dua ribu sebelas hingga dua ribu sembilan belas.

  2. 2011.01-2019 -> dua ribu sebelas perpuluhan kosong satu hingga dua ribu sembilan belas.

Simply normalizer.normalize(string, normalize_hingga = True), default is True.

[118]:
normalizer = malaya.normalize.normalizer()
[119]:
normalizer.normalize('2011 - 2019', normalize_hingga = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[119]:
{'normalize': 'dua ribu sebelas hingga dua ribu sembilan belas',
 'date': {},
 'money': {}}
[120]:
normalizer.normalize('2011 - 2019', normalize_hingga = False)
[120]:
{'normalize': 'dua ribu sebelas - dua ribu sembilan belas',
 'date': {},
 'money': {}}
[121]:
normalizer.normalize('2011 - 2019', normalize_hingga = False, normalize_cardinal = False, normalize_ordinal = False)
[121]:
{'normalize': '2011 - 2019', 'date': {}, 'money': {}}

Normalize pada hari bulan#

If True,

  1. pada 10/4 -> pada sepuluh hari bulan empat.

Simply normalizer.normalize(string, normalize_pada_hari_bulan = True), default is True.

[122]:
normalizer = malaya.normalize.normalizer()
[123]:
normalizer.normalize('pada 10/   4', normalize_pada_hari_bulan = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[123]:
{'normalize': 'pada sepuluh hari bulan empat', 'date': {}, 'money': {}}
[124]:
normalizer.normalize('pada 10/4', normalize_pada_hari_bulan = False)
[124]:
{'normalize': 'pada sepuluh per empat', 'date': {}, 'money': {}}

Normalize fraction#

If True,

  1. 10/4 -> sepuluh per empat.

Simply normalizer.normalize(string, normalize_fraction = True), default is True.

[125]:
normalizer = malaya.normalize.normalizer()
[126]:
normalizer.normalize('10/4', normalize_fraction = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[126]:
{'normalize': 'sepuluh per empat', 'date': {}, 'money': {}}
[127]:
normalizer.normalize('201231.1 / 4', normalize_fraction = True)
[127]:
{'normalize': 'dua ratus satu ribu dua ratus tiga puluh satu perpuluhan satu per empat',
 'date': {},
 'money': {}}
[128]:
normalizer.normalize('201231.1 / 4', normalize_fraction = False)
[128]:
{'normalize': 'dua ratus satu ribu dua ratus tiga puluh satu perpuluhan satu / empat',
 'date': {},
 'money': {}}
[129]:
normalizer.normalize('201231.1 / 4', normalize_fraction = False, normalize_cardinal = False,
                    normalize_ordinal = False)
[129]:
{'normalize': '201231.1 / 4', 'date': {}, 'money': {}}

Normalize money#

If True,

  1. RM10.5 -> sepuluh ringgit lima puluh sen.

  2. rm 10.5 sen -> sepuluh ringgit lima puluh sen.

  3. 20.2m ringgit -> dua puluh juta dua ratus ribu ringgit.

And so much more!

Simply normalizer.normalize(string, normalize_money = True), default is True.

[130]:
normalizer = malaya.normalize.normalizer()
[131]:
normalizer.normalize('RM10.5')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[131]:
{'normalize': 'sepuluh ringgit lima puluh sen',
 'date': {},
 'money': {'rm10.5': 'RM10.5'}}
[132]:
normalizer.normalize('rm 10.5 sen')
[132]:
{'normalize': 'sepuluh ringgit lima puluh sen',
 'date': {},
 'money': {'rm 10.5': 'RM10.5'}}
[133]:
normalizer.normalize('1015 sen')
[133]:
{'normalize': 'sepuluh ringgit lima belas sen',
 'date': {},
 'money': {'1015 sen': 'RM10.15'}}
[134]:
normalizer.normalize('rm10.4m')
[134]:
{'normalize': 'sepuluh juta empat ratus ribu ringgit',
 'date': {},
 'money': {'rm10.4m': 'RM10400000.0'}}
[135]:
normalizer.normalize('$10.4K')
[135]:
{'normalize': 'sepuluh ribu empat ratus dollar',
 'date': {},
 'money': {'$10.4k': '$10400.0'}}
[136]:
normalizer.normalize('22.5123334k ringgit')
[136]:
{'normalize': 'dua puluh dua ribu lima ratus dua belas ringgit tiga ribu tiga ratus tiga puluh empat sen',
 'date': {},
 'money': {'22.5123334k ringgit': 'RM22512.3334'}}
[137]:
normalizer.normalize('saya ada 20.2m ringgit')
[137]:
{'normalize': 'saya ada dua puluh juta dua ratus ribu ringgit',
 'date': {},
 'money': {'20.2m ringgit': 'RM20200000.0'}}
[138]:
normalizer.normalize('22.5123334k ringgit', normalize_money = False)
[138]:
{'normalize': '22.5123334k ringgit',
 'date': {},
 'money': {'22.5123334k ringgit': 'RM22512.3334'}}

Normalize units#

Able to normalize temperature, distance, volume, duration and weight units.

If True,

  1. 61.2 kg -> enam puluh satu perpuluhan dua kilogram.

  2. 61.2km -> enam puluh satu perpuluhan dua kilometer.

And so much more!

Simply normalizer.normalize(string, normalize_units = True), default is True.

[139]:
normalizer = malaya.normalize.normalizer()
[140]:
normalizer.normalize('61.2    KG')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[140]:
{'normalize': 'enam puluh satu perpuluhan dua kilogram',
 'date': {},
 'money': {}}
[141]:
normalizer.normalize('61.2km')
[141]:
{'normalize': 'enam puluh satu perpuluhan dua kilometer',
 'date': {},
 'money': {}}
[142]:
normalizer.normalize('61.2c')
[142]:
{'normalize': 'enam puluh satu perpuluhan dua celsius',
 'date': {},
 'money': {}}
[143]:
normalizer.normalize('61.2 ml')
[143]:
{'normalize': 'enam puluh satu perpuluhan dua milliliter',
 'date': {},
 'money': {}}
[144]:
normalizer.normalize('61.2 l')
[144]:
{'normalize': 'enam puluh satu perpuluhan dua liter', 'date': {}, 'money': {}}
[145]:
normalizer.normalize('61.2 jam')
[145]:
{'normalize': 'enam puluh satu perpuluhan dua jam',
 'date': {'61:2 jam': datetime.datetime(2023, 4, 8, 14, 5, 40, 232132)},
 'money': {}}
[146]:
normalizer.normalize('61.2 hari')
[146]:
{'normalize': 'enam puluh satu perpuluhan dua hari', 'date': {}, 'money': {}}

Normalize percents#

  1. If True, 61.2% -> enam puluh satu perpuluhan dua peratus.

Simply normalizer.normalize(string, normalize_percent = True), default is True.

[147]:
normalizer = malaya.normalize.normalizer()
[148]:
normalizer.normalize('61.2%')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[148]:
{'normalize': 'enam puluh satu perpuluhan dua peratus',
 'date': {},
 'money': {}}
[149]:
normalizer.normalize('61.2%', normalize_percent = False)
[149]:
{'normalize': '61.2%', 'date': {}, 'money': {}}

Normalize IC#

  1. If True, 911111-01-1111 -> sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu.

Simply normalizer.normalize(string, normalize_ic = True), default is True.

[150]:
normalizer = malaya.normalize.normalizer()
[151]:
normalizer.normalize('911111-01-1111')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[151]:
{'normalize': 'sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu',
 'date': {},
 'money': {}}
[152]:
normalizer.normalize('911111-01-1111', normalize_ic = False)
[152]:
{'normalize': '911111-01-1111', 'date': {}, 'money': {}}

Normalize Numbers#

If the number starts with 0, it will be converted digit by digit into its word representation.

  1. If True, 0123 -> kosong satu dua tiga.

Simply normalizer.normalize(string, normalize_number = True), default is True.

[153]:
normalizer = malaya.normalize.normalizer()
[154]:
normalizer.normalize('01234')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[154]:
{'normalize': 'kosong satu dua tiga empat', 'date': {}, 'money': {}}
[155]:
normalizer.normalize('01234', normalize_number = False)
[155]:
{'normalize': '01234', 'date': {}, 'money': {}}

Normalize x kali#

If a word ends with x and the part before it is a digit, it will be converted into its word representation.

  1. If True, 10x -> sepuluh kali.

  2. If False, 10x -> 10 kali.

Simply normalizer.normalize(string, normalize_x_kali = True), default is True.

[156]:
normalizer = malaya.normalize.normalizer()
[157]:
normalizer.normalize('saya sokong 10x')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[157]:
{'normalize': 'saya sokong sepuluh kali', 'date': {}, 'money': {}}
[158]:
normalizer.normalize('saya sokong 10x', normalize_x_kali = False)
[158]:
{'normalize': 'saya sokong 10 kali', 'date': {}, 'money': {}}

Normalize Cardinals#

Any number will be converted using malaya.num2word.to_cardinal.

  1. If True, 123 -> seratus dua puluh tiga.

Simply normalizer.normalize(string, normalize_cardinal = True), default is True.

[159]:
normalizer = malaya.normalize.normalizer()
[160]:
normalizer.normalize('123')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[160]:
{'normalize': 'seratus dua puluh tiga', 'date': {}, 'money': {}}
[161]:
normalizer.normalize('123.123421231')
[161]:
{'normalize': 'seratus dua puluh tiga perpuluhan satu dua tiga empat dua satu dua tiga satu',
 'date': {},
 'money': {}}
[162]:
normalizer.normalize('123.123421231', normalize_cardinal = False)
[162]:
{'normalize': '123.123421231', 'date': {}, 'money': {}}

Normalize Ordinals#

Any number will be converted using malaya.num2word.to_ordinal.

  1. If True, 123 -> keseratus dua puluh tiga.

  2. Able to normalize roman numbers, ke-XXI -> kedua puluh satu.

Simply normalizer.normalize(string, normalize_ordinal = True), default is True.

[163]:
normalizer.normalize('123', normalize_cardinal = False)
[163]:
{'normalize': 'keseratus dua puluh tiga', 'date': {}, 'money': {}}
[164]:
normalizer.normalize('123', normalize_cardinal = False, normalize_ordinal = False)
[164]:
{'normalize': '123', 'date': {}, 'money': {}}
[165]:
normalizer.normalize('ke-XXI')
[165]:
{'normalize': 'kedua puluh satu', 'date': {}, 'money': {}}

Normalize entity#

Normalize entities; this only affects date, datetime, time and money pattern strings.

Simply normalizer.normalize(string, normalize_entity = True), default is True.

[166]:
string = 'boleh dtg 8pagi esok tak atau minggu depan? 2 oktober 2019 2pm, tlong bayar rm 3.2k sekali tau'
[167]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[168]:
normalizer.normalize(string)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[168]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {'8AM esok': datetime.datetime(2023, 4, 9, 8, 0),
  '2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0),
  'minggu depan': datetime.datetime(2023, 4, 15, 16, 5, 42, 618293)},
 'money': {'rm 3.2k': 'RM3200.0'}}
[169]:
normalizer.normalize(string, normalize_entity = False)
[169]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {},
 'money': {}}
[170]:
normalizer.normalize(string, normalize_date = False, normalize_time = False, normalize_money = False,
                    normalize_cardinal = False, normalize_ordinal = False)
[170]:
{'normalize': 'boleh dtg pukul 08 esok tak atau minggu depan ? 02/10/2019 pukul 14 , tolong bayar rm 3.2k sekali tau',
 'date': {'8AM esok': datetime.datetime(2023, 4, 9, 8, 0),
  '2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0),
  'minggu depan': datetime.datetime(2023, 4, 15, 16, 5, 42, 658692)},
 'money': {'rm 3.2k': 'RM3200.0'}}