Rules based Normalizer#

This tutorial is available as an IPython notebook at Malaya/example/normalizer.

[1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
[2]:
import logging

logging.basicConfig(level=logging.INFO)
[3]:
%%time
import malaya
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
  warn("The installed version of bitsandbytes was compiled without GPU support. "
/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmppnxxs_oa
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmppnxxs_oa/_remote_module_non_scriptable.py
CPU times: user 2.81 s, sys: 3.92 s, total: 6.73 s
Wall time: 2.01 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[4]:
# Sample inputs reused by the normalization examples below.
# string1-2: informal short forms mixed with English words.
string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'
string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'
# string3-7: ordinals, money, dates, fractions, year ranges and measurement units.
string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'
string4 = 'pada 10/4, kementerian mengumumkan, 1/100'
string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'
string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'
string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'
# string8: emoji.
string8 = 'awak sangat hot ye 🔥🔥. 🔥🙂'
# string9: money amount written with a word multiplier.
string9 = 'hanyalah rm2 ribu'
# string10-14: longer social-media style comments with heavy abbreviation.
string10 = 'mulakn slh org boleh ,bila geng tuh kena slhkn jgk xboleh trima .. pelik , dia slhkn org bole hri2 crta sakau then bila kna bls balik xdpt jwb ,kata mcm biasa slh (parti sampah) 🤣🤣🤣 jgn mulakn dlu slhkn org kalau xboleh trima bila kna bls balik 🤣🤣🤣'
string11 = 'Pemimpin yg hebat, panahan2 fitnah tu akan dituju kepadanya.. harap DS terus bersabar. Jasa baik DS menjadi asbab di sana kelak mahupun rakyat yg terhutang budi juga..'
string12 = 'berehatlh najib.. sudah2 lh tu.. jgn buat rakyat hilang kepercyaan tu pda system kehakiman negara.. klu btl x slh kenapa x dibuktikan semasa sblm rayuan.. sudah lah tu kami dh letih dengan drama korang. ok'
string13 = 'DSNR satu satunya legasi kpd negara penyambung perjuangan bangsa melayu..jatuhnya beliau dek kerana fitnah dan dengkinya manusia..semoga Allah lindungi Najib Bin Razak dunia dan akhirat..Aamiin'
string14 = 'Muhammad Najib sbb malaysiakini dah daftar.... Klu dia fitnah...tertuduh boleh saman.... Klu berita2 yg x daftar...tu yg susah nak saman...sbb x tahu owner'

Load normalizer#

  1. normalizer can load any spelling correction model, eg, malaya.spelling_correction.probability.load, or malaya.spelling_correction.transformer.load.

  2. normalizer can load any stemmer model, eg, malaya.stem.huggingface (used below).

def load(
    speller: Callable = None,
    stemmer: Callable = None,
    **kwargs,
):
    """
    Load a Normalizer, optionally backed by a spelling correction model and a stemmer.

    Parameters
    ----------
    speller: Callable, optional (default=None)
        spelling correction object, must have a `correct` or `normalize_elongated` method.
        If None, no spelling correction is applied.
    stemmer: Callable, optional (default=None)
        stemmer object, must have a `stem_word` method.
        If a stemmer is provided, kata imbuhan akhir (suffixes) will be stemmed accurately.
    **kwargs:
        extra keyword arguments for the word tokenizer, e.g. `date=False`
        (see the "Pass kwargs preprocessing" section of this notebook).

    Returns
    -------
    result: malaya.normalizer.rules.Normalizer class
    """
[5]:
lm = malaya.language_model.kenlm(model = 'bahasa-wiki-news')
[6]:
corrector = malaya.spelling_correction.probability.load(language_model = lm)
INFO:malaya_boilerplate.huggingface:downloading frozen huseinzol05/v27-preprocessing/bm_1grams.json
[7]:
stemmer = malaya.stem.huggingface()
INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/stem-lstm-512/model.pt
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[8]:
normalizer = malaya.normalizer.rules.load(corrector, stemmer)
/home/husein/dev/malaya/malaya/normalizer/rules.py:204: FutureWarning: Possible nested set at position 42
  k.lower(): re.compile(_expressions[k]) for k, v in _expressions.items()
/home/husein/dev/malaya/malaya/normalizer/rules.py:204: FutureWarning: Possible nested set at position 3
  k.lower(): re.compile(_expressions[k]) for k, v in _expressions.items()

normalize#

def normalize(
    self,
    string: str,
    normalize_text: bool = True,
    normalize_url: bool = False,
    normalize_email: bool = False,
    normalize_year: bool = True,
    normalize_telephone: bool = True,
    normalize_date: bool = True,
    normalize_time: bool = True,
    normalize_emoji: bool = True,
    normalize_elongated: bool = True,
    normalize_hingga: bool = True,
    normalize_pada_hari_bulan: bool = True,
    normalize_fraction: bool = True,
    normalize_money: bool = True,
    normalize_units: bool = True,
    normalize_percent: bool = True,
    normalize_ic: bool = True,
    normalize_number: bool = True,
    normalize_x_kali: bool = True,
    normalize_cardinal: bool = True,
    normalize_ordinal: bool = True,
    normalize_entity: bool = True,
    expand_contractions: bool = True,
    check_english_func=is_english,
    check_malay_func=is_malay,
    translator: Callable = None,
    language_detection_word: Callable = None,
    acceptable_language_detection: List[str] = ['EN', 'CAPITAL', 'NOT_LANG'],
    segmenter: Callable = None,
    text_scorer: Callable = None,
    text_scorer_window: int = 2,
    not_a_word_threshold: float = 1e-4,
    dateparser_settings={'TIMEZONE': 'GMT+8'},
    **kwargs,
):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
        text to normalize.
    normalize_text: bool, optional (default=True)
        if True, will try to replace shortforms using the internal corpus.
    normalize_url: bool, optional (default=False)
        if True, replace `://` with empty and `.` with `dot`.
        `https://huseinhouse.com` -> `https huseinhouse dot com`.
    normalize_email: bool, optional (default=False)
        if True, replace `@` with `di`, `.` with `dot`.
        `husein.zol05@gmail.com` -> `husein dot zol kosong lima di gmail dot com`.
    normalize_year: bool, optional (default=True)
        if True, `tahun 1987` -> `tahun sembilan belas lapan puluh tujuh`.
        if True, `1970-an` -> `sembilan belas tujuh puluhan`.
        if False, `tahun 1987` -> `tahun seribu sembilan ratus lapan puluh tujuh`.
    normalize_telephone: bool, optional (default=True)
        if True, `no 012-1234567` -> `no kosong satu dua, satu dua tiga empat lima enam tujuh`
    normalize_date: bool, optional (default=True)
        if True, `01/12/2001` -> `satu disember dua ribu satu`.
        if True, `Jun 2017` -> `satu Jun dua ribu tujuh belas`.
        if True, `2017 Jun` -> `satu Jun dua ribu tujuh belas`.
        if False, `2017 Jun` -> `01/06/2017`.
        if False, `Jun 2017` -> `01/06/2017`.
    normalize_time: bool, optional (default=True)
        if True, `pukul 2.30` -> `pukul dua tiga puluh minit`.
        if False, `pukul 1.30` -> `pukul 01.30`.
    normalize_emoji: bool, optional (default=True)
        if True, `🔥` -> `emoji api`
        Load from `malaya.preprocessing.demoji`.
    normalize_elongated: bool, optional (default=True)
        if True, `betuii` -> `betui`.
    normalize_hingga: bool, optional (default=True)
        if True, `2011 - 2019` -> `dua ribu sebelas hingga dua ribu sembilan belas`
    normalize_pada_hari_bulan: bool, optional (default=True)
        if True, `pada 10/4` -> `pada sepuluh hari bulan empat`
    normalize_fraction: bool, optional (default=True)
        if True, `10 /4` -> `sepuluh per empat`
    normalize_money: bool, optional (default=True)
        if True, `rm10.4m` -> `sepuluh juta empat ratus ribu ringgit`
    normalize_units: bool, optional (default=True)
        if True, `61.2 kg` -> `enam puluh satu perpuluhan dua kilogram`
    normalize_percent: bool, optional (default=True)
        if True, `0.8%` -> `kosong perpuluhan lapan peratus`
    normalize_ic: bool, optional (default=True)
        if True, `911111-01-1111` -> `sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu`
    normalize_number: bool, optional (default=True)
        if True, `0123` -> `kosong satu dua tiga`
    normalize_x_kali: bool, optional (default=True)
        if True, `10x` -> `sepuluh kali`
    normalize_cardinal: bool, optional (default=True)
        if True, `123` -> `seratus dua puluh tiga`
    normalize_ordinal: bool, optional (default=True)
        if True, `ke-123` -> `keseratus dua puluh tiga`
    normalize_entity: bool, optional (default=True)
        normalize entities; only affects `date`, `datetime`, `time` and `money` string patterns.
    expand_contractions: bool, optional (default=True)
        expand english contractions.
    check_english_func: Callable, optional (default=malaya.text.function.is_english)
        function to check a word in english dictionary, default is malaya.text.function.is_english.
        this parameter is also used for malay text normalization.
    check_malay_func: Callable, optional (default=malaya.text.function.is_malay)
        function to check a word in malay dictionary, default is malaya.text.function.is_malay.
    translator: Callable, optional (default=None)
        function to translate EN word to MS word.
    language_detection_word: Callable, optional (default=None)
        function to detect the language of each word to get better translation results.
    acceptable_language_detection: List[str], optional (default=['EN', 'CAPITAL', 'NOT_LANG'])
        only translate substrings if the result from `language_detection_word` is in `acceptable_language_detection`.
    segmenter: Callable, optional (default=None)
        function to segment a word.
        If provided, it will expand a word, e.g. `apaitu` -> `apa itu`.
    text_scorer: Callable, optional (default=None)
        function to validate upper word.
        If the lower case score is higher than or equal to the upper case score, the lower case form is chosen.
    text_scorer_window: int, optional (default=2)
        size of lookback and lookforward to validate upper word.
    not_a_word_threshold: float, optional (default=1e-4)
        assume a word is not a human word if its score is lower than `not_a_word_threshold`.
        only usable if the `text_scorer` parameter is passed.
    dateparser_settings: Dict, optional (default={'TIMEZONE': 'GMT+8'})
        default dateparser settings, check supported settings at https://dateparser.readthedocs.io/en/latest/

    Returns
    -------
    result: Dict with keys {'normalize', 'date', 'money'}
    """

For a better English checker, we recommend PyEnchant (https://pyenchant.github.io/pyenchant/); the resulting function can be passed to normalize as check_english_func.

[9]:
import enchant

# PyEnchant dictionary for US English, used as the backing word list.
d = enchant.Dict('en_US')


def is_english(word):
    """Return True when `word` is found in the en_US PyEnchant dictionary."""
    return d.check(word)


is_english('lifestyle')
[9]:
True
[10]:
string = 'boleh dtg 8pagi esok tak atau minggu depan? 2 oktober 2019 2pm, tlong bayar rm 3.2k sekali tau'
[11]:
normalizer.normalize(string)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule.
[11]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {'minggu depan': datetime.datetime(2023, 10, 20, 14, 3, 50, 902256),
  '8AM esok': datetime.datetime(2023, 10, 14, 8, 0),
  '2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0)},
 'money': {'rm 3.2k': 'RM3200.0'}}
[12]:
normalizer.normalize(string, normalize_entity = False)
[12]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {},
 'money': {}}

Here you can see that the Malaya normalizer parses minggu depan into a datetime object and rm 3.2k into RM3200.0.

[13]:
# Normalize every sample string in order, printing one result dict per line.
samples = (
    string1, string2, string3, string4, string5, string6, string7,
    string8, string9, string10, string11, string12, string13, string14,
)
for sample in samples:
    print(normalizer.normalize(sample))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya hate itu . peliklah , pada', 'date': {}, 'money': {}}
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya love them . peliklah saya', 'date': {}, 'money': {}}
{'normalize': 'perdana menteri kesebelas sgt suka makan ayam , harganya cuma lima belas ringgit lima puluh sen', 'date': {}, 'money': {'rm15.50': 'RM15.50'}}
{'normalize': 'pada sepuluh hari bulan empat , kementerian mengumumkan , satu per seratus', 'date': {}, 'money': {}}
{'normalize': 'Husein Zolkepli dapat tempat kedua belas lumba lari hari ni', 'date': {}, 'money': {}}
{'normalize': 'Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedah sekolah King Edward keempat', 'date': {}, 'money': {}}
{'normalize': 'dua jam tiga puluh minit aku tunggu kau , enam puluh perpuluhan satu kilogram kau ni , suhu harini tiga puluh satu perpuluhan dua celsius , aku dahaga minum enam ratus milliliter', 'date': {'2jam': datetime.datetime(2023, 10, 13, 12, 3, 51, 358111)}, 'money': {}}
{'normalize': 'awak sangat hot ye , emoji api , emoji api . Emoji api , emoji muka tersenyum sedikit', 'date': {}, 'money': {}}
{'normalize': 'hanyalah dua ribu ringgit', 'date': {}, 'money': {'rm2 ribu': 'RM2000.0'}}
{'normalize': 'mulakan slh org boleh , bila geng tuh kena salahkan jgk tak boleh trima . . pelik , dia salahkan org bole hari-hari cerita sakau then bila kena bilas balik tak dapat jwb , kata mcm biasa slh ( parti sampah ) , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , jgn mulakan dlu salahkan org kalau tak boleh trima bila kena bilas balik , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa , emoji berguling di lantai ketawa', 'date': {}, 'money': {}}
{'normalize': 'Pemimpin yg hebat , panah-panahan fitnah tu akan dituju kepadanya . . harap DS terus bersabar . Jasa baik DS menjadi asbab di sana kelak mahupun rakyat yg terhutang budi juga . .', 'date': {}, 'money': {}}
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok', 'date': {}, 'money': {}}
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin', 'date': {}, 'money': {}}
{'normalize': 'Muhammad Najib sbb malaysiakini dah daftar . . . . Kalau dia fitnah . . . tertuduh boleh saman . . . . Kalau berita-berita yg tak daftar . . tu yg susah nak saman . . sbb tak tahu owner', 'date': {}, 'money': {}}

Use translator#

To use a translator, pass a callable to the translator parameter:

print(normalizer.normalize(string1, translator = translator))
[14]:
en_ms_vocab = malaya.translation.word(model = 'mesolitica/word-en-ms')
INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/word-en-ms/dictionary.json
[15]:
translator = lambda x: en_ms_vocab.get(x, x)
[16]:
translator('pain'), translator('aduh')
[16]:
('sakit', 'aduh')
[17]:
print(normalizer.normalize(string1, translator = translator))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya benci ia . peliklah , pada', 'date': {}, 'money': {}}
[18]:
print(normalizer.normalize(string2, translator = translator))
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya cinta mereka . peliklah saya', 'date': {}, 'money': {}}

Use Neural Machine Translation#

The problem with a dictionary-based translator is that a word missing from the dictionary will not be translated.

[19]:
translator('love'), translator('them'), translator('pain')
[19]:
('cinta', 'mereka', 'sakit')
[20]:
nmt = malaya.translation.huggingface()
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[21]:
nmt_func = lambda x: nmt.generate([x], to_lang = 'ms', max_length = 256)[0]
[22]:
print(normalizer.normalize(string1, translator = nmt_func))
{'normalize': 'tak jadi ke , kenapa awak tak suka makan HUSEIN kt situ tmpt , saya benci ia . peliklah , pada', 'date': {}, 'money': {}}
[23]:
print(normalizer.normalize(string2, translator = nmt_func))
{'normalize': 'saya memang-memang tak suka makan HUSEIN kampung tempat , saya cinta mereka . peliklah Saya', 'date': {}, 'money': {}}

Use segmenter#

[24]:
print(normalizer.normalize('saya taksuka ayam, tapi saya sukaikan'))
{'normalize': 'saya taksuka ayam , tapi saya sukaikan', 'date': {}, 'money': {}}
[25]:
segmenter = malaya.segmentation.huggingface()
[26]:
segmenter_func = lambda x: segmenter.generate([x], max_length = 128)[0]
[27]:
print(normalizer.normalize('saya taksuka ayam, tapi saya sukaikan', segmenter = segmenter_func))
{'normalize': 'saya tidak suka ayam , tapi saya suka ikan', 'date': {}, 'money': {}}

Use stemmer#

By default the normalizer ignores kata imbuhan akhir (suffixes); to stem kata imbuhan akhir, provide the stemmer parameter.

[33]:
normalizer_without_stem = malaya.normalize.normalizer(corrector, check_malay_func = None)
normalizer_stem = malaya.normalize.normalizer(corrector, stemmer = stemmer, check_malay_func = None)
[34]:
normalizer_without_stem.normalize(string12)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[34]:
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok',
 'date': {},
 'money': {}}
[35]:
normalizer_stem.normalize(string12)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[35]:
{'normalize': 'berehatlah najib . . sudah-sudah lh tu . . jgn buat rakyat hilang kepercayaan tu pda system kehakiman negara . . klu betul tak slh kenapa tak dibuktikan semasa sblm rayuan . . sudah lah tu kami dh letih dengan drama korang . ok',
 'date': {},
 'money': {}}
[36]:
normalizer.normalize(string13)
[36]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
 'date': {},
 'money': {}}
[41]:
normalizer_without_stem.normalize(string13)
[41]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
 'date': {},
 'money': {}}
[43]:
normalizer_stem.normalize(string13)
[43]:
{'normalize': 'DATUK SERI NAJIB RAZAK satu satunya legasi kpd negara penyambung perjuangan bangsa melayu . . jatuhnya beliau dek kerana fitnah dan dengkinya manusia . semoga Allah lindungi Najib Bin Razak dunia dan akhirat . . Aamiin',
 'date': {},
 'money': {}}
[45]:
normalizer_without_stem.normalize('seadil2nya')
[45]:
{'normalize': 'seadilnya', 'date': {}, 'money': {}}
[46]:
normalizer_stem.normalize('seadil2nya')
[46]:
{'normalize': 'seadil-adilnya', 'date': {}, 'money': {}}

Validate uppercase#

In social media text, people sometimes capitalize kata nama am (common nouns), which causes the normalizer to skip spelling correction for those words. To fix that, pass the text_scorer parameter.

[47]:
import math
math.exp(lm.score('hi'))
[47]:
0.00012796330028274245
[48]:
text_scorer = lambda x: lm.score(x)
[49]:
t = 'Konon nak beat the crowd, skali Kedai x bukak ahaha @ Chef Ammar Xpress Souk Cafe https://t.co/QrcBlq6ftV'
normalizer.normalize(t, text_scorer = text_scorer)
[49]:
{'normalize': 'Konon nak beat the crowd , skali kedai tak bukak haha @ Chef Ammar Xpress Souk Cafe https://t.co/QrcBlq6ftV',
 'date': {},
 'money': {}}
[50]:
t = '8 Emiten Cum Dividen Pekan Ini, Jangan Ketinggalan https://t.co/9BV9OqqJUG'
normalizer.normalize(t, text_scorer = text_scorer)
[50]:
{'normalize': 'lapan emiten cum dividen Pekan Ini , jangan ketinggalan https://t.co/9BV9OqqJUG',
 'date': {},
 'money': {}}

Validate non human word#

A non-human word such as kasdsahdas or kasweadsa can be a laughing pattern or a cursing pattern. To detect it we can use any text scorer: if the score is lower than not_a_word_threshold, spelling correction is skipped for that word.

[51]:
normalizer.normalize('bodo la siallll hasdsadwq', text_scorer = text_scorer)
[51]:
{'normalize': 'bodo la sial hasdsadwq', 'date': {}, 'money': {}}

Skip spelling correction#

Simply pass None as the speller when creating the normalizer, e.g. malaya.normalize.normalizer(None). The speller is None by default, so no spelling correction is applied unless you provide one.

[52]:
normalizer = malaya.normalize.normalizer(corrector)
without_corrector_normalizer = malaya.normalize.normalizer(None)
[53]:
normalizer.normalize(string2, normalize_elongated = False)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[53]:
{'normalize': 'saya memang-memang tak suka makanan HUSEIN kampung tempat , saya love them . pelikla saya',
 'date': {},
 'money': {}}
[54]:
without_corrector_normalizer.normalize(string2)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[54]:
{'normalize': 'saya memang-memang tak suka mknn HUSEIN kampng tmpat , saya love them . pelikla saya',
 'date': {},
 'money': {}}

Pass kwargs preprocessing#

Let's say you want to skip normalizing date patterns: you can pass kwargs when creating the normalizer. Check the available word tokenizer kwargs at https://malaya.readthedocs.io/en/latest/load-tokenizer-word.html

[55]:
normalizer = malaya.normalize.normalizer(corrector)
skip_date_normalizer = malaya.normalize.normalizer(corrector, date = False)
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 2558
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3088
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[56]:
normalizer.normalize('tarikh program tersebut 14 mei')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[56]:
{'normalize': 'tarikh program tersebut empat belas Mei dua ribu dua puluh tiga',
 'date': {'14 mei': datetime.datetime(2023, 5, 14, 0, 0)},
 'money': {}}
[57]:
skip_date_normalizer.normalize('tarikh program tersebut 14 mei')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[57]:
{'normalize': 'tarikh program tersebut empat belas mei',
 'date': {'14 mei': datetime.datetime(2023, 5, 14, 0, 0)},
 'money': {}}

Normalize text#

If True,

  1. replace xkisah -> tak kisah.

  2. replace berehatlh -> berehatlah.

  3. replace seadil2nya -> seadil-adilnya.

  4. apply spelling correction if passed speller parameter.

  5. standardize laughing pattern.

  6. standardize mengeluh pattern.

  7. normalize title,

{
    'dr': 'Doktor',
    'yb': 'Yang Berhormat',
    'hj': 'Haji',
    'ybm': 'Yang Berhormat Mulia',
    'tyt': 'Tuan Yang Terutama',
    'yab': 'Yang Berhormat',
    'yabhg': 'Yang Amat Berbahagia',
    'ybhg': 'Yang Berbahagia',
    'miss': 'Cik',
}

Simply normalizer.normalize(string, normalize_text = True), default is True.

[58]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[59]:
normalizer.normalize('xkisah')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[59]:
{'normalize': 'tak kisah', 'date': {}, 'money': {}}
[60]:
normalizer.normalize('berehatlh')
[60]:
{'normalize': 'berehatlah', 'date': {}, 'money': {}}
[61]:
normalizer.normalize('seadil2nya')
[61]:
{'normalize': 'seadil-adilnya', 'date': {}, 'money': {}}
[62]:
normalizer.normalize('bukan2')
[62]:
{'normalize': 'bukan-bukan', 'date': {}, 'money': {}}
[63]:
normalizer.normalize('bukan2 wkwkwkw')
[63]:
{'normalize': 'bukan-bukan haha', 'date': {}, 'money': {}}
[64]:
normalizer.normalize('bukan2 haih')
[64]:
{'normalize': 'bukan-bukan aduh', 'date': {}, 'money': {}}
[65]:
normalizer.normalize('dia sakai hhihihu')
[65]:
{'normalize': 'dia sakai haha', 'date': {}, 'money': {}}
[66]:
normalizer.normalize('hais sorrylah')
[66]:
{'normalize': 'aduh maaflah', 'date': {}, 'money': {}}
[67]:
normalizer.normalize('Dr yahaya')
[67]:
{'normalize': 'Doktor yahaya', 'date': {}, 'money': {}}
[68]:
normalizer.normalize('mulakn slh org boleh ,bila geng tuh kena slhkn jgk xboleh trima')
[68]:
{'normalize': 'mulakan slh org boleh , bila geng tuh kena salahkan jgk tak boleh trima',
 'date': {},
 'money': {}}
[69]:
normalizer.normalize('aah la, bodo btul')
[69]:
{'normalize': 'betul la , bodo btul', 'date': {}, 'money': {}}

Normalize url#

Given a URL such as https://huseinhouse.com, this parameter will do the following.

If True,

  1. replace :// with empty string.

  2. replace . with dot.

  3. replace digits with string representation.

  4. Capitalize https, http, and www.

Simply normalizer.normalize(string, normalize_url = True), default is False.

[70]:
normalizer = malaya.normalize.normalizer()
[71]:
normalizer.normalize('web saya ialah https://huseinhouse.com')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[71]:
{'normalize': 'web saya ialah https://huseinhouse.com',
 'date': {},
 'money': {}}
[72]:
normalizer.normalize('web saya ialah https://huseinhouse.com', normalize_url = True)
[72]:
{'normalize': 'web saya ialah HTTPS huseinhouse dot com',
 'date': {},
 'money': {}}
[73]:
normalizer.normalize('web saya ialah https://huseinhouse02934.com', normalize_url = True)
[73]:
{'normalize': 'web saya ialah HTTPS huseinhouse kosong dua sembilan tiga empat dot com',
 'date': {},
 'money': {}}

Normalize email#

Given an email address such as husein.zol05@gmail.com, this parameter will do the following.

If True,

  1. replace :// with empty string.

  2. replace . with dot.

  3. replace @ with di.

  4. replace digits with string representation.

Simply normalizer.normalize(string, normalize_email = True), default is False.

[74]:
normalizer = malaya.normalize.normalizer()
[75]:
normalizer.normalize('email saya ialah husein.zol05@gmail.com')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[75]:
{'normalize': 'email saya ialah husein.zol05@gmail.com',
 'date': {},
 'money': {}}
[76]:
normalizer.normalize('email saya ialah husein.zol05@gmail.com', normalize_email = True)
[76]:
{'normalize': 'email saya ialah husein dot zol kosong lima di gmail dot com',
 'date': {},
 'money': {}}

Normalize year#

  1. if True, tahun 1987 -> tahun sembilan belas lapan puluh tujuh.

  2. if True, 1970-an -> sembilan belas tujuh puluhan.

  3. if False, tahun 1987 -> tahun seribu sembilan ratus lapan puluh tujuh.

Simply normalizer.normalize(string, normalize_year = True), default is True.

[77]:
normalizer = malaya.normalize.normalizer()
[78]:
normalizer.normalize('$400 pada tahun 1998 berbanding lebih $1000')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[78]:
{'normalize': 'empat ratus dollar pada tahun sembilan belas sembilan puluh lapan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}
[79]:
normalizer.normalize('$400 pada 1970-an berbanding lebih $1000')
[79]:
{'normalize': 'empat ratus dollar pada sembilan belas tujuh puluhan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}
[80]:
normalizer.normalize('$400 pada tahun 1970-an berbanding lebih $1000')
[80]:
{'normalize': 'empat ratus dollar pada tahun sembilan belas tujuh puluhan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}
[81]:
normalizer.normalize('$400 pada tahun 1998 berbanding lebih $1000', normalize_year = False)
[81]:
{'normalize': 'empat ratus dollar pada tahun seribu sembilan ratus sembilan puluh lapan berbanding lebih seribu dollar',
 'date': {},
 'money': {'$400 ': '$400', '$1000': '$1000'}}

Normalize telephone#

  1. if True, no 012-1234567 -> no kosong satu dua, satu dua tiga empat lima enam tujuh.

Simply normalizer.normalize(string, normalize_telephone = True), default is True.

[82]:
normalizer = malaya.normalize.normalizer()
[83]:
normalizer.normalize('no saya 012-1234567')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[83]:
{'normalize': 'no saya kosong satu dua, satu dua tiga empat lima enam tujuh',
 'date': {},
 'money': {}}
[84]:
normalizer.normalize('no saya 012-1234567', normalize_telephone = False)
[84]:
{'normalize': 'no saya 012-1234567', 'date': {}, 'money': {}}

Normalize date#

  1. if True, 01/12/2001 -> satu disember dua ribu satu.

  2. if False, normalize date string to %d/%m/%Y, e.g. 1 nov 2019 -> 01/11/2019.

Simply normalizer.normalize(string, normalize_date = True), default is True.

[85]:
normalizer = malaya.normalize.normalizer()
[86]:
normalizer.normalize('saya akan gerak pada 1/11/2021')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[86]:
{'normalize': 'saya akan gerak pada sebelas Januari dua ribu dua puluh satu',
 'date': {'1/11/2021': datetime.datetime(2021, 1, 11, 0, 0)},
 'money': {}}
[87]:
normalizer.normalize('saya akan gerak pada 1/11/2021', normalize_date = False)
[87]:
{'normalize': 'saya akan gerak pada 11/01/2021',
 'date': {'1/11/2021': datetime.datetime(2021, 1, 11, 0, 0)},
 'money': {}}
[88]:
normalizer.normalize('1 nov 2019')
[88]:
{'normalize': 'satu November dua ribu sembilan belas',
 'date': {'1 nov 2019': datetime.datetime(2019, 11, 1, 0, 0)},
 'money': {}}
[89]:
normalizer.normalize('1 nov 2019', normalize_date = False)
[89]:
{'normalize': '01/11/2019',
 'date': {'1 nov 2019': datetime.datetime(2019, 11, 1, 0, 0)},
 'money': {}}
[90]:
normalizer.normalize('januari 1 1996')
[90]:
{'normalize': 'satu Januari seribu sembilan ratus sembilan puluh enam',
 'date': {'januari 1 1996': datetime.datetime(1996, 1, 1, 0, 0)},
 'money': {}}
[91]:
normalizer.normalize('januari 1 1996', normalize_date = False)
[91]:
{'normalize': '01/01/1996',
 'date': {'januari 1 1996': datetime.datetime(1996, 1, 1, 0, 0)},
 'money': {}}
[92]:
normalizer.normalize('januari 2019')
[92]:
{'normalize': 'tiga belas Januari dua ribu sembilan belas',
 'date': {'januari 2019': datetime.datetime(2019, 1, 13, 0, 0)},
 'money': {}}
[93]:
normalizer.normalize('januari 2019', normalize_date = False)
[93]:
{'normalize': '13/01/2019',
 'date': {'januari 2019': datetime.datetime(2019, 1, 13, 0, 0)},
 'money': {}}

Normalize time#

  1. if True, pukul 2.30 -> pukul dua tiga puluh minit.

  2. if False, 2:01pm -> pukul 14.01.

Simply normalizer.normalize(string, normalize_time = True), default is True.

[94]:
normalizer = malaya.normalize.normalizer()
[95]:
s = 'Operasi tamat sepenuhnya pada pukul 1.30 tengah hari'
normalizer.normalize(s, normalize_time = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[95]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul satu tiga puluh minit tengah hari',
 'date': {'pukul 1:30': datetime.datetime(2023, 10, 13, 1, 30)},
 'money': {}}
[96]:
s = 'Operasi tamat sepenuhnya pada pukul 1.30 tengah hari'
normalizer.normalize(s, normalize_time = False)
[96]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul 01.30 tengah hari',
 'date': {'pukul 1:30': datetime.datetime(2023, 10, 13, 1, 30)},
 'money': {}}
[97]:
s = 'Operasi tamat sepenuhnya pada pukul 1:30:50 tengah hari'
normalizer.normalize(s, normalize_time = True)
[97]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul satu tiga puluh minit lima puluh saat tengah hari',
 'date': {'pukul 1:30:50': datetime.datetime(2023, 10, 13, 1, 30, 50)},
 'money': {}}
[98]:
s = 'Operasi tamat sepenuhnya pada pukul 1:30:50 tengah hari'
normalizer.normalize(s, normalize_time = False)
[98]:
{'normalize': 'Operasi tamat sepenuhnya pada pukul 01.30:50 tengah hari',
 'date': {'pukul 1:30:50': datetime.datetime(2023, 10, 13, 1, 30, 50)},
 'money': {}}
[99]:
normalizer.normalize('2:01pm')
[99]:
{'normalize': 'pukul empat belas satu minit',
 'date': {'2:01pm': datetime.datetime(2023, 10, 13, 14, 1)},
 'money': {}}
[100]:
normalizer.normalize('2:01pm', normalize_time = False)
[100]:
{'normalize': 'pukul 14.01',
 'date': {'2:01pm': datetime.datetime(2023, 10, 13, 14, 1)},
 'money': {}}
[101]:
normalizer.normalize('2AM')
[101]:
{'normalize': 'pukul dua',
 'date': {'2am': datetime.datetime(2023, 10, 13, 2, 0)},
 'money': {}}
[102]:
normalizer.normalize('2AM', normalize_time = False)
[102]:
{'normalize': 'pukul 02',
 'date': {'2am': datetime.datetime(2023, 10, 13, 2, 0)},
 'money': {}}
[103]:
normalizer.normalize('2pm')
[103]:
{'normalize': 'pukul empat belas',
 'date': {'2pm': datetime.datetime(2023, 10, 13, 14, 0)},
 'money': {}}
[104]:
normalizer.normalize('2pm', normalize_time = False)
[104]:
{'normalize': 'pukul 14',
 'date': {'2pm': datetime.datetime(2023, 10, 13, 14, 0)},
 'money': {}}

Normalize emoji#

  1. if True, 🔥 -> emoji api

Simply normalizer.normalize(string, normalize_emoji = True), default is True.

[105]:
normalizer = malaya.normalize.normalizer()
[109]:
s = 'u are really damn hot 🔥'
normalizer.normalize(s, translator = nmt_func)
[109]:
{'normalize': 'awak adalah betul-betul sial panas , emoji api',
 'date': {},
 'money': {}}

Normalize elongated#

Any typical elongated word, e.g. pppeeddaaaasss -> pedas. For best results, elongated normalization requires a speller to be passed to the normalizer.

Simply normalizer.normalize(string, normalize_elongated = True), default is True.

[110]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[111]:
normalizer.normalize('saayyyyaa ttttaaak ssssukaaa makaan pedas')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[111]:
{'normalize': 'saya tak suka makan pedas', 'date': {}, 'money': {}}
[112]:
normalizer.normalize('saayyyyaa ttttaaak ssssukaaa makaan pedas', normalize_elongated = False)
[112]:
{'normalize': 'saayyyyaa ttttaaak ssssukaaa makaan pedas',
 'date': {},
 'money': {}}

Normalize hingga#

If True,

  1. 2011 - 2019 -> dua ribu sebelas hingga dua ribu sembilan belas.

  2. 2011.01-2019 -> dua ribu sebelas perpuluhan kosong satu hingga dua ribu sembilan belas.

Simply normalizer.normalize(string, normalize_hingga = True), default is True.

[113]:
normalizer = malaya.normalize.normalizer()
[114]:
normalizer.normalize('2011 - 2019', normalize_hingga = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[114]:
{'normalize': 'dua ribu sebelas hingga dua ribu sembilan belas',
 'date': {},
 'money': {}}
[115]:
normalizer.normalize('2011 - 2019', normalize_hingga = False)
[115]:
{'normalize': 'dua ribu sebelas - dua ribu sembilan belas',
 'date': {},
 'money': {}}
[116]:
normalizer.normalize('2011 - 2019', normalize_hingga = False, normalize_cardinal = False, normalize_ordinal = False)
[116]:
{'normalize': '2011 - 2019', 'date': {}, 'money': {}}

Normalize pada hari bulan#

If True,

  1. pada 10/4 -> pada sepuluh hari bulan empat.

Simply normalizer.normalize(string, normalize_pada_hari_bulan = True), default is True.

[117]:
normalizer = malaya.normalize.normalizer()
[118]:
normalizer.normalize('pada 10/   4', normalize_pada_hari_bulan = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[118]:
{'normalize': 'pada sepuluh hari bulan empat', 'date': {}, 'money': {}}
[119]:
normalizer.normalize('pada 10/4', normalize_pada_hari_bulan = False)
[119]:
{'normalize': 'pada sepuluh per empat', 'date': {}, 'money': {}}

Normalize fraction#

If True,

  1. 10/4 -> sepuluh per empat.

Simply normalizer.normalize(string, normalize_fraction = True), default is True.

[120]:
normalizer = malaya.normalize.normalizer()
[121]:
normalizer.normalize('10/4', normalize_fraction = True)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[121]:
{'normalize': 'sepuluh per empat', 'date': {}, 'money': {}}
[122]:
normalizer.normalize('201231.1 / 4', normalize_fraction = True)
[122]:
{'normalize': 'dua ratus satu ribu dua ratus tiga puluh satu perpuluhan satu per empat',
 'date': {},
 'money': {}}
[123]:
normalizer.normalize('201231.1 / 4', normalize_fraction = False)
[123]:
{'normalize': 'dua ratus satu ribu dua ratus tiga puluh satu perpuluhan satu / empat',
 'date': {},
 'money': {}}
[124]:
normalizer.normalize('201231.1 / 4', normalize_fraction = False, normalize_cardinal = False,
                    normalize_ordinal = False)
[124]:
{'normalize': '201231.1 / 4', 'date': {}, 'money': {}}

Normalize money#

If True,

  1. RM10.5 -> sepuluh ringgit lima puluh sen.

  2. rm 10.5 sen -> sepuluh ringgit lima puluh sen.

  3. 20.2m ringgit -> dua puluh juta dua ratus ribu ringgit.

And so much more!

Simply normalizer.normalize(string, normalize_money = True), default is True.

[125]:
normalizer = malaya.normalize.normalizer()
[126]:
normalizer.normalize('RM10.5')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[126]:
{'normalize': 'sepuluh ringgit lima puluh sen',
 'date': {},
 'money': {'rm10.5': 'RM10.5'}}
[127]:
normalizer.normalize('rm 10.5 sen')
[127]:
{'normalize': 'sepuluh ringgit lima puluh sen',
 'date': {},
 'money': {'rm 10.5': 'RM10.5'}}
[128]:
normalizer.normalize('1015 sen')
[128]:
{'normalize': 'sepuluh ringgit lima belas sen',
 'date': {},
 'money': {'1015 sen': 'RM10.15'}}
[129]:
normalizer.normalize('rm10.4m')
[129]:
{'normalize': 'sepuluh juta empat ratus ribu ringgit',
 'date': {},
 'money': {'rm10.4m': 'RM10400000.0'}}
[130]:
normalizer.normalize('$10.4K')
[130]:
{'normalize': 'sepuluh ribu empat ratus dollar',
 'date': {},
 'money': {'$10.4k': '$10400.0'}}
[131]:
normalizer.normalize('22.5123334k ringgit')
[131]:
{'normalize': 'dua puluh dua ribu lima ratus dua belas ringgit tiga ribu tiga ratus tiga puluh empat sen',
 'date': {},
 'money': {'22.5123334k ringgit': 'RM22512.3334'}}
[132]:
normalizer.normalize('saya ada 20.2m ringgit')
[132]:
{'normalize': 'saya ada dua puluh juta dua ratus ribu ringgit',
 'date': {},
 'money': {'20.2m ringgit': 'RM20200000.0'}}
[133]:
normalizer.normalize('22.5123334k ringgit', normalize_money = False)
[133]:
{'normalize': '22.5123334k ringgit',
 'date': {},
 'money': {'22.5123334k ringgit': 'RM22512.3334'}}

Normalize units#

Able to normalize temperature, distance, volume, duration and weight units.

If True,

  1. 61.2 kg -> enam puluh satu perpuluhan dua kilogram.

  2. 61.2km -> enam puluh satu perpuluhan dua kilometer.

And so much more!

Simply normalizer.normalize(string, normalize_units = True), default is True.

[134]:
normalizer = malaya.normalize.normalizer()
[135]:
normalizer.normalize('61.2    KG')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[135]:
{'normalize': 'enam puluh satu perpuluhan dua kilogram',
 'date': {},
 'money': {}}
[136]:
normalizer.normalize('61.2km')
[136]:
{'normalize': 'enam puluh satu perpuluhan dua kilometer',
 'date': {},
 'money': {}}
[137]:
normalizer.normalize('61.2c')
[137]:
{'normalize': 'enam puluh satu perpuluhan dua celsius',
 'date': {},
 'money': {}}
[138]:
normalizer.normalize('61.2 ml')
[138]:
{'normalize': 'enam puluh satu perpuluhan dua milliliter',
 'date': {},
 'money': {}}
[139]:
normalizer.normalize('61.2 l')
[139]:
{'normalize': 'enam puluh satu perpuluhan dua liter', 'date': {}, 'money': {}}
[140]:
normalizer.normalize('61.2 jam')
[140]:
{'normalize': 'enam puluh satu perpuluhan dua jam',
 'date': {'61:2 jam': datetime.datetime(2023, 10, 13, 12, 9, 48, 124543)},
 'money': {}}
[141]:
normalizer.normalize('61.2 hari')
[141]:
{'normalize': 'enam puluh satu perpuluhan dua hari', 'date': {}, 'money': {}}

Normalize percents#

  1. If True, 61.2% -> enam puluh satu perpuluhan dua peratus.

Simply normalizer.normalize(string, normalize_percent = True), default is True.

[142]:
normalizer = malaya.normalize.normalizer()
[143]:
normalizer.normalize('61.2%')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[143]:
{'normalize': 'enam puluh satu perpuluhan dua peratus',
 'date': {},
 'money': {}}
[144]:
normalizer.normalize('61.2%', normalize_percent = False)
[144]:
{'normalize': '61.2%', 'date': {}, 'money': {}}

Normalize IC#

  1. If True, 911111-01-1111 -> sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu.

Simply normalizer.normalize(string, normalize_ic = True), default is True.

[145]:
normalizer = malaya.normalize.normalizer()
[146]:
normalizer.normalize('911111-01-1111')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[146]:
{'normalize': 'sembilan satu satu satu satu satu sempang kosong satu sempang satu satu satu satu',
 'date': {},
 'money': {}}
[147]:
normalizer.normalize('911111-01-1111', normalize_ic = False)
[147]:
{'normalize': '911111-01-1111', 'date': {}, 'money': {}}

Normalize Numbers#

If a number starts with 0, each digit is spelled out individually instead of being read as a single cardinal number.

  1. If True, 0123 -> kosong satu dua tiga.

Simply normalizer.normalize(string, normalize_number = True), default is True.

[148]:
normalizer = malaya.normalize.normalizer()
[149]:
normalizer.normalize('01234')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[149]:
{'normalize': 'kosong satu dua tiga empat', 'date': {}, 'money': {}}
[150]:
normalizer.normalize('01234', normalize_number = False)
[150]:
{'normalize': '01234', 'date': {}, 'money': {}}

Normalize x kali#

If a word is a number followed by a trailing x (for example 10x), the x is read as kali.

  1. If True, 10x -> sepuluh kali.

  2. If False, 10x -> 10 kali.

Simply normalizer.normalize(string, normalize_x_kali = True), default is True.

[151]:
normalizer = malaya.normalize.normalizer()
[152]:
normalizer.normalize('saya sokong 10x')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[152]:
{'normalize': 'saya sokong sepuluh kali', 'date': {}, 'money': {}}
[153]:
normalizer.normalize('saya sokong 10x', normalize_x_kali = False)
[153]:
{'normalize': 'saya sokong 10 kali', 'date': {}, 'money': {}}

Normalize Cardinals#

Any numbers will convert using malaya.num2word.to_cardinal.

  1. If True, 123 -> seratus dua puluh tiga.

Simply normalizer.normalize(string, normalize_cardinal = True), default is True.

[154]:
normalizer = malaya.normalize.normalizer()
[155]:
normalizer.normalize('123')
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[155]:
{'normalize': 'seratus dua puluh tiga', 'date': {}, 'money': {}}
[156]:
normalizer.normalize('123.123421231')
[156]:
{'normalize': 'seratus dua puluh tiga perpuluhan satu dua tiga empat dua satu dua tiga satu',
 'date': {},
 'money': {}}
[157]:
normalizer.normalize('123.123421231', normalize_cardinal = False)
[157]:
{'normalize': '123.123421231', 'date': {}, 'money': {}}

Normalize Ordinals#

Any numbers will be converted using malaya.num2word.to_ordinal.

  1. If True, 123 -> keseratus dua puluh tiga.

  2. Able to normalize roman numbers, ke-XXI -> kedua puluh satu.

Simply normalizer.normalize(string, normalize_ordinal = True), default is True.

[158]:
normalizer.normalize('123', normalize_cardinal = False)
[158]:
{'normalize': 'keseratus dua puluh tiga', 'date': {}, 'money': {}}
[159]:
normalizer.normalize('123', normalize_cardinal = False, normalize_ordinal = False)
[159]:
{'normalize': '123', 'date': {}, 'money': {}}
[160]:
normalizer.normalize('ke-XXI')
[160]:
{'normalize': 'kedua puluh satu', 'date': {}, 'money': {}}

Normalize entity#

Extract entities into the date and money fields of the result. This only affects strings matching date, datetime, time and money patterns; if False, both fields are returned empty.

Simply normalizer.normalize(string, normalize_entity = True), default is True.

[161]:
string = 'boleh dtg 8pagi esok tak atau minggu depan? 2 oktober 2019 2pm, tlong bayar rm 3.2k sekali tau'
[162]:
normalizer = malaya.normalize.normalizer(corrector, stemmer)
[163]:
normalizer.normalize(string)
INFO:malaya.normalizer.rules:caching malaya.preprocessing.demoji inside normalizer
[163]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {'minggu depan': datetime.datetime(2023, 10, 20, 14, 10, 18, 111175),
  '8AM esok': datetime.datetime(2023, 10, 14, 8, 0),
  '2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0)},
 'money': {'rm 3.2k': 'RM3200.0'}}
[164]:
normalizer.normalize(string, normalize_entity = False)
[164]:
{'normalize': 'boleh dtg pukul lapan esok tak atau minggu depan ? dua Oktober dua ribu sembilan belas pukul empat belas , tolong bayar tiga ribu dua ratus ringgit sekali tau',
 'date': {},
 'money': {}}
[165]:
normalizer.normalize(string, normalize_date = False, normalize_time = False, normalize_money = False,
                    normalize_cardinal = False, normalize_ordinal = False)
[165]:
{'normalize': 'boleh dtg pukul 08 esok tak atau minggu depan ? 02/10/2019 pukul 14 , tolong bayar rm 3.2k sekali tau',
 'date': {'minggu depan': datetime.datetime(2023, 10, 20, 14, 10, 18, 796023),
  '8AM esok': datetime.datetime(2023, 10, 14, 8, 0),
  '2 oktober 2019 2pm': datetime.datetime(2019, 10, 2, 14, 0)},
 'money': {'rm 3.2k': 'RM3200.0'}}