Source code for malaya.model.rules

from malaya.text.normalization import _is_number_regex
from malaya.text.function import (
    check_ratio_numbers,
    check_ratio_punct,
    is_emoji,
    is_laugh,
    is_mengeluh,
    PUNCTUATION,
)
from malaya.dictionary import is_malay, is_english
from typing import List
import logging

# Module-level logger; used below to warn when pyenchant or its `en_US`
# dictionary cannot be loaded.
logger = logging.getLogger(__name__)


class LanguageDict:
    """
    Word-level language tagger combining hand-written rules, dictionaries
    and a language detection model.

    Each word is first run through a fixed sequence of rules (emoji, capital,
    number, punctuation, laugh, `mengeluh`, digit / punctuation ratio), then an
    optional pyenchant `en_US` dictionary, then optionally
    `malaya.dictionary.is_malay`. Only words that none of these resolve are
    sent to `model.predict`.
    """

    def __init__(self, model, **kwargs):
        """
        Parameters
        ----------
        model: object
            language detection model; must expose `predict(List[str])`
            returning one label per input word.
        **kwargs:
            accepted for backward compatibility; not used.
        """
        self._model = model
        # Always define these so the instance is consistent even when
        # pyenchant is missing or broken.
        self.d = None
        self._enchant_available = False

        try:
            import enchant
        except Exception:
            logger.warning(
                'pyenchant not installed. Please install it by `pip3 install pyenchant` and try again. For now, pyenchant will be disabled.')
            # No point trying to load a dictionary without the library; this
            # also avoids emitting a second, misleading warning.
            return

        try:
            dictionary = enchant.Dict('en_US')
            # Smoke test: make sure the dictionary is actually usable.
            dictionary.check('Hello')
        except Exception:
            logger.warning(
                'cannot load `en_US` enchant dictionary. Please install it from https://pyenchant.github.io/pyenchant/install.html and try again. For now, pyenchant will be disabled.')
            return

        self.d = dictionary
        self._enchant_available = True

    def _rule_label(self, word: str, use_is_malay: bool):
        """
        Label `word` using rules and dictionaries only.

        Returns the label, or None when the word has to be resolved by the
        language detection model. The order of the checks is significant.
        """
        if is_emoji(word):
            return 'NOT_LANG'
        if word.isupper():
            return 'CAPITAL'
        if (
            # Strip thousand / decimal separators before the number check.
            _is_number_regex(word.replace(',', '').replace('.', ''))
            or word in PUNCTUATION
            or is_laugh(word)
            or is_mengeluh(word)
            or check_ratio_numbers(word) > 0.6666
            or check_ratio_punct(word) > 0.66666
        ):
            return 'NOT_LANG'
        if self._enchant_available and self.d.check(word):
            return 'EN'
        if use_is_malay and is_malay(word.lower()):
            return 'MS'
        return None

    # NOTE: the list defaults below are only read, never mutated; they are
    # kept as lists to preserve the documented public signature.
    def predict(
        self,
        words: List[str],
        acceptable_ms_label: List[str] = ['malay', 'ind'],
        acceptable_en_label: List[str] = ['eng', 'manglish'],
        use_is_malay: bool = True,
    ):
        """
        Predict [EN, MS, OTHERS, CAPITAL, NOT_LANG] on word level.
        This method assumed the string already tokenized.

        Parameters
        ----------
        words: List[str]
        acceptable_ms_label: List[str], optional (default = ['malay', 'ind'])
            accept labels from language detection model to assume a word is `MS`.
        acceptable_en_label: List[str], optional (default = ['eng', 'manglish'])
            accept labels from language detection model to assume a word is `EN`.
        use_is_malay: bool, optional (default=True)
            if True, will predict MS word using `malaya.dictionary.is_malay`,
            else use language detection model.

        Returns
        -------
        result: List[str]
        """
        results, others, indices = [], [], []
        for no, word in enumerate(words):
            label = self._rule_label(word, use_is_malay)
            if label is None:
                # Placeholder, overwritten below with the model's decision.
                label = 'REPLACE_ME'
                others.append(word)
                indices.append(no)
            results.append(label)

        # Nothing left for the model: skip the call entirely so empty input
        # (or fully rule-resolved input) never reaches the model.
        if not others:
            return results

        labels = self._model.predict(others)
        for index, label in zip(indices, labels):
            if label in acceptable_ms_label:
                results[index] = 'MS'
            elif label in acceptable_en_label:
                results[index] = 'EN'
            else:
                results[index] = 'OTHERS'

        return results