Source code for malaya.model.rules

from malaya.text.normalization import _is_number_regex, _is_mandarin_char
from malaya.text.function import (
    check_ratio_numbers,
    check_ratio_punct,
    is_emoji,
    is_laugh,
    is_mengeluh,
    PUNCTUATION,
)
from malaya.dictionary import is_malay, is_english
from malaya.dictionary.mandarin.pinyin import pinyin_dict
from typing import List
import logging

logger = logging.getLogger(__name__)
pinyin = set(pinyin_dict.values())


class LanguageDict:
    def __init__(self, model, **kwargs):
        enchant_available = True
        try:
            import enchant
        except BaseException:
            logger.warning(
                'pyenchant not installed. Please install it by `pip3 install pyenchant` and try again. For now, pyenchant will be disabled.')
            enchant_available = False

        try:
            self.d = enchant.Dict('en_US')
            self.d.check('Hello')
        except BaseException:
            logger.warning(
                'cannot load `en_US` enchant dictionary. Please install it from https://pyenchant.github.io/pyenchant/install.html and try again. For now, pyenchant will be disabled.')
            enchant_available = False

        self._enchant_available = enchant_available
        self._model = model
    def predict(
        self,
        words: List[str],
        acceptable_ms_label: List[str] = ['malay', 'ind'],
        acceptable_en_label: List[str] = ['eng', 'manglish'],
        ignore_capital: bool = False,
        use_is_malay: bool = True,
        predict_mandarin: bool = False,
    ):
        """
        Predict [EN, MS, OTHERS, CAPITAL, NOT_LANG] on word level.
        This method assumes the string is already tokenized.

        Parameters
        ----------
        words: List[str]
        acceptable_ms_label: List[str], optional (default = ['malay', 'ind'])
            accept labels from language detection model to assume a word is `MS`.
        acceptable_en_label: List[str], optional (default = ['eng', 'manglish'])
            accept labels from language detection model to assume a word is `EN`.
        ignore_capital: bool, optional (default=False)
            if True, will predict language for capital words.
        use_is_malay: bool, optional (default=True)
            if True, will predict MS word using `malaya.dictionary.is_malay`, else use language detection model.
        predict_mandarin: bool, optional (default=False)
            if True, will slide the string to match pinyin dict.

        Returns
        -------
        result: List[str]
        """
        results, others, indices = [], [], []
        for no, word in enumerate(words):
            # rule-based labels first; anything unresolved falls through to the model.
            if is_emoji(word):
                results.append('NOT_LANG')
            elif word.isupper() and not ignore_capital:
                results.append('CAPITAL')
            elif _is_number_regex(word.replace(',', '').replace('.', '')):
                results.append('NOT_LANG')
            elif word in PUNCTUATION:
                results.append('NOT_LANG')
            elif is_laugh(word):
                results.append('NOT_LANG')
            elif is_mengeluh(word):
                results.append('NOT_LANG')
            elif check_ratio_numbers(word) > 0.6666:
                results.append('NOT_LANG')
            elif check_ratio_punct(word) > 0.66666:
                results.append('NOT_LANG')
            elif _is_mandarin_char(word):
                results.append('MANDARIN')
            elif self._enchant_available and self.d.check(word) and not is_malay(word.lower()):
                results.append('EN')
            elif use_is_malay and is_malay(word.lower()):
                results.append('MS')
            else:
                # unresolved words are batched for the language detection model.
                results.append('REPLACE_ME')
                others.append(word)
                indices.append(no)

        labels = self._model.predict(others)
        for no in range(len(labels)):
            if labels[no] in acceptable_ms_label:
                results[indices[no]] = 'MS'
            elif labels[no] in acceptable_en_label:
                results[indices[no]] = 'EN'
            else:
                results[indices[no]] = 'OTHERS'

        if predict_mandarin:
            # group consecutive pinyin-looking tokens and relabel them as MANDARIN.
            temp, indices = [], []
            for no, word in enumerate(words):
                if word in pinyin:
                    temp.append(word)
                    indices.append(no)
                else:
                    is_chinese = False
                    if len(temp):
                        if len(temp) == 1:
                            if temp[0][0] in 'xz':
                                is_chinese = True
                            elif len(temp[0]) > 2:
                                is_chinese = True
                        else:
                            is_chinese = True

                    if is_chinese:
                        for i in indices:
                            results[i] = 'MANDARIN'

        return results
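

# A minimal usage sketch, not part of the module source: it assumes a fallback
# language-detection model such as `malaya.language_detection.fasttext()` whose
# `.predict()` returns labels like 'malay', 'ind', 'eng', 'manglish'; only the
# `LanguageDict` API above is taken from this module.
#
#     import malaya
#     from malaya.model.rules import LanguageDict
#
#     fasttext = malaya.language_detection.fasttext()
#     lang_dict = LanguageDict(model=fasttext)
#
#     words = ['Hi', ',', 'saya', 'suka', 'makan', 'nasi', 'lemak', '123']
#     print(lang_dict.predict(words))
#     # one label per token, drawn from EN / MS / OTHERS / CAPITAL / NOT_LANG / MANDARIN.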