Source code for malaya.language_detection

from malaya_boilerplate.huggingface import download_files
from malaya.model.ml import LanguageDetection
from malaya.model.rules import LanguageDict
import logging

logger = logging.getLogger(__name__)

metrics = {
    'deep-model': """
              precision    recall  f1-score   support

         eng    0.96760   0.97401   0.97080    553739
         ind    0.97635   0.96131   0.96877    576059
       malay    0.96985   0.98498   0.97736   1800649
    manglish    0.98036   0.96569   0.97297    181442
       other    0.99641   0.99627   0.99634   1428083
       rojak    0.94221   0.84302   0.88986    189678

    accuracy                        0.97779   4729650
   macro avg    0.97213   0.95421   0.96268   4729650
weighted avg    0.97769   0.97779   0.97760   4729650
""",
    'mesolitica/fasttext-language-detection-v1': """
              precision    recall  f1-score   support

         eng    0.94014   0.96750   0.95362    553739
         ind    0.97290   0.97316   0.97303    576059
       malay    0.98674   0.95262   0.96938   1800649
    manglish    0.96595   0.98417   0.97498    181442
       other    0.98454   0.99698   0.99072   1428083
       rojak    0.81149   0.91650   0.86080    189678

    accuracy                        0.97002   4729650
   macro avg    0.94363   0.96515   0.95375   4729650
weighted avg    0.97111   0.97002   0.97028   4729650
""",
    'mesolitica/fasttext-language-detection-v2': """
                        precision    recall  f1-score   support

         local-english    0.88328   0.87926   0.88127     50429
           local-malay    0.93159   0.92648   0.92903     59877
        local-mandarin    0.62000   0.95044   0.75045     49820
              manglish    0.98494   0.98157   0.98325     49648
                 other    0.99168   0.92850   0.95905     64350
socialmedia-indonesian    0.97626   0.95390   0.96495     75140
      standard-english    0.86918   0.88018   0.87465     49776
   standard-indonesian    0.99695   0.99713   0.99704     50148
        standard-malay    0.92292   0.94851   0.93554     50049
     standard-mandarin    0.90855   0.53587   0.67413     53709

              accuracy                        0.89953    552946
             macro avg    0.90853   0.89818   0.89494    552946
          weighted avg    0.91425   0.89953   0.89893    552946
""",
    'mesolitica/fasttext-language-detection-ms-id': """
                        precision    recall  f1-score   support

           local-malay    0.95063   0.93858   0.94457    199961
                 other    0.97145   0.98889   0.98009    125920
socialmedia-indonesian    0.97923   0.96303   0.97106    213486
   standard-indonesian    0.99119   0.99610   0.99364    149055
        standard-malay    0.93743   0.95669   0.94696    149336

              accuracy                        0.96584    837758
             macro avg    0.96599   0.96866   0.96727    837758
          weighted avg    0.96591   0.96584   0.96582    837758
""",
    'mesolitica/fasttext-language-detection-en': """
                  precision    recall  f1-score   support

   local-english    0.88991   0.89457   0.89223    149823
        manglish    0.98619   0.98479   0.98549    149535
           other    0.99439   0.99268   0.99354    140651
standard-english    0.89162   0.88967   0.89064    150703

        accuracy                        0.93952    590712
       macro avg    0.94053   0.94043   0.94047    590712
    weighted avg    0.93960   0.93952   0.93955    590712
""",
}

lang_labels_v1 = {
    0: 'eng',
    1: 'ind',
    2: 'malay',
    3: 'manglish',
    4: 'other',
    5: 'rojak',
}

lang_labels_v2 = {
    0: 'standard-english',
    1: 'local-english',
    2: 'manglish',
    3: 'standard-indonesian',
    4: 'socialmedia-indonesian',
    5: 'standard-malay',
    6: 'local-malay',
    7: 'standard-mandarin',
    8: 'local-mandarin',
    9: 'other',
}

lang_labels_ms_id = {
    0: 'standard-indonesian',
    1: 'socialmedia-indonesian',
    2: 'standard-malay',
    3: 'local-malay',
    4: 'other',
}

lang_labels_bahasa_en = {
    0: 'bahasa',
    1: 'english',
    2: 'other',
}

lang_labels_en = {
    0: 'standard-english',
    1: 'local-english',
    2: 'manglish',
    3: 'other',
}

label_v1 = list(lang_labels_v1.values())
label_v2 = list(lang_labels_v2.values())
label_ms_id = list(lang_labels_ms_id.values())
label_bahasa_en = list(lang_labels_bahasa_en.values())
label_en = list(lang_labels_en.values())

available_fasttext = {
    'mesolitica/fasttext-language-detection-v1': {
        'Size (MB)': 353,
        'Quantized Size (MB)': 31.1,
        'dim': 16,
        'Label': lang_labels_v1,
    },
    'mesolitica/fasttext-language-detection-v2': {
        'Size (MB)': 1840,
        'Quantized Size (MB)': 227,
        'dim': 16,
        'Label': lang_labels_v2,
    },
    'mesolitica/fasttext-language-detection-ms-id': {
        'Size (MB)': 537,
        'Quantized Size (MB)': 62.5,
        'dim': 16,
        'Label': lang_labels_ms_id,
    },
    'mesolitica/fasttext-language-detection-bahasa-en': {
        'Size (MB)': 537,
        'Quantized Size (MB)': 62.5,
        'dim': 16,
        'Label': lang_labels_bahasa_en,
    },
    'mesolitica/fasttext-language-detection-en': {
        'Size (MB)': 383,
        'Quantized Size (MB)': 42.3,
        'dim': 16,
        'Label': lang_labels_en,
    }
}

info = """
trained on 90% dataset, tested on another 10% test set, test dataset prepared at https://github.com/mesolitica/malaya/tree/5.1/pretrained-model/language-detection-v2
"""


[docs]def fasttext(
    model: str = 'mesolitica/fasttext-language-detection-v2',
    quantized: bool = True,
    **kwargs,
):
    """
    Load Fasttext language detection model.

    Parameters
    ----------
    model: str, optional (default='mesolitica/fasttext-language-detection-v2')
    quantized: bool, optional (default=True)
        if True, load quantized fasttext model. Else, load original fasttext model.

    Returns
    -------
    result : malaya.model.ml.LanguageDetection class
    """

    try:
        import fasttext
    except BaseException:
        raise ModuleNotFoundError(
            'fasttext not installed. Please install it by `pip install fasttext` and try again.'
        )

    if model not in available_fasttext:
        raise ValueError(
            'model not supported, please check supported models from `malaya.language_detection.available_fasttext`.'
        )

    if quantized:
        filename = 'fasttext.ftz'
    else:
        filename = 'fasttext.bin'

    s3_file = {'model': filename}
    path = download_files(model, s3_file, **kwargs)
    model_fasttext = fasttext.load_model(path['model'])
    return LanguageDetection(model_fasttext, available_fasttext[model]['Label'])


[docs]def substring_rules(model, **kwargs):
    """
    detect EN, MS, MANDARIN and OTHER languages in a string.

    EN words detection are using `pyenchant` from https://pyenchant.github.io/pyenchant/ and
    user language detection model.

    MS words detection are using `malaya.text.function.is_malay` and
    user language detection model.

    OTHER words detection are using any language detection classification model, such as,
    `malaya.language_detection.fasttext`.

    Parameters
    ----------
    model : Callable
        Callable model, must have `predict` method.

    Returns
    -------
    result : malaya.model.rules.LanguageDict class
    """

    if not hasattr(model, 'predict'):
        raise ValueError('model must have `predict` method')

    return LanguageDict(model=model, **kwargs)