"""Source code for malaya.language_model."""

from malaya.function import check_file
from malaya.torch_model.causal_lm import GPT2 as GPT2LM
from malaya.torch_model.mask_lm import MLMScorer

# KenLM pretrained-model registry: model name -> metadata (download size,
# n-gram order, training-corpus description, and the exact kenlm commands
# used to build the binary trie).
_BUILD_BINARY = './build_binary -q 8 -b 7 -a 256 trie out.arpa out.trie.klm'
_LMPLZ_ORDER_3 = './lmplz --text text.txt --arpa out.arpa -o 3 --prune 0 1 1'
_LMPLZ_ORDER_4 = './lmplz --text text.txt --arpa out.arpa -o 4 --prune 0 1 1 1'

available_kenlm = {
    name: {
        'Size (MB)': size,
        'LM order': order,
        'Description': description,
        # Fresh list per entry so callers mutating one entry cannot
        # affect another.
        'Command': [_LMPLZ_ORDER_4 if order == 4 else _LMPLZ_ORDER_3, _BUILD_BINARY],
    }
    for name, size, order, description in [
        ('bahasa-wiki', 70.5, 3, 'MS wikipedia.'),
        ('bahasa-news', 107, 3, 'local news.'),
        ('bahasa-wiki-news', 165, 3, 'MS wikipedia + local news.'),
        ('bahasa-wiki-news-iium-stt', 416, 3, 'MS wikipedia + local news + IIUM + STT'),
        ('dump-combined', 310, 3, 'Academia + News + IIUM + Parliament + Watpadd + Wikipedia + Common Crawl'),
        ('redape-community', 887.1, 4, 'Mirror for https://github.com/redapesolutions/suara-kami-community'),
    ]
}

# GPT2 causal-LM registry: HuggingFace model id -> metadata.
available_gpt2 = dict(
    [
        ('mesolitica/gpt2-117m-bahasa-cased', {'Size (MB)': 454}),
    ]
)

# Masked-LM registry: HuggingFace model id -> metadata (download size).
available_mlm = {
    name: {'Size (MB)': size}
    for name, size in [
        ('mesolitica/bert-base-standard-bahasa-cased', 310),
        ('mesolitica/bert-tiny-standard-bahasa-cased', 66.1),
        ('mesolitica/roberta-base-standard-bahasa-cased', 443),
        ('mesolitica/roberta-tiny-standard-bahasa-cased', 66.1),
        ('mesolitica/malaysian-debertav2-base', 228),
    ]
}


def kenlm(model: str = 'dump-combined', **kwargs):
    """
    Load KenLM language model.

    Parameters
    ----------
    model: str, optional (default='dump-combined')
        Check available models at `malaya.language_model.available_kenlm`.

    Returns
    -------
    result: kenlm.Model class

    Raises
    ------
    ValueError
        If `model` is not a key of `available_kenlm`.
    ModuleNotFoundError
        If the optional `kenlm` package is not installed.
    """
    # Validate the model name before touching the optional dependency so a
    # typo fails fast with the right error even when kenlm is not installed.
    if model not in available_kenlm:
        raise ValueError(
            'model not supported, please check supported models from `malaya.language_model.available_kenlm`.'
        )
    try:
        import kenlm
    except ImportError:
        # Narrowed from `except BaseException`, which would also have
        # swallowed KeyboardInterrupt / SystemExit.
        raise ModuleNotFoundError(
            'kenlm not installed. Please install it by `pip install pypi-kenlm` and try again.'
        )
    # Download (or reuse cached) model artifacts; returns local file paths.
    path = check_file(
        file=model,
        module='language-model',
        keys={
            'model': 'model.klm',
        },
        quantized=False,
        **kwargs,
    )
    return kenlm.Model(path['model'])
def gpt2(
    model: str = 'mesolitica/gpt2-117m-bahasa-cased',
    force_check: bool = True,
    **kwargs,
):
    """
    Load GPT2 language model.

    Parameters
    ----------
    model: str, optional (default='mesolitica/gpt2-117m-bahasa-cased')
        Check available models at `malaya.language_model.available_gpt2`.
    force_check: bool, optional (default=True)
        Force check model one of malaya model.
        Set to False if you have your own huggingface model.

    Returns
    -------
    result: malaya.torch_model.causal_lm.GPT2 class

    Raises
    ------
    ValueError
        If `force_check` is True and `model` is not in `available_gpt2`.
    """
    # NOTE: docstring return type fixed — this function returns the GPT2
    # class imported from malaya.torch_model.causal_lm, not gpt2_lm.LM.
    if force_check and model not in available_gpt2:
        raise ValueError(
            'model not supported, please check supported models from `malaya.language_model.available_gpt2`.'
        )
    # Use a distinct name instead of shadowing the `model` string argument
    # with the model object.
    lm = GPT2LM.from_pretrained(model, **kwargs)
    lm.load_tokenizer()
    return lm
def mlm(
    model: str = 'mesolitica/malaysian-debertav2-base',
    force_check: bool = True,
    **kwargs
):
    """
    Load Masked language model.

    Parameters
    ----------
    model: str, optional (default='mesolitica/malaysian-debertav2-base')
        Check available models at `malaya.language_model.available_mlm`.
    force_check: bool, optional (default=True)
        Force check model one of malaya model.
        Set to False if you have your own huggingface model.

    Returns
    -------
    result: malaya.torch_model.mask_lm.MLMScorer class
    """
    # Only enforce the registry check when the caller asked for it.
    known_model = model in available_mlm
    if force_check and not known_model:
        raise ValueError(
            'model not supported, please check supported models from `malaya.language_model.available_mlm`.'
        )
    return MLMScorer(model, **kwargs)