Source code for malaya.translation

from malaya.supervised.huggingface import load
from malaya.torch_model.huggingface import Translation
from malaya_boilerplate.huggingface import download_files
from typing import Callable, List
import json

nllb_metrics = {
    'en-ms': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, eng_Latn-zsm_Latn, 66.5
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, eng_Latn-zsm_Latn, 66.3
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, eng_Latn-zsm_Latn, 65.2
4. NLLB-200-Distilled, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200densedst1bmetrics, eng_Latn-zsm_Latn, 65.5
5. NLLB-200-Distilled, Dense, 600M, 2.46 GB, https://tinyurl.com/nllb200densedst600mmetrics, eng_Latn-zsm_Latn, 63.5
""",
    'ind-ms': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, ind_Latn-zsm_Latn, 60.2
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, ind_Latn-zsm_Latn, None
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, ind_Latn-zsm_Latn, None
""",
    'jav-ms': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, jav_Latn-zsm_Latn, 56.5
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, jav_Latn-zsm_Latn, None
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, jav_Latn-zsm_Latn, None
""",
    'ms-en': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, zsm_Latn-eng_Latn, 68
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, zsm_Latn-eng_Latn, 67.8
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, zsm_Latn-eng_Latn, 66.4
4. NLLB-200-Distilled, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200densedst1bmetrics, zsm_Latn-eng_Latn, 66.2
5. NLLB-200-Distilled, Dense, 600M, 2.46 GB, https://tinyurl.com/nllb200densedst600mmetrics, zsm_Latn-eng_Latn, 64.3
""",
    'ms-ind': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, zsm_Latn-ind_Latn, 62.4
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, zsm_Latn-ind_Latn, None
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, zsm_Latn-ind_Latn, None
""",
    'ms-jav': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, zsm_Latn-jav_Latn, 49.5
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, zsm_Latn-jav_Latn, None
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, zsm_Latn-jav_Latn, None
""",
    'en-zho_Hans': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, eng_Latn-zho_Hans, 22.8
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, eng_Latn-zho_Hans, 22.3
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, eng_Latn-zho_Hans, 21.3
""",
    'zho_Hans-en': """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, zho_Hans-eng_Latn, 54.7
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, zho_Hans-eng_Latn, 56.2
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, zho_Hans-eng_Latn, 54.7
"""
}

google_translate_metrics = {
    'en-ms': """
Google Translation metrics (2022-07-23) on FLORES200, https://github.com/huseinzol05/malay-dataset/blob/master/translation/malay-english/flores200-en-ms-google-translate.ipynb:
{'name': 'BLEU',
 'score': 39.12728212969207,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '71.1/47.2/32.7/22.8 (BP = 0.984 ratio = 0.984 hyp_len = 21679 ref_len = 22027)',
 'bp': 0.9840757522087613,
 'counts': [15406, 9770, 6435, 4256],
 'totals': [21679, 20682, 19685, 18688],
 'sys_len': 21679,
 'ref_len': 22027,
 'precisions': [71.0641634761751,
  47.2391451503723,
  32.68986537973076,
  22.773972602739725],
 'prec_str': '71.1/47.2/32.7/22.8',
 'ratio': 0.9842012076088437}
chrF2++ = 64.45
""",
    'ms-en': """
Google Translation metrics (2022-07-23) on FLORES200, https://github.com/huseinzol05/malay-dataset/blob/master/translation/malay-english/flores200-ms-en-google-translate.ipynb:
{'name': 'BLEU',
 'score': 36.152220848177286,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '68.2/43.5/29.7/20.5 (BP = 0.986 ratio = 0.986 hyp_len = 23243 ref_len = 23570)',
 'bp': 0.9860297505310752,
 'counts': [15841, 9688, 6318, 4147],
 'totals': [23243, 22246, 21249, 20252],
 'sys_len': 23243,
 'ref_len': 23570,
 'precisions': [68.15385277287785,
  43.54940213971051,
  29.733163913595934,
  20.476989926920798],
 'prec_str': '68.2/43.5/29.7/20.5',
 'ratio': 0.986126431904964}
chrF2++ = 60.27
"""
}

available_word = {
    'mesolitica/word-en-ms': {
        'Size (MB)': 42.6,
        'total words': 1599797,
    },
    'mesolitica/word-id-ms': {
        'Size (MB)': 53,
        'total words': 1902607,
    }
}

available_huggingface = {
    'mesolitica/translation-t5-tiny-standard-bahasa-cased': {
        'Size (MB)': 139,
        'Suggested length': 1536,
        'en-ms chrF2++': 65.91,
        'ms-en chrF2++': 61.30,
        'ind-ms chrF2++': 58.15,
        'jav-ms chrF2++': 49.33,
        'pasar ms-ms chrF2++': 58.46,
        'pasar ms-en chrF2++': 55.76,
        'manglish-ms chrF2++': 51.04,
        'manglish-en chrF2++': 52.20,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],
        'to lang': ['en', 'ms'],
    },
    'mesolitica/translation-t5-small-standard-bahasa-cased': {
        'Size (MB)': 242,
        'Suggested length': 1536,
        'en-ms chrF2++': 67.37,
        'ms-en chrF2++': 63.79,
        'ind-ms chrF2++': 58.09,
        'jav-ms chrF2++': 52.11,
        'pasar ms-ms chrF2++': 62.49,
        'pasar ms-en chrF2++': 60.77,
        'manglish-ms chrF2++': 52.84,
        'manglish-en chrF2++': 53.65,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],
        'to lang': ['en', 'ms'],
    },
    'mesolitica/translation-t5-base-standard-bahasa-cased': {
        'Size (MB)': 892,
        'Suggested length': 1536,
        'en-ms chrF2++': 67.62,
        'ms-en chrF2++': 64.41,
        'ind-ms chrF2++': 59.25,
        'jav-ms chrF2++': 52.86,
        'pasar ms-ms chrF2++': 62.99,
        'pasar ms-en chrF2++': 62.06,
        'manglish-ms chrF2++': 54.40,
        'manglish-en chrF2++': 54.14,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],
        'to lang': ['en', 'ms'],
    },
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2': {
        'Size (MB)': 242,
        'Suggested length': 2048,
        'en-ms chrF2++': 67.8,
        'ms-en chrF2++': 64.53,
        'ind-ms chrF2++': 60.38,
        'jav-ms chrF2++': 53.48,
        'pasar ms-ms chrF2++': 63.13,
        'pasar ms-en chrF2++': 63.04,
        'manglish-ms chrF2++': 56.57,
        'manglish-en chrF2++': 54.14,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn'],
        'to lang': ['en', 'ms'],
    },
    'mesolitica/translation-t5-small-standard-bahasa-cased-code': {
        'Size (MB)': 242,
        'Suggested length': 2048,
        'en-ms chrF2++': 66.89,
        'ms-en chrF2++': 63.79,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn'],
        'to lang': ['en', 'ms'],
    },
    'mesolitica/translation-nanot5-tiny-malaysian-cased': {
        'Size (MB)': 205,
        'Suggested length': 2048,
        'en-ms chrF2++': 63.61,
        'ms-en chrF2++': 59.55,
        'ind-ms chrF2++': 56.38,
        'jav-ms chrF2++': 47.68,
        'mandarin-ms chrF2++': 36.61,
        'mandarin-en chrF2++': 39.78,
        'pasar ms-ms chrF2++': 58.74,
        'pasar ms-en chrF2++': 54.87,
        'manglish-ms chrF2++': 50.76,
        'manglish-en chrF2++': 53.16,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms', 'mandarin', 'pasar mandarin'],
        'to lang': ['en', 'ms'],
    },
    'mesolitica/translation-nanot5-small-malaysian-cased': {
        'Size (MB)': 358,
        'Suggested length': 2048,
        'en-ms chrF2++': 66.98,
        'ms-en chrF2++': 63.52,
        'ind-ms chrF2++': 58.10,
        'jav-ms chrF2++': 51.55,
        'mandarin-ms chrF2++': 46.09,
        'mandarin-en chrF2++': 44.13,
        'pasar ms-ms chrF2++': 63.20,
        'pasar ms-en chrF2++': 59.78,
        'manglish-ms chrF2++': 54.09,
        'manglish-en chrF2++': 55.27,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms', 'mandarin', 'pasar mandarin'],
        'to lang': ['en', 'ms'],
    },
    'mesolitica/translation-nanot5-base-malaysian-cased': {
        'Size (MB)': 990,
        'Suggested length': 2048,
        'en-ms chrF2++': 67.87,
        'ms-en chrF2++': 64.79,
        'ind-ms chrF2++': 56.98,
        'jav-ms chrF2++': 51.21,
        'mandarin-ms chrF2++': 47.39,
        'mandarin-en chrF2++': 48.78,
        'pasar ms-ms chrF2++': 65.06,
        'pasar ms-en chrF2++': 64.03,
        'manglish-ms chrF2++': 57.91,
        'manglish-en chrF2++': 55.66,
        'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms', 'mandarin', 'pasar mandarin'],
        'to lang': ['en', 'ms'],
    },
}
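
# A minimal, illustrative sketch (not part of the original module): since
# `available_huggingface` records the supported language pairs per model,
# picking a model for a given direction is a plain dictionary filter. The
# 'mandarin' source language below is just one of the listed examples.
#
# >>> import malaya
# >>> [name for name, meta in malaya.translation.available_huggingface.items()
# ...  if 'mandarin' in meta['from lang'] and 'en' in meta['to lang']]
# ['mesolitica/translation-nanot5-tiny-malaysian-cased',
#  'mesolitica/translation-nanot5-small-malaysian-cased',
#  'mesolitica/translation-nanot5-base-malaysian-cased']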

info = """
1. tested on FLORES200 pair `dev` set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/flores200-eval
2. tested on noisy test set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/noisy-eval
3. check out NLLB 200 metrics from `malaya.translation.nllb_metrics`.
4. check out Google Translate metrics from `malaya.translation.google_translate_metrics`.
""".strip()


def word(model: str = 'mesolitica/word-en-ms', **kwargs):
    """
    Load word dictionary, based on Google Translate.

    Parameters
    ----------
    model: str, optional (default='mesolitica/word-en-ms')
        Check available models at `malaya.translation.available_word`.

    Returns
    -------
    result: Dict[str, str]
    """
    if model not in available_word:
        raise ValueError(
            'model not supported, please check supported models from `malaya.translation.available_word`.'
        )

    s3_file = {'model': 'dictionary.json'}
    path = download_files(model, s3_file, **kwargs)

    with open(path['model']) as fopen:
        translator = json.load(fopen)

    return translator
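
# A minimal usage sketch (not part of the original module): the loader above
# returns the raw `dictionary.json` mapping, so translating a word is a plain
# dict lookup. The sample word and its output below are only illustrative.
#
# >>> import malaya
# >>> en_ms = malaya.translation.word(model='mesolitica/word-en-ms')
# >>> en_ms.get('book')
# 'buku'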
def huggingface(
    model: str = 'mesolitica/translation-t5-small-standard-bahasa-cased',
    force_check: bool = True,
    **kwargs,
):
    """
    Load HuggingFace model to translate.

    Parameters
    ----------
    model: str, optional (default='mesolitica/translation-t5-small-standard-bahasa-cased')
        Check available models at `malaya.translation.available_huggingface`.
    force_check: bool, optional (default=True)
        Force check whether the model is one of the Malaya models.
        Set to False if you have your own HuggingFace model.

    Returns
    -------
    result: malaya.torch_model.huggingface.Translation
    """
    return load(
        model=model,
        class_model=Translation,
        available_huggingface=available_huggingface,
        force_check=force_check,
        path=__name__,
        **kwargs,
    )
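
# A minimal usage sketch (not part of the original module): load one of the
# models listed in `available_huggingface` and translate a batch of strings.
# The `generate(strings, to_lang=...)` call assumes the interface of
# malaya.torch_model.huggingface.Translation; any extra keyword arguments are
# passed through to the underlying HuggingFace `generate` (e.g. max_length).
#
# >>> import malaya
# >>> model = malaya.translation.huggingface(
# ...     model='mesolitica/translation-t5-small-standard-bahasa-cased'
# ... )
# >>> model.generate(['Hai, apa khabar?'], to_lang='en', max_length=256)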