from malaya.model.tf import Translation
from malaya.torch_model.huggingface import Generator
from malaya.model.bigbird import Translation as BigBird_Translation
from malaya.supervised import transformer as load_transformer
from malaya.supervised import bigbird as load_bigbird
from malaya.supervised import huggingface as load_huggingface
from malaya.function import describe_availability, check_file
from herpetologist import check_type
from malaya.path import PATH_PREPROCESSING, S3_PATH_PREPROCESSING
import json
import logging
import warnings
logger = logging.getLogger(__name__)
nllb_metrics = """
NLLB Metrics, https://github.com/facebookresearch/fairseq/tree/nllb#multilingual-translation-models:
1. NLLB-200, MOE, 54.5B, https://tinyurl.com/nllb200moe54bmetrics, eng_Latn-zsm_Latn, 66.5
2. NLLB-200, Dense, 3.3B, 17.58 GB, https://tinyurl.com/nllb200dense3bmetrics, eng_Latn-zsm_Latn, 66.3
3. NLLB-200, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200dense1bmetrics, eng_Latn-zsm_Latn, 65.2
4. NLLB-200-Distilled, Dense, 1.3B, 5.48 GB, https://tinyurl.com/nllb200densedst1bmetrics, eng_Latn-zsm_Latn, 65.5
5. NLLB-200-Distilled, Dense, 600M, 2.46 GB, https://tinyurl.com/nllb200densedst600mmetrics, eng_Latn-zsm_Latn, 63.5
"""
google_translate_metrics = """
Google Translation metrics (2022-07-23) on FLORES200, https://github.com/huseinzol05/malay-dataset/blob/master/translation/malay-english/flores200-en-ms-google-translate.ipynb:
{'name': 'BLEU',
'score': 39.12728212969207,
'_mean': -1.0,
'_ci': -1.0,
'_verbose': '71.1/47.2/32.7/22.8 (BP = 0.984 ratio = 0.984 hyp_len = 21679 ref_len = 22027)',
'bp': 0.9840757522087613,
'counts': [15406, 9770, 6435, 4256],
'totals': [21679, 20682, 19685, 18688],
'sys_len': 21679,
'ref_len': 22027,
'precisions': [71.0641634761751,
47.2391451503723,
32.68986537973076,
22.773972602739725],
'prec_str': '71.1/47.2/32.7/22.8',
'ratio': 0.9842012076088437}
chrF2++ = 64.45
"""
_transformer_availability = {
'small': {
'Size (MB)': 42.7,
'Quantized Size (MB)': 13.4,
'BLEU': 39.80538744027295,
'SacreBLEU Verbose': '80.2/63.8/52.8/44.4 (BP = 0.997 ratio = 0.997 hyp_len = 2621510 ref_len = 2630014)',
'SacreBLEU-chrF++-FLORES200': 64.46,
'Suggested length': 256,
},
'base': {
'Size (MB)': 234,
'Quantized Size (MB)': 82.7,
'BLEU': 42.21071347388556,
'SacreBLEU Verbose': '86.3/73.3/64.1/56.8 (BP = 0.985 ratio = 0.985 hyp_len = 2591093 ref_len = 2630014)',
'SacreBLEU-chrF++-FLORES200': 66.28,
'Suggested length': 256,
},
'bigbird': {
'Size (MB)': 246,
'Quantized Size (MB)': 63.7,
'BLEU': 39.09071749208737,
'SacreBLEU Verbose': '70.5/46.7/32.4/22.9 (BP = 0.989 ratio = 0.989 hyp_len = 21782 ref_len = 22027)',
'SacreBLEU-chrF++-FLORES200': 63.96,
'Suggested length': 1024,
},
'small-bigbird': {
'Size (MB)': 50.4,
'Quantized Size (MB)': 13.1,
'BLEU': 36.90195033318057,
'SacreBLEU Verbose': '67.0/43.8/30.1/21.0 (BP = 1.000 ratio = 1.028 hyp_len = 22637 ref_len = 22027)',
'SacreBLEU-chrF++-FLORES200': 62.85,
'Suggested length': 1024,
},
'noisy-base': {
'Size (MB)': 234,
'Quantized Size (MB)': 82.7,
'BLEU': 41.8278308435666,
'SacreBLEU Verbose': '73.1/49.7/35.3/25.4 (BP = 0.985 ratio = 0.985 hyp_len = 63506 ref_len = 64473)',
'SacreBLEU-chrF++-FLORES200': 66.46,
'Suggested length': 256,
},
}
_huggingface_availability = {
'mesolitica/finetune-translation-t5-super-super-tiny-standard-bahasa-cased': {
'Size (MB)': 23.3,
'BLEU': 36.29074311583665,
'SacreBLEU Verbose': '71.2/46.0/30.9/21.0 (BP = 0.950 ratio = 0.951 hyp_len = 20958 ref_len = 22027)',
'SacreBLEU-chrF++-FLORES200': 61.89,
'Suggested length': 256,
},
'mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased': {
'Size (MB)': 50.7,
'BLEU': 39.18834189893951,
'SacreBLEU Verbose': '72.6/48.3/33.5/23.6 (BP = 0.960 ratio = 0.961 hyp_len = 21172 ref_len = 22027)',
'SacreBLEU-chrF++-FLORES200': 64.03,
'Suggested length': 256,
},
'mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased': {
'Size (MB)': 139,
'BLEU': 41.625536185056305,
'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)',
'SacreBLEU-chrF++-FLORES200': 65.70,
'Suggested length': 256,
},
'mesolitica/finetune-translation-t5-small-standard-bahasa-cased': {
'Size (MB)': 242,
'BLEU': 43.93729753370648,
'SacreBLEU Verbose': '74.9/52.2/37.9/27.7 (BP = 0.976 ratio = 0.977 hyp_len = 21510 ref_len = 22027)',
'SacreBLEU-chrF++-FLORES200': 67.43,
'Suggested length': 256,
},
'mesolitica/finetune-translation-t5-base-standard-bahasa-cased': {
'Size (MB)': 892,
'BLEU': 44.17355862158963,
'SacreBLEU Verbose': '74.7/52.3/38.0/28.0 (BP = 0.979 ratio = 0.979 hyp_len = 21569 ref_len = 22027)',
'SacreBLEU-chrF++-FLORES200': 67.60,
'Suggested length': 256,
},
'mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased': {
'Size (MB)': 139,
'BLEU': 41.03641425544081,
'SacreBLEU Verbose': '72.9/49.2/34.8/25.0 (BP = 0.977 ratio = 0.977 hyp_len = 63005 ref_len = 64473)',
'SacreBLEU-chrF++-FLORES200': 65.58,
'Suggested length': 256,
},
'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased': {
'Size (MB)': 242,
'BLEU': 41.15794003172596,
'SacreBLEU Verbose': '72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 hyp_len = 63689 ref_len = 64473)',
'SacreBLEU-chrF++-FLORES200': 65.51,
'Suggested length': 256,
},
'mesolitica/finetune-noisy-translation-t5-base-bahasa-cased': {
'Size (MB)': 892,
'BLEU': 41.8278308435666,
'SacreBLEU Verbose': '73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 hyp_len = 63335 ref_len = 64473)',
'SacreBLEU-chrF++-FLORES200': 66.51,
'Suggested length': 256,
},
'mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2': {
'Size (MB)': 139,
'BLEU': 60.0009672168891,
'SacreBLEU Verbose': '77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 hyp_len = 110970 ref_len = 107150)',
'SacreBLEU-chrF++-FLORES200': None,
'Suggested length': 256,
},
'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4': {
'Size (MB)': 242,
'BLEU': 64.06258219941243,
'SacreBLEU Verbose': '80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 hyp_len = 111635 ref_len = 107150)',
'SacreBLEU-chrF++-FLORES200': None,
'Suggested length': 256,
},
'mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2': {
'Size (MB)': 892,
'BLEU': 64.583819005204,
'SacreBLEU Verbose': '80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 hyp_len = 112260 ref_len = 107150)',
'SacreBLEU-chrF++-FLORES200': None,
'Suggested length': 256,
},
}
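
# Quick lookup sketch (assumption: downstream code may want to consult the metadata above
# before loading a model; the dict keys are the accepted HuggingFace model names):
#
#     name = 'mesolitica/finetune-translation-t5-small-standard-bahasa-cased'
#     if name in _huggingface_availability:
#         print(_huggingface_availability[name]['Suggested length'])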


def _describe():
    logger.info('tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200')
    logger.info('for noisy models, tested on the noisy Twitter Google Translate test set, https://huggingface.co/datasets/mesolitica/augmentation-test-set')


def available_huggingface():
    """
    List available HuggingFace models.
    """
    _describe()
    return describe_availability(_huggingface_availability)
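
# Example usage (a minimal sketch; assumes `describe_availability` returns a pandas
# DataFrame built from `_huggingface_availability`, as in other Malaya modules):
#
#     import malaya
#     df = malaya.translation.en_ms.available_huggingface()
#     df.sort_values('BLEU', ascending=False)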


def dictionary(**kwargs):
    """
    Load EN-MS dictionary, a mapping of {EN: MS}.

    Returns
    -------
    result: Dict[str, str]
    """
    path = check_file(
        PATH_PREPROCESSING['english-malay'],
        S3_PATH_PREPROCESSING['english-malay'],
        **kwargs,
    )
    try:
        with open(path['model']) as fopen:
            translator = json.load(fopen)
    except BaseException:
        raise Exception('failed to load EN-MS vocab, please try to clear the cache and run again.')
    return translator
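
# Example usage (a minimal sketch, assuming the EN-MS vocab downloads and loads correctly;
# the lookup word and its translation are illustrative only):
#
#     en_ms = malaya.translation.en_ms.dictionary()
#     en_ms.get('love')  # e.g. 'cinta'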


@check_type
def huggingface(
    model: str = 'mesolitica/finetune-translation-t5-small-standard-bahasa-cased',
    force_check: bool = True,
    **kwargs,
):
    """
    Load HuggingFace model to translate EN-to-MS.

    Parameters
    ----------
    model: str, optional (default='mesolitica/finetune-translation-t5-small-standard-bahasa-cased')
        Check available models at `malaya.translation.en_ms.available_huggingface()`.
    force_check: bool, optional (default=True)
        Force check that `model` is one of the Malaya models listed above.
        Set to False if you want to load your own HuggingFace model.

    Returns
    -------
    result: malaya.torch_model.huggingface.Generator
    """
    if model not in _huggingface_availability and force_check:
        raise ValueError(
            'model not supported, please check supported models from `malaya.translation.en_ms.available_huggingface()`.'
        )
    return load_huggingface.load_generator(model=model, initial_text='terjemah Inggeris ke Melayu: ', **kwargs)
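
# Example usage (a minimal sketch; `generate` is assumed to follow the Malaya HuggingFace
# Generator interface, taking a list of strings plus optional `transformers` generation
# kwargs such as `max_length`):
#
#     model = malaya.translation.en_ms.huggingface()
#     model.generate(['I love you.'], max_length=256)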