EN to MS Noisy#

This tutorial is available as an IPython notebook at Malaya/example/noisy-en-ms-translation.

This module was trained on standard language and augmented local-language structures; proceed with caution.

[1]:
%%time

import malaya
import logging

logging.basicConfig(level=logging.INFO)
CPU times: user 5.82 s, sys: 1.23 s, total: 7.05 s
Wall time: 8.71 s

List available Transformer models#

[2]:
malaya.translation.en_ms.available_transformer()
INFO:malaya.translation.en_ms:tested on 77k EN-MS test set generated from teacher semisupervised model, https://huggingface.co/datasets/mesolitica/en-ms
INFO:malaya.translation.en_ms:tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200
[2]:
Size (MB) Quantized Size (MB) BLEU SacreBLEU Verbose SacreBLEU-chrF++-FLORES200 Suggested length
small 42.7 13.4 58.67129 80.2/63.8/52.8/44.4 (BP = 0.997 ratio = 0.997 ... 64.46 256
base 234 82.7 68.259569 86.3/73.3/64.1/56.8 (BP = 0.985 ratio = 0.985 ... 66.28 256
bigbird 246 63.7 59.863535 82.2/65.9/54.9/46.4 (BP = 0.982 ratio = 0.982 ... 59.64 1024
small-bigbird 50.4 13.1 56.701338 80.7/63.2/51.6/42.8 (BP = 0.979 ratio = 0.979 ... 58.01 1024
noisy-base 234 82.7 67.285716 86.1/72.7/63.3/55.8 (BP = 0.981 ratio = 0.981 ... 66.2 256

Load Transformer models#

def transformer(model: str = 'base', quantized: bool = False, **kwargs):
    """
    Load a Transformer encoder-decoder model to translate EN-to-MS.

    Parameters
    ----------
    model : str, optional (default='base')
        Model architecture to load. Allowed values:

        * ``'small'`` - Transformer SMALL parameters.
        * ``'base'`` - Transformer BASE parameters.
        * ``'bigbird'`` - BigBird BASE parameters.
        * ``'small-bigbird'`` - BigBird SMALL parameters.
        * ``'noisy-base'`` - Transformer BASE parameters trained on noisy dataset.

    quantized : bool, optional (default=False)
        If True, load the 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.

    Returns
    -------
    result: model
        If `bigbird` is in `model`, returns malaya.model.bigbird.Translation;
        otherwise, returns malaya.model.tf.Translation.
    """
[3]:
transformer = malaya.translation.en_ms.transformer()
INFO:malaya_boilerplate.frozen_graph:running Users/huseinzolkepli/.cache/huggingface/hub using device /device:CPU:0
[17]:
transformer_noisy = malaya.translation.en_ms.transformer(model = 'noisy-base')
INFO:malaya_boilerplate.frozen_graph:running Users/huseinzolkepli/.cache/huggingface/hub using device /device:CPU:0

Translate#

Using greedy decoder#

def greedy_decoder(self, strings: List[str]):
    """
    Translate a list of strings using the greedy decoder.

    Parameters
    ----------
    strings : List[str]
        Strings to translate.

    Returns
    -------
    result: List[str]
        Translated strings.
    """

Using beam decoder#

def beam_decoder(self, strings: List[str], beam_size: int = 3, temperature: float = 0.5):
    """
    Translate a list of strings using the beam decoder.
    Currently only `noisy` models support the `beam_size` and `temperature` parameters.

    Parameters
    ----------
    strings : List[str]
        Strings to translate.
    beam_size: int, optional (default=3)
    temperature: float, optional (default=0.5)

    Returns
    -------
    result: List[str]
        Translated strings.
    """

For better results, always split the text into sentences before translating.

[5]:
from pprint import pprint
[6]:
# https://www.malaymail.com/news/malaysia/2020/07/01/dr-mahathir-again-claims-anwar-lacks-popularity-with-malays-to-be-pakatans/1880420

string_news1 = 'KUALA LUMPUR, July 1 - Datuk Seri Anwar Ibrahim is not suitable to as the prime minister candidate as he is allegedly not "popular" among the Malays, Tun Dr Mahathir Mohamad claimed. The former prime minister reportedly said the PKR president needs someone like himself in order to acquire support from the Malays and win the election.'
pprint(string_news1)
('KUALA LUMPUR, July 1 - Datuk Seri Anwar Ibrahim is not suitable to as the '
 'prime minister candidate as he is allegedly not "popular" among the Malays, '
 'Tun Dr Mahathir Mohamad claimed. The former prime minister reportedly said '
 'the PKR president needs someone like himself in order to acquire support '
 'from the Malays and win the election.')
[7]:
# https://edition.cnn.com/2020/07/06/politics/new-york-attorney-general-blm/index.html

string_news2 = '(CNN)New York Attorney General Letitia James on Monday ordered the Black Lives Matter Foundation -- which she said is not affiliated with the larger Black Lives Matter movement -- to stop collecting donations in New York. "I ordered the Black Lives Matter Foundation to stop illegally accepting donations that were intended for the #BlackLivesMatter movement. This foundation is not affiliated with the movement, yet it accepted countless donations and deceived goodwill," James tweeted.'
pprint(string_news2)
('(CNN)New York Attorney General Letitia James on Monday ordered the Black '
 'Lives Matter Foundation -- which she said is not affiliated with the larger '
 'Black Lives Matter movement -- to stop collecting donations in New York. "I '
 'ordered the Black Lives Matter Foundation to stop illegally accepting '
 'donations that were intended for the #BlackLivesMatter movement. This '
 'foundation is not affiliated with the movement, yet it accepted countless '
 'donations and deceived goodwill," James tweeted.')
[8]:
# https://www.thestar.com.my/business/business-news/2020/07/04/malaysia-worries-new-eu-food-rules-could-hurt-palm-oil-exports

string_news3 = 'Amongst the wide-ranging initiatives proposed are a sustainable food labelling framework, a reformulation of processed foods, and a sustainability chapter in all EU bilateral trade agreements. The EU also plans to publish a proposal for a legislative framework for sustainable food systems by 2023 to ensure all foods on the EU market become increasingly sustainable.'
pprint(string_news3)
('Amongst the wide-ranging initiatives proposed are a sustainable food '
 'labelling framework, a reformulation of processed foods, and a '
 'sustainability chapter in all EU bilateral trade agreements. The EU also '
 'plans to publish a proposal for a legislative framework for sustainable food '
 'systems by 2023 to ensure all foods on the EU market become increasingly '
 'sustainable.')
[9]:
# https://jamesclear.com/articles

string_article1 = 'This page shares my best articles to read on topics like health, happiness, creativity, productivity and more. The central question that drives my work is, “How can we live better?” To answer that question, I like to write about science-based ways to solve practical problems.'
pprint(string_article1)
('This page shares my best articles to read on topics like health, happiness, '
 'creativity, productivity and more. The central question that drives my work '
 'is, “How can we live better?” To answer that question, I like to write about '
 'science-based ways to solve practical problems.')
[13]:
%%time

pprint(transformer_noisy.greedy_decoder([string_news1, string_news2, string_news3]))
['KUALA LUMPUR 1 Julai - Datuk Seri Anwar Ibrahim tidak sesuai sebagai calon '
 'Perdana Menteri kerana didakwa tidak "popular" dalam kalangan orang Melayu, '
 'Tun Dr Mahathir Mohamad mendakwa bekas perdana menteri itu dilaporkan '
 'berkata, Presiden PKR itu memerlukan seseorang seperti dirinya untuk '
 'mendapatkan sokongan daripada orang Melayu dan memenangi pilihan raya.',
 '(CNN) Peguam Negara New York Letitia James pada hari Isnin memerintahkan '
 'Yayasan Black Lives Matter - yang menurutnya tidak berafiliasi dengan '
 'gerakan Black Lives Matter yang lebih besar - untuk berhenti mengumpulkan '
 'sumbangan di New York. "Saya memerintahkan Yayasan Black Lives Matter untuk '
 'berhenti menerima sumbangan secara haram yang ditujukan untuk gerakan '
 '#BlackLivesMatter. Yayasan ini tidak berafiliasi dengan gerakan itu, namun '
 'ia menerima sumbangan yang tak terhitung jumlahnya dan muhibah yang ditipu," '
 'tweet James.',
 'Di antara inisiatif luas yang diusulkan adalah kerangka pelabelan makanan '
 'lestari, reformulasi makanan olahan, dan bab keberlanjutan dalam semua '
 'perjanjian perdagangan dua hala EU. EU juga merancang untuk menerbitkan '
 'cadangan untuk kerangka perundangan untuk sistem makanan lestari menjelang '
 '2023 untuk memastikan semua makanan di pasaran EU menjadi semakin lestari.']
CPU times: user 31.4 s, sys: 8.26 s, total: 39.7 s
Wall time: 22.3 s

Compare results using local language structure#

[18]:
strings = [
    'u ni, talk properly lah',
    "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍",
    'Me after seeing this video: mm dapnya burger benjo extra mayo',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
]
[19]:
%%time

pprint(transformer_noisy.greedy_decoder(strings))
['u ni, bercakap betul lah',
 'baru sahaja menghadiri majlis perkahwinan sepupu saya. jugak pelik dia buat '
 'majlis biasa2 je sebab gaya hidupnya kelihatan mewah. maka saya mendapat '
 'tahu mereka akan menjalani bulan madu selama 3 minggu. keputusan pintar',
 'Saya setelah melihat video ini: mm dapnya burger benjo extra mayo',
 'Hai kawan! Saya perhatikan semalam & harini dah ramai yang dapat cookies ni '
 'kan. Jadi harini saya nak berkongsi beberapa post mortem kumpulan pertama '
 'kami:']
CPU times: user 18.6 s, sys: 6.75 s, total: 25.4 s
Wall time: 18.2 s
[20]:
%%time

pprint(transformer.greedy_decoder(strings))
['u ni, bercakap dengan betul lah',
 'baru sahaja menghadiri majlis perkahwinan sepupu saya. jugak buat dia majlis '
 'biasa2 je sebab gaya hidupnya kelihatan mewah. kemudian saya mendapat tahu '
 'bahawa mereka akan berbulan madu selama 3 minggu. keputusan pintar',
 'Saya setelah melihat video ini: mm dapnya burger benjo extra mayo',
 'Hai kawan-kawan! Saya perhatikan semalam & harini dah ramai yang dapat '
 'cookies ni kan. Jadi harini saya nak berkongsi beberapa post mortem kumpulan '
 'pertama kami:']
CPU times: user 9.1 s, sys: 787 ms, total: 9.89 s
Wall time: 2.39 s

Compare with Google Translate using googletrans#

Install it with:

pip3 install googletrans==4.0.0rc1
[22]:
from googletrans import Translator

translator = Translator()
[23]:
for t in strings:
    r = translator.translate(t, src='en', dest = 'ms')
    print(r.text)
u ni, bercakap dengan betul lah
Baru sahaja menghadiri majlis perkahwinan sepupu saya.Pelik Jugak Dia Buat Majlis Biasa2 Je Sebab Gaya Hidupnya kelihatan mewah.Kemudian saya dapati mereka akan berbulan madu selama 3 minggu.Keputusan Pintar 👍
Saya setelah melihat video ini: mm dapnya burger benjo tambahan mayo
Hai semua!Saya perhatikan Semalam & Harini Dah Ramai Yang Dapate Cookies Ni Kan.Jadi harini i nak berkongsi beberapa bedah siasat kumpulan pertama kami: