Spelling Correction using Symspeller#

This tutorial is available as an IPython notebook at Malaya/example/spelling-correction-symspell.

Dependencies#

This spelling correction is an improved version of https://github.com/mammothb/symspellpy, adapted to handle our local short forms and typos. Before you are able to use this spelling correction, you need to install symspellpy:

pip install symspellpy
[2]:
import malaya
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp8er24wly
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp8er24wly/_remote_module_non_scriptable.py
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[3]:
# some text examples copied from Twitter

string1 = 'krajaan patut bagi pencen awal skt kpd warga emas supaya emosi'
string2 = 'Husein ska mkn aym dkat kampng Jawa'
string3 = 'Melayu malas ni narration dia sama je macam men are trash. True to some, false to some.'
string4 = 'Tapi tak pikir ke bahaya perpetuate myths camtu. Nanti kalau ada hiring discrimination despite your good qualifications because of your race tau pulak marah. Your kids will be victims of that too.'
string5 = 'DrM cerita Melayu malas semenjak saya kat University (early 1980s) and now as i am edging towards retirement in 4-5 years time after a career of being an Engineer, Project Manager, General Manager'
string6 = 'blh bntg dlm kls nlp sy, nnti intch'
string7 = 'mulakn slh org boleh ,bila geng tuh kena slhkn jgk xboleh trima .. pelik'

Load symspeller model#

def load(
    max_edit_distance_dictionary: int = 2,
    prefix_length: int = 7,
    term_index: int = 0,
    count_index: int = 1,
    top_k: int = 10,
    **kwargs
):
    """
    Load a symspell Spell Corrector for Malay.

    Returns
    -------
    result: malaya.spelling_correction.symspell.Symspell class
    """
[4]:
model = malaya.spelling_correction.symspell.load()
INFO:malaya_boilerplate.huggingface:downloading frozen huseinzol05/v27-preprocessing/bm_1grams.txt
INFO:malaya_boilerplate.huggingface:downloading frozen huseinzol05/v27-preprocessing/bm_1grams.json

List the pool of possible generated candidate words#

def edit_candidates(self, word):
    """
    Generate candidates given a word.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: List[str]
    """
[5]:
model.edit_candidates('mhthir')
[5]:
['mahathir',
 'mahadhir',
 'mahatir',
 'mahadzir',
 'matahir',
 'mahathma',
 'hathir',
 'mahachi',
 'mahathat',
 'mahadhika',
 'mahavira',
 'mhthir']
[6]:
model.edit_candidates('smbng')
[6]:
['sambang',
 'lambang',
 'tambang',
 'ambang',
 'bambang',
 'sambung',
 'sembang',
 'sumbang',
 'mambang',
 'sumbangan',
 'sambungan',
 'sembawang',
 'sembarang',
 'sambaran',
 'samarang',
 'sambakong',
 'sembayang',
 'tambalang',
 'ambacang',
 'sabatang',
 'samalanga',
 'kabananga',
 'smbng']

To correct a word#

def correct(self, word: str, **kwargs):
    """
    Most probable spelling correction for word.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: str
    """
[7]:
model.correct('suke')
[7]:
'suka'
[8]:
model.correct('kpd')
[8]:
'kpd'
[9]:
model.correct('krajaan')
[9]:
'kerajaan'

To correct a sentence#

def correct_text(self, text: str):
    """
    Correct all the words within a text, returning the corrected text.

    Parameters
    ----------
    text: str

    Returns
    -------
    result: str
    """
[10]:
model.correct_text(string1)
[10]:
'kerajaan patut bagi pencen awal saat kpd warga emas supaya emosi'
[11]:
tokenizer = malaya.tokenizer.Tokenizer()
[12]:
string2
[12]:
'Husein ska mkn aym dkat kampng Jawa'
[13]:
tokenized = tokenizer.tokenize(string2)
model.correct_text(' '.join(tokenized))
[13]:
'Hussein ska makan ayam dapat kampung Jawa'
[14]:
tokenized = tokenizer.tokenize(string3)
model.correct_text(' '.join(tokenized))
[14]:
'Melayu malas ni narration dia sama je macam men are trash . True to some , false to some .'
[15]:
tokenized = tokenizer.tokenize(string5)
model.correct_text(' '.join(tokenized))
[15]:
'DrM cerita Melayu malas semenjak saya kat University ( early 1980s ) and now asia i am edging towards retirement cina 4 - 5 years time after a career of being an Engineer , Project Manager , General Manager'
[16]:
tokenized = tokenizer.tokenize(string6)
model.correct_text(' '.join(tokenized))
[16]:
'blh bintang dlm kelas malaya saya , nnti mintalah'
[17]:
tokenized = tokenizer.tokenize(string7)
model.correct_text(' '.join(tokenized))
[17]:
'mulakan slh org boleh , bila geng tuh kena salahkan jgk boleh trima . . pelik'