Source code for malaya.spelling_correction.symspell

import json
from malaya.path import PATH_NGRAM, S3_PATH_NGRAM
from malaya.function import check_file
from malaya.spelling_correction.base import (
    _augment_vowel_alternate,
    get_permulaan_hujung,
)
from malaya.text.function import case_of, check_ratio_upper_lower
from malaya.dictionary import is_english, is_malay
from malaya.text.rules import rules_normalizer
from malaya.spelling_correction.probability import Spell
from malaya.text.tatabahasa import (
    consonants,
    permulaan,
    hujung,
    stopword_tatabahasa,
)
from typing import List


[docs]class Symspell(Spell): """ The SymspellCorrector extends the functionality of symspeller, https://github.com/mammothb/symspellpy And improve it using some algorithms from Normalization of noisy texts in Malaysian online reviews, https://www.researchgate.net/publication/287050449_Normalization_of_noisy_texts_in_Malaysian_online_reviews Added custom vowels augmentation """ def __init__(self, model, verbosity, corpus, k=10): self._model = model self._verbosity = verbosity self._corpus = corpus self.k = k def predict(self, word): max_edit_distance_lookup = 2 suggestion_verbosity = self._verbosity suggestions = self._model.lookup( word, suggestion_verbosity, max_edit_distance_lookup )[: self.k] return suggestions
[docs] def edit_step(self, word): """ Generate candidates given a word. Parameters ---------- word: str Returns ------- result: {candidate1, candidate2} """ result = list(_augment_vowel_alternate(word)) if len(word): # berape -> berapa, mne -> mna if word[-1] == 'e': inner = word[:-1] + 'a' result.extend(list(_augment_vowel_alternate(inner))) # pikir -> fikir if word[0] == 'p': inner = 'f' + word[1:] result.extend(list(_augment_vowel_alternate(inner))) if len(word) > 2: # bapak -> bapa, mintak -> minta, mntak -> mnta if word[-2:] == 'ak': inner = word[:-1] result.extend(list(_augment_vowel_alternate(word[:-1]))) # hnto -> hantar, bako -> bkar, sabo -> sabar # tido -> tidur if word[-1] == 'o' and word[-2] in consonants: inner = word[:-1] + 'ar' result.extend(list(_augment_vowel_alternate(inner))) inner = word[:-1] + 'ur' result.extend(list(_augment_vowel_alternate(inner))) # antu -> hantu, antar -> hantar if word[0] == 'a' and word[1] in consonants: inner = 'h' + word result.extend(list(_augment_vowel_alternate(inner))) # ptg -> ptng, dtg -> dtng if ( word[-3] in consonants and word[-2] in consonants and word[-1] == 'g' ): inner = word[:-1] + 'ng' result.extend(list(_augment_vowel_alternate(inner))) # igt -> ingt if word[1] == 'g' and word[2] in consonants: inner = word[0] + 'n' + word[1:] result.extend(list(_augment_vowel_alternate(inner))) words = {} for r in result: suggestions = self.predict(r) for s in suggestions: words[s.term] = words.get(s.term, 0) + ( s.count / (s.distance + 1) ) return words
[docs] def edit_candidates(self, word, get_score=False): """ Generate candidates given a word. Parameters ---------- word: str Returns ------- result: List[str] """ ttt = self.edit_step(word) ttt = {k: v for k, v in ttt.items() if not all([c in consonants for c in k])} or {word: 10} ttt = { k: v for k, v in ttt.items() if len(k) > 3 and not is_english(k) } ttt[word] = ttt.get(word, 0) + 10 if not len(ttt): ttt = {word: 10} if get_score: return ttt else: return list(ttt)
[docs] def correct(self, word: str, **kwargs): """ Most probable spelling correction for word. Parameters ---------- word: str Returns ------- result: str """ if is_english(word): return word if self._corpus.get(word, 0) > 5000: return word if is_malay(word): return word if word in stopword_tatabahasa: return word cp_word = word[:] word, hujung_result, permulaan_result = get_permulaan_hujung(word) combined = True if len(word): if word in rules_normalizer: word = rules_normalizer[word] else: candidates1 = self.edit_candidates(word, get_score=True) candidates2 = self.edit_candidates(cp_word, get_score=True) word1 = max(candidates1, key=candidates1.get) word2 = max(candidates2, key=candidates2.get) if candidates1[word1] > candidates2[word2]: word = word1 else: word = word2 combined = False if len(hujung_result) and not word.endswith(hujung_result) and combined: word = word + hujung_result if len(permulaan_result) and not word.startswith( permulaan_result) and combined: if len(word) and permulaan_result[-1] == word[0]: word = permulaan_result + word[1:] else: word = permulaan_result + word return word
[docs]def load( max_edit_distance_dictionary: int = 2, prefix_length: int = 7, term_index: int = 0, count_index: int = 1, top_k: int = 10, **kwargs ): """ Load a symspell Spell Corrector for Malay. Returns ------- result: malaya.spelling_correction.symspell.Symspell class """ try: from symspellpy.symspellpy import SymSpell, Verbosity except BaseException: raise ModuleNotFoundError( 'symspellpy not installed. Please install it and try again.' ) path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs) sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) sym_spell.load_dictionary(path['model'], term_index, count_index) path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs) with open(path['model']) as fopen: corpus = json.load(fopen) return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)