# Source code for malaya.augmentation.rules

from malaya.path import PATH_AUGMENTATION, S3_PATH_AUGMENTATION
from malaya.augmentation.base import _make_upper
from malaya.text.tatabahasa import consonants, vowels
from malaya.text.function import case_of
from malaya.function import check_file
from collections import defaultdict
import random
import json
from typing import Dict, List

# Lazily-built synonym lookup (word -> list of synonyms). It stays None until
# `synonym()` is called for the first time, which loads the synonym files and
# stores the resulting mapping here so later calls can reuse it.
_synonym_dict = None


def replace_synonym(string, threshold):
    """
    Randomly swap tokens for one of their synonyms.

    The lookup table is the module-level `_synonym_dict`, which must already
    have been populated (`synonym()` does this on its first call).

    Parameters
    ----------
    string: List[str]
        tokenized words. The list is NOT modified; a new list is returned, so
        the same tokens can safely be augmented several times.
    threshold: float
        a token that has synonyms is replaced when `random.random()` is
        greater than this value.

    Returns
    -------
    result: List[str]
    """
    # The membership test comes first so that `random.random()` is only drawn
    # for tokens that actually have synonyms.
    return [
        random.choice(_synonym_dict[word])
        if word in _synonym_dict and random.random() > threshold
        else word
        for word in string
    ]


def synonym(
    string: str,
    threshold: float = 0.5,
    top_n=5,
    **kwargs
):
    """
    augmenting a string using synonym, https://github.com/huseinzol05/Malaya-Dataset#90k-synonym

    Parameters
    ----------
    string: str
        this string input is assumed to be properly tokenized and cleaned.
    threshold: float, optional (default=0.5)
        random selection for a word, forwarded to `replace_synonym`.
    top_n: int, (default=5)
        number of augmented strings to generate. Length of the returned
        result is `top_n`.
    **kwargs:
        forwarded to `check_file` when the synonym files are first loaded.

    Returns
    -------
    result: List[str]
    """
    global _synonym_dict

    # Build the synonym table once and cache it at module level.
    if _synonym_dict is None:
        path = check_file(
            PATH_AUGMENTATION['synonym'], S3_PATH_AUGMENTATION['synonym'], **kwargs
        )
        files = list(path.values())
        synonyms = defaultdict(list)
        for file in files:
            with open(file) as fopen:
                data = json.load(fopen)

            for i in data:
                if not len(i[1]):
                    continue
                # Register the relation in both directions.
                synonyms[i[0]].extend(i[1])
                for r in i[1]:
                    synonyms[r].append(i[0])
        for k, v in synonyms.items():
            synonyms[k] = list(set(v))
        _synonym_dict = synonyms

    original_string = string
    tokens = string.split()

    augmented = []
    for _ in range(top_n):
        # Hand over a fresh copy each round: `replace_synonym` may edit the
        # list it receives, and every augmentation must start from the
        # original tokens rather than from the previous round's output.
        string_ = replace_synonym(list(tokens), threshold)
        # NOTE(review): `original_string` is a str, so this join puts a space
        # between every character rather than every word. Kept as-is because
        # `_make_upper` is defined elsewhere - confirm what it expects.
        augmented.append(
            _make_upper(' '.join(string_), ' '.join(original_string))
        )
    return augmented
def replace_similar_consonants(
    word: str,
    threshold: float = 0.5,
    replace_consonants: Dict[str, List[str]] = {
        'n': ['m'],
        'r': ['t', 'q'],
        'g': ['f', 'h'],
        'j': ['k'],
        'k': ['l'],
        'd': ['s', 'f'],
        'b': ['n'],
        'f': ['p'],
    }
):
    """
    Naively replace consonants with another consonants to simulate typo or slang
    if after consonants is a vowel.

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)
        a candidate consonant is replaced when `random.random()` is greater
        than or equal to this value.
    replace_consonants: Dict[str, List[str]], optional
        consonant -> possible replacements. Consonants missing from the
        mapping are left unchanged. The default mapping is only read, never
        modified.

    Returns
    -------
    result: str
    """
    results = list(word)

    # Every consonant (except the last character) that is directly followed by
    # a vowel is a replacement candidate.
    for no, c in enumerate(results[:-1]):
        if random.random() >= threshold and c in consonants and results[no + 1] in vowels:
            results[no] = random.choice(replace_consonants.get(c, [c]))

    # The last character is a candidate when the word ends with
    # consonant-vowel-consonant. That pattern needs three characters, so
    # shorter (or empty) words skip this step instead of raising IndexError.
    if (
        len(results) >= 3
        and random.random() >= threshold
        and results[-1] in consonants
        and results[-2] in vowels
        and results[-3] in consonants
    ):
        results[-1] = random.choice(replace_consonants.get(results[-1], [results[-1]]))

    return ''.join(results)
def replace_similar_vowels(
    word: str,
    threshold: float = 0.5,
    replace_vowels: Dict[str, List[str]] = {
        'u': ['o'],
        'a': ['o'],
        'i': ['o'],
        'o': ['u'],
    }
):
    """
    Naively replace vowels with another vowels to simulate typo or slang
    if after vowels is a consonant.

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)
        a candidate vowel is replaced when `random.random()` is greater than
        or equal to this value.
    replace_vowels: Dict[str, List[str]], optional
        vowel -> possible replacements. Vowels missing from the mapping are
        left unchanged. The default mapping is only read, never modified.

    Returns
    -------
    result: str
    """
    results = list(word)

    # Every vowel (except the last character) that is directly followed by a
    # consonant is a replacement candidate.
    for no, c in enumerate(results[:-1]):
        if random.random() >= threshold and c in vowels and results[no + 1] in consonants:
            results[no] = random.choice(replace_vowels.get(c, [c]))

    # The last character is a candidate when the word ends with
    # vowel-consonant-vowel. That pattern needs three characters, so shorter
    # (or empty) words skip this step instead of raising IndexError.
    if (
        len(results) >= 3
        and random.random() >= threshold
        and results[-1] in vowels
        and results[-2] in consonants
        and results[-3] in vowels
    ):
        results[-1] = random.choice(replace_vowels.get(results[-1], [results[-1]]))

    return ''.join(results)
def socialmedia_form(word: str):
    """
    augmenting a word into socialmedia form.

    Each rule below that matches the lowercased word contributes one
    candidate spelling; candidates are de-duplicated in rule order and then
    passed through `case_of(word)` before being returned.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: List[str]

    Raises
    ------
    ValueError
        if `word` is empty.
    """
    word_temp = word
    word = word.lower()

    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    results = []

    if len(word) > 1:
        # consonant + trailing 'a' -> trailing 'e'
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        # 'f...r' -> 'p...r'
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        # consonant + trailing vowel -> append 'k'
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        # vowel + trailing 'h' -> drop the 'h'
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        # consonant + trailing 'ar' -> trailing 'o'
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        # leading 'h' + vowel + consonant -> drop the 'h'
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        # consonant + trailing 'ng' -> trailing 'g'
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        # 'ng' at positions 1-2 -> drop the 'n'.
        # (This used to reference an undefined name `x` and raised NameError.)
        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])

    # De-duplicate while keeping rule order so the output is deterministic.
    results = list(dict.fromkeys(results))
    results = [case_of(word_temp)(r) for r in results]
    return results
def vowel_alternate(word: str, threshold: float = 0.5):
    """
    augmenting a word into vowel alternate.

    Scans the lowercased word left to right and randomly drops a vowel that
    sits between two consonants, then applies `case_of(word)` to the result.
    Possible outcomes:

    vowel_alternate('singapore')
    -> sngpore

    vowel_alternate('kampung')
    -> kmpng

    vowel_alternate('ayam')
    -> aym

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)
        a vowel between two consonants is dropped when `random.random()` is
        greater than or equal to this value.

    Returns
    -------
    result: str

    Raises
    ------
    ValueError
        if `word` is empty.
    """
    chars = list(word.lower())
    if not chars:
        raise ValueError('word is too short to augment shortform.')

    pos = 0
    # The length is re-read every pass because dropping a vowel shortens it.
    while pos + 2 < len(chars):
        first, middle, last = chars[pos: pos + 3]
        is_cvc = first in consonants and middle in vowels and last in consonants
        # The random draw only happens for consonant-vowel-consonant windows.
        if is_cvc and random.random() >= threshold:
            del chars[pos + 1]
        pos += 1

    return case_of(word)(''.join(chars))
def kelantanese_form(word: str):
    """
    augmenting a word into kelantanese form.

    `ayam` -> `ayom`
    `otak` -> `otok`
    `kakak` -> `kakok`

    `barang` -> `bare`
    `kembang` -> `kembe`
    `nyarang` -> `nyare`

    Each rule below that matches the lowercased word contributes one
    candidate; candidates are de-duplicated in rule order and then passed
    through `case_of(word)` before being returned.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: List[str]

    Raises
    ------
    ValueError
        if `word` is empty.
    """
    word_temp = word
    word = word.lower()

    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    results = []

    if len(word) == 3:
        # consonant + 'a' + consonant -> consonant + 'o' + consonant
        if word[0] in consonants and word[1] in 'a' and word[2] in consonants:
            results.append(word[0] + 'o' + word[2])

    if len(word) >= 4:
        # ('a'|'e') + consonant + trailing ('a'|'o') -> trailing 'o'
        if word[-1] in 'ao' and word[-2] in consonants and word[-3] in 'ae':
            results.append(word[:-1] + 'o')

        # ('a'|'o'|'u') + consonant + ('a'|'u') + consonant -> last vowel becomes 'o'
        if word[-1] in consonants and word[-2] in 'au' and word[-3] in consonants and word[-4] in 'aou':
            results.append(word[:-2] + 'o' + word[-1])

        # consonant + trailing 'ang' -> trailing 'e'
        if word[-3:] == 'ang' and word[-4] in consonants:
            results.append(word[:-3] + 'e')

        # consonant + trailing 'ar' -> trailing 'o'
        if word[-2:] == 'ar' and word[-3] in consonants:
            results.append(word[:-2] + 'o')

        # consonant + trailing 'an' -> trailing 'e'.
        # (This used to compare the single character word[-2] with 'an', so
        # the rule could never match.)
        if word[-2:] == 'an' and word[-3] in consonants:
            results.append(word[:-2] + 'e')

    # De-duplicate while keeping rule order so the output is deterministic.
    results = list(dict.fromkeys(results))
    results = [case_of(word_temp)(r) for r in results]
    return results