from malaya.path import PATH_AUGMENTATION, S3_PATH_AUGMENTATION
from malaya.augmentation.base import _make_upper
from malaya.text.tatabahasa import consonants, vowels
from malaya.text.function import case_of
from malaya.function import check_file
from collections import defaultdict
import random
import json
from typing import Dict, List
# Module-level cache for the synonym lookup table. It stays None until
# synonym() loads the synonym files on its first call and stores the
# resulting mapping (word -> list of alternative words) here.
_synonym_dict = None
def replace_synonym(string, threshold):
    """
    Randomly swap tokens for one of their synonyms, in place.

    Parameters
    ----------
    string: List[str]
        tokens to augment. The list is modified in place and also returned.
    threshold: float
        a token found in the module-level `_synonym_dict` is replaced only
        when a fresh `random.random()` draw is strictly greater than this.

    Returns
    -------
    result: List[str]
        the same list object that was passed in.
    """
    for position, token in enumerate(string):
        # Tokens without a dictionary entry never consume a random draw.
        if token not in _synonym_dict:
            continue
        if not (random.random() > threshold):
            continue
        string[position] = random.choice(_synonym_dict[token])
    return string
def synonym(
    string: str,
    threshold: float = 0.5,
    top_n=5,
    **kwargs
):
    """
    Augment a string by randomly replacing words with synonyms,
    https://github.com/huseinzol05/Malaya-Dataset#90k-synonym

    Parameters
    ----------
    string: str
        this string input assumed been properly tokenized and cleaned.
    threshold: float, optional (default=0.5)
        a word found in the synonym dictionary is replaced when a fresh
        `random.random()` draw is greater than this value.
    top_n: int, (default=5)
        number of augmented strings to generate.
    **kwargs
        forwarded to `check_file` the first time the synonym files are loaded.

    Returns
    -------
    result: List[str]
        `top_n` augmented strings, each derived independently from `string`.
    """
    global _synonym_dict

    if _synonym_dict is None:
        # First call: load the synonym files and cache the lookup table at
        # module level so later calls skip the file I/O.
        path = check_file(
            PATH_AUGMENTATION['synonym'],
            S3_PATH_AUGMENTATION['synonym'],
            **kwargs
        )
        synonyms = defaultdict(list)
        for file in path.values():
            # JSON is UTF-8; do not depend on the platform default encoding.
            with open(file, encoding='utf-8') as fopen:
                data = json.load(fopen)
            for entry in data:
                word, alternatives = entry[0], entry[1]
                if not len(alternatives):
                    continue
                # Register the relation in both directions.
                synonyms[word].extend(alternatives)
                for alternative in alternatives:
                    synonyms[alternative].append(word)

        # Drop duplicate alternatives collected from the two directions.
        for k, v in synonyms.items():
            synonyms[k] = list(set(v))
        _synonym_dict = synonyms

    tokens = string.split()
    # Word-level reference text for case restoration. Joining the raw `str`
    # with ' ' would instead put a space between every single character.
    original = ' '.join(tokens)

    augmented = []
    for _ in range(top_n):
        # replace_synonym mutates the list it receives, so give it a fresh
        # copy; otherwise every result would build on the previous one
        # instead of on the original tokens.
        replaced = replace_synonym(list(tokens), threshold)
        # NOTE(review): _make_upper is defined elsewhere; confirm it copes
        # with replacements that change the number of tokens.
        augmented.append(_make_upper(' '.join(replaced), original))
    return augmented
def replace_similar_consonants(
    word: str,
    threshold: float = 0.5,
    # Read-only lookup table: it is never mutated in this function, so
    # sharing the default instance across calls is safe.
    replace_consonants: Dict[str, List[str]] = {
        'n': ['m'],
        'r': ['t', 'q'],
        'g': ['f', 'h'],
        'j': ['k'],
        'k': ['l'],
        'd': ['s', 'f'],
        'b': ['n'],
        'f': ['p'],
    }
):
    """
    Naively replace consonants with another consonants to simulate typo or slang
    if after consonants is a vowel.

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)
        a candidate position is considered only when a fresh
        `random.random()` draw is greater than or equal to this value.
    replace_consonants: Dict[str, List[str]], optional
        maps a consonant to the characters it may be replaced with;
        consonants missing from the mapping are left unchanged.

    Returns
    -------
    result: str
    """
    results = list(word)

    # Every character except the last: replace a consonant that is directly
    # followed by a vowel.
    for no, c in enumerate(results[:-1]):
        if random.random() >= threshold and c in consonants and results[no + 1] in vowels:
            results[no] = random.choice(replace_consonants.get(c, [c]))

    # Last character: replace it only when the word ends with
    # consonant-vowel-consonant. The length guard comes first so words
    # shorter than three characters cannot raise IndexError.
    if (
        len(results) >= 3
        and random.random() >= threshold
        and results[-1] in consonants
        and results[-2] in vowels
        and results[-3] in consonants
    ):
        results[-1] = random.choice(replace_consonants.get(results[-1], [results[-1]]))

    return ''.join(results)
def replace_similar_vowels(
    word: str,
    threshold: float = 0.5,
    # Read-only lookup table: it is never mutated in this function, so
    # sharing the default instance across calls is safe.
    replace_vowels: Dict[str, List[str]] = {
        'u': ['o'],
        'a': ['o'],
        'i': ['o'],
        'o': ['u'],
    }
):
    """
    Naively replace vowels with another vowels to simulate typo or slang
    if after vowels is a consonant.

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)
        a candidate position is considered only when a fresh
        `random.random()` draw is greater than or equal to this value.
    replace_vowels: Dict[str, List[str]], optional
        maps a vowel to the characters it may be replaced with;
        vowels missing from the mapping are left unchanged.

    Returns
    -------
    result: str
    """
    results = list(word)

    # Every character except the last: replace a vowel that is directly
    # followed by a consonant.
    for no, c in enumerate(results[:-1]):
        if random.random() >= threshold and c in vowels and results[no + 1] in consonants:
            results[no] = random.choice(replace_vowels.get(c, [c]))

    # Last character: replace it only when the word ends with
    # vowel-consonant-vowel. The length guard comes first so words shorter
    # than three characters cannot raise IndexError.
    if (
        len(results) >= 3
        and random.random() >= threshold
        and results[-1] in vowels
        and results[-2] in consonants
        and results[-3] in vowels
    ):
        results[-1] = random.choice(replace_vowels.get(results[-1], [results[-1]]))

    return ''.join(results)
def vowel_alternate(word: str, threshold: float = 0.5):
    """
    Shorten a word by randomly dropping vowels that sit between two consonants.

    Possible outputs (the result depends on random draws):

    vowel_alternate('singapore')
    -> sngpore

    vowel_alternate('kampung')
    -> kmpng

    vowel_alternate('ayam')
    -> aym

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)
        a vowel between two consonants is dropped when a fresh
        `random.random()` draw is greater than or equal to this value.

    Returns
    -------
    result: str
        the shortened, lower-cased word passed through the callable returned
        by `case_of(word)` (defined elsewhere; presumably restores the
        casing of the input).

    Raises
    ------
    ValueError
        if `word` is empty.
    """
    lowered = word.lower()
    if len(lowered) == 0:
        raise ValueError('word is too short to augment shortform.')

    letters = list(lowered)
    position = 0
    # Slide a three-character window over the word. The length is re-read on
    # every pass because removing a vowel shortens the list.
    while position + 2 < len(letters):
        first, middle, last = letters[position: position + 3]
        is_cvc = first in consonants and middle in vowels and last in consonants
        # A random draw is consumed only for consonant-vowel-consonant windows.
        if is_cvc and random.random() >= threshold:
            del letters[position + 1]
        position += 1

    restore_case = case_of(word)
    return restore_case(''.join(letters))