import re
from unidecode import unidecode
from malaya.text.tatabahasa import permulaan, hujung
from malaya.text.rules import rules_normalizer
from malaya.function import (
    check_file,
    load_graph,
    generate_session,
    nodes_session,
)
from malaya.text.function import pad_sentence_batch, case_of
from malaya.text.regex import _expressions, _money, _date
from malaya.model.abstract import Abstract
from malaya.preprocessing import Tokenizer
from malaya.text.bpe import YTTMEncoder
from malaya.path import STEMMER_VOCAB
from herpetologist import check_type


def _classification_textcleaning_stemmer(string, stemmer):
    """Clean a string for classification: drop hashtag, mention and URL
    tokens, normalize words using `rules_normalizer`, then stem every token."""
    string = re.sub(
        r'http\S+|www\.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string.lower()).strip()
    string = [rules_normalizer.get(w, w) for w in string.split()]
    string = [(stemmer.stem(word), word) for word in string]
    # Keep only stems longer than one character.
    return ' '.join([word[0] for word in string if len(word[0]) > 1])
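
# A minimal usage sketch of the helper above (illustrative only; assumes
# PySastrawi is installed and that its stemmer is what the caller passes in):
#
#     from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
#     stemmer = StemmerFactory().create_stemmer()
#     _classification_textcleaning_stemmer(
#         'Saya suka #makan di http://example.com', stemmer
#     )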


class Sastrawi:
    def __init__(self, factory):
        self.sastrawi_stemmer = factory.create_stemmer()

    @check_type
    def stem(self, string: str):
        return self.sastrawi_stemmer.stem(string)


class Naive:
    def __init__(self, tokenizer):
        self._tokenizer = tokenizer

    def stem_word(self, word):
        # Strip the longest matching suffix entry from `hujung`.
        hujung_result = [v for k, v in hujung.items() if word.endswith(k)]
        if len(hujung_result):
            hujung_result = max(hujung_result, key=len)
            if len(hujung_result):
                word = word[: -len(hujung_result)]
        # Strip the longest matching prefix entry from `permulaan`.
        permulaan_result = [
            v for k, v in permulaan.items() if word.startswith(k)
        ]
        if len(permulaan_result):
            permulaan_result = max(permulaan_result, key=len)
            if len(permulaan_result):
                word = word[len(permulaan_result):]
        return word

    @check_type
    def stem(self, string: str):
        result = []
        tokenized = self._tokenizer(string)
        for word in tokenized:
            if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
                result.append(word)
            elif (
                re.findall(_money, word.lower())
                or re.findall(_date, word.lower())
                or re.findall(_expressions['time'], word.lower())
                or re.findall(_expressions['hashtag'], word.lower())
                or re.findall(_expressions['url'], word.lower())
                or re.findall(_expressions['user'], word.lower())
            ):
                # Leave money, date, time, hashtag, URL and user tokens as-is.
                result.append(word)
            else:
                result.append(self.stem_word(word))
        return ' '.join(result)


class DeepStemmer(Abstract):
    def __init__(
        self, input_nodes, output_nodes, sess, bpe, tokenizer
    ):
        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._sess = sess
        self._bpe = bpe
        self._tokenizer = tokenizer

    @check_type
    def stem(self, string: str, beam_search: bool = False):
        """
        Stem a string; this also includes lemmatization.

        Parameters
        ----------
        string : str
        beam_search : bool, (optional=False)
            If True, use beam search decoder, else use greedy decoder.

        Returns
        -------
        result: str
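
        Examples
        --------
        A minimal sketch (assumes the model was loaded with `deep_model()`;
        the call is illustrative, not a recorded run)::

            model = deep_model()
            model.stem('kebanyakannya', beam_search=True)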
"""
tokenized = self._tokenizer(string)
result, batch, actual, mapping = [], [], [], {}
for no, word in enumerate(tokenized):
if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
result.append(word)
elif (
re.findall(_money, word.lower())
or re.findall(_date, word.lower())
or re.findall(_expressions['time'], word.lower())
or re.findall(_expressions['hashtag'], word.lower())
or re.findall(_expressions['url'], word.lower())
or re.findall(_expressions['user'], word.lower())
):
result.append(word)
else:
mapping[len(batch)] = no
result.append('REPLACE-ME')
actual.append(word)
batch.append(word.lower())
if len(batch):
batch = self._bpe.bpe.encode(batch, output_type=self._bpe.mode)
batch = [i + [1] for i in batch]
batch = pad_sentence_batch(batch, 0)[0]
if beam_search:
output = 'beam'
else:
output = 'greedy'
r = self._execute(
inputs=[batch],
input_labels=['Placeholder'],
output_labels=[output],
)
output = r[output].tolist()
for no, o in enumerate(output):
predicted = list(dict.fromkeys(o))
predicted = (
self._bpe.bpe.decode(predicted)[0]
.replace('<EOS>', '')
.replace('<PAD>', '')
)
predicted = case_of(actual[no])(predicted)
result[mapping[no]] = predicted
return ' '.join(result)


@check_type
def naive():
    """
    Load a naive stemmer that strips known affixes, matching prefixes with
    `startswith` and suffixes with `endswith`, and uses regex patterns to
    leave special tokens (money, dates, URLs, hashtags, users) untouched.

    Returns
    -------
    result : malaya.stem.Naive class
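
    Examples
    --------
    A minimal sketch (the sentence is illustrative; actual output depends on
    the affix tables in `malaya.text.tatabahasa`)::

        stemmer = naive()
        stemmer.stem('makanan kesukaannya')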
"""
tokenizer = Tokenizer().tokenize
return Naive(tokenizer=tokenizer)


@check_type
def sastrawi():
    """
    Load a stemming model that wraps Sastrawi; this also includes
    lemmatization.

    Returns
    -------
    result: malaya.stem.Sastrawi class
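
    Examples
    --------
    A minimal sketch (requires the PySastrawi package to be installed)::

        stemmer = sastrawi()
        stemmer.stem('menarik')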
"""
try:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
except BaseException:
raise ModuleNotFoundError(
'PySastrawi not installed. Please install it by `pip install PySastrawi` and try again.'
)
return Sastrawi(StemmerFactory())


@check_type
def deep_model(quantized: bool = False, **kwargs):
    """
    Load an LSTM + Bahdanau attention stemming model; this also includes
    lemmatization.
    Original size 41.6MB, quantized size 10.6MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        If True, will load the 8-bit quantized model.
        The quantized model is not necessarily faster; it depends entirely
        on the machine.

    Returns
    -------
    result: malaya.stem.DeepStemmer class
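
    Examples
    --------
    A minimal sketch (downloads pretrained weights on first use; pass
    `quantized=True` for the smaller checkpoint)::

        model = deep_model()
        model.stem('perjalanan', beam_search=False)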
"""
path = check_file(
file='lstm-bahdanau',
module='stem',
keys={'model': 'model.pb', 'vocab': STEMMER_VOCAB},
quantized=quantized,
**kwargs,
)
g = load_graph(path['model'], **kwargs)
inputs = ['Placeholder']
outputs = []
bpe = YTTMEncoder(vocab_file=path['vocab'], id_mode=True)
input_nodes, output_nodes = nodes_session(
g,
inputs,
outputs,
extra={
'greedy': 'import/decode_1/greedy:0',
'beam': 'import/decode_2/beam:0',
},
)
tokenizer = Tokenizer().tokenize
return DeepStemmer(
input_nodes=input_nodes,
output_nodes=output_nodes,
sess=generate_session(graph=g, **kwargs),
bpe=bpe,
tokenizer=tokenizer,
)