from malaya.text.regex import _expressions
from malaya.text.function import split_into_sentences
import re
import html
class Tokenizer:
    """Regex-based word tokenizer assembled from the patterns in `_expressions`."""

    # Ordered build recipe for the combined regex. Each row is
    # (kwarg name, default, regex keys, wrap in a non-capturing group).
    # A kwarg name of None marks patterns that are always included.
    # The order is significant: the patterns are joined with `|`, and regex
    # alternation tries alternatives left to right.
    _PIPELINE_SPEC = (
        ('title', True, ('title',), False),
        ('parliament', True, ('parliament',), False),
        ('urls', True, ('url',), False),
        ('urls_improved', True, ('url_v2', 'url_dperini'), False),
        ('tags', True, ('tag',), False),
        ('emails', True, ('email',), True),
        ('users', True, ('user',), True),
        ('hashtags', True, ('hashtag',), True),
        ('cashtags', True, ('cashtag',), True),
        ('phones', True, ('phone',), True),
        ('percents', True, ('percent',), True),
        ('money', True, ('money',), True),
        ('date', True, ('date',), True),
        ('time', True, ('time',), True),
        ('time_pukul', True, ('time_pukul',), True),
        ('acronyms', True, ('acronym',), True),
        ('emoticons', True, ('ltr_face', 'rtl_face'), False),
        ('censored', True, ('censored',), True),
        ('emphasis', True, ('emphasis',), True),
        ('emoticons', True, ('rest_emoticons',), True),
        ('temperature', True, ('temperature',), True),
        ('distance', True, ('distance',), True),
        ('volume', True, ('volume',), True),
        ('duration', True, ('duration',), True),
        ('weight', True, ('weight',), True),
        ('ic', True, ('ic',), True),
        ('numbers_with_shortform', True, ('number_with_shortform',), False),
        ('numbers', True, ('number',), False),
        ('emojis', False, ('emoji',), False),
        ('hypen', True, ('hypen',), False),
        (None, True, ('apostrophe', 'word'), False),
        ('emoticons', True, ('eastern_emoticons',), True),
    )

    def __init__(self, **kwargs):
        """
        Load Tokenizer object.
        Check supported regex pattern at
        https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85

        Parameters
        ----------
        emojis: bool, optional (default=False)
            True to keep emojis.
        urls: bool, optional (default=True)
            True to keep urls.
        urls_improved: bool, optional (default=True)
            True to keep urls, better version.
        tags: bool, optional (default=True)
            True to keep tags: <tag>.
        emails: bool, optional (default=True)
            True to keep emails.
        users: bool, optional (default=True)
            True to keep users handles: @cbaziotis.
        hashtags: bool, optional (default=True)
            True to keep hashtags.
        cashtags: bool, optional (default=True)
            True to keep cashtags.
        phones: bool, optional (default=True)
            True to keep phones.
        percents: bool, optional (default=True)
            True to keep percents.
        money: bool, optional (default=True)
            True to keep money expressions.
        date: bool, optional (default=True)
            True to keep date expressions.
        time: bool, optional (default=True)
            True to keep time expressions.
        time_pukul: bool, optional (default=True)
            True to keep time `pukul` expressions.
        acronyms: bool, optional (default=True)
            True to keep acronyms.
        emoticons: bool, optional (default=True)
            True to keep emoticons.
        censored: bool, optional (default=True)
            True to keep censored words: f**k.
        emphasis: bool, optional (default=True)
            True to keep words with emphasis: *very* good.
        numbers: bool, optional (default=True)
            True to keep numbers.
        numbers_with_shortform: bool, optional (default=True)
            True to keep numbers with shortform.
        temperature: bool, optional (default=True)
            True to keep temperatures.
        distance: bool, optional (default=True)
            True to keep distances.
        volume: bool, optional (default=True)
            True to keep volumes.
        duration: bool, optional (default=True)
            True to keep durations.
        weight: bool, optional (default=True)
            True to keep weights.
        hypen: bool, optional (default=True)
            True to keep hyphens.
        ic: bool, optional (default=True)
            True to keep Malaysian IC.
        title: bool, optional (default=True)
            True to keep title with dot, Dr. ayam -> ['Dr.', 'ayam']
        parliament: bool, optional (default=True)
            True to keep P.123 / D.123
        """
        self.regexes = _expressions

        pipeline = []
        for flag, default, keys, wrap in self._PIPELINE_SPEC:
            if flag is not None and not kwargs.get(flag, default):
                continue
            # Patterns are looked up only for enabled options.
            for key in keys:
                pattern = self.regexes[key]
                if wrap:
                    pattern = self.wrap_non_matching(pattern)
                pipeline.append(pattern)

        # CATCH ALL remaining terms: any single non-whitespace character.
        pipeline.append(r'(?:\S)')

        # The outer capturing group makes the full match the first group
        # returned by `findall`, whatever groups the sub-patterns define.
        self.tok = re.compile(r'({})'.format('|'.join(pipeline)))

    @staticmethod
    def wrap_non_matching(exp):
        """Return `exp` wrapped in a non-capturing group, `(?:exp)`."""
        return '(?:{})'.format(exp)

    def tokenize(self, string: str, lowercase: bool = False):
        """
        Tokenize string into words.

        Parameters
        ----------
        string : str
        lowercase: bool, optional (default=False)
            True to lowercase every returned token.

        Returns
        -------
        result: List[str]
        """
        # Decode HTML entities (e.g. `&amp;`) before matching.
        escaped = html.unescape(string)

        tokens = []
        for match in self.tok.findall(escaped):
            # `findall` yields tuples when the pattern has several groups;
            # the first element is the outer group, i.e. the whole match.
            token = match[0] if isinstance(match, tuple) else match
            if re.search(r'\.{2,}', token):
                # Break tokens containing a run of dots apart on '.'.
                # Each empty piece from the split becomes a '.' token.
                # NOTE(review): the number of '.' tokens emitted need not
                # equal the number of dots (e.g. 'a..b' yields a single '.');
                # kept as-is to preserve existing output.
                tokens.extend(piece or '.' for piece in token.split('.'))
            else:
                tokens.append(token)

        # Collapse runs of spaces inside a token and trim its edges.
        tokens = [re.sub(r'[ ]+', ' ', token).strip() for token in tokens]
        if lowercase:
            tokens = [token.lower() for token in tokens]
        return tokens
class SentenceTokenizer:
    """Thin wrapper that splits text with `split_into_sentences`."""

    def __init__(self):
        pass

    def tokenize(self, string, minimum_length=5):
        """
        Tokenize string into multiple strings.

        Parameters
        ----------
        string : str
        minimum_length: int, optional (default=5)
            Minimum length, in characters, for a piece of text to count as
            a string. Passed unchanged to `split_into_sentences`.

        Returns
        -------
        result: List[str]
        """
        return split_into_sentences(string, minimum_length=minimum_length)