Source code for malaya.preprocessing

import re
import json
import ftfy
from functools import lru_cache
from malaya.dictionary import is_english
from malaya.text.rules import rules_normalizer
from malaya.text.regex import _expressions
from malaya.text.normalization import unpack_english_contractions
from malaya.text.function import case_of
from malaya.tokenizer import Tokenizer
from malaya.function import validator
from typing import List, Callable
import logging

logger = logging.getLogger(__name__)

_annotate = [
    'hashtag',
    'allcaps',
    'elongated',
    'repeated',
    'emphasis',
    'censored',
]

_normalize = list(_expressions.keys())

rejected = ['<', '</', '>', '>']


def get_normalize():
    return _normalize


def get_annotate():
    return _annotate


def _get_expression_dict():
    return {
        k.lower(): re.compile(v) for k, v in _expressions.items()
    }
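# A minimal usage sketch (not part of the library; it assumes `_expressions`
# contains a 'url' pattern, as the default `normalize` list further down implies):
#
#     >>> regexes = _get_expression_dict()
#     >>> regexes['url'].sub(' <url> ', 'baca artikel di https://example.com sekarang')
#     'baca artikel di  <url>  sekarang'
#
# `Preprocessing.process` performs exactly this kind of substitution for every
# entry in its `normalize` list.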


class Preprocessing:
    def __init__(
        self,
        normalize=[
            'url',
            'email',
            'percent',
            'money',
            'phone',
            'user',
            'time',
            'date',
            'number',
        ],
        annotate=[
            'allcaps',
            'elongated',
            'repeated',
            'emphasis',
            'censored',
            'hashtag',
        ],
        lowercase=True,
        fix_unidecode=True,
        expand_english_contractions=True,
        segmenter=None,
        demoji=None,
    ):
        self._fix_unidecode = fix_unidecode
        self._normalize = normalize
        self._annotate = annotate
        self._regexes = _get_expression_dict()
        self._tokenizer = Tokenizer(lowercase=lowercase).tokenize
        self._expand_contractions = expand_english_contractions
        self._segmenter = segmenter
        # hashtags are only expanded into words when a segmenter is supplied
        if self._segmenter:
            self._expand_hashtags = True
        else:
            self._expand_hashtags = False
        self._demoji = demoji

    def _add_special_tag(self, m, tag, mode='single'):
        if isinstance(m, str):
            text = m
        else:
            text = m.group()
        if mode == 'single':
            return ' {} <{}> '.format(text, tag)
        elif mode == 'wrap':
            return ' '.join([' <{}> {} </{}> '.format(tag, text, tag)]) + ' '
        elif mode == 'every':
            tokens = text.split()
            processed = ' '.join([' {} <{}> '.format(t, tag) for t in tokens])
            return ' ' + processed + ' '

    @lru_cache(maxsize=65536)
    def _handle_hashtag_match(self, m):
        expanded = m.group()
        if self._expand_hashtags:
            expanded = self._segmenter(expanded[1:])
            expanded = ' '.join(expanded.split('-'))
            expanded = ' '.join(expanded.split('_'))
        if 'hashtag' in self._annotate:
            expanded = self._add_special_tag(expanded, 'hashtag', mode='wrap')
        return expanded

    @lru_cache(maxsize=65536)
    def _handle_repeated_puncts(self, m):
        text = m.group()
        text = ''.join(sorted(set(text), reverse=True))
        if 'repeated' in self._annotate:
            text = self._add_special_tag(text, 'repeated', mode='wrap')
        return text

    @lru_cache(maxsize=65536)
    def _handle_generic_match(self, m, tag, mode='wrap'):
        text = m.group()
        text = self._add_special_tag(text, tag, mode=mode)
        return text

    def _handle_elongated_match(self, m):
        text = m.group()
        if 'elongated' in self._annotate:
            text = self._add_special_tag(text, 'elongated', mode='wrap')
        return text

    @lru_cache(maxsize=65536)
    def _handle_emphasis_match(self, m):
        text = m.group().replace('*', '')
        if 'emphasis' in self._annotate:
            text = self._add_special_tag(text, 'emphasis', mode='wrap')
        return text

    @lru_cache(maxsize=65536)
    def _handle_emphasis_emoji(self, m):
        text = m.group().replace('*', '')
        text = self._add_special_tag(text, 'emoji', mode='wrap')
        return text

    def _dict_replace(self, wordlist, _dict):
        return [_dict.get(w, w) for w in wordlist]

    @staticmethod
    def text(wordlist):
        # drop <allcaps> tags that appear inside a <hashtag> ... </hashtag> span
        in_hashtag = False
        _words = []
        for word in wordlist:
            if word == '<hashtag>':
                in_hashtag = True
            elif word == '</hashtag>':
                in_hashtag = False
            elif word in {'<allcaps>', '</allcaps>'} and in_hashtag:
                continue
            _words.append(word)
        return _words

    def process(self, text):
        logger.debug(f'early process: {text}')
        # collapse repeated spaces and optionally fix mojibake
        text = re.sub(r' +', ' ', text)
        if self._fix_unidecode:
            text = ftfy.fix_text(text)
        # replace each normalized entity with its <tag> placeholder
        for item in self._normalize:
            text = self._regexes[item].sub(
                lambda m: ' ' + '<' + item + '>' + ' ', text
            )
        text = self._regexes['hashtag'].sub(
            lambda w: self._handle_hashtag_match(w), text
        )
        if 'allcaps' in self._annotate:
            text = self._regexes['allcaps'].sub(
                lambda w: self._handle_generic_match(
                    w, 'allcaps', mode='wrap'
                ),
                text,
            )
        if 'repeated' in self._annotate:
            text = self._regexes['repeat_puncts'].sub(
                lambda w: self._handle_repeated_puncts(w), text
            )
        if 'emphasis' in self._annotate:
            text = self._regexes['emphasis'].sub(
                lambda w: self._handle_emphasis_match(w), text
            )
        if 'censored' in self._annotate:
            text = self._regexes['censored'].sub(
                lambda w: self._handle_generic_match(w, 'censored'), text
            )
        if self._demoji is not None:
            text = self._demoji._compiled.sub(
                lambda w: self._handle_generic_match(w, 'emoji'), text
            )
        if self._expand_contractions:
            text = unpack_english_contractions(text)
        logger.debug(f'before self._tokenizer: {text}')
        text = re.sub(r' +', ' ', text)
        text = self.text(text.split())
        text = ' '.join(text)
        text = self._tokenizer(text)
        logger.debug(f'after self._tokenizer: {text}')
        logger.debug(f'before rules_normalizer: {text}')
        text = self._dict_replace(text, rules_normalizer)
        logger.debug(f'after rules_normalizer: {text}')
        text = [w for w in text if len(w) > 0]
        return text
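# Usage sketch for the class above (illustrative only; the exact output tokens
# depend on `_expressions`, `rules_normalizer` and the Tokenizer):
#
#     >>> p = Preprocessing()
#     >>> tokens = p.process('BEST GILA #mondayblues!!!')
#
# The all-caps words are wrapped in <allcaps> ... </allcaps>, the hashtag in
# <hashtag> ... </hashtag> (and segmented into 'monday blues' only if a
# `segmenter` is supplied), and '!!!' is collapsed inside <repeated> ... </repeated>.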
class Demoji:
    def __init__(self, dictionary):
        self._dictionary = dictionary
        escp = (re.escape(c) for c in sorted(self._dictionary, key=len, reverse=True))
        self._compiled = re.compile(r'|'.join(escp))
    def demoji(self, string: str):
        """
        Find emojis with string representation, 🔥 -> emoji api.

        Parameters
        ----------
        string: str

        Returns
        -------
        result: Dict[str, str]
        """
        results = self._compiled.findall(string)
        return {r: self._dictionary[r]['ms'] for r in results}
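# Usage sketch (the dictionary entries here are made-up examples; the real
# dictionary is downloaded by `demoji()` below and maps each emoji to a dict
# with an 'ms' Malay description):
#
#     >>> d = Demoji(dictionary={'🔥': {'ms': 'api'}, '😀': {'ms': 'muka senyum'}})
#     >>> d.demoji('panas 🔥 sangat 🔥')
#     {'🔥': 'api'}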
def preprocessing(
    normalize: List[str] = [
        'url',
        'email',
        'percent',
        'money',
        'phone',
        'user',
        'time',
        'date',
        'number',
    ],
    annotate: List[str] = [
        'allcaps',
        'elongated',
        'repeated',
        'emphasis',
        'censored',
        'hashtag',
    ],
    lowercase: bool = True,
    fix_unidecode: bool = True,
    expand_english_contractions: bool = True,
    segmenter: Callable = None,
    demoji: Callable = None,
    **kwargs,
):
    """
    Load Preprocessing class.

    Parameters
    ----------
    normalize: List[str], optional (default=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'])
        normalizing tokens, check all supported normalizing at `malaya.preprocessing.get_normalize()`.
    annotate: List[str], optional (default=['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'])
        annotate tokens <open></open>, only accept ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'].
    lowercase: bool, optional (default=True)
    fix_unidecode: bool, optional (default=True)
        fix unidecode using `ftfy.fix_text`.
    expand_english_contractions: bool, optional (default=True)
        expand english contractions.
    segmenter: Callable, optional (default=None)
        function to segment a word. If provided, it will expand hashtags, #mondayblues == monday blues.
    demoji: object
        demoji object, must have a `demoji` method.

    Returns
    -------
    result: malaya.preprocessing.Preprocessing class
    """

    if any([e not in _normalize for e in normalize]):
        raise ValueError(
            'normalize element not able to recognize, supported normalization can check at get_normalize()'
        )
    if any([e not in _annotate for e in annotate]):
        raise ValueError(f'annotate only accept {str(_annotate)}')
    validator.validate_object_methods(demoji, ['demoji'], 'demoji')

    return Preprocessing(
        normalize=normalize,
        annotate=annotate,
        lowercase=lowercase,
        fix_unidecode=fix_unidecode,
        expand_english_contractions=expand_english_contractions,
        segmenter=segmenter,
        demoji=demoji,
    )
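# Usage sketch via the factory above (illustrative only; the exact token output
# depends on the loaded regexes and normalizer rules):
#
#     >>> p = preprocessing(annotate=['hashtag', 'allcaps'])
#     >>> p.process('#tvseries memang BEST')
#
# Unsupported keys fail fast, e.g. preprocessing(normalize=['unknown'])
# raises ValueError.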
def demoji():
    """
    Download latest emoji malay description from
    https://github.com/huseinzol05/malay-dataset/tree/master/dictionary/emoji

    Returns
    -------
    result: malaya.preprocessing.Demoji class
    """
    try:
        import requests
    except BaseException:
        raise ModuleNotFoundError(
            'requests not installed. Please install it by `pip3 install requests` and try again.'
        )
    r = requests.get(
        'https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/dictionary/emoji/demoji.json'
    ).json()
    return Demoji(dictionary=r)
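# Usage sketch combining both loaders (requires network access to fetch the
# emoji dictionary; output shown only in outline):
#
#     >>> d = demoji()
#     >>> p = preprocessing(demoji=d)
#     >>> p.process('gembira sangat 😂')
#
# Emojis matched by the downloaded dictionary are wrapped in <emoji> ... </emoji>
# tags before tokenization.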