# Source code for malaya.tokenizer

from malaya.text.regex import _expressions
from malaya.text.function import split_into_sentences
import re
import html


class Tokenizer:
    """
    Regex-based word tokenizer.

    The constructor joins the enabled patterns of ``_expressions`` into a
    single alternation, and ``tokenize`` applies that alternation with
    ``findall``.
    """

    # One row per pipeline step, in alternation order:
    # (option name, default, regex keys, wrap in a non-capturing group).
    # Earlier alternatives win over later ones, so the order of the rows is
    # part of the behaviour and must not be changed. An option name of None
    # marks a step that is always enabled. `emoticons` appears three times
    # on purpose: it contributes patterns at three different priorities.
    _PIPELINE = (
        ('title', True, ('title',), False),
        ('parliament', True, ('parliament',), False),
        ('urls', True, ('url',), False),
        ('urls_improved', True, ('url_v2', 'url_dperini'), False),
        ('tags', True, ('tag',), False),
        ('emails', True, ('email',), True),
        ('users', True, ('user',), True),
        ('hashtags', True, ('hashtag',), True),
        ('cashtags', True, ('cashtag',), True),
        ('phones', True, ('phone',), True),
        ('percents', True, ('percent',), True),
        ('money', True, ('money',), True),
        ('date', True, ('date',), True),
        ('time', True, ('time',), True),
        ('time_pukul', True, ('time_pukul',), True),
        ('acronyms', True, ('acronym',), True),
        ('emoticons', True, ('ltr_face', 'rtl_face'), False),
        ('censored', True, ('censored',), True),
        ('emphasis', True, ('emphasis',), True),
        ('emoticons', True, ('rest_emoticons',), True),
        ('temperature', True, ('temperature',), True),
        ('distance', True, ('distance',), True),
        ('volume', True, ('volume',), True),
        ('duration', True, ('duration',), True),
        ('weight', True, ('weight',), True),
        ('ic', True, ('ic',), True),
        ('numbers_with_shortform', True, ('number_with_shortform',), False),
        ('numbers', True, ('number',), False),
        ('emojis', False, ('emoji',), False),
        ('hypen', True, ('hypen',), False),
        (None, True, ('apostrophe', 'word'), False),
        ('emoticons', True, ('eastern_emoticons',), True),
    )

    # Last alternative: any single non-whitespace character, so that no
    # visible character is dropped by `findall`.
    _CATCH_ALL = '(?:\\S)'

    # Compiled once instead of once per token inside `tokenize`.
    _MULTI_DOT = re.compile(r'\.{2,}')
    _SPACES = re.compile(r'[ ]+')

    def __init__(self, **kwargs):
        """
        Load Tokenizer object.
        Check supported regex pattern at
        https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85

        Options are read from ``kwargs`` by name; names that are not listed
        below are ignored.

        Parameters
        ----------
        emojis: bool, optional (default=False)
            True to keep emojis.
        urls: bool, optional (default=True)
            True to keep urls.
        urls_improved: bool, optional (default=True)
            True to keep urls, better version.
        tags: bool, optional (default=True)
            True to keep tags: <tag>.
        emails: bool, optional (default=True)
            True to keep emails.
        users: bool, optional (default=True)
            True to keep users handles: @cbaziotis.
        hashtags: bool, optional (default=True)
            True to keep hashtags.
        cashtags: bool, optional (default=True)
            True to keep cashtags.
        phones: bool, optional (default=True)
            True to keep phones.
        percents: bool, optional (default=True)
            True to keep percents.
        money: bool, optional (default=True)
            True to keep money expressions.
        date: bool, optional (default=True)
            True to keep date expressions.
        time: bool, optional (default=True)
            True to keep time expressions.
        time_pukul: bool, optional (default=True)
            True to keep time `pukul` expressions.
        acronyms: bool, optional (default=True)
            True to keep acronyms.
        emoticons: bool, optional (default=True)
            True to keep emoticons.
        censored: bool, optional (default=True)
            True to keep censored words: f**k.
        emphasis: bool, optional (default=True)
            True to keep words with emphasis: *very* good.
        numbers: bool, optional (default=True)
            True to keep numbers.
        numbers_with_shortform: bool, optional (default=True)
            True to keep numbers with shortform.
        temperature: bool, optional (default=True)
            True to keep temperatures.
        distance: bool, optional (default=True)
            True to keep distances.
        volume: bool, optional (default=True)
            True to keep volumes.
        duration: bool, optional (default=True)
            True to keep durations.
        weight: bool, optional (default=True)
            True to keep weights.
        hypen: bool, optional (default=True)
            True to keep hypens.
        ic: bool, optional (default=True)
            True to keep Malaysian IC.
        title: bool, optional (default=True)
            True to keep title with dot, Dr. ayam -> ['Dr.', 'ayam']
        parliament: bool, optional (default=True)
            True to keep P.123 / D.123
        """
        self.regexes = _expressions

        pipeline = []
        for option, default, keys, wrapped in self._PIPELINE:
            # Only the truthiness of the option matters; a disabled step
            # never looks its patterns up in `self.regexes`.
            if option is not None and not kwargs.get(option, default):
                continue
            for key in keys:
                pattern = self.regexes[key]
                if wrapped:
                    pattern = self.wrap_non_matching(pattern)
                pipeline.append(pattern)

        pipeline.append(self._CATCH_ALL)

        # The outer capturing group holds the whole match, so it is always
        # the first element when `findall` returns tuples.
        self.tok = re.compile(r'({})'.format('|'.join(pipeline)))

    @staticmethod
    def wrap_non_matching(exp):
        """
        Wrap a regex in a non-capturing group.

        Parameters
        ----------
        exp : str

        Returns
        -------
        result: str
        """
        return '(?:{})'.format(exp)

    def tokenize(self, string: str, lowercase: bool = False):
        """
        Tokenize string into words.

        HTML entities are unescaped first. A token that contains a run of
        two or more dots is split on every '.', and each empty piece
        produced by that split is emitted as '.', e.g. 'a..b' gives
        ['a', '.', 'b']. Runs of spaces inside a token are collapsed to one
        space and the token is stripped.

        Parameters
        ----------
        string : str
        lowercase: bool, optional (default=False)
            True to lowercase every token.

        Returns
        -------
        result: List[str]
        """
        unescaped = html.unescape(string)

        tokens = []
        for match in self.tok.findall(unescaped):
            # `findall` yields tuples when the pattern has more than one
            # capturing group; element 0 is the outer, whole-match group.
            token = match[0] if isinstance(match, tuple) else match
            if self._MULTI_DOT.search(token):
                tokens.extend(
                    piece if piece else '.' for piece in token.split('.')
                )
            else:
                tokens.append(token)

        tokens = [self._SPACES.sub(' ', token).strip() for token in tokens]

        if lowercase:
            tokens = [token.lower() for token in tokens]

        return tokens


class SentenceTokenizer:
    """Sentence tokenizer that delegates to ``split_into_sentences``."""

    def __init__(self):
        """Load SentenceTokenizer object; it takes no options and keeps no state."""

    def tokenize(self, string: str, minimum_length: int = 5):
        """
        Tokenize string into multiple strings.

        Parameters
        ----------
        string : str
        minimum_length: int, optional (default=5)
            Minimum length, in characters, for a piece of text to be
            accepted as a sentence; forwarded unchanged to
            ``split_into_sentences``.

        Returns
        -------
        result: List[str]
        """
        return split_into_sentences(string, minimum_length=minimum_length)