Source code for malaya.cluster

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from malaya.text.function import (
    simple_textcleaning,
    split_into_sentences,
    get_stopwords,
)
from malaya.function import validator
from herpetologist import check_type
from typing import List, Tuple, Callable

import numpy as np
import re
import random

_accepted_pos = [
    'ADJ',
    'ADP',
    'ADV',
    'ADX',
    'CCONJ',
    'DET',
    'NOUN',
    'NUM',
    'PART',
    'PRON',
    'PROPN',
    'PUNCT',
    'SCONJ',
    'SYM',
    'VERB',
    'X',
]
_accepted_entities = [
    'OTHER',
    'law',
    'location',
    'organization',
    'person',
    'quantity',
    'time',
    'event',
    'X',
]


@check_type
def cluster_words(list_words: List[str], lowercase: bool = False):
    """
    Cluster similar words based on structure, e.g. ['mahathir mohamad', 'mahathir'] -> ['mahathir mohamad'].
    Complexity is O(n^2).

    Parameters
    ----------
    list_words : List[str]
    lowercase: bool, optional (default=False)
        if True, will group using lowercase but maintain the original form.

    Returns
    -------
    result: List[str]
    """

    dict_words = {}
    for word in list_words:
        found = False
        for key in dict_words.keys():
            if lowercase:
                check = [
                    word.lower() in inside.lower()
                    for inside in dict_words[key]
                ]
            else:
                check = [word in inside for inside in dict_words[key]]
            if word in key or any(check):
                dict_words[key].append(word)
                found = True
            if key in word:
                dict_words[key].append(word)
        if not found:
            dict_words[word] = [word]
    results = []
    for key, words in dict_words.items():
        results.append(max(list(set([key] + words)), key=len))
    return list(set(results))
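
A minimal usage sketch; the word list below is illustrative:

    words = ['mahathir mohamad', 'mahathir', 'najib razak', 'najib']
    cluster_words(words)
    # keeps the longest form of each group, e.g. ['mahathir mohamad', 'najib razak'] (order not guaranteed)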
@check_type
def cluster_pos(result: List[Tuple[str, str]]):
    """
    Cluster similar POS tags.

    Parameters
    ----------
    result: List[Tuple[str, str]]

    Returns
    -------
    result: Dict[str, List[str]]
    """
    if not all([i[1] in _accepted_pos for i in result]):
        raise ValueError(
            'elements of result must be a subset of the supported POS tags, please run `malaya.pos.describe()` to get the supported POS tags.'
        )

    output = {p: [] for p in _accepted_pos}
    last_label, words = None, []
    for word, label in result:
        if last_label != label and last_label:
            joined = ' '.join(words)
            if joined not in output[last_label]:
                output[last_label].append(joined)
            words = []
            last_label = label
            words.append(word)
        else:
            if not last_label:
                last_label = label
            words.append(word)
    output[last_label].append(' '.join(words))
    return output
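
A minimal usage sketch; the tagged tuples below are illustrative (any malaya POS model's output has the same shape):

    tagged = [('Kerajaan', 'NOUN'), ('Malaysia', 'PROPN'), ('sudah', 'ADV'), ('bersetuju', 'VERB')]
    clusters = cluster_pos(tagged)
    # consecutive tokens sharing a tag are joined, e.g. clusters['PROPN'] == ['Malaysia']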
@check_type
def cluster_entities(result: List[Tuple[str, str]]):
    """
    Cluster similar Entities.

    Parameters
    ----------
    result: List[Tuple[str, str]]

    Returns
    -------
    result: Dict[str, List[str]]
    """
    if not all([i[1] in _accepted_entities for i in result]):
        raise ValueError(
            'elements of result must be a subset of the supported Entities, please run `malaya.entity.describe` to get the supported Entities.'
        )

    output = {e: [] for e in _accepted_entities}
    last_label, words = None, []
    for word, label in result:
        if last_label != label and last_label:
            joined = ' '.join(words)
            if joined not in output[last_label]:
                output[last_label].append(joined)
            words = []
            last_label = label
            words.append(word)
        else:
            if not last_label:
                last_label = label
            words.append(word)
    output[last_label].append(' '.join(words))
    return output
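
A minimal usage sketch; the tagged tuples below are illustrative (a malaya entity model's `predict` output has the same shape):

    tagged = [('Kuala', 'location'), ('Lumpur', 'location'),
              ('ibu', 'OTHER'), ('negara', 'OTHER'), ('Malaysia', 'location')]
    cluster_entities(tagged)
    # {'location': ['Kuala Lumpur', 'Malaysia'], 'OTHER': ['ibu negara'], ...} with the remaining keys empty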
@check_type
def cluster_tagging(result: List[Tuple[str, str]]):
    """
    Cluster any tagging results, as long as the data is passed as `[(string, label), (string, label)]`.

    Parameters
    ----------
    result: List[Tuple[str, str]]

    Returns
    -------
    result: Dict[str, List[str]]
    """

    _, labels = list(zip(*result))

    output = {l: [] for l in labels}
    last_label, words = None, []
    for word, label in result:
        if last_label != label and last_label:
            joined = ' '.join(words)
            if joined not in output[last_label]:
                output[last_label].append(joined)
            words = []
            last_label = label
            words.append(word)
        else:
            if not last_label:
                last_label = label
            words.append(word)
    output[last_label].append(' '.join(words))
    return output
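
A minimal usage sketch; unlike cluster_pos and cluster_entities, any label set is accepted:

    tagged = [('saya', 'X'), ('suka', 'X'), ('Kuala', 'location'), ('Lumpur', 'location')]
    cluster_tagging(tagged)
    # {'X': ['saya suka'], 'location': ['Kuala Lumpur']}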
@check_type
def cluster_scatter(
    corpus: List[str],
    vectorizer,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stopwords=get_stopwords,
    cleaning=simple_textcleaning,
    clustering=KMeans,
    decomposition=MDS,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    Plot a scatter plot of similar text clusters.

    Parameters
    ----------
    corpus: List[str]
    vectorizer: class
        vectorizer class.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must be the same as corpus.
    colors: List[str], (default=None)
        list of colors, length must be the same as num_clusters.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if using a transformer vectorizer.

    Returns
    -------
    dictionary: {'X': X, 'Y': Y, 'labels': clusters, 'vector': transformed_text_clean, 'titles': titles}
    """
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be the same as corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be the same as number of clusters'
            )
    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install them and try again.'
        )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(
            transformed_text_clean, axis=0
        )

    km = clustering(n_clusters=num_clusters)
    dist = 1 - cosine_similarity(transformed_text_clean)
    km.fit(transformed_text_clean)
    clusters = km.labels_.tolist()

    if decomposition is MDS:
        # MDS is fed the precomputed dissimilarity matrix
        decomposed = decomposition(
            n_components=2, dissimilarity='precomputed'
        )
    else:
        decomposed = decomposition(n_components=2)
    pos = decomposed.fit_transform(dist)

    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0]
                )[::-1]
                titles.append(
                    ' '.join([features[i] for i in indices[: ngram[1]]])
                )
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(
                    ' '.join([i[0] for i in attentions[i][-ngram[1]:]])
                )
    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)

    X, Y = pos[:, 0], pos[:, 1]
    clusters_array = np.array(clusters)  # list -> array so boolean masks select the right points
    plt.figure(figsize=figsize)
    for i in np.unique(clusters):
        plt.scatter(
            X[clusters_array == i],
            Y[clusters_array == i],
            color=colors[i],
            label='cluster %d' % (i),
        )
    for i in range(len(X)):
        plt.text(X[i], Y[i], titles[i], size=8)
    plt.legend()
    plt.show()
    return {
        'X': X,
        'Y': Y,
        'labels': clusters,
        'vector': transformed_text_clean,
        'titles': titles,
    }
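
A minimal usage sketch with a plain TF-IDF vectorizer; the corpus is illustrative and a scikit-learn version that still exposes `get_feature_names` is assumed:

    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = ['kerajaan bentang bajet', 'bajet 2020 parlimen', 'banjir di kelantan',
              'mangsa banjir dipindahkan', 'harga minyak naik', 'subsidi minyak dikaji']
    result = cluster_scatter(corpus, vectorizer=TfidfVectorizer(), num_clusters=2)
    # result['labels'] holds the cluster id of each document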
@check_type
def cluster_dendogram(
    corpus: List[str],
    vectorizer,
    titles: List[str] = None,
    stopwords=get_stopwords,
    cleaning=simple_textcleaning,
    random_samples: float = 0.3,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    Plot a hierarchical dendrogram of similar texts.

    Parameters
    ----------
    corpus: List[str]
    vectorizer: class
        vectorizer class.
    titles: List[str], (default=None)
        list of titles, length must be the same as corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    random_samples: float, (default=0.3)
        random samples from the corpus, 0.3 means 30%.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if using a transformer vectorizer.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}
    """
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be the same as corpus')
    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if not (random_samples < 1 and random_samples > 0):
        raise ValueError('random_samples must be between 0 and 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy.cluster.hierarchy import ward, dendrogram

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install them and try again.'
        )

    corpus = random.sample(corpus, k=int(random_samples * len(corpus)))

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(
            transformed_text_clean, axis=0
        )

    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)

    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0]
                )[::-1]
                titles.append(
                    ' '.join([features[i] for i in indices[: ngram[1]]])
                )
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(
                    ' '.join([i[0] for i in attentions[i][-ngram[1]:]])
                )

    plt.figure(figsize=figsize)
    dendrogram(linkage_matrix, orientation='right', labels=titles)
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    plt.tight_layout()
    plt.show()
    return {'linkage_matrix': linkage_matrix, 'titles': titles}
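
A minimal usage sketch; the corpus is illustrative, and `random_samples` must stay strictly between 0 and 1:

    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = ['kerajaan bentang bajet', 'bajet 2020 parlimen', 'banjir di kelantan',
              'mangsa banjir dipindahkan', 'harga minyak naik', 'subsidi minyak dikaji']
    result = cluster_dendogram(corpus, vectorizer=TfidfVectorizer(), random_samples=0.9)
    # result['linkage_matrix'] is the scipy ward linkage used to draw the dendrogram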
@check_type
def cluster_graph(
    corpus: List[str],
    vectorizer,
    threshold: float = 0.9,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stopwords=get_stopwords,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    clustering=KMeans,
    figsize: Tuple[int, int] = (17, 9),
    with_labels: bool = True,
    batch_size: int = 20,
):
    """
    Plot an undirected graph of similar texts.

    Parameters
    ----------
    corpus: List[str]
    vectorizer: class
        vectorizer class.
    threshold: float, (default=0.9)
        0.9 means, only pairs with an absolute Pearson correlation of at least 0.9 are connected.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must be the same as corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if using a transformer vectorizer.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """
    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be the same as corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be the same as number of clusters'
            )
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1'
        )

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx not installed. Please install them and try again.'
        )

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(
            transformed_text_clean, axis=0
        )

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    km = clustering(n_clusters=num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()

    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(np.array(transformed_text_clean[i])[0])[
                    ::-1
                ]
                titles.append(
                    ' '.join([features[i] for i in indices[: ngram[1]]])
                )
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(
                    ' '.join([i[0] for i in attentions[i][-ngram[1]:]])
                )
    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=titles[i], label=clusters[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(colors[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(len(G.nodes()))
    )
    plt.figure(figsize=figsize)
    if with_labels:
        nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels)
    else:
        nx.draw(G, node_color=node_colors, pos=pos)

    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
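
A minimal usage sketch; a lower threshold is used so the illustrative toy corpus still produces edges, and an older scikit-learn that still exposes `get_feature_names` is assumed:

    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = ['kerajaan bentang bajet', 'bajet 2020 parlimen', 'banjir di kelantan',
              'mangsa banjir dipindahkan', 'harga minyak naik', 'subsidi minyak dikaji']
    result = cluster_graph(corpus, vectorizer=TfidfVectorizer(), threshold=0.3, num_clusters=2)
    # result['G'] is a networkx Graph whose edges connect highly correlated documents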
def cluster_entity_linking(
    corpus: List[str],
    vectorizer,
    entity_model,
    topic_modeling_model,
    threshold: float = 0.3,
    topic_decomposition: int = 2,
    topic_length: int = 10,
    fuzzy_ratio: int = 70,
    accepted_entities: List[str] = [
        'law',
        'location',
        'organization',
        'person',
        'event',
    ],
    cleaning=simple_textcleaning,
    colors: List[str] = None,
    stopwords=get_stopwords,
    max_df: float = 1.0,
    min_df: int = 1,
    ngram: Tuple[int, int] = (2, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    Plot an undirected graph of the relationship between Entities and topics.

    Parameters
    ----------
    corpus: List[str] or str
    vectorizer: class
        vectorizer class.
    entity_model: object
        model with a `predict` method that returns (word, entity) tuples.
    topic_modeling_model: class
        topic modeling class, must accept a `max_df` parameter.
    threshold: float, (default=0.3)
        0.3 means, only pairs with an absolute Pearson correlation of at least 0.3 are connected.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        size of topic models.
    fuzzy_ratio: int, (default=70)
        size of ratio for fuzzywuzzy.
    accepted_entities: List[str]
        entity labels to link against the topics.
    colors: List[str], (default=None)
        list of colors, length must be the same as len(accepted_entities) + 1.
    max_df: float, (default=1.0)
        maximum of a word selected based on document frequency.
    min_df: int, (default=1)
        minimum of a word selected based on document frequency.
    ngram: Tuple[int, int], (default=(2,3))
        n-grams size to train a corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """
    import inspect

    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if 'max_df' not in inspect.getfullargspec(topic_modeling_model)[0]:
        raise ValueError('topic_modeling_model must have `max_df` parameter')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1'
        )
    if not (fuzzy_ratio > 0 and fuzzy_ratio <= 100):
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100'
        )
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1'
        )

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
        from fuzzywuzzy import fuzz

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx, fuzzywuzzy not installed. Please install them and try again.'
        )

    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    corpus = [string for string in corpus if len(string) > 5]

    if not colors:
        colors = sns.color_palette(n_colors=len(accepted_entities) + 1)
    else:
        if len(colors) != (len(accepted_entities) + 1):
            raise ValueError(
                'len of colors must same as %d' % (len(accepted_entities) + 1)
            )

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    topics = []
    for no, topic in enumerate(topic_model.comp.components_):
        for i in topic.argsort()[: -topic_length - 1: -1]:
            topics.append(topic_model.features[i])

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity]))
        )

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    topics_corpus = []
    for topic in topics:
        nested_corpus = []
        for string in corpus:
            if (
                topic in string
                or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio
            ):
                nested_corpus.append(string)
        topics_corpus.append(' '.join(nested_corpus))

    corpus = topics_corpus

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                attentions.extend(text_clean[i:index])
        transformed_text_clean = np.concatenate(
            transformed_text_clean, axis=0
        )

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(color_dict[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(len(G.nodes()))
    )
    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(
        G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax
    )
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
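
A heavily hedged sketch; `my_entity_model` and `MyTopicModel` are hypothetical placeholders, since the concrete malaya loaders are not shown in this module. Any objects matching the shapes below should work:

    # hypothetical placeholders, not real malaya loaders:
    # my_entity_model - object whose .predict(string) returns [(word, entity), ...]
    # MyTopicModel    - callable accepting (corpus, n_topics, ngram=..., max_df=..., min_df=...)
    #                   and exposing .comp.components_ and .features
    # result = cluster_entity_linking(
    #     corpus,
    #     vectorizer=TfidfVectorizer(),
    #     entity_model=my_entity_model,
    #     topic_modeling_model=MyTopicModel,
    #     threshold=0.3,
    # )
    # result['G'] connects entity nodes and topic nodes whose supporting sentences are highly correlated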