Source code for malaya.cluster

from itertools import groupby
from typing import List, Tuple

_accepted_pos = [
_accepted_entities = [

def cluster_words(list_words: List[str], lowercase: bool = False):
    """
    Cluster similar words based on structure, keeping the longest form of
    each cluster, eg, ['mahathir mohamad', 'mahathir'] = ['mahathir mohamad'].

    Every word is compared (by substring containment) against the members of
    every cluster built so far, so the cost grows at least quadratically with
    the number of words.

    Parameters
    ----------
    list_words: List[str]
    lowercase: bool, optional (default=False)
        if True, will group using lowercase but maintain the original form.

    Returns
    -------
    string: List[str]
        Unique longest representatives, in order of first appearance.
    """

    def normalise(text):
        # Comparison form only; the original spelling is what gets stored.
        return text.lower() if lowercase else text

    clusters = {}
    for word in list_words:
        needle = normalise(word)
        found = False
        for key, members in clusters.items():
            # `key` is always a member of its own cluster, so this single
            # test also covers "word is contained in key".
            if any(needle in normalise(member) for member in members):
                members.append(word)
                found = True
            elif normalise(key) in needle:
                # The word extends this cluster's key: record it here, but
                # still let it seed its own cluster below (found stays False).
                members.append(word)
        if not found:
            clusters[word] = [word]

    # max() returns the first maximal item, so ties resolve to the earliest
    # member instead of depending on set iteration order.
    longest = [max(members, key=len) for members in clusters.values()]
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(longest))
def cluster_pos(result: List[Tuple[str, str]]):
    """
    Cluster consecutive words that share the same POS tag into phrases.

    Parameters
    ----------
    result: List[Tuple[str, str]]
        `(word, tag)` pairs; every tag must be a member of `_accepted_pos`.

    Returns
    -------
    result: Dict[str, List[str]]
        One key per entry of `_accepted_pos`; each value lists the unique
        space-joined phrases found for that tag, in order of first appearance.
        An empty `result` yields empty lists for every tag.

    Raises
    ------
    ValueError
        If any tag in `result` is not in `_accepted_pos`.
    """
    if not all(label in _accepted_pos for _, label in result):
        raise ValueError(
            'elements of result must be a subset or equal of supported POS, please run `malaya.pos.describe()` to get supported POS.'
        )

    output = {p: [] for p in _accepted_pos}
    # groupby yields one group per run of consecutive identical tags, which
    # handles empty input and applies the duplicate check to every phrase,
    # including the last one.
    for label, run in groupby(result, key=lambda pair: pair[1]):
        joined = ' '.join(word for word, _ in run)
        if joined not in output[label]:
            output[label].append(joined)
    return output
def cluster_entities(result: List[Tuple[str, str]]):
    """
    Cluster consecutive words that share the same entity tag into phrases.

    Parameters
    ----------
    result: List[Tuple[str, str]]
        `(word, tag)` pairs; every tag must be a member of
        `_accepted_entities`.

    Returns
    -------
    result: Dict[str, List[str]]
        One key per entry of `_accepted_entities`; each value lists the
        unique space-joined phrases found for that tag, in order of first
        appearance. An empty `result` yields empty lists for every tag.

    Raises
    ------
    ValueError
        If any tag in `result` is not in `_accepted_entities`.
    """
    if not all(label in _accepted_entities for _, label in result):
        raise ValueError(
            'elements of result must be a subset or equal of supported Entities, please run `malaya.entity.describe` to get supported Entities.'
        )

    output = {e: [] for e in _accepted_entities}
    # groupby yields one group per run of consecutive identical tags, which
    # handles empty input and applies the duplicate check to every phrase,
    # including the last one.
    for label, run in groupby(result, key=lambda pair: pair[1]):
        joined = ' '.join(word for word, _ in run)
        if joined not in output[label]:
            output[label].append(joined)
    return output
def cluster_tagging(result: List[Tuple[str, str]]):
    """
    Cluster any tagging results, as long as the data passed is
    `[(string, label), (string, label)]`.

    Consecutive strings that share a label are joined with a single space
    into one phrase.

    Parameters
    ----------
    result: List[Tuple[str, str]]

    Returns
    -------
    result: Dict[str, List[str]]
        One key per label present in `result` (in order of first
        appearance); each value lists the unique phrases found for that
        label. An empty `result` yields an empty dict.
    """
    # Built from the pairs directly so an empty input gives {} instead of
    # failing to unpack `zip(*result)`.
    output = {label: [] for _, label in result}
    # groupby yields one group per run of consecutive identical labels; it
    # compares labels by equality, so falsy labels such as '' group correctly.
    for label, run in groupby(result, key=lambda pair: pair[1]):
        joined = ' '.join(word for word, _ in run)
        if joined not in output[label]:
            output[label].append(joined)
    return output