# Source code for malaya.pos

import re
from malaya.text.tatabahasa import tatabahasa_dict, hujung, permulaan
from malaya.supervised.huggingface import load
from malaya.torch_model.huggingface import Tagging

# Part-of-speech tag names exposed by this module. Apart from the leading
# 'OTHER' entry the names look like Universal Dependencies UPOS tags
# (NOTE(review): confirm against the training data). The list order is kept
# exactly as originally written.
label = [
    'OTHER', 'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ',
    'DET', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN',
    'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
]


# Human-readable description of each POS tag, one {'Tag', 'Description'}
# record per tag, with the Malay term where one is given.
# The auxiliary-verb entry is keyed 'AUX' so that it matches the tag name used
# in `label` and in the per-tag metrics of `available_huggingface`.
# NOTE(review): 'OTHER' and 'PUNCT' appear in `label` but have no entry here.
describe = [
    {'Tag': 'ADJ', 'Description': 'Adjective, kata sifat'},
    {'Tag': 'ADP', 'Description': 'Adposition'},
    {'Tag': 'ADV', 'Description': 'Adverb, kata keterangan'},
    {'Tag': 'AUX', 'Description': 'Auxiliary verb, kata kerja tambahan'},
    {'Tag': 'CCONJ', 'Description': 'Coordinating conjunction, kata hubung'},
    {'Tag': 'DET', 'Description': 'Determiner, kata penentu'},
    {'Tag': 'NOUN', 'Description': 'Noun, kata nama'},
    {'Tag': 'NUM', 'Description': 'Number, nombor'},
    {'Tag': 'PART', 'Description': 'Particle'},
    {'Tag': 'PRON', 'Description': 'Pronoun, kata ganti'},
    {'Tag': 'PROPN', 'Description': 'Proper noun, kata ganti nama khas'},
    {'Tag': 'SCONJ', 'Description': 'Subordinating conjunction'},
    {'Tag': 'SYM', 'Description': 'Symbol'},
    {'Tag': 'VERB', 'Description': 'Verb, kata kerja'},
    {'Tag': 'X', 'Description': 'Other'},
]

# Registry of the HuggingFace checkpoints offered by this module, keyed by
# model name. Each entry holds the model size in MB, one
# precision / recall / f1 / number record per POS tag, and the overall scores.
# 'number' is presumably the support (count of gold items) for that tag --
# TODO confirm against the evaluation script.
available_huggingface = {
    'mesolitica/pos-t5-tiny-standard-bahasa-cased': {
        'Size (MB)': 84.7,
        'PART': {
            'precision': 0.8938547486033519,
            'recall': 0.9411764705882353,
            'f1': 0.9169054441260744,
            'number': 170,
        },
        'CCONJ': {
            'precision': 0.9713905522288756,
            'recall': 0.9785522788203753,
            'f1': 0.974958263772955,
            'number': 1492,
        },
        'ADJ': {
            'precision': 0.9192897497982244,
            'recall': 0.88984375,
            'f1': 0.9043271139341008,
            'number': 1280,
        },
        'ADP': {
            'precision': 0.9770908087220536,
            'recall': 0.9844271412680756,
            'f1': 0.9807452555755645,
            'number': 3596,
        },
        'ADV': {
            'precision': 0.9478672985781991,
            'recall': 0.9523809523809523,
            'f1': 0.9501187648456056,
            'number': 1260,
        },
        'VERB': {
            'precision': 0.9654357459379616,
            'recall': 0.9662921348314607,
            'f1': 0.9658637505541599,
            'number': 3382,
        },
        'DET': {
            'precision': 0.9603854389721628,
            'recall': 0.9542553191489361,
            'f1': 0.9573105656350054,
            'number': 940,
        },
        'NOUN': {
            'precision': 0.8789933694996986,
            'recall': 0.8976608187134503,
            'f1': 0.8882290239074159,
            'number': 6498,
        },
        'PRON': {
            'precision': 0.9888991674375578,
            'recall': 0.9861623616236163,
            'f1': 0.9875288683602771,
            'number': 1084,
        },
        'PROPN': {
            'precision': 0.8842357164223751,
            'recall': 0.8982072318444242,
            'f1': 0.891166716912873,
            'number': 6582,
        },
        'NUM': {
            'precision': 0.9532391622016562,
            'recall': 0.9688118811881188,
            'f1': 0.9609624355511908,
            'number': 2020,
        },
        'PUNCT': {
            'precision': 0.9991261796574624,
            'recall': 0.9980796089385475,
            'f1': 0.9986026200873362,
            'number': 5728,
        },
        'AUX': {
            'precision': 1.0,
            'recall': 0.9852941176470589,
            'f1': 0.9925925925925926,
            'number': 204,
        },
        'SYM': {
            'precision': 0.8950617283950617,
            'recall': 0.90625,
            'f1': 0.9006211180124224,
            'number': 160,
        },
        'X': {
            'precision': 0.4444444444444444,
            'recall': 0.5,
            'f1': 0.47058823529411764,
            'number': 16,
        },
        'overall_precision': 0.9370964022140221,
        'overall_recall': 0.9446123445309775,
        'overall_f1': 0.9408393632416786,
        'overall_accuracy': 0.9579554043839759,
    },
    'mesolitica/pos-t5-small-standard-bahasa-cased': {
        'Size (MB)': 141,
        'PART': {
            'precision': 0.950920245398773,
            'recall': 0.9117647058823529,
            'f1': 0.9309309309309309,
            'number': 170,
        },
        # NOTE(review): the tiny model reports this slot (same position, same
        # 'number' of 1492) under 'CCONJ'; one of the two keys is probably a
        # typo -- confirm against the evaluation output before changing it.
        'SCONJ': {
            'precision': 0.9883481836874571,
            'recall': 0.9664879356568364,
            'f1': 0.9772958319213825,
            'number': 1492,
        },
        'ADJ': {
            'precision': 0.9257425742574258,
            'recall': 0.8765625,
            'f1': 0.9004815409309791,
            'number': 1280,
        },
        'ADP': {
            'precision': 0.9854219231847491,
            'recall': 0.9774749721913237,
            'f1': 0.9814323607427056,
            'number': 3596,
        },
        'ADV': {
            'precision': 0.9580306698950767,
            'recall': 0.942063492063492,
            'f1': 0.9499799919967987,
            'number': 1260,
        },
        'VERB': {
            'precision': 0.9693969396939695,
            'recall': 0.9553518628030752,
            'f1': 0.9623231571109457,
            'number': 3382,
        },
        'DET': {
            'precision': 0.9666307857911733,
            'recall': 0.9553191489361702,
            'f1': 0.9609416800428037,
            'number': 940,
        },
        'NOUN': {
            'precision': 0.892811906269791,
            'recall': 0.8678054786088027,
            'f1': 0.880131106602154,
            'number': 6498,
        },
        'PRON': {
            'precision': 0.9906803355079217,
            'recall': 0.9806273062730627,
            'f1': 0.9856281872971719,
            'number': 1084,
        },
        'PROPN': {
            'precision': 0.8682452062754212,
            'recall': 0.9080826496505622,
            'f1': 0.8877172137234517,
            'number': 6582,
        },
        'NUM': {
            'precision': 0.9799899949974987,
            'recall': 0.9698019801980198,
            'f1': 0.9748693704901717,
            'number': 2020,
        },
        'PUNCT': {
            'precision': 0.9986033519553073,
            'recall': 0.9986033519553073,
            'f1': 0.9986033519553073,
            'number': 5728,
        },
        'AUX': {
            'precision': 0.9900990099009901,
            'recall': 0.9803921568627451,
            'f1': 0.9852216748768472,
            'number': 204,
        },
        'SYM': {
            'precision': 0.9246575342465754,
            'recall': 0.84375,
            'f1': 0.8823529411764707,
            'number': 160,
        },
        'X': {
            'precision': 1.0,
            'recall': 0.25,
            'f1': 0.4,
            'number': 16,
        },
        'overall_precision': 0.941408302679979,
        'overall_recall': 0.9370859002673486,
        'overall_f1': 0.939242128564355,
        'overall_accuracy': 0.955475245653817,
    },
}


def _naive_POS_word(word):
    for key, vals in tatabahasa_dict.items():
        if word in vals:
            return (key, word)
    try:
        if len(re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung[:1])), i)[0]) > 1:
            return ('KJ', word)
    except BaseException:
        pass
    try:
        if (
            len(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[:-4])), word)[0])
            > 1
        ):
            return ('KJ', word)
    except Exception as e:
        pass
    if len(word) > 2:
        return ('KN', word)
    else:
        return ('', word)


def huggingface(
    model: str = 'mesolitica/pos-t5-small-standard-bahasa-cased',
    force_check: bool = True,
    **kwargs,
):
    """
    Load a HuggingFace model for Part-of-Speech tagging.

    Parameters
    ----------
    model: str, optional (default='mesolitica/pos-t5-small-standard-bahasa-cased')
        Model name. Check available models at `malaya.pos.available_huggingface`.
    force_check: bool, optional (default=True)
        Force check that `model` is one of the malaya models.
        Set to False if you have your own huggingface model.
    **kwargs:
        Extra keyword arguments, forwarded unchanged to the loader.

    Returns
    -------
    result: malaya.torch_model.huggingface.Tagging
    """
    tagger = load(
        model=model,
        class_model=Tagging,
        available_huggingface=available_huggingface,
        force_check=force_check,
        path=__name__,
        **kwargs,
    )
    return tagger