Part-of-Speech Recognition#

This tutorial is available as an IPython notebook at Malaya/example/part-of-speech.

This module is only trained on standard language structure, so it is not safe to use it for local language structure.

[1]:
%%time
import malaya
CPU times: user 2.83 s, sys: 3.88 s, total: 6.71 s
Wall time: 1.95 s
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))

Describe supported POS#

[2]:
malaya.pos.describe
[2]:
[{'Tag': 'ADJ', 'Description': 'Adjective, kata sifat'},
 {'Tag': 'ADP', 'Description': 'Adposition'},
 {'Tag': 'ADV', 'Description': 'Adverb, kata keterangan'},
 {'Tag': 'ADX', 'Description': 'Auxiliary verb, kata kerja tambahan'},
 {'Tag': 'CCONJ', 'Description': 'Coordinating conjuction, kata hubung'},
 {'Tag': 'DET', 'Description': 'Determiner, kata penentu'},
 {'Tag': 'NOUN', 'Description': ' Noun, kata nama'},
 {'Tag': 'NUM', 'Description': 'Number, nombor'},
 {'Tag': 'PART', 'Description': 'Particle'},
 {'Tag': 'PRON', 'Description': 'Pronoun, kata ganti'},
 {'Tag': 'PROPN', 'Description': 'Proper noun, kata ganti nama khas'},
 {'Tag': 'SCONJ', 'Description': 'Subordinating conjunction'},
 {'Tag': 'SYM', 'Description': 'Symbol'},
 {'Tag': 'VERB', 'Description': 'Verb, kata kerja'},
 {'Tag': 'X', 'Description': 'Other'}]

List available HuggingFace POS models#

[3]:
malaya.pos.available_huggingface
[3]:
{'mesolitica/pos-t5-tiny-standard-bahasa-cased': {'Size (MB)': 84.7,
  'PART': {'precision': 0.8938547486033519,
   'recall': 0.9411764705882353,
   'f1': 0.9169054441260744,
   'number': 170},
  'CCONJ': {'precision': 0.9713905522288756,
   'recall': 0.9785522788203753,
   'f1': 0.974958263772955,
   'number': 1492},
  'ADJ': {'precision': 0.9192897497982244,
   'recall': 0.88984375,
   'f1': 0.9043271139341008,
   'number': 1280},
  'ADP': {'precision': 0.9770908087220536,
   'recall': 0.9844271412680756,
   'f1': 0.9807452555755645,
   'number': 3596},
  'ADV': {'precision': 0.9478672985781991,
   'recall': 0.9523809523809523,
   'f1': 0.9501187648456056,
   'number': 1260},
  'VERB': {'precision': 0.9654357459379616,
   'recall': 0.9662921348314607,
   'f1': 0.9658637505541599,
   'number': 3382},
  'DET': {'precision': 0.9603854389721628,
   'recall': 0.9542553191489361,
   'f1': 0.9573105656350054,
   'number': 940},
  'NOUN': {'precision': 0.8789933694996986,
   'recall': 0.8976608187134503,
   'f1': 0.8882290239074159,
   'number': 6498},
  'PRON': {'precision': 0.9888991674375578,
   'recall': 0.9861623616236163,
   'f1': 0.9875288683602771,
   'number': 1084},
  'PROPN': {'precision': 0.8842357164223751,
   'recall': 0.8982072318444242,
   'f1': 0.891166716912873,
   'number': 6582},
  'NUM': {'precision': 0.9532391622016562,
   'recall': 0.9688118811881188,
   'f1': 0.9609624355511908,
   'number': 2020},
  'PUNCT': {'precision': 0.9991261796574624,
   'recall': 0.9980796089385475,
   'f1': 0.9986026200873362,
   'number': 5728},
  'AUX': {'precision': 1.0,
   'recall': 0.9852941176470589,
   'f1': 0.9925925925925926,
   'number': 204},
  'SYM': {'precision': 0.8950617283950617,
   'recall': 0.90625,
   'f1': 0.9006211180124224,
   'number': 160},
  'X': {'precision': 0.4444444444444444,
   'recall': 0.5,
   'f1': 0.47058823529411764,
   'number': 16},
  'overall_precision': 0.9370964022140221,
  'overall_recall': 0.9446123445309775,
  'overall_f1': 0.9408393632416786,
  'overall_accuracy': 0.9579554043839759},
 'mesolitica/pos-t5-small-standard-bahasa-cased': {'Size (MB)': 141,
  'PART': {'precision': 0.950920245398773,
   'recall': 0.9117647058823529,
   'f1': 0.9309309309309309,
   'number': 170},
  'SCONJ': {'precision': 0.9883481836874571,
   'recall': 0.9664879356568364,
   'f1': 0.9772958319213825,
   'number': 1492},
  'ADJ': {'precision': 0.9257425742574258,
   'recall': 0.8765625,
   'f1': 0.9004815409309791,
   'number': 1280},
  'ADP': {'precision': 0.9854219231847491,
   'recall': 0.9774749721913237,
   'f1': 0.9814323607427056,
   'number': 3596},
  'ADV': {'precision': 0.9580306698950767,
   'recall': 0.942063492063492,
   'f1': 0.9499799919967987,
   'number': 1260},
  'VERB': {'precision': 0.9693969396939695,
   'recall': 0.9553518628030752,
   'f1': 0.9623231571109457,
   'number': 3382},
  'DET': {'precision': 0.9666307857911733,
   'recall': 0.9553191489361702,
   'f1': 0.9609416800428037,
   'number': 940},
  'NOUN': {'precision': 0.892811906269791,
   'recall': 0.8678054786088027,
   'f1': 0.880131106602154,
   'number': 6498},
  'PRON': {'precision': 0.9906803355079217,
   'recall': 0.9806273062730627,
   'f1': 0.9856281872971719,
   'number': 1084},
  'PROPN': {'precision': 0.8682452062754212,
   'recall': 0.9080826496505622,
   'f1': 0.8877172137234517,
   'number': 6582},
  'NUM': {'precision': 0.9799899949974987,
   'recall': 0.9698019801980198,
   'f1': 0.9748693704901717,
   'number': 2020},
  'PUNCT': {'precision': 0.9986033519553073,
   'recall': 0.9986033519553073,
   'f1': 0.9986033519553073,
   'number': 5728},
  'AUX': {'precision': 0.9900990099009901,
   'recall': 0.9803921568627451,
   'f1': 0.9852216748768472,
   'number': 204},
  'SYM': {'precision': 0.9246575342465754,
   'recall': 0.84375,
   'f1': 0.8823529411764707,
   'number': 160},
  'X': {'precision': 1.0, 'recall': 0.25, 'f1': 0.4, 'number': 16},
  'overall_precision': 0.941408302679979,
  'overall_recall': 0.9370859002673486,
  'overall_f1': 0.939242128564355,
  'overall_accuracy': 0.955475245653817}}
[5]:
string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.'

Load HuggingFace model#

def huggingface(
    model: str = 'mesolitica/pos-t5-small-standard-bahasa-cased',
    force_check: bool = True,
    **kwargs,
):
    """
    Load a HuggingFace model for Part-of-Speech Recognition.

    Parameters
    ----------
    model: str, optional (default='mesolitica/pos-t5-small-standard-bahasa-cased')
        Identifier of the model to load.
        Check available models at `malaya.pos.available_huggingface`.
    force_check: bool, optional (default=True)
        Force a check that `model` is one of the Malaya models.
        Set to False if you have your own HuggingFace model.
    **kwargs:
        Additional keyword arguments. How they are consumed is not shown here
        (presumably forwarded to the underlying model loader -- TODO confirm).

    Returns
    -------
    result: malaya.torch_model.huggingface.Tagging
    """
[9]:
model = malaya.pos.huggingface()

Predict#

def predict(self, string: str):
    """
    Tag each word of a string with its Part-of-Speech label.

    Parameters
    ----------
    string : str
        Input text to tag.

    Returns
    -------
    result: List[Tuple[str, str]]
        One `(word, tag)` pair per word, in input order, e.g.
        `[('KUALA', 'PROPN'), ('LUMPUR:', 'PROPN'), ...]`.
    """
[7]:
model.predict(string)
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[7]:
[('KUALA', 'PROPN'),
 ('LUMPUR:', 'PROPN'),
 ('Sempena', 'PROPN'),
 ('sambutan', 'NOUN'),
 ('Aidilfitri', 'PROPN'),
 ('minggu', 'NOUN'),
 ('depan,', 'ADJ'),
 ('Perdana', 'PROPN'),
 ('Menteri', 'PROPN'),
 ('Tun', 'PROPN'),
 ('Dr', 'PROPN'),
 ('Mahathir', 'PROPN'),
 ('Mohamad', 'PROPN'),
 ('dan', 'CCONJ'),
 ('Menteri', 'PROPN'),
 ('Pengangkutan', 'PROPN'),
 ('Anthony', 'PROPN'),
 ('Loke', 'PROPN'),
 ('Siew', 'PROPN'),
 ('Fook', 'PROPN'),
 ('menitipkan', 'VERB'),
 ('pesanan', 'NOUN'),
 ('khas', 'ADJ'),
 ('kepada', 'ADP'),
 ('orang', 'NOUN'),
 ('ramai', 'NOUN'),
 ('yang', 'PRON'),
 ('mahu', 'ADV'),
 ('pulang', 'VERB'),
 ('ke', 'ADP'),
 ('kampung', 'NOUN'),
 ('halaman', 'NOUN'),
 ('masing-masing.', 'DET'),
 ('Dalam', 'ADP'),
 ('video', 'NOUN'),
 ('pendek', 'ADJ'),
 ('terbitan', 'NOUN'),
 ('Jabatan', 'PROPN'),
 ('Keselamatan', 'PROPN'),
 ('Jalan', 'PROPN'),
 ('Raya', 'PROPN'),
 ('(JKJR)', 'PUNCT'),
 ('itu,', 'DET'),
 ('Dr', 'PROPN'),
 ('Mahathir', 'PROPN'),
 ('menasihati', 'VERB'),
 ('mereka', 'PRON'),
 ('supaya', 'NOUN'),
 ('berhenti', 'VERB'),
 ('berehat', 'VERB'),
 ('dan', 'CCONJ'),
 ('tidur', 'VERB'),
 ('sebentar', 'ADV'),
 ('sekiranya', 'ADV'),
 ('mengantuk', 'VERB'),
 ('ketika', 'SCONJ'),
 ('memandu.', 'VERB')]

Group similar tags#

def analyze(self, string: str):
        """
        Analyze a string, grouping consecutive words that share the same tag.

        Parameters
        ----------
        string : str
            Input text to analyze.

        Returns
        -------
        result: List[Dict]
            One entry per run of consecutive same-tag words, shaped like
            {'text': List[str], 'type': str, 'score': float, 'beginOffset': int, 'endOffset': int}.
            In this tutorial's example output, `beginOffset` / `endOffset` are
            word positions (end exclusive) and `score` is always 1.0.
        """
[8]:
model.analyze(string)
[8]:
[{'text': ['KUALA', 'LUMPUR:', 'Sempena'],
  'type': 'PROPN',
  'score': 1.0,
  'beginOffset': 0,
  'endOffset': 3},
 {'text': ['sambutan'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 3,
  'endOffset': 4},
 {'text': ['Aidilfitri'],
  'type': 'PROPN',
  'score': 1.0,
  'beginOffset': 4,
  'endOffset': 5},
 {'text': ['minggu'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 5,
  'endOffset': 6},
 {'text': ['depan,'],
  'type': 'ADJ',
  'score': 1.0,
  'beginOffset': 6,
  'endOffset': 7},
 {'text': ['Perdana', 'Menteri', 'Tun', 'Dr', 'Mahathir', 'Mohamad'],
  'type': 'PROPN',
  'score': 1.0,
  'beginOffset': 7,
  'endOffset': 13},
 {'text': ['dan'],
  'type': 'CCONJ',
  'score': 1.0,
  'beginOffset': 13,
  'endOffset': 14},
 {'text': ['Menteri', 'Pengangkutan', 'Anthony', 'Loke', 'Siew', 'Fook'],
  'type': 'PROPN',
  'score': 1.0,
  'beginOffset': 14,
  'endOffset': 20},
 {'text': ['menitipkan'],
  'type': 'VERB',
  'score': 1.0,
  'beginOffset': 20,
  'endOffset': 21},
 {'text': ['pesanan'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 21,
  'endOffset': 22},
 {'text': ['khas'],
  'type': 'ADJ',
  'score': 1.0,
  'beginOffset': 22,
  'endOffset': 23},
 {'text': ['kepada'],
  'type': 'ADP',
  'score': 1.0,
  'beginOffset': 23,
  'endOffset': 24},
 {'text': ['orang', 'ramai'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 24,
  'endOffset': 26},
 {'text': ['yang'],
  'type': 'PRON',
  'score': 1.0,
  'beginOffset': 26,
  'endOffset': 27},
 {'text': ['mahu'],
  'type': 'ADV',
  'score': 1.0,
  'beginOffset': 27,
  'endOffset': 28},
 {'text': ['pulang'],
  'type': 'VERB',
  'score': 1.0,
  'beginOffset': 28,
  'endOffset': 29},
 {'text': ['ke'],
  'type': 'ADP',
  'score': 1.0,
  'beginOffset': 29,
  'endOffset': 30},
 {'text': ['kampung', 'halaman'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 30,
  'endOffset': 32},
 {'text': ['masing-masing.'],
  'type': 'DET',
  'score': 1.0,
  'beginOffset': 32,
  'endOffset': 33},
 {'text': ['Dalam'],
  'type': 'ADP',
  'score': 1.0,
  'beginOffset': 33,
  'endOffset': 34},
 {'text': ['video'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 34,
  'endOffset': 35},
 {'text': ['pendek'],
  'type': 'ADJ',
  'score': 1.0,
  'beginOffset': 35,
  'endOffset': 36},
 {'text': ['terbitan'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 36,
  'endOffset': 37},
 {'text': ['Jabatan', 'Keselamatan', 'Jalan', 'Raya'],
  'type': 'PROPN',
  'score': 1.0,
  'beginOffset': 37,
  'endOffset': 41},
 {'text': ['(JKJR)'],
  'type': 'PUNCT',
  'score': 1.0,
  'beginOffset': 41,
  'endOffset': 42},
 {'text': ['itu,'],
  'type': 'DET',
  'score': 1.0,
  'beginOffset': 42,
  'endOffset': 43},
 {'text': ['Dr', 'Mahathir'],
  'type': 'PROPN',
  'score': 1.0,
  'beginOffset': 43,
  'endOffset': 45},
 {'text': ['menasihati'],
  'type': 'VERB',
  'score': 1.0,
  'beginOffset': 45,
  'endOffset': 46},
 {'text': ['mereka'],
  'type': 'PRON',
  'score': 1.0,
  'beginOffset': 46,
  'endOffset': 47},
 {'text': ['supaya'],
  'type': 'NOUN',
  'score': 1.0,
  'beginOffset': 47,
  'endOffset': 48},
 {'text': ['berhenti', 'berehat'],
  'type': 'VERB',
  'score': 1.0,
  'beginOffset': 48,
  'endOffset': 50},
 {'text': ['dan'],
  'type': 'CCONJ',
  'score': 1.0,
  'beginOffset': 50,
  'endOffset': 51},
 {'text': ['tidur'],
  'type': 'VERB',
  'score': 1.0,
  'beginOffset': 51,
  'endOffset': 52},
 {'text': ['sebentar', 'sekiranya'],
  'type': 'ADV',
  'score': 1.0,
  'beginOffset': 52,
  'endOffset': 54},
 {'text': ['mengantuk'],
  'type': 'VERB',
  'score': 1.0,
  'beginOffset': 54,
  'endOffset': 55},
 {'text': ['ketika'],
  'type': 'SCONJ',
  'score': 1.0,
  'beginOffset': 55,
  'endOffset': 56},
 {'text': ['memandu.'],
  'type': 'VERB',
  'score': 1.0,
  'beginOffset': 56,
  'endOffset': 57}]