Contents

Entities Recognition

Contents

Entities Recognition#

This tutorial is available as an IPython notebook at Malaya/example/entities.

This module was trained only on standard language structure, so it is not safe to use it for local language structure.

[1]:

import logging

logging.basicConfig(level=logging.INFO)

[2]:

%%time
import malaya

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp0npqw77q
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp0npqw77q/_remote_module_non_scriptable.py

CPU times: user 2.89 s, sys: 3.49 s, total: 6.38 s
Wall time: 2.3 s

/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))

Describe supported entities#

[3]:

malaya.entity.describe

[3]:

[{'Tag': 'OTHER', 'Description': 'other'},
 {'Tag': 'law',
  'Description': 'law, regulation, related law documents, documents, etc'},
 {'Tag': 'location', 'Description': 'location, place'},
 {'Tag': 'organization',
  'Description': 'organization, company, government, facilities, etc'},
 {'Tag': 'person',
  'Description': 'person, group of people, believes, unique arts (eg; food, drink), etc'},
 {'Tag': 'quantity', 'Description': 'numbers, quantity'},
 {'Tag': 'time', 'Description': 'date, day, time, etc'},
 {'Tag': 'event', 'Description': 'unique event happened, etc'}]

List available HuggingFace NER models#

[4]:

malaya.entity.available_huggingface

[4]:

{'mesolitica/ner-t5-tiny-standard-bahasa-cased': {'Size (MB)': 84.7,
  'law': {'precision': 0.9642625081221572,
   'recall': 0.9598965071151359,
   'f1': 0.9620745542949757,
   'number': 1546},
  'person': {'precision': 0.9673319980661648,
   'recall': 0.971424608128728,
   'f1': 0.9693739834584906,
   'number': 14418},
  'time': {'precision': 0.9796992481203007,
   'recall': 0.983148893360161,
   'f1': 0.9814210394175245,
   'number': 3976},
  'location': {'precision': 0.966455899689208,
   'recall': 0.9753406878650227,
   'f1': 0.970877967379017,
   'number': 9246},
  'organization': {'precision': 0.9308265342319971,
   'recall': 0.9475204622051036,
   'f1': 0.9390993140471219,
   'number': 8308},
  'quantity': {'precision': 0.9824689554419284,
   'recall': 0.9853479853479854,
   'f1': 0.9839063643013899,
   'number': 2730},
  'event': {'precision': 0.8535980148883374,
   'recall': 0.8973913043478261,
   'f1': 0.8749470114455278,
   'number': 1150},
  'overall_precision': 0.9585080133195985,
  'overall_recall': 0.9670566055977183,
  'overall_f1': 0.9627633336140621,
  'overall_accuracy': 0.9951433495221682},
 'mesolitica/ner-t5-small-standard-bahasa-cased': {'Size (MB)': 141,
  'law': {'precision': 0.9320327249842668,
   'recall': 0.9579560155239327,
   'f1': 0.9448165869218501,
   'number': 1546},
  'person': {'precision': 0.9745341614906833,
   'recall': 0.9794007490636704,
   'f1': 0.976961394769614,
   'number': 14418},
  'time': {'precision': 0.9583539910758553,
   'recall': 0.9723340040241448,
   'f1': 0.9652933832709114,
   'number': 3976},
  'location': {'precision': 0.9709677419354839,
   'recall': 0.9766385463984426,
   'f1': 0.9737948883856357,
   'number': 9246},
  'organization': {'precision': 0.9493625210488333,
   'recall': 0.9500481463649495,
   'f1': 0.9497052099627,
   'number': 8308},
  'quantity': {'precision': 0.9823008849557522,
   'recall': 0.9758241758241758,
   'f1': 0.9790518191841234,
   'number': 2730},
  'event': {'precision': 0.8669991687448046,
   'recall': 0.9069565217391304,
   'f1': 0.88652783680408,
   'number': 1150},
  'overall_precision': 0.9629220498535133,
  'overall_recall': 0.9691593754531832,
  'overall_f1': 0.9660306446949986,
  'overall_accuracy': 0.9953954840983863}}

[6]:

string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.'
string1 = 'memperkenalkan Husein, dia sangat comel, berumur 25 tahun, bangsa melayu, agama islam, tinggal di cyberjaya malaysia, bercakap bahasa melayu, semua membaca buku undang-undang kewangan, dengar laju Siti Nurhaliza - Seluruh Cinta sambil makan ayam goreng KFC'

Load HuggingFace model#

def huggingface(
    model: str = 'mesolitica/ner-t5-small-standard-bahasa-cased',
    force_check: bool = True,
    **kwargs,
):
    """
    Load a HuggingFace model for Entity Recognition.

    Parameters
    ----------
    model: str, optional (default='mesolitica/ner-t5-small-standard-bahasa-cased')
        Check available models at `malaya.entity.available_huggingface`.
    force_check: bool, optional (default=True)
        Force a check that the model is one of the Malaya models.
        Set to False if you have your own HuggingFace model.
    **kwargs
        Additional keyword arguments.
        NOTE(review): presumably forwarded to the underlying model loader -- confirm.

    Returns
    -------
    result: malaya.torch_model.huggingface.Tagging
    """

[7]:

model = malaya.entity.huggingface()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

[8]:

_ = model.eval()

Predict#

def predict(self, string: str):
    """
    Tag every word of a string with an entity label.

    Parameters
    ----------
    string : str

    Returns
    -------
    result: List[Tuple[str, str]]
        One (word, tag) pair per word, in input order, e.g.
        [('KUALA', 'location'), ('LUMPUR:', 'location'), ('Sempena', 'OTHER')].
    """

[9]:

model.predict(string)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

[9]:

[('KUALA', 'location'),
 ('LUMPUR:', 'location'),
 ('Sempena', 'OTHER'),
 ('sambutan', 'OTHER'),
 ('Aidilfitri', 'event'),
 ('minggu', 'OTHER'),
 ('depan,', 'OTHER'),
 ('Perdana', 'person'),
 ('Menteri', 'person'),
 ('Tun', 'person'),
 ('Dr', 'person'),
 ('Mahathir', 'person'),
 ('Mohamad', 'person'),
 ('dan', 'OTHER'),
 ('Menteri', 'person'),
 ('Pengangkutan', 'person'),
 ('Anthony', 'person'),
 ('Loke', 'person'),
 ('Siew', 'person'),
 ('Fook', 'person'),
 ('menitipkan', 'OTHER'),
 ('pesanan', 'OTHER'),
 ('khas', 'OTHER'),
 ('kepada', 'OTHER'),
 ('orang', 'OTHER'),
 ('ramai', 'OTHER'),
 ('yang', 'OTHER'),
 ('mahu', 'OTHER'),
 ('pulang', 'OTHER'),
 ('ke', 'OTHER'),
 ('kampung', 'OTHER'),
 ('halaman', 'OTHER'),
 ('masing-masing.', 'OTHER'),
 ('Dalam', 'OTHER'),
 ('video', 'OTHER'),
 ('pendek', 'OTHER'),
 ('terbitan', 'OTHER'),
 ('Jabatan', 'organization'),
 ('Keselamatan', 'organization'),
 ('Jalan', 'organization'),
 ('Raya', 'organization'),
 ('(JKJR)', 'organization'),
 ('itu,', 'OTHER'),
 ('Dr', 'person'),
 ('Mahathir', 'person'),
 ('menasihati', 'OTHER'),
 ('mereka', 'OTHER'),
 ('supaya', 'OTHER'),
 ('berhenti', 'OTHER'),
 ('berehat', 'OTHER'),
 ('dan', 'OTHER'),
 ('tidur', 'OTHER'),
 ('sebentar', 'OTHER'),
 ('sekiranya', 'OTHER'),
 ('mengantuk', 'OTHER'),
 ('ketika', 'OTHER'),
 ('memandu.', 'OTHER')]

[10]:

model.predict(string1)

[10]:

[('memperkenalkan', 'OTHER'),
 ('Husein,', 'person'),
 ('dia', 'OTHER'),
 ('sangat', 'OTHER'),
 ('comel,', 'OTHER'),
 ('berumur', 'OTHER'),
 ('25', 'OTHER'),
 ('tahun,', 'OTHER'),
 ('bangsa', 'OTHER'),
 ('melayu,', 'person'),
 ('agama', 'person'),
 ('islam,', 'person'),
 ('tinggal', 'OTHER'),
 ('di', 'OTHER'),
 ('cyberjaya', 'location'),
 ('malaysia,', 'OTHER'),
 ('bercakap', 'OTHER'),
 ('bahasa', 'OTHER'),
 ('melayu,', 'OTHER'),
 ('semua', 'OTHER'),
 ('membaca', 'OTHER'),
 ('buku', 'OTHER'),
 ('undang-undang', 'law'),
 ('kewangan,', 'event'),
 ('dengar', 'OTHER'),
 ('laju', 'OTHER'),
 ('Siti', 'person'),
 ('Nurhaliza', 'person'),
 ('-', 'person'),
 ('Seluruh', 'person'),
 ('Cinta', 'event'),
 ('sambil', 'OTHER'),
 ('makan', 'OTHER'),
 ('ayam', 'OTHER'),
 ('goreng', 'OTHER'),
 ('KFC', 'person')]

Group similar tags#

def analyze(self, string: str):
        """
        Analyze a string, grouping consecutive words that share the same tag.

        Parameters
        ----------
        string : str

        Returns
        -------
        result: List[Dict]
            One dict per group of consecutive same-tag words, e.g.
            [{'text': ['KUALA', 'LUMPUR:'], 'type': 'location', 'score': 1.0, 'beginOffset': 0, 'endOffset': 2}].
            In the example outputs, 'beginOffset' and 'endOffset' are word
            indices (end exclusive), not character positions.
        """

[11]:

model.analyze(string)

[11]:

[{'text': ['KUALA', 'LUMPUR:'],
  'type': 'location',
  'score': 1.0,
  'beginOffset': 0,
  'endOffset': 2},
 {'text': ['Sempena', 'sambutan'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 2,
  'endOffset': 4},
 {'text': ['Aidilfitri'],
  'type': 'event',
  'score': 1.0,
  'beginOffset': 4,
  'endOffset': 5},
 {'text': ['minggu', 'depan,'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 5,
  'endOffset': 7},
 {'text': ['Perdana', 'Menteri', 'Tun', 'Dr', 'Mahathir', 'Mohamad'],
  'type': 'person',
  'score': 1.0,
  'beginOffset': 7,
  'endOffset': 13},
 {'text': ['dan'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 13,
  'endOffset': 14},
 {'text': ['Menteri', 'Pengangkutan', 'Anthony', 'Loke', 'Siew', 'Fook'],
  'type': 'person',
  'score': 1.0,
  'beginOffset': 14,
  'endOffset': 20},
 {'text': ['menitipkan',
   'pesanan',
   'khas',
   'kepada',
   'orang',
   'ramai',
   'yang',
   'mahu',
   'pulang',
   'ke',
   'kampung',
   'halaman',
   'masing-masing.',
   'Dalam',
   'video',
   'pendek',
   'terbitan'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 20,
  'endOffset': 37},
 {'text': ['Jabatan', 'Keselamatan', 'Jalan', 'Raya', '(JKJR)'],
  'type': 'organization',
  'score': 1.0,
  'beginOffset': 37,
  'endOffset': 42},
 {'text': ['itu,'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 42,
  'endOffset': 43},
 {'text': ['Dr', 'Mahathir'],
  'type': 'person',
  'score': 1.0,
  'beginOffset': 43,
  'endOffset': 45},
 {'text': ['menasihati',
   'mereka',
   'supaya',
   'berhenti',
   'berehat',
   'dan',
   'tidur',
   'sebentar',
   'sekiranya',
   'mengantuk',
   'ketika',
   'memandu.'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 45,
  'endOffset': 57}]

[13]:

model.analyze(string1)

[13]:

[{'text': ['memperkenalkan'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 0,
  'endOffset': 1},
 {'text': ['Husein,'],
  'type': 'person',
  'score': 1.0,
  'beginOffset': 1,
  'endOffset': 2},
 {'text': ['dia', 'sangat', 'comel,', 'berumur', '25', 'tahun,', 'bangsa'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 2,
  'endOffset': 9},
 {'text': ['melayu,', 'agama', 'islam,'],
  'type': 'person',
  'score': 1.0,
  'beginOffset': 9,
  'endOffset': 12},
 {'text': ['tinggal', 'di'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 12,
  'endOffset': 14},
 {'text': ['cyberjaya'],
  'type': 'location',
  'score': 1.0,
  'beginOffset': 14,
  'endOffset': 15},
 {'text': ['malaysia,',
   'bercakap',
   'bahasa',
   'melayu,',
   'semua',
   'membaca',
   'buku'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 15,
  'endOffset': 22},
 {'text': ['undang-undang'],
  'type': 'law',
  'score': 1.0,
  'beginOffset': 22,
  'endOffset': 23},
 {'text': ['kewangan,'],
  'type': 'event',
  'score': 1.0,
  'beginOffset': 23,
  'endOffset': 24},
 {'text': ['dengar', 'laju'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 24,
  'endOffset': 26},
 {'text': ['Siti', 'Nurhaliza', '-', 'Seluruh'],
  'type': 'person',
  'score': 1.0,
  'beginOffset': 26,
  'endOffset': 30},
 {'text': ['Cinta'],
  'type': 'event',
  'score': 1.0,
  'beginOffset': 30,
  'endOffset': 31},
 {'text': ['sambil', 'makan', 'ayam', 'goreng'],
  'type': 'OTHER',
  'score': 1.0,
  'beginOffset': 31,
  'endOffset': 35},
 {'text': ['KFC'],
  'type': 'person',
  'score': 1.0,
  'beginOffset': 35,
  'endOffset': 36}]

Load general Malaya entity model#

This model is able to classify:

date
money
temperature
distance
volume
duration
phone
email
url
time
datetime
local and generic foods (you can check the available rules in malaya.texts._food)
local and generic drinks (you can check the available rules in malaya.texts._food)

We can insert BERT or any deep learning model by passing it as malaya.entity.general_entity(model = model), as long as the model has a predict method that returns [(string, label), (string, label)]. This is optional.

[14]:

entity = malaya.entity.general_entity(model = model)

[15]:

entity.predict('Husein baca buku Perlembagaan yang berharga 3k ringgit dekat kfc sungai petani minggu lepas, 2 ptg 2 oktober 2019 , suhu 32 celcius, sambil makan ayam goreng dan milo o ais')

[15]:

{'person': ['Husein', 'milo o ais'],
 'OTHER': ['baca buku',
  'yang berharga',
  'dekat',
  ', suhu',
  'sambil makan ayam goreng dan'],
 'law': ['Perlembagaan'],
 'quantity': ['3k ringgit', '32 celcius,'],
 'location': ['kfc sungai petani'],
 'time': {'minggu lalu, 2 PM 2 oktober 2019': None,
  '2 PM': datetime.datetime(2023, 10, 12, 14, 0)},
 'date': {'2 oktober 2019': datetime.datetime(2019, 10, 2, 0, 0),
  'minggu lalu': datetime.datetime(2023, 10, 5, 15, 37, 26, 457099)},
 'money': {'3k ringgit': 'RM3000.0'},
 'temperature': ['32 celcius'],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'datetime': {'2 ptg 2 oktober 2019': datetime.datetime(2019, 10, 2, 14, 0)},
 'food': ['ayam goreng'],
 'drink': ['milo o ais'],
 'weight': []}

[16]:

entity.predict('contact Husein at husein.zol05@gmail.com')

[16]:

{'OTHER': ['contact', 'at'],
 'person': ['Husein'],
 'organization': ['husein.zol05@gmail.com'],
 'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': ['husein.zol05@gmail.com'],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

[17]:

entity.predict('tolong tempahkan meja makan makan nasi dagang dan jus apple, milo tarik esok dekat Restoran Sebulek')

[17]:

{'OTHER': ['tolong tempahkan meja makan makan nasi dagang dan',
  'tarik esok dekat'],
 'person': ['jus'],
 'event': ['apple, milo'],
 'location': ['Restoran Sebulek'],
 'date': {'esok': datetime.datetime(2023, 10, 13, 15, 37, 33, 353440)},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': ['nasi dagang'],
 'drink': ['milo tarik', 'jus apple'],
 'weight': []}

Voting stack model#

[19]:

tiny = malaya.entity.huggingface(model = 'mesolitica/ner-t5-tiny-standard-bahasa-cased')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

[20]:

malaya.stack.voting_stack([tiny, model, model], string1)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

[20]:

[('memperkenalkan', 'OTHER'),
 ('Husein,', 'person'),
 ('dia', 'OTHER'),
 ('sangat', 'OTHER'),
 ('comel,', 'OTHER'),
 ('berumur', 'OTHER'),
 ('25', 'OTHER'),
 ('tahun,', 'OTHER'),
 ('bangsa', 'OTHER'),
 ('melayu,', 'person'),
 ('agama', 'person'),
 ('islam,', 'person'),
 ('tinggal', 'OTHER'),
 ('di', 'OTHER'),
 ('cyberjaya', 'location'),
 ('malaysia,', 'OTHER'),
 ('bercakap', 'OTHER'),
 ('bahasa', 'OTHER'),
 ('melayu,', 'OTHER'),
 ('semua', 'OTHER'),
 ('membaca', 'OTHER'),
 ('buku', 'OTHER'),
 ('undang-undang', 'law'),
 ('kewangan,', 'event'),
 ('dengar', 'OTHER'),
 ('laju', 'OTHER'),
 ('Siti', 'person'),
 ('Nurhaliza', 'person'),
 ('-', 'person'),
 ('Seluruh', 'person'),
 ('Cinta', 'event'),
 ('sambil', 'OTHER'),
 ('makan', 'OTHER'),
 ('ayam', 'OTHER'),
 ('goreng', 'OTHER'),
 ('KFC', 'person')]

[ ]: