Source code for malaya.language_detection

import pickle
from malaya.function import (
from import LanguageDetection
from import DeepLang
from malaya.model.rules import LanguageDict
from malaya.text.bpe import YTTMEncoder
from malaya.path import (
from herpetologist import check_type
from malaya.function import describe_availability
import logging

logger = logging.getLogger(__name__)

# Integer index -> language label. This mapping is handed to the model
# wrappers returned by the loaders in this module as their label table.
lang_labels = {
    0: 'eng',
    1: 'ind',
    2: 'malay',
    3: 'manglish',
    4: 'other',
    5: 'rojak',
}

# Label strings only, in the insertion order of `lang_labels`.
label = list(lang_labels.values())

@check_type
def fasttext(quantized: bool = True, **kwargs):
    """
    Load Fasttext language detection model.
    Original size is 353MB, Quantized size 31.1MB.

    Parameters
    ----------
    quantized: bool, optional (default=True)
        if True, load quantized fasttext model. Else, load original fasttext model.
    **kwargs:
        forwarded unchanged to `check_file`.

    Returns
    -------
    result : LanguageDetection
        wrapper built from the loaded fasttext model and `lang_labels`.

    Raises
    ------
    ModuleNotFoundError
        if the `fasttext` package cannot be imported.
    Exception
        if the fasttext model file cannot be loaded.
    """
    try:
        # Imported lazily so the package is only required when this loader is used.
        import fasttext
    except ImportError as e:
        # Only import failures mean "not installed"; KeyboardInterrupt and
        # SystemExit must propagate untouched.
        raise ModuleNotFoundError(
            'fasttext not installed. Please install it by `pip install fasttext` and try again.'
        ) from e

    model = 'fasttext-quantized' if quantized else 'fasttext-original'
    path = check_file(
        PATH_LANG_DETECTION[model], S3_PATH_LANG_DETECTION[model], **kwargs
    )

    try:
        model_fasttext = fasttext.load_model(path['model'])
    except Exception as e:
        # Chain the original error so the real cause stays visible in the traceback.
        raise Exception(
            'failed to load fasttext model, please try clear the cache and try again'
        ) from e

    return LanguageDetection(model_fasttext, lang_labels)
@check_type
def deep_model(quantized: bool = False, **kwargs):
    """
    Load deep learning language detection model.
    Original size is 51.2MB, Quantized size 12.8MB.

    Parameters
    ----------
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model is not necessarily faster, it totally depends on the machine.
    **kwargs:
        forwarded unchanged to `check_file`, `load_graph` and `generate_session`.

    Returns
    -------
    result : DeepLang
        wrapper holding the graph's input/output nodes, a session, the
        unpickled vectorizer, the BPE encoder and `lang_labels`.
    """
    path = check_file(
        file='lang-32',
        module='language-detection',
        keys={
            'model': 'model.pb',
            'vector': LANGUAGE_DETECTION_BOW,
            'bpe': LANGUAGE_DETECTION_VOCAB,
        },
        quantized=quantized,
        **kwargs,
    )
    g = load_graph(path['model'], **kwargs)
    bpe = YTTMEncoder(vocab_file=path['bpe'])

    # NOTE(review): pickle.load can execute arbitrary code from the file it
    # reads. The file comes from `check_file`; confirm that source is trusted.
    with open(path['vector'], 'rb') as fopen:
        vector = pickle.load(fopen)

    # Names of the graph nodes resolved by `nodes_session`.
    inputs = [
        'X_Placeholder/shape',
        'X_Placeholder/values',
        'X_Placeholder/indices',
        'W_Placeholder/shape',
        'W_Placeholder/values',
        'W_Placeholder/indices',
    ]
    outputs = ['logits']
    input_nodes, output_nodes = nodes_session(g, inputs, outputs)

    return DeepLang(
        input_nodes=input_nodes,
        output_nodes=output_nodes,
        sess=generate_session(graph=g, **kwargs),
        vectorizer=vector,
        bpe=bpe,
        label=lang_labels,
    )
def substring_rules(model, **kwargs):
    """
    Detect EN, MS and OTHER languages in a string.

    EN word detection uses `pyenchant` together with the user language detection model.
    MS word detection uses `malaya.text.function.is_malay` together with the user language detection model.
    OTHER word detection uses any language detection classification model, such as
    `malaya.language_detection.fasttext` or `malaya.language_detection.deep_model`.

    Parameters
    ----------
    model : Callable
        Callable model, must have `predict` method.
    **kwargs:
        forwarded unchanged to `LanguageDict`.

    Returns
    -------
    result : malaya.model.rules.LanguageDict class

    Raises
    ------
    ValueError
        if `model` has no `predict` attribute.
    """
    # Validate up front so a wrong object fails here rather than inside LanguageDict.
    if not hasattr(model, 'predict'):
        raise ValueError('model must have `predict` method')
    return LanguageDict(model=model, **kwargs)