Malay#

This tutorial is available as an IPython notebook at Malaya/example/dictionary-malay.

requirements#

Make sure you have already installed the required packages:

pip3 install requests beautifulsoup4
[2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
[3]:
import malaya
/home/ubuntu/dev/malaya/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3361
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/ubuntu/dev/malaya/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3879
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))

DBP#

This function queries https://prpm.dbp.gov.my/cari1?keyword= with the given word:

def keyword_dbp(word, parse: bool = False):
    """
    Crawl https://prpm.dbp.gov.my/cari1?keyword= to check whether a word is a Malay word.

    Parameters
    ----------
    word: str
        the word to look up.
    parse: bool, optional (default=False)
        if True, will parse the crawled result using BeautifulSoup.

    Returns
    -------
    result: bool or Dict
        As shown by the examples in this tutorial: with `parse=False` the result is
        True for a word that is found and False for one that is not; with `parse=True`
        the result is a dict with `definisi` and `tesaurus` keys for a word that is
        found, and False for one that is not.
    """
[4]:
malaya.dictionary.keyword_dbp('ayam')
[4]:
True
[6]:
malaya.dictionary.keyword_dbp('ayamaaaaa')
[6]:
False
[5]:
malaya.dictionary.keyword_dbp('ayam', parse = True)
[5]:
{'definisi': ['Definisi : sj ikan; ~ hutan a) Euxiphippops sextriatus; b) Pomacanthus annularis; ~ laut, Abalistes spp.\xa0(Kamus Dewan Edisi Keempat)',
  'Definisi : beberapa jenis binatang (yg bentuk tubuhnya seakan-akan burung tetapi tidak pandai terbang) yg biasanya dipelihara, Gallus gallus. ~ belanda sj ayam yg besar, Meleagris gallopavo. ~ beroga (denak, hutan) sj ayam liar, Gallus bankiva. ~ biring ayam jantan yg kuning kakinya. ~ bulu balik ayam yg bulunya terbalik. ~ dara ayam betina yg hampir bertelur. ~ katik ayam yg kecil. ~ percik ayam panggang yg disaluti sos atau kuah yg dibuat drpd santan dan rempah-ratus. ~ sabung ayam yg dipelihara utk disabung. ~ serama sj ayam peliharaan yg kecil, jinak, berbulu cantik dan berkaki pendek. ~ tambatan ki orang yg dianggap hebat dan diharapkan dpt membawa kemenangan dlm sesuatu perlawanan, mis bola sepak dan bola jaring.\xa0(Kamus Pelajar Edisi Kedua)'],
 'tesaurus': None}
[7]:
malaya.dictionary.keyword_dbp('ayamaaaaa', parse = True)
[7]:
False

Wiktionary#

This function queries https://en.wiktionary.org/wiki/ with the given word:

def keyword_wiktionary(
    word,
    acceptable_lang: List[str] = ['brunei malay', 'malay'],
):
    """
    Crawl https://en.wiktionary.org/wiki/ to check whether a word is a Malay word.

    Parameters
    ----------
    word: str
        the word to look up.
    acceptable_lang: List[str], optional (default=['brunei malay', 'malay'])
        acceptable languages in wiktionary section.
        NOTE(review): the default is a mutable list shared across calls; confirm the
        implementation never mutates it.

    Returns
    -------
    result: Dict
        As shown by the examples in this tutorial: a dict keyed by language name,
        each value being a list of entries with `etymology`, `definitions` and
        `pronunciations` keys. For a word that is not found, the entries are still
        present but their fields are empty.
    """
[8]:
malaya.dictionary.keyword_wiktionary('ayam')
[8]:
{'brunei malay': [{'etymology': 'From Proto-Malayic *hayam, from Proto-Malayo-Polynesian *qayam.\n',
   'definitions': [{'partOfSpeech': 'noun',
     'text': ['ayam', 'chicken (bird)', 'chicken (meat)'],
     'relatedWords': [],
     'examples': []}],
   'pronunciations': {'text': ['IPA: /ajam/',
     '(Kedayan) IPA: /hajam/',
     'Hyphenation: a‧yam'],
    'audio': []}}],
 'malay': [{'etymology': 'From hayam, from Proto-Malayic *hayam, from Proto-Malayo-Polynesian *qayam.\n',
   'definitions': [{'partOfSpeech': 'noun',
     'text': ['ayam (Jawi spelling ايم\u200e, plural ayam-ayam, informal 1st possessive ayamku, 2nd possessive ayammu, 3rd possessive ayamnya)',
      'chicken (bird)',
      'chicken (meat)'],
     'relatedWords': [{'relationshipType': 'synonyms',
       'words': ['manuk / مانوق\u200e']}],
     'examples': []}],
   'pronunciations': {'text': ['IPA: /ajam/', 'Rhymes: -ajam, -jam, -am'],
    'audio': []}}]}
[9]:
malaya.dictionary.keyword_wiktionary('ayamaaaa')
[9]:
{'brunei malay': [{'etymology': '',
   'definitions': [],
   'pronunciations': {'text': [], 'audio': []}}],
 'malay': [{'etymology': '',
   'definitions': [],
   'pronunciations': {'text': [], 'audio': []}}]}

Check whether a word is a Malay word#

def is_malay(word, stemmer=None):
    """
    Check whether a word is a Malay word.

    Parameters
    ----------
    word: str
        the word to check.
    stemmer: Callable, optional (default=None)
        a Callable object, must have `stem_word` method.
        Presumably used to reduce the word to its root before the check (TODO confirm);
        in this tutorial 'tersakitkan' gives False without a stemmer and True with
        `malaya.stem.sastrawi()`.

    Returns
    -------
    result: bool
        True if the word is considered a Malay word, False otherwise.
    """
[10]:
malaya.dictionary.is_malay('ayam')
[10]:
True
[13]:
malaya.dictionary.is_malay('sakitkan')
[13]:
True
[12]:
malaya.dictionary.is_malay('tersakitkan')
[12]:
False
[14]:
stemmer = malaya.stem.sastrawi()
[15]:
malaya.dictionary.is_malay('tersakitkan', stemmer = stemmer)
[15]:
True