Source code for malaya.dictionary

from malaya.dictionary.english.words import words as _english_words
from malaya.dictionary.bahasa.words import words as _malay_words
from malaya.dictionary.bahasa.cambridge_words import words as _cambridge_malay_words
from malaya.dictionary.bahasa.kamus_dewan import words as _kamus_dewan_words
from malaya.dictionary.bahasa.dbp import words as _dbp_words
from malaya.dictionary.bahasa.negeri import negeri
from malaya.dictionary.bahasa.city import city
from malaya.dictionary.bahasa.country import country
from malaya.dictionary.bahasa.places import places
from malaya.dictionary.bahasa.daerah import daerah
from malaya.dictionary.bahasa.parlimen import parlimen
from malaya.dictionary.bahasa.adun import adun
from malaya.dictionary.mandarin.pinyin import pinyin_dict
from malaya.text.rules import rules_normalizer
from typing import List

# English vocabulary: every imported word longer than two characters, plus a
# small allow-list of two-letter words that are kept explicitly.
ENGLISH_WORDS = {'me', 'on', 'of'}.union(
    token for token in _english_words if len(token) > 2
)

# Public aliases for the imported Malay vocabularies.
MALAY_WORDS = _malay_words
CAMBRIDGE_MALAY_WORDS = _cambridge_malay_words
KAMUS_DEWAN_WORDS = _kamus_dewan_words
DBP_WORDS = _dbp_words

# `requests` is an optional dependency: record whether it could be imported and,
# if so, prepare the HTTP headers sent by `corpus_dbp`.
try:
    import requests
    from requests.structures import CaseInsensitiveDict

    headers_corpus = CaseInsensitiveDict({
        "Accept": "*/*",
        "Accept-Language": "en-MY,en;q=0.9,en-US;q=0.8,ms;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        # NOTE(review): hard-coded browser session cookie; presumably captured
        # from a real session and may go stale -- confirm it is still required.
        "Cookie": "__ssds=3; __ssuzjsr3=a9be0cd8e; __uzmaj3=f3203f85-ceb6-46a0-a558-b1e11a637ecd; __uzmbj3=1661748827; ASP.NET_SessionId=vgtl1vyxxybw4m3rpkpvatmg; _gid=GA1.3.1764105897.1661921894; _ga=GA1.1.1010277242.1661748827; __uzmcj3=1743010683960; __uzmdj3=1661934381; _ga_50X5PQW96W=GS1.1.1661934348.9.1.1661934385.0.0.0",
        "Origin": "http://sbmb.dbp.gov.my",
        "Pragma": "no-cache",
        "Referer": "http://sbmb.dbp.gov.my/korpusdbp/Search2.aspx",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "X-MicrosoftAjax": "Delta=true",
        "X-Requested-With": "XMLHttpRequest",
    })
except Exception:
    # Best-effort import: any failure simply marks `requests` as unavailable.
    available_requests = False
else:
    available_requests = True

# `beautifulsoup4` is an optional dependency: record whether it could be imported.
try:
    from bs4 import BeautifulSoup
except Exception:
    # Best-effort import: any failure simply marks bs4 as unavailable.
    available_bs4 = False
else:
    available_bs4 = True

# Shared WiktionaryParser instance; created on first use by `keyword_wiktionary`.
parser = None


def check_requests():
    """
    Ensure the optional `requests` dependency was imported at module load.

    Raises
    ------
    ModuleNotFoundError
        if the module-level flag `available_requests` is falsy.
    """
    if available_requests:
        return
    raise ModuleNotFoundError(
        'requests not installed. Please install it by `pip3 install requests` and try again.'
    )


def check_bs4():
    """
    Ensure the optional `beautifulsoup4` dependency was imported at module load.

    Raises
    ------
    ModuleNotFoundError
        if the module-level flag `available_bs4` is falsy.
    """
    if available_bs4:
        return
    raise ModuleNotFoundError(
        'beautifulsoup4 not installed. Please install it by `pip3 install beautifulsoup4` and try again.'
    )


def keyword_wiktionary(
    word: str,
    acceptable_lang: List[str] = ['brunei malay', 'malay'],
):
    """
    crawl https://en.wiktionary.org/wiki/ to check a word is a malay word.

    Parameters
    ----------
    word: str
    acceptable_lang: List[str], optional (default=['brunei malay', 'malay'])
        acceptable languages in wiktionary section.

    Returns
    -------
    result: Dict
        maps each language in `acceptable_lang` to whatever
        `WiktionaryParser.get_word_data` returns for that language.

    Raises
    ------
    ModuleNotFoundError
        if `wiktionaryparser` or `beautifulsoup4` is not installed.
    """
    global parser

    # Only a failed import means "not installed"; catching BaseException would
    # also swallow KeyboardInterrupt / SystemExit raised during the import.
    try:
        from wiktionaryparser import WiktionaryParser
    except ImportError as e:
        raise ModuleNotFoundError(
            'wiktionaryparser not installed. Please install it by `pip3 install wiktionaryparser` and try again.'
        ) from e

    # BeautifulSoup is used directly below, so fail with the helpful message
    # instead of a NameError when bs4 is missing.
    check_bs4()

    # Reuse a single parser (and its HTTP session) across calls.
    if parser is None:
        parser = WiktionaryParser()

    response = parser.session.get(parser.url.format(word), params={'oldid': None})
    # Drop newlines between adjacent tags before handing the page to the parser.
    parser.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
    parser.current_word = word
    parser.clean_html()

    # `acceptable_lang` is only iterated, never mutated, so the shared default
    # list is safe.
    return {lang: parser.get_word_data(lang) for lang in acceptable_lang}
def keyword_dbp(word: str, parse: bool = False):
    """
    crawl https://prpm.dbp.gov.my/cari1?keyword= to check a word is a malay word.

    Parameters
    ----------
    word: str
    parse: bool, optional (default=False)
        if True, will parse using BeautifulSoup.

    Returns
    -------
    result: Dict or bool
        * `parse=False`: True if the response body contains `Definisi :`,
          else False.
        * `parse=True`: `{'definisi': List[str], 'tesaurus': List[str] or None}`
          when at least one definition pane is found, else False.

    Raises
    ------
    ModuleNotFoundError
        if `requests` is not installed, or `beautifulsoup4` is not installed
        and `parse=True`.
    """
    check_requests()

    # Let requests build the query string so spaces, `&`, `#` and non-ASCII
    # characters in `word` are encoded instead of corrupting the URL.
    r = requests.get('https://prpm.dbp.gov.my/Cari1', params={'keyword': word})

    if not parse:
        return b'Definisi :' in r.content

    check_bs4()
    # Name the parser explicitly: avoids bs4's "no parser specified" warning and
    # keeps results independent of which optional parsers are installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    definitions = [d.text for d in soup.select("div[class*=tab-pane]")]
    if not definitions:
        return False

    tesaurus = None
    spans = soup.find_all('span', id='MainContent_SearchInfoTesaurus_lblTesaurus')
    if spans:
        # Keep the last cell that carries the modal-dialog link.
        selected_td = None
        for td in spans[0].find_all('td'):
            if 'javascript:showModalDialog' in str(td):
                selected_td = td
        if selected_td is not None:
            tesaurus = [a.text for a in selected_td.find_all('a')]

    return {'definisi': definitions, 'tesaurus': tesaurus}
def corpus_dbp(word):
    """
    crawl http://sbmb.dbp.gov.my/korpusdbp/Search2.aspx to search corpus based on a word.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: pandas.core.frame.DataFrame
        concordance rows with columns
        `['No', 'Konteks Kiri', 'Kata', 'Konteks Kanan', 'Maklumat Artikel/Bahan']`.
        Returns False instead when the result table is not present in the response.

    Raises
    ------
    ModuleNotFoundError
        if `requests` or `beautifulsoup4` is not installed.
    """
    from io import StringIO
    from urllib.parse import quote_plus

    import pandas as pd

    check_requests()
    check_bs4()

    url = 'http://sbmb.dbp.gov.my/korpusdbp/Search2.aspx'
    # Pre-encoded ASP.NET form body. Only the search term varies; it is
    # form-encoded so spaces, `&`, `=` and non-ASCII characters cannot corrupt
    # the payload. Adjacent literals concatenate with no separator.
    data = (
        f"ctl00%24ContentPlaceHolder1%24txtKata={quote_plus(str(word))}"
        "&ctl00%24RadScriptManager1=ctl00%24ContentPlaceHolder1%24ctl00%24ContentPlaceHolder1%24pnlContentPanel%7Cctl00%24ContentPlaceHolder1%24cmdSearch"
        "&RadScriptManager1_TSM=%3B%3BTelerik.Web.UI%2C%20Version%3D2014.3.1024.45%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D121fae78165ba3d4%3Aen-US%3A45fa33f7-195b-4d8e-a1d1-d5955cf24e2c%3A16e4e7cd%3Aed16cbdc%3A365331c3%3A24ee1bba%3A92fe8ea0%3Af46195d3%3Afa31b949%3A874f8ea2%3Ac128760b%3A19620875%3A490a9d4e%3A88144a7a"
        "&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24cmdSearch"
        "&__EVENTARGUMENT="
        "&__VIEWSTATE=%2FwEPDwUKLTMwMzk1OTI4MA9kFgJmD2QWAgIDD2QWAgIDD2QWBgIDD2QWBAIBD2QWAgICDw8WAh4RVXNlU3VibWl0QmVoYXZpb3JoZGQCAw8PFgIeB1Zpc2libGVoZBYEAgEPFCsAAhQrAAJkEBYCZgIBFgIUKwACZGQUKwACZGQPFgJmZhYBBW9UZWxlcmlrLldlYi5VSS5SYWRUYWIsIFRlbGVyaWsuV2ViLlVJLCBWZXJzaW9uPTIwMTQuMy4xMDI0LjQ1LCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPTEyMWZhZTc4MTY1YmEzZDRkZAIDDxQrAAJkFQIKS29ua29yZGFucwhLb2xva2FzaRYEZg9kFgQCAQ8PFgIfAGhkZAIDDzwrAA4CABQrAAIPFgIeElJlc29sdmVkUmVuZGVyTW9kZQspc1RlbGVyaWsuV2ViLlVJLlJlbmRlck1vZGUsIFRlbGVyaWsuV2ViLlVJLCBWZXJzaW9uPTIwMTQuMy4xMDI0LjQ1LCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPTEyMWZhZTc4MTY1YmEzZDQBZBcAARYCFgsPAgUUKwAFZDwrAAUBBAULS29udGVrc0tpcmlkPCsABQEEBQxLb250ZWtzS2FuYW48KwAFAQQFDU1ha2x1bWF0QmFoYW5kZRQrAAALKXpUZWxlcmlrLldlYi5VSS5HcmlkQ2hpbGRMb2FkTW9kZSwgVGVsZXJpay5XZWIuVUksIFZlcnNpb249MjAxNC4zLjEwMjQuNDUsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49MTIxZmFlNzgxNjViYTNkNAE8KwAHAAspdVRlbGVyaWsuV2ViLlVJLkdyaWRFZGl0TW9kZSwgVGVsZXJpay5XZWIuVUksIFZlcnNpb249MjAxNC4zLjEwMjQuNDUsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49MTIxZmFlNzgxNjViYTNkNAFkZGRkZmQCAQ9kFgYCAQ8UKwACZBAWAmYCARYCFCsAAm"
        "RkFCsAAmRkDxYCZmYWAQV5VGVsZXJpay5XZWIuVUkuUmFkVG9vbEJhckJ1dHRvbiwgVGVsZXJpay5XZWIuVUksIFZlcnNpb249MjAxNC4zLjEwMjQuNDUsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49MTIxZmFlNzgxNjViYTNkNGQCAw9kFgoCAQ8PZBYCHgVzdHlsZQURdGV4dC1hbGlnbjpyaWdodDtkAgcPEGQQFQgHUGVydGFtYQRLZS0yBEtlLTMES2UtNARLZS01BEtlLTYES2UtNwRLZS04FQgBMQEyATMBNAE1ATYBNwE4FCsDCGdnZ2dnZ2dnFgFmZAIJDxBkEBUIB1BlcnRhbWEES2UtMgRLZS0zBEtlLTQES2UtNQRLZS02BEtlLTcES2UtOBUIATEBMgEzATQBNQE2ATcBOBQrAwhnZ2dnZ2dnZxYBZmQCCw8PFgIfAGhkZAIPD2QWAmYPZBYCZg9kFgICAQ88KwAOAgAUKwACDxYCHwILKwQBZBcAARYCFgsPAgUUKwAFZDwrAAUBBAULS29udGVrc0tpcmlkPCsABQEEBQxLb250ZWtzS2FuYW48KwAFAQQFDU1ha2x1bWF0QmFoYW5kZRQrAAALKwUBPCsABwALKwYBZGRkZGZkAgUPZBYCAgEPPCsADgIAFCsAAg8WAh8CCysEAWQXAAEWAhYLZGRlFCsAAAsrBQE8KwAHAAsrBgFkZGRkZmQCBQ8UKwADDxYIHhdFbmFibGVBamF4U2tpblJlbmRlcmluZ2geHEVuYWJsZUVtYmVkZGVkQmFzZVN0eWxlc2hlZXRnHwILKwQBHhVFbmFibGVFbWJlZGRlZFNjcmlwdHNnZGRkZAIHDw8WAh8EaGRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBS5jdGwwMCRDb250ZW50UGxhY2VIb2xkZXIxJHJ3QXN5bmNQcm9jZXNzV2luZG93li5%2BC%2BdmzAEF6P6astcpGGrBT5A%3D"
        "&__VIEWSTATEGENERATOR=93ABCA44"
        "&__EVENTVALIDATION=%2FwEdAAORbmMsdI9RgJFn3DgB0hnlDMG9drBKQHQbvep7utWD7%2F3VBRkrGvjMj%2Bf9fGNUz2vF5aj38Uz5QuvKWes0idadGtr%2BCg%3D%3D"
        "&ctl00_ContentPlaceHolder1_rwAsyncProcessWindow_ClientState="
        "&__ASYNCPOST=true"
        "&RadAJAXControlID=ctl00_ContentPlaceHolder1_RadAjaxManager1"
    )

    r = requests.post(url, headers=headers_corpus, data=data)
    # Name the parser explicitly: avoids bs4's "no parser specified" warning and
    # keeps results independent of which optional parsers are installed.
    soup = BeautifulSoup(r.content, 'html.parser')
    table = soup.find_all('table', id='ctl00_ContentPlaceHolder1_rgKonkordan_ctl00')
    if not table:
        return False

    # pandas deprecated passing literal HTML strings to read_html; wrap the
    # markup in a file-like object instead.
    frames = pd.read_html(StringIO(str(table[0])))
    if not frames:
        return False

    # Drop the first row and the last three rows of the parsed table.
    # NOTE(review): presumably header / pager rows of the grid -- confirm.
    df = frames[0].iloc[1:-3]
    df.columns = ['No', 'Konteks Kiri', 'Kata', 'Konteks Kanan', 'Maklumat Artikel/Bahan']
    return df
def is_english(word):
    """
    Check a word is an english word.

    A word counts as english when it is in `ENGLISH_WORDS`, or when it is
    longer than one character, ends with `s`, and the word without that final
    character is in `ENGLISH_WORDS`.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: bool
    """
    if word in ENGLISH_WORDS:
        return True
    # Otherwise accept a trailing-`s` form of a known word.
    return len(word) > 1 and word[-1] in 's' and word[:-1] in ENGLISH_WORDS
def is_malaysia_location(string):
    """
    Check a string is found in one of the bundled location vocabularies.

    The lowercased string is looked up in `negeri`; its title-cased form is
    looked up in `city`, `country`, `daerah`, `parlimen` and `adun`.

    Parameters
    ----------
    string: str

    Returns
    -------
    result: bool
    """
    lowered = string.lower()
    if lowered in negeri:
        return True
    titled = lowered.title()
    return any(
        titled in names
        for names in (city, country, daerah, parlimen, adun)
    )
def is_malay(word, stemmer=None):
    """
    Check a word is a malay word.

    The word is accepted when its lowercase form is in `rules_normalizer` and
    is not an english word. Otherwise it is optionally stemmed, re-checked the
    same way, and finally looked up (as-is, without lowercasing) in the Malay
    vocabularies.

    Parameters
    ----------
    word: str
    stemmer: Callable, optional (default=None)
        a Callable object, must have `stem_word` method.

    Returns
    -------
    result: bool

    Raises
    ------
    ValueError
        if `stemmer` is given but has no `stem_word` attribute.
    """

    def _in_normalizer_rules(token):
        # Lowercased token has a normalizer rule and is not an english word.
        lowered = token.lower()
        return lowered in rules_normalizer and not is_english(lowered)

    if _in_normalizer_rules(word):
        return True

    if stemmer is not None:
        if not hasattr(stemmer, 'stem_word'):
            raise ValueError('stemmer must have `stem_word` method')
        word = stemmer.stem_word(word)
        if _in_normalizer_rules(word):
            return True

    vocabularies = (
        MALAY_WORDS,
        CAMBRIDGE_MALAY_WORDS,
        KAMUS_DEWAN_WORDS,
        DBP_WORDS,
    )
    return any(word in vocabulary for vocabulary in vocabularies)
def convert_pinyin(string):
    """
    Convert mandarin characters to pinyin form.
    Original vocab from https://github.com/lxyu/pinyin

    `你好` -> `ni hao`

    Each character is looked up in `pinyin_dict` by its code point written as
    uppercase hexadecimal; characters without an entry are kept unchanged. The
    per-character results are joined with single spaces.

    Parameters
    ----------
    string: str

    Returns
    -------
    result: str
    """
    return ' '.join(
        pinyin_dict.get(format(ord(ch), 'X'), ch)
        for ch in string
    )