LDA2Vec#

This tutorial is available as an IPython notebook at Malaya/example/topic-modeling-lda2vec.

[1]:
import os

# Hide every CUDA device from TensorFlow so training runs on CPU only.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})
[2]:
import pandas as pd
import malaya
[3]:
# Load the stemmed dataset, drop the first 3 rows and the first column,
# then keep only the text/label pair.
df = pd.read_csv('tests/02032018.csv', sep=';').iloc[3:, 1:]
df.columns = ['text', 'label']
corpus = df['text'].tolist()

You can get this file from https://github.com/huseinzol05/malaya/blob/master/tests/02032018.csv. This CSV is already stemmed.

Load vectorizer object#

You can use TfidfVectorizer, CountVectorizer, or any other vectorizer, as long as it has a fit_transform method.

[4]:
from malaya.text.vectorizer import SkipGramCountVectorizer

# Malaya ships its own Malay stopword list; pass it to the vectorizer
# so high-frequency function words are excluded from the vocabulary.
stopwords = malaya.text.function.get_stopwords()
vectorizer = SkipGramCountVectorizer(
    max_df=0.95,
    min_df=1,
    ngram_range=(1, 3),
    stop_words=stopwords,
    skip=2,
)

Train LDA2Vec model#

def fit(
    corpus: List[str],
    vectorizer,
    n_topics: int = 10,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 1000,
    random_state: int = 10,
    **kwargs,
):
    """
    Train a LDA2Vec model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    vectorizer : object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int, (default=10)
        size of decomposition column.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str]
    window_size: int, (default=2)
        context window size for the skip-gram component.
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        training iteration, how many loop need to train.
    switch_loss: int, (default=1000)
        baseline to switch from document based loss to document + word based loss.
    random_state: int, (default=10)
        `random_state` for sklearn.utils.shuffle parameter

    Returns
    -------
    result: malaya.topic_model.lda2vec.DeepTopic class
    """
[5]:
# Train for 5 epochs; switch_loss=5000 delays the switch from the
# document-only loss to the combined document + word loss.
lda2vec = malaya.topic_model.lda2vec.fit(
    corpus,
    vectorizer,
    n_topics=10,
    switch_loss=5000,
    epoch=5,
)
Train LDA2Vec will disable eager execution.
2022-11-11 21:32:39.526392: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-11 21:32:39.530994: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-11-11 21:32:39.531013: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: husein-MS-7D31
2022-11-11 21:32:39.531017: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: husein-MS-7D31
2022-11-11 21:32:39.531087: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program
2022-11-11 21:32:39.531106: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.141.3
minibatch loop: 100%|███| 2187/2187 [00:35<00:00, 61.37it/s, cost=22.5, epoch=1]
minibatch loop: 100%|███| 2187/2187 [00:35<00:00, 61.92it/s, cost=9.86, epoch=2]
minibatch loop: 100%|███| 2187/2187 [00:34<00:00, 62.76it/s, cost=8.26, epoch=3]
minibatch loop: 100%|███| 2187/2187 [00:34<00:00, 63.02it/s, cost=7.16, epoch=4]
minibatch loop: 100%|███| 2187/2187 [00:34<00:00, 63.33it/s, cost=3.69, epoch=5]

Get topics#

def top_topics(
    self, len_topic: int, top_n: int = 10, return_df: bool = True
):
    """
    Return important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.
    top_n: int, optional (default=10)
        top n of each topic.
    return_df: bool, optional (default=True)
        return as pandas.DataFrame, else JSON.

    Returns
    -------
    result: pandas.DataFrame when `return_df` is True, else a JSON-style
        structure — presumably a dict; confirm against the implementation.
    """
[6]:
# Show the 10 most important words for each of the first 5 topics as a DataFrame.
lda2vec.top_topics(5, top_n = 10, return_df = True)
[6]:
topic 0 topic 1 topic 2 topic 3 topic 4
0 rakyat kononnya fokus pilihan kononnya fokus pilihan suci baiknya kepentingan malaysia pembangunannya negara
1 ros paul seng menteri suci baiknya kepentingan ambil ambil keputusan konvensional ancaman konvensional
2 undi cakap mesyuarat jemaah kena cakap kononnya fokus pilihan pertimbangan membentuk
3 agenda yahudi tular kali melayu berkaitan pergerakan akar paul seng menteri sumber pembacaan rasakan
4 punca seng menteri jabatan stres tinggalkan benda kedudukan pengurusan harapan harapan menyelamatkan
5 rendah kedudukan pengurusan kuok kuok pilihan stres tinggalkan benda berperang
6 kebimbangan menyertai suci baiknya kepentingan ijangka pencemaran mdb salah hilang ruang lingkup
7 menemukan pesawat hilang lupa asli wang keluarga putihnya sesetengah peralatan kebergantungan kerajaan gagal
8 baiknya stres tinggalkan strategi dasar destinasi dasar dilaksanakan kena tanah pemilik
9 beritahu punca tempoh hutang stres tinggalkan kuok kuok pilihan sabah perkongsian idea

Important sentences based on topics#

def get_sentences(self, len_sentence: int, k: int = 0):
    """
    Return important sentences related to selected column based on decomposition.

    Parameters
    ----------
    len_sentence: int
        number of sentences to return.
    k: int, (default=0)
        index of decomposition matrix.

    Returns
    -------
    result: List[str]
    """
[7]:
# Pull the 5 sentences most related to topic 0 (k defaults to 0).
lda2vec.get_sentences(5)
[7]:
['menolak sebarang percubaan merosak memusnahkan tanah suci menjaga baiknya tumpuan kepentingan islam',
 'malaysia gala berpunca tindakan diambil',
 'mendakwa anti muslim anti islam merancang cina memerintah malaysia',
 'memalukan kegagalan jakoa mewakili sebahagian masyarakat asli',
 'cabaran gabungan media halangan membantu']

Get topics as string#

def get_topics(self, len_topic: int):
    """
    Return important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.

    Returns
    -------
    result: List[Tuple[int, str]]
        Each topic is a (topic index, space-joined keyword string) pair.
    """
[9]:
# Summarize all 10 trained topics as (index, joined-keywords) pairs.
lda2vec.get_topics(10)
[9]:
[(0,
  'rakyat ros undi agenda yahudi tular punca rendah kebimbangan menyertai menemukan pesawat hilang baiknya beritahu punca'),
 (1,
  'kononnya fokus pilihan paul seng menteri cakap mesyuarat jemaah kali melayu seng menteri jabatan kedudukan pengurusan suci baiknya kepentingan lupa asli stres tinggalkan tempoh hutang'),
 (2,
  'kononnya fokus pilihan suci baiknya kepentingan kena cakap berkaitan pergerakan akar stres tinggalkan benda kuok kuok pilihan ijangka pencemaran wang keluarga putihnya strategi dasar destinasi stres tinggalkan'),
 (3,
  'suci baiknya kepentingan ambil ambil keputusan kononnya fokus pilihan paul seng menteri kedudukan pengurusan stres tinggalkan benda mdb salah hilang sesetengah peralatan kebergantungan dasar dilaksanakan kena kuok kuok pilihan'),
 (4,
  'malaysia pembangunannya negara konvensional ancaman konvensional pertimbangan membentuk sumber pembacaan rasakan harapan harapan menyelamatkan berperang ruang lingkup kerajaan gagal tanah pemilik sabah perkongsian idea'),
 (5,
  'rakyat agenda yahudi tular yahudi ros rendah mab dipersetujui keluarga agenda baiknya tular undi beritahu punca terpulang'),
 (6,
  'rakyat ros agenda yahudi tular berita palsu beritahu punca menemukan pesawat hilang rendah kebimbangan menyertai pas beritahu punca wang'),
 (7,
  'mempromosi kemudahan negara kononnya fokus pilihan suci baiknya kepentingan cakap mesyuarat jemaah paul seng menteri keberkesanan ambil ambil keputusan mdb salah hilang low seng jerman'),
 (8,
  'rakyat ros pas releven parti menemukan pesawat hilang yahudi baiknya wujud kebimbangan beritahu punca perasaan baiknya undi'),
 (9,
  'keberkesanan kononnya fokus pilihan paul seng menteri sesetengah peralatan kebergantungan wang hitam putihnya ambil ambil keputusan penangguhan perjalanan selaras aziz nampak keberkesanan mempromosi kemudahan negara press cakap berlaku')]
[ ]: