Topic Modeling

This tutorial is available as an IPython notebook at Malaya/example/topic-modeling.

[1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
[2]:
import pandas as pd
import malaya
/Users/huseinzolkepli/Documents/Malaya/malaya/preprocessing.py:259: FutureWarning: Possible nested set at position 2289
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[3]:
df = pd.read_csv('tests/02032018.csv',sep=';')
df = df.iloc[3:,1:]
df.columns = ['text','label']
corpus = df.text.tolist()

You can get this file from Malaya/tests. This CSV has already been stemmed.

Load Transformer

We can use a Transformer model to build a topic model for our corpus, harnessing the power of attention!

def attention(
    corpus: List[str],
    n_topics: int,
    vectorizer,
    cleaning = simple_textcleaning,
    stopwords = get_stopwords,
    ngram: Tuple[int, int] = (1, 3),
    batch_size: int = 10,
):

    """
    Use attention from transformer model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    n_topics: int
        size of decomposition column.
    vectorizer: object
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=10)
        number of strings processed in each vectorization and attention batch.

    Returns
    -------
    result: malaya.topic_modelling.AttentionTopic class
    """
[4]:
electra = malaya.transformer.load(model = 'electra')
WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:56: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/modeling.py:240: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
WARNING:tensorflow:From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:79: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:93: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/sampling.py:26: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:115: multinomial (from tensorflow.python.ops.random_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.random.categorical` instead.
WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:118: The name tf.InteractiveSession is deprecated. Please use tf.compat.v1.InteractiveSession instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:119: The name tf.global_variables_initializer is deprecated. Please use tf.compat.v1.global_variables_initializer instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:121: The name tf.get_collection is deprecated. Please use tf.compat.v1.get_collection instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:122: The name tf.GraphKeys is deprecated. Please use tf.compat.v1.GraphKeys instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:128: The name tf.train.Saver is deprecated. Please use tf.compat.v1.train.Saver instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/transformers/electra/__init__.py:130: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

INFO:tensorflow:Restoring parameters from /Users/huseinzolkepli/Malaya/electra-model/base/electra-base/model.ckpt
[5]:
attention = malaya.topic_model.attention(corpus, n_topics = 10, vectorizer = electra)

Get topics

def top_topics(
    self, len_topic: int, top_n: int = 10, return_df: bool = True
):
    """
    Print important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.
    top_n: int, optional (default=10)
        top n of each topic.
    return_df: bool, optional (default=True)
        return as pandas.DataFrame, else JSON.
    """
[6]:
attention.top_topics(5, top_n = 10, return_df = True)
[6]:
topic 0 topic 1 topic 2 topic 3 topic 4
0 kwsp negara umno menteri projek
1 mahkamah malaysia parti perdana hutang
2 dana rakyat pas bahasa malaysia
3 syarikat pengalaman kerajaan perdana menteri mdb
4 bon berkongsi ros kerajaan kementerian
5 dakwaan kerajaan perlembagaan laporan rumah
6 kelulusan berkembang keputusan isu kerajaan
7 bank parti menteri pelan gembira
8 jppm kemudahan bersatu pemilihan pendekatan
9 kenyataan rakyat malaysia isu penjelasan gembira projek

Get topics as string

def get_topics(self, len_topic: int):
    """
    Return important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.

    Returns
    -------
    result: List[str]
    """
[7]:
attention.get_topics(10)
[7]:
[(0, 'kwsp mahkamah dana syarikat bon dakwaan kelulusan bank jppm kenyataan'),
 (1,
  'negara malaysia rakyat pengalaman berkongsi kerajaan berkembang parti kemudahan rakyat malaysia'),
 (2, 'umno parti pas kerajaan ros perlembagaan keputusan menteri bersatu isu'),
 (3,
  'menteri perdana bahasa perdana menteri kerajaan laporan isu pelan pemilihan penjelasan'),
 (4,
  'projek hutang malaysia mdb kementerian rumah kerajaan gembira pendekatan gembira projek'),
 (5,
  'bayar rakyat selesaikan raya pilihan raya ppsmi bincang bayar tutup mca jppm'),
 (6, 'kapal malaysia asli low jho jho low negara wang berita islam'),
 (7,
  'undi parti pimpinan pakatan sokong pucuk suara pucuk suara bertanding suara pucuk pimpinan'),
 (8,
  'pertumbuhan hutang harga pendapatan produk malaysia kaya kenaikan kumpulan peningkatan'),
 (9,
  'lancar rakyat teknikal berjalan lancar kerja buku bahasa berjalan catatan berlaku')]

Train LDA2Vec model

def lda2vec(
    corpus: List[str],
    vectorizer,
    n_topics: int = 10,
    cleaning = simple_textcleaning,
    stopwords = get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 3,
    **kwargs,
):
    """
    Train a LDA2Vec model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    vectorizer : object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int, (default=10)
        size of decomposition column.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        one complete iteration.
    switch_loss: int, (default=3)
        baseline to switch from document based loss to document + word based loss.

    Returns
    -------
    result: malaya.topic_modelling.DeepTopic class
    """
[8]:
from malaya.text.vectorizer import SkipGramCountVectorizer

stopwords = malaya.text.function.get_stopwords()
vectorizer = SkipGramCountVectorizer(
    max_df = 0.95,
    min_df = 1,
    ngram_range = (1, 3),
    stop_words = stopwords,
    skip = 2
)
[9]:
lda2vec = malaya.topic_model.lda2vec(corpus, vectorizer, n_topics = 10,
                                     switch_loss = 5000, epoch = 5)
WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/model/lda2vec.py:43: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/model/lda2vec.py:46: The name tf.truncated_normal is deprecated. Please use tf.random.truncated_normal instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/model/lda2vec.py:54: The name tf.random_normal is deprecated. Please use tf.random.normal instead.

WARNING:tensorflow:
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/Malaya/malaya/model/lda2vec.py:117: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.

minibatch loop: 100%|██████████| 2187/2187 [00:22<00:00, 95.41it/s, cost=40.5, epoch=1]
minibatch loop: 100%|██████████| 2187/2187 [00:24<00:00, 88.48it/s, cost=12.9, epoch=2]
minibatch loop: 100%|██████████| 2187/2187 [00:23<00:00, 93.20it/s, cost=591, epoch=3]
minibatch loop: 100%|██████████| 2187/2187 [00:23<00:00, 91.28it/s, cost=479, epoch=4]
minibatch loop: 100%|██████████| 2187/2187 [00:24<00:00, 89.11it/s, cost=449, epoch=5]

Get topics

def top_topics(
    self, len_topic: int, top_n: int = 10, return_df: bool = True
):
    """
    Print important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.
    top_n: int, optional (default=10)
        top n of each topic.
    return_df: bool, optional (default=True)
        return as pandas.DataFrame, else JSON.
    """
[10]:
lda2vec.top_topics(5, top_n = 10, return_df = True)
[10]:
topic 0 topic 1 topic 2 topic 3 topic 4
0 bank dakwaan wang bank negara dakwaan ros bank dakwaan wang ros
1 dakwaan pemindahan akaun bank dakwaan wang perlembagaan subjek menjadikan ranking perlembagaan
2 bank pemindahan wang bank rhb syarikat lancar menangguhkan menangguhkan kebenaran berjalan
3 menangguhkan menangguhkan kebenaran kerajaan berkaitan status menjadikan subjek lancar
4 subjek menjadikan ranking penilaian tahunan ditawarkan dihentikan mencadangkan pembangkang azizah pilihan
5 persendirian dibenarkan lingkup bank milik syarikat berjalan bank rhb syarikat dihentikan
6 menjadikan subjek subjek menjadikan ranking sedar mengambil dakwaan pemindahan akaun status
7 wang bank bank pemindahan wang sahkan perolehi keputusan wang dolar sedar mengambil
8 dolar bank milik mendedahkan ruang had berjalan lancar bank negara dakwaan sahkan perolehi keputusan
9 luas menjadikan dolar bank milik pilihan jabatan malaysia berjalan lancar

Important sentences based on topics

def get_sentences(self, len_sentence: int, k: int = 0):
    """
    Return important sentences related to selected column based on decomposition.

    Parameters
    ----------
    len_sentence: int
    k: int, (default=0)
        index of decomposition matrix.

    Returns
    -------
    result: List[str]
    """
[11]:
lda2vec.get_sentences(5)
[11]:
['bank negara dakwaan pemindahan wang akaun dolar bank rhb milik syarikat persendirian mendedahkan dibenarkan ruang lingkup had perundangan',
 'jho low anak kapal ditahan perairan indonesia',
 'tumpuan pekan najib tumpuan langkawi',
 'april berbangkit status memegang jawatan umno',
 'membantu negara negara maju bidang perancangan ekonomi kewangan perdagangan pertanian pendidikan latihan teknikal industri diplomasi']

Get topics as string

def get_topics(self, len_topic: int):
    """
    Return important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.

    Returns
    -------
    result: List[str]
    """
[12]:
lda2vec.get_topics(10)
[12]:
[(0,
  'bank dakwaan wang dakwaan pemindahan akaun bank pemindahan wang menangguhkan menangguhkan kebenaran subjek menjadikan ranking persendirian dibenarkan lingkup menjadikan subjek wang bank dolar bank milik luas menjadikan'),
 (1,
  'bank negara dakwaan bank dakwaan wang bank rhb syarikat kerajaan berkaitan penilaian tahunan ditawarkan bank milik syarikat subjek menjadikan ranking bank pemindahan wang mendedahkan ruang had dolar bank milik'),
 (2,
  'ros perlembagaan lancar status dihentikan berjalan sedar mengambil sahkan perolehi keputusan berjalan lancar pilihan'),
 (3,
  'bank dakwaan wang subjek menjadikan ranking menangguhkan menangguhkan kebenaran menjadikan subjek mencadangkan pembangkang azizah bank rhb syarikat dakwaan pemindahan akaun wang dolar bank negara dakwaan jabatan malaysia'),
 (4,
  'ros perlembagaan berjalan lancar pilihan dihentikan status sedar mengambil sahkan perolehi keputusan berjalan lancar'),
 (5,
  'ros perlembagaan dihentikan lancar sedar mengambil rakyat berjalan status sahkan perolehi keputusan berjalan lancar'),
 (6,
  'ros perlembagaan lancar sedar mengambil status rakyat berjalan lancar berjalan pilihan dihentikan'),
 (7,
  'bank dakwaan wang menjadikan subjek bank rhb syarikat subjek menjadikan ranking menangguhkan menangguhkan kebenaran bank milik syarikat dakwaan pemindahan akaun kerajaan berkaitan dolar bank milik jabatan malaysia'),
 (8,
  'ros perlembagaan lancar sedar mengambil pilihan berjalan lancar kebenaran sahkan perolehi keputusan berjalan rakyat'),
 (9,
  'ros perlembagaan lancar berjalan sedar mengambil status dihentikan sahkan perolehi keputusan pilihan rakyat')]

Visualize topics

This will initialize a pyLDAvis object. To learn more about pyLDAvis, see https://github.com/bmabey/pyLDAvis.

def visualize_topics(self, notebook_mode: int = False, mds: str = 'pcoa'):
    """
    Visualize topics based on decomposition.

    Parameters
    ----------
    mds : str, optional (default='pcoa')
        2D Decomposition. Allowed values:

        * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
        * ``'mmds'`` - Dimension reduction via Multidimensional scaling
        * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
    """
[15]:
lda2vec.visualize_topics(notebook_mode = True)
[15]:

Train SKLearn LDA model

[16]:
from sklearn.decomposition import LatentDirichletAllocation

lda = malaya.topic_model.sklearn(
    corpus,
    LatentDirichletAllocation,
    vectorizer = vectorizer,
    n_topics = 10,
)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
  and should_run_async(code)
def top_topics(
    self, len_topic: int, top_n: int = 10, return_df: bool = True
):
    """
    Print important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.
    top_n: int, optional (default=10)
        top n of each topic.
    return_df: bool, optional (default=True)
        return as pandas.DataFrame, else JSON.
    """
[17]:
lda.top_topics(5, top_n = 10, return_df = True)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
  and should_run_async(code)
[17]:
topic 0 topic 1 topic 2 topic 3 topic 4
0 negara malaysia rakyat sukan menteri
1 mdb negara negara saham rakyat
2 malaysia parti umno sukan suka kerajaan
3 perniagaan kerajaan keputusan berlaku perdana
4 ahli mdb hutang kerajaan perdana menteri
5 negara bidang menteri tindakan rendah malaysia
6 negara maju bidang isu kepentingan anak
7 membantu negara bidang kuok air sumber nilai
8 membantu negara berlaku hutang hutang meningkatkan ph
9 negara maju bidang pendidikan negeri diterjemahkan beban

Important sentences based on topics

def get_sentences(self, len_sentence: int, k: int = 0):
    """
    Return important sentences related to selected column based on decomposition.

    Parameters
    ----------
    len_sentence: int
    k: int, (default=0)
        index of decomposition matrix.

    Returns
    -------
    result: List[str]
    """
[18]:
lda.get_sentences(5)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
  and should_run_async(code)
[18]:
['catatan itu menunjukkan exco pkr selangor elizabeth wong adun pkr chua yee ling dan ahli majlis selayang daripada pkr fok wai mun di sebuah acara perayaan cina tetapi membabitkan saya dalam gambar itu',
 'rakyat malaysia yang berfikiran waras akan ingat bagaimana mahathir memulakan serangan berita palsu terhadap mdb apabila menyatakan dengan salah bahawa rm bilion telah hilang hanya untuk dibuktikan berulang kali bahawa kenyataan itu adalah salah',
 'sehingga hari ini selain dakwaan asas yang terkandung dalam tuntutan sivil itu doj belum mengemukakan sebarang bukti kukuh bahawa jho low ialah pemilik sebenar kapal mewah itu ataupun ia dibeli menggunakan dana daripada mdb',
 'sebagai negara yang menandatangani who pertubuhan kesihatan sedunia kami juga komited untuk mencapai strategi sektor kesihatan global dengan matlamat untuk menghapuskan viral hepatitis menjelang tahun',
 'mdb berulang kali menjelaskan bahawa walaupun ia mempunyai urusan perniagaan dengan aabar bvi mdb tidak mempunyai sebarang urusan perniagaan dengan jho low dan yang lebih penting mdb bukanlah pihak dalam tuntutan sivil doj']

Get topics as string

def get_topics(self, len_topic: int):
    """
    Return important topics based on decomposition.

    Parameters
    ----------
    len_topic: int
        size of topics.

    Returns
    -------
    result: List[str]
    """
[19]:
lda.get_topics(10)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
  and should_run_async(code)
[19]:
[(0,
  'negara mdb malaysia perniagaan ahli negara bidang negara maju membantu negara bidang membantu negara negara maju bidang'),
 (1,
  'malaysia negara parti kerajaan mdb menteri bidang kuok berlaku pendidikan'),
 (2,
  'rakyat negara umno keputusan hutang tindakan isu air hutang hutang negeri'),
 (3,
  'sukan saham sukan suka berlaku kerajaan rendah kepentingan sumber meningkatkan diterjemahkan'),
 (4,
  'menteri rakyat kerajaan perdana perdana menteri malaysia anak nilai ph beban'),
 (5,
  'kerajaan malaysia dana negara pendapatan asli peningkatan awam usaha tertinggi'),
 (6, 'projek masyarakat harga isu rm malaysia rakyat hutang dijual sokongan'),
 (7,
  'pembangunannya negara selatan negara selatan negara malaysia berkongsi pengalaman berkongsi pengalaman pengalaman pembangunannya negara berkongsi pengalaman negara pembangunannya negara'),
 (8,
  'projek negara parti syarikat kerajaan harapan undi malaysia berjalan asli'),
 (9,
  'parti bahasa faktor berita perlembagaan umno kelulusan amanah pas islam')]

Visualize topics

def visualize_topics(self, notebook_mode: bool = False, mds: str = 'pcoa'):
    """
    Visualize topics based on decomposition.

    Parameters
    ----------
    mds : str, optional (default='pcoa')
        2D Decomposition. Allowed values:

        * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
        * ``'mmds'`` - Dimension reduction via Multidimensional scaling
        * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
    """
[20]:
lda.visualize_topics(notebook_mode = True)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
  and should_run_async(code)
[20]: