Using BERTopic#

This tutorial is available as an IPython notebook at Malaya/example/topic-modeling-bertopic.

Install BERTopic#

Make sure you have already installed BERTopic:

pip3 install bertopic

You can read more about BERTopic at https://maartengr.github.io/BERTopic/#quick-start

[2]:
import pandas as pd
import malaya
[3]:
# Read the semicolon-delimited file, then drop the first three rows and the
# first column before naming the columns that remain.
df = pd.read_csv('tests/02032018.csv', sep=';').iloc[3:, 1:]
df.columns = ['text', 'label']

# Plain Python list of the `text` column; this is what the topic model is fit on.
corpus = df['text'].tolist()

Load vectorizer model#

[4]:
# Load Malaya's HuggingFace transformer using the library defaults (no
# arguments are passed here).
vectorizer = malaya.transformer.huggingface()
# Sanity check: vectorize two short strings and display the shape of the result.
vectorizer.vectorize(['hello', 'ayam']).shape
[4]:
(2, 256)

Create embedder object#

[5]:
from bertopic.backend import BaseEmbedder
import numpy as np
from typing import List
[6]:
class embedder(BaseEmbedder):
    """BERTopic embedding backend that delegates to the notebook's ``vectorizer``.

    Every entry point returns whatever ``vectorizer.vectorize`` produces for
    the list of strings it is given; words and documents are treated alike.
    """

    def embed(self, documents: List[str], verbose: bool = False):
        """Vectorize ``documents`` with the module-level ``vectorizer``.

        ``verbose`` is accepted for interface compatibility but is not used.
        """
        vectors = vectorizer.vectorize(documents)
        return vectors

    def embed_words(self, words: List[str], verbose: bool = False):
        """Embed individual words by forwarding them to :meth:`embed`."""
        return self.embed(words, verbose)

    def embed_documents(self, document: List[str], verbose: bool = False):
        """Embed whole documents by forwarding them to :meth:`embed`."""
        return self.embed(document, verbose)
[7]:
# Instantiate the embedder class. The instance is deliberately bound to the
# same name as the class (later cells pass `embedder` to BERTopic), which means
# a plain `embedder = embedder()` would try to call the *instance* if this cell
# were executed a second time. The guard keeps the cell safe to re-run.
if isinstance(embedder, type):
    embedder = embedder()

Train BERTopic#

[8]:
from bertopic import BERTopic
from bertopic.backend._utils import select_backend

# Build the topic model around the custom embedder defined above. `language`
# is explicitly None; the embeddings come from the supplied embedder instead.
model = BERTopic(
    language=None,
    embedding_model=embedder,
)
# Display the configured language as the cell output.
model.language
[9]:
# Resolve the embedding backend from the model's own embedding model and
# language settings. NOTE(review): the result is not referenced by any later
# cell visible in this notebook.
embedding_model = select_backend(
    model.embedding_model,
    language=model.language,
)
[11]:
# Fit the topic model on the full corpus in a single call and keep both
# returned values. NOTE(review): presumably `topics` holds one topic id per
# document and `probs` the matching probabilities — confirm against BERTopic docs.
topics, probs = model.fit_transform(corpus)

Get topic frequencies#

[21]:
# Display the Topic / Count table for the fitted model.
# NOTE(review): topic -1 is presumably BERTopic's outlier bucket — confirm.
model.get_topic_freq()
[21]:
Topic Count
0 -1 91
1 0 74
2 1 50
3 2 32
4 3 21
5 4 18
6 5 18

Get topic#

[25]:
# Show the first 10 (word, score) pairs that describe topic 0.
model.get_topic(0)[:10]
[25]:
[('dan', 0.06565089788198161),
 ('yang', 0.05259762576836089),
 ('untuk', 0.040200581210431734),
 ('dalam', 0.03976023208988511),
 ('ini', 0.03830356403041938),
 ('malaysia', 0.03714433613321695),
 ('akan', 0.03629905145154332),
 ('lebih', 0.03627484217552929),
 ('kami', 0.035442639029197864),
 ('di', 0.03322709631876396)]

Find similar topics#

[20]:
# Look up the topics that best match the search term 'najib', requesting up to
# 10 results; fewer come back when the model has fewer topics than that.
similar_topics, similarity = model.find_topics('najib', top_n=10)
# Display only the matching topic ids; the paired `similarity` values are kept
# in a variable but not shown.
similar_topics
[20]:
[1, 5, 2, 3, 4, 0, -1]
[ ]: