import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from malaya.text.function import (
    simple_textcleaning,
    get_stopwords,
    print_topics_modeling,
)
from malaya.function import validator
from typing import List, Tuple


class Topic:
    def __init__(
        self, features, comp, corpus, transformed, vectorizer, vectors
    ):
        self.features = features
        self.comp = comp
        self.corpus = corpus
        self.transformed = transformed
        self.vectorizer = vectorizer
        self._vectors = vectors

    def visualize_topics(self, notebook_mode: bool = False, mds: str = 'pcoa'):
        """
        Visualize topics with pyLDAvis. Only supported when the decomposition
        model is `sklearn.decomposition.LatentDirichletAllocation`.

        Parameters
        ----------
        notebook_mode: bool, optional (default=False)
            if True, return the prepared visualization for inline rendering
            inside a notebook, else serve it with `pyLDAvis.show`.
        mds: str, optional (default='pcoa')
            2D decomposition. Allowed values:

            * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
            * ``'mmds'`` - Dimension reduction via Multidimensional scaling
            * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
        """
        if not isinstance(self.comp, LatentDirichletAllocation):
            raise ValueError('only support LatentDirichletAllocation model.')

        # pyLDAvis is an optional dependency; `pyLDAvis.sklearn` is the module
        # name used by older pyLDAvis releases (newer ones renamed it).
        import pyLDAvis
        import pyLDAvis.sklearn

        if notebook_mode:
            pyLDAvis.enable_notebook()
        prepared_vis_data = pyLDAvis.sklearn.prepare(
            self.comp, self._vectors, self.vectorizer, mds=mds
        )
        if notebook_mode:
            return prepared_vis_data
        else:
            pyLDAvis.show(prepared_vis_data)
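
    # Usage sketch (illustrative, not part of the original module): given a
    # `topic` object returned by `fit` below wrapping an LDA model, one might call:
    #   topic.visualize_topics(notebook_mode=True, mds='tsne')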

    def top_topics(
        self, len_topic: int, top_n: int = 10, return_df: bool = True
    ):
        """
        Return the `top_n` most important words for each of the first
        `len_topic` topics, based on decomposition.

        Parameters
        ----------
        len_topic: int
            number of topics.
        top_n: int, optional (default=10)
            top n words of each topic.
        return_df: bool, optional (default=True)
            return as pandas.DataFrame, else JSON.
        """
        return print_topics_modeling(
            len_topic,
            feature_names=np.array(self.features),
            sorting=np.argsort(self.comp.components_)[:, ::-1],
            n_words=top_n,
            return_df=return_df,
        )
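
    # Usage sketch (illustrative): `topic.top_topics(len_topic=10, top_n=5)`
    # returns a pandas.DataFrame holding the 5 strongest words of 10 topics.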

    def get_topics(self, len_topic: int):
        """
        Return every topic found by the decomposition; each topic is
        represented by its `len_topic` highest-weighted words joined into a
        single string.

        Parameters
        ----------
        len_topic: int
            number of words to keep per topic.

        Returns
        -------
        result: List[Tuple[int, str]]
        """
        if not isinstance(len_topic, int):
            raise ValueError('len_topic must be an integer')

        results = []
        for no, topic in enumerate(self.comp.components_):
            results.append(
                (
                    no,
                    ' '.join(
                        [
                            self.features[i]
                            for i in topic.argsort()[:-len_topic - 1:-1]
                        ]
                    ),
                )
            )
        return results
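
    # Usage sketch (illustrative): `topic.get_topics(5)` returns something like
    # [(0, 'word1 word2 word3 word4 word5'), (1, '...'), ...].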

    def get_sentences(self, len_sentence: int, k: int = 0):
        """
        Return the `len_sentence` sentences from the corpus most strongly
        associated with topic `k`, based on the decomposition.

        Parameters
        ----------
        len_sentence: int
            number of sentences to return.
        k: int, optional (default=0)
            column index of the decomposition matrix, i.e., the topic.

        Returns
        -------
        result: List[str]
        """
        if not (0 <= k < self.transformed.shape[1]):
            raise ValueError('k should be between 0 and n_topics')

        reverse_sorted = np.argsort(self.transformed[:, k])[::-1]
        return [self.corpus[i] for i in reverse_sorted[:len_sentence]]
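
    # Usage sketch (illustrative): `topic.get_sentences(3, k=1)` returns the 3
    # corpus strings whose decomposition weight on topic 1 is highest.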


def fit(
    corpus: List[str],
    model,
    vectorizer,
    n_topics: int,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Train a scikit-learn decomposition model to do topic modelling on the corpus given.

    Parameters
    ----------
    corpus: List[str]
    model: object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.decomposition.TruncatedSVD`` - LSA algorithm.
        * ``sklearn.decomposition.LatentDirichletAllocation`` - LDA algorithm.
        * ``sklearn.decomposition.NMF`` - NMF algorithm.
    vectorizer: object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int
        number of topics, i.e., the size of the decomposition.
    cleaning: function, optional (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: optional (default=malaya.text.function.get_stopwords)
        a callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    result: malaya.topic_model.decomposition.Topic class
    """
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if not hasattr(vectorizer, 'fit_transform'):
        raise ValueError('vectorizer must have `fit_transform` method')
    if len(corpus) < n_topics:
        raise ValueError(
            'length of corpus must be greater than or equal to n_topics'
        )

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])

    tf = vectorizer.fit_transform(corpus)
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
    # `get_feature_names_out` when the vectorizer provides it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tf_features = vectorizer.get_feature_names_out()
    else:
        tf_features = vectorizer.get_feature_names()
    compose = model(n_topics).fit(tf)
    return Topic(
        tf_features, compose, corpus, compose.transform(tf), vectorizer, tf
    )
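

# Usage sketch (illustrative, not part of the original module). A minimal
# end-to-end run, assuming malaya and scikit-learn are installed; the toy
# corpus below is made up purely for demonstration.
if __name__ == '__main__':
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    toy_corpus = [
        'kerajaan umum bajet baharu untuk rakyat',
        'harga minyak dunia jatuh dengan mendadak',
        'pasukan bola sepak negara menang perlawanan akhir',
        'teknologi kecerdasan buatan berkembang dengan pesat',
    ]
    topic = fit(
        corpus=toy_corpus,
        model=LatentDirichletAllocation,
        vectorizer=CountVectorizer(),
        n_topics=2,
    )
    # Top 5 words of both topics, as JSON instead of a pandas.DataFrame.
    print(topic.top_topics(len_topic=2, top_n=5, return_df=False))
    # (topic index, joined top words) pairs for every topic.
    print(topic.get_topics(5))
    # The 2 sentences loading most heavily on topic 0.
    print(topic.get_sentences(2, k=0))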