# Source code for malaya.keyword.extractive

import re
import operator
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from malaya.text import rake as rake_function
from malaya.text.function import (
    transformer_textcleaning,
    get_stopwords,
)
from malaya.text.bpe import SentencePieceTokenizer
from malaya.path import MODEL_VOCAB, MODEL_BPE
from malaya.function import validator
from malaya.graph.pagerank import pagerank
from typing import List


def _calculate_count(strings):
    vocab = defaultdict(int)
    for k in strings:
        results = [(m.start(0), m.end(0))
                   for m in re.finditer(r'\b' + k, string, flags=re.IGNORECASE)]
        vocab[k] = len(results)
    return vocab


def _auto_ngram(string, stopwords):
    """
    Generate candidate phrases from `string` using `stopwords` and count them.

    Parameters
    ----------
    string: str
    stopwords: List[str]
        Each stopword becomes one alternative of a case-insensitive regex.

    Returns
    -------
    result: defaultdict(int)
        Maps each phrase returned by
        `rake_function.generate_candidate_keywords` to the number of times
        it was returned.
    """
    sentences = rake_function.split_sentences(string)

    # A stopword matches when it starts at a word boundary and is not
    # directly followed by another word character or a hyphen.
    alternatives = [r'\b' + word + r'(?![\w-])' for word in stopwords]
    stop_word_pattern = re.compile('|'.join(alternatives), re.IGNORECASE)

    candidates = rake_function.generate_candidate_keywords(
        sentences, stop_word_pattern
    )

    counts = defaultdict(int)
    for phrase in candidates:
        counts[phrase] += 1
    return counts


def _base(string, vectorizer, **kwargs):
    s = vectorizer.fit([string])
    vocab = defaultdict(int)
    tokens = s.build_analyzer()(string)
    for t in tokens:
        vocab[t] += 1
    return vocab


def rake(
    string: str,
    vocab: List[str] = None,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    vocab: List[str], optional (default=None)
        List of important substrings.
        This will override `vectorizer` parameter.
        Entries that do not occur in `string` are dropped; the remaining ones
        are counted case-insensitively, anchored at a word boundary.
    model: Object, optional (default=None)
        model must have `attention` method.
        If given, `string` is cleaned with `transformer_textcleaning` and the
        summed attention per token is passed to the word scoring.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
        For automatic Ngram generator.

    Returns
    -------
    result: List[Tuple[float, str]]
        At most `top_k` (score, keyword) pairs, highest score first.
        Each score is divided by the sum of the scores of all candidates.

    Raises
    ------
    ValueError
        If `model` lacks `attention`, `top_k` or `atleast` is below 1,
        `vectorizer` lacks `fit`, or no stopwords are available for the
        automatic ngram generator.
    """
    stopwords = validator.validate_stopwords(stopwords)

    if model is not None and not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    # Use the same `is not None` test as the validation above so a validated
    # model is never skipped because of its truthiness.
    if model is not None:
        string = transformer_textcleaning(string)
        token_attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in token_attention:
            d[k] += v
    else:
        d = None

    if vocab:
        # Count the user-supplied candidates directly against the text.
        # They are literal substrings, hence `re.escape`.
        counts = defaultdict(int)
        for candidate in vocab:
            if candidate in string:
                counts[candidate] = len(
                    re.findall(
                        r'\b' + re.escape(candidate),
                        string,
                        flags=re.IGNORECASE,
                    )
                )
        vocab = counts
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    phrase_list = list(vocab.keys())

    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores
    )

    sortedKeywords = sorted(
        keywordcandidates.items(), key=operator.itemgetter(1), reverse=True
    )

    # Normalise over every candidate, including those later removed by
    # the `atleast` filter.
    total = sum(score for _, score in sortedKeywords)

    ranked_sentences = [
        (score / total, phrase)
        for phrase, score in sortedKeywords
        if vocab[phrase] >= atleast
    ]
    return ranked_sentences[:top_k]


def textrank(
    string: str,
    vocab: List[str] = None,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Textrank algorithm.

    Parameters
    ----------
    string: str
    vocab: List[str], optional (default=None)
        List of important substrings.
        This will override `vectorizer` parameter.
        Entries that do not occur in `string` are dropped; the remaining ones
        are counted case-insensitively, anchored at a word boundary.
    model: Object
        model must have `fit_transform` or `vectorize` method.
        The signature default is None, but None is rejected with ValueError,
        so a model must always be passed.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    **kwargs:
        Forwarded to `pagerank`.

    Returns
    -------
    result: List[Tuple[float, str]]
        At most `top_k` (score, keyword) pairs, highest score first.
        Each score is divided by the sum of all pagerank scores.
        Empty when no candidate keyword was found.

    Raises
    ------
    ValueError
        If `model` has neither `fit_transform` nor `vectorize`, `top_k` or
        `atleast` is below 1, `vectorizer` lacks `fit`, or no stopwords are
        available for the automatic ngram generator.
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'fit_transform') and not hasattr(model, 'vectorize'):
        raise ValueError(
            'model must have `fit_transform` or `vectorize` method'
        )

    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if vocab:
        # Count the user-supplied candidates directly against the text.
        # They are literal substrings, hence `re.escape`.
        counts = defaultdict(int)
        for candidate in vocab:
            if candidate in string:
                counts[candidate] = len(
                    re.findall(
                        r'\b' + re.escape(candidate),
                        string,
                        flags=re.IGNORECASE,
                    )
                )
        vocab = counts
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    phrase_list = list(vocab.keys())
    if not phrase_list:
        # Nothing to rank; do not hand an empty batch to the model.
        return []

    # When the model exposes both methods, both are called and the
    # `vectorize` output is the one that is used.
    if hasattr(model, 'fit_transform'):
        vectors = model.fit_transform(phrase_list)
    if hasattr(model, 'vectorize'):
        vectors = model.vectorize(phrase_list)

    similar = cosine_similarity(vectors, vectors)
    # Zero the diagonal so a phrase does not vote for itself.
    similar[np.diag_indices(len(similar))] = 0.0
    scores = pagerank(similar, **kwargs)
    total = sum(scores)

    ranked_sentences = sorted(
        [
            (scores[i] / total, phrase)
            for i, phrase in enumerate(phrase_list)
            if vocab[phrase] >= atleast
        ],
        reverse=True,
    )

    return ranked_sentences[:top_k]


def attention(
    string: str,
    model,
    vocab: List[str] = None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
        Cleaned with `transformer_textcleaning` before any processing.
    model: Object
        model must have `attention` method.
    vocab: List[str], optional (default=None)
        List of important substrings.
        This will override `vectorizer` parameter.
        Entries that do not occur in the cleaned string are dropped; the
        remaining ones are counted case-insensitively, anchored at a word
        boundary.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
        At most `top_k` (score, keyword) pairs, highest score first.
        A keyword's score is the summed attention of its whitespace-separated
        words, divided by the sum over all candidates. If that sum is zero
        the scores are returned without normalisation.

    Raises
    ------
    ValueError
        If `model` lacks `attention`, `top_k` or `atleast` is below 1,
        `vectorizer` lacks `fit`, or no stopwords are available for the
        automatic ngram generator.
    """

    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if vocab:
        # Count the user-supplied candidates directly against the text.
        # They are literal substrings, hence `re.escape`.
        counts = defaultdict(int)
        for candidate in vocab:
            if candidate in string:
                counts[candidate] = len(
                    re.findall(
                        r'\b' + re.escape(candidate),
                        string,
                        flags=re.IGNORECASE,
                    )
                )
        vocab = counts
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    # Named `token_attention` to avoid shadowing this function's own name.
    token_attention = model.attention([string])[0]
    d = defaultdict(float)
    for k, v in token_attention:
        d[k] += v

    phrases = list(vocab.keys())
    scores = [sum(d.get(w, 0) for w in phrase.split()) for phrase in phrases]

    # If no candidate word received attention the sum is zero; divide by 1
    # instead so the call does not fail with ZeroDivisionError.
    total = sum(scores)
    normalizer = total if total else 1.0

    ranked_sentences = sorted(
        [
            (scores[i] / normalizer, phrase)
            for i, phrase in enumerate(phrases)
            if vocab[phrase] >= atleast
        ],
        reverse=True,
    )
    return ranked_sentences[:top_k]


def similarity(
    string: str,
    model,
    vocab: List[str] = None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Sentence embedding VS keyword embedding similarity.

    Parameters
    ----------
    string: str
        Cleaned with `transformer_textcleaning` before any processing.
    model: Object
        Transformer model or any model has `vectorize` method.
    vocab: List[str], optional (default=None)
        List of important substrings.
        This will override `vectorizer` parameter.
        Entries that do not occur in the cleaned string are dropped; the
        remaining ones are counted case-insensitively, anchored at a word
        boundary.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
        At most `top_k` (similarity, keyword) pairs, highest cosine
        similarity to the whole string first.
        Empty when no candidate keyword was found.

    Raises
    ------
    ValueError
        If `model` lacks `vectorize`, `top_k` or `atleast` is below 1,
        `vectorizer` lacks `fit`, or no stopwords are available for the
        automatic ngram generator.
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if vocab:
        # Count the user-supplied candidates directly against the text.
        # They are literal substrings, hence `re.escape`.
        counts = defaultdict(int)
        for candidate in vocab:
            if candidate in string:
                counts[candidate] = len(
                    re.findall(
                        r'\b' + re.escape(candidate),
                        string,
                        flags=re.IGNORECASE,
                    )
                )
        vocab = counts
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    if not words:
        # Nothing to rank; do not hand an empty batch to the model.
        return []

    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])

    # One row: similarity of the whole string to every keyword.
    distances = cosine_similarity(vectors_string, vectors_keywords)

    # argsort is ascending; reverse it to get the most similar keyword first.
    descending = distances.argsort()[0][::-1]
    ranked_sentences = [
        (distances[0][index], words[index])
        for index in descending
        if vocab[words[index]] >= atleast
    ]
    return ranked_sentences[:top_k]