# Source code for malaya.keyword.extractive

import re
import operator
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from malaya.text import rake as rake_function
from malaya.text.function import (
    transformer_textcleaning,
    get_stopwords,
)
from malaya.text.bpe import SentencePieceTokenizer
from malaya.path import MODEL_VOCAB, MODEL_BPE
from malaya.function import validator
from malaya.graph.pagerank import pagerank
from typing import List


def _calculate_count(strings, string: str = None):
    """
    Count case-insensitive occurrences of each candidate inside `string`.

    Parameters
    ----------
    strings: Iterable[str]
        candidate substrings to look for.
    string: str
        text to search in. It has a default only so that the one-argument
        call form is still accepted by the signature; a value is required.

    Returns
    -------
    result: defaultdict(int)
        maps every candidate to the number of matches that start at a word
        boundary.

    Raises
    ------
    ValueError
        if `string` is not provided.
    """
    if string is None:
        raise ValueError('`string` must be provided to count occurrences')

    vocab = defaultdict(int)
    for k in strings:
        # Candidates are literal substrings, so escape them before building
        # the pattern; otherwise entries such as 'c++' are read as regex.
        pattern = r'\b' + re.escape(k)
        vocab[k] = len(re.findall(pattern, string, flags=re.IGNORECASE))
    return vocab


def _auto_ngram(string, stopwords):
    """
    Generate candidate phrases from `string` using stopwords as delimiters.

    Parameters
    ----------
    string: str
        text to extract candidate phrases from.
    stopwords: Iterable[str]
        words compiled into a single case-insensitive pattern that is handed
        to `rake_function.generate_candidate_keywords`.

    Returns
    -------
    result: defaultdict(int)
        maps each generated phrase to how many times it was generated.
    """
    sentences = rake_function.split_sentences(string)
    # Each stopword must start at a word boundary and must not be followed by
    # a word character or hyphen. Stopwords are escaped so that one containing
    # a regex metacharacter is matched literally.
    stop_word_pattern = re.compile(
        '|'.join(
            r'\b' + re.escape(word) + r'(?![\w-])' for word in stopwords
        ),
        re.IGNORECASE,
    )
    phrase_list = rake_function.generate_candidate_keywords(
        sentences, stop_word_pattern
    )
    vocab = defaultdict(int)
    for phrase in phrase_list:
        vocab[phrase] += 1
    return vocab


def _base(string, vectorizer, **kwargs):
    """
    Count the tokens that `vectorizer`'s analyzer produces for `string`.

    Parameters
    ----------
    string: str
        text to tokenize.
    vectorizer: Object
        must provide `fit`, whose return value provides `build_analyzer`.
    **kwargs:
        accepted but not used here.

    Returns
    -------
    result: defaultdict(int)
        maps each token to its number of occurrences.
    """
    fitted = vectorizer.fit([string])
    analyze = fitted.build_analyzer()
    counts = defaultdict(int)
    for token in analyze(string):
        counts[token] += 1
    return counts


def rake(
    string: str,
    vocab: List[str] = None,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    vocab: List[str], optional (default=None)
        List of important substrings. This will override `vectorizer` parameter.
    model: Object, optional (default=None)
        If given, the model must have an `attention` method. The string is
        then cleaned with `transformer_textcleaning` and the attention
        weights are passed to the word scoring step.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
        For automatic Ngram generator.
    **kwargs:
        forwarded to the vectorizer-based candidate generator.

    Returns
    -------
    result: List[Tuple[float, str]]
        (normalised score, keyword) pairs, highest score first.

    Raises
    ------
    ValueError
        if `model` lacks `attention`, `top_k` or `atleast` is below 1,
        `vectorizer` lacks `fit`, or no stopwords are available while ngrams
        must be generated automatically.
    """
    stopwords = validator.validate_stopwords(stopwords)

    if model is not None and not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if model is not None:
        string = transformer_textcleaning(string)
        # Accumulate weights per token; a token may appear more than once.
        d = defaultdict(float)
        for k, v in model.attention([string])[0]:
            d[k] += v
    else:
        d = None

    if vocab:
        # Occurrences are counted inline so the count is always taken
        # against this (possibly cleaned) `string`.
        candidates = [v for v in vocab if v in string]
        vocab = defaultdict(int)
        for v in candidates:
            vocab[v] = len(
                re.findall(r'\b' + re.escape(v), string, flags=re.IGNORECASE)
            )
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores
    )
    sorted_keywords = sorted(
        keywordcandidates.items(), key=operator.itemgetter(1), reverse=True
    )

    total = sum(score for _, score in sorted_keywords)
    if total == 0:
        # Nothing to normalise by; keep the raw scores instead of raising
        # ZeroDivisionError.
        total = 1.0

    ranked_sentences = [
        (score / total, phrase)
        for phrase, score in sorted_keywords
        if vocab[phrase] >= atleast
    ]
    return ranked_sentences[:top_k]
def textrank(
    string: str,
    vocab: List[str] = None,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Textrank algorithm.

    Parameters
    ----------
    string: str
    vocab: List[str], optional (default=None)
        List of important substrings. This will override `vectorizer` parameter.
    model: Object
        model must have a `fit_transform` or `vectorize` method. The
        signature defaults to None, but None is rejected with ValueError,
        so a model must always be passed.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    **kwargs:
        forwarded to the vectorizer-based candidate generator and to
        `pagerank`.

    Returns
    -------
    result: List[Tuple[float, str]]
        (normalised score, keyword) pairs, highest score first.

    Raises
    ------
    ValueError
        if `model` lacks both `fit_transform` and `vectorize`, `top_k` or
        `atleast` is below 1, `vectorizer` lacks `fit`, or no stopwords are
        available while ngrams must be generated automatically.
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'fit_transform') and not hasattr(model, 'vectorize'):
        raise ValueError(
            'model must have `fit_transform` or `vectorize` method'
        )
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if vocab:
        # Occurrences are counted inline so the count is always taken
        # against this `string`.
        candidates = [v for v in vocab if v in string]
        vocab = defaultdict(int)
        for v in candidates:
            vocab[v] = len(
                re.findall(r'\b' + re.escape(v), string, flags=re.IGNORECASE)
            )
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    phrase_list = list(vocab.keys())
    if not phrase_list:
        # No candidates: nothing to vectorize or rank.
        return []

    # Two independent checks on purpose: when the model has both methods,
    # both are called and the `vectorize` output is the one used.
    if hasattr(model, 'fit_transform'):
        vectors = model.fit_transform(phrase_list)
    if hasattr(model, 'vectorize'):
        vectors = model.vectorize(phrase_list)

    similar = cosine_similarity(vectors, vectors)
    # Remove self-similarity so a phrase does not vote for itself.
    similar[np.diag_indices(len(similar))] = 0.0
    scores = pagerank(similar, **kwargs)
    total = sum(scores)

    # `scores` is indexed in the same order as `phrase_list`.
    ranked_sentences = sorted(
        [
            (scores[i] / total, s)
            for i, s in enumerate(phrase_list)
            if vocab[s] >= atleast
        ],
        reverse=True,
    )
    return ranked_sentences[:top_k]
def attention(
    string: str,
    model,
    vocab: List[str] = None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        model must have an `attention` method.
    vocab: List[str], optional (default=None)
        List of important substrings. This will override `vectorizer` parameter.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    **kwargs:
        forwarded to the vectorizer-based candidate generator.

    Returns
    -------
    result: List[Tuple[float, str]]
        (normalised score, keyword) pairs, highest score first.

    Raises
    ------
    ValueError
        if `model` lacks `attention`, `top_k` or `atleast` is below 1,
        `vectorizer` lacks `fit`, or no stopwords are available while ngrams
        must be generated automatically.
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if vocab:
        # Occurrences are counted inline so the count is always taken
        # against the cleaned `string`.
        candidates = [v for v in vocab if v in string]
        vocab = defaultdict(int)
        for v in candidates:
            vocab[v] = len(
                re.findall(r'\b' + re.escape(v), string, flags=re.IGNORECASE)
            )
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    # Accumulate weights per token; a token may appear more than once.
    weights = defaultdict(float)
    for k, v in model.attention([string])[0]:
        weights[k] += v

    # A phrase scores the sum of the weights of its whitespace-split words.
    phrases = list(vocab.keys())
    scores = [sum(weights.get(w, 0) for w in k.split()) for k in phrases]

    total = sum(scores)
    if total == 0:
        # No candidate word received any attention weight; keep the raw
        # scores instead of raising ZeroDivisionError.
        total = 1.0

    ranked_sentences = sorted(
        [
            (scores[i] / total, s)
            for i, s in enumerate(phrases)
            if vocab[s] >= atleast
        ],
        reverse=True,
    )
    return ranked_sentences[:top_k]
def similarity(
    string: str,
    model,
    vocab: List[str] = None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Sentence embedding VS keyword embedding similarity.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model has `vectorize` method.
    vocab: List[str], optional (default=None)
        List of important substrings. This will override `vectorizer` parameter.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    **kwargs:
        forwarded to the vectorizer-based candidate generator.

    Returns
    -------
    result: List[Tuple[float, str]]
        (cosine similarity, keyword) pairs, most similar first. Scores are
        not normalised.

    Raises
    ------
    ValueError
        if `model` lacks `vectorize`, `top_k` or `atleast` is below 1,
        `vectorizer` lacks `fit`, or no stopwords are available while ngrams
        must be generated automatically.
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')

    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if vocab:
        # Occurrences are counted inline so the count is always taken
        # against the cleaned `string`.
        candidates = [v for v in vocab if v in string]
        vocab = defaultdict(int)
        for v in candidates:
            vocab[v] = len(
                re.findall(r'\b' + re.escape(v), string, flags=re.IGNORECASE)
            )
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    if not words:
        # No candidates: nothing to vectorize or compare.
        return []

    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])
    distances = cosine_similarity(vectors_string, vectors_keywords)

    # argsort is ascending, so reverse to put the most similar first.
    ranked_sentences = [
        (distances[0][index], words[index])
        for index in distances.argsort()[0]
    ][::-1]
    ranked_sentences = [i for i in ranked_sentences if vocab[i[1]] >= atleast]
    return ranked_sentences[:top_k]