import re
import operator
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from malaya.text import rake as rake_function
from malaya.text.function import (
transformer_textcleaning,
get_stopwords,
)
from malaya.text.bpe import SentencePieceTokenizer
from malaya.path import MODEL_VOCAB, MODEL_BPE
from malaya.function import validator
from malaya.graph.pagerank import pagerank
from typing import List
def _calculate_count(strings):
vocab = defaultdict(int)
for k in strings:
results = [(m.start(0), m.end(0))
for m in re.finditer(r'\b' + k, string, flags=re.IGNORECASE)]
vocab[k] = len(results)
return vocab
def _auto_ngram(string, stopwords):
    """
    Generate candidate phrases from `string` by splitting on stopwords
    (the RAKE candidate-generation step) and count each phrase.

    Parameters
    ----------
    string: str
    stopwords: List[str]
        words that delimit candidate phrases.

    Returns
    -------
    result: defaultdict[str, int]
        mapping of candidate phrase -> occurrence count.
    """
    sentences = rake_function.split_sentences(string)
    # each stopword matches case-insensitively at a word boundary and
    # must not be followed by a word character or hyphen
    stop_word_pattern = re.compile(
        '|'.join(r'\b' + word + r'(?![\w-])' for word in stopwords),
        re.IGNORECASE,
    )
    candidates = rake_function.generate_candidate_keywords(
        sentences, stop_word_pattern
    )
    counts = defaultdict(int)
    for phrase in candidates:
        counts[phrase] += 1
    return counts
def _base(string, vectorizer, **kwargs):
s = vectorizer.fit([string])
vocab = defaultdict(int)
tokens = s.build_analyzer()(string)
for t in tokens:
vocab[t] += 1
return vocab
def rake(
    string: str,
    vocab: List[str] = None,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    vocab: List[str], optional (default=None)
        List of important substrings.
        This will override `vectorizer` parameter.
    model: Object, optional (default=None)
        model must has `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
        For automatic Ngram generator.

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if model is not None and not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')
    auto_ngram = vectorizer is None
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if model:
        # aggregate the model's token attention weights; they bias the
        # RAKE word scores below
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in attention:
            d[k] += v
    else:
        d = None

    if vocab:
        vocab = [v for v in vocab if v in string]
        # count whole-word occurrences of each candidate in `string`.
        # Previously this delegated to a helper that read an undefined
        # global and raised NameError.
        counts = defaultdict(int)
        for v in vocab:
            counts[v] = len(
                re.findall(r'\b' + re.escape(v), string, flags=re.IGNORECASE)
            )
        vocab = counts
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores
    )
    sortedKeywords = sorted(
        keywordcandidates.items(), key=operator.itemgetter(1), reverse=True
    )
    # `or 1.0` guards against ZeroDivisionError when every candidate
    # scores 0; normalized scores are then simply 0.0
    total = sum(score for _, score in sortedKeywords) or 1.0
    ranked_sentences = [
        (score / total, phrase)
        for phrase, score in sortedKeywords
        if vocab[phrase] >= atleast
    ]
    return ranked_sentences[:top_k]
def attention(
    string: str,
    model,
    vocab: List[str] = None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        model must has `attention` method.
    vocab: List[str], optional (default=None)
        List of important substrings.
        This will override `vectorizer` parameter.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')
    auto_ngram = vectorizer is None
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if vocab:
        vocab = [v for v in vocab if v in string]
        # count whole-word occurrences of each candidate in `string`.
        # Previously this delegated to a helper that read an undefined
        # global and raised NameError.
        counts = defaultdict(int)
        for v in vocab:
            counts[v] = len(
                re.findall(r'\b' + re.escape(v), string, flags=re.IGNORECASE)
            )
        vocab = counts
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    # sum attention weight per token, then score each candidate phrase
    # as the sum over its tokens
    d = defaultdict(float)
    for token, weight in model.attention([string])[0]:
        d[token] += weight
    scores = [
        sum(d.get(w, 0) for w in phrase.split()) for phrase in vocab
    ]
    # `or 1.0` guards against ZeroDivisionError when no candidate token
    # received any attention weight
    total = sum(scores) or 1.0
    ranked_sentences = sorted(
        [
            (scores[i] / total, phrase)
            for i, phrase in enumerate(vocab)
            if vocab[phrase] >= atleast
        ],
        reverse=True,
    )
    return ranked_sentences[:top_k]
def similarity(
    string: str,
    model,
    vocab: List[str] = None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Sentence embedding VS keyword embedding similarity.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model has `vectorize` method.
    vocab: List[str], optional (default=None)
        List of important substrings.
        This will override `vectorizer` parameter.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')
    auto_ngram = vectorizer is None
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if vocab:
        vocab = [v for v in vocab if v in string]
        # count whole-word occurrences of each candidate in `string`.
        # Previously this delegated to a helper that read an undefined
        # global and raised NameError.
        counts = defaultdict(int)
        for v in vocab:
            counts[v] = len(
                re.findall(r'\b' + re.escape(v), string, flags=re.IGNORECASE)
            )
        vocab = counts
    elif auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])
    # cosine similarity of the whole string against every keyword;
    # shape (1, len(words)) -> take row 0, rank descending
    similarities = cosine_similarity(vectors_string, vectors_keywords)[0]
    ranked_sentences = [
        (similarities[index], words[index])
        for index in similarities.argsort()[::-1]
        if vocab[words[index]] >= atleast
    ]
    return ranked_sentences[:top_k]