Source code for malaya.similarity.doc2vec

import numpy as np
from malaya.preprocessing import Tokenizer
from malaya.text.jarowinkler import JaroWinkler
from sklearn.metrics.pairwise import (
    cosine_similarity,
    euclidean_distances,
    manhattan_distances,
)
from typing import List, Tuple, Callable

# Dispatch table: user-facing similarity name -> sklearn pairwise function.
# 'cosine' yields a similarity, while 'euclidean' and 'manhattan' yield
# distances; the caller is responsible for rescaling each accordingly.
similarity_functions = dict(
    cosine=cosine_similarity,
    euclidean=euclidean_distances,
    manhattan=manhattan_distances,
)


class VectorizerSimilarity:
    """
    Text similarity computed on top of vectors produced by an encoder object.

    Parameters
    ----------
    vectorizer : object
        object exposing a ``vectorize`` method. It is called with a list of
        strings and its output is fed to the selected pairwise function.
    """

    def __init__(self, vectorizer):
        self._vectorizer = vectorizer

    def _predict(
        self,
        left_strings: List[str],
        right_strings: List[str],
        similarity: str = 'cosine',
    ):
        """
        Compute the full pairwise score matrix between two batches of texts.

        Parameters
        ----------
        left_strings : list of str
        right_strings : list of str
            must have the same length as ``left_strings``.
        similarity : str, optional (default='cosine')
            case-insensitive, one of ``'cosine'``, ``'euclidean'``,
            ``'manhattan'``.

        Returns
        -------
        result:
            pairwise matrix returned by the selected function, rescaled so
            that larger values mean "more similar".

        Raises
        ------
        ValueError
            if the two batches differ in length, or ``similarity`` is not a
            supported name.
        """
        if len(left_strings) != len(right_strings):
            raise ValueError(
                'length list of left strings must be same with length list of right strings'
            )
        identical = left_strings == right_strings

        similarity = similarity.lower()
        similarity_function = similarity_functions.get(similarity)
        if similarity_function is None:
            raise ValueError(
                "similarity only supports 'cosine', 'euclidean', and 'manhattan'"
            )

        left_vectors = self._vectorizer.vectorize(left_strings)
        if identical:
            # Same batch on both sides: vectorize only once.
            similar = similarity_function(left_vectors, left_vectors)
        else:
            right_vectors = self._vectorizer.vectorize(right_strings)
            similar = similarity_function(left_vectors, right_vectors)

        if similarity == 'cosine':
            # Shift cosine similarity from [-1, 1] into [0, 1].
            return (similar + 1) / 2
        else:
            # Turn a distance into a score: 0 distance -> 1, large -> near 0.
            return 1 / (similar + 1)

    def predict_proba(
        self,
        left_strings: List[str],
        right_strings: List[str],
        similarity: str = 'cosine',
    ):
        """
        calculate similarity for two different batch of texts.

        Parameters
        ----------
        left_strings : list of str
        right_strings : list of str
        similarity : str, optional (default='cosine')
            similarity supported. Allowed values:

            * ``'cosine'`` - cosine similarity.
            * ``'euclidean'`` - euclidean similarity.
            * ``'manhattan'`` - manhattan similarity.

        Returns
        -------
        result: List[float]
            diagonal of the pairwise matrix, i.e. one score for each
            ``(left_strings[i], right_strings[i])`` pair.
        """
        return self._predict(
            left_strings, right_strings, similarity=similarity
        ).diagonal()

    def heatmap(
        self,
        strings: List[str],
        similarity: str = 'cosine',
        visualize: bool = True,
        annotate: bool = True,
        figsize: Tuple[int, int] = (7, 7),
    ):
        """
        plot a heatmap of the pairwise similarity between the given strings.

        Parameters
        ----------
        strings : list of str
            list of strings.
        similarity : str, optional (default='cosine')
            similarity supported. Allowed values:

            * ``'cosine'`` - cosine similarity.
            * ``'euclidean'`` - euclidean similarity.
            * ``'manhattan'`` - manhattan similarity.
        visualize : bool, optional (default=True)
            if True, it will render plt.show, else return data.
        annotate : bool, optional (default=True)
            forwarded to ``seaborn.heatmap`` as ``annot``.
        figsize : tuple, (default=(7, 7))
            figure size for plot.

        Returns
        -------
        result:
            the pairwise similarity matrix when ``visualize`` is False,
            otherwise None after the plot has been shown.

        Raises
        ------
        ModuleNotFoundError
            if ``visualize`` is True and matplotlib or seaborn cannot be
            imported.
        """
        results = self._predict(strings, strings, similarity=similarity)
        if not visualize:
            return results

        # Plotting libraries are optional; only the imports are guarded so
        # that genuine runtime errors are not reported as "not installed".
        try:
            import matplotlib.pyplot as plt
            import seaborn as sns
        except ImportError as e:
            raise ModuleNotFoundError(
                'matplotlib and seaborn not installed. Please install it and try again.'
            ) from e

        sns.set()
        plt.figure(figsize=figsize)
        sns.heatmap(
            results,
            cmap='Blues',
            xticklabels=strings,
            yticklabels=strings,
            annot=annotate,
        )
        plt.show()
def vectorizer(v):
    """
    Doc2vec interface for text similarity using Encoder model.

    Parameters
    ----------
    v: object
        encoder interface object, BERT, XLNET.
        should have `vectorize` method.

    Returns
    -------
    result: malaya.similarity.doc2vec.VectorizerSimilarity

    Raises
    ------
    ValueError
        if ``v`` has no callable ``vectorize`` attribute.
    """
    # Require a callable, not just an attribute, so a misconfigured object
    # is rejected here instead of failing later on the first prediction.
    if not callable(getattr(v, 'vectorize', None)):
        raise ValueError('vectorizer must have `vectorize` method')
    return VectorizerSimilarity(v)