Doc2Vec#

This tutorial is available as an IPython notebook at Malaya/example/doc2vec.

This module was trained on both standard and local (including social media) language structures, so it is safe to use for both.

[1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
[2]:
%%time
import malaya
CPU times: user 3.02 s, sys: 3.72 s, total: 6.74 s
Wall time: 2.16 s
/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3372
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3890
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
[3]:
string1 = 'Pemuda mogok lapar desak kerajaan prihatin isu iklim'
string2 = 'Perbincangan isu pembalakan perlu babit kerajaan negeri'
string3 = 'kerajaan perlu kisah isu iklim, pemuda mogok lapar'
string4 = 'Kerajaan dicadang tubuh jawatankuasa khas tangani isu alam sekitar'
[4]:
news1 = 'Tun Dr Mahathir Mohamad mengakui pembubaran Parlimen bagi membolehkan pilihan raya diadakan tidak sesuai dilaksanakan pada masa ini berikutan isu COVID-19'
tweet1 = 'DrM sembang pilihan raya tak boleh buat sebab COVID 19'

Word Vector#

def wordvector(wv):
    """
    Build a Doc2vec text-similarity interface backed by a Word Vector model.

    Parameters
    ----------
    wv: object
        malaya.wordvector.WordVector object.
        should have `get_vector_by_name` method.

    Returns
    -------
    result: malaya.similarity.doc2vec.Doc2VecSimilarity
    """
    # Guard clause: the similarity interface relies on per-word vector lookup.
    supports_lookup = hasattr(wv, 'get_vector_by_name')
    if not supports_lookup:
        raise ValueError('wordvector must have `get_vector_by_name` method')
    return Doc2VecSimilarity(wv)

Using Interface#

I will use malaya.wordvector.load(model = 'news'), which is pretty accurate for local issues.

[6]:
vocab_news, embedded_news = malaya.wordvector.load(model = 'news')
w2v = malaya.wordvector.WordVector(embedded_news, vocab_news)
[8]:
%%time

doc2vec = malaya.similarity.doc2vec.wordvector(w2v)
CPU times: user 41 µs, sys: 0 ns, total: 41 µs
Wall time: 43.4 µs

predict batch of strings with probability#

def predict_proba(
    self,
    left_strings: List[str],
    right_strings: List[str],
    aggregation: Callable = np.mean,
    similarity: str = 'cosine',
    soft: bool = False,
):
    """
    calculate similarity for two different batch of texts.

    Parameters
    ----------
    left_strings : list of str
    right_strings : list of str
    aggregation : Callable, optional (default=numpy.mean)
        function applied to word vectors to produce one vector per document.
    similarity : str, optional (default='cosine')
        similarity supported. Allowed values:

        * ``'cosine'`` - cosine similarity.
        * ``'euclidean'`` - euclidean similarity.
        * ``'manhattan'`` - manhattan similarity.
    soft: bool, optional (default=False)
        word not inside word vector will replace with nearest word if True, else, will skip.

    Returns
    -------
    result: List[float]
    """
[9]:
%%time

doc2vec.predict_proba([string1], [string2])
CPU times: user 1.97 ms, sys: 0 ns, total: 1.97 ms
Wall time: 10.3 ms
[9]:
array([0.89971105])
[10]:
%%time

doc2vec.predict_proba([string1, string2], [string3, string4])
CPU times: user 0 ns, sys: 2.39 ms, total: 2.39 ms
Wall time: 1.79 ms
[10]:
array([0.91679387, 0.82348571])
[11]:
%%time

doc2vec.predict_proba([string1, string2], [string3, tweet1])
CPU times: user 992 µs, sys: 952 µs, total: 1.94 ms
Wall time: 1.23 ms
[11]:
array([0.91679387, 0.78542261])

Vectorizer Model#

We can use any Vectorizer models provided by Malaya with the encoder similarity interface, for example, BERT or XLNET. Again, these encoder models are not trained to do similarity classification; they just encode the strings into vector representations.

def vectorizer(v):
    """
    Doc2vec interface for text similarity using Encoder model.

    Parameters
    ----------
    v: object
        encoder interface object, BERT, XLNET.
        should have `vectorize` method.

    Returns
    -------
    result: malaya.similarity.doc2vec.VectorizerSimilarity
    """
    # NOTE(review): implementation body is not shown in this excerpt;
    # presumably it validates `v` has a `vectorize` method and returns
    # VectorizerSimilarity(v) — confirm against the library source.

using ALXLNET#

[15]:
alxlnet = malaya.transformer.load(model = 'alxlnet')
doc2vec_vectorizer = malaya.similarity.doc2vec.vectorizer(alxlnet)

predict for 2 strings with probability#

def predict_proba(
    self,
    left_strings: List[str],
    right_strings: List[str],
    similarity: str = 'cosine',
):
    """
    calculate similarity for two different batch of texts.

    Parameters
    ----------
    left_strings : list of str
    right_strings : list of str
    similarity : str, optional (default='cosine')
        similarity supported. Allowed values:

        * ``'cosine'`` - cosine similarity.
        * ``'euclidean'`` - euclidean similarity.
        * ``'manhattan'`` - manhattan similarity.

    Returns
    -------
    result: List[float]
    """
[13]:
%%time

doc2vec_vectorizer.predict_proba([string1], [string2])
CPU times: user 457 ms, sys: 99.9 ms, total: 557 ms
Wall time: 286 ms
[13]:
array([0.8906925], dtype=float32)
[14]:
%%time

doc2vec_vectorizer.predict_proba([string1, string2], [string3, string4])
CPU times: user 386 ms, sys: 29.7 ms, total: 416 ms
Wall time: 49 ms
[14]:
array([0.6371902, 0.6291744], dtype=float32)