Source code for botiverse.preprocessors.TF_IDF_GLOVE.TF_IDF_GLOVE

from gensim.utils import tokenize
import numpy as np
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

from botiverse.preprocessors import GloVe, TF_IDF

[docs]class TF_IDF_GLOVE():
    '''
    An interface for transforming sentences into idf-glove vectors by weighting word GloVe vectors by their tf-idf.
    '''
    def __init__(self, force_download=False):
        '''
        Initialize the GloVe and TF-IDF transformer and download the embeddings if needed.
        
        :param force_download: If True, download the embeddings even if they already exist.
        :type force_download: bool
        '''
        self.glove = GloVe(force_download)
        self.glove_dict = self.glove.glove_dict
        self.tf_idf = TF_IDF()
        #self.glove_dict = None
        self.tf = None
        self.idf = None
        self.all_words = None


[docs]    def transform_list(self, sentence_list, all_words):
        '''
        Given a list of tokenized sentences, return a table of idf-GloVe vectors (one for each sentence) in the form of a numpy array.
        This also initializes the tf and idf tables of the class for use in the transform() method.
        
        :param sentence_list: A list of tokenized sentences
        :type sentence_list: list
        :param all_words: A list of all the words in the corpus
        :type all_words: list
        
        :return: A 2D numpy array of idf-GloVe vectors 
        :rtype: numpy.ndarray
        '''
        self.all_words = all_words
        # just to set tf and idf
        self.tf_idf.transform_list(sentence_list, all_words)
        self.tf, self.idf = self.tf_idf.tf, self.tf_idf.idf
        # make a numpy array of the sentence vectors
        X = np.zeros((len(sentence_list), 50))
        for i, sentence in enumerate(sentence_list):
                X[i] = self.transform(sentence)
        return X

[docs]    def transform(self, sentence):
        '''
        Given a sentence, return its idf-GloVe vector as a numpy array by weighting the GloVe vectors of the words in the sentence by their idf then averaging.
        
        :param sentence: A string of words
        :type sentence: str
        
        :return: A numpy array of the idf-GloVe vector
        :rtype: numpy.ndarray
        
        '''
        tokens = list(tokenize(sentence, to_lower=True))
        tokens_s = [stemmer.stem(word.lower()) for word in tokens if word not in ['?', '!', '.', ',']]

        # get the idf of each word in the sentence
        weights = np.zeros((len(tokens)))
        for i, word in enumerate(tokens_s):
            # if word in self.all_words:
            if word in self.all_words and tokens[i] in self.glove_dict:
                weights[i] = self.idf[self.all_words.index(word)]
        
        # normalize the weights
        weights = weights / np.sum(weights)
        
        # get the weighted average of the glove vectors
        avg_vector = np.zeros(50)
        for i, word in enumerate(tokens):
            if word in self.glove_dict:
                avg_vector += weights[i] * self.glove_dict[word]
        
        avg_vector = avg_vector[np.newaxis, :]
        return avg_vector