# Source code for botiverse.preprocessors.TF_IDF_GLOVE.TF_IDF_GLOVE
from gensim.utils import tokenize
import numpy as np
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance; TF_IDF_GLOVE.transform uses it to stem
# tokens before looking them up in the corpus vocabulary.
stemmer = PorterStemmer()
from botiverse.preprocessors import GloVe, TF_IDF
class TF_IDF_GLOVE:
    '''
    An interface for transforming sentences into idf-GloVe vectors, i.e. the average of the GloVe vectors of the
    words in a sentence where each word is weighted by its idf.

    ``transform_list()`` must be called before ``transform()`` so that the vocabulary and idf table are available.
    '''

    # Length of the sentence vectors produced by this class.
    # Assumes the embeddings loaded by GloVe are 50-dimensional -- TODO confirm if other embeddings are ever used.
    VECTOR_SIZE = 50

    # Tokens that never receive a weight.
    _PUNCTUATION = frozenset('?!.,')

    def __init__(self, force_download=False):
        '''
        Initialize the GloVe and TF-IDF transformer and download the embeddings if needed.

        :param force_download: If True, download the embeddings even if they already exist.
        :type force_download: bool
        '''
        self.glove = GloVe(force_download)
        self.glove_dict = self.glove.glove_dict
        self.tf_idf = TF_IDF()
        # The three attributes below are populated by transform_list().
        self.tf = None
        self.idf = None
        self.all_words = None

    def transform_list(self, sentence_list, all_words):
        '''
        Given a list of sentences, return a table of idf-GloVe vectors (one row per sentence) as a numpy array.
        This also initializes the tf and idf tables of the class for use in the transform() method.

        :param sentence_list: A list of sentences. NOTE(review): every element is forwarded both to
            TF_IDF.transform_list() and to transform(), which tokenizes its argument as a string -- confirm that
            both accept the same representation.
        :type sentence_list: list
        :param all_words: A list of all the words in the corpus
        :type all_words: list
        :return: A 2D numpy array of idf-GloVe vectors of shape (len(sentence_list), VECTOR_SIZE)
        :rtype: numpy.ndarray
        '''
        self.all_words = all_words
        # The TF-IDF transformer is run only for its side effect of computing the tf and idf tables.
        self.tf_idf.transform_list(sentence_list, all_words)
        self.tf, self.idf = self.tf_idf.tf, self.tf_idf.idf

        # Pre-allocating keeps the (0, VECTOR_SIZE) result for an empty sentence list.
        X = np.zeros((len(sentence_list), self.VECTOR_SIZE))
        for i, sentence in enumerate(sentence_list):
            X[i] = self.transform(sentence)
        return X

    def transform(self, sentence):
        '''
        Given a sentence, return its idf-GloVe vector as a numpy array by weighting the GloVe vectors of the words
        in the sentence by their idf then averaging.

        A word contributes only if it has a GloVe vector and its stem is in the corpus vocabulary. If no word in
        the sentence qualifies (or the total weight is zero), the zero vector is returned instead of NaNs.

        :param sentence: A string of words
        :type sentence: str
        :return: A numpy array of shape (1, VECTOR_SIZE) holding the idf-GloVe vector
        :rtype: numpy.ndarray
        '''
        tokens = list(tokenize(sentence, to_lower=True))

        # One weight per token, so the weights stay aligned with ``tokens`` even when a token is skipped.
        weights = np.zeros(len(tokens))
        for i, token in enumerate(tokens):
            # Cheap O(1) checks first; the vocabulary lookup below is a linear scan.
            if token in self._PUNCTUATION or token not in self.glove_dict:
                continue
            try:
                position = self.all_words.index(stemmer.stem(token))
            except ValueError:
                # The stem is not part of the corpus vocabulary, so the token keeps a zero weight.
                continue
            weights[i] = self.idf[position]

        total_weight = np.sum(weights)
        if total_weight == 0:
            # Nothing to average: normalizing would divide by zero and yield a NaN vector.
            return np.zeros((1, self.VECTOR_SIZE))

        # normalize the weights so that they sum to one
        weights = weights / total_weight

        # get the weighted average of the glove vectors
        avg_vector = np.zeros(self.VECTOR_SIZE)
        for weight, token in zip(weights, tokens):
            # A non-zero weight is only ever assigned to tokens that have a GloVe vector.
            if weight != 0:
                avg_vector += weight * self.glove_dict[token]
        return avg_vector[np.newaxis, :]