Source code for botiverse.preprocessors.GloVe.GloVe
import os
import gdown
from gensim.utils import tokenize
import numpy as np
[docs]class GloVe():
'''
An interface for transforming words into GloVe vectors.
'''
def __init__(self, force_download=False):
'''
Initialize the GloVe transformer and download the embeddings if needed.
:param force_download: If True, download the embeddings even if they already exist.
:type force_download: bool
'''
self.glove_dict = None
self.load_glove_vectors(force_download=force_download)
# Load GLOVE word vectors
[docs] def load_glove_vectors(self, force_download):
'''
Load GloVe vectors from gensim into the class. By default uses 50D vectors.
:param force_download: If True, download the embeddings even if they already exist.
:type force_download: bool
'''
curr_dir = os.path.dirname(os.path.abspath(__file__))
path = curr_dir + '/glove.6B.50d.txt'
if not os.path.exists(path) or force_download:
print("GLoVE embeddings not found. Downloading now...")
f_id = '1BOSO0rR3ZzjWlv5WYzCux6ZluBP_vNDv'
gdown.download(f'https://drive.google.com/uc?export=download&confirm=pbef&id={f_id}', curr_dir + '/glove.6B.50d.txt', quiet=False)
print("Done.")
glove_dict = {} # dictionary mapping words to their GloVe vector representation
with open(path, 'r', encoding='utf-8') as f:
for line in f:
values = line.split()
# In each line, the first value is the word, the rest are the values of the vector
word = values[0]
vector = np.asarray(values[1:], dtype='float32')
glove_dict[word] = vector
self.glove_dict = glove_dict
[docs] def transform_list(self, sentence_list, **kwargs):
'''
Transform a sentence list into a numpy array of GloVe vectors
:param sentence_list: A list of sentences
:type sentence_list: list
:return: A numpy array of GloVe vectors
:rtype: numpy.ndarray
'''
# make a numpy array of the sentence vectors
X = np.zeros((len(sentence_list), 50))
for i, sentence in enumerate(sentence_list):
X[i] = self.transform(sentence)
return X
[docs] def transform(self, sentence):
'''
Transform a sentence into a GloVe vector by averaging the word vectors in it.
:param sentence: a string of words
:type sentence: str
:return: the corresponding GloVe vector
:rtype: numpy.ndarray
'''
tokens = list(tokenize(sentence, to_lower=True))
vector_sum = np.zeros(50)
num_glove_tokens = 0 # num of words that occur in glove vocab
for token in tokens:
if token in self.glove_dict:
vector_sum += self.glove_dict[token]
num_glove_tokens += 1
avg_vector = np.zeros_like(self.glove_dict['a']) if num_glove_tokens == 0 else vector_sum / num_glove_tokens
avg_vector = avg_vector[np.newaxis, :]
return avg_vector