Source code for botiverse.preprocessors.GloVe.GloVe

import os
import gdown
from gensim.utils import tokenize
import numpy as np


class GloVe():
    '''
    An interface for transforming words into GloVe vectors.

    The embeddings are read from a text file stored next to this module (one word per
    line followed by its vector components) and kept in ``self.glove_dict``, a dictionary
    mapping each word to its vector as a ``float32`` numpy array.
    '''
    # Name of the embeddings file expected (or downloaded) next to this module.
    _EMBEDDINGS_FILENAME = 'glove.6B.50d.txt'
    # Google Drive id of the hosted embeddings file.
    _DRIVE_FILE_ID = '1BOSO0rR3ZzjWlv5WYzCux6ZluBP_vNDv'
    # Dimensionality assumed only while no vectors are available to infer it from.
    _DEFAULT_DIM = 50

    def __init__(self, force_download=False):
        '''
        Initialize the GloVe transformer and download the embeddings if needed.
        
        :param force_download: If True, download the embeddings even if they already exist.
        :type force_download: bool
        '''
        self.glove_dict = None
        self.load_glove_vectors(force_download=force_download)

    def _embedding_dim(self):
        '''
        Return the dimensionality of the loaded vectors.

        The size is read from the vectors themselves so the transformer keeps working when
        ``glove_dict`` holds vectors that are not 50-dimensional. Falls back to 50 when no
        vectors are loaded.

        :return: The length of a single word vector
        :rtype: int
        '''
        if self.glove_dict:
            return len(next(iter(self.glove_dict.values())))
        return self._DEFAULT_DIM

    # Load GLOVE word vectors
    def load_glove_vectors(self, force_download):
        '''
        Load GloVe vectors from the embeddings text file into ``self.glove_dict``.

        The file is looked up in the directory of this module and is downloaded with
        ``gdown`` first when it is missing or when ``force_download`` is set.
        By default uses 50D vectors.
        
        :param force_download: If True, download the embeddings even if they already exist.
        :type force_download: bool
        '''
        curr_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(curr_dir, self._EMBEDDINGS_FILENAME)
        if force_download or not os.path.exists(path):
            print("GLoVE embeddings not found. Downloading now...")
            f_id = self._DRIVE_FILE_ID
            gdown.download(f'https://drive.google.com/uc?export=download&confirm=pbef&id={f_id}', path, quiet=False)
            print("Done.")   
        
        glove_dict = {}         # dictionary mapping words to their GloVe vector representation
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Blank line (e.g. a trailing newline): there is no word to index.
                    continue
                # In each line, the first value is the word, the rest are the values of the vector
                glove_dict[values[0]] = np.asarray(values[1:], dtype='float32')
        self.glove_dict = glove_dict

    def transform_list(self, sentence_list, **kwargs):
        '''
        Transform a sentence list into a numpy array of GloVe vectors
        
        :param sentence_list: A list of sentences
        :type sentence_list: list
        :param kwargs: Accepted for interface compatibility and ignored.
        
        :return: A numpy array of GloVe vectors with one row per sentence
        :rtype: numpy.ndarray
        '''
        # make a numpy array of the sentence vectors
        X = np.zeros((len(sentence_list), self._embedding_dim()))
        for i, sentence in enumerate(sentence_list):
            # transform() returns a (1, dim) array; store its single row.
            X[i] = self.transform(sentence)[0]
        return X
    
    def transform(self, sentence):
        '''
        Transform a sentence into a GloVe vector by averaging the word vectors in it.

        The sentence is lower-cased and tokenized; tokens that are not in the vocabulary
        are ignored. If no token is in the vocabulary the result is a zero vector.
        
        :param sentence: a string of words
        :type sentence: str
        
        :return: the corresponding GloVe vector as an array of shape (1, dim)
        :rtype: numpy.ndarray
        '''
        vector_sum = np.zeros(self._embedding_dim())
        num_glove_tokens = 0            # num of words that occur in glove vocab
        for token in tokenize(sentence, to_lower=True):
            vector = self.glove_dict.get(token)
            if vector is not None:
                vector_sum += vector
                num_glove_tokens += 1

        # With no known token the sum is still all zeros, which is the intended result
        # and does not depend on any particular word being present in the vocabulary.
        if num_glove_tokens:
            vector_sum /= num_glove_tokens
        return vector_sum[np.newaxis, :]