Source code for botiverse.preprocessors.TF_IDF.TF_IDF


from gensim.utils import tokenize
import numpy as np
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer shared by every TF_IDF instance; it reduces each
# token to its stem before the token is looked up in the vocabulary.
stemmer = PorterStemmer()

class TF_IDF():
    '''
    An interface for transforming sentences into tf-idf vectors.

    Call :meth:`transform_list` first: it records the vocabulary and computes the idf weights
    from a corpus. :meth:`transform` then vectorizes single sentences with those stored weights.
    '''
    # Tokens that are discarded before stemming.
    _PUNCTUATION = ('?', '!', '.', ',')

    def __init__(self):
        self.tf = None         # term-frequency table produced by the last transform_list call
        self.idf = None        # idf weight of each vocabulary word (one entry per column)
        self.all_words = None  # vocabulary; a word's position is its column index

    @staticmethod
    def _preprocess(sentence):
        '''
        Tokenize a raw sentence in lowercase, drop punctuation tokens and stem what is left.

        :param sentence: A string of words
        :type sentence: str

        :return: The stemmed tokens, in their original order
        :rtype: list
        '''
        tokens = tokenize(sentence, to_lower=True)
        return [stemmer.stem(word.lower()) for word in tokens if word not in TF_IDF._PUNCTUATION]

    @staticmethod
    def _build_word_index(all_words):
        '''
        Map every vocabulary word to its column index.

        When a word occurs more than once the first position wins, which is the
        same answer ``list.index`` gives.

        :param all_words: A list of all the words in the corpus
        :type all_words: list

        :return: A dictionary from word to column index
        :rtype: dict
        '''
        word_index = {}
        for position, word in enumerate(all_words):
            word_index.setdefault(word, position)
        return word_index

    def transform_list(self, sentence_list, all_words):
        '''
        Given a list of sentences, return a table of tf-idf vectors (one for each sentence) in the form of a numpy array.

        The vocabulary, the tf table and the idf column are stored on the instance so that
        :meth:`transform` can reuse them afterwards.

        :param sentence_list: A list of raw sentences (strings); each one is tokenized and stemmed here
        :type sentence_list: list
        :param all_words: A list of all the words in the corpus. Lookups are done with lowercased,
            stemmed tokens, so the entries must be in that form to be matched
        :type all_words: list

        :return: A numpy array of tf-idf vectors with one row per sentence and one column per word
        :rtype: numpy.ndarray
        '''
        self.all_words = all_words
        # One dictionary lookup per token instead of a linear scan of the vocabulary.
        word_index = self._build_word_index(all_words)

        # Compute the normalized frequency of each word in the document
        tf_table = np.zeros((len(sentence_list), len(all_words)), dtype=np.float64)
        for i, sentence in enumerate(sentence_list):
            words = self._preprocess(sentence)
            sentence_length = len(words)
            for word in words:
                column = word_index.get(word)
                if column is None:
                    # Words outside the vocabulary are skipped, exactly as transform() does.
                    # They still count towards sentence_length.
                    continue
                tf_table[i, column] += 1 / sentence_length

        # Get the number of documents in which each word appears
        df = np.sum(tf_table > 0, axis=0)
        N = len(sentence_list)
        # The +1 keeps the division defined for words that appear in no document. As a
        # consequence a word present in every document gets a slightly negative weight.
        idf_col = np.log(N / (df + 1))

        # Scale the tf table by the idf column (gets broadcasted over the rows)
        self.tf, self.idf = tf_table, idf_col
        tfidf_table = tf_table * idf_col

        return tfidf_table

    def transform(self, sentence):
        '''
        Given a sentence, return its tf-idf vector as a numpy array.

        :param sentence: A string of words
        :type sentence: str

        :return: A numpy array of shape (1, number of vocabulary words) holding the tf-idf vector
        :rtype: numpy.ndarray

        :raises RuntimeError: If no vocabulary and idf weights are available yet, i.e.
            :meth:`transform_list` has not been called
        '''
        if self.all_words is None or self.idf is None:
            raise RuntimeError('TF_IDF.transform() was called before transform_list(); '
                               'fit the vocabulary and idf weights on a corpus first.')

        words = self._preprocess(sentence)
        # Rebuilt on every call so that it always reflects the current self.all_words.
        word_index = self._build_word_index(self.all_words)

        # compute the tf-idf vector for the prompt
        tf_idf = np.zeros((1, len(self.all_words)), dtype=np.float64)
        for word in words:
            column = word_index.get(word)
            if column is None:
                # unknown word: contributes nothing, but still counts in len(words)
                continue
            tf_idf[0, column] += 1 / len(words)
        tf_idf *= self.idf
        return tf_idf