# Source code for botiverse.preprocessors.TF_IDF.TF_IDF
from gensim.utils import tokenize
import numpy as np
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance shared by every TF_IDF object; it is
# used to reduce each token to its stem before vocabulary lookup.
stemmer = PorterStemmer()
class TF_IDF():
    '''
    An interface for transforming sentences into tf-idf vectors.

    :meth:`transform_list` computes the term-frequency table and the idf
    column for a corpus and stores them (together with the vocabulary) on the
    instance. :meth:`transform` then reuses the stored vocabulary and idf
    column to vectorize a single new sentence, so it must be called after
    :meth:`transform_list`.
    '''

    # Tokens discarded before stemming.
    _IGNORED_TOKENS = ('?', '!', '.', ',')

    def __init__(self):
        self.tf = None          # tf table produced by the last transform_list call
        self.idf = None         # idf column produced by the last transform_list call
        self.all_words = None   # vocabulary passed to the last transform_list call

    def _preprocess(self, sentence):
        '''
        Lowercase and tokenize a raw sentence, drop ignored tokens and stem the rest.

        :param sentence: A raw sentence
        :type sentence: str
        :return: The stemmed tokens, in order of appearance
        :rtype: list
        '''
        return [
            stemmer.stem(token.lower())
            for token in tokenize(sentence, to_lower=True)
            if token not in self._IGNORED_TOKENS
        ]

    @staticmethod
    def _build_index(words):
        '''
        Map every word to the position of its first occurrence in ``words``.

        :param words: The vocabulary
        :type words: list
        :return: A word -> column index mapping
        :rtype: dict
        '''
        index = {}
        for position, word in enumerate(words):
            # setdefault keeps the first position, matching list.index semantics
            # should the vocabulary contain duplicates.
            index.setdefault(word, position)
        return index

    def transform_list(self, sentence_list, all_words):
        '''
        Given a list of sentences, return a table of tf-idf vectors (one row for each sentence) in the form of a numpy array.

        Each sentence is tokenized and stemmed here, so the sentences must be raw strings and
        ``all_words`` must contain the stemmed form of every token that occurs in them.

        :param sentence_list: A list of raw sentences (strings)
        :type sentence_list: list
        :param all_words: A list of all the (stemmed) words in the corpus
        :type all_words: list
        :return: A numpy array of tf-idf vectors with shape (len(sentence_list), len(all_words))
        :rtype: numpy.ndarray
        :raises ValueError: If a stemmed token of some sentence is missing from ``all_words``
        '''
        self.all_words = all_words
        # One dict lookup per token instead of a linear list scan per token.
        word_index = self._build_index(all_words)

        # Compute the normalized frequency of each word in each sentence
        tf_table = np.zeros((len(sentence_list), len(all_words)), dtype=np.float64)
        for row, sentence in enumerate(sentence_list):
            tokens = self._preprocess(sentence)
            if not tokens:
                continue
            weight = 1 / len(tokens)
            for token in tokens:
                column = word_index.get(token)
                if column is None:
                    raise ValueError(
                        f"{token!r} is not in all_words; all_words must contain "
                        "every stemmed token found in sentence_list"
                    )
                tf_table[row, column] += weight

        # Get the number of sentences in which each word appears
        document_frequency = np.sum(tf_table > 0, axis=0)
        sentence_count = len(sentence_list)
        # The +1 keeps the denominator non-zero for words that appear in no sentence.
        idf_col = np.log(sentence_count / (document_frequency + 1))

        self.tf, self.idf = tf_table, idf_col
        # Multiply the tf table by the idf column (gets broadcast across rows)
        return tf_table * idf_col

    def transform(self, sentence):
        '''
        Given a sentence, return its tf-idf vector as a numpy array.

        Uses the vocabulary and idf column stored by the last call to :meth:`transform_list`.
        Tokens that are not in the vocabulary are ignored, although they still count
        towards the sentence length used to normalize the term frequencies.

        :param sentence: A string of words
        :type sentence: str
        :return: A numpy array of the tf-idf vector with shape (1, len(all_words))
        :rtype: numpy.ndarray
        '''
        tokens = self._preprocess(sentence)
        # Built once per call so each token costs a dict lookup, not a list scan.
        word_index = self._build_index(self.all_words)

        # compute the tf-idf vector for the prompt
        tf_idf = np.zeros((1, len(self.all_words)), dtype=np.float64)
        if tokens:
            weight = 1 / len(tokens)
            for token in tokens:
                column = word_index.get(token)
                if column is None:
                    # out-of-vocabulary token: contributes nothing
                    continue
                tf_idf[0, column] += weight
        tf_idf *= self.idf
        return tf_idf