Source code for botiverse.preprocessors.BoW.BoW
from gensim.utils import tokenize
import numpy as np
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance; the BoW class uses it to stem every token.
stemmer = PorterStemmer()
class BoW():
    '''
    An interface for transforming sentences into bag-of-words vectors.

    Every sentence is tokenized and lowercased with gensim's ``tokenize``; each token is then
    stemmed with the module-level Porter stemmer and looked up in the vocabulary. Tokens that
    are not part of the vocabulary are ignored.
    '''
    # Tokens that are dropped before stemming.
    _IGNORED_TOKENS = frozenset({'?', '!', '.', ','})

    def __init__(self, binary=False):
        '''
        Initialize the BoW transformer.

        :param binary: Whether to use binary BoW vectors (1 if the word occurs, 0 otherwise)
            instead of frequency BoW vectors (number of occurrences).
        :type binary: bool
        '''
        # Vocabulary used by transform(); set by transform_list().
        self.all_words = None
        self.binary = binary

    def _stem_tokens(self, sentence):
        '''
        Tokenize a sentence and return the list of its stemmed, lowercased tokens.

        :param sentence: A string of words.
        :type sentence: str
        :return: The stemmed tokens, in order of appearance.
        :rtype: list
        '''
        return [
            stemmer.stem(word.lower())
            for word in tokenize(sentence, to_lower=True)
            if word not in self._IGNORED_TOKENS
        ]

    @staticmethod
    def _index_vocabulary(all_words):
        '''
        Build a word -> column index mapping for the given vocabulary.

        :param all_words: A list of all the words in the vocabulary.
        :type all_words: list
        :return: A mapping from each word to the position of its first occurrence.
        :rtype: dict
        '''
        word_to_index = {}
        for position, word in enumerate(all_words):
            # setdefault keeps the first position of a duplicated word, like list.index does.
            word_to_index.setdefault(word, position)
        return word_to_index

    def _fill_row(self, row, sentence, word_to_index):
        '''
        Accumulate the BoW representation of one sentence into ``row`` in place.

        :param row: A 1-D view with one entry per vocabulary word.
        :type row: numpy.ndarray
        :param sentence: A string of words.
        :type sentence: str
        :param word_to_index: Mapping from vocabulary word to column index.
        :type word_to_index: dict
        '''
        for word in self._stem_tokens(sentence):
            position = word_to_index.get(word)
            if position is None:
                # Out-of-vocabulary word: it has no column, so it is skipped.
                continue
            if self.binary:
                # Binary mode records presence only, however often the word occurs.
                row[position] = 1
            else:
                row[position] += 1

    def transform_list(self, sentence_list, all_words):
        '''
        Given a list of sentences, return a table of BoW vectors (one for each sentence) in the form of a numpy array.

        The given vocabulary is stored on the instance so that later calls to ``transform`` use it.

        :param sentence_list: A list of sentences (strings); each one is tokenized and stemmed here.
        :type sentence_list: list
        :param all_words: A list of all the words in the vocabulary.
        :type all_words: list
        :return: A table of BoW vectors of shape (len(sentence_list), len(all_words)).
        :rtype: numpy.ndarray
        '''
        self.all_words = all_words
        word_to_index = self._index_vocabulary(all_words)
        BoWs = np.zeros((len(sentence_list), len(all_words)), dtype=np.float64)
        for i, sentence in enumerate(sentence_list):
            # BoWs[i] is a view, so filling it updates the table directly.
            self._fill_row(BoWs[i], sentence, word_to_index)
        return BoWs

    def transform(self, sentence):
        '''
        Given a sentence, return its BoW vector as a numpy array.

        :param sentence: A string of words
        :type sentence: str
        :return: A BoW vector for the given sentence, of shape (1, len(self.all_words)).
        :rtype: numpy.ndarray
        :raises TypeError: If no vocabulary has been set yet.
        '''
        if self.all_words is None:
            raise TypeError(
                'BoW vocabulary is not set: call transform_list() or assign all_words first.'
            )
        # The index is rebuilt on every call because all_words is a public attribute
        # that may have been reassigned since the last call.
        word_to_index = self._index_vocabulary(self.all_words)
        vector = np.zeros((1, len(self.all_words)), dtype=np.float64)
        self._fill_row(vector[0], sentence, word_to_index)
        return vector