Source code for botiverse.preprocessors.BoW.BoW

from gensim.utils import tokenize
import numpy as np
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; BoW uses it to reduce every token to its stem.
stemmer = PorterStemmer()


class BoW():
    '''
    An interface for transforming sentences into bag-of-words vectors.

    Every sentence is tokenized with ``gensim.utils.tokenize`` (lower-cased) and each token is
    stemmed with the module-level Porter stemmer before it is looked up in the vocabulary, so
    vocabulary entries have to be lower-cased stems in order to be matched.
    '''
    # Tokens that are never counted. The tokenizer is expected to drop punctuation on its own;
    # this filter is kept as a safeguard.
    _IGNORED_TOKENS = ('?', '!', '.', ',')

    def __init__(self, binary=False):
        '''
        Initialize the BoW transformer.

        :param binary: Whether to use binary BoW vectors instead of frequency BoW vectors.
        :type binary: bool
        '''
        # The vocabulary is unknown until transform_list() is called (or all_words is assigned).
        self.all_words = None
        self.binary = binary

    def _stem_tokens(self, sentence):
        '''
        Tokenize a sentence and return the list of its lower-cased, stemmed tokens.

        :param sentence: A string of words.
        :type sentence: str

        :return: The stemmed tokens of the sentence, in order of appearance.
        :rtype: list
        '''
        tokens = tokenize(sentence, to_lower=True)
        return [stemmer.stem(token.lower()) for token in tokens if token not in self._IGNORED_TOKENS]

    @staticmethod
    def _index_vocabulary(all_words):
        '''
        Build a word -> column index mapping for the given vocabulary.

        :param all_words: A list of all the words in the vocabulary.
        :type all_words: list

        :return: A dictionary mapping each word to the position of its first occurrence.
        :rtype: dict
        '''
        word_to_index = {}
        for position, word in enumerate(all_words):
            # setdefault keeps the first occurrence of a duplicated word, like list.index does.
            word_to_index.setdefault(word, position)
        return word_to_index

    def _fill_row(self, row, sentence, word_to_index):
        '''
        Accumulate the BoW representation of one sentence into ``row`` (modified in place).

        :param row: A one-dimensional numpy array with one entry per vocabulary word.
        :type row: numpy.ndarray
        :param sentence: A string of words.
        :type sentence: str
        :param word_to_index: Mapping from vocabulary word to its column index.
        :type word_to_index: dict
        '''
        for word in self._stem_tokens(sentence):
            word_index = word_to_index.get(word)
            if word_index is None:
                # Out-of-vocabulary tokens carry no information for the vector; skip them.
                continue
            if self.binary:
                # Binary mode only records presence, no matter how often the word occurs.
                row[word_index] = 1
            else:
                row[word_index] += 1

    def transform_list(self, sentence_list, all_words):
        '''
        Given a list of sentences, return a table of BoW vectors (one for each sentence) in the form of a numpy array.

        The given vocabulary is stored on the instance and is used by later calls to ``transform``.
        Tokens that are not part of the vocabulary are ignored.

        :param sentence_list: A list of sentences (strings); each one is tokenized and stemmed here.
        :type sentence_list: list
        :param all_words: A list of all the words in the vocabulary.
        :type all_words: list

        :return: A table of BoW vectors (one for each sentence) in the form of a numpy array
            of shape (len(sentence_list), len(all_words)).
        :rtype: numpy.ndarray
        '''
        self.all_words = all_words
        # Build the lookup table once instead of scanning the vocabulary list for every token.
        word_to_index = self._index_vocabulary(all_words)
        BoWs = np.zeros((len(sentence_list), len(all_words)), dtype=np.float64)
        for i, sentence in enumerate(sentence_list):
            # BoWs[i] is a view, so filling it updates the table in place.
            self._fill_row(BoWs[i], sentence, word_to_index)
        return BoWs

    def transform(self, sentence):
        '''
        Given a sentence, return its BoW vector as a numpy array.

        The vocabulary stored by the last call to ``transform_list`` is used. Tokens that are
        not part of the vocabulary are ignored.

        :param sentence: A string of words
        :type sentence: str

        :return: A BoW vector for the given sentence, of shape (1, len(all_words)).
        :rtype: numpy.ndarray

        :raises ValueError: If no vocabulary has been set yet.
        '''
        if self.all_words is None:
            raise ValueError('The vocabulary is not set; call transform_list() before transform().')
        # Rebuilt on every call so that a vocabulary that was reassigned or mutated is honoured.
        word_to_index = self._index_vocabulary(self.all_words)
        vector = np.zeros((1, len(self.all_words)), dtype=np.float64)
        self._fill_row(vector[0], sentence, word_to_index)
        return vector