Source code for botiverse.preprocessors.Special.ConverseBot_Preprocessor.ConverseBot_Preprocessor

import pandas as pd
from transformers import AutoTokenizer
import numpy as np
import json

class ConverseBot_Preprocessor:
    """An interface that provides the required preprocessing for the ConverseBot bot."""

    def __init__(self, file_path=None, dataset=None):
        """
        Initializes a ConverseBot_Preprocessor instance with an optional training dataset.

        The dataset structure is an array of multiturn conversations and each multiturn
        conversation is an array of strings, e.g.,
        [["hi","hello","how are you?"], ["good","how about you?","i am fine"]]

        If both arguments are given, ``file_path`` takes precedence. If neither is given,
        ``self.data`` is ``None``; the single-string helpers (``process_string``,
        ``tokenize_string``, ``decode_tokens``) do not read it.

        :param file_path: Path to the .json file that contains the conversation array (use it or dataset).
        :type file_path: str, optional
        :param dataset: Dataset to be processed (use it or file_path).
        :type dataset: list of list of str, optional
        :returns: None
        """
        if file_path:
            # Read explicitly as UTF-8 so non-ASCII text does not depend on the
            # platform's default encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                self.data = json.load(f)
        else:
            self.data = dataset
        # create the t5 tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    @staticmethod
    def _split_conversation(conversation):
        """Split a conversation (two or more turns) at a random odd index into (context, target)."""
        # Odd index in [1, len - 1]: the target is always turn 2, 4, 6, ... of the conversation.
        split_index = np.random.randint(0, len(conversation) // 2) * 2 + 1
        context = "[C]".join(conversation[:split_index])
        return context, conversation[split_index]

    def process(self):
        """
        Processes the conversations dataset into tokenized (context, target) training pairs.

        For each conversation a random odd index is drawn; the turn at that index becomes the
        target and all turns before it are joined into a single context string (with [C]
        between each turn). Both strings are cleaned and then tokenized.

        Conversations with fewer than two turns are skipped, because no (context, target)
        pair can be formed from them.

        NOTE: this method replaces ``self.data`` with the returned DataFrame, so it is meant
        to be called once per instance.

        :returns: DataFrame with the columns ``target``, ``text_input_ids`` and
            ``text_attention_mask``, one row per usable conversation.
        :rtype: DataFrame
        """
        # separate conversations into text and target
        text_array = []
        target_array = []
        for conversation in self.data:
            # np.random.randint(0, 0) raises ValueError, so guard short conversations here
            if len(conversation) < 2:
                continue
            context, target = self._split_conversation(conversation)
            text_array.append(context)
            target_array.append(target)

        # create a dataframe
        self.data = pd.DataFrame({'text': text_array, 'target': target_array})

        # clean the text
        self.data['text'] = self.data['text'].apply(self.clean_string)
        self.data['target'] = self.data['target'].apply(self.clean_string)

        # tokenize the data with padding to 512 and truncation to 512
        self.data['text'] = self.data['text'].apply(self.tokenize_string)
        self.data['target'] = self.data['target'].apply(self.tokenize_string, target=True)

        # get text ids and attention masks
        self.data['text_input_ids'] = self.data['text'].apply(lambda x: x['input_ids'])
        self.data['text_attention_mask'] = self.data['text'].apply(lambda x: x['attention_mask'])
        self.data = self.data.drop(columns=['text'])

        # get target ids
        self.data['target'] = self.data['target'].apply(lambda x: x['input_ids'])
        return self.data

    # process a single string
    def clean_string(self, string):
        """
        Cleans a string by removing certain spaces and new line characters.

        New lines become spaces, leading/trailing whitespace is stripped, and a single space
        directly before ``.``, ``,``, ``!``, ``?``, ``'`` or ``’`` is removed.

        :param string: The string to clean.
        :type string: str
        :returns: The cleaned string.
        :rtype: str
        """
        string = string.replace("\n", " ")
        # remove whitespace at the beginning and end of the string
        string = string.strip()
        # remove the space before punctuation (spaces after punctuation are kept)
        for mark in (".", ",", "!", "?", "'", "’"):
            string = string.replace(" " + mark, mark)
        return string

    # tokenize a single string
    def tokenize_string(self, string, target=False):
        """
        Tokenizes a string, padded and truncated to 512 tokens.

        :param string: The string to tokenize.
        :type string: str
        :param target: Indicates whether the string is a target. If so, padding positions in
            ``input_ids`` are replaced with -100 (presumably so the training loss ignores
            them; see the model's label conventions).
        :type target: bool, optional
        :returns: Tokenized string.
        :rtype: Dict[str, Tensor]
        """
        tokens_obj = self.tokenizer(
            string,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        if target:
            input_ids = tokens_obj['input_ids']
            # in-place masking of the padding positions
            input_ids[input_ids == self.tokenizer.pad_token_id] = -100
        return tokens_obj

    # decode a single string
    def decode_tokens(self, tokens):
        """
        Decodes a sequence of tokens, dropping special tokens.

        :param tokens: The tokens to decode.
        :type tokens: Tensor
        :returns: The decoded string.
        :rtype: str
        """
        return self.tokenizer.decode(tokens, skip_special_tokens=True)

    # clean then tokenize a single string
    def process_string(self, string):
        """
        Cleans and tokenizes a conversational string.

        :param string: The conversational string to process.
        :type string: str
        :returns: Processed string in token vector form.
        :rtype: Dict[str, Tensor]
        """
        string = self.clean_string(string)
        tokens_vector = self.tokenize_string(string)
        return tokens_vector