# Source code for botiverse.bots.VoiceBot.VoiceBot
try:
import numpy as np
import json
from gtts import gTTS
import tempfile
import os
from botiverse.models import TTS
from playsound import playsound
from botiverse.models import LSTMClassifier
from botiverse.preprocessors import Vocalize, Wav2Vec, Wav2Text, BertEmbedder, Frequency, BertSentenceEmbedder
from botiverse.bots.VoiceBot.utils import voice_input
except:
pass
class VoiceBot():
    '''An interface for the vocalizer chatbot which simulates a call with a customer service bot.

    The call is driven by a state machine loaded from a json file. As read by this class,
    every node is a dict with a ``'Bot'`` message; every non-final node also carries a
    ``'max_duration'`` value and an ``'Options'`` list whose items have ``'Intent'``,
    ``'Speak'`` and ``'Next'`` keys. The call starts at node ``'A'`` and ends at node ``'Z'``.
    '''

    # Names of the entry and terminal nodes of the call state machine.
    _START_NODE = 'A'
    _FINAL_NODE = 'Z'

    def __init__(self, call_json_path, repr='BERT-Sentence'):
        '''
        Load the call data from a json file that contains the call's state machine.

        :param call_json_path: The path to the json file containing the call state machine.
        :type call_json_path: str
        :param repr: The numerical representation to use for the audio files. Can be 'BERT' or 'BERT-Sentence'.
        :type repr: str
        :raises ValueError: If ``repr`` is neither 'BERT' nor 'BERT-Sentence'.
        '''
        # Fail fast on a bad argument, before touching the disk or building any model.
        if repr not in ('BERT', 'BERT-Sentence'):
            raise ValueError(f"Invalid representation {repr}. Expected BERT or BERT-Sentence.")

        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(call_json_path, 'r', encoding='utf-8') as file:
            self.call_data = json.load(file)
        self.current_node = self._START_NODE
        self.wav2text = Wav2Text()

        # The historical attribute names are kept for existing users; ``_embedder`` is the
        # single handle used by ``simulate_call`` so that both representations work.
        # NOTE(review): assumes BertEmbedder offers the same ``closest_sentence`` method as
        # BertSentenceEmbedder -- confirm against botiverse.preprocessors.
        if repr == 'BERT':
            self.bert_embeddings = BertEmbedder()
            self._embedder = self.bert_embeddings
        else:
            self.bert_sentence_embeddings = BertSentenceEmbedder()
            self._embedder = self.bert_sentence_embeddings

    def generate_speech(self, text, offline=False):
        '''Use Google's TTS or offline FastSpeech 1.0 to play speech from the given text.

        :param text: The text to be converted into speech.
        :type text: str
        :param offline: Whether to use offline FastSpeech 1.0 to generate speech.
        :type offline: bool

        :meta private:
        '''
        if offline:
            TTS().speak(text)
            return

        tts = gTTS(text=text, lang='en', tld="us", slow=False)
        # mkstemp returns an open descriptor; close it right away so that gTTS (and, on
        # Windows, the audio player) can open the path themselves.
        fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        try:
            tts.save(mp3_path)
            # playsound is expected to block until playback ends (its default mode), which
            # is what makes deleting the file right afterwards safe.
            playsound(mp3_path)
        finally:
            try:
                os.remove(mp3_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not abort the call.
                pass

    def simulate_call(self):
        '''
        Simulate a call with a voice bot as driven by the call state machine.

        Starting from ``self.current_node``, each turn speaks the node's message, records
        and transcribes the user's answer, picks the closest expected intent, speaks that
        option's reply and moves to its next node. Reaching the final node speaks its
        message and ends the call.
        '''
        while self.current_node != self._FINAL_NODE:
            # 1 - get the current node's data and from that get the message the bot should speak
            node_data = self.call_data[self.current_node]
            self.generate_speech(node_data['Bot'])

            # 2 - get the intent options that the bot expects from the user and classify the user's response
            options = node_data['Options']
            intents = [option['Intent'] for option in options]
            max_dur = node_data['max_duration']
            recording = voice_input(record_time=int(max_dur))
            human_resp = self.wav2text.transcribe(recording)
            # 'retun_ind' (sic) is kept exactly as spelled; presumably it is the keyword the
            # embedder declares -- verify before "correcting" it.
            selected_ind, score = self._embedder.closest_sentence(human_resp, intents, retun_ind=True)
            print(f"you said: {human_resp} and the bot decided that you meant {intents[selected_ind]} with a score of {score}")

            # 3 - speak according to the chosen option
            chosen = options[selected_ind]
            self.generate_speech(chosen['Speak'])

            # 4 - go to the next state
            self.current_node = chosen['Next']

        # the final state has a different structure, bot only speaks and then the call ends
        self.generate_speech(self.call_data[self.current_node]['Bot'])