Source code for botiverse.preprocessors.Vocalize.Vocalize
try:
from gtts import gTTS
import os
from tqdm import tqdm
import librosa
import random
import soundfile as sf
import gdown
import shutil
except:
pass
class Vocalize():
    '''
    An interface for transforming words into audio files via Google's Text-to-Speech API and adding noise to them.

    All files are read from and written to a folder named ``dataset`` in the current working directory,
    with one sub-folder per word.
    '''
    # gTTS top-level domains, one per requested accent
    # (australian, british, american, indian, south african - in this order).
    ACCENT_TLDS = ("com.au", "co.uk", "us", "co.in", "co.za")

    def __init__(self, words):
        '''
        Initialize the Vocalize transformer by setting the input words and making the dataset.

        Note that constructing an instance immediately calls :meth:`make_dataset`, which deletes any
        existing ``dataset`` folder before regenerating it.

        :param words: A list of words to be transformed into audio files
        :type words: list
        '''
        self.words = words
        self.make_dataset()

    def make_dataset(self):
        '''
        Make a dataset of audio files for the given words by using Google's Text-to-Speech API to pronounce the word
        in australian, british, american, indian, and south african accents.

        For every word, ``dataset/<word>/<i>.wav`` is produced for each accent index ``i``. The mp3 returned by
        gTTS is converted with ``ffmpeg`` to mono, 16 kHz, 16-bit PCM wav and the intermediate mp3 is removed.

        :raises FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        :raises subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero status.
        '''
        # Imported locally so this method does not rely on the module-level optional-import block.
        import subprocess

        # Start from a clean slate: if there is a folder called dataset, delete it.
        if os.path.exists('dataset'):
            shutil.rmtree('dataset')

        # Make a folder for each word in folder 'dataset'.
        for word in self.words:
            os.makedirs(os.path.join('dataset', word), exist_ok=True)

        # Make audio for each word.
        print("Making audio files for each word...")
        for word in tqdm(self.words):
            for i, tld in enumerate(self.ACCENT_TLDS):
                mp3_path = os.path.join('dataset', word, f"{i}.mp3")
                wav_path = os.path.join('dataset', word, f"{i}.wav")

                tts = gTTS(text=word, lang="en", tld=tld, slow=False)   # Sample rate of 24K
                tts.save(mp3_path)
                try:
                    # Argument list (no shell) so words containing spaces or shell
                    # metacharacters are passed to ffmpeg verbatim.
                    subprocess.run(
                        ["ffmpeg", "-i", mp3_path,
                         "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000",
                         wav_path, "-loglevel", "quiet"],
                        check=True,
                    )
                finally:
                    # The mp3 is only an intermediate artifact; remove it either way.
                    os.remove(mp3_path)

    @staticmethod
    def _choose_noise_type(noise_prob, traffic):
        '''
        Map a uniform random draw in [0, 1) to the name of the noise to apply.

        The outcomes are mutually exclusive: ``"cafe"`` when the draw is below 0.4, ``"traffic"`` when the
        draw is in [0.4, 0.6) and traffic noise is enabled, and ``"room"`` otherwise.

        :param noise_prob: The random draw
        :type noise_prob: float
        :param traffic: Whether traffic noise may be selected
        :type traffic: bool
        :return: One of ``"cafe"``, ``"traffic"`` or ``"room"``
        :rtype: str
        '''
        if noise_prob < 0.4:
            return "cafe"
        if noise_prob < 0.6 and traffic:
            return "traffic"
        return "room"

    @staticmethod
    def _fit_noise(noise_waveform, target_length):
        '''
        Return a randomly positioned excerpt of the noise with exactly ``target_length`` samples.

        If the noise is shorter than ``target_length`` it is repeated end-to-end first, so short noise
        recordings no longer make the random offset selection fail.

        :param noise_waveform: One-dimensional sequence of noise samples
        :param target_length: Number of samples required
        :type target_length: int
        :return: ``target_length`` consecutive noise samples
        :raises ValueError: If ``noise_waveform`` is empty
        '''
        # Imported locally so this helper does not rely on the module-level optional-import block.
        import numpy as np

        noise_length = len(noise_waveform)
        if noise_length == 0:
            raise ValueError("Noise waveform is empty; cannot add noise.")
        if noise_length < target_length:
            repeats = -(-target_length // noise_length)   # ceiling division
            noise_waveform = np.tile(noise_waveform, repeats)
        offset = random.randint(0, len(noise_waveform) - target_length)
        return noise_waveform[offset:offset + target_length]

    @staticmethod
    def corrupt_dataset(words=None, sample_rate=16000, traffic=False, force_download=False):
        '''
        Given a folder 'dataset' with folders each containing audio files, this randomly adds noise to each audio file and saves it
        by applying specific noise introduction logic. If noise is not found locally, it is downloaded from Google Drive.

        Exactly one noise is mixed into each file: cafe noise with probability 40%, traffic noise with
        probability 20% (only when ``traffic`` is True), and room noise otherwise. The result is written next
        to the original as ``<name>_<noise type>.wav`` (24-bit PCM) and the original file is removed.

        :param words: A list of words to be transformed into audio files (i.e., the folder names).
            If None, every sub-folder of 'dataset' is used.
        :type words: list
        :param sample_rate: The sample rate of the audio files
        :type sample_rate: int
        :param traffic: Whether to add traffic noise to the audio files
        :type traffic: bool
        :param force_download: Whether to force download the noise dataset even if it already exists.
        :type force_download: bool
        :raises ValueError: If a noise recording contains no samples.
        '''
        # If words is None then assume they are the folder names in dataset
        # (stray non-directory entries are skipped).
        if words is None:
            words = [name for name in os.listdir('dataset')
                     if os.path.isdir(os.path.join('dataset', name))]

        curr_dir = os.path.dirname(os.path.abspath(__file__))
        noises_dir = os.path.join(curr_dir, "noises")

        if not os.path.exists(noises_dir) or force_download:
            print("Noises not found. Downloading the noise sounds to be used for augmentation...")
            # Download the zipped noises folder from Google Drive.
            f_id = '13sOukAKPjoW1K0Ic-8t49P_nkO1dj1oc'
            archive_path = os.path.join(curr_dir, 'noises.zip')
            gdown.download(f'https://drive.google.com/uc?export=download&confirm=pbef&id={f_id}', archive_path, quiet=False)
            try:
                # Extract the folder next to this module.
                shutil.unpack_archive(archive_path, curr_dir)
            finally:
                # Remove the zip file even when extraction fails.
                if os.path.exists(archive_path):
                    os.remove(archive_path)
            print("Done.")

        # Each noise recording is loaded and resampled once per target sample rate
        # instead of once per audio file.
        noise_cache = {}

        print("Corrupting the dataset...")
        for word in tqdm(words):
            word_dir = os.path.join("dataset", word)
            # os.listdir returns a snapshot, so files written below are not revisited.
            for file in os.listdir(word_dir):
                clean_path = os.path.join(word_dir, file)
                waveform, sample_rate = librosa.load(clean_path, sr=sample_rate)

                noise_type = Vocalize._choose_noise_type(random.random(), traffic)
                cache_key = (noise_type, sample_rate)
                if cache_key not in noise_cache:
                    raw_noise, noise_sr = librosa.load(os.path.join(noises_dir, f"{noise_type}.wav"), sr=None)
                    # Resample noise waveform to match the sample rate of the target waveform.
                    noise_cache[cache_key] = librosa.resample(y=raw_noise, orig_sr=noise_sr, target_sr=sample_rate)

                # Take a random excerpt of the noise as long as the waveform and mix it in.
                waveform = waveform + Vocalize._fit_noise(noise_cache[cache_key], len(waveform))

                # Same name but with the noise type appended.
                output_path = os.path.join(word_dir, f"{file.split('.')[0]}_{noise_type}.wav")
                sf.write(output_path, waveform, sample_rate, 'PCM_24')

                # Remove the original (clean) file.
                os.remove(clean_path)