Source code for botiverse.preprocessors.Vocalize.Vocalize

try:
    from gtts import gTTS
    import os
    from tqdm import tqdm
    import librosa
    import random
    import soundfile as sf
    import gdown
    import shutil
except:
    pass

class Vocalize():
    '''
    An interface for transforming words into audio files via Google's Text-to-Speech API and adding noise to them.

    All audio is written under a ``dataset`` folder in the current working directory, with one sub-folder per word.
    '''
    # Google TTS top-level domains, one per accent (australian, british, american, indian, south african)
    _TLDS = ("com.au", "co.uk", "us", "co.in", "co.za")

    def __init__(self, words):
        '''
        Initialize the Vocalize transformer by setting the input words and making the dataset.

        Note that construction immediately calls :meth:`make_dataset`, which deletes any existing ``dataset`` folder.

        :param words: A list of words to be transformed into audio files
        :type words: list
        '''
        self.words = words
        self.make_dataset()

    def make_dataset(self):
        '''
        Make a dataset of audio files for the given words by using Google's Text-to-Speech API to pronounce the word
        in australian, british, american, indian, and south african accents.

        Each pronunciation is saved as an mp3, converted with ``ffmpeg`` to a mono 16 kHz 16-bit PCM wav named
        ``dataset/<word>/<accent index>.wav``, and the intermediate mp3 is removed.

        :raises FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        :raises subprocess.CalledProcessError: If ``ffmpeg`` fails to convert a file.
        '''
        # Local import: only this method needs it, and it keeps the file's import block untouched.
        import subprocess

        # if there is a folder called dataset, delete it
        if os.path.exists('dataset'):
            shutil.rmtree('dataset')
        # make a folder for each word in folder 'dataset'
        for word in self.words:
            os.makedirs(os.path.join('dataset', word), exist_ok=True)

        # Make audio for each word
        print("Making audio files for each word...")
        for word in tqdm(self.words):
            for i, tld in enumerate(self._TLDS):
                mp3_path = f"dataset/{word}/{i}.mp3"
                wav_path = f"dataset/{word}/{i}.wav"
                tts = gTTS(text=word, lang="en", tld=tld, slow=False)                       # Sample rate of 24K
                tts.save(mp3_path)
                try:
                    # convert to wav; an argument list (no shell) keeps words containing spaces or shell
                    # metacharacters intact, and -y avoids an interactive overwrite prompt when a word
                    # appears more than once in the list
                    subprocess.run(
                        ["ffmpeg", "-y", "-loglevel", "quiet", "-i", mp3_path,
                         "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000", wav_path],
                        check=True,
                    )
                finally:
                    # remove the mp3 file whether or not the conversion succeeded
                    os.remove(mp3_path)

    @staticmethod
    def _pick_noise_type(noise_prob, traffic):
        '''
        Map a random draw to the name of the noise to mix in.

        :param noise_prob: A number in [0, 1)
        :type noise_prob: float
        :param traffic: Whether traffic noise is allowed
        :type traffic: bool
        :return: ``"cafe"`` when the draw is below 0.4, ``"traffic"`` when it is in [0.4, 0.6) and traffic is enabled,
            otherwise ``"room"``
        :rtype: str
        '''
        if noise_prob < 0.4:
            return "cafe"
        if noise_prob < 0.6 and traffic:
            return "traffic"
        return "room"

    @staticmethod
    def _fit_noise(noise_waveform, length):
        '''
        Cut a random window of exactly ``length`` samples out of the noise waveform.

        If the noise is shorter than ``length`` it is first repeated end-to-end until it is long enough, so a short
        noise recording no longer makes the random offset range negative.

        :param noise_waveform: The 1-D noise samples
        :type noise_waveform: numpy.ndarray
        :param length: The number of samples required
        :type length: int
        :return: A slice of the (possibly repeated) noise with ``length`` samples
        :rtype: numpy.ndarray
        :raises ValueError: If the noise waveform is empty while ``length`` is positive
        '''
        available = len(noise_waveform)
        if available < length:
            if available == 0:
                raise ValueError("Noise waveform is empty; cannot add noise to the audio.")
            # Local import: numpy is only needed for this fallback.
            import numpy as np
            repeats = -(-length // available)   # ceiling division
            noise_waveform = np.tile(noise_waveform, repeats)
        offset = random.randint(0, len(noise_waveform) - length)
        return noise_waveform[offset:offset + length]

    @staticmethod
    def corrupt_dataset(words=None, sample_rate=16000, traffic=False, force_download=False):
        '''
        Given a folder 'dataset' with folders each containing audio files, this randomly adds noise to each audio file and saves it
        by applying specific noise introduction logic. If noise is not found locally, it is downloaded from Google Drive.

        Every file ``<name>.<ext>`` is replaced by ``<name>_<noise type>.wav`` (24-bit PCM); the original file is removed.
        Cafe noise is used with probability 40%, traffic noise with probability 20% (only when ``traffic`` is set),
        and room noise otherwise.

        :param words: A list of words to be transformed into audio files (i.e., the folder names)
        :type words: list
        :param sample_rate: The sample rate of the audio files
        :type sample_rate: int
        :param traffic: Whether to add traffic noise to the audio files
        :type traffic: bool
        :param force_download: Whether to force download the noise dataset even if it already exists.
        :type force_download: bool
        :raises ValueError: If a noise recording contains no samples
        '''
        # if words is None then assume they are the folder names in dataset (stray files are skipped)
        if words is None:
            words = [name for name in os.listdir('dataset') if os.path.isdir(os.path.join('dataset', name))]

        curr_dir = os.path.dirname(os.path.abspath(__file__))
        noises_dir = os.path.join(curr_dir, "noises")
        # does not exist or is empty
        noises_missing = not os.path.isdir(noises_dir) or not os.listdir(noises_dir)
        if noises_missing or force_download:
            print("Noises not found. Downloading the noise sounds to be used for augmentation...")
            f_id = '13sOukAKPjoW1K0Ic-8t49P_nkO1dj1oc'
            zip_path = curr_dir + '/noises.zip'
            gdown.download(f'https://drive.google.com/uc?export=download&confirm=pbef&id={f_id}', zip_path, quiet=False)
            try:
                # extract the folder
                shutil.unpack_archive(zip_path, curr_dir)
            finally:
                # remove the zip file even if extraction failed
                if os.path.exists(zip_path):
                    os.remove(zip_path)
            print("Done.")

        # Each noise recording is loaded and resampled once per (type, sample rate) instead of once per audio file.
        noise_cache = {}

        print("Corrupting the dataset...")
        for word in tqdm(words):
            word_dir = f"dataset/{word}"
            # for each file in the folder
            for file in os.listdir(word_dir):
                source_path = f"{word_dir}/{file}"
                # target_sr equals sample_rate, or the file's own rate when sample_rate is None
                waveform, target_sr = librosa.load(source_path, sr=sample_rate)

                noise_type = Vocalize._pick_noise_type(random.random(), traffic)
                cache_key = (noise_type, target_sr)
                if cache_key not in noise_cache:
                    raw_noise, noise_sr = librosa.load(os.path.join(noises_dir, f"{noise_type}.wav"), sr=None)
                    # Resample noise waveform to match the sample rate of the target waveform
                    noise_cache[cache_key] = librosa.resample(y=raw_noise, orig_sr=noise_sr, target_sr=target_sr)

                # Trim (or repeat) the noise to match the duration of the waveform, then add it
                waveform = waveform + Vocalize._fit_noise(noise_cache[cache_key], len(waveform))

                # Save the modified waveform with noise: same name but append the noise type
                output_path = f"{word_dir}/{file.split('.')[0]}_{noise_type}.wav"
                sf.write(output_path, waveform, target_sr, 'PCM_24')

                # remove the original file
                os.remove(source_path)