Source code for botiverse.models.FastSpeech1.FastSpeech

#!/usr/bin/env python
# coding: utf-8

# # <font color="cyan">FastSpeech 1.0 </font> Implementation
# 
# In this notebook, we shall demonstrate implementing FastSpeech 1.0 from scratch for inference purposes. 

# In[1]:


'''
FastSpeech 1.0 interface and implementation from scratch in PyTorch for inference.
'''
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# 
# ![Image](https://i.imgur.com/ZDR7wqr.png)
# 

# #### Clearly, we have three main components in the model:
# 
# <div align='center'>
# <table>
#   <tr>
#     <th colspan="1"><font color="yellow">Component</font></th>
#     <th colspan="3"><font color="yellow">Encoder</font></th>
#     <th colspan="2"><font color="yellow">Length Regulator</font></th>
#     <th colspan="3"><font color="yellow">Decoder</font></th>
# 
#   </tr>
#   <tr>
#     <th colspan="1"><font color="white">Subcomponents</font></th>
#     <th colspan="1"><font color="white">Phoneme Embedding</font></th>
#     <th colspan="1"><font color="white">Positional Encoding</font></th>
#     <th colspan="1"><font color="white">FFT Block</font></th>
#     <th colspan="1"><font color="white">Duration Predictor</font></th>
#     <th colspan="1"><font color="white">LR Logic</font></th>
#     <th colspan="1"><font color="white">Positional Encoding</font></th>
#     <th colspan="1"><font color="white">FFT Block</font></th>
#     <th colspan="1"><font color="white">Linear Layer</font></th>
#   </tr>
# 
#   <tr>
#   <th colspan="1">Takes</th>
#   <td colspan="3"> <font color='cyan'>[batch_size, seq_len]</font></td>
#   <td colspan="2"> <font color='cyan'>[batch_size, seq_len, emb_dim]</font> </td>
#   <td colspan="3"> <font color='cyan'>[batch_size, new_seq_len, emb_dim]</font> </td>
#   </tr>
# 
#   <tr>
#   <th colspan="1">Yields</th>
#   <td colspan="3"> <font color='cyan'>[batch_size, seq_len, emb_dim]</font> </td>
#   <td colspan="2"> <font color='cyan'>[batch_size, new_seq_len, emb_dim]</font> </td>
#   <td colspan="3"> <font color='cyan'>[batch_size, new_seq_len, mel_num]</font> </td>
#   </tr>
# 
#   <tr>
#   <th colspan="1">Purpose</th>
#   <td colspan="3">Learn a representation for phonemes. The three layers within guarantee that underlying words, how they are ordered and other words all take part in the representation. </td>
#   <td colspan="2">Predict the duration of each phoneme and repeat accordingly</td>
#   <td colspan="3">Given time-aligned phoneme representations, learn to transform them into a mel spectrogram</td>
#   </tr>
# </table>
# </div>
# 
# Once we have the spectrogram, all that is needed is a vocoder to transform it to audio by performing an approximate inverse mel-transform, such as the Griffin-Lim algorithm, or by using a pretrained model that does the task more accurately, such as WaveGlow. We will go with the latter, but we won't implement WaveGlow from scratch.
# 

# #### We will start by implementing 
# 
# <div align='left'>
# <table>
#   <tr>
#     <th colspan="2"><font color="cyan">FFT Block</font></th>
#   </tr>
#   <tr>
#     <th colspan="1">Multi-head Attention</th>
#     <th colspan="1">Conv1DNet</th>
#   </tr>
# </table>
# </div>
# 
# #### Then we have that
# 
# <div align='left'>
# <table>
#   <tr>
#     <th colspan="3"><font color="deepskyblue">Encoder</font></th>
#     <th colspan="3"><font color="deepskyblue">Decoder</font></th>
#   </tr>
#   <tr>
#     <th colspan="1">Phoneme Emb.</th>
#     <th colspan="1">Positional Enc.</th>
#     <th colspan="1">FFT Block</th>
#     <th colspan="1">Positional Encoding</th>
#     <th colspan="1">FFT Block</th>
#     <th colspan="1">Linear Layer</th>
#   </tr>
# </table>
# </div>
# 
# #### So we follow with
# 
# <div align='left'>
# <table>
#   <tr>
#     <th colspan="2"><font color="deepskyblue">Length Regulator</font></th>
#   </tr>
#   <tr>
#     <th colspan="1">LR Logic</th>
#     <th colspan="1">Duration Predictor</th>
#   </tr>
# 
# </table>
# </div>
# 
# #### And we finally have that
# 
# <div align='left'>
# <table>
#   <tr>
#     <th colspan="3"><font color="deepskyblue">Model</font></th>
#   </tr>
#   <tr>
#     <th colspan="1">Encoder</th>
#     <th colspan="1">Length Regulator</th>
#     <th colspan="1">Decoder</th>
#   </tr>
# </table>
# </div>

# ### <font color="cyan">1. Feedforward-Transformer Block </font>

# ![img](https://i.imgur.com/Kb88BwT.png)

# #### <font color="white"> Multi-head Self-Attention </font>

# <div align='center'>
# 
# $MultiHead(Q, K, V) = \text{Concat}(head_1, \ldots, head_h) \cdot W^O$
# 
# where
# 
#   $head = \text{Attention}(qW_q, kW_k, vW_v) =\text{Attention}(Q, K, V)$
# 
#   such that
# 
#   $\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) \cdot V$
# 
#   
#   </div>

# In[2]:


import torch
import torch.nn as nn
import numpy as np

class MultiHeadAttention(nn.Module):
    '''
    Multi-Head Attention module with a residual connection and layer normalization. Used as self-attention in the FFT block of the encoder and decoder.

    :param num_head: Number of attention heads. Must divide ``h_dim``.
    :type num_head: int
    :param emb_dim: Input encoder/decoder embeddings dimensions.
    :type emb_dim: int
    :param h_dim: Hidden dimension (output dimension of the linear layers Wq, Wk, Wv).
    :type h_dim: int
    :param dropout: Dropout probability. Default is 0.1.
    :type dropout: float, optional
    :raises ValueError: If ``h_dim`` is not divisible by ``num_head``.
    '''
    def __init__(self, num_head, emb_dim, h_dim, dropout=0.1):
        super().__init__()

        if h_dim % num_head != 0:
            raise ValueError(f"h_dim ({h_dim}) must be divisible by num_head ({num_head})")

        self.num_head = num_head
        self.h_dim = h_dim                            # dimensionality of the concatenated heads
        self.head_dim = h_dim // num_head             # dimensionality of each head

        self.Wq = nn.Linear(emb_dim, h_dim)           # Equivalent to using head_dim, num_head times
        self.Wk = nn.Linear(emb_dim, h_dim)
        self.Wv = nn.Linear(emb_dim, h_dim)

        self.softmax = nn.Softmax(dim=2)
        # sqrt(d_k) as a plain float: it follows the input's device/dtype automatically
        # and does not add an entry to the state_dict.
        self.scale = float(self.head_dim) ** 0.5

        self.layer_norm = nn.LayerNorm(emb_dim)
        self.dropout = nn.Dropout(dropout)

        self.fc = nn.Linear(h_dim, emb_dim)

    def forward(self, q, k, v, mask):
        '''
        Pass given query, key, and value through the Multi-Head Attention module.

        :param q: Query tensor of shape [batch_size, seq_len, emb_dim].
        :type q: torch.Tensor
        :param k: Key tensor of shape [batch_size, seq_len, emb_dim].
        :type k: torch.Tensor
        :param v: Value tensor of shape [batch_size, seq_len, emb_dim].
        :type v: torch.Tensor
        :param mask: Boolean tensor of shape [batch_size, seq_len, seq_len]; attention scores at True entries are set to -inf before the softmax.
        :type mask: torch.Tensor

        :returns: Output tensor of shape [batch_size, seq_len, emb_dim].
        :rtype: torch.Tensor
        '''
        residual = q
        batch_size, seq_len = q.size(0), q.size(1)
        num_head, head_dim = self.num_head, self.head_dim

        def split_heads(x):
            # [batch_size, seq_len, h_dim] -> [batch_size * num_head, seq_len, head_dim]
            # Rows are ordered batch-major: (b0,h0), (b0,h1), ..., (b1,h0), ...
            x = x.view(batch_size, seq_len, num_head, head_dim).transpose(1, 2)
            return x.reshape(batch_size * num_head, seq_len, head_dim)

        Q = split_heads(self.Wq(q))
        K = split_heads(self.Wk(k))
        V = split_heads(self.Wv(v))

        # The mask must follow the same batch-major ordering as Q/K/V, so every
        # sequence's mask is repeated num_head times consecutively. (Tiling the whole
        # batch with `repeat` would pair heads with other sequences' masks.)
        mask = mask.repeat_interleave(num_head, dim=0)             # [batch_size * num_head, seq_len, seq_len]

        scores = torch.bmm(Q, K.transpose(1, 2)) / self.scale     # [batch_size * num_head, seq_len, seq_len]
        scores = scores.masked_fill(mask, float('-inf'))
        weights = self.dropout(self.softmax(scores))

        output = torch.bmm(weights, V)                            # [batch_size * num_head, seq_len, head_dim]

        # Merge the heads back: [batch_size, seq_len, num_head * head_dim]
        output = output.view(batch_size, num_head, seq_len, head_dim).transpose(1, 2).contiguous()
        output = output.view(batch_size, seq_len, num_head * head_dim)

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)

        return output


# #### <font color="white"> Conv1D Network </font>

# $Embeddings ⇒ Conv1D ⇒ ReLU ⇒ Conv1D ⇒ Dropout ⇒ AddNorm$

# In[3]:


import torch
import torch.nn as nn
import torch.nn.functional as F

class Conv1DNet(nn.Module):
    '''
    1D convolutional network with residual connection and layer normalization. Used in the FFT block of the encoder and decoder.

    :param inp_dim: Input dimension.
    :type inp_dim: int
    :param inner_dim: Inner dimension of the convolutional layers (output dimension of the first convolutional layer).
    :type inner_dim: int
    :param dropout: Dropout probability. Default is 0.1.
    :type dropout: float, optional
    '''
    def __init__(self, inp_dim, inner_dim, dropout=0.1):
        super().__init__()
        # Both convolutions preserve the sequence length; they only change the channel count.
        self.conv1 = nn.Conv1d(inp_dim, inner_dim, kernel_size=9, padding=4)  # n -> n - 9 + 2*4 + 1 = n
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(inner_dim, inp_dim, kernel_size=1, padding=0)  # n -> n - 1 + 2*0 + 1 = n
        self.layer_norm = nn.LayerNorm(inp_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''
        Pass given input through the 1D convolutional network.

        :param x: Input tensor of shape [batch_size, seq_len, inp_dim].
        :type x: torch.Tensor
        :returns: Output tensor of shape [batch_size, seq_len, inp_dim].
        :rtype: torch.Tensor
        '''
        residual = x                                     # [batch_size, seq_len, inp_dim]
        # Conv1d convolves over the last axis, so channels must come second.
        hidden = x.transpose(1, 2)                       # [batch_size, inp_dim, seq_len]
        hidden = self.relu(self.conv1(hidden))           # [batch_size, inner_dim, seq_len]
        hidden = self.conv2(hidden)                      # [batch_size, inp_dim, seq_len]
        hidden = hidden.transpose(1, 2)                  # [batch_size, seq_len, inp_dim]

        return self.layer_norm(self.dropout(hidden) + residual)


# #### <font color="white"> FFT Block </font>
# 
# Here we just put the two pieces together

# In[4]:


import torch.nn as nn

class FFTBlock(nn.Module):
    '''
    FFT block used in the encoder and decoder. It consists of a Multi-Head Attention module followed by a 1D convolutional network.

    :param emb_dim: Input encoder/decoder embeddings dimensions.
    :type emb_dim: int
    :param num_head: Number of heads for the Multi-Head Attention module.
    :type num_head: int
    :param h_dim: Hidden dimension (output dimension of the linear layers Wq, Wk, Wv).
    :type h_dim: int
    :param inner_dim: Inner dimension of the convolutional layers (output dimension of the first convolutional layer).
    :type inner_dim: int
    :param dropout: Dropout probability. Default is 0.1.
    :type dropout: float, optional
    '''
    def __init__(self, emb_dim, num_head, h_dim, inner_dim, dropout=0.1):
        super(FFTBlock, self).__init__()
        self.attn_out = MultiHeadAttention(num_head, emb_dim, h_dim, dropout=dropout)
        self.conv_out = Conv1DNet(emb_dim, inner_dim, dropout=dropout)

    def forward(self, input, non_pad_mask, attn_mask):
        '''
        Pass given encoder/decoder embeddings through the FFT block (self-attention, then the convolutional network).

        :param input: Input tensor of shape [batch_size, seq_len, emb_dim].
        :type input: torch.Tensor
        :param non_pad_mask: Mask multiplied into the output of each sub-layer to zero out padded positions; it must broadcast against [batch_size, seq_len, emb_dim].
        :type non_pad_mask: torch.Tensor
        :param attn_mask: Boolean mask handed to the attention module; True entries are excluded from attention.
        :type attn_mask: torch.Tensor
        :returns: Output tensor of shape [batch_size, seq_len, emb_dim].
        :rtype: torch.Tensor
        '''
        # Cast once so the mask can be multiplied with the float activations.
        keep = non_pad_mask.float()

        # Self-attention: query, key and value are all the block input.
        attended = self.attn_out(input, input, input, attn_mask) * keep

        # Position-wise convolutional network; padded positions are zeroed again.
        return self.conv_out(attended) * keep


# ### <font color="cyan">2. Encoder</font> & <font color="cyan">Decoder</font> 

# ![](https://i.imgur.com/HKBCPO7.png)

# It's obvious that they share most of the structure except for the first/last layer. Hence, we will implement both in one module. But before we do we need positional encoding.

# #### <font color="white"> Positional Encoding </font>

# 
# $$PE_{(pos, 2i)} = \sin(\frac{{pos}}{{10000^{2i/d_{\text{inp}}}}})$$
# 
# 
# $$PE_{(pos, 2i+1)} = \cos(\frac{{pos}}{{10000^{2i/d_{\text{inp}}}}})$$
# 

# The purpose of this is to give the model a notion of order: a position-dependent signal is added to the input sequence without distorting its content.

# In[5]:


import numpy as np
import torch

class SinusoidEncodingTable:
    '''
    Sinusoid encoding table used in the encoder and decoder. It is used to add positional information to the input embeddings.

    :param max_seq_len: Maximum sequence length. The table gets ``max_seq_len + 1`` rows (positions 0..max_seq_len).
    :type max_seq_len: int
    :param inp_dim: Input encoder/decoder embeddings dimensions.
    :type inp_dim: int
    :param padding_idx: Index of the padding position whose row is zeroed. Default is None.
    :type padding_idx: int, optional
    '''
    def __init__(self, max_seq_len, inp_dim, padding_idx=None):
        # One extra row so that positions 0..max_seq_len are all addressable.
        self.max_seq_len = max_seq_len + 1
        self.inp_dim = inp_dim
        self.padding_idx = padding_idx
        self.sinusoid_table = self.build_table()

    def build_table(self):
        '''
        Build the sinusoid encoding table.

        :returns: Float32 table of shape [max_seq_len + 1, inp_dim], indexed by position and embedding dimension.
        :rtype: torch.Tensor
        '''
        # Columns 2i and 2i+1 share the same frequency 1 / 10000^(2i / inp_dim).
        pair_index = np.arange(self.inp_dim) // 2
        inv_freq = 1.0 / np.power(10000, 2 * pair_index / self.inp_dim)

        # angles[pos, j] = pos * inv_freq[j]
        positions = np.arange(self.max_seq_len)
        table = np.outer(positions, inv_freq)

        # sin on even columns (2i), cos on odd columns (2i+1)
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        if self.padding_idx is not None:
            # The padding position carries no positional signal.
            table[self.padding_idx] = 0.

        return torch.tensor(table, dtype=torch.float32)


# #### <font color="white"> Dencoder </font>
# 
# This is both the encoder and decoder in one module

# In[6]:


import torch.nn as nn

class Dencoder(nn.Module):
    '''
    A single module that implements the encoder and decoder of the FastSpeech 1.0 model.

    :param mode: 'Encoder' or 'Decoder'.
    :type mode: str
    :param vocab_dim: Vocabulary dimension. None if mode is 'Decoder'.
    :type vocab_dim: int, optional
    :param max_seq_len: Maximum sequence length as needed by the sinusoid encoding table.
    :type max_seq_len: int
    :param emb_dim: Encoder/decoder embeddings dimensions.
    :type emb_dim: int
    :param num_layer: Number of FFT blocks.
    :type num_layer: int
    :param num_head: Number of heads for the Multi-Head Attention module.
    :type num_head: int
    :param h_dim: Hidden dimension (output dimension of the linear layers Wq, Wk, Wv).
    :type h_dim: int
    :param d_inner: Inner dimension of the convolutional layers (output dimension of the first convolutional layer).
    :type d_inner: int
    :param mel_num: Number of mel spectrogram bins (to map the final decoder embeddings). None if mode is 'Encoder'.
    :type mel_num: int, optional
    :param dropout: Dropout probability.
    :type dropout: float
    '''
    def __init__(self, mode, vocab_dim, max_seq_len, emb_dim, num_layer, num_head, h_dim, d_inner, mel_num, dropout):
        super(Dencoder, self).__init__()
        self.mode = mode

        # Only the encoder embeds token ids; the decoder already receives embeddings.
        # nn.Identity (instead of a lambda) keeps the module picklable and adds no parameters.
        self.embedding = nn.Embedding(vocab_dim, emb_dim, padding_idx=0) if self.mode == 'Encoder' else nn.Identity()

        # Frozen sinusoidal table; row 0 is zeroed so the padding position adds no signal.
        table = SinusoidEncodingTable(max_seq_len, emb_dim, padding_idx=0).sinusoid_table
        self.position_encoding = nn.Embedding.from_pretrained(table, freeze=True)

        self.layer_stack = nn.ModuleList([FFTBlock(emb_dim, num_head, h_dim, d_inner, dropout=dropout) for _ in range(num_layer)])

        # Only the decoder projects to mel bins.
        self.linear = nn.Linear(emb_dim, mel_num) if self.mode == 'Decoder' else nn.Identity()

    def forward(self, inp_seq, inp_seq_pos):
        '''
        Pass given input sequence through the encoder/decoder.

        :param inp_seq: Input sequence of shape [batch_size, seq_len] for Encoder or [batch_size, seq_len, emb_dim] for Decoder.
        :type inp_seq: torch.Tensor
        :param inp_seq_pos: Input sequence positions of shape [batch_size, seq_len]; 0 marks padding.
        :type inp_seq_pos: torch.Tensor
        :returns: Output tensor of shape [batch_size, seq_len, emb_dim] for Encoder or [batch_size, seq_len, mel_num] for Decoder.
        :rtype: torch.Tensor
        '''
        # Padding is detected from token ids in the encoder and from positions in the
        # decoder (whose input is already an embedding); in both cases 0 means padding.
        pad_source = inp_seq if self.mode == 'Encoder' else inp_seq_pos            # [batch_size, seq_len]
        seq_len = pad_source.size(1)

        non_pad_mask = (pad_source != 0).unsqueeze(2)                              # [batch_size, seq_len, 1]
        # True where the key position is padding, identical for every query row.
        attn_mask = (pad_source == 0).unsqueeze(1).expand(-1, seq_len, -1)         # [batch_size, seq_len, seq_len]

        # Forward
        output = self.embedding(inp_seq) + self.position_encoding(inp_seq_pos)     # [batch_size, seq_len, emb_dim]
        for layer in self.layer_stack:
            output = layer(output, non_pad_mask=non_pad_mask, attn_mask=attn_mask)

        return self.linear(output)


# ### <font color="cyan">3. Length Regulator </font>

# ![](https://i.imgur.com/xUvPwh6.png)

# #### <font color="white"> Duration Predictor </font>
# 

# $PhonemeEmb ⇒ Conv1D ⇒ LayerNorm ⇒ ReLU ⇒ Dropout ⇒ Conv1D ⇒ LayerNorm ⇒ ReLU ⇒ Dropout ⇒ Linear ⇒ ReLU$

# In[7]:


import torch.nn as nn

class DurationPredictor(nn.Module):
    '''
    Duration predictor module. It predicts the duration of each phoneme in the input sequence of encoder phoneme embeddings.

    :param inp_dim: Input dimension.
    :type inp_dim: int
    :param inner_dim: Inner dimension of the convolutional layers (output dimension of the first convolutional layer).
    :type inner_dim: int
    :param kernel_size: Kernel size of the convolutional layers.
    :type kernel_size: int
    :param padding_size: Padding size of the convolutional layers.
    :type padding_size: int
    :param dropout: Dropout probability. Default is 0.1.
    :type dropout: float, optional
    '''
    def __init__(self, inp_dim, inner_dim, kernel_size, padding_size, dropout=0.1):
        super(DurationPredictor, self).__init__()

        self.conv1d_1 = nn.Conv1d(inp_dim, inner_dim, kernel_size, padding=padding_size)
        self.layer_norm_1 = nn.LayerNorm(inner_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.conv1d_2 = nn.Conv1d(inner_dim, inner_dim, kernel_size, padding=padding_size)
        self.layer_norm_2 = nn.LayerNorm(inner_dim)
        # Predicts a scalar mel_duration per phoneme
        self.linear_layer = nn.Linear(inner_dim, 1)

    def forward(self, encoder_output):
        '''
        Pass given encoder output through the duration predictor module.

        :param encoder_output: Encoder output of shape [batch_size, seq_len, emb_dim].
        :type encoder_output: torch.Tensor
        :returns: Non-negative predicted duration of each phoneme, of shape [batch_size, seq_len].
        :rtype: torch.Tensor
        '''
        # Conv1d works on [batch, channels, length]; LayerNorm works on the last axis,
        # hence the transposes around every convolution.
        x = self.conv1d_1(encoder_output.transpose(1, 2)).transpose(1, 2)
        x = self.dropout(self.relu(self.layer_norm_1(x)))

        x = self.conv1d_2(x.transpose(1, 2)).transpose(1, 2)
        x = self.dropout(self.relu(self.layer_norm_2(x)))

        out = self.relu(self.linear_layer(x))           # [batch_size, seq_len, 1]

        # Drop only the trailing singleton axis. A bare squeeze() would also remove the
        # batch/sequence axes whenever they have size 1 and break batches larger than 1.
        return out.squeeze(-1)


# #### <font color="white">Length Regulation Logic </font>
# 

# Given $H=[h_1, h_2, ..., h_n]$ as phone embeddings from the encoder predict the duration of each phoneme $d=[d_1, d_2, ..., d_n]$ and repeat accordingly.

# In[8]:


import torch
import torch.nn as nn

class LengthRegulator(nn.Module):
    '''
    Length regulator module. It repeats the encoder outputs according to the predicted duration of each phoneme in the input sequence of encoder phoneme embeddings.

    :param inp_dim: Input dimension.
    :type inp_dim: int
    :param inner_dim: Inner dimension of the convolutional layers (output dimension of the first convolutional layer).
    :type inner_dim: int
    :param kernel_size: Kernel size of the convolutional layers.
    :type kernel_size: int
    :param padding_size: Padding size of the convolutional layers.
    :type padding_size: int
    :param dropout: Dropout probability. Default is 0.1.
    :type dropout: float, optional
    '''
    def __init__(self, inp_dim, inner_dim, kernel_size, padding_size, dropout=0.1):
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(inp_dim, inner_dim, kernel_size, padding_size, dropout)

    def forward(self, enc_output):
        '''
        Pass given encoder output through the length regulator module.

        :param enc_output: Encoder output of shape [batch_size, seq_len, emb_dim].
        :type enc_output: torch.Tensor
        :returns: Expanded encoder output of shape [batch_size, new_seq_len, emb_dim] (zero-padded up to the longest expanded sequence) and its positions of shape [batch_size, new_seq_len], counted from 1 with 0 on padded frames.
        :rtype: tuple(torch.Tensor, torch.Tensor)
        '''
        batch_size, seq_len, enc_dim = enc_output.size()

        tiny_slowdown = 0.5                                  # to prevent rounding from dropping phonemes
        durations = (self.duration_predictor(enc_output) + tiny_slowdown).int()
        # One integer frame count per phoneme; negative counts are treated as zero repeats.
        durations = durations.reshape(batch_size, seq_len).long().clamp(min=0)

        # Length of every expanded sequence and of the longest one
        new_seq_lens_per_batch = durations.sum(dim=-1)                               # [batch_size]
        new_seq_len = int(new_seq_lens_per_batch.max().item()) if batch_size > 0 else 0

        # Repeat each phoneme's hidden state `duration` times; shorter sequences stay zero-padded.
        mod_enc_output = torch.zeros((batch_size, new_seq_len, enc_dim), device=enc_output.device)
        for sequence in range(batch_size):
            expanded = torch.repeat_interleave(enc_output[sequence], durations[sequence], dim=0)
            mod_enc_output[sequence, :expanded.size(0)] = expanded

        # Positions 1..len for real frames and 0 for padded frames, per sequence.
        positions = torch.arange(1, new_seq_len + 1, device=enc_output.device).unsqueeze(0).expand(batch_size, -1)
        output_pos = positions.masked_fill(positions > new_seq_lens_per_batch.unsqueeze(1), 0)

        # [batch_size, seq_len, enc_dim] -> [batch_size, new_seq_len, enc_dim] and [batch_size, new_seq_len]
        return mod_enc_output, output_pos


# ### <font color="cyan">4. Model </font>

# ![](https://i.imgur.com/azh7cwn.png)

# #### Hyperparameters

# In[9]:



# Encoder
# NOTE: FastSpeech.__init__ passes VOCAB_SIZE both as the embedding vocabulary size
# and as the encoder's max_seq_len for the positional table.
VOCAB_SIZE = 300
ENC_EMB_DIM = 256            # also used as the encoder attention h_dim and the length regulator inp_dim
ENC_NUM_LAYER = 4            # number of FFT blocks in the encoder
ENC_NUM_HEAD = 2             # attention heads per encoder FFT block
ENC_1D_FILTER_SIZE = 1024    # inner channel count of the encoder Conv1DNet
DROPOUT_PROB = 0.1           # shared by encoder, length regulator and decoder

# Length Regulator
INNER_DIM = 256              # channel count inside the duration predictor
DP_KERNEL_SIZE = 3           # duration predictor convolution kernel size
DROPOUT_PROB = 0.1           # re-assigned with the same value as above
DP_PADDING = 1               # duration predictor convolution padding

# Decoder
MAX_SEQ_LEN = 3000           # max_seq_len of the decoder positional table
DEC_EMB_DIM = 256            # also used as the decoder attention h_dim
DEC_NUM_LAYER = 4            # number of FFT blocks in the decoder
DEC_NUM_HEAD = 2             # attention heads per decoder FFT block
DEC_1D_FILTER_SIZE = 1024    # inner channel count of the decoder Conv1DNet
MEL_NUM = 80                 # output size of the decoder's final linear layer (mel bins)


# #### <font color="white">FastSpeech</font>
# 

# In[10]:


import torch.nn as nn

class FastSpeech(nn.Module):
    '''
    FastSpeech 1.0 model. It consists of an encoder, a length regulator (which owns the duration predictor), and a decoder.
    '''
    def __init__(self):
        '''
        Initialize the structure of the FastSpeech 1.0 model from the module-level hyperparameters.
        '''
        super(FastSpeech, self).__init__()

        # NOTE: the encoder's positional table is sized with VOCAB_SIZE, so it holds
        # VOCAB_SIZE + 1 positions. The table is stored as a (frozen) embedding weight,
        # so changing this size would change a tensor shape in the state_dict.
        self.encoder = Dencoder(mode='Encoder', vocab_dim=VOCAB_SIZE, max_seq_len=VOCAB_SIZE, emb_dim=ENC_EMB_DIM,
                                num_layer=ENC_NUM_LAYER, num_head=ENC_NUM_HEAD, h_dim=ENC_EMB_DIM,
                                d_inner=ENC_1D_FILTER_SIZE, mel_num=None, dropout=DROPOUT_PROB)

        self.length_regulator = LengthRegulator(inp_dim=ENC_EMB_DIM, inner_dim=INNER_DIM, kernel_size=DP_KERNEL_SIZE,
                                                padding_size=DP_PADDING, dropout=DROPOUT_PROB)

        self.decoder = Dencoder(mode='Decoder', vocab_dim=None, max_seq_len=MAX_SEQ_LEN, emb_dim=DEC_EMB_DIM,
                                num_layer=DEC_NUM_LAYER, num_head=DEC_NUM_HEAD, h_dim=DEC_EMB_DIM,
                                d_inner=DEC_1D_FILTER_SIZE, mel_num=MEL_NUM, dropout=DROPOUT_PROB)

    def forward(self, text_seq, src_pos):
        '''
        Pass given input sequence through the FastSpeech 1.0 model.

        :param text_seq: Input sequence of symbol ids, of shape [batch_size, seq_len].
        :type text_seq: torch.Tensor
        :param src_pos: Input sequence positions (indices) of shape [batch_size, seq_len].
        :type src_pos: torch.Tensor
        :returns: Predicted mel spectrogram of shape [batch_size, new_seq_len, mel_num].
        :rtype: torch.Tensor
        '''
        # Symbol ids -> phoneme representations
        hidden = self.encoder(text_seq, src_pos)

        # Expand every representation to its predicted number of mel frames
        expanded, frame_pos = self.length_regulator(hidden)

        # Frame-level representations -> mel bins
        return self.decoder(expanded, frame_pos)


# ### <font color="cyan">5. Waveglow

# ![](https://i.imgur.com/Xl9HEgm.png)

#  </font>

# ## <font color="yellow"> Inference </font>

# In[11]:


import warnings; warnings.filterwarnings("ignore")
from scipy.io.wavfile import write
try:
    from playsound import playsound
    import waveglow
except:
    pass
import os
import gdown


# In[12]:


class TTS():
    '''
    Text-to-Speech (TTS) class that implements the FastSpeech 1.0 model and the WaveGlow model for speech synthesis.

    NOTE: ``waveglow`` and ``playsound`` are imported on a best-effort basis at module level;
    if that import failed, using them here raises ``NameError``.

    :param force_download_wg: Whether to force download the WaveGlow weights if they already seem to exist. Default is False.
    :type force_download_wg: bool, optional
    :param force_download_fs: Whether to force download the FastSpeech 1.0 weights if they already seem to exist. Default is False.
    :type force_download_fs: bool, optional
    '''
    def __init__(self, force_download_wg=False, force_download_fs=False):
        # The FastSpeech weights live next to this file. The same resolved path is used
        # for the existence check, the download target and the load.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        weights_path = os.path.join(module_dir, 'weights.pth')
        if not os.path.exists(weights_path) or force_download_fs:
            print("Weights not found. Downloading FastSpeech 1.0 weights...")
            f_id = '1G630THkg1CaAZiYAK-rcg7hLNIxu2oO5'
            gdown.download(f'https://drive.google.com/uc?export=download&confirm=pbef&id={f_id}', weights_path, quiet=False)
            print("Done.")

        self.symbols = list('_-!\'(),.:;? ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')
        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}

        # The model is wrapped in DataParallel before the checkpoint is loaded into the wrapper.
        self.model = nn.DataParallel(FastSpeech()).to(device)
        # SECURITY NOTE: torch.load unpickles the downloaded file; only use a trusted weights source.
        self.model.load_state_dict(torch.load(weights_path, map_location=device))
        self.model.eval()
        self.WaveGlow = waveglow.load.load_model(download=force_download_wg)

    def _text_to_sequence(self, text):
        '''
        Convert text into symbol ids and their 1-based positions.

        :param text: Text to convert; it is lower-cased and unsupported characters are dropped.
        :type text: str
        :returns: Symbol ids and positions 1..N, each an integer array of shape [1, N].
        :rtype: tuple(numpy.ndarray, numpy.ndarray)
        :raises ValueError: If no character of the text is a supported symbol.
        '''
        ascii_text = text.lower()
        ids = [self.symbol_to_id[s] for s in ascii_text if s in self.symbol_to_id]
        if not ids:
            raise ValueError("text contains no supported characters")

        sequence = np.array(ids, dtype=np.int64)[np.newaxis, :]
        # Positions are the 1-based index of each symbol (0 is reserved for padding),
        # not a function of the symbol id.
        sequence_inds = np.arange(1, len(ids) + 1, dtype=np.int64)[np.newaxis, :]
        return sequence, sequence_inds

    def speak(self, text, play=False, save=False):
        '''
        Pass given text through the FastSpeech 1.0 model and the WaveGlow model to generate speech.

        :param text: Text to be spoken; the number of supported characters must not exceed the encoder's positional table (300 with the default hyperparameters).
        :type text: str
        :param play: Whether to play the generated speech. Default is False.
        :type play: bool, optional
        :param save: Whether to save the generated speech as ``sample.wav`` next to this file. Default is False.
        :type save: bool, optional
        :returns: The generated speech as an audio signal.
        :rtype: numpy.ndarray
        :raises ValueError: If the text has no supported characters or is too long.
        '''
        sequence, sequence_inds = self._text_to_sequence(text)

        # Row 0 of the positional table is the padding position, so the largest usable position is rows - 1.
        max_len = self.model.module.encoder.position_encoding.num_embeddings - 1
        if sequence.shape[1] > max_len:
            raise ValueError(f"text is too long: {sequence.shape[1]} symbols, at most {max_len} are supported")

        sequence = torch.from_numpy(sequence).long().to(device)
        sequence_inds = torch.from_numpy(sequence_inds).long().to(device)

        with torch.no_grad():
            # Call the wrapped model directly (no DataParallel scatter for a single utterance).
            mel = self.model.module(sequence, sequence_inds)
        mel = mel.contiguous().transpose(1, 2)                # [batch_size, mel_num, new_seq_len]

        audio = waveglow.inference.get_wav(mel, self.WaveGlow).cpu().numpy()

        if save:
            # save the audio file in the directory of the current file
            dir_path = os.path.dirname(os.path.realpath(__file__))
            write(os.path.join(dir_path, "sample.wav"), 22050, audio.astype('int16'))

        if play:
            # Play from a private temporary file so that playback never overwrites or
            # deletes a user's (or the just-saved) sample.wav.
            import tempfile
            fd, tmp_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            try:
                write(tmp_path, 22050, audio.astype('int16'))
                playsound(tmp_path)
            finally:
                os.remove(tmp_path)
        return audio


# In[13]:


# if running from notebook
if __name__ == '__main__':
    # `get_ipython` is only defined inside an IPython/Jupyter session; when this file
    # is executed as a plain script there is no notebook to convert, so do nothing.
    try:
        ipython_shell = get_ipython()
    except NameError:
        ipython_shell = None
    if ipython_shell is not None:
        ipython_shell.system('jupyter nbconvert --to script FastSpeech.ipynb')