Source code for botiverse.models.T5Model.T5Model

import torch
import torch.nn as nn
import math

[docs]class AttentionModule(nn.Module): '''A class used for implementing the general transformer attention mechanism.''' def __init__(self, is_decoder=False, num_positional_encoding_buckets=32, positional_encoding_max_distance=128, d_model=768, num_heads=12, dropout_rate=0.1, has_positional_encoding=False): """ Constructs an AttentionModule instance with specific hyperparameters. :param is_decoder: Indicates if we are using a decoder. :type is_decoder: bool, optional :param num_positional_encoding_buckets: Number of positional encoding buckets. :type num_positional_encoding_buckets: int, optional :param positional_encoding_max_distance: Max distance for positional encoding. :type positional_encoding_max_distance: int, optional :param d_model: Indicates the model embeddings dimension. :type d_model: int, optional :param num_heads: States the number of attention heads. :type num_heads: int, optional :param dropout_rate: Dropout rate. :type dropout_rate: float, optional :param has_positional_encoding: If positional encoding is applied. :type has_positional_encoding: bool, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.is_decoder = is_decoder self.has_positional_encoding = has_positional_encoding self.num_positional_encoding_buckets = num_positional_encoding_buckets self.positional_encoding_max_distance = positional_encoding_max_distance self.d_model = d_model self.per_head_dim = d_model // num_heads self.n_heads = num_heads self.dropout = dropout_rate self.q = nn.Linear(self.d_model, self.d_model, bias=False) self.k = nn.Linear(self.d_model, self.d_model, bias=False) self.v = nn.Linear(self.d_model, self.d_model, bias=False) self.o = nn.Linear(self.d_model, self.d_model, bias=False) if self.has_positional_encoding: self.relative_attention_bias = nn.Embedding(self.num_positional_encoding_buckets, self.n_heads)
[docs] def relative_positional_encoding(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Provides the buckets given the relative positions. :param relative_position: Tensor of relative positions. :type relative_position: Tensor :param bidirectional: If the attention is bidirectional, is false in the decoder as the token can attend only to the tokens behid it. :type bidirectional: bool, optional :param num_buckets: Number of buckets for positional encoding. :type num_buckets: int, optional :param max_distance: Maximum distance for positional encoding. :type max_distance: int, optional :return: Relative buckets. :rtype: Tensor """ relative_buckets = 0 if bidirectional: # self attention in encoder num_buckets //= 2 relative_buckets += (relative_position > 0).to(torch.long) * num_buckets relative_position = torch.abs(relative_position) else: # self attention in decoder (you can't attend to what is yet to come) for efficient utilization of buckets relative_position = -torch.min(relative_position, torch.zeros_like(relative_position).to(self.device)) # now all is in the positive positions realm (mapping to buckets is much straightforward) max_exact = num_buckets // 2 is_small = relative_position < max_exact relative_position_if_large = max_exact + ( torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) ).to(torch.long) relative_position_if_large = torch.min( relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1).to(self.device) ) relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) return relative_buckets
[docs] def compute_bias(self, query_length, key_length): """ Computes the the relative positional bias between the queries and the keys. :param query_length: Length of the query sequance. :type query_length: int :param key_length: Length of the key sequance. :type key_length: int :return: Positional embeddings. :rtype: Tensor """ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") context_position = torch.arange(query_length, dtype=torch.long, device=self.device)[:, None] memory_position = torch.arange(key_length, dtype=torch.long, device=self.device)[None, :] relative_position = memory_position - context_position relative_position_bucket = self.relative_positional_encoding( relative_position, bidirectional=(not self.is_decoder), num_buckets=self.num_positional_encoding_buckets, max_distance=self.positional_encoding_max_distance, ) positional_embeddings = self.relative_attention_bias(relative_position_bucket) # mapping bucketes to their corresponding embeddings positional_embeddings = positional_embeddings.permute([2, 0, 1]).unsqueeze(0) return positional_embeddings
[docs] def forward( self, hidden_states, mask=None, key_value_states=None, position_bias=None): """ The forward pass of the attention layer. :param hidden_states: Tensor of the Query. :type hidden_states: Tensor :param mask: Mask to be applied on values. :type mask: Tensor, optional :param key_value_states: Tensor of the Key and Value, the default is the same as hidden_states. :type key_value_states: Tensor, optional :param position_bias: Positional bias to be added. :type position_bias: Tensor, optional :return: Returns the attention output and positional bias. :rtype: Tuple[Tensor, Tensor] """ batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] def shape(states): return states.view(batch_size, -1, self.n_heads, self.per_head_dim).transpose(1, 2) def unshape(states): return states.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model) query_states = shape(self.q(hidden_states)) if key_value_states is not None: key_states= shape(self.k(key_value_states)) value_states = shape(self.v(key_value_states)) else: key_states= shape(self.k(hidden_states)) value_states = shape(self.v(hidden_states)) scores = torch.matmul( # getting the weightes over the whole batch over all the heads and all at once query_states, key_states.transpose(3, 2) ) if position_bias is None: if not self.has_positional_encoding: position_bias = torch.zeros( (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype ).to(self.device) else: position_bias = self.compute_bias(real_seq_length, key_length) if mask is not None: position_bias = position_bias + mask # mask here is not 0 and 1 but -inf and 0 position_bias_masked = position_bias scores += position_bias_masked attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( scores ) attn_weights = nn.functional.dropout( attn_weights, p=self.dropout, training=self.training ) attn_output = unshape(torch.matmul(attn_weights, value_states)) attn_output = self.o(attn_output) return attn_output, position_bias
[docs]class NewGELUActivation(nn.Module): '''Simple interface of the Gaussian Error Linear Units (GELU) activation function'''
[docs] def forward(self, input): return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
[docs]class DenseGatedActDenseModule(nn.Module): '''A class used to implement a dense, gated activation function.''' def __init__(self, d_model=768, d_ff=2048, dropout_rate=0.1): ''' Initializes the DenseGatedActDense Module class with the given parameters which is a gated dense layer followed a dense layer. :param d_model: Input dimension to the module (and also the model embedding dimension). :type d_model: int, optional :param d_ff: Hidden layer dimension. :type d_ff: int, optional :param dropout_rate: Dropout rate. :type dropout_rate: float, optional :return: None ''' super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.wi_0 = nn.Linear(d_model, d_ff, bias=False) self.wi_1 = nn.Linear(d_model, d_ff, bias=False) self.wo = nn.Linear(d_ff, d_model, bias=False) self.dropout = nn.Dropout(dropout_rate) self.act = NewGELUActivation()
[docs] def forward(self, hidden_states): """ Performs the forward pass of the dense, gated activation function. :param hidden_states: Input tensor to the forward method. :type hidden_states: Tensor :return: Output tensor after applying dense, gated activation function. :rtype: Tensor """ hidden_gelu = self.act(self.wi_0(hidden_states)) hidden_linear = self.wi_1(hidden_states) hidden_states = hidden_gelu * hidden_linear hidden_states = self.dropout(hidden_states) hidden_states = self.wo(hidden_states) return hidden_states
[docs]class LayerNormModule(nn.Module): """ A class used to apply the Layer Normalization operation. """ def __init__(self, layer_size = 768, eps=1e-6): ''' Initializes the Layer Normalization Module class with the given parameters. :param layer_size: Dimensions of the layers to be normalized. :type layer_size: int, optional :param eps: A term added to improve numerical stability. :type eps: float, optional :return: None ''' super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.weight = nn.Parameter(torch.ones(layer_size).to(self.device)) self.epsilon = eps
[docs] def forward(self, hidden_states): mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(mean_square + self.epsilon) return self.weight * hidden_states
[docs]class FFModule(nn.Module): '''A class used to execute a Feed-Forward Neural Network module.''' def __init__(self, dropout_rate=0.1): ''' Initializes the FFModule class with the given parameters (the feed-forward block in the Transformer). :param hidden_states: Input tensor to the forward method. :type hidden_states: Tensor :return: Output tensor after applying the FFN. :rtype: Tensor ''' super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.DenseReluDense = DenseGatedActDenseModule() self.layer_norm = LayerNormModule() self.dropout = nn.Dropout(dropout_rate)
[docs] def forward(self, hidden_states): """ Perform the forward pass of the Feed-forward Neural Network module. :param hidden_states: Input tensor to the Feed Forward Network (FFN). :type hidden_states: Tensor :return: Output tensor after applying the FFN. :rtype: Tensor """ forwarded_states = self.layer_norm(hidden_states) forwarded_states = self.DenseReluDense(forwarded_states) hidden_states = hidden_states + self.dropout(forwarded_states) return hidden_states
[docs]class SelfAttentionModule(nn.Module): '''A class used to implement a self-attention mechanism.''' def __init__(self, is_decoder, dropout_rate=0.1, has_positional_encoding=False): """ Initializes the SelfAttentionModule class with the given parameters. :param is_decoder: Indicates if we are using a decoder (the decoder and encoder has differant ways to hadle the positional encoding). :type is_decoder: bool :param dropout_rate: Dropout rate. :type dropout_rate: float, optional :param has_positional_encoding: If positional encoding is applied. :type has_positional_encoding: bool, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.SelfAttention = AttentionModule(is_decoder, has_positional_encoding=has_positional_encoding) self.layer_norm = LayerNormModule() self.dropout = nn.Dropout(dropout_rate)
[docs] def forward(self, hidden_states, attention_mask=None, position_bias=None): """ Applies the self-attention to the hidden states. :param hidden_states: Tensor of the Query, Key and Value (all have the same inpus as this is self attention). :type hidden_states: Tensor :param attention_mask: Attention mask for the self-attention mechanism. :type attention_mask: Tensor, optional :param position_bias: Position bias for self-attention. :type position_bias: Tensor, optional :return: Returns the hidden states and position bias. :rtype: Tuple[Tensor, Tensor] """ normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( normed_hidden_states, mask=attention_mask, position_bias=position_bias ) hidden_states = hidden_states + self.dropout(attention_output[0]) position_bias = attention_output[1] return hidden_states, position_bias
[docs]class EncoderBlock(nn.Module): '''A class used for the encoder block of the transformer model.''' def __init__(self, has_positional_encoding=False): """ Initializes the EncoderBlock class with the given parameters. :param has_positional_encoding: If positional encoding is applied. :type has_positional_encoding: bool, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.layer = nn.ModuleList() self.layer.append(SelfAttentionModule(is_decoder=False, has_positional_encoding=has_positional_encoding)) self.layer.append(FFModule())
[docs] def forward(self, hidden_states, attention_mask=None, position_bias=None): """ Encoder block forward pass. :param hidden_states: Input tensor to the Encoder block. :type hidden_states: Tensor :param attention_mask: Attention mask for the self-attention mechanism. :type attention_mask: Tensor, optional :param position_bias: Position bias for self-attention. :type position_bias: Tensor, optional :return: Returns the hidden states and position bias. :rtype: Tuple[Tensor, Tensor] """ self_attention_outputs = self.layer[0](hidden_states, attention_mask=attention_mask, position_bias=position_bias) hidden_states = self_attention_outputs[0] position_bias = self_attention_outputs[1] hidden_states = self.layer[-1](hidden_states) return hidden_states, position_bias
[docs]class EncoderModule(nn.Module): '''A class used for the encoder of the transformer model.''' def __init__(self, embed_tokens, num_layers= 12, dropout_rate=0.1): """ Initializes the EncoderModule class with the given parameters. :param embed_tokens: The embeddings of the input tokens. :type embed_tokens: nn.Embedding :param num_layers: The number of encoder layers in the model. :type num_layers: int, optional :param dropout_rate: The dropout rate. :type dropout_rate: float, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.embed_tokens = embed_tokens self.block = nn.ModuleList( [EncoderBlock(has_positional_encoding=bool(i == 0)) for i in range(num_layers)] ) self.final_layer_norm = LayerNormModule() self.dropout = nn.Dropout(dropout_rate)
[docs] def forward(self, input_ids=None, attention_mask=None): """ Performs the forward pass of the encoder module. :param input_ids: The indices of the input sequence tokens. :type input_ids: Tensor, optional :param attention_mask: The binary mask indicating the positions where the input sequence is padded (1 for not padded, 0 for padded). :type attention_mask: Tensor, optional :return: The encoded hidden states. :rtype: Tensor """ input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) inputs_embeds = self.embed_tokens(input_ids) if attention_mask is None: attention_mask = torch.ones(input_shape).to(self.device) final_attention_mask = attention_mask.to(dtype=torch.float32) final_attention_mask = (1.0 - final_attention_mask) * torch.finfo(torch.float32).min hidden_states = self.dropout(inputs_embeds) position_bias = None for block in self.block: layer_outputs = block( hidden_states, attention_mask = final_attention_mask, position_bias = position_bias ) hidden_states = layer_outputs[0] position_bias = layer_outputs[1] hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) return hidden_states
[docs]class CrossAttentionModule(nn.Module): '''A class used for the cross-attention module of the transformer model.''' def __init__(self, dropout_rate=0.1): """ Initializes the CrossAttentionModule class with the given parameters. :param dropout_rate: Dropout rate. :type dropout_rate: float, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.EncDecAttention = AttentionModule(is_decoder=True, has_positional_encoding=False) self.layer_norm = LayerNormModule() self.dropout = nn.Dropout(dropout_rate)
[docs] def forward(self, hidden_states, key_value_states, encoder_attention_mask=None): """ Applies cross-attention where the query comes from the hidden states and the key and value come from key_value_states. :param hidden_states: Input tensor to be used for the query. :type hidden_states: Tensor :param key_value_states: Input tensor to be used for the key and value. :type key_value_states: Tensor :param encoder_attention_mask: Attention mask for the cross-attention mechanism. :type encoder_attention_mask: Tensor, optional :return: Returns the hidden states and position bias. :rtype: Tuple[Tensor, Tensor] """ normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( normed_hidden_states, mask=encoder_attention_mask, key_value_states=key_value_states ) hidden_states = hidden_states + self.dropout(attention_output[0]) position_bias = attention_output[1] return hidden_states, position_bias
[docs]class DecoderBlock(nn.Module): '''A class used for the decoder block of the transformer model.''' def __init__(self, has_positional_encoding=False): """ Initializes the DecoderModule class with the given parameters. :param has_positional_encoding: If positional encoding is applied. :type has_positional_encoding: bool, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.layer = nn.ModuleList() self.layer.append(SelfAttentionModule(is_decoder=True, has_positional_encoding=has_positional_encoding)) self.layer.append(CrossAttentionModule()) self.layer.append(FFModule())
[docs] def forward( self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None ): """ Performs the forward pass of the decoder block. :param hidden_states: The hidden states from the previous decoder block (or the input embeddings if this is the first decoder block). :type hidden_states: Tensor :param attention_mask: The binary mask of the decoder sequance, indicating the positions where the input sequence is padded (1 for not padded, 0 for padded). :type attention_mask: Tensor, optional :param position_bias: The positional bias for self-attention mechanism. :type position_bias: Tensor, optional :param encoder_hidden_states: The output hidden states from the encoder module. :type encoder_hidden_states: Tensor, optional :param encoder_attention_mask: The binary mask of the encoder sequance, indicating where the input has been padded. :type encoder_attention_mask: Tensor, optional :return: The hidden states and position bias after the forward pass of the decoder block. :rtype: Tuple[Tensor, Tensor] """ self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias ) hidden_states = self_attention_outputs[0] position_bias = self_attention_outputs[1] cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask ) hidden_states = cross_attention_outputs[0] hidden_states = self.layer[-1](hidden_states) return hidden_states, position_bias
[docs]class DecoderModule(nn.Module): '''A class used to implement the decoder part of the transformer.''' def __init__(self, embed_tokens, num_layers= 12, dropout_rate=0.1): """ Initializes the DecoderModule class with the given parameters. :param embed_tokens: The embeddings of the decoder input sequance tokens. :type embed_tokens: nn.Embedding :param num_layers: The number of decoder layers in the model. :type num_layers: int, optional :param dropout_rate: The dropout rate. :type dropout_rate: float, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.embed_tokens = embed_tokens self.block = nn.ModuleList( [DecoderBlock(has_positional_encoding=bool(i == 0)) for i in range(num_layers)] ) self.final_layer_norm = LayerNormModule() self.dropout = nn.Dropout(dropout_rate)
[docs] def forward( self, input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None ): """ Performs the forward pass of the decoder. :param input_ids: The indices of the input sequence tokens in the vocabulary. :type input_ids: Tensor, optional :param attention_mask: The binary mask of the decoder sequance, indicating the positions where the input sequence is padded (1 for not padded, 0 for padded). :type attention_mask: Tensor, optional :param encoder_hidden_states: the output of the encoder module. :type encoder_hidden_states: Tensor, optional :param encoder_attention_mask: The binary mask of the encoder sequance, indicating the positions where the input sequence is padded (1 for not padded, 0 for padded). :type encoder_attention_mask: Tensor, optional :return: Decoded output hidden states. :rtype: Tensor """ input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) inputs_embeds = self.embed_tokens(input_ids) if attention_mask is None: attention_mask = torch.ones(input_shape).to(self.device) seq_length = input_shape[1] lower_triangular_mask = torch.tril(torch.ones((seq_length, seq_length))).view(1, 1, seq_length, seq_length).to(self.device) final_attention_mask = lower_triangular_mask * attention_mask final_attention_mask = (1.0 - final_attention_mask) * torch.finfo(torch.float32).min if encoder_attention_mask is None: encoder_attention_mask = torch.ones((encoder_hidden_states.size(0), encoder_hidden_states.size(1))).to(self.device) encoder_attention_mask = encoder_attention_mask.to(dtype=torch.float32) encoder_attention_mask = (1.0 - encoder_attention_mask) * torch.finfo(torch.float32).min hidden_states = self.dropout(inputs_embeds) position_bias = None for block in self.block: layer_outputs = block( hidden_states, attention_mask=final_attention_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask ) hidden_states = layer_outputs[0] position_bias = layer_outputs[1] hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) return hidden_states
[docs]class T5Model(nn.Module): '''A class to represent the T5 transformer model.''' def __init__(self, vocab_size=32128, d_model=768): """ Initializes the T5 Model with the given parameters. :param vocab_size: The size of the vocabulary. :type vocab_size: int, optional :param d_model: The dimensionality of the input embedding. :type d_model: int, optional :return: None """ super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.shared = nn.Embedding(vocab_size, d_model) self.encoder = EncoderModule(self.shared) self.decoder = DecoderModule(self.shared) self.lm_head = nn.Linear(d_model, vocab_size, bias=False) self.softmax = nn.Softmax(dim=-1) self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
[docs] def forward( self, input_ids=None, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None): """ Performs the forward pass of the T5 transformer model. :param input_ids: The IDs of the input tokens. :type input_ids: Tensor, optional :param attention_mask: The binary mask of the encoder sequance, indicating the positions where the input sequence is padded (1 for not padded, 0 for padded). :type attention_mask: Tensor, optional :param decoder_input_ids: The ids of the decoder input tokens. :type decoder_input_ids: Tensor, optional :param decoder_attention_mask: The binary mask of the decoder sequance, indicating the positions where the input sequence is padded (1 for not padded, 0 for padded). :type decoder_attention_mask: Tensor, optional :return: The token probabilities of the output sequence. :rtype: Tensor """ # encoder encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask ) # decoder decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, encoder_hidden_states=encoder_outputs, encoder_attention_mask=attention_mask, ) # lm_head lm_logits = self.lm_head(decoder_outputs) output = self.softmax(lm_logits) loss = self.loss_fn(lm_logits.view(-1, lm_logits.size(-1)), decoder_input_ids.view(-1)) return output, loss
[docs] def generate( self, input_ids=None, attention_mask=None, max_length=10, temperature=1.0): """ Generates output sequence given input_ids and attention_mask. :param input_ids: The IDs of the input tokens. :type input_ids: Tensor, optional :param attention_mask: The binary mask of the encoder sequance, indicating the positions where the input sequence is padded (1 for not padded, 0 for padded). :type attention_mask: Tensor, optional :param max_length: The maximum length of the sequence to be generated. :type max_length: int, optional :param temperature: The temperature of the softmax function, the higher its value the flatter the probability distribution of the next token will be. :type temperature: float, optional :return: The IDs of the generated tokens. :rtype: Tensor """ # put model in evaluation mode (stop dropout and backwrad propagation calculation) self.eval() encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask ) # autoregressive generation generated_ids = [torch.tensor([0]).to(self.device)] cur_ids = torch.zeros((input_ids.size(0), 1)).long().to(self.device) # genrate until max_length or eos_token for _ in range(max_length): decoder_outputs = self.decoder( input_ids=cur_ids, encoder_hidden_states=encoder_outputs ) lm_logits = self.lm_head(decoder_outputs) next_token_logits = lm_logits[:, -1, :] / temperature next_token_id = torch.multinomial(torch.softmax(next_token_logits, dim=-1), num_samples=1)[0] generated_ids.append(next_token_id) cur_ids = torch.cat([cur_ids, next_token_id.unsqueeze(-1)], dim=-1) if next_token_id == 1: break return torch.cat(generated_ids, dim=-1)