Source code for botiverse.preprocessors.BertEmbeddings.BertEmbeddings

from transformers import BertTokenizer, BertModel
from botiverse.models.BERT.config import BERTConfig
from botiverse.models.BERT.utils import LoadPretrainedWeights
from botiverse.models import Bert
import torch
import numpy as np


# Example: out = BertEmbedder().embed(['hello world', 'hello world'])

class BertEmbedder:
    '''An interface for converting given text into BERT embeddings.'''

    def __init__(self):
        '''Load the pre-trained tokenizer and BERT weights.'''
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Project BERT implementation; this is the model that ``embed`` runs.
        self.bert = Bert(BERTConfig())
        LoadPretrainedWeights(self.bert)
        # Not read by any method of this class; kept so that existing code
        # accessing ``.model`` keeps working.
        self.model = BertModel.from_pretrained('bert-base-uncased')

    def embed(self, sentences, random_state=42):
        '''
        Convert the given sentences into BERT embeddings.

        Each sentence is represented by the average of the first output of
        ``self.bert`` over its non-padding token positions.

        :param sentences: A sentence or a list of sentences to embed.
        :type sentences: str or list
        :param random_state: Seed passed to ``torch.manual_seed`` before the
            forward pass. Note that this re-seeds torch's global generator.
        :type random_state: int

        :return: The averaged embeddings with all size-1 dimensions squeezed
            out, so a single sentence yields a 1-D vector rather than a
            one-row matrix.
        :rtype: torch.Tensor
        '''
        torch.manual_seed(random_state)
        encoded = self.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
        # NOTE(review): assumes the first output is (batch, tokens, hidden)
        # and the second is a pooled output, which is unused here -- confirm.
        token_embeddings, _ = self.bert(**encoded)
        attention_mask = encoded['attention_mask']
        # Zero out padding positions so they do not contribute to the average.
        masked = token_embeddings * attention_mask.unsqueeze(-1)
        # Divide by the number of real tokens per sentence, not the padded length.
        mean_embeddings = masked.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
        return mean_embeddings.squeeze()

    def closest_sentence(self, new_sentence, sentence_list, retun_ind=False):
        '''
        Given a list of sentences and a new sentence, return the sentence from the list that is closest to the new sentence.

        Closeness is the cosine similarity between embeddings; the similarities
        are passed through a softmax over the candidates and the winning
        softmax value is returned as the score.

        :param new_sentence: The new sentence to compare to the list of sentences.
        :type new_sentence: str
        :param sentence_list: A non-empty list of sentences to compare the new sentence to.
        :type sentence_list: list
        :param retun_ind: Whether to return the index of the closest sentence instead of the sentence itself.
        :type retun_ind: bool

        :return: The closest sentence (or its index when ``retun_ind`` is true) and its softmax score.
        :rtype: (str or int, float)
        :raises ValueError: If ``sentence_list`` is empty.
        '''
        if len(sentence_list) == 0:
            raise ValueError('sentence_list must contain at least one sentence.')
        # Pure inference: the scores leave torch as numpy, so no graph is needed.
        with torch.no_grad():
            query = self.embed(new_sentence)
            # Sentences are embedded one at a time, so each call yields a 1-D vector.
            candidates = torch.stack([self.embed(sentence) for sentence in sentence_list])
            similarities = torch.nn.functional.cosine_similarity(query.unsqueeze(0), candidates, dim=1)
            probabilities = torch.softmax(similarities, dim=0)
        scores = probabilities.detach().cpu().numpy()
        best = np.argmax(scores)
        return (best if retun_ind else sentence_list[best]), scores[best]
    
    
# sentence-transformers is an optional dependency; only BertSentenceEmbedder
# needs it, so the rest of the module must stay importable without it.
try:
    from sentence_transformers import SentenceTransformer, util
except ImportError:
    # Keep the names defined so later references fail predictably instead of
    # with a NameError.
    SentenceTransformer = None
    util = None

class BertSentenceEmbedder:
    '''
    An interface for converting given text into sentence BERT embeddings.
    '''

    def __init__(self):
        '''
        Load the pre-trained ``all-MiniLM-L6-v2`` sentence-transformers model.

        :raises ImportError: If the optional ``sentence_transformers`` package is not installed.
        '''
        # Imported here so a missing optional dependency is reported clearly
        # at construction time, whatever happened at module import.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as err:
            raise ImportError(
                'BertSentenceEmbedder requires the optional "sentence_transformers" package.'
            ) from err
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def embed(self, sentences):
        '''
        Convert the given sentences into BERT embeddings.

        :param sentences: A sentence or a list of sentences to embed.
        :type sentences: str or list

        :return: The embeddings produced by the model's ``encode`` call, as a tensor.
        :rtype: torch.Tensor
        '''
        return self.model.encode(sentences, convert_to_tensor=True)

    def closest_sentence(self, new_sentence, sentence_list, retun_ind=False):
        '''
        Given a list of sentences and a new sentence, return the sentence from the list that is closest to the new sentence.

        Closeness is the cosine similarity between embeddings; the similarities
        are passed through a softmax over the candidates and the winning
        softmax value is returned as the score.

        :param new_sentence: The new sentence to compare to the list of sentences.
        :type new_sentence: str
        :param sentence_list: A non-empty list of sentences to compare the new sentence to.
        :type sentence_list: list
        :param retun_ind: Whether to return the index of the closest sentence instead of the sentence itself.
        :type retun_ind: bool

        :return: The closest sentence (or its index when ``retun_ind`` is true) and its softmax score.
        :rtype: (str or int, float)
        :raises ValueError: If ``sentence_list`` is empty.
        '''
        if len(sentence_list) == 0:
            raise ValueError('sentence_list must contain at least one sentence.')
        # Pure inference: the scores leave torch as numpy, so no graph is needed.
        with torch.no_grad():
            query = self.embed(new_sentence)
            # One row per candidate sentence.
            candidates = self.embed(sentence_list)
            similarities = torch.nn.functional.cosine_similarity(query.unsqueeze(0), candidates, dim=1)
            probabilities = torch.softmax(similarities, dim=0)
        # The embeddings may live on an accelerator; numpy needs host memory.
        scores = probabilities.detach().cpu().numpy()
        best = np.argmax(scores)
        return (best if retun_ind else sentence_list[best]), scores[best]