# Source code for botiverse.preprocessors.BertEmbeddings.BertEmbeddings
from transformers import BertTokenizer, BertModel
from botiverse.models.BERT.config import BERTConfig
from botiverse.models.BERT.utils import LoadPretrainedWeights
from botiverse.models import Bert
import torch
import numpy as np
# Example usage: out = BertEmbedder().embed(['hello world', 'hello world'])
class BertEmbedder():
    '''An interface for converting given text into BERT embeddings.'''

    def __init__(self):
        '''Load the pre-trained model and tokenizer'''
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Project-local BERT implementation; this is the model `embed` runs.
        self.bert = Bert(BERTConfig())
        LoadPretrainedWeights(self.bert)
        # NOTE(review): no method of this class reads `self.model`. It is kept
        # only so external code accessing the attribute keeps working; confirm
        # whether it can be dropped to avoid loading a second set of weights.
        self.model = BertModel.from_pretrained('bert-base-uncased')

    def embed(self, sentences, random_state=42):
        '''
        Convert the given sentences into BERT embeddings.

        Each sentence embedding is the mean of the model outputs over the
        non-padding positions of that sentence.

        :param sentences: A sentence or a list of sentences to convert into BERT embeddings.
        :type sentences: str or list
        :param random_state: The random state to use for reproducibility.
        :type random_state: int
        :return: The embeddings with all size-1 dimensions squeezed out, so a
            single sentence yields a 1-D tensor.
        :rtype: torch.Tensor
        '''
        # Seed on every call so repeated calls give the same result.
        torch.manual_seed(random_state)
        encoded = self.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
        token_embeddings, _ = self.bert(**encoded)
        attention_mask = encoded['attention_mask']
        # Zero out padding positions, then divide by the number of real tokens
        # so padding does not affect the mean.
        masked = token_embeddings * attention_mask.unsqueeze(-1)
        pooled = masked.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
        return pooled.squeeze()

    def closest_sentence(self, new_sentence, sentence_list, retun_ind=False):
        '''
        Given a list of sentences and a new sentence, return the sentence from the list that is closest to the new sentence.

        Closeness is the cosine similarity between embeddings; the returned
        score is the softmax of those similarities taken over the whole list.

        :param new_sentence: The new sentence to compare to the list of sentences.
        :type new_sentence: str
        :param sentence_list: A list of sentences to compare the new sentence to.
        :type sentence_list: list
        :param retun_ind: Whether to return the index of the closest sentence instead of the sentence itself.
        :type retun_ind: bool
        :return: The sentence from the list (or its index) that is closest to the new sentence and its score.
        :rtype: (str or int, float)
        :raises ValueError: If ``sentence_list`` is empty.
        '''
        if len(sentence_list) == 0:
            raise ValueError("sentence_list must contain at least one sentence")

        # Only NumPy values leave this method, so no autograd graph is needed.
        with torch.no_grad():
            query = self.embed(new_sentence)
            # Embed candidates one at a time, exactly as before, so the
            # per-sentence results are unchanged.
            candidates = torch.stack([self.embed(sentence) for sentence in sentence_list])
            similarities = torch.nn.functional.cosine_similarity(query.unsqueeze(0), candidates, dim=1)
            scores = torch.softmax(similarities, dim=0).cpu().numpy()

        best = np.argmax(scores)
        closest = best if retun_ind else sentence_list[best]
        return closest, np.max(scores)
# sentence-transformers is an optional dependency. Only a missing package is
# tolerated here; any other failure during import is allowed to propagate.
try:
    from sentence_transformers import SentenceTransformer, util
except ImportError:
    SentenceTransformer = None
    util = None


class BertSentenceEmbedder():
    '''
    An interface for converting given text into sentence BERT embeddings.
    '''

    def __init__(self):
        '''
        Load the pre-trained model and tokenizer

        :raises ImportError: If the optional ``sentence-transformers`` package is not installed.
        '''
        if SentenceTransformer is None:
            raise ImportError(
                "BertSentenceEmbedder requires the optional 'sentence-transformers' "
                "package; install it with `pip install sentence-transformers`."
            )
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def embed(self, sentences):
        '''
        Convert the given sentences into BERT embeddings.

        :param sentences: A sentence or a list of sentences to convert into BERT embeddings.
        :type sentences: str or list
        :return: The result of ``self.model.encode`` called with ``convert_to_tensor=True``.
        :rtype: torch.Tensor
        '''
        return self.model.encode(sentences, convert_to_tensor=True)

    def closest_sentence(self, new_sentence, sentence_list, retun_ind=False):
        '''
        Given a list of sentences and a new sentence, return the sentence from the list that is closest to the new sentence.

        Closeness is the cosine similarity between embeddings; the returned
        score is the softmax of those similarities taken over the whole list.

        :param new_sentence: The new sentence to compare to the list of sentences.
        :type new_sentence: str
        :param sentence_list: A list of sentences to compare the new sentence to.
        :type sentence_list: list
        :param retun_ind: Whether to return the index of the closest sentence instead of the sentence itself.
        :type retun_ind: bool
        :return: The sentence from the list (or its index) that is closest to the new sentence and its score.
        :rtype: (str or int, float)
        :raises ValueError: If ``sentence_list`` is empty.
        '''
        if len(sentence_list) == 0:
            raise ValueError("sentence_list must contain at least one sentence")

        with torch.no_grad():
            query = self.embed(new_sentence)
            candidates = self.embed(sentence_list)
            # One vectorised comparison of the query against every candidate row.
            similarities = torch.nn.functional.cosine_similarity(query.unsqueeze(0), candidates, dim=1)
            # Move to CPU before converting, in case the embeddings are on
            # another device.
            scores = torch.softmax(similarities, dim=0).cpu().numpy()

        best = np.argmax(scores)
        closest = best if retun_ind else sentence_list[best]
        return closest, np.max(scores)