# Source code for botiverse.Theorizer.model.dataloader

from dataclasses import dataclass
import os
from collections import defaultdict, namedtuple
from typing import Dict, List
import torch

from torch.utils.data import DataLoader, TensorDataset
from torch.nn.parallel import DistributedDataParallel

from tqdm import tqdm
import pickle as pkl
import numpy as np
from .utils import *
from botiverse.Theorizer.squad.squad_example import SquadExample, SquadProcessedExample
from transformers import (
    WEIGHTS_NAME,
    CONFIG_NAME,
    AdamW,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    get_linear_schedule_with_warmup,
)

# Special tokens added to the GPT-2 vocabulary.
# The order is significant: prepare_squad_data_for_gpt2 unpacks
# SPECIAL_TOKENS[:-1] positionally, so "<pad>" must remain the last entry.
SPECIAL_TOKENS = [
    "<sos>", "<eos>",
    "<paragraph>", "<clue>", "<answer>", "<style>", "<question>",
    "<pad>",
]

# The same tokens keyed by role (plus an "<unk>" token that is not part of
# SPECIAL_TOKENS).
SPECIAL_TOKENS_DICT = dict(
    bos_token="<sos>",
    eos_token="<eos>",
    pad_token="<pad>",
    unk_token="<unk>",
    additional_special_tokens=[
        "<paragraph>", "<clue>", "<answer>", "<style>", "<question>",
    ],
)

# Fill value for label positions that are not part of the question
# (-100 is the default ignore_index of torch.nn.CrossEntropyLoss).
IGNORE_VALUE_OF_LM_HEADS = -100


@dataclass
class SquadGPT2Example:
    """
    A single example for the SQuAD dataset, processed for GPT-2 training.

    Attributes:
        input_ids: Token ids of the full input sequence (required).
        attention_mask: Optional attention mask; defaults to ``None``.
        token_type_ids: Optional per-token segment ids; defaults to ``None``.
        lm_labels: Optional language-modeling labels; defaults to ``None``.
    """

    input_ids: List[int]
    attention_mask: List[int] = None
    token_type_ids: List[int] = None
    lm_labels: List[int] = None


def prepare_squad_data_for_gpt2(tokenizer: GPT2Tokenizer, processed_examples: List[SquadProcessedExample]) -> List[SquadGPT2Example]:
    """
    Convert processed SQuAD examples into (unpadded) GPT-2 training examples.

    Every example is laid out as::

        <sos> context <answer> answer <clue> clue <style> question_type <question> question <eos>

    ``token_type_ids`` mark the segment each position belongs to; inside the
    context, the answer span and (when present) the clue span are additionally
    marked with the ``<answer>`` / ``<clue>`` ids. ``lm_labels`` hold the
    question tokens and the final ``<eos>``; every other position is set to
    ``IGNORE_VALUE_OF_LM_HEADS``.

    If the full sequence would exceed ``tokenizer.max_model_input_sizes["gpt2"]``,
    tokens are dropped from the end of the context.

    Args:
        tokenizer (GPT2Tokenizer): Tokenizer used to encode the text. The
            entries of ``SPECIAL_TOKENS`` are looked up with
            ``convert_tokens_to_ids`` (presumably they must have been added to
            the tokenizer beforehand -- confirm against the caller).
        processed_examples (List[SquadProcessedExample]): Examples to convert.

    Returns:
        List[SquadGPT2Example]: One example per input, with ``input_ids``,
        ``token_type_ids`` and ``lm_labels`` set as lists of Python ints
        (``attention_mask`` is left as ``None``).
    """
    # SPECIAL_TOKENS[:-1] drops "<pad>"; the remaining order fixes this unpacking.
    SOS, EOS, PARAGRAPH, CLUE, ANSWER, STYLE, QUESTION = tokenizer.convert_tokens_to_ids(
        SPECIAL_TOKENS[:-1])
    prepared_data = []

    for inst in tqdm(processed_examples):
        # Tokenize every text field; the "prefix" encodings (context up to and
        # including the first character of the span) are what
        # get_overlap_position uses to locate the span among the context tokens.
        tokenized_context = tokenizer.encode(inst.context_text)
        tokenized_question = tokenizer.encode(inst.question_text)
        tokenized_answer = tokenizer.encode(inst.answer_text)
        tokenized_ans_prefix = tokenizer.encode(
            inst.context_text[: inst.answer_start + 1])
        tokenized_qtype = tokenizer.encode(inst.question_type)

        clue_exist = inst.clue_start is not None
        if clue_exist:
            tokenized_clue = tokenizer.encode(inst.clue_text)
            tokenized_clue_prefix = tokenizer.encode(
                inst.context_text[: inst.clue_start + 1])
        else:
            tokenized_clue = []

        total_seq_len = (
            len(tokenized_context)
            + len(tokenized_answer)
            + len(tokenized_question)
            + len(tokenized_clue)
            + len(tokenized_qtype)
            + 6  # 6 special tokens, without pad or eos
        )

        # Drop tokens from the end of the context when the sequence is too long
        # (one extra token is removed to leave room for <eos>).
        max_input_len = tokenizer.max_model_input_sizes["gpt2"]
        if total_seq_len > max_input_len:
            overflow = total_seq_len - max_input_len + 1
            tokenized_context = tokenized_context[:-overflow]

        # Locate the answer (and clue) span inside the tokenized context.
        answer_position_tokenized = get_overlap_position(
            tokenized_context, tokenized_answer, tokenized_ans_prefix)
        if clue_exist:
            clue_position_tokenized = get_overlap_position(
                tokenized_context, tokenized_clue, tokenized_clue_prefix)

        sequence = (
            [SOS] + tokenized_context
            + [ANSWER] + tokenized_answer
            + [CLUE] + tokenized_clue
            + [STYLE] + tokenized_qtype
            + [QUESTION] + tokenized_question
            + [EOS]
        )

        # Index of the marker token that opens each segment after the context.
        answer_seg_start = len(tokenized_context) + 1
        clue_seg_start = answer_seg_start + len(tokenized_answer) + 1
        style_seg_start = clue_seg_start + len(tokenized_clue) + 1
        question_seg_start = style_seg_start + len(tokenized_qtype) + 1

        # Start with everything typed as paragraph, overlay the in-context
        # spans (+1 accounts for the leading <sos>), then type each trailing
        # segment. Every position ends up with a real token id: a negative
        # filler such as -100 cannot be used as an embedding index, so the
        # <answer> segment and the final <eos> are typed explicitly as well.
        token_types = np.full(len(sequence), PARAGRAPH, dtype=int)
        token_types[answer_position_tokenized[0] + 1:
                    answer_position_tokenized[1] + 1] = ANSWER
        if clue_exist:
            token_types[clue_position_tokenized[0] + 1:
                        clue_position_tokenized[1] + 1] = CLUE
        token_types[answer_seg_start:clue_seg_start] = ANSWER
        token_types[clue_seg_start:style_seg_start] = CLUE
        token_types[style_seg_start:question_seg_start] = STYLE
        token_types[question_seg_start:] = QUESTION

        # Only the question tokens and the closing <eos> carry labels.
        ignored_len = len(sequence) - len(tokenized_question) - 1
        lm_labels = (
            [IGNORE_VALUE_OF_LM_HEADS] * ignored_len
            + list(tokenized_question)
            + [EOS]
        )

        # TODO: Add attention_mask
        prepared_data.append(
            SquadGPT2Example(
                input_ids=list(sequence),
                # tolist() yields plain Python ints rather than numpy scalars.
                token_type_ids=token_types.tolist(),
                lm_labels=lm_labels,
            )
        )

    return prepared_data


def prepare_and_pad_squad_data_for_gpt2(tokenizer: GPT2Tokenizer, processed_examples: List[SquadProcessedExample], max_len: int = None, padding: int = 0) -> List[SquadGPT2Example]:
    """
    Prepare and pad SQuAD data for GPT-2.

    This function builds the examples with ``prepare_squad_data_for_gpt2`` and
    then right-pads ``input_ids``, ``token_type_ids`` and ``lm_labels`` of every
    example (in place) to a common length.

    Args:
        tokenizer (GPT2Tokenizer): The GPT-2 tokenizer used to tokenize the text.
        processed_examples (List[SquadProcessedExample]): A list of SquadProcessedExample instances containing the processed SQuAD data.
        max_len (int, optional): Length to pad to. When ``None`` (default) the
            longest sequence in the dataset is used. Sequences already longer
            than ``max_len`` are left unchanged; they are not truncated.
        padding (int, optional): The padding value used for ``input_ids`` and
            ``token_type_ids``. Default is 0. ``lm_labels`` are always padded
            with ``IGNORE_VALUE_OF_LM_HEADS``.

    Returns:
        List[SquadGPT2Example]: The prepared and padded examples (an empty list
        when there are no examples).
    """
    prepared_data = prepare_squad_data_for_gpt2(tokenizer, processed_examples)
    if not prepared_data:
        # Nothing to pad; also avoids calling max() on an empty sequence.
        return prepared_data

    if max_len is None:
        target_len = max(len(example.input_ids) for example in prepared_data)
    else:
        target_len = max_len

    def pad_to_target(values, fill):
        # A non-positive deficit multiplies to [], leaving long sequences as-is.
        return values + [fill] * (target_len - len(values))

    for example in prepared_data:
        example.input_ids = pad_to_target(example.input_ids, padding)
        example.token_type_ids = pad_to_target(example.token_type_ids, padding)
        example.lm_labels = pad_to_target(
            example.lm_labels, IGNORE_VALUE_OF_LM_HEADS)

    return prepared_data


def from_dict_to_squad_processed_example(data: Dict) -> SquadProcessedExample:
    """
    Convert a dictionary to a SquadProcessedExample instance.

    Args:
        data (Dict): A dictionary with the keys ``paragraph``, ``question``,
            ``ques_type``, ``answer``, ``answer_start``, ``clue``,
            ``clue_start`` and ``para_id``.

    Returns:
        SquadProcessedExample: The SquadProcessedExample instance containing the data.

    Raises:
        KeyError: If any of the keys listed above is missing from ``data``.
    """
    # Maps each SquadProcessedExample field to the dictionary key that feeds it.
    field_to_key = {
        "context_text": "paragraph",
        "question_text": "question",
        "question_type": "ques_type",
        "answer_text": "answer",
        "answer_start": "answer_start",
        "clue_text": "clue",
        "clue_start": "clue_start",
        "para_id": "para_id",
    }
    return SquadProcessedExample(
        **{field: data[key] for field, key in field_to_key.items()}
    )

def read_cached_processed_examples(filepath: str) -> List[SquadProcessedExample]:
    """
    Load pickled example dictionaries and convert them to SquadProcessedExample.

    Args:
        filepath (str): Path to a pickle file holding an iterable of dictionaries
            in the format accepted by ``from_dict_to_squad_processed_example``.

    Returns:
        List[SquadProcessedExample]: The converted examples, in file order.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code -- only load cache
    # files that come from a trusted source.
    with open(filepath, "rb") as f:
        raw_examples = pkl.load(f)
    return [from_dict_to_squad_processed_example(raw) for raw in raw_examples]

def test_getdataset():
    """
    Smoke test for the dataset pipeline.

    Loads the pretrained "gpt2" tokenizer, adds the special tokens, reads the
    cached training examples from ``squad/dataset/train.processed.pkl``
    (relative to the working directory), pads them to 512 tokens and prints the
    first example together with its decoded text.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.add_tokens(SPECIAL_TOKENS)
    examples = read_cached_processed_examples("squad/dataset/train.processed.pkl")
    data = prepare_and_pad_squad_data_for_gpt2(tokenizer, examples, max_len=512)
    print(data[0])
    print(tokenizer.decode(data[0].input_ids))

def test_tokenizer():
    """
    Ad-hoc tokenizer check.

    Prints whether two hard-coded token-id sequences are equal, then prints the
    decoded text of the single token ids 8241 and 8727 using the pretrained
    "gpt2" tokenizer extended with the special tokens.
    """
    a = [50257, 1026, 318, 257, 30069, 286, 262, 7128, 33955, 379, 406, 454, 8906, 11, 4881, 810, 262, 5283, 5335, 1128, 7241, 306, 4120, 284, 9281, 6206, 324, 5857, 311, 12944, 343, 516, 287, 1248, 3365, 13, 50261, 48615, 6206, 324, 5857, 311, 12944, 343, 516, 50260, 1169, 5283, 5335, 50262, 8727, 50263, 2514, 4150, 750, 262, 5283, 5335, 7910, 1656, 287, 1248, 3365, 287, 406, 454, 8906, 4881, 30, 50258]

    b = [50257, 1026, 318, 257, 30069, 286, 262, 7128, 33955, 379, 406, 454, 8906, 11, 4881, 810, 262, 5283, 5335, 1128, 7241, 306, 4120, 284, 9281, 6206, 324, 5857, 311, 12944, 343, 516, 287, 1248, 3365, 13, 50261, 48615, 6206, 324, 5857, 311, 12944, 343, 516, 50260, 1169, 5283, 5335, 50262, 8727, 50263, 2514, 4150, 750, 262, 5283, 5335, 7910, 1656, 287, 1248, 3365, 287, 406, 454, 8906, 4881, 30, 50258]
    print(a == b)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.add_tokens(SPECIAL_TOKENS)
    for token_id in (8241, 8727):
        print(tokenizer.decode([token_id]))

# When this module is executed as a script, run the dataset smoke test.
if __name__ == "__main__":
    test_getdataset()

# TODO: remove this legacy function once prepare_squad_data_for_gpt2 has been tested for correctness; it is kept only for historical reasons.
def prepare_squad_for_gpt2(tokenizer: GPT2Tokenizer, processed_examples: List[SquadProcessedExample], split) -> List[Dict]:
    """
    Legacy conversion of processed SQuAD examples into tokenized dictionaries.

    Args:
        tokenizer (GPT2Tokenizer): Tokenizer used to encode the text.
        processed_examples (List[SquadProcessedExample]): Examples to convert.
        split: Unused; kept so existing callers keep working.

    Returns:
        List[Dict]: One dictionary per example with the keys ``paragraph``,
        ``question``, ``answer``, ``answer_position_tokenized`` and ``style``.
        Examples that have a clue additionally carry ``clue`` and
        ``clue_position_tokenized``. An empty input yields an empty list.

    Raises:
        AssertionError: If an over-long example still does not fit the model
            input size after its context has been truncated.
    """
    prepared = []
    inst: SquadProcessedExample
    for inst in tqdm(processed_examples):

        tokenized_context = tokenizer.encode(inst.context_text)
        tokenized_question = tokenizer.encode(inst.question_text)
        tokenized_answer = tokenizer.encode(inst.answer_text)
        tokenized_ans_prefix = tokenizer.encode(
            inst.context_text[: inst.answer_start + 1])
        tokenized_qtype = tokenizer.encode(inst.question_type)

        clue_exist = inst.clue_start is not None
        if clue_exist:
            tokenized_clue = tokenizer.encode(inst.clue_text)
            tokenized_clue_prefix = tokenizer.encode(
                inst.context_text[: inst.clue_start + 1])
        else:
            tokenized_clue = []

        # Everything except the context; the context is the only part that
        # may be shortened.
        fixed_len = (
            len(tokenized_answer)
            + len(tokenized_question)
            + len(tokenized_clue)
            + len(tokenized_qtype)
            + 6  # 6 special tokens, without pad or eos
        )
        total_seq_len = len(tokenized_context) + fixed_len

        max_input_len = tokenizer.max_model_input_sizes["gpt2"]
        if total_seq_len > max_input_len:
            # Heuristic to chop off extra tokens in paragraphs
            overflow = total_seq_len - max_input_len + 1
            tokenized_context = tokenized_context[:-overflow]
            # Sanity check: fails when the non-context parts alone are too long.
            assert len(tokenized_context) + fixed_len < max_input_len

        # The prefix is already a list of token ids; it must not be passed
        # through tokenizer.encode a second time.
        answer_position_tokenized = get_overlap_position(
            tokenized_context, tokenized_answer, tokenized_ans_prefix
        )

        data = {
            "paragraph": tokenized_context,
            "question": tokenized_question,
            "answer": tokenized_answer,
            "answer_position_tokenized": answer_position_tokenized,
            "style": tokenized_qtype,
        }
        if clue_exist:
            # Keep the clue information instead of computing and discarding it.
            data["clue"] = tokenized_clue
            data["clue_position_tokenized"] = get_overlap_position(
                tokenized_context, tokenized_clue, tokenized_clue_prefix
            )
        prepared.append(data)

    return prepared