from typing import List, Set, Tuple, NewType
import numpy as np
import copy
from nltk.tree import Tree
import nltk
import spacy
import benepar
from dataclasses import dataclass
from .utils import *
@dataclass
class ClueInfo:
    """Features describing the clue selected for one question/answer pair.

    Attributes are plain data; this class performs no validation.
    """

    # POS tag of the clue.
    clue_pos_tag: str
    # NER tag of the clue.
    clue_ner_tag: str
    # Length of the clue (presumably counted in tokens -- confirm against
    # the code that builds this object).
    clue_length: int
    # Five-element tuple with the same field types as the entries returned
    # by ``chunks``: (ner_tag, pos_tag, words, start, end).
    clue_chunk: Tuple[str, str, List[str], int, int]
    # Presumably the length of the dependency path between clue and answer.
    clue_answer_dep_path_len: int
    # NOTE(review): the name suggests a padded 0/1 mask over context tokens;
    # shape and dtype are not established in this file.
    padded_selected_clue_binary_ids: np.ndarray
# Distinct static type for question-type labels. At runtime ``NewType``
# returns its argument unchanged, so values are ordinary ``str`` objects.
QuestionType = NewType('QuestionType', str)
@dataclass
class SquadAugmentedExample:
    """
    A single training/test example for the Squad-Zhou dataset, after augmenting with clue info and question style.
    """

    # Text the answer is drawn from.
    context_text: str
    # Question asked about the context.
    question_text: str
    # Answer string.
    answer_text: str
    # Question-type label (see ``QuestionType``).
    question_type: QuestionType
    # POS tag of the answer; "UNK" when no chunk matched the answer text.
    answer_pos_tag: str
    # NER tag of the answer; "UNK" when no chunk matched the answer text.
    answer_ner_tag: str
    # Number of whitespace-separated words in the answer.
    answer_length: int
    # Clue features attached to this example.
    clue_info: ClueInfo
def __navigate(node: nltk.Tree) -> Tuple[int, int, list]:
    """
    Recursively walk an NLTK parse tree, collecting its depth, word count and chunks.

    Side effect: the tree is modified in place. Every leaf is replaced by a
    list ``[word, idx, ...]`` where ``idx`` is the leaf's index within the
    (sub)tree currently being visited. Because the annotation step runs again
    at each recursion level, deeper calls append further indices; the first
    index after the word comes from the outermost call.

    Args:
        node (nltk.Tree): An NLTK parse tree node, generated from a constituency parser.
            Anything that is not exactly of type ``Tree`` is treated as a single word.
    Returns:
        tuple: A tuple containing the following elements:
            - max_depth (int): The maximum depth of the parse tree.
            - word_num (int): The total number of words in the parse tree.
            - chunklist (list): ``("word", (leaf, 0))`` entries for leaves and
              ``(label, leaves)`` entries for tree nodes, children before
              their parent. ``leaves`` holds the position-annotated leaves.
    """
    """
    Disclaimer: This function is adapted from the original implementation with changes for readability, clarity and documentation
    """
    # Base case: a non-Tree node counts as one word of depth 1.
    if type(node) is not Tree:
        return 1, 1, [("word", (node, 0))]
    # process leaf nodes
    for idx, _ in enumerate(node.leaves()):
        tree_location = node.leaf_treeposition(idx)
        # Parent of the idx-th leaf (the leaf path minus its last step).
        non_terminal = node[tree_location[:-1]]
        if type(non_terminal[0]) is not str:
            # Leaf was already annotated by an enclosing call: append this
            # level's index to the existing list.
            non_terminal[0] = non_terminal[0] + [idx]
        else:
            # First visit: turn the bare word into [word, idx].
            non_terminal[0] = [non_terminal[0], idx]
    # process non-leaf nodes
    max_depth, word_num, chunklist = 0, 0, []
    for child in node:
        child_depth, child_num, child_chunklist = __navigate(child)
        max_depth = max(max_depth, child_depth + 1)
        word_num += child_num
        chunklist += child_chunklist
    # process current node
    cur_node_chunk = [(node.label(), node.leaves())]
    chunklist += cur_node_chunk
    return max_depth, word_num, chunklist
def chunks(sentence: str) -> List[Tuple[str, str, List[str], int, int]]:
    """
    Takes a sentence and returns a list of chunks with their respective NER tags,
    POS tags, words, and start and end positions in the sentence.

    The sentence is parsed with the module-level ``PARSER`` and the resulting
    tree is flattened by ``__navigate``. Named entities come from running the
    module-level ``NLP`` pipeline on the same sentence.

    Args:
        sentence (str): The input sentence to be parsed and chunked.
    Returns:
        list: A list of tuples where each tuple contains the following elements:
            - chunk_ner_tag (str): The NER tag of the chunk (e.g., 'PERSON', 'ORG'),
              or 'UNK' when the chunk text is not contained in any entity.
            - chunk_pos_tag (str): The POS tag of the chunk (e.g., 'NP', 'VP').
            - leaves_without_position (list): A list of words in the chunk.
            - start (int): The start position of the chunk in the sentence.
            - end (int): The end position of the chunk in the sentence.

    Note:
        Adapted from the original implementation with changes for
        readability, clarity and documentation.
    """
    tree = PARSER.parse(sentence)
    # Only the flattened chunk list is used; depth and word count are ignored.
    _, _, orig_chunklist = __navigate(tree)
    spacy_document = NLP(sentence)

    chunklist = []
    for chunk in orig_chunklist:
        # Best effort: a malformed chunk (wrong arity, leaf without a
        # position, non-string word) is skipped rather than aborting the
        # whole sentence. Only these expected failures are swallowed.
        try:
            if chunk[0] == "word":
                continue
            chunk_pos_tag, leaves = chunk
            leaves_without_position = []
            position_list = []
            for leaf in leaves:
                if isinstance(leaf, list):
                    # Annotated leaf: [word, position, ...].
                    leaves_without_position.append(leaf[0])
                    position_list.append(leaf[1])
                else:
                    leaves_without_position.append(leaf)
            st = position_list[0]
            ed = position_list[-1]
            chunk_text = " ".join(leaves_without_position)
        except (IndexError, TypeError, ValueError):
            continue

        chunk_ner_tag = "UNK"
        for ent in spacy_document.ents:
            # Substring containment also covers exact equality. There is
            # deliberately no early exit: the last matching entity wins.
            if chunk_text in ent.text:
                chunk_ner_tag = ent.label_
        chunklist.append(
            (chunk_ner_tag, chunk_pos_tag, leaves_without_position, st, ed)
        )
    return chunklist
def __dfs(token_list, current_token, current_path, max_depth, related_tokens):
    """
    Depth-first walk over dependency links, recording a path to each token reached.

    Starting from the token whose index is ``current_token``, the walk moves
    to that token's head and to each of its children. ``related_tokens`` is
    updated in place: it maps a token index to the shortest path (a list of
    dependency labels) found so far.

    Args:
        token_list: Iterable of token objects exposing ``i``, ``dep_``,
            ``head`` and ``children``.
        current_token (int): Index (``token.i``) of the token being visited.
        current_path (list): Dependency labels collected on the way here.
        max_depth (int): Paths longer than this are abandoned.
        related_tokens (dict): Output mapping, mutated in place.
    Returns:
        None
    """
    if len(current_path) > max_depth:
        return
    # Stop if this token was already reached by a path that is no longer.
    if current_token in related_tokens and len(related_tokens[current_token]) <= len(current_path):
        return
    related_tokens[current_token] = current_path
    for token in token_list:
        if token.i != current_token:
            continue
        # Skip objects that carry no dependency label instead of hiding
        # every possible error behind a bare ``except``.
        try:
            dep_label = token.dep_
        except AttributeError:
            continue
        # Paths are flat lists of labels, so building a new list is enough
        # to keep each branch independent (no deep copy needed).
        __dfs(token_list, token.head.i, [*current_path, dep_label], max_depth, related_tokens)
        for child in token.children:
            # NOTE(review): the label appended is that of ``child.head``,
            # not of ``child`` itself -- kept as in the original.
            __dfs(token_list, child.i, [*current_path, child.head.dep_], max_depth, related_tokens)
def get_dependency_paths(token_list: List[SpacyToken]) -> Tuple[dict, dict, List[str]]:
    """
    Given a list of spaCy tokens, extract the dependency paths between different tokens.

    Args:
        token_list (List[SpacyToken]): Tokens to analyse; each must expose
            ``i`` and ``text`` plus the attributes used by ``__dfs``.
    Returns:
        dict: A dictionary mapping token indices to tokens.
        dict: A dictionary mapping each token index to a list of
            ``(related_index, dependency_path)`` pairs, sorted by path
            length (shortest first).
        list: A list of token texts.
    """
    index_to_token = {token.i: token for token in token_list}
    tokens = [token.text for token in token_list]
    # The depth limit is the same for every start token, so compute it once.
    max_depth = len(token_list) - 1

    index_to_related = {}
    for token in token_list:
        related_tokens = {}
        __dfs(token_list, token.i, [], max_depth, related_tokens)
        index_to_related[token.i] = sorted(
            related_tokens.items(), key=lambda item: len(item[1])
        )
    return index_to_token, index_to_related, tokens
def __tokenize_and_stem(sentence: str, condition_list: List[bool] = None) -> Tuple[List[str], List[str], List[SpacyToken]]:
    """
    Tokenize a sentence, or filter an already-split one by a boolean mask.

    Args:
        sentence (str): The input to process. Without ``condition_list`` it is
            passed to the module-level ``NLP`` pipeline.
        condition_list (List[bool]): Optional mask. When given, ``sentence`` is
            iterated element by element and only the elements whose mask entry
            is truthy are kept; ``NLP`` is not called.
            NOTE(review): iterating a plain ``str`` yields characters, so
            callers presumably pass a token sequence here -- confirm.
    Returns:
        tuple: ``(tokens, lemmas, spacy_tokens)``. ``lemmas`` and
        ``spacy_tokens`` are ``None`` when ``condition_list`` is given.
    """
    if condition_list is None:
        parsed = NLP(sentence)
        spacy_tokens = list(parsed)
        texts = [tok.text for tok in spacy_tokens]
        lemmas = [tok.lemma_ for tok in spacy_tokens]
        return texts, lemmas, spacy_tokens

    kept = [
        item
        for position, item in enumerate(sentence)
        if condition_list[position]
    ]
    return kept, None, None
def __number_of_overlapping_tokens(sentence_tokens1: List[str], sentence_tokens2: List[str], condition_list: List[bool]) -> int:
    """
    Count the selected tokens of the first sentence that also occur in the second.

    Args:
        sentence_tokens1 (List[str]): Tokens to be counted.
        sentence_tokens2 (List[str]): Tokens to look up in.
        condition_list (List[bool]): Mask aligned with ``sentence_tokens1``;
            only positions with a truthy entry are considered.
    Returns:
        int: Number of selected tokens found in ``sentence_tokens2``. A token
        that is selected several times is counted each time.
    Raises:
        IndexError: If ``condition_list`` is shorter than ``sentence_tokens1``.
    """
    # Build the lookup once so each membership test is O(1) rather than a
    # scan of the whole second sentence.
    other_tokens = set(sentence_tokens2)
    return sum(
        1
        for i, tk in enumerate(sentence_tokens1)
        if condition_list[i] and tk in other_tokens
    )
def extract_question_type_and_id(question, config: InfoConfig = InfoConfig()) -> Tuple[QuestionType, int]:
    """
    Given a question string, returns its type and associated id.

    Words are scanned left to right; the first word that contains one of
    ``QUESTION_WORDS`` as a case-insensitive substring decides the type.
    Failing that, a question whose first word is one of
    ``BOOL_QUESTION_WORDS`` is "boolean"; anything else (including an empty
    or whitespace-only question) is "other".

    Args:
        question (str): A question string.
        config (InfoConfig): Not used by this function.
    Returns:
        tuple: A tuple containing the question type (str) and its id (int).
    """
    # Split the question into words
    words = question.split()
    if not words:
        # Nothing to classify; avoids indexing words[0] below.
        return QuestionType("other"), Q_TYPE2ID_DICT["other"]

    # Check if the question is an informational question
    for word in words:
        lowered_word = word.lower()
        for question_type in QUESTION_WORDS:
            if question_type.lower() in lowered_word:
                return QuestionType(question_type), Q_TYPE2ID_DICT[question_type]

    # Check if the question is a boolean question
    if words[0].lower() in {q_type.lower() for q_type in BOOL_QUESTION_WORDS}:
        return QuestionType("boolean"), Q_TYPE2ID_DICT["boolean"]

    # Return "other" if the question type is not found
    return QuestionType("other"), Q_TYPE2ID_DICT["other"]
def extract_clue_and_question_info(sentence: str, question: str, answer: str, answer_start: int, config: InfoConfig = InfoConfig()) -> SquadAugmentedExample:
    """
    Extracts information about the question, answer, and clue from the provided sentence,
    question, and answer.

    Args:
        sentence (str): The sentence containing the answer.
        question (str): The question being asked.
        answer (str): The answer to the question.
        answer_start (int): The character index of the answer's start position in the sentence.
        config (InfoConfig): Not used by this function.
    Returns:
        SquadAugmentedExample object containing extracted information about the question, answer, and clue.
    Raises:
        IndexError: If the character range of the answer overlaps no token of
            the sentence (for example when ``answer_start`` is out of range).
    """
    # Process the input sentence and chunks
    sentence_spacydoc = NLP(sentence)
    chunklist = chunks(sentence)

    # Extract answer information: tags come from the first chunk whose words,
    # joined by spaces, equal the answer text exactly.
    answer_pos_tag, answer_ner_tag = "UNK", "UNK"
    for chunk in chunklist:
        if answer == " ".join(chunk[2]):
            answer_ner_tag = chunk[0]
            answer_pos_tag = chunk[1]
            break
    answer_length = len(answer.split())

    # Extract question type (the numeric id is not needed here)
    question_type, _ = extract_question_type_and_id(question)

    # Get the answer start token index: the first token whose character span
    # overlaps [answer_start, answer_end).
    ans_sent_tokens = [token.text for token in sentence_spacydoc]
    spans = find_token_spans_in_text(sentence, ans_sent_tokens)
    answer_end = answer_start + len(answer)
    answer_start_idx = next(
        (
            idx
            for idx, span in enumerate(spans)
            if span[0] < answer_end and answer_start < span[1]
        ),
        None,
    )
    if answer_start_idx is None:
        raise IndexError(
            f"answer span [{answer_start}, {answer_end}) does not overlap "
            "any token of the sentence"
        )

    # Extract clue information
    clue_info = extract_clue(sentence, question, answer, answer_start_idx)

    return SquadAugmentedExample(
        question_text=question,
        context_text=sentence,
        answer_text=answer,
        question_type=question_type,
        answer_pos_tag=answer_pos_tag,
        answer_ner_tag=answer_ner_tag,
        answer_length=answer_length,
        clue_info=clue_info,
    )
def test_chunks():
    """Smoke check: print the output of ``chunks`` for a few sample sentences.

    Nothing is asserted; the printed chunk lists are meant for manual inspection.
    """
    # Test cases
    test_sentences = [
        "Beyoncé Giselle Knowles-Carter is an American singer, songwriter, and actress.",
        "Barack Obama was the 44th President of the United States.",
        "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France."
    ]
    for sentence in test_sentences:
        print(chunks(sentence))
def test_extract_clue_and_question_info():
    """Check ``extract_clue_and_question_info`` on one simple sentence.

    Prints the resulting example, then asserts the question type and the
    answer/clue tags.
    """
    # Test case 1: A simple question and answer
    sentence1 = "The quick brown fox jumps over the lazy dog."
    question1 = "What color is the fox?"
    answer1 = "brown"
    # Character offset of "brown" within sentence1.
    answer_start1 = 10
    result1 = extract_clue_and_question_info(
        sentence1, question1, answer1, answer_start1)
    print(result1)
    assert result1.question_type == "what"
    assert result1.answer_pos_tag == "JJ"
    assert result1.answer_ner_tag == "UNK"
    assert result1.clue_info.clue_pos_tag == "ADJ"
    assert result1.clue_info.clue_ner_tag == "UNK"
    print("succeeded")
if __name__ == "__main__":
    # Manual run of the module's checks. Only the clue/question-info check is
    # enabled; the other calls are left commented out.
    # test_chunks()
    # test_extract_clue()
    # test_extract_question_type()
    test_extract_clue_and_question_info()