Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion src/lighteval/tasks/tasks/squad_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
when possible, but also determine when no answer is supported by the paragraph
and abstain from answering.

note:
This is an LLM-friendly adaptation of the original SQuAD 2.0 evaluation.
The original evaluation uses extractive span selection with a confidence-based
"no answer" threshold, which does not apply to generative models.
Here, the model is instead instructed to generate "unanswerable" when the
question cannot be answered from the context. EM and F1 metrics are computed
over both answerable and unanswerable questions.

languages:
english

Expand All @@ -28,12 +36,47 @@

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.templates.qa import get_qa_prompt_function
from lighteval.utils.language import Language

# Sentinel completion the model is instructed to generate when the
# question cannot be answered from the context (SQuAD 2.0 "impossible"
# questions).
UNANSWERABLE = "unanswerable"


def squad_v2_prompt(line, task_name: str | None = None):
    """Build a generative-QA ``Doc`` from a raw SQuAD 2.0 dataset row.

    Deduplicates the gold answer spans; when the row has no non-empty
    gold answer (an unanswerable question), the single expected
    completion is the ``UNANSWERABLE`` sentinel instead.

    Args:
        line: Dataset row with ``context``, ``question`` and
            ``answers`` (a dict whose ``text`` field is a list of
            gold answer spans) keys.
        task_name: Name of the task this document belongs to.

    Returns:
        A ``Doc`` whose choices are the valid completions (each prefixed
        with a space for tokenization) and whose ``gold_index`` marks
        every choice as correct.
    """
    # Deduplicate non-empty spans and sort for a deterministic choice
    # order (set iteration order varies between runs under hash
    # randomization).
    answers = sorted({ans for ans in line["answers"]["text"] if ans})

    if not answers:
        # Unanswerable question: the only valid completion is the sentinel.
        choices = [f" {UNANSWERABLE}"]
    else:
        choices = [f" {ans}" for ans in answers]

    return Doc(
        task_name=task_name,
        query=f"Context: {line['context']}\nQuestion: {line['question']}\n"
        f"Answer with a span from the context, or \"{UNANSWERABLE}\" if the question cannot be answered.\nAnswer:",
        choices=choices,
        # Every deduplicated gold span (or the sentinel) counts as correct.
        gold_index=list(range(len(choices))),
        # Context is stashed for metrics that score against the source text.
        specific={"text": line["context"]},
    )


# Full SQuAD 2.0 task: validation split mixing answerable and unanswerable
# questions. This is the generative adaptation described in the module
# docstring — the model emits "unanswerable" rather than using the original
# extractive no-answer confidence threshold.
squad_v2 = LightevalTaskConfig(
    name="squad_v2",
    prompt_function=squad_v2_prompt,
    hf_repo="rajpurkar/squad_v2",
    hf_subset="squad_v2",
    evaluation_splits=("validation",),
    few_shots_split="train",
    # Stop at a newline or a model-generated follow-up question.
    stop_sequence=["\n", "Question:", "question:"],
    generation_size=200,
    # EM and F1 against the gold spans / "unanswerable" sentinel;
    # faithfulness presumably consumes the context stored in
    # Doc.specific["text"] by squad_v2_prompt — confirm against the metric.
    metrics=[Metrics.exact_match, Metrics.f1_score, Metrics.faithfulness],
    version=2,
)

squad_v2_answerable = LightevalTaskConfig(
name="squad_v2:answerable",
prompt_function=get_qa_prompt_function(
Language.ENGLISH,
lambda line: {
Expand All @@ -49,10 +92,11 @@
few_shots_split="train",
stop_sequence=["\n", "Question:", "question:"],
generation_size=200,
metrics=[Metrics.exact_match],
metrics=[Metrics.exact_match, Metrics.f1_score],
version=1,
)

# Tasks exported to the lighteval task registry: the full SQuAD 2.0 task
# and the answerable-only variant defined above.
TASKS_TABLE = [
    squad_v2,
    squad_v2_answerable,
]