src/lighteval/tasks/tasks/long_horizon_execution/constants.py
"""
Constants shared across the Long Horizon Execution tasks.
"""

PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys.
Your task is to calculate the final cumulative sum after processing all keys in order.

For each key in the list, you need to:
1. Look up the value in the dictionary
2. Add it to the running sum
3. After processing all keys, output the final cumulative sum

Dictionary to use:
{dict_str}

Keys to process in order:
{keys_str}

Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys.

IMPORTANT:
- Output your answer as a single integer value inside <answer></answer> tags
- Do not include any other text outside the answer tags
- Format: <answer>final_sum</answer>
- Example: If the final cumulative sum is 42, output: <answer>42</answer>

Your answer:"""
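
# Worked example (hypothetical values): with the dictionary {"alpha": 4, "beta": -2}
# and keys ["alpha", "beta", "alpha"], the running sum is 4 -> 2 -> 6, so the
# expected single-turn response is <answer>6</answer>.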

PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}.
Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide.
In each turn, I'll provide {k} key(s) (comma-separated).
Respond with the current running sum, enclosed in <answer> tags.

Dictionary to maintain:
{dict_str}

Ready to start!
**User**: {keys_str}
**Assistant**:"""

PROMPT_TEMPLATE_MULTI_FOLLOWUP = """Here are the next keys to process:
**User**: {keys_str}
**Assistant**:"""
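
# Worked multi-turn example (hypothetical values, k=1): reusing the dictionary
# above, turn 1 sends key "alpha" and expects <answer>4</answer>; turn 2 sends
# "beta" and expects the running sum <answer>2</answer>.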

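# Prompt lengths (presumably token budgets) for which task variants are created.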
CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536]
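# Number of keys (K) provided per turn in the multi-turn setting.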
TURN_COMPLEXITIES = [1, 2, 10]
src/lighteval/tasks/tasks/long_horizon_execution/main.py
"""
name:
Long Horizon Execution

dataset:
arvindh75/Long-Horizon-Execution

abstract:
Evaluation benchmark for the long-horizon execution capabilities of language models.
Tests a model's ability to maintain state and perform cumulative operations over
long sequences of inputs. Supports both single-turn (all inputs at once) and
multi-turn (inputs provided incrementally) evaluation modes.
The task requires models to:
1. Maintain a dictionary mapping keys to values
2. Process a sequence of keys
3. Calculate cumulative sums after each key or group of keys
4. Handle varying context sizes and turn complexities
Single-turn evaluation (Section 3.3 of the paper): the model outputs only the
final cumulative sum after processing all keys, allowing any aggregation strategy.

Multi-turn evaluation: the model processes keys in batches of K per turn, maintaining
conversation history and outputting cumulative sums incrementally. Evaluates
fractional accuracy (correct turns / total turns).

languages:
english

tags:
long-context, state-tracking, arithmetic, execution

paper:
https://arxiv.org/abs/2509.09677

starred: true
"""

import functools
import re

from inspect_ai.dataset import Sample
from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES, PROMPT_TEMPLATE_SINGLE
from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks
from lighteval.tasks.tasks.long_horizon_execution.utils import _build_prompt_and_target


def single_turn_prompt_function(line, prompt_length=32768, task_name: str | None = None):
    """
    Prompt function for single-turn evaluation (non-inspect-ai backend).
    Converts a dataset record to a Doc object.
    Returns:
        Doc object for evaluation
    """
    prompt, target_str, _ = _build_prompt_and_target(
        line, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE
    )

    return Doc(
        task_name=task_name,
        query=prompt,
        choices=[target_str],  # Expected answer as a choice
        gold_index=0,
        instruction=prompt,
    )


def single_turn_record_to_sample(record, prompt_length=32768):
    """
    Converts a dataset record to an inspect-ai Sample object for single-turn evaluation.
    Returns:
        Sample object for inspect-ai
    """
    prompt, target_str, metadata = _build_prompt_and_target(
        record, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE
    )

    return Sample(
        input=prompt,
        target=target_str,
        metadata=metadata,
    )


@scorer(metrics=[accuracy(), stderr()])
def single_turn_scorer():
    """
    Scorer for single-turn evaluation.
    Compares the model's predicted final sum with the expected final sum (binary score).
    Returns:
        Scorer function that evaluates single integer responses
    """

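    # Score values "C"/"I" map to 1.0/0.0 under inspect-ai's accuracy metric.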
    async def score(state: TaskState, target: Target):
        response = state.output.completion

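        # The prompt asks for the final sum inside <answer></answer> tags;
        # parse the first such span from the completion.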
        answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
        match = answer_pattern.search(response)

        if not match:
            return Score(value="I", answer="", explanation="No <answer> tag found in response.")

        content = match.group(1).strip()

        try:
            pred_value = int(content)
        except ValueError:
            return Score(value="I", answer=content, explanation=f"Failed to parse integer from: {content}")

        try:
            exp_value = int(target.text.strip())
        except (ValueError, AttributeError):
            return Score(
                value="I",
                answer=str(pred_value),
                explanation=f"Failed to parse expected target: {target.text}",
            )

        is_correct = pred_value == exp_value
        return Score(
            value="C" if is_correct else "I",
            answer=str(pred_value),
            explanation=f"Expected {exp_value}, got {pred_value}. Match: {is_correct}",
        )

    return score


def create_single_turn_tasks():
    """
    Create all single-turn task configurations for different context sizes.
    Returns:
        list[LightevalTaskConfig]: List of task configurations for single-turn evaluation
    """
    tasks = []

    for context_size in CONTEXT_SIZES:
        task_name = f"long_horizon_execution_single:{context_size}"
        prompt_fn = functools.partial(single_turn_prompt_function, prompt_length=context_size)
        sample_fn = functools.partial(single_turn_record_to_sample, prompt_length=context_size)
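
        # One prompt builder serves both backends: prompt_function for the
        # non-inspect-ai pipeline and sample_fields for inspect-ai.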

        task = LightevalTaskConfig(
            name=task_name,
            prompt_function=prompt_fn,
            sample_fields=sample_fn,
            solver=[generate(cache=True)],
            scorer=single_turn_scorer(),
            hf_repo="arvindh75/Long-Horizon-Execution",
            hf_subset="default",
            evaluation_splits=("test",),
            generation_size=context_size,
            metrics=[Metrics.exact_match],
        )

        tasks.append(task)

    return tasks


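# TASKS_TABLE is the registry entry point that lighteval scans for task configs.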
single_turn_tasks = create_single_turn_tasks()
multi_turn_tasks = create_multi_turn_tasks()

TASKS_TABLE = single_turn_tasks + multi_turn_tasks