48 changes: 48 additions & 0 deletions src/lighteval/tasks/tasks/long_horizon_execution/constants.py
@@ -0,0 +1,48 @@
"""
Constants shared by the Long Horizon Execution task implementations.
"""

PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys.
Your task is to calculate the final cumulative sum after processing all keys in order.

For each key in the list, you need to:
1. Look up the value in the dictionary
2. Add it to the running sum
3. After processing all keys, output the final cumulative sum

Dictionary to use:
{dict_str}

Keys to process in order:
{keys_str}

Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys.

IMPORTANT:
- Output your answer as a single integer value inside <answer></answer> tags
- Do not include any other text outside the answer tags
- Format: <answer>final_sum</answer>
- Example: If the final cumulative sum is 42, output: <answer>42</answer>

Your answer:"""

PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}.
Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide.

In each turn, I'll provide {k} keys (comma-separated).
Respond with the current running sum, enclosed in <answer> tags.

Dictionary to maintain:
{dict_str}

Ready to start!

**User**: {keys_str}
**Assistant**:"""

PROMPT_TEMPLATE_MULTI_FOLLOWUP = """Here are the next keys to process:
**User**: {keys_str}
**Assistant**:"""

CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536]
TURN_COMPLEXITIES = [1, 2, 10]
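For orientation, here is a minimal sketch of how PROMPT_TEMPLATE_SINGLE could be rendered. The tiny dictionary and key list are invented for illustration; the real serialization of dict_str and keys_str is handled by the helpers in utils.py, which are not part of this diff.

from lighteval.tasks.tasks.long_horizon_execution.constants import PROMPT_TEMPLATE_SINGLE

# Illustrative data only, not taken from the dataset.
toy_dict = {"apple": 3, "banana": -1, "cherry": 5}
toy_keys = ["apple", "cherry", "apple"]  # running sum: 3 -> 8 -> 11

prompt = PROMPT_TEMPLATE_SINGLE.format(
    dict_str=str(toy_dict),
    keys_str=", ".join(toy_keys),
    num_keys=len(toy_keys),
)
# For this toy input the expected model output is "<answer>11</answer>".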
47 changes: 47 additions & 0 deletions src/lighteval/tasks/tasks/long_horizon_execution/main.py
@@ -0,0 +1,47 @@
"""
name:
Long Horizon Execution

dataset:
arvindh75/Long-Horizon-Execution

abstract:
Evaluation benchmark for long-context execution capabilities of language models.
Tests a model's ability to maintain state and perform cumulative operations over
long sequences of inputs. Supports both single-turn (all inputs at once) and
multi-turn (inputs provided incrementally) evaluation modes.

The task requires models to:
1. Maintain a dictionary mapping keys to values
2. Process a sequence of keys
3. Calculate cumulative sums after each key or group of keys
4. Handle varying context sizes and turn complexities

Single-turn evaluation (Section 3.3): Model outputs only the final cumulative sum
after processing all keys, allowing any aggregation strategy.

Multi-turn evaluation: Model processes keys in batches of K per turn, maintaining
conversation history and outputting cumulative sums incrementally. Evaluates
fractional accuracy (correct turns / total turns).

languages:
english

tags:
long-context, state-tracking, arithmetic, execution

paper:
https://arxiv.org/abs/2509.09677

starred:
true
"""

from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks
from lighteval.tasks.tasks.long_horizon_execution.single_turn import create_single_turn_tasks


single_turn_tasks = create_single_turn_tasks()
multi_turn_tasks = create_multi_turn_tasks()

TASKS_TABLE = single_turn_tasks + multi_turn_tasks
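To make the abstract concrete, here is a toy walk-through of the task and of fractional accuracy; all numbers are invented for illustration.

# Toy dictionary and key sequence (not from the dataset).
kv = {"a": 3, "b": -1, "c": 4}
keys = ["a", "c", "b", "a"]

# Running sums after each key: 3, 7, 6, 9.
# Single-turn: the model must report only the final value, 9.
# Multi-turn with K=2: turn 1 covers ["a", "c"] -> expected 7,
#                      turn 2 covers ["b", "a"] -> expected 9.
# A model answering 7 and then 8 scores 1 correct turn out of 2,
# i.e. fractional accuracy = 0.5.
running, expected_per_turn = 0, []
for i, key in enumerate(keys, start=1):
    running += kv[key]
    if i % 2 == 0:  # K = 2 keys per turn
        expected_per_turn.append(running)
print(expected_per_turn)  # [7, 9]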
224 changes: 224 additions & 0 deletions src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py
@@ -0,0 +1,224 @@
"""
Multi-turn implementation of the Long Horizon Execution task.

This implementation matches the multi-turn evaluation approach from the research paper,
where keys are provided in batches of K per turn, and the model maintains conversation
state to output cumulative sums after each turn.
"""

import functools
import re

from inspect_ai.dataset import Sample
from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks.long_horizon_execution.constants import (
CONTEXT_SIZES,
PROMPT_TEMPLATE_MULTI_FOLLOWUP,
TURN_COMPLEXITIES,
)
from lighteval.tasks.tasks.long_horizon_execution.utils import _build_multi_turn_prompts


def multi_turn_prompt_function(line, prompt_length=32768, k=1, task_name: str = None):
"""
    Prompt function for the non-inspect-ai backends in multi-turn evaluation.
    Converts a dataset record into a Doc object.

Note: For multi-turn, this returns the first turn's prompt.
Subsequent turns are handled by the solver.
"""
initial_prompt, _, expected_per_turn, _ = _build_multi_turn_prompts(line, prompt_length=prompt_length, k=k)

return Doc(
task_name=task_name,
query=initial_prompt,
choices=[str(expected_per_turn[-1])], # Final sum as choice
gold_index=0,
instruction=initial_prompt,
)


def multi_turn_record_to_sample(record, prompt_length=32768, k=1):
"""
Converts dataset record to inspect-ai Sample object for multi-turn evaluation.
Stores all turn information in metadata for the solver to use.
"""
initial_prompt, _, expected_per_turn, metadata = _build_multi_turn_prompts(
record, prompt_length=prompt_length, k=k
)

return Sample(
input=initial_prompt,
target=str(expected_per_turn[-1]),
metadata=metadata,
)


def _extract_response_content(response):
"""Extract content from model response object."""
if hasattr(response, "content"):
return response.content
if hasattr(response, "completion"):
return response.completion
return str(response)


async def _process_single_turn(state, turn_chunk, config):
"""Process a single turn: add user message, get model response, add assistant message."""
keys_str = ", ".join(turn_chunk)
followup_prompt = PROMPT_TEMPLATE_MULTI_FOLLOWUP.format(keys_str=keys_str)
state.messages.append({"role": "user", "content": followup_prompt})

response = await state.model.generate(messages=state.messages, config=config)
turn_response = _extract_response_content(response)

state.messages.append({"role": "assistant", "content": turn_response})
return turn_response


async def multi_turn_solver(state: TaskState):
"""
Custom solver for multi-turn evaluation.
Loops through turns, calling the model for each turn while maintaining conversation history.

    This implements offline evaluation: all turns are generated first, then the scorer runs over the collected outputs.
"""
from inspect_ai.model import GenerateConfig, ModelOutput

turn_chunks = state.metadata.get("turn_chunks", [])

    if not turn_chunks:
return state

# Initialize messages
if not hasattr(state, "messages") or state.messages is None:
state.messages = []

if not state.messages:
state.messages.append({"role": "user", "content": state.input})

all_turn_outputs = []

# Process all turns
if hasattr(state, "model") and state.model is not None:
config = GenerateConfig()

# Process first turn (already in messages as initial prompt)
response = await state.model.generate(messages=state.messages, config=config)
turn_response = _extract_response_content(response)
all_turn_outputs.append(turn_response)
state.messages.append({"role": "assistant", "content": turn_response})

        # Process remaining turns (state.model was already validated above)
        for turn_idx in range(1, len(turn_chunks)):
            turn_response = await _process_single_turn(state, turn_chunks[turn_idx], config)
            all_turn_outputs.append(turn_response)

state.metadata["all_turn_outputs"] = all_turn_outputs

# Set final output
if all_turn_outputs:
if hasattr(state, "output") and state.output is not None:
state.output.completion = all_turn_outputs[-1]
else:
state.output = ModelOutput(completion=all_turn_outputs[-1])

return state


@scorer(metrics={"turn_accuracy": [accuracy(), stderr()], "fractional_accuracy": [accuracy(), stderr()]})
def multi_turn_scorer():
"""
Scorer for multi-turn Long Horizon Execution task.
Compares predicted cumulative sums at each turn with expected.
Returns fractional accuracy (correct turns / total turns).
"""

async def score(state: TaskState, target: Target):
# metadata stored by solver
all_turn_outputs = state.metadata.get("all_turn_outputs", [])
expected_per_turn = state.metadata.get("expected_per_turn", [])

if not all_turn_outputs:
return Score(value=0.0, answer="", explanation="No turn outputs found in state.metadata")

if len(all_turn_outputs) != len(expected_per_turn):
return Score(
value=0.0,
answer="",
explanation=f"Mismatch: {len(all_turn_outputs)} outputs vs {len(expected_per_turn)} expected turns",
)

parsed_outputs = []
answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)

        for turn_output in all_turn_outputs:
match = answer_pattern.search(turn_output)
if match:
try:
parsed_value = int(match.group(1).strip())
parsed_outputs.append(parsed_value)
except ValueError:
parsed_outputs.append(None)
else:
parsed_outputs.append(None)

correct_turns = 0
turn_results = []
for turn_idx, (pred, exp) in enumerate(zip(parsed_outputs, expected_per_turn)):
is_correct = (pred is not None) and (pred == exp)
if is_correct:
correct_turns += 1
turn_results.append({"turn": turn_idx + 1, "predicted": pred, "expected": exp, "correct": is_correct})

fractional_accuracy = correct_turns / len(expected_per_turn) if expected_per_turn else 0.0

return Score(
value={
"turn_accuracy": fractional_accuracy,
"fractional_accuracy": fractional_accuracy,
"correct_turns": correct_turns,
"total_turns": len(expected_per_turn),
},
answer=str(parsed_outputs),
explanation=f"Correct {correct_turns}/{len(expected_per_turn)} turns. Details: {turn_results}",
)

return score


def create_multi_turn_tasks():
"""
Creates a list of LightevalTaskConfig objects for multi-turn Long Horizon Execution.
Each task corresponds to a different combination of context size and turn complexity (K).
"""
tasks = []

for context_size in CONTEXT_SIZES:
for k in TURN_COMPLEXITIES:
task_name = f"long_horizon_execution:multi:{context_size}:k{k}"
prompt_fn = functools.partial(multi_turn_prompt_function, prompt_length=context_size, k=k)
sample_fn = functools.partial(multi_turn_record_to_sample, prompt_length=context_size, k=k)

task = LightevalTaskConfig(
name=task_name,
prompt_function=prompt_fn,
sample_fields=sample_fn,
solver=[multi_turn_solver, generate(cache=True)],
scorer=multi_turn_scorer(),
hf_repo="arvindh75/Long-Horizon-Execution",
hf_subset="default",
evaluation_splits=("test",),
generation_size=context_size,
metrics=[Metrics.exact_match],
)
tasks.append(task)

return tasks
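As a usage note (a sketch, not part of the diff): create_multi_turn_tasks() returns one config per combination of context size and turn complexity, i.e. len(CONTEXT_SIZES) * len(TURN_COMPLEXITIES) = 7 * 3 = 21 configs, named as in the loop above. The single-turn configs added to TASKS_TABLE come from single_turn.py, which is outside this excerpt.

from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks

tasks = create_multi_turn_tasks()
print(len(tasks))     # 21 (7 context sizes x 3 turn complexities)
print(tasks[0].name)  # "long_horizon_execution:multi:1024:k1"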