diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/constants.py b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py new file mode 100644 index 000000000..2a5c9954f --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py @@ -0,0 +1,46 @@ +""" +Constants shared across the Long Horizon Execution task. +""" + +PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys. +Your task is to calculate the final cumulative sum after processing all keys in order. + +For each key in the list, you need to: +1. Look up the value in the dictionary +2. Add it to the running sum +3. After processing all keys, output the final cumulative sum + +Dictionary to use: +{dict_str} + +Keys to process in order: +{keys_str} + +Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys. + +IMPORTANT: +- Output your answer as a single integer value inside <answer></answer> tags +- Do not include any other text outside the answer tags +- Format: <answer>final_sum</answer> +- Example: If the final cumulative sum is 42, output: <answer>42</answer> + +Your answer:""" + +PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}. +Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide. +In each turn, I'll provide {k} key(s) (comma-separated). +Respond with the current running sum, enclosed in <answer></answer> tags. + +Dictionary to maintain: +{dict_str} + +Ready to start! +**User**: {keys_str} +**Assistant**:""" + +PROMPT_TEMPLATE_MULTI_FOLLOWUP = """Here are the next keys to process: +**User**: {keys_str} +**Assistant**:""" + +CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536] +TURN_COMPLEXITIES = [1, 2, 10] diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/main.py b/src/lighteval/tasks/tasks/long_horizon_execution/main.py new file mode 100644 index 000000000..09686b40c --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/main.py @@ -0,0 +1,167 @@ +""" +name: +Long Horizon Execution + +dataset: +arvindh75/Long-Horizon-Execution + +abstract: +Evaluation benchmark for long-context execution capabilities of language models. +Tests a model's ability to maintain state and perform cumulative operations over +long sequences of inputs. Supports both single-turn (all inputs at once) and +multi-turn (inputs provided incrementally) evaluation modes. +The task requires models to: +1. Maintain a dictionary mapping keys to values +2. Process a sequence of keys +3. Calculate cumulative sums after each key or group of keys +4. Handle varying context sizes and turn complexities +Single-turn evaluation (Section 3.3): Model outputs only the final cumulative sum +after processing all keys, allowing any aggregation strategy. + +Multi-turn evaluation: Model processes keys in batches of K per turn, maintaining +conversation history and outputting cumulative sums incrementally. Evaluates +fractional accuracy (correct turns / total turns).
+ +languages: +english + +tags: +long-context, state-tracking, arithmetic, execution + +paper: +https://arxiv.org/abs/2509.09677 + +starred: true +""" + +import functools +import re + +from inspect_ai.dataset import Sample +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import TaskState, generate + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES, PROMPT_TEMPLATE_SINGLE +from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks +from lighteval.tasks.tasks.long_horizon_execution.utils import _build_prompt_and_target + + +def single_turn_prompt_function(line, prompt_length=32768, task_name: str = None): + """ + Prompt function for single-turn evaluation (non-inspect-ai backend). + Converts dataset record to Doc object. + Returns: + Doc object for evaluation + """ + prompt, target_str, _ = _build_prompt_and_target( + line, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE + ) + + return Doc( + task_name=task_name, + query=prompt, + choices=[target_str], # Expected answer as a choice + gold_index=0, + instruction=prompt, + ) + + +def single_turn_record_to_sample(record, prompt_length=32768): + """ + Converts dataset record to inspect-ai Sample object for single-turn evaluation. + Returns: + Sample object for inspect-ai + """ + prompt, target_str, metadata = _build_prompt_and_target( + record, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE + ) + + return Sample( + input=prompt, + target=target_str, + metadata=metadata, + ) + + +@scorer(metrics=[accuracy(), stderr()]) +def single_turn_scorer(): + """ + Scorer for single-turn evaluation. + Compares the model's predicted final sum with the expected final sum (binary score). + Returns: + Scorer function that evaluates single integer responses + """ + + async def score(state: TaskState, target: Target): + response = state.output.completion + + answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL) + match = answer_pattern.search(response) + + if not match: + return Score(value="I", answer="", explanation="No <answer> tag found in response.") + + content = match.group(1).strip() + + try: + pred_value = int(content.strip()) + except ValueError: + return Score(value="I", answer=content, explanation=f"Failed to parse integer from: {content}") + + try: + exp_value = int(target.text.strip()) + except (ValueError, AttributeError): + return Score( + value="I", + answer=str(pred_value), + explanation=f"Failed to parse expected target: {target.text}", + ) + + is_correct = pred_value == exp_value + return Score( + value="C" if is_correct else "I", + answer=str(pred_value), + explanation=(f"Expected {exp_value}, Got {pred_value}. Match: {is_correct}"), + ) + + return score + + +def create_single_turn_tasks(): + """ + Create all single-turn task configurations for different context sizes.
+ Returns: + list[LightevalTaskConfig]: List of task configurations for single-turn evaluation + """ + tasks = [] + + for context_size in CONTEXT_SIZES: + task_name = f"long_horizon_execution_single:{context_size}" + prompt_fn = functools.partial(single_turn_prompt_function, prompt_length=context_size) + sample_fn = functools.partial(single_turn_record_to_sample, prompt_length=context_size) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[generate(cache=True)], + scorer=single_turn_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + + tasks.append(task) + + return tasks + + +single_turn_tasks = create_single_turn_tasks() +multi_turn_tasks = create_multi_turn_tasks() + +TASKS_TABLE = single_turn_tasks + multi_turn_tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py new file mode 100644 index 000000000..e34638fd0 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py @@ -0,0 +1,219 @@ +""" +Multi-turn implementation of the Long Horizon Execution task. +This implementation matches the multi-turn evaluation approach from the research paper, +where keys are provided in batches of K per turn, and the model maintains conversation +state to output cumulative sums after each turn. +""" + +import functools +import re + +from inspect_ai.dataset import Sample +from inspect_ai.model import ChatMessageUser, ModelOutput +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import Generate, TaskState, generate, solver + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + CONTEXT_SIZES, + PROMPT_TEMPLATE_MULTI_FOLLOWUP, + TURN_COMPLEXITIES, +) +from lighteval.tasks.tasks.long_horizon_execution.utils import _build_multi_turn_prompts + + +def multi_turn_prompt_function(line, prompt_length=32768, k=1, task_name: str = None): + """ + Prompt function for non-inspect-ai backend for multi-turn evaluation. + Converts dataset record to Doc object. + Note: For multi-turn, this returns the first turn's prompt. + Subsequent turns are handled by the solver. + """ + initial_prompt, _, expected_per_turn, _ = _build_multi_turn_prompts(line, prompt_length=prompt_length, k=k) + + return Doc( + task_name=task_name, + query=initial_prompt, + choices=[str(expected_per_turn[-1])], # Final sum as choice + gold_index=0, + instruction=initial_prompt, + ) + + +def multi_turn_record_to_sample(record, prompt_length=32768, k=1): + """ + Converts dataset record to inspect-ai Sample object for multi-turn evaluation. + Stores all turn information in metadata for the solver to use. 
+ """ + initial_prompt, _, expected_per_turn, metadata = _build_multi_turn_prompts( + record, prompt_length=prompt_length, k=k + ) + + return Sample( + input=initial_prompt, + target=str(expected_per_turn[-1]), + metadata=metadata, + ) + + +def _extract_response_content(response): + """Extract content from model response object.""" + if hasattr(response, "content"): + return response.content + if hasattr(response, "completion"): + return response.completion + return str(response) + + +async def _process_single_turn(state, turn_chunk, generate_fn): + """Process a single turn: add user message, get model response, add assistant message.""" + keys_str = ", ".join(turn_chunk) + followup_prompt = PROMPT_TEMPLATE_MULTI_FOLLOWUP.format(keys_str=keys_str) + state.messages.append(ChatMessageUser(content=followup_prompt)) + + # generate_fn() takes the state and returns updated state with assistant message added + updated_state = await generate_fn(state) + turn_response = _extract_response_content(updated_state.output.completion if updated_state.output else "") + + return updated_state, turn_response + + +@solver +def multi_turn_solver(): + """ + Solver for multi-turn evaluation. + Loops through turns, calling the model for each turn while maintaining conversation history. + This implements offline evaluation: all turns are called, then evaluation happens. + """ + + async def solve(state: TaskState, generate: Generate): + turn_chunks = state.metadata.get("turn_chunks", []) + + if not turn_chunks: + return state + + # Initialize messages + if not hasattr(state, "messages") or state.messages is None: + state.messages = [] + + if not state.messages: + state.messages.append(ChatMessageUser(content=state.input)) + + all_turn_outputs = [] + + # Process first turn (already in messages as initial prompt) + updated_state = await generate(state) + turn_response = _extract_response_content(updated_state.output.completion if updated_state.output else "") + all_turn_outputs.append(turn_response) + + state = updated_state + + # Process remaining turns + for turn_idx in range(1, len(turn_chunks)): + state, turn_response = await _process_single_turn(state, turn_chunks[turn_idx], generate) + all_turn_outputs.append(turn_response) + + state.metadata["all_turn_outputs"] = all_turn_outputs + + # Set final output + if all_turn_outputs: + if hasattr(state, "output") and state.output is not None: + state.output.completion = all_turn_outputs[-1] + else: + state.output = ModelOutput(completion=all_turn_outputs[-1]) + + return state + + return solve + + +@scorer(metrics={"fractional_accuracy": [accuracy(), stderr()]}) +def multi_turn_scorer(): + """ + Scorer for multi-turn Long Horizon Execution task. + Compares predicted cumulative sums at each turn with expected. + Returns fractional accuracy (correct turns / total turns). 
+ """ + + async def score(state: TaskState, target: Target): + # metadata stored by solver + all_turn_outputs = state.metadata.get("all_turn_outputs", []) + expected_per_turn = state.metadata.get("expected_per_turn", []) + + if not all_turn_outputs: + return Score( + value={"fractional_accuracy": 0.0}, + answer="", + explanation="No turn outputs found in state.metadata", + ) + + if len(all_turn_outputs) != len(expected_per_turn): + return Score( + value={"fractional_accuracy": 0.0}, + answer="", + explanation=f"Mismatch: {len(all_turn_outputs)} outputs vs {len(expected_per_turn)} expected turns", + ) + + parsed_outputs = [] + answer_pattern = re.compile(r"(.*?)", re.DOTALL) + + for turn_output in all_turn_outputs: + match = answer_pattern.search(turn_output) + if match: + try: + parsed_value = int(match.group(1).strip()) + parsed_outputs.append(parsed_value) + except ValueError: + parsed_outputs.append(None) + else: + parsed_outputs.append(None) + + correct_turns = 0 + turn_results = [] + for turn_idx, (pred, exp) in enumerate(zip(parsed_outputs, expected_per_turn)): + is_correct = (pred is not None) and (pred == exp) + if is_correct: + correct_turns += 1 + turn_results.append({"turn": turn_idx + 1, "predicted": pred, "expected": exp, "correct": is_correct}) + + fractional_accuracy = correct_turns / len(expected_per_turn) if expected_per_turn else 0.0 + + return Score( + value={"fractional_accuracy": fractional_accuracy}, + answer=str(parsed_outputs), + explanation=f"Correct {correct_turns}/{len(expected_per_turn)} turns. Details: {turn_results}", + ) + + return score + + +def create_multi_turn_tasks(): + """ + Creates a list of LightevalTaskConfig objects for multi-turn Long Horizon Execution. + Each task corresponds to a different combination of context size and turn complexity (K). + """ + tasks = [] + + for context_size in CONTEXT_SIZES: + for k in TURN_COMPLEXITIES: + task_name = f"long_horizon_execution_multi_k{k}:{context_size}" + prompt_fn = functools.partial(multi_turn_prompt_function, prompt_length=context_size, k=k) + sample_fn = functools.partial(multi_turn_record_to_sample, prompt_length=context_size, k=k) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[multi_turn_solver(), generate(cache=True)], + scorer=multi_turn_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + tasks.append(task) + + return tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/utils.py b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py new file mode 100644 index 000000000..f96acda86 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py @@ -0,0 +1,203 @@ +""" +Utility functions for Long Horizon Execution task. +""" + +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + PROMPT_TEMPLATE_MULTI_START, + PROMPT_TEMPLATE_SINGLE, +) + + +def _binary_search_max_items(input_keys, build_prompt_fn, prompt_length, min_items=1): + """ + Generic binary search to find maximum number of items that fit within prompt_length. 
+ Returns: + int: Maximum number of items that fit + """ + # Pre-validate that at least min_items fit within prompt_length + test_prompt = build_prompt_fn(min_items) + if test_prompt is None: + raise ValueError("Cannot build prompt: unable to generate prompt with available items") + + if len(test_prompt) > prompt_length: + item_label = "item" if min_items == 1 else f"{min_items} items" + raise ValueError( + f"Prompt length ({prompt_length} chars) is too small to fit {item_label}. " + f"Minimum required: {len(test_prompt)} chars. " + f"Please increase prompt_length or reduce dataset complexity." + ) + + # Binary search to find maximum n that fits within prompt_length + left, right = min_items, len(input_keys) + max_n = min_items + + while left <= right: + mid = (left + right) // 2 + prompt = build_prompt_fn(mid) + + if prompt is None: + right = mid - 1 + continue + + if len(prompt) <= prompt_length: + max_n = mid + left = mid + 1 + else: + right = mid - 1 + + return max_n + + +def _build_prompt_and_target(record, prompt_length=32768, prompt_template=PROMPT_TEMPLATE_SINGLE): + """ + Helper function to extract common logic for building prompt and target. + Uses binary search to find the maximum number of items that fit within prompt_length. + Processes the record and returns prompt, target, and metadata. + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + prompt_template: Prompt template to use for formatting. Defaults to PROMPT_TEMPLATE_SINGLE. + Returns: + tuple: (prompt: str, target_str: str, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + expected_output = record["output"] + + def build_prompt_for_n(n): + """Build a prompt with the first n items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + keys_str = str(keys_n) + return prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=n) + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + max_n = _binary_search_max_items(input_keys, build_prompt_for_n, prompt_length, min_items=1) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + expected_output = expected_output[:max_n] + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + keys_str = str(input_keys) + prompt = prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=len(input_keys)) + + target_str = str(expected_output[-1]) + + metadata = { + "input_keys": input_keys, + "input_values": input_values, + "expected_output": expected_output, + "dictionary": dictionary, + "num_items": len(input_keys), + } + + return prompt, target_str, metadata + + +def _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k): + """ + Find maximum number of items that fit within prompt_length for multi-turn evaluation. + Uses binary search to find max items where initial prompt (dict + first K keys) fits. 
+ Returns: + int: Maximum number of items that fit + """ + + def build_initial_prompt_for_n(n): + """Build initial prompt with dictionary and first K keys from n total items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + + # First turn has first K keys + first_turn_keys = keys_n[:k] + keys_str = ", ".join(first_turn_keys) + + return PROMPT_TEMPLATE_MULTI_START.format( + dict_str=dict_str, + keys_str=keys_str, + k=k, + ) + + return _binary_search_max_items(input_keys, build_initial_prompt_for_n, prompt_length, min_items=k) + + +def _chunk_and_calculate_expected(input_keys, input_values, k): + """ + Chunk keys into turns of size K and calculate expected cumulative sums per turn. + Returns: + tuple: (turn_chunks: list, value_chunks: list, expected_per_turn: list) + """ + # Chunk keys into turns of size K + turn_chunks = [] + value_chunks = [] + for i in range(0, len(input_keys), k): + turn_chunks.append(input_keys[i : i + k]) + value_chunks.append(input_values[i : i + k]) + + # Calculate expected cumulative sums for each turn + expected_per_turn = [] + cumulative_sum = 0 + for values in value_chunks: + cumulative_sum += sum(values) + expected_per_turn.append(cumulative_sum) + + return turn_chunks, value_chunks, expected_per_turn + + +def _build_multi_turn_prompts(record, prompt_length=32768, k=1): + """ + Build prompts for multi-turn evaluation. + Uses binary search to find maximum number of items that fit within prompt_length. + Chunks keys into turns of size K. + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + k: Turn complexity (number of keys per turn). Defaults to 1. + Returns: + tuple: (initial_prompt: str, turn_chunks: list, expected_per_turn: list, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + # Find maximum number of items that fit + max_n = _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + + turn_chunks, _, expected_per_turn = _chunk_and_calculate_expected(input_keys, input_values, k) + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + + first_turn_keys_str = ", ".join(turn_chunks[0]) + initial_prompt = PROMPT_TEMPLATE_MULTI_START.format(dict_str=dict_str, keys_str=first_turn_keys_str, k=k) + + metadata = { + "turn_chunks": turn_chunks, + "expected_per_turn": expected_per_turn, + "dictionary": dictionary, + "k": k, + "num_turns": len(turn_chunks), + "num_items": len(input_keys), + } + + return initial_prompt, turn_chunks, expected_per_turn, metadata
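Reviewer note (not part of the diff): the per-turn targets and the fractional-accuracy score above reduce to a small amount of arithmetic, so here is a minimal, self-contained Python sketch that mirrors what _chunk_and_calculate_expected and multi_turn_scorer compute. The record, turn size k, and per-turn model outputs below are made up purely for illustration.

import re

# Made-up record in the dataset's shape: parallel key/value lists (hypothetical data).
record = {"input": ["a", "b", "c", "d"], "values": [3, -1, 4, 2]}
k = 2  # keys per turn (turn complexity)

# Chunk values into turns of size k and accumulate expected sums,
# mirroring _chunk_and_calculate_expected.
value_chunks = [record["values"][i : i + k] for i in range(0, len(record["values"]), k)]
expected_per_turn, running = [], 0
for chunk in value_chunks:
    running += sum(chunk)
    expected_per_turn.append(running)
print(expected_per_turn)  # [2, 8]

# Hypothetical per-turn model outputs, parsed and scored the way multi_turn_scorer does.
all_turn_outputs = ["<answer>2</answer>", "<answer>7</answer>"]
pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
parsed = []
for out in all_turn_outputs:
    m = pattern.search(out)
    parsed.append(int(m.group(1).strip()) if m else None)

correct = sum(1 for pred, exp in zip(parsed, expected_per_turn) if pred == exp)
print(correct / len(expected_per_turn))  # 0.5 (fractional accuracy)

With k=2 the four keys form two turns, the expected running sums are [2, 8], and a correct first turn plus an incorrect second turn yields a fractional accuracy of 0.5, which is what the scorer would report for this sample.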