diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/__init__.py b/src/lighteval/tasks/tasks/long_horizon_execution/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/constants.py b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py
new file mode 100644
index 000000000..f2c235261
--- /dev/null
+++ b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py
@@ -0,0 +1,48 @@
+"""
+Constants shared by the Long Horizon Execution task implementations.
+"""
+
+PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys.
+Your task is to calculate the final cumulative sum after processing all keys in order.
+
+For each key in the list, you need to:
+1. Look up the value in the dictionary
+2. Add it to the running sum
+3. After processing all keys, output the final cumulative sum
+
+Dictionary to use:
+{dict_str}
+
+Keys to process in order:
+{keys_str}
+
+Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys.
+
+IMPORTANT:
+- Output your answer as a single integer value inside <answer></answer> tags
+- Do not include any other text outside the answer tags
+- Format: <answer>final_sum</answer>
+- Example: If the final cumulative sum is 42, output: <answer>42</answer>
+
+Your answer:"""
+
+PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}.
+Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide.
+
+In each turn, I'll provide {k} keys (comma-separated).
+Respond with the current running sum, enclosed in <answer></answer> tags.
+
+Dictionary to maintain:
+{dict_str}
+
+Ready to start!
+
+**User**: {keys_str}
+**Assistant**:"""
+
+PROMPT_TEMPLATE_MULTI_FOLLOWUP = """Here are the next keys to process:
+**User**: {keys_str}
+**Assistant**:"""
+
+CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536]
+TURN_COMPLEXITIES = [1, 2, 10]
diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/main.py b/src/lighteval/tasks/tasks/long_horizon_execution/main.py
new file mode 100644
index 000000000..472eb3fc6
--- /dev/null
+++ b/src/lighteval/tasks/tasks/long_horizon_execution/main.py
@@ -0,0 +1,47 @@
+"""
+name:
+Long Horizon Execution
+
+dataset:
+arvindh75/Long-Horizon-Execution
+
+abstract:
+Evaluation benchmark for long-context execution capabilities of language models.
+Tests a model's ability to maintain state and perform cumulative operations over
+long sequences of inputs. Supports both single-turn (all inputs at once) and
+multi-turn (inputs provided incrementally) evaluation modes.
+
+The task requires models to:
+1. Maintain a dictionary mapping keys to values
+2. Process a sequence of keys
+3. Calculate cumulative sums after each key or group of keys
+4. Handle varying context sizes and turn complexities
+
+Single-turn evaluation (Section 3.3): the model outputs only the final cumulative sum
+after processing all keys, allowing any aggregation strategy.
+
+Multi-turn evaluation: the model processes keys in batches of K per turn, maintaining
+conversation history and outputting cumulative sums incrementally. Scored with
+fractional accuracy (correct turns / total turns).
+ +languages: +english + +tags: +long-context, state-tracking, arithmetic, execution + +paper: +https://arxiv.org/abs/2509.09677 + +starred: +true +""" + +from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks +from lighteval.tasks.tasks.long_horizon_execution.single_turn import create_single_turn_tasks + + +single_turn_tasks = create_single_turn_tasks() +multi_turn_tasks = create_multi_turn_tasks() + +TASKS_TABLE = single_turn_tasks + multi_turn_tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py new file mode 100644 index 000000000..34e34080e --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py @@ -0,0 +1,224 @@ +""" +Multi-turn implementation of the Long Horizon Execution task. + +This implementation matches the multi-turn evaluation approach from the research paper, +where keys are provided in batches of K per turn, and the model maintains conversation +state to output cumulative sums after each turn. +""" + +import functools +import re + +from inspect_ai.dataset import Sample +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import TaskState, generate + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + CONTEXT_SIZES, + PROMPT_TEMPLATE_MULTI_FOLLOWUP, + TURN_COMPLEXITIES, +) +from lighteval.tasks.tasks.long_horizon_execution.utils import _build_multi_turn_prompts + + +def multi_turn_prompt_function(line, prompt_length=32768, k=1, task_name: str = None): + """ + Prompt function for non-inspect-ai backend for multi-turn evaluation. + Converts dataset record to Doc object. + + Note: For multi-turn, this returns the first turn's prompt. + Subsequent turns are handled by the solver. + """ + initial_prompt, _, expected_per_turn, _ = _build_multi_turn_prompts(line, prompt_length=prompt_length, k=k) + + return Doc( + task_name=task_name, + query=initial_prompt, + choices=[str(expected_per_turn[-1])], # Final sum as choice + gold_index=0, + instruction=initial_prompt, + ) + + +def multi_turn_record_to_sample(record, prompt_length=32768, k=1): + """ + Converts dataset record to inspect-ai Sample object for multi-turn evaluation. + Stores all turn information in metadata for the solver to use. 
+ """ + initial_prompt, _, expected_per_turn, metadata = _build_multi_turn_prompts( + record, prompt_length=prompt_length, k=k + ) + + return Sample( + input=initial_prompt, + target=str(expected_per_turn[-1]), + metadata=metadata, + ) + + +def _extract_response_content(response): + """Extract content from model response object.""" + if hasattr(response, "content"): + return response.content + if hasattr(response, "completion"): + return response.completion + return str(response) + + +async def _process_single_turn(state, turn_chunk, config): + """Process a single turn: add user message, get model response, add assistant message.""" + keys_str = ", ".join(turn_chunk) + followup_prompt = PROMPT_TEMPLATE_MULTI_FOLLOWUP.format(keys_str=keys_str) + state.messages.append({"role": "user", "content": followup_prompt}) + + response = await state.model.generate(messages=state.messages, config=config) + turn_response = _extract_response_content(response) + + state.messages.append({"role": "assistant", "content": turn_response}) + return turn_response + + +async def multi_turn_solver(state: TaskState): + """ + Custom solver for multi-turn evaluation. + Loops through turns, calling the model for each turn while maintaining conversation history. + + This implements offline evaluation: all turns are called, then evaluation happens. + """ + from inspect_ai.model import GenerateConfig, ModelOutput + + turn_chunks = state.metadata.get("turn_chunks", []) + + if not turn_chunks or len(turn_chunks) == 0: + return state + + # Initialize messages + if not hasattr(state, "messages") or state.messages is None: + state.messages = [] + + if not state.messages: + state.messages.append({"role": "user", "content": state.input}) + + all_turn_outputs = [] + + # Process all turns + if hasattr(state, "model") and state.model is not None: + config = GenerateConfig() + + # Process first turn (already in messages as initial prompt) + response = await state.model.generate(messages=state.messages, config=config) + turn_response = _extract_response_content(response) + all_turn_outputs.append(turn_response) + state.messages.append({"role": "assistant", "content": turn_response}) + + # Process remaining turns + for turn_idx in range(1, len(turn_chunks)): + if not hasattr(state, "model") or state.model is None: + break + turn_response = await _process_single_turn(state, turn_chunks[turn_idx], config) + all_turn_outputs.append(turn_response) + + state.metadata["all_turn_outputs"] = all_turn_outputs + + # Set final output + if all_turn_outputs: + if hasattr(state, "output") and state.output is not None: + state.output.completion = all_turn_outputs[-1] + else: + state.output = ModelOutput(completion=all_turn_outputs[-1]) + + return state + + +@scorer(metrics={"turn_accuracy": [accuracy(), stderr()], "fractional_accuracy": [accuracy(), stderr()]}) +def multi_turn_scorer(): + """ + Scorer for multi-turn Long Horizon Execution task. + Compares predicted cumulative sums at each turn with expected. + Returns fractional accuracy (correct turns / total turns). 
+    """
+
+    async def score(state: TaskState, target: Target):
+        # Turn outputs and expected per-turn sums are stored in metadata by the solver
+        all_turn_outputs = state.metadata.get("all_turn_outputs", [])
+        expected_per_turn = state.metadata.get("expected_per_turn", [])
+
+        if not all_turn_outputs:
+            return Score(value=0.0, answer="", explanation="No turn outputs found in state.metadata")
+
+        if len(all_turn_outputs) != len(expected_per_turn):
+            return Score(
+                value=0.0,
+                answer="",
+                explanation=f"Mismatch: {len(all_turn_outputs)} outputs vs {len(expected_per_turn)} expected turns",
+            )
+
+        parsed_outputs = []
+        answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
+
+        for turn_output in all_turn_outputs:
+            match = answer_pattern.search(turn_output)
+            if match:
+                try:
+                    parsed_value = int(match.group(1).strip())
+                    parsed_outputs.append(parsed_value)
+                except ValueError:
+                    parsed_outputs.append(None)
+            else:
+                parsed_outputs.append(None)
+
+        correct_turns = 0
+        turn_results = []
+        for turn_idx, (pred, exp) in enumerate(zip(parsed_outputs, expected_per_turn)):
+            is_correct = (pred is not None) and (pred == exp)
+            if is_correct:
+                correct_turns += 1
+            turn_results.append({"turn": turn_idx + 1, "predicted": pred, "expected": exp, "correct": is_correct})
+
+        fractional_accuracy = correct_turns / len(expected_per_turn) if expected_per_turn else 0.0
+
+        return Score(
+            value={
+                "turn_accuracy": fractional_accuracy,
+                "fractional_accuracy": fractional_accuracy,
+                "correct_turns": correct_turns,
+                "total_turns": len(expected_per_turn),
+            },
+            answer=str(parsed_outputs),
+            explanation=f"Correct {correct_turns}/{len(expected_per_turn)} turns. Details: {turn_results}",
+        )
+
+    return score
+
+
+def create_multi_turn_tasks():
+    """
+    Creates a list of LightevalTaskConfig objects for multi-turn Long Horizon Execution.
+    Each task corresponds to a different combination of context size and turn complexity (K).
+    """
+    tasks = []
+
+    for context_size in CONTEXT_SIZES:
+        for k in TURN_COMPLEXITIES:
+            task_name = f"long_horizon_execution:multi:{context_size}:k{k}"
+            prompt_fn = functools.partial(multi_turn_prompt_function, prompt_length=context_size, k=k)
+            sample_fn = functools.partial(multi_turn_record_to_sample, prompt_length=context_size, k=k)
+
+            task = LightevalTaskConfig(
+                name=task_name,
+                prompt_function=prompt_fn,
+                sample_fields=sample_fn,
+                solver=[multi_turn_solver, generate(cache=True)],
+                scorer=multi_turn_scorer(),
+                hf_repo="arvindh75/Long-Horizon-Execution",
+                hf_subset="default",
+                evaluation_splits=("test",),
+                generation_size=context_size,
+                metrics=[Metrics.exact_match],
+            )
+            tasks.append(task)
+
+    return tasks
diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/single_turn.py b/src/lighteval/tasks/tasks/long_horizon_execution/single_turn.py
new file mode 100644
index 000000000..c6fd0ca4b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/long_horizon_execution/single_turn.py
@@ -0,0 +1,132 @@
+"""
+Single-turn implementation of the Long Horizon Execution task.
+"""
+
+import functools
+import re
+
+from inspect_ai.dataset import Sample
+from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
+from inspect_ai.solver import TaskState, generate
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES, PROMPT_TEMPLATE_SINGLE
+from lighteval.tasks.tasks.long_horizon_execution.utils import _build_prompt_and_target
+
+
+def single_turn_prompt_function(line, prompt_length=32768, task_name: str = None):
+    """
+    Prompt function for single-turn evaluation (non-inspect-ai backend).
+    Converts dataset record to Doc object.
+
+    Returns:
+        Doc object for evaluation
+    """
+    prompt, target_str, _ = _build_prompt_and_target(
+        line, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE
+    )
+
+    return Doc(
+        task_name=task_name,
+        query=prompt,
+        choices=[target_str],  # Expected answer as a choice
+        gold_index=0,
+        instruction=prompt,
+    )
+
+
+def single_turn_record_to_sample(record, prompt_length=32768):
+    """
+    Converts dataset record to inspect-ai Sample object for single-turn evaluation.
+
+    Returns:
+        Sample object for inspect-ai
+    """
+    prompt, target_str, metadata = _build_prompt_and_target(
+        record, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE
+    )
+
+    return Sample(
+        input=prompt,
+        target=target_str,
+        metadata=metadata,
+    )
+
+
+@scorer(metrics={"accuracy": [accuracy(), stderr()]})
+def single_turn_scorer():
+    """
+    Scorer for single-turn evaluation.
+    Compares the model's predicted final sum with the expected final sum (binary score).
+
+    Returns:
+        Scorer function that evaluates single integer responses
+    """
+
+    async def score(state: TaskState, target: Target):
+        response = state.output.completion
+
+        answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
+        match = answer_pattern.search(response)
+
+        if not match:
+            return Score(value="I", answer="", explanation="No <answer> tag found in response.")
+
+        content = match.group(1).strip()
+
+        try:
+            pred_value = int(content)
+        except ValueError:
+            return Score(value="I", answer=content, explanation=f"Failed to parse integer from: {content}")
+
+        try:
+            exp_value = int(target.text.strip())
+        except (ValueError, AttributeError):
+            return Score(
+                value="I",
+                answer=str(pred_value),
+                explanation=f"Failed to parse expected target: {target.text}",
+            )
+
+        is_correct = pred_value == exp_value
+        return Score(
+            value="C" if is_correct else "I",
+            answer=str(pred_value),
+            explanation=(f"Expected {exp_value}, Got {pred_value}. Match: {is_correct}"),
+        )
+
+    return score
+
+
+def create_single_turn_tasks():
+    """
+    Create all single-turn task configurations for different context sizes.
+ + Returns: + list[LightevalTaskConfig]: List of task configurations for single-turn evaluation + """ + tasks = [] + + for context_size in CONTEXT_SIZES: + task_name = f"long_horizon_execution:{context_size}" + prompt_fn = functools.partial(single_turn_prompt_function, prompt_length=context_size) + sample_fn = functools.partial(single_turn_record_to_sample, prompt_length=context_size) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[generate(cache=True)], + scorer=single_turn_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + + tasks.append(task) + + return tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/utils.py b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py new file mode 100644 index 000000000..cc7eadca5 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py @@ -0,0 +1,214 @@ +""" +Utility functions for Long Horizon Execution task. +""" + +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + PROMPT_TEMPLATE_MULTI_START, + PROMPT_TEMPLATE_SINGLE, +) + + +def _binary_search_max_items(input_keys, build_prompt_fn, prompt_length, min_items=1): + """ + Generic binary search to find maximum number of items that fit within prompt_length. + + Returns: + int: Maximum number of items that fit + + """ + # Pre-validate that at least min_items fit within prompt_length + test_prompt = build_prompt_fn(min_items) + if test_prompt is None: + raise ValueError("Cannot build prompt: unable to generate prompt with available items") + + if len(test_prompt) > prompt_length: + item_label = "item" if min_items == 1 else f"{min_items} items" + raise ValueError( + f"Prompt length ({prompt_length} chars) is too small to fit {item_label}. " + f"Minimum required: {len(test_prompt)} chars. " + f"Please increase prompt_length or reduce dataset complexity." + ) + + # Binary search to find maximum n that fits within prompt_length + left, right = min_items, len(input_keys) + max_n = min_items + + while left <= right: + mid = (left + right) // 2 + prompt = build_prompt_fn(mid) + + if prompt is None: + right = mid - 1 + continue + + if len(prompt) <= prompt_length: + max_n = mid + left = mid + 1 + else: + right = mid - 1 + + return max_n + + +def _build_prompt_and_target(record, prompt_length=32768, prompt_template=PROMPT_TEMPLATE_SINGLE): + """ + Helper function to extract common logic for building prompt and target. + Uses binary search to find the maximum number of items that fit within prompt_length. + Processes the record and returns prompt, target, and metadata. + + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + prompt_template: Prompt template to use for formatting. Defaults to PROMPT_TEMPLATE_SINGLE. 
+ + Returns: + tuple: (prompt: str, target_str: str, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + expected_output = record["output"] + + def build_prompt_for_n(n): + """Build a prompt with the first n items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + keys_str = str(keys_n) + return prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=n) + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + max_n = _binary_search_max_items(input_keys, build_prompt_for_n, prompt_length, min_items=1) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + expected_output = expected_output[:max_n] + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + keys_str = str(input_keys) + prompt = prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=len(input_keys)) + + target_str = str(expected_output[-1]) + + metadata = { + "input_keys": input_keys, + "input_values": input_values, + "expected_output": expected_output, + "dictionary": dictionary, + "num_items": len(input_keys), + } + + return prompt, target_str, metadata + + +def _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k): + """ + Find maximum number of items that fit within prompt_length for multi-turn evaluation. + Uses binary search to find max items where initial prompt (dict + first K keys) fits. + + Returns: + int: Maximum number of items that fit + """ + + def build_initial_prompt_for_n(n): + """Build initial prompt with dictionary and first K keys from n total items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + + # First turn has first K keys + first_turn_keys = keys_n[:k] + keys_str = ", ".join(first_turn_keys) + + return PROMPT_TEMPLATE_MULTI_START.format( + dict_str=dict_str, keys_str=keys_str, k=k, num_keys=len(first_turn_keys) + ) + + return _binary_search_max_items(input_keys, build_initial_prompt_for_n, prompt_length, min_items=k) + + +def _chunk_and_calculate_expected(input_keys, input_values, k): + """ + Chunk keys into turns of size K and calculate expected cumulative sums per turn. + + Returns: + tuple: (turn_chunks: list, value_chunks: list, expected_per_turn: list) + """ + # Chunk keys into turns of size K + turn_chunks = [] + value_chunks = [] + for i in range(0, len(input_keys), k): + turn_chunks.append(input_keys[i : i + k]) + value_chunks.append(input_values[i : i + k]) + + # Calculate expected cumulative sums for each turn + expected_per_turn = [] + cumulative_sum = 0 + for values in value_chunks: + cumulative_sum += sum(values) + expected_per_turn.append(cumulative_sum) + + return turn_chunks, value_chunks, expected_per_turn + + +def _build_multi_turn_prompts(record, prompt_length=32768, k=1): + """ + Build prompts for multi-turn evaluation. + Uses binary search to find maximum number of items that fit within prompt_length. + Chunks keys into turns of size K. + + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + k: Turn complexity (number of keys per turn). Defaults to 1. 
+ + Returns: + tuple: (initial_prompt: str, turn_chunks: list, expected_per_turn: list, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + expected_output = record["output"] + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + # Find maximum number of items that fit + max_n = _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + expected_output = expected_output[:max_n] + + turn_chunks, value_chunks, expected_per_turn = _chunk_and_calculate_expected(input_keys, input_values, k) + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + + first_turn_keys_str = ", ".join(turn_chunks[0]) + initial_prompt = PROMPT_TEMPLATE_MULTI_START.format( + dict_str=dict_str, keys_str=first_turn_keys_str, k=k, num_keys=len(turn_chunks[0]) + ) + + metadata = { + "turn_chunks": turn_chunks, + "value_chunks": value_chunks, + "expected_per_turn": expected_per_turn, + "dictionary": dictionary, + "k": k, + "num_turns": len(turn_chunks), + "num_items": len(input_keys), + } + + return initial_prompt, turn_chunks, expected_per_turn, metadata
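
For reference, a minimal standalone sketch (not part of the patch; toy data, no lighteval or inspect_ai imports, and the helper name below is illustrative) of the per-turn target construction and fractional-accuracy scoring that utils.py and multi_turn.py implement:

# Toy walkthrough of the chunking and fractional-accuracy logic, assuming k=2.

def chunk_and_expected(keys, values, k):
    # Split keys into turns of size k and compute the running sum after each turn,
    # mirroring _chunk_and_calculate_expected in utils.py.
    turn_chunks = [keys[i : i + k] for i in range(0, len(keys), k)]
    value_chunks = [values[i : i + k] for i in range(0, len(values), k)]
    expected, running = [], 0
    for chunk in value_chunks:
        running += sum(chunk)
        expected.append(running)
    return turn_chunks, expected


keys = ["a", "b", "c", "d"]
values = [3, -1, 4, 1]
turns, expected = chunk_and_expected(keys, values, k=2)
# turns    -> [["a", "b"], ["c", "d"]]
# expected -> [2, 7]

# Fractional accuracy: share of turns whose parsed <answer> equals the expected running sum.
predicted = [2, 6]  # e.g. the model drifts on the second turn
fractional_accuracy = sum(p == e for p, e in zip(predicted, expected)) / len(expected)
assert fractional_accuracy == 0.5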