From 8ee82ef5d9e06c8613c05a8304d117190dfcac03 Mon Sep 17 00:00:00 2001
From: Akshath Mangudi
Date: Wed, 19 Nov 2025 21:45:05 +0530
Subject: [PATCH 1/4] initial impl for long horizon execution

---
 .../tasks/tasks/long_horizon_execution.py | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 src/lighteval/tasks/tasks/long_horizon_execution.py

diff --git a/src/lighteval/tasks/tasks/long_horizon_execution.py b/src/lighteval/tasks/tasks/long_horizon_execution.py
new file mode 100644
index 000000000..9c8b9996b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/long_horizon_execution.py
@@ -0,0 +1,75 @@
+"""
+DOCSTRING TO BE IMPLEMENTED
+"""
+
+from inspect_ai.dataset import Sample
+from inspect_ai.solver import generate
+
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+def record_to_sample(record):
+    input_keys = record["input"]
+    input_values = record["values"]
+    expected_output = record["output"]
+
+    MAX_ITEMS = 100  # for truncation, can be adjusted.
+    if len(input_keys) > MAX_ITEMS:
+        input_keys = input_keys[:MAX_ITEMS]
+        input_values = input_values[:MAX_ITEMS]
+        expected_output = expected_output[:MAX_ITEMS]
+
+    dictionary = dict(zip(input_keys, input_values))
+
+    dict_str = str(dictionary)
+    keys_str = str(input_keys)
+
+    prompt = f"""You are an AI assistant. I will provide you with a dictionary and then give you a list of keys.
+Your task is to calculate the running cumulative sum (starting from 0) by adding the value associated with each key in order.
+
+For each key in the list, you need to:
+1. Look up the value in the dictionary
+2. Add it to the running sum
+3. Output the cumulative sum after processing all keys up to that point
+
+Dictionary to use:
+{dict_str}
+
+Keys to process in order:
+{keys_str}
+
+Your task: Calculate the cumulative sum after each key. The first sum is just the value of the first key. The second sum is the first value plus the second value, and so on.
+
+IMPORTANT:
+- Output your answer as a single line with comma-separated values inside <answer></answer> tags
+- Do not include any other text outside the answer tags
+- Format: <answer>value1,value2,value3,...</answer>
+- Example: If the cumulative sums are [5, 8, 12], output: 5,8,12 + +Your answer:""" + + target_str = ",".join(map(str, expected_output)) + + return Sample( + input=prompt, + target=target_str, + metadata={ + "input_keys": input_keys, + "input_values": input_values, + "expected_output": expected_output, + "dictionary": dictionary, + "num_items": len(input_keys), + }, + ) + + +long_horizon_execution = LightevalTaskConfig( + name="long_horizon_execution", + sample_fields=record_to_sample, + solver=[generate(cache=True)], + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), +) + +TASKS_TABLE = [long_horizon_execution] From bb01770447f2f97dfb42f1755978fd05037c11d1 Mon Sep 17 00:00:00 2001 From: Akshath Mangudi Date: Thu, 20 Nov 2025 14:17:32 +0530 Subject: [PATCH 2/4] ready for review --- .../tasks/tasks/long_horizon_execution.py | 135 +++++++++++++++--- 1 file changed, 118 insertions(+), 17 deletions(-) diff --git a/src/lighteval/tasks/tasks/long_horizon_execution.py b/src/lighteval/tasks/tasks/long_horizon_execution.py index 9c8b9996b..6783d645b 100644 --- a/src/lighteval/tasks/tasks/long_horizon_execution.py +++ b/src/lighteval/tasks/tasks/long_horizon_execution.py @@ -1,26 +1,56 @@ """ -DOCSTRING TO BE IMPLEMENTED +name: +Long Horizon Execution + +dataset: +arvindh75/Long-Horizon-Execution + +abstract: +This dataset is a synthetic benchmark designed to measure the pure execution +capability of LLMs over long horizons. The core task is key-value dictionary addition. +A fixed, in-context dictionary mapping five-letter English words (keys) to integer values +is provided in dictionary.json. The model's goal is to maintain a running sum. +In each turn, it receives one or more keys (defined by the turn complexity, K), +retrieves their corresponding values from the dictionary, adds them to the running sum, and outputs the new sum. +The primary metric for evaluation is the task length: the number of steps a model can execute before its accuracy drops below a certain threshold. + +languages: +english + +tags: +agent, llm, benchmark + +paper: +https://arxiv.org/abs/2509.09677 + +starred: +true """ +import re + from inspect_ai.dataset import Sample -from inspect_ai.solver import generate +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import TaskState, generate +from lighteval.metrics.metrics import Metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc -def record_to_sample(record): +def _build_prompt_and_target(record): + """ + Helper function to extract common logic for building prompt and target. + Processes the record and returns prompt, target, and metadata. + + Returns: + tuple: (prompt: str, target_str: str, metadata: dict) + """ input_keys = record["input"] input_values = record["values"] expected_output = record["output"] - MAX_ITEMS = 100 # for truncation, can be adjusted. 
- if len(input_keys) > MAX_ITEMS: - input_keys = input_keys[:MAX_ITEMS] - input_values = input_values[:MAX_ITEMS] - expected_output = expected_output[:MAX_ITEMS] - dictionary = dict(zip(input_keys, input_values)) - dict_str = str(dictionary) keys_str = str(input_keys) @@ -50,26 +80,97 @@ def record_to_sample(record): target_str = ",".join(map(str, expected_output)) + metadata = { + "input_keys": input_keys, + "input_values": input_values, + "expected_output": expected_output, + "dictionary": dictionary, + "num_items": len(input_keys), + } + + return prompt, target_str, metadata + + +def long_horizon_execution_prompt_function(line, task_name: str = None): + """ + Prompt function for non-inspect-ai backend. + Converts dataset record to Doc object. + """ + prompt, target_str, _ = _build_prompt_and_target(line) + + return Doc( + task_name=task_name, + query=prompt, + choices=[target_str], # Expected answer as a choice + gold_index=0, + instruction=prompt, + ) + + +def record_to_sample(record): + """ + Converts dataset record to inspect-ai Sample object. + """ + prompt, target_str, metadata = _build_prompt_and_target(record) + return Sample( input=prompt, target=target_str, - metadata={ - "input_keys": input_keys, - "input_values": input_values, - "expected_output": expected_output, - "dictionary": dictionary, - "num_items": len(input_keys), - }, + metadata=metadata, ) +@scorer(metrics={"accuracy": [accuracy(), stderr()]}) +def long_horizon_execution_scorer(): + async def score(state: TaskState, target: Target): + response = state.output.completion + + answer_pattern = re.compile(r"(.*?)", re.DOTALL) + match = answer_pattern.search(response) + + if not match: + return Score(value="I", answer="", explanation="No tag found in response.") + + content = match.group(1).strip() + + try: + pred_values = [int(x.strip()) for x in content.split(",") if x.strip()] + except ValueError: + return Score(value="I", answer=content, explanation=f"Failed to parse integers from: {content}") + + try: + exp_values = [int(x.strip()) for x in target.text.split(",") if x.strip()] + + except (ValueError, AttributeError): + pred_str = ",".join(map(str, pred_values)) + is_correct = pred_str == target.text + return Score( + value="C" if is_correct else "I", + answer=pred_str, + explanation=f"Expected: {target.text}, Predicted: {pred_str}", + ) + + is_correct = pred_values == exp_values + return Score( + value="C" if is_correct else "I", + answer=",".join(map(str, pred_values)), + explanation=(f"Expected {len(exp_values)} values, Got {len(pred_values)} values. 
Match: {is_correct}"), + ) + + return score + + long_horizon_execution = LightevalTaskConfig( name="long_horizon_execution", + prompt_function=long_horizon_execution_prompt_function, sample_fields=record_to_sample, solver=[generate(cache=True)], + scorer=long_horizon_execution_scorer(), hf_repo="arvindh75/Long-Horizon-Execution", hf_subset="default", evaluation_splits=("test",), + generation_size=32768, + metrics=[Metrics.exact_match], ) TASKS_TABLE = [long_horizon_execution] From 6362c8b30b49b4c68cf7ae27b7d0226e720772c3 Mon Sep 17 00:00:00 2001 From: Akshath Mangudi Date: Thu, 20 Nov 2025 22:23:34 +0530 Subject: [PATCH 3/4] addressed nathan's comments, yet to fix reproduction inaccuracies --- .../tasks/tasks/long_horizon_execution.py | 115 ++++++++++++------ 1 file changed, 80 insertions(+), 35 deletions(-) diff --git a/src/lighteval/tasks/tasks/long_horizon_execution.py b/src/lighteval/tasks/tasks/long_horizon_execution.py index 6783d645b..162c8a940 100644 --- a/src/lighteval/tasks/tasks/long_horizon_execution.py +++ b/src/lighteval/tasks/tasks/long_horizon_execution.py @@ -27,6 +27,7 @@ true """ +import functools import re from inspect_ai.dataset import Sample @@ -38,23 +39,7 @@ from lighteval.tasks.requests import Doc -def _build_prompt_and_target(record): - """ - Helper function to extract common logic for building prompt and target. - Processes the record and returns prompt, target, and metadata. - - Returns: - tuple: (prompt: str, target_str: str, metadata: dict) - """ - input_keys = record["input"] - input_values = record["values"] - expected_output = record["output"] - - dictionary = dict(zip(input_keys, input_values)) - dict_str = str(dictionary) - keys_str = str(input_keys) - - prompt = f"""You are an AI assistant. I will provide you with a dictionary and then give you a list of keys. +PROMPT_TEMPLATE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys. Your task is to calculate the running cumulative sum (starting from 0) by adding the value associated with each key in order. For each key in the list, you need to: @@ -78,6 +63,58 @@ def _build_prompt_and_target(record): Your answer:""" + +def _build_prompt_and_target(record, prompt_length=32768): + """ + Helper function to extract common logic for building prompt and target. + Uses binary search to find the maximum number of items that fit within prompt_length. + Processes the record and returns prompt, target, and metadata. 
+ + Returns: + tuple: (prompt: str, target_str: str, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + expected_output = record["output"] + + def build_prompt_for_n(n): + """Build a prompt with the first n items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + keys_str = str(keys_n) + return PROMPT_TEMPLATE.format(dict_str=dict_str, keys_str=keys_str) + + # Binary search to find maximum n that fits within prompt_length + left, right = 0, len(input_keys) + max_n = 0 + + while left <= right: + mid = (left + right) // 2 + prompt = build_prompt_for_n(mid) + + if prompt is None: + break + + if len(prompt) <= prompt_length: + max_n = mid + left = mid + 1 + else: + right = mid - 1 + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + expected_output = expected_output[:max_n] + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + keys_str = str(input_keys) + prompt = PROMPT_TEMPLATE.format(dict_str=dict_str, keys_str=keys_str) + target_str = ",".join(map(str, expected_output)) metadata = { @@ -91,12 +128,12 @@ def _build_prompt_and_target(record): return prompt, target_str, metadata -def long_horizon_execution_prompt_function(line, task_name: str = None): +def long_horizon_execution_prompt_function(line, prompt_length=32768, task_name: str = None): """ Prompt function for non-inspect-ai backend. Converts dataset record to Doc object. """ - prompt, target_str, _ = _build_prompt_and_target(line) + prompt, target_str, _ = _build_prompt_and_target(line, prompt_length=prompt_length) return Doc( task_name=task_name, @@ -107,11 +144,11 @@ def long_horizon_execution_prompt_function(line, task_name: str = None): ) -def record_to_sample(record): +def record_to_sample(record, prompt_length=32768): """ Converts dataset record to inspect-ai Sample object. 
""" - prompt, target_str, metadata = _build_prompt_and_target(record) + prompt, target_str, metadata = _build_prompt_and_target(record, prompt_length=prompt_length) return Sample( input=prompt, @@ -160,17 +197,25 @@ async def score(state: TaskState, target: Target): return score -long_horizon_execution = LightevalTaskConfig( - name="long_horizon_execution", - prompt_function=long_horizon_execution_prompt_function, - sample_fields=record_to_sample, - solver=[generate(cache=True)], - scorer=long_horizon_execution_scorer(), - hf_repo="arvindh75/Long-Horizon-Execution", - hf_subset="default", - evaluation_splits=("test",), - generation_size=32768, - metrics=[Metrics.exact_match], -) - -TASKS_TABLE = [long_horizon_execution] +TASKS_TABLE = [] +CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536] + +for context_size in CONTEXT_SIZES: + task_name = f"long_horizon_execution:{context_size}" + prompt_fn = functools.partial(long_horizon_execution_prompt_function, prompt_length=context_size) + sample_fn = functools.partial(record_to_sample, prompt_length=context_size) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[generate(cache=True)], + scorer=long_horizon_execution_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + + TASKS_TABLE.append(task) From ac7943022fa8a0c8b6212b8f566bcec54fb5d71d Mon Sep 17 00:00:00 2001 From: Akshath Mangudi Date: Fri, 21 Nov 2025 12:53:50 +0530 Subject: [PATCH 4/4] multi-turn impl + breaking down code to resolve C901 errors --- .../tasks/tasks/long_horizon_execution.py | 221 ----------------- .../tasks/long_horizon_execution/__init__.py | 0 .../tasks/long_horizon_execution/constants.py | 48 ++++ .../tasks/long_horizon_execution/main.py | 47 ++++ .../long_horizon_execution/multi_turn.py | 224 ++++++++++++++++++ .../long_horizon_execution/single_turn.py | 132 +++++++++++ .../tasks/long_horizon_execution/utils.py | 214 +++++++++++++++++ 7 files changed, 665 insertions(+), 221 deletions(-) delete mode 100644 src/lighteval/tasks/tasks/long_horizon_execution.py create mode 100644 src/lighteval/tasks/tasks/long_horizon_execution/__init__.py create mode 100644 src/lighteval/tasks/tasks/long_horizon_execution/constants.py create mode 100644 src/lighteval/tasks/tasks/long_horizon_execution/main.py create mode 100644 src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py create mode 100644 src/lighteval/tasks/tasks/long_horizon_execution/single_turn.py create mode 100644 src/lighteval/tasks/tasks/long_horizon_execution/utils.py diff --git a/src/lighteval/tasks/tasks/long_horizon_execution.py b/src/lighteval/tasks/tasks/long_horizon_execution.py deleted file mode 100644 index 162c8a940..000000000 --- a/src/lighteval/tasks/tasks/long_horizon_execution.py +++ /dev/null @@ -1,221 +0,0 @@ -""" -name: -Long Horizon Execution - -dataset: -arvindh75/Long-Horizon-Execution - -abstract: -This dataset is a synthetic benchmark designed to measure the pure execution -capability of LLMs over long horizons. The core task is key-value dictionary addition. -A fixed, in-context dictionary mapping five-letter English words (keys) to integer values -is provided in dictionary.json. The model's goal is to maintain a running sum. 
-In each turn, it receives one or more keys (defined by the turn complexity, K), -retrieves their corresponding values from the dictionary, adds them to the running sum, and outputs the new sum. -The primary metric for evaluation is the task length: the number of steps a model can execute before its accuracy drops below a certain threshold. - -languages: -english - -tags: -agent, llm, benchmark - -paper: -https://arxiv.org/abs/2509.09677 - -starred: -true -""" - -import functools -import re - -from inspect_ai.dataset import Sample -from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr -from inspect_ai.solver import TaskState, generate - -from lighteval.metrics.metrics import Metrics -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc - - -PROMPT_TEMPLATE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys. -Your task is to calculate the running cumulative sum (starting from 0) by adding the value associated with each key in order. - -For each key in the list, you need to: -1. Look up the value in the dictionary -2. Add it to the running sum -3. Output the cumulative sum after processing all keys up to that point - -Dictionary to use: -{dict_str} - -Keys to process in order: -{keys_str} - -Your task: Calculate the cumulative sum after each key. The first sum is just the value of the first key. The second sum is the first value plus the second value, and so on. - -IMPORTANT: -- Output your answer as a single line with comma-separated values inside tags -- Do not include any other text outside the answer tags -- Format: value1,value2,value3,... -- Example: If the cumulative sums are [5, 8, 12], output: 5,8,12 - -Your answer:""" - - -def _build_prompt_and_target(record, prompt_length=32768): - """ - Helper function to extract common logic for building prompt and target. - Uses binary search to find the maximum number of items that fit within prompt_length. - Processes the record and returns prompt, target, and metadata. 
- - Returns: - tuple: (prompt: str, target_str: str, metadata: dict) - """ - input_keys = record["input"] - input_values = record["values"] - expected_output = record["output"] - - def build_prompt_for_n(n): - """Build a prompt with the first n items.""" - if n == 0: - return None - keys_n = input_keys[:n] - values_n = input_values[:n] - dictionary_n = dict(zip(keys_n, values_n)) - dict_str = str(dictionary_n) - keys_str = str(keys_n) - return PROMPT_TEMPLATE.format(dict_str=dict_str, keys_str=keys_str) - - # Binary search to find maximum n that fits within prompt_length - left, right = 0, len(input_keys) - max_n = 0 - - while left <= right: - mid = (left + right) // 2 - prompt = build_prompt_for_n(mid) - - if prompt is None: - break - - if len(prompt) <= prompt_length: - max_n = mid - left = mid + 1 - else: - right = mid - 1 - - # Use the maximum n that fits - input_keys = input_keys[:max_n] - input_values = input_values[:max_n] - expected_output = expected_output[:max_n] - - dictionary = dict(zip(input_keys, input_values)) - dict_str = str(dictionary) - keys_str = str(input_keys) - prompt = PROMPT_TEMPLATE.format(dict_str=dict_str, keys_str=keys_str) - - target_str = ",".join(map(str, expected_output)) - - metadata = { - "input_keys": input_keys, - "input_values": input_values, - "expected_output": expected_output, - "dictionary": dictionary, - "num_items": len(input_keys), - } - - return prompt, target_str, metadata - - -def long_horizon_execution_prompt_function(line, prompt_length=32768, task_name: str = None): - """ - Prompt function for non-inspect-ai backend. - Converts dataset record to Doc object. - """ - prompt, target_str, _ = _build_prompt_and_target(line, prompt_length=prompt_length) - - return Doc( - task_name=task_name, - query=prompt, - choices=[target_str], # Expected answer as a choice - gold_index=0, - instruction=prompt, - ) - - -def record_to_sample(record, prompt_length=32768): - """ - Converts dataset record to inspect-ai Sample object. - """ - prompt, target_str, metadata = _build_prompt_and_target(record, prompt_length=prompt_length) - - return Sample( - input=prompt, - target=target_str, - metadata=metadata, - ) - - -@scorer(metrics={"accuracy": [accuracy(), stderr()]}) -def long_horizon_execution_scorer(): - async def score(state: TaskState, target: Target): - response = state.output.completion - - answer_pattern = re.compile(r"(.*?)", re.DOTALL) - match = answer_pattern.search(response) - - if not match: - return Score(value="I", answer="", explanation="No tag found in response.") - - content = match.group(1).strip() - - try: - pred_values = [int(x.strip()) for x in content.split(",") if x.strip()] - except ValueError: - return Score(value="I", answer=content, explanation=f"Failed to parse integers from: {content}") - - try: - exp_values = [int(x.strip()) for x in target.text.split(",") if x.strip()] - - except (ValueError, AttributeError): - pred_str = ",".join(map(str, pred_values)) - is_correct = pred_str == target.text - return Score( - value="C" if is_correct else "I", - answer=pred_str, - explanation=f"Expected: {target.text}, Predicted: {pred_str}", - ) - - is_correct = pred_values == exp_values - return Score( - value="C" if is_correct else "I", - answer=",".join(map(str, pred_values)), - explanation=(f"Expected {len(exp_values)} values, Got {len(pred_values)} values. 
Match: {is_correct}"), - ) - - return score - - -TASKS_TABLE = [] -CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536] - -for context_size in CONTEXT_SIZES: - task_name = f"long_horizon_execution:{context_size}" - prompt_fn = functools.partial(long_horizon_execution_prompt_function, prompt_length=context_size) - sample_fn = functools.partial(record_to_sample, prompt_length=context_size) - - task = LightevalTaskConfig( - name=task_name, - prompt_function=prompt_fn, - sample_fields=sample_fn, - solver=[generate(cache=True)], - scorer=long_horizon_execution_scorer(), - hf_repo="arvindh75/Long-Horizon-Execution", - hf_subset="default", - evaluation_splits=("test",), - generation_size=context_size, - metrics=[Metrics.exact_match], - ) - - TASKS_TABLE.append(task) diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/__init__.py b/src/lighteval/tasks/tasks/long_horizon_execution/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/constants.py b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py new file mode 100644 index 000000000..f2c235261 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py @@ -0,0 +1,48 @@ +""" +Constants file reused within the Long Horizon Execution task. +""" + +PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys. +Your task is to calculate the final cumulative sum after processing all keys in order. + +For each key in the list, you need to: +1. Look up the value in the dictionary +2. Add it to the running sum +3. After processing all keys, output the final cumulative sum + +Dictionary to use: +{dict_str} + +Keys to process in order: +{keys_str} + +Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys. + +IMPORTANT: +- Output your answer as a single integer value inside tags +- Do not include any other text outside the answer tags +- Format: final_sum +- Example: If the final cumulative sum is 42, output: 42 + +Your answer:""" + +PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}. +Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide. + +In each turn, I'll provide {k} keys (comma-separated). +Respond with the current running sum, enclosed in tags. + +Dictionary to maintain: +{dict_str} + +Ready to start! + +**User**: {keys_str} +**Assistant**:""" + +PROMPT_TEMPLATE_MULTI_FOLLOWUP = """Here are the next keys to process: +**User**: {keys_str} +**Assistant**:""" + +CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536] +TURN_COMPLEXITIES = [1, 2, 10] diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/main.py b/src/lighteval/tasks/tasks/long_horizon_execution/main.py new file mode 100644 index 000000000..472eb3fc6 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/main.py @@ -0,0 +1,47 @@ +""" +name: +Long Horizon Execution + +dataset: +arvindh75/Long-Horizon-Execution + +abstract: +Evaluation benchmark for long-context execution capabilities of language models. +Tests a model's ability to maintain state and perform cumulative operations over +long sequences of inputs. Supports both single-turn (all inputs at once) and +multi-turn (inputs provided incrementally) evaluation modes. + +The task requires models to: +1. 
Maintain a dictionary mapping keys to values +2. Process a sequence of keys +3. Calculate cumulative sums after each key or group of keys +4. Handle varying context sizes and turn complexities + +Single-turn evaluation (Section 3.3): Model outputs only the final cumulative sum +after processing all keys, allowing any aggregation strategy. + +Multi-turn evaluation: Model processes keys in batches of K per turn, maintaining +conversation history and outputting cumulative sums incrementally. Evaluates +fractional accuracy (correct turns / total turns). + +languages: +english + +tags: +long-context, state-tracking, arithmetic, execution + +paper: +https://arxiv.org/abs/2509.09677 + +starred: +true +""" + +from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks +from lighteval.tasks.tasks.long_horizon_execution.single_turn import create_single_turn_tasks + + +single_turn_tasks = create_single_turn_tasks() +multi_turn_tasks = create_multi_turn_tasks() + +TASKS_TABLE = single_turn_tasks + multi_turn_tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py new file mode 100644 index 000000000..34e34080e --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py @@ -0,0 +1,224 @@ +""" +Multi-turn implementation of the Long Horizon Execution task. + +This implementation matches the multi-turn evaluation approach from the research paper, +where keys are provided in batches of K per turn, and the model maintains conversation +state to output cumulative sums after each turn. +""" + +import functools +import re + +from inspect_ai.dataset import Sample +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import TaskState, generate + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + CONTEXT_SIZES, + PROMPT_TEMPLATE_MULTI_FOLLOWUP, + TURN_COMPLEXITIES, +) +from lighteval.tasks.tasks.long_horizon_execution.utils import _build_multi_turn_prompts + + +def multi_turn_prompt_function(line, prompt_length=32768, k=1, task_name: str = None): + """ + Prompt function for non-inspect-ai backend for multi-turn evaluation. + Converts dataset record to Doc object. + + Note: For multi-turn, this returns the first turn's prompt. + Subsequent turns are handled by the solver. + """ + initial_prompt, _, expected_per_turn, _ = _build_multi_turn_prompts(line, prompt_length=prompt_length, k=k) + + return Doc( + task_name=task_name, + query=initial_prompt, + choices=[str(expected_per_turn[-1])], # Final sum as choice + gold_index=0, + instruction=initial_prompt, + ) + + +def multi_turn_record_to_sample(record, prompt_length=32768, k=1): + """ + Converts dataset record to inspect-ai Sample object for multi-turn evaluation. + Stores all turn information in metadata for the solver to use. 
+ """ + initial_prompt, _, expected_per_turn, metadata = _build_multi_turn_prompts( + record, prompt_length=prompt_length, k=k + ) + + return Sample( + input=initial_prompt, + target=str(expected_per_turn[-1]), + metadata=metadata, + ) + + +def _extract_response_content(response): + """Extract content from model response object.""" + if hasattr(response, "content"): + return response.content + if hasattr(response, "completion"): + return response.completion + return str(response) + + +async def _process_single_turn(state, turn_chunk, config): + """Process a single turn: add user message, get model response, add assistant message.""" + keys_str = ", ".join(turn_chunk) + followup_prompt = PROMPT_TEMPLATE_MULTI_FOLLOWUP.format(keys_str=keys_str) + state.messages.append({"role": "user", "content": followup_prompt}) + + response = await state.model.generate(messages=state.messages, config=config) + turn_response = _extract_response_content(response) + + state.messages.append({"role": "assistant", "content": turn_response}) + return turn_response + + +async def multi_turn_solver(state: TaskState): + """ + Custom solver for multi-turn evaluation. + Loops through turns, calling the model for each turn while maintaining conversation history. + + This implements offline evaluation: all turns are called, then evaluation happens. + """ + from inspect_ai.model import GenerateConfig, ModelOutput + + turn_chunks = state.metadata.get("turn_chunks", []) + + if not turn_chunks or len(turn_chunks) == 0: + return state + + # Initialize messages + if not hasattr(state, "messages") or state.messages is None: + state.messages = [] + + if not state.messages: + state.messages.append({"role": "user", "content": state.input}) + + all_turn_outputs = [] + + # Process all turns + if hasattr(state, "model") and state.model is not None: + config = GenerateConfig() + + # Process first turn (already in messages as initial prompt) + response = await state.model.generate(messages=state.messages, config=config) + turn_response = _extract_response_content(response) + all_turn_outputs.append(turn_response) + state.messages.append({"role": "assistant", "content": turn_response}) + + # Process remaining turns + for turn_idx in range(1, len(turn_chunks)): + if not hasattr(state, "model") or state.model is None: + break + turn_response = await _process_single_turn(state, turn_chunks[turn_idx], config) + all_turn_outputs.append(turn_response) + + state.metadata["all_turn_outputs"] = all_turn_outputs + + # Set final output + if all_turn_outputs: + if hasattr(state, "output") and state.output is not None: + state.output.completion = all_turn_outputs[-1] + else: + state.output = ModelOutput(completion=all_turn_outputs[-1]) + + return state + + +@scorer(metrics={"turn_accuracy": [accuracy(), stderr()], "fractional_accuracy": [accuracy(), stderr()]}) +def multi_turn_scorer(): + """ + Scorer for multi-turn Long Horizon Execution task. + Compares predicted cumulative sums at each turn with expected. + Returns fractional accuracy (correct turns / total turns). 
+ """ + + async def score(state: TaskState, target: Target): + # metadata stored by solver + all_turn_outputs = state.metadata.get("all_turn_outputs", []) + expected_per_turn = state.metadata.get("expected_per_turn", []) + + if not all_turn_outputs: + return Score(value=0.0, answer="", explanation="No turn outputs found in state.metadata") + + if len(all_turn_outputs) != len(expected_per_turn): + return Score( + value=0.0, + answer="", + explanation=f"Mismatch: {len(all_turn_outputs)} outputs vs {len(expected_per_turn)} expected turns", + ) + + parsed_outputs = [] + answer_pattern = re.compile(r"(.*?)", re.DOTALL) + + for turn_idx, turn_output in enumerate(all_turn_outputs): + match = answer_pattern.search(turn_output) + if match: + try: + parsed_value = int(match.group(1).strip()) + parsed_outputs.append(parsed_value) + except ValueError: + parsed_outputs.append(None) + else: + parsed_outputs.append(None) + + correct_turns = 0 + turn_results = [] + for turn_idx, (pred, exp) in enumerate(zip(parsed_outputs, expected_per_turn)): + is_correct = (pred is not None) and (pred == exp) + if is_correct: + correct_turns += 1 + turn_results.append({"turn": turn_idx + 1, "predicted": pred, "expected": exp, "correct": is_correct}) + + fractional_accuracy = correct_turns / len(expected_per_turn) if expected_per_turn else 0.0 + + return Score( + value={ + "turn_accuracy": fractional_accuracy, + "fractional_accuracy": fractional_accuracy, + "correct_turns": correct_turns, + "total_turns": len(expected_per_turn), + }, + answer=str(parsed_outputs), + explanation=f"Correct {correct_turns}/{len(expected_per_turn)} turns. Details: {turn_results}", + ) + + return score + + +def create_multi_turn_tasks(): + """ + Creates a list of LightevalTaskConfig objects for multi-turn Long Horizon Execution. + Each task corresponds to a different combination of context size and turn complexity (K). + """ + tasks = [] + + for context_size in CONTEXT_SIZES: + for k in TURN_COMPLEXITIES: + task_name = f"long_horizon_execution:multi:{context_size}:k{k}" + prompt_fn = functools.partial(multi_turn_prompt_function, prompt_length=context_size, k=k) + sample_fn = functools.partial(multi_turn_record_to_sample, prompt_length=context_size, k=k) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[multi_turn_solver, generate(cache=True)], + scorer=multi_turn_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + tasks.append(task) + + return tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/single_turn.py b/src/lighteval/tasks/tasks/long_horizon_execution/single_turn.py new file mode 100644 index 000000000..c6fd0ca4b --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/single_turn.py @@ -0,0 +1,132 @@ +""" +Single turn implementation of the Long Horizon Execution task. 
+""" + +import functools +import re + +from inspect_ai.dataset import Sample +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import TaskState, generate + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES, PROMPT_TEMPLATE_SINGLE +from lighteval.tasks.tasks.long_horizon_execution.utils import _build_prompt_and_target + + +def single_turn_prompt_function(line, prompt_length=32768, task_name: str = None): + """ + Prompt function for single-turn evaluation (non-inspect-ai backend). + Converts dataset record to Doc object. + + Returns: + Doc object for evaluation + """ + prompt, target_str, _ = _build_prompt_and_target( + line, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE + ) + + return Doc( + task_name=task_name, + query=prompt, + choices=[target_str], # Expected answer as a choice + gold_index=0, + instruction=prompt, + ) + + +def single_turn_record_to_sample(record, prompt_length=32768): + """ + Converts dataset record to inspect-ai Sample object for single-turn evaluation. + + Returns: + Sample object for inspect-ai + """ + prompt, target_str, metadata = _build_prompt_and_target( + record, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE + ) + + return Sample( + input=prompt, + target=target_str, + metadata=metadata, + ) + + +@scorer(metrics={"accuracy": [accuracy(), stderr()]}) +def single_turn_scorer(): + """ + Scorer for single-turn evaluation. + Compares the model's predicted final sum with the expected final sum (binary score). + + Returns: + Scorer function that evaluates single integer responses + """ + + async def score(state: TaskState, target: Target): + response = state.output.completion + + answer_pattern = re.compile(r"(.*?)", re.DOTALL) + match = answer_pattern.search(response) + + if not match: + return Score(value="I", answer="", explanation="No tag found in response.") + + content = match.group(1).strip() + + try: + pred_value = int(content.strip()) + except ValueError: + return Score(value="I", answer=content, explanation=f"Failed to parse integer from: {content}") + + try: + exp_value = int(target.text.strip()) + except (ValueError, AttributeError): + return Score( + value="I", + answer=str(pred_value), + explanation=f"Failed to parse expected target: {target.text}", + ) + + is_correct = pred_value == exp_value + return Score( + value="C" if is_correct else "I", + answer=str(pred_value), + explanation=(f"Expected {exp_value}, Got {pred_value}. Match: {is_correct}"), + ) + + return score + + +def create_single_turn_tasks(): + """ + Create all single-turn task configurations for different context sizes. 
+ + Returns: + list[LightevalTaskConfig]: List of task configurations for single-turn evaluation + """ + tasks = [] + + for context_size in CONTEXT_SIZES: + task_name = f"long_horizon_execution:{context_size}" + prompt_fn = functools.partial(single_turn_prompt_function, prompt_length=context_size) + sample_fn = functools.partial(single_turn_record_to_sample, prompt_length=context_size) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[generate(cache=True)], + scorer=single_turn_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + + tasks.append(task) + + return tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/utils.py b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py new file mode 100644 index 000000000..cc7eadca5 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py @@ -0,0 +1,214 @@ +""" +Utility functions for Long Horizon Execution task. +""" + +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + PROMPT_TEMPLATE_MULTI_START, + PROMPT_TEMPLATE_SINGLE, +) + + +def _binary_search_max_items(input_keys, build_prompt_fn, prompt_length, min_items=1): + """ + Generic binary search to find maximum number of items that fit within prompt_length. + + Returns: + int: Maximum number of items that fit + + """ + # Pre-validate that at least min_items fit within prompt_length + test_prompt = build_prompt_fn(min_items) + if test_prompt is None: + raise ValueError("Cannot build prompt: unable to generate prompt with available items") + + if len(test_prompt) > prompt_length: + item_label = "item" if min_items == 1 else f"{min_items} items" + raise ValueError( + f"Prompt length ({prompt_length} chars) is too small to fit {item_label}. " + f"Minimum required: {len(test_prompt)} chars. " + f"Please increase prompt_length or reduce dataset complexity." + ) + + # Binary search to find maximum n that fits within prompt_length + left, right = min_items, len(input_keys) + max_n = min_items + + while left <= right: + mid = (left + right) // 2 + prompt = build_prompt_fn(mid) + + if prompt is None: + right = mid - 1 + continue + + if len(prompt) <= prompt_length: + max_n = mid + left = mid + 1 + else: + right = mid - 1 + + return max_n + + +def _build_prompt_and_target(record, prompt_length=32768, prompt_template=PROMPT_TEMPLATE_SINGLE): + """ + Helper function to extract common logic for building prompt and target. + Uses binary search to find the maximum number of items that fit within prompt_length. + Processes the record and returns prompt, target, and metadata. + + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + prompt_template: Prompt template to use for formatting. Defaults to PROMPT_TEMPLATE_SINGLE. 
+ + Returns: + tuple: (prompt: str, target_str: str, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + expected_output = record["output"] + + def build_prompt_for_n(n): + """Build a prompt with the first n items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + keys_str = str(keys_n) + return prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=n) + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + max_n = _binary_search_max_items(input_keys, build_prompt_for_n, prompt_length, min_items=1) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + expected_output = expected_output[:max_n] + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + keys_str = str(input_keys) + prompt = prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=len(input_keys)) + + target_str = str(expected_output[-1]) + + metadata = { + "input_keys": input_keys, + "input_values": input_values, + "expected_output": expected_output, + "dictionary": dictionary, + "num_items": len(input_keys), + } + + return prompt, target_str, metadata + + +def _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k): + """ + Find maximum number of items that fit within prompt_length for multi-turn evaluation. + Uses binary search to find max items where initial prompt (dict + first K keys) fits. + + Returns: + int: Maximum number of items that fit + """ + + def build_initial_prompt_for_n(n): + """Build initial prompt with dictionary and first K keys from n total items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + + # First turn has first K keys + first_turn_keys = keys_n[:k] + keys_str = ", ".join(first_turn_keys) + + return PROMPT_TEMPLATE_MULTI_START.format( + dict_str=dict_str, keys_str=keys_str, k=k, num_keys=len(first_turn_keys) + ) + + return _binary_search_max_items(input_keys, build_initial_prompt_for_n, prompt_length, min_items=k) + + +def _chunk_and_calculate_expected(input_keys, input_values, k): + """ + Chunk keys into turns of size K and calculate expected cumulative sums per turn. + + Returns: + tuple: (turn_chunks: list, value_chunks: list, expected_per_turn: list) + """ + # Chunk keys into turns of size K + turn_chunks = [] + value_chunks = [] + for i in range(0, len(input_keys), k): + turn_chunks.append(input_keys[i : i + k]) + value_chunks.append(input_values[i : i + k]) + + # Calculate expected cumulative sums for each turn + expected_per_turn = [] + cumulative_sum = 0 + for values in value_chunks: + cumulative_sum += sum(values) + expected_per_turn.append(cumulative_sum) + + return turn_chunks, value_chunks, expected_per_turn + + +def _build_multi_turn_prompts(record, prompt_length=32768, k=1): + """ + Build prompts for multi-turn evaluation. + Uses binary search to find maximum number of items that fit within prompt_length. + Chunks keys into turns of size K. + + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + k: Turn complexity (number of keys per turn). Defaults to 1. 
+ + Returns: + tuple: (initial_prompt: str, turn_chunks: list, expected_per_turn: list, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + expected_output = record["output"] + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + # Find maximum number of items that fit + max_n = _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + expected_output = expected_output[:max_n] + + turn_chunks, value_chunks, expected_per_turn = _chunk_and_calculate_expected(input_keys, input_values, k) + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + + first_turn_keys_str = ", ".join(turn_chunks[0]) + initial_prompt = PROMPT_TEMPLATE_MULTI_START.format( + dict_str=dict_str, keys_str=first_turn_keys_str, k=k, num_keys=len(turn_chunks[0]) + ) + + metadata = { + "turn_chunks": turn_chunks, + "value_chunks": value_chunks, + "expected_per_turn": expected_per_turn, + "dictionary": dictionary, + "k": k, + "num_turns": len(turn_chunks), + "num_items": len(input_keys), + } + + return initial_prompt, turn_chunks, expected_per_turn, metadata
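
Reviewer note (not part of the patch series): below is a minimal, self-contained sketch of the data flow the tasks assume, useful for quick sanity checking. The toy record is hypothetical; it only mirrors the `input`/`values`/`output` fields read by `_build_prompt_and_target`, and it assumes model answers are wrapped in <answer></answer> tags, which is what the scorer regexes expect.

import re

# Hypothetical record mirroring the dataset fields the task reads.
record = {
    "input": ["apple", "berry", "chart"],  # keys
    "values": [5, 3, 4],                   # value looked up for each key
    "output": [5, 8, 12],                  # running cumulative sums
}

# Single-turn target: only the final cumulative sum is scored.
single_turn_target = str(record["output"][-1])  # "12"

# Multi-turn with K=2: chunk values into turns and track the running sum per turn.
k = 2
value_chunks = [record["values"][i : i + k] for i in range(0, len(record["values"]), k)]
expected_per_turn, running = [], 0
for chunk in value_chunks:
    running += sum(chunk)
    expected_per_turn.append(running)  # -> [8, 12]

# Parse a model reply the way the scorers do (assuming <answer> tags).
reply = "The final sum is <answer>12</answer>"
match = re.search(r"<answer>(.*?)</answer>", reply, re.DOTALL)
predicted = int(match.group(1).strip()) if match else None
print(single_turn_target, expected_per_turn, predicted)  # 12 [8, 12] 12

With K=1 the per-turn sums reduce to the dataset's `output` column itself, which is how the same record serves both the single-turn target (last element) and the multi-turn per-turn expectations.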