src/lighteval/tasks/tasks/long_horizon_execution/constants.py
"""
Constants shared across the Long Horizon Execution tasks.
"""

PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys.
Your task is to calculate the final cumulative sum after processing all keys in order.

For each key in the list, you need to:
1. Look up the value in the dictionary
2. Add it to the running sum
3. After processing all keys, output the final cumulative sum

Dictionary to use:
{dict_str}

Keys to process in order:
{keys_str}

Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys.

IMPORTANT:
- Output your answer as a single integer value inside <answer></answer> tags
- Do not include any other text outside the answer tags
- Format: <answer>final_sum</answer>
- Example: If the final cumulative sum is 42, output: <answer>42</answer>

Your answer:"""
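
# Worked example (hypothetical values): with the dictionary {"alpha": 4, "beta": -2}
# and keys ["alpha", "beta", "alpha"], the running sum is 4 -> 2 -> 6, so the
# expected single-turn response is <answer>6</answer>.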

PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}.
Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide.
In each turn, I'll provide {k} key(s) (comma-separated).
Respond with the current running sum, enclosed in <answer> tags.

Dictionary to maintain:
{dict_str}

Ready to start!
**User**: {keys_str}
**Assistant**:"""

PROMPT_TEMPLATE_MULTI_FOLLOWUP = """Here are the next keys to process:
**User**: {keys_str}
**Assistant**:"""
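
# Worked multi-turn example (hypothetical values, k=1): reusing the dictionary
# above, turn 1 sends key "alpha" and expects <answer>4</answer>; turn 2 sends
# "beta" and expects the running sum <answer>2</answer>.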

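# Prompt lengths (presumably token budgets) for which task variants are created.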
CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536]
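# Number of keys (K) provided per turn in the multi-turn setting.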
TURN_COMPLEXITIES = [1, 2, 10]
src/lighteval/tasks/tasks/long_horizon_execution/main.py
"""
name:
Long Horizon Execution

dataset:
arvindh75/Long-Horizon-Execution

abstract:
Evaluation benchmark for the long-horizon execution capabilities of language models.
Tests a model's ability to maintain state and perform cumulative operations over
long sequences of inputs. Supports both single-turn (all inputs at once) and
multi-turn (inputs provided incrementally) evaluation modes.
The task requires models to:
1. Maintain a dictionary mapping keys to values
2. Process a sequence of keys
3. Calculate cumulative sums after each key or group of keys
4. Handle varying context sizes and turn complexities
Single-turn evaluation (Section 3.3 of the paper): the model outputs only the
final cumulative sum after processing all keys, allowing any aggregation strategy.

Multi-turn evaluation: the model processes keys in batches of K per turn, maintaining
conversation history and outputting cumulative sums incrementally. Evaluates
fractional accuracy (correct turns / total turns).

languages:
english

tags:
long-context, state-tracking, arithmetic, execution

paper:
https://arxiv.org/abs/2509.09677

starred: true
"""

import functools
import re

from inspect_ai.dataset import Sample
from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES, PROMPT_TEMPLATE_SINGLE
from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks
from lighteval.tasks.tasks.long_horizon_execution.utils import _build_prompt_and_target


def single_turn_prompt_function(line, prompt_length=32768, task_name: str | None = None):
    """
    Prompt function for single-turn evaluation (non-inspect-ai backend).
    Converts a dataset record to a Doc object.
    Returns:
        Doc object for evaluation
    """
    prompt, target_str, _ = _build_prompt_and_target(
        line, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE
    )

    return Doc(
        task_name=task_name,
        query=prompt,
        choices=[target_str],  # Expected answer as a choice
        gold_index=0,
        instruction=prompt,
    )


def single_turn_record_to_sample(record, prompt_length=32768):
    """
    Converts a dataset record to an inspect-ai Sample object for single-turn evaluation.
    Returns:
        Sample object for inspect-ai
    """
    prompt, target_str, metadata = _build_prompt_and_target(
        record, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE
    )

    return Sample(
        input=prompt,
        target=target_str,
        metadata=metadata,
    )


@scorer(metrics=[accuracy(), stderr()])
def single_turn_scorer():
    """
    Scorer for single-turn evaluation.
    Compares the model's predicted final sum with the expected final sum (binary score).
    Returns:
        Scorer function that evaluates single integer responses
    """

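    # Score values "C"/"I" map to 1.0/0.0 under inspect-ai's accuracy metric.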
    async def score(state: TaskState, target: Target):
        response = state.output.completion

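        # The prompt asks for the final sum inside <answer></answer> tags;
        # parse the first such span from the completion.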
        answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
        match = answer_pattern.search(response)

        if not match:
            return Score(value="I", answer="", explanation="No <answer> tag found in response.")

        content = match.group(1).strip()

        try:
            pred_value = int(content)
        except ValueError:
            return Score(value="I", answer=content, explanation=f"Failed to parse integer from: {content}")

        try:
            exp_value = int(target.text.strip())
        except (ValueError, AttributeError):
            return Score(
                value="I",
                answer=str(pred_value),
                explanation=f"Failed to parse expected target: {target.text}",
            )

        is_correct = pred_value == exp_value
        return Score(
            value="C" if is_correct else "I",
            answer=str(pred_value),
            explanation=f"Expected {exp_value}, got {pred_value}. Match: {is_correct}",
        )

    return score


def create_single_turn_tasks():
    """
    Create all single-turn task configurations for different context sizes.
    Returns:
        list[LightevalTaskConfig]: List of task configurations for single-turn evaluation
    """
    tasks = []

    for context_size in CONTEXT_SIZES:
        task_name = f"long_horizon_execution_single:{context_size}"
        prompt_fn = functools.partial(single_turn_prompt_function, prompt_length=context_size)
        sample_fn = functools.partial(single_turn_record_to_sample, prompt_length=context_size)
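
        # One prompt builder serves both backends: prompt_function for the
        # non-inspect-ai pipeline and sample_fields for inspect-ai.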

        task = LightevalTaskConfig(
            name=task_name,
            prompt_function=prompt_fn,
            sample_fields=sample_fn,
            solver=[generate(cache=True)],
            scorer=single_turn_scorer(),
            hf_repo="arvindh75/Long-Horizon-Execution",
            hf_subset="default",
            evaluation_splits=("test",),
            generation_size=context_size,
            metrics=[Metrics.exact_match],
        )

        tasks.append(task)

    return tasks


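# TASKS_TABLE is the registry entry point that lighteval scans for task configs.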
single_turn_tasks = create_single_turn_tasks()
multi_turn_tasks = create_multi_turn_tasks()

TASKS_TABLE = single_turn_tasks + multi_turn_tasks