diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/constants.py b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py new file mode 100644 index 000000000..2a5c9954f --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/constants.py @@ -0,0 +1,46 @@ +""" +Constants shared across the Long Horizon Execution task. +""" + +PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys. +Your task is to calculate the final cumulative sum after processing all keys in order. + +For each key in the list, you need to: +1. Look up the value in the dictionary +2. Add it to the running sum +3. After processing all keys, output the final cumulative sum + +Dictionary to use: +{dict_str} + +Keys to process in order: +{keys_str} + +Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys. + +IMPORTANT: +- Output your answer as a single integer value inside <answer></answer> tags +- Do not include any other text outside the answer tags +- Format: <answer>final_sum</answer> +- Example: If the final cumulative sum is 42, output: <answer>42</answer> + +Your answer:""" + +PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}. +Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide. +In each turn, I'll provide {k} key(s) (comma-separated). +Respond with the current running sum, enclosed in <answer></answer> tags. + +Dictionary to maintain: +{dict_str} + +Ready to start! +**User**: {keys_str} +**Assistant**:""" + +PROMPT_TEMPLATE_MULTI_FOLLOWUP = """Here are the next keys to process: +**User**: {keys_str} +**Assistant**:""" + +CONTEXT_SIZES = [1024, 2048, 4096, 8192, 16384, 32768, 65536] +TURN_COMPLEXITIES = [1, 2, 10] diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/main.py b/src/lighteval/tasks/tasks/long_horizon_execution/main.py new file mode 100644 index 000000000..09686b40c --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/main.py @@ -0,0 +1,167 @@ +""" +name: +Long Horizon Execution + +dataset: +arvindh75/Long-Horizon-Execution + +abstract: +Evaluation benchmark for long-context execution capabilities of language models. +Tests a model's ability to maintain state and perform cumulative operations over +long sequences of inputs. Supports both single-turn (all inputs at once) and +multi-turn (inputs provided incrementally) evaluation modes. +The task requires models to: +1. Maintain a dictionary mapping keys to values +2. Process a sequence of keys +3. Calculate cumulative sums after each key or group of keys +4. Handle varying context sizes and turn complexities +Single-turn evaluation (Section 3.3): Model outputs only the final cumulative sum +after processing all keys, allowing any aggregation strategy. + +Multi-turn evaluation: Model processes keys in batches of K per turn, maintaining +conversation history and outputting cumulative sums incrementally. Evaluates +fractional accuracy (correct turns / total turns).
+ +languages: +english + +tags: +long-context, state-tracking, arithmetic, execution + +paper: +https://arxiv.org/abs/2509.09677 + +starred: true +""" + +import functools +import re + +from inspect_ai.dataset import Sample +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import TaskState, generate + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES, PROMPT_TEMPLATE_SINGLE +from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks +from lighteval.tasks.tasks.long_horizon_execution.utils import _build_prompt_and_target + + +def single_turn_prompt_function(line, prompt_length=32768, task_name: str = None): + """ + Prompt function for single-turn evaluation (non-inspect-ai backend). + Converts dataset record to Doc object. + Returns: + Doc object for evaluation + """ + prompt, target_str, _ = _build_prompt_and_target( + line, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE + ) + + return Doc( + task_name=task_name, + query=prompt, + choices=[target_str], # Expected answer as a choice + gold_index=0, + instruction=prompt, + ) + + +def single_turn_record_to_sample(record, prompt_length=32768): + """ + Converts dataset record to inspect-ai Sample object for single-turn evaluation. + Returns: + Sample object for inspect-ai + """ + prompt, target_str, metadata = _build_prompt_and_target( + record, prompt_length=prompt_length, prompt_template=PROMPT_TEMPLATE_SINGLE + ) + + return Sample( + input=prompt, + target=target_str, + metadata=metadata, + ) + + +@scorer(metrics=[accuracy(), stderr()]) +def single_turn_scorer(): + """ + Scorer for single-turn evaluation. + Compares the model's predicted final sum with the expected final sum (binary score). + Returns: + Scorer function that evaluates single integer responses + """ + + async def score(state: TaskState, target: Target): + response = state.output.completion + + answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL) + match = answer_pattern.search(response) + + if not match: + return Score(value="I", answer="", explanation="No <answer> tag found in response.") + + content = match.group(1).strip() + + try: + pred_value = int(content.strip()) + except ValueError: + return Score(value="I", answer=content, explanation=f"Failed to parse integer from: {content}") + + try: + exp_value = int(target.text.strip()) + except (ValueError, AttributeError): + return Score( + value="I", + answer=str(pred_value), + explanation=f"Failed to parse expected target: {target.text}", + ) + + is_correct = pred_value == exp_value + return Score( + value="C" if is_correct else "I", + answer=str(pred_value), + explanation=(f"Expected {exp_value}, Got {pred_value}. Match: {is_correct}"), + ) + + return score + + +def create_single_turn_tasks(): + """ + Create all single-turn task configurations for different context sizes.
+ Returns: + list[LightevalTaskConfig]: List of task configurations for single-turn evaluation + """ + tasks = [] + + for context_size in CONTEXT_SIZES: + task_name = f"long_horizon_execution_single:{context_size}" + prompt_fn = functools.partial(single_turn_prompt_function, prompt_length=context_size) + sample_fn = functools.partial(single_turn_record_to_sample, prompt_length=context_size) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[generate(cache=True)], + scorer=single_turn_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + + tasks.append(task) + + return tasks + + +single_turn_tasks = create_single_turn_tasks() +multi_turn_tasks = create_multi_turn_tasks() + +TASKS_TABLE = single_turn_tasks + multi_turn_tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py new file mode 100644 index 000000000..e34638fd0 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py @@ -0,0 +1,219 @@ +""" +Multi-turn implementation of the Long Horizon Execution task. +This implementation matches the multi-turn evaluation approach from the research paper, +where keys are provided in batches of K per turn, and the model maintains conversation +state to output cumulative sums after each turn. +""" + +import functools +import re + +from inspect_ai.dataset import Sample +from inspect_ai.model import ChatMessageUser, ModelOutput +from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr +from inspect_ai.solver import Generate, TaskState, generate, solver + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + CONTEXT_SIZES, + PROMPT_TEMPLATE_MULTI_FOLLOWUP, + TURN_COMPLEXITIES, +) +from lighteval.tasks.tasks.long_horizon_execution.utils import _build_multi_turn_prompts + + +def multi_turn_prompt_function(line, prompt_length=32768, k=1, task_name: str = None): + """ + Prompt function for non-inspect-ai backend for multi-turn evaluation. + Converts dataset record to Doc object. + Note: For multi-turn, this returns the first turn's prompt. + Subsequent turns are handled by the solver. + """ + initial_prompt, _, expected_per_turn, _ = _build_multi_turn_prompts(line, prompt_length=prompt_length, k=k) + + return Doc( + task_name=task_name, + query=initial_prompt, + choices=[str(expected_per_turn[-1])], # Final sum as choice + gold_index=0, + instruction=initial_prompt, + ) + + +def multi_turn_record_to_sample(record, prompt_length=32768, k=1): + """ + Converts dataset record to inspect-ai Sample object for multi-turn evaluation. + Stores all turn information in metadata for the solver to use. 
+ """ + initial_prompt, _, expected_per_turn, metadata = _build_multi_turn_prompts( + record, prompt_length=prompt_length, k=k + ) + + return Sample( + input=initial_prompt, + target=str(expected_per_turn[-1]), + metadata=metadata, + ) + + +def _extract_response_content(response): + """Extract content from model response object.""" + if hasattr(response, "content"): + return response.content + if hasattr(response, "completion"): + return response.completion + return str(response) + + +async def _process_single_turn(state, turn_chunk, generate_fn): + """Process a single turn: add user message, get model response, add assistant message.""" + keys_str = ", ".join(turn_chunk) + followup_prompt = PROMPT_TEMPLATE_MULTI_FOLLOWUP.format(keys_str=keys_str) + state.messages.append(ChatMessageUser(content=followup_prompt)) + + # generate_fn() takes the state and returns updated state with assistant message added + updated_state = await generate_fn(state) + turn_response = _extract_response_content(updated_state.output.completion if updated_state.output else "") + + return updated_state, turn_response + + +@solver +def multi_turn_solver(): + """ + Solver for multi-turn evaluation. + Loops through turns, calling the model for each turn while maintaining conversation history. + This implements offline evaluation: all turns are called, then evaluation happens. + """ + + async def solve(state: TaskState, generate: Generate): + turn_chunks = state.metadata.get("turn_chunks", []) + + if not turn_chunks: + return state + + # Initialize messages + if not hasattr(state, "messages") or state.messages is None: + state.messages = [] + + if not state.messages: + state.messages.append(ChatMessageUser(content=state.input)) + + all_turn_outputs = [] + + # Process first turn (already in messages as initial prompt) + updated_state = await generate(state) + turn_response = _extract_response_content(updated_state.output.completion if updated_state.output else "") + all_turn_outputs.append(turn_response) + + state = updated_state + + # Process remaining turns + for turn_idx in range(1, len(turn_chunks)): + state, turn_response = await _process_single_turn(state, turn_chunks[turn_idx], generate) + all_turn_outputs.append(turn_response) + + state.metadata["all_turn_outputs"] = all_turn_outputs + + # Set final output + if all_turn_outputs: + if hasattr(state, "output") and state.output is not None: + state.output.completion = all_turn_outputs[-1] + else: + state.output = ModelOutput(completion=all_turn_outputs[-1]) + + return state + + return solve + + +@scorer(metrics={"fractional_accuracy": [accuracy(), stderr()]}) +def multi_turn_scorer(): + """ + Scorer for multi-turn Long Horizon Execution task. + Compares predicted cumulative sums at each turn with expected. + Returns fractional accuracy (correct turns / total turns). 
+ """ + + async def score(state: TaskState, target: Target): + # metadata stored by solver + all_turn_outputs = state.metadata.get("all_turn_outputs", []) + expected_per_turn = state.metadata.get("expected_per_turn", []) + + if not all_turn_outputs: + return Score( + value={"fractional_accuracy": 0.0}, + answer="", + explanation="No turn outputs found in state.metadata", + ) + + if len(all_turn_outputs) != len(expected_per_turn): + return Score( + value={"fractional_accuracy": 0.0}, + answer="", + explanation=f"Mismatch: {len(all_turn_outputs)} outputs vs {len(expected_per_turn)} expected turns", + ) + + parsed_outputs = [] + answer_pattern = re.compile(r"(.*?)", re.DOTALL) + + for turn_output in all_turn_outputs: + match = answer_pattern.search(turn_output) + if match: + try: + parsed_value = int(match.group(1).strip()) + parsed_outputs.append(parsed_value) + except ValueError: + parsed_outputs.append(None) + else: + parsed_outputs.append(None) + + correct_turns = 0 + turn_results = [] + for turn_idx, (pred, exp) in enumerate(zip(parsed_outputs, expected_per_turn)): + is_correct = (pred is not None) and (pred == exp) + if is_correct: + correct_turns += 1 + turn_results.append({"turn": turn_idx + 1, "predicted": pred, "expected": exp, "correct": is_correct}) + + fractional_accuracy = correct_turns / len(expected_per_turn) if expected_per_turn else 0.0 + + return Score( + value={"fractional_accuracy": fractional_accuracy}, + answer=str(parsed_outputs), + explanation=f"Correct {correct_turns}/{len(expected_per_turn)} turns. Details: {turn_results}", + ) + + return score + + +def create_multi_turn_tasks(): + """ + Creates a list of LightevalTaskConfig objects for multi-turn Long Horizon Execution. + Each task corresponds to a different combination of context size and turn complexity (K). + """ + tasks = [] + + for context_size in CONTEXT_SIZES: + for k in TURN_COMPLEXITIES: + task_name = f"long_horizon_execution_multi_k{k}:{context_size}" + prompt_fn = functools.partial(multi_turn_prompt_function, prompt_length=context_size, k=k) + sample_fn = functools.partial(multi_turn_record_to_sample, prompt_length=context_size, k=k) + + task = LightevalTaskConfig( + name=task_name, + prompt_function=prompt_fn, + sample_fields=sample_fn, + solver=[multi_turn_solver(), generate(cache=True)], + scorer=multi_turn_scorer(), + hf_repo="arvindh75/Long-Horizon-Execution", + hf_subset="default", + evaluation_splits=("test",), + generation_size=context_size, + metrics=[Metrics.exact_match], + ) + tasks.append(task) + + return tasks diff --git a/src/lighteval/tasks/tasks/long_horizon_execution/utils.py b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py new file mode 100644 index 000000000..f96acda86 --- /dev/null +++ b/src/lighteval/tasks/tasks/long_horizon_execution/utils.py @@ -0,0 +1,203 @@ +""" +Utility functions for Long Horizon Execution task. +""" + +from lighteval.tasks.tasks.long_horizon_execution.constants import ( + PROMPT_TEMPLATE_MULTI_START, + PROMPT_TEMPLATE_SINGLE, +) + + +def _binary_search_max_items(input_keys, build_prompt_fn, prompt_length, min_items=1): + """ + Generic binary search to find maximum number of items that fit within prompt_length. 
+ Returns: + int: Maximum number of items that fit + """ + # Pre-validate that at least min_items fit within prompt_length + test_prompt = build_prompt_fn(min_items) + if test_prompt is None: + raise ValueError("Cannot build prompt: unable to generate prompt with available items") + + if len(test_prompt) > prompt_length: + item_label = "item" if min_items == 1 else f"{min_items} items" + raise ValueError( + f"Prompt length ({prompt_length} chars) is too small to fit {item_label}. " + f"Minimum required: {len(test_prompt)} chars. " + f"Please increase prompt_length or reduce dataset complexity." + ) + + # Binary search to find maximum n that fits within prompt_length + left, right = min_items, len(input_keys) + max_n = min_items + + while left <= right: + mid = (left + right) // 2 + prompt = build_prompt_fn(mid) + + if prompt is None: + right = mid - 1 + continue + + if len(prompt) <= prompt_length: + max_n = mid + left = mid + 1 + else: + right = mid - 1 + + return max_n + + +def _build_prompt_and_target(record, prompt_length=32768, prompt_template=PROMPT_TEMPLATE_SINGLE): + """ + Helper function to extract common logic for building prompt and target. + Uses binary search to find the maximum number of items that fit within prompt_length. + Processes the record and returns prompt, target, and metadata. + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + prompt_template: Prompt template to use for formatting. Defaults to PROMPT_TEMPLATE_SINGLE. + Returns: + tuple: (prompt: str, target_str: str, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + expected_output = record["output"] + + def build_prompt_for_n(n): + """Build a prompt with the first n items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + keys_str = str(keys_n) + return prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=n) + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + max_n = _binary_search_max_items(input_keys, build_prompt_for_n, prompt_length, min_items=1) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + expected_output = expected_output[:max_n] + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + keys_str = str(input_keys) + prompt = prompt_template.format(dict_str=dict_str, keys_str=keys_str, num_keys=len(input_keys)) + + target_str = str(expected_output[-1]) + + metadata = { + "input_keys": input_keys, + "input_values": input_values, + "expected_output": expected_output, + "dictionary": dictionary, + "num_items": len(input_keys), + } + + return prompt, target_str, metadata + + +def _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k): + """ + Find maximum number of items that fit within prompt_length for multi-turn evaluation. + Uses binary search to find max items where initial prompt (dict + first K keys) fits. 
+ Returns: + int: Maximum number of items that fit + """ + + def build_initial_prompt_for_n(n): + """Build initial prompt with dictionary and first K keys from n total items.""" + if n == 0: + return None + keys_n = input_keys[:n] + values_n = input_values[:n] + dictionary_n = dict(zip(keys_n, values_n)) + dict_str = str(dictionary_n) + + # First turn has first K keys + first_turn_keys = keys_n[:k] + keys_str = ", ".join(first_turn_keys) + + return PROMPT_TEMPLATE_MULTI_START.format( + dict_str=dict_str, + keys_str=keys_str, + k=k, + ) + + return _binary_search_max_items(input_keys, build_initial_prompt_for_n, prompt_length, min_items=k) + + +def _chunk_and_calculate_expected(input_keys, input_values, k): + """ + Chunk keys into turns of size K and calculate expected cumulative sums per turn. + Returns: + tuple: (turn_chunks: list, value_chunks: list, expected_per_turn: list) + """ + # Chunk keys into turns of size K + turn_chunks = [] + value_chunks = [] + for i in range(0, len(input_keys), k): + turn_chunks.append(input_keys[i : i + k]) + value_chunks.append(input_values[i : i + k]) + + # Calculate expected cumulative sums for each turn + expected_per_turn = [] + cumulative_sum = 0 + for values in value_chunks: + cumulative_sum += sum(values) + expected_per_turn.append(cumulative_sum) + + return turn_chunks, value_chunks, expected_per_turn + + +def _build_multi_turn_prompts(record, prompt_length=32768, k=1): + """ + Build prompts for multi-turn evaluation. + Uses binary search to find maximum number of items that fit within prompt_length. + Chunks keys into turns of size K. + Args: + record: Dictionary with 'input', 'values', and 'output' keys + prompt_length: Maximum character length for the prompt. Defaults to 32768. + k: Turn complexity (number of keys per turn). Defaults to 1. + Returns: + tuple: (initial_prompt: str, turn_chunks: list, expected_per_turn: list, metadata: dict) + """ + input_keys = record["input"] + input_values = record["values"] + + # Handle empty input case + if len(input_keys) == 0: + raise ValueError("Cannot build prompt: no items available in record") + + # Find maximum number of items that fit + max_n = _find_max_items_for_multi_turn(input_keys, input_values, prompt_length, k) + + # Use the maximum n that fits + input_keys = input_keys[:max_n] + input_values = input_values[:max_n] + + turn_chunks, _, expected_per_turn = _chunk_and_calculate_expected(input_keys, input_values, k) + + dictionary = dict(zip(input_keys, input_values)) + dict_str = str(dictionary) + + first_turn_keys_str = ", ".join(turn_chunks[0]) + initial_prompt = PROMPT_TEMPLATE_MULTI_START.format(dict_str=dict_str, keys_str=first_turn_keys_str, k=k) + + metadata = { + "turn_chunks": turn_chunks, + "expected_per_turn": expected_per_turn, + "dictionary": dictionary, + "k": k, + "num_turns": len(turn_chunks), + "num_items": len(input_keys), + } + + return initial_prompt, turn_chunks, expected_per_turn, metadata
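Reviewer note (not part of the diff): the per-turn targets and the fractional-accuracy score above reduce to a small amount of arithmetic, so here is a minimal, self-contained Python sketch that mirrors what _chunk_and_calculate_expected and multi_turn_scorer compute. The record, turn size k, and per-turn model outputs below are made up purely for illustration.

import re

# Made-up record in the dataset's shape: parallel key/value lists (hypothetical data).
record = {"input": ["a", "b", "c", "d"], "values": [3, -1, 4, 2]}
k = 2  # keys per turn (turn complexity)

# Chunk values into turns of size k and accumulate expected sums,
# mirroring _chunk_and_calculate_expected.
value_chunks = [record["values"][i : i + k] for i in range(0, len(record["values"]), k)]
expected_per_turn, running = [], 0
for chunk in value_chunks:
    running += sum(chunk)
    expected_per_turn.append(running)
print(expected_per_turn)  # [2, 8]

# Hypothetical per-turn model outputs, parsed and scored the way multi_turn_scorer does.
all_turn_outputs = ["<answer>2</answer>", "<answer>7</answer>"]
pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
parsed = []
for out in all_turn_outputs:
    m = pattern.search(out)
    parsed.append(int(m.group(1).strip()) if m else None)

correct = sum(1 for pred, exp in zip(parsed, expected_per_turn) if pred == exp)
print(correct / len(expected_per_turn))  # 0.5 (fractional accuracy)

With k=2 the four keys form two turns, the expected running sums are [2, 8], and a correct first turn plus an incorrect second turn yields a fractional accuracy of 0.5, which is what the scorer would report for this sample.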