NVIDIA-NeMo · stephencge · Jan 8, 2026 · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/resources_servers/math_formal_lean/README.md b/resources_servers/math_formal_lean/README.md
@@ -0,0 +1,48 @@
+# Math Formal Lean Resource Server
+
+Lean4 formal proof verification environment for NeMo Gym.
+
+## Overview
+
+This environment verifies Lean4 mathematical proofs generated by language models. The model receives a theorem statement and must produce a valid proof that compiles successfully.
+
+## Verification Logic
+
+The `/verify` endpoint:
+1. Extracts Lean4 code from the model's markdown output
+2. Builds a complete proof by combining header imports + theorem statement + generated proof
+3. Sends the code to a Lean4 sandbox container for compilation
+4. Returns `reward=1.0` if compilation succeeds, `reward=0.0` otherwise
+
+Failure modes detected:
+- Syntax errors
+- Type errors
+- Timeout (default 30s)
+- Use of `sorry` tactic
+
+## Data Sources
+
+**MiniF2F Dataset**: 244 formal mathematics problems from the [MiniF2F benchmark](https://github.com/openai/miniF2F), covering:
+- Algebra
+- Number theory
+- Geometry (formalized)
+
+The dataset uses the [NeMo-Skills](https://github.com/NVIDIA-NeMo/Skills) prompt format aligned with deepseek-prover-v2.
+
+## Requirements
+
+- Lean4 sandbox container running on configurable host:port (default `127.0.0.1:6000`)
+- The sandbox must expose `/execute` endpoint accepting JSON with `generated_code`, `language`, `timeout`
+
+## Configuration
+
+Set sandbox location via environment variables or YAML config:
+```yaml
+sandbox_host: ${oc.env:NEMO_SKILLS_SANDBOX_HOST,127.0.0.1}
+sandbox_port: ${oc.env:NEMO_SKILLS_SANDBOX_PORT,6000}
+```
+
+## License
+
+- **Code**: Apache-2.0
+- **MiniF2F Data**: MIT License (from OpenAI MiniF2F repository)
diff --git a/resources_servers/math_formal_lean/__init__.py b/resources_servers/math_formal_lean/__init__.py
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Lean4 formal proof verification resource server for NeMo Gym."""
diff --git a/resources_servers/math_formal_lean/app.py b/resources_servers/math_formal_lean/app.py
@@ -0,0 +1,244 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Lean4 formal proof verification resource server."""
+
+import logging
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel
+
+from nemo_gym.base_resources_server import (
+    BaseResourcesServerConfig,
+    BaseRunRequest,
+    BaseVerifyRequest,
+    BaseVerifyResponse,
+    SimpleResourcesServer,
+)
+from resources_servers.math_formal_lean.sandbox_client import Lean4SandboxClient
+
+
+LOG = logging.getLogger(__name__)
+
+
+@dataclass
+class ProofBuildConfig:
+    final_answer_key: Optional[str] = None
+    extract_code_mode: str = "last"  # "first" or "last"
+    restate_formal_statement: bool = True
+    strip_theorem_from_proof: bool = True
+
+
+def extract_code_block(text: str, languages: Optional[list] = None, extract_code_mode: str = "last") -> str:
+    """Extract code from markdown code blocks."""
+    if languages is None:
+        languages = [""]
+    for language in languages:
+        matches = re.findall(rf"```{language}\s*\n?(.*?)\n?```", text, re.DOTALL)
+        if matches:
+            idx = 0 if extract_code_mode == "first" else -1
+            return matches[idx].strip()
+    return ""
+
+
+def clean_formal_generation(
+    generation: str,
+    final_answer_key: Optional[str] = None,
+    extract_code_mode: str = "last",
+) -> str:
+    """Clean LLM generation to extract Lean code."""
+    if final_answer_key and final_answer_key in generation:
+        generation = generation.split(final_answer_key, 1)[1].strip()
+
+    languages = ["lean4", "lean3", "lean", ""]
+    extracted_code = extract_code_block(generation, languages, extract_code_mode=extract_code_mode)
+    if extracted_code:
+        return extracted_code
+
+    return re.sub(r"^\s*```(?:lean4|lean3|lean)?\s*|\s*```[\s]*$", "", generation).strip()
+
+
+def extract_proof_only(lean_code: str) -> str:
+    """Extract only the proof part from a Lean theorem/example."""
+    lines = lean_code.strip().splitlines()
+    if not lines:
+        return ""
+
+    header_start_pattern = re.compile(r"^\s*(theorem|example)\b")
+    header_start_idx = None
+
+    for i, line in enumerate(lines):
+        if header_start_pattern.match(line):
+            header_start_idx = i
+            break
+
+    if header_start_idx is None:
+        return lean_code.strip()
+
+    header_end_idx = None
+    for i in range(header_start_idx, len(lines)):
+        if ":=" in lines[i]:
+            header_end_idx = i
+            break
+
+    if header_end_idx is None:
+        return lean_code.strip()
+
+    header_line, after = lines[header_end_idx].split(":=", 1)
+    proof_first_line = after.strip()
+
+    if proof_first_line:
+        proof_lines = [proof_first_line] + lines[header_end_idx + 1 :]
+    else:
+        proof_lines = lines[header_end_idx + 1 :]
+
+    if proof_lines:
+        first = proof_lines[0].lstrip()
+        if first == "by":
+            proof_lines = proof_lines[1:]
+        elif first.startswith("by "):
+            proof_lines[0] = first[3:]
+
+    return "\n".join(proof_lines).rstrip()
+
+
+def build_lean4_proof(generation: str, data_point: Dict[str, Any], config: ProofBuildConfig) -> str:
+    """Build a complete Lean4 proof from generation and data point."""
+    cleaned_generation = clean_formal_generation(
+        generation, final_answer_key=config.final_answer_key, extract_code_mode=config.extract_code_mode
+    )
+
+    header = data_point.get("header", "")
+    formal_statement = data_point.get("formal_statement", "") if config.restate_formal_statement else ""
+
+    if config.strip_theorem_from_proof:
+        proof_part = extract_proof_only(cleaned_generation)
+    else:
+        proof_part = cleaned_generation
+
+    return header + formal_statement + proof_part
+
+
+def determine_proof_status(compiler_output: Dict[str, Any]) -> str:
+    """Determine proof status from compiler output."""
+    process_status = compiler_output.get("process_status", "unknown")
+
+    if process_status == "timeout":
+        return "timeout"
+    elif process_status != "completed":
+        return process_status
+
+    stdout = compiler_output.get("stdout", "").lower()
+    stderr = compiler_output.get("stderr", "").lower()
+    combined = stdout + "\n" + stderr
+
+    if re.search(r"\bsorry\b", combined) is not None:
+        return "has_sorry"
+
+    return "completed"
+
+
+class MathFormalLeanResourcesServerConfig(BaseResourcesServerConfig):
+    sandbox_host: str = "127.0.0.1"
+    sandbox_port: int = 6000
+    compilation_timeout: float = 30.0
+    max_output_characters: int = 1000
+    extract_code_mode: str = "last"
+    restate_formal_statement: bool = True
+    strip_theorem_from_proof: bool = True
+
+
+class MathFormalLeanRunRequest(BaseRunRequest):
+    header: str
+    formal_statement: str
+    informal_prefix: Optional[str] = None
+    name: Optional[str] = None
+
+
+class MathFormalLeanVerifyRequest(MathFormalLeanRunRequest, BaseVerifyRequest):
+    pass
+
+
+class CompilerOutput(BaseModel):
+    process_status: str
+    stdout: str
+    stderr: str
+
+
+class MathFormalLeanVerifyResponse(BaseVerifyResponse):
+    proof_status: str
+    predicted_proof: str
+    compiler_output: Optional[CompilerOutput] = None
+
+
+class MathFormalLeanResourcesServer(SimpleResourcesServer):
+    config: MathFormalLeanResourcesServerConfig
+
+    def model_post_init(self, context: Any) -> None:
+        super().model_post_init(context)
+        self._sandbox_client = Lean4SandboxClient(
+            host=self.config.sandbox_host,
+            port=self.config.sandbox_port,
+            max_output_characters=self.config.max_output_characters,
+        )
+        self._proof_build_config = ProofBuildConfig(
+            extract_code_mode=self.config.extract_code_mode,
+            restate_formal_statement=self.config.restate_formal_statement,
+            strip_theorem_from_proof=self.config.strip_theorem_from_proof,
+        )
+
+    async def verify(self, body: MathFormalLeanVerifyRequest) -> MathFormalLeanVerifyResponse:
+        generation = body.response.output_text
+        if not generation or not generation.strip():
+            LOG.warning("Empty generation received")
+            return MathFormalLeanVerifyResponse(
+                **body.model_dump(),
+                reward=0.0,
+                proof_status="empty_generation",
+                predicted_proof="",
+            )
+
+        data_point = {
+            "header": body.header,
+            "formal_statement": body.formal_statement,
+        }
+
+        predicted_proof = build_lean4_proof(
+            generation=generation,
+            data_point=data_point,
+            config=self._proof_build_config,
+        )
+
+        compiler_output = await self._sandbox_client.execute_lean4(
+            code=predicted_proof,
+            timeout=self.config.compilation_timeout,
+        )
+
+        proof_status = determine_proof_status(compiler_output)
+        reward = 1.0 if proof_status == "completed" else 0.0
+
+        return MathFormalLeanVerifyResponse(
+            **body.model_dump(),
+            reward=reward,
+            proof_status=proof_status,
+            predicted_proof=predicted_proof,
+            compiler_output=CompilerOutput(**compiler_output),
+        )
+
+
+if __name__ == "__main__":
+    MathFormalLeanResourcesServer.run_webserver()
diff --git a/resources_servers/math_formal_lean/configs/math_formal_lean.yaml b/resources_servers/math_formal_lean/configs/math_formal_lean.yaml
@@ -0,0 +1,30 @@
+math_formal_lean:
+  resources_servers:
+    math_formal_lean:
+      entrypoint: app.py
+      sandbox_host: ${oc.env:NEMO_SKILLS_SANDBOX_HOST,127.0.0.1}
+      sandbox_port: ${oc.env:NEMO_SKILLS_SANDBOX_PORT,6000}
+      compilation_timeout: 30.0
+      domain: math
+      verified: false
+      description: Lean4 formal proof verification environment
+      value: Improve formal theorem proving capabilities
+
+math_formal_lean_simple_agent:
+  responses_api_agents:
+    simple_agent:
+      entrypoint: app.py
+      resources_server:
+        type: resources_servers
+        name: math_formal_lean
+      model_server:
+        type: responses_api_models
+        name: policy_model
+      datasets:
+      - name: train
+        type: train
+        jsonl_fpath: resources_servers/math_formal_lean/data/minif2f_test.jsonl
+        license: MIT
+      - name: example
+        type: example
+        jsonl_fpath: resources_servers/math_formal_lean/data/minif2f_test.jsonl
diff --git a/resources_servers/math_formal_lean/data/example.jsonl b/resources_servers/math_formal_lean/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units in its volume? Show that it is 65.-/\ntheorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 < b ∧ 0 < h ∧ 0 < v) (h₁ : v = 1 / 3 * (b * h))\n    (h₂ : b = 30) (h₃ : h = 13 / 2) : v = 65 := by\n\n  sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 < b ∧ 0 < h ∧ 0 < v) (h₁ : v = 1 / 3 * (b * h))\n    (h₂ : b = 30) (h₃ : h = 13 / 2) : v = 65 := by\n", "informal_prefix": "/-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units in its volume? Show that it is 65.-/\n", "name": "mathd_algebra_478", "split": "test"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- Show that there are no integers $x$ and $y$ such that $4x^3 - 7y^3 = 2003$.-/\ntheorem numbertheory_4x3m7y3neq2003 (x y : ℤ) : 4 * x ^ 3 - 7 * y ^ 3 ≠ 2003 := by\n\n  sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem numbertheory_4x3m7y3neq2003 (x y : ℤ) : 4 * x ^ 3 - 7 * y ^ 3 ≠ 2003 := by\n", "informal_prefix": "/-- Show that there are no integers $x$ and $y$ such that $4x^3 - 7y^3 = 2003$.-/\n", "name": "numbertheory_4x3m7y3neq2003", "split": "test"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- Let $x$, $y$ and $z$ all exceed $1$ and let $w$ be a positive number such that $\\log_x w = 24$, $\\log_y w = 40$ and $\\log_{xyz} w = 12$. Find $\\log_z w$. Show that it is 060.-/\ntheorem aime_1983_p1 (x y z w : ℕ) (ht : 1 < x ∧ 1 < y ∧ 1 < z) (hw : 0 ≤ w)\n    (h0 : Real.log w / Real.log x = 24) (h1 : Real.log w / Real.log y = 40)\n    (h2 : Real.log w / Real.log (x * y * z) = 12) : Real.log w / Real.log z = 60 := by\n\n  sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem aime_1983_p1 (x y z w : ℕ) (ht : 1 < x ∧ 1 < y ∧ 1 < z) (hw : 0 ≤ w)\n    (h0 : Real.log w / Real.log x = 24) (h1 : Real.log w / Real.log y = 40)\n    (h2 : Real.log w / Real.log (x * y * z) = 12) : Real.log w / Real.log z = 60 := by\n", "informal_prefix": "/-- Let $x$, $y$ and $z$ all exceed $1$ and let $w$ be a positive number such that $\\log_x w = 24$, $\\log_y w = 40$ and $\\log_{xyz} w = 12$. Find $\\log_z w$. Show that it is 060.-/\n", "name": "aime_1983_p1", "split": "test"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- What is the product of all positive odd integers less than $10000$?\n\n$\\text{(A)}\\ \\dfrac{10000!}{(5000!)^2}\\qquad \\text{(B)}\\ \\dfrac{10000!}{2^{5000}}\\qquad\n\\text{(C)}\\ \\dfrac{9999!}{2^{5000}}\\qquad \\text{(D)}\\ \\dfrac{10000!}{2^{5000} \\cdot 5000!}\\qquad\n\\text{(E)}\\ \\dfrac{5000!}{2^{5000}}$ Show that it is \\text{(D)} \\dfrac{10000!}{2^{5000} \\cdot 5000!}.-/\ntheorem amc12_2001_p5 :\n    Finset.prod (Finset.filter (fun x => ¬Even x) (Finset.range 10000)) (id : ℕ → ℕ) =\n      10000! / (2 ^ 5000 * 5000!) := by\n\n  sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem amc12_2001_p5 :\n    Finset.prod (Finset.filter (fun x => ¬Even x) (Finset.range 10000)) (id : ℕ → ℕ) =\n      10000! / (2 ^ 5000 * 5000!) := by\n", "informal_prefix": "/-- What is the product of all positive odd integers less than $10000$?\n\n$\\text{(A)}\\ \\dfrac{10000!}{(5000!)^2}\\qquad \\text{(B)}\\ \\dfrac{10000!}{2^{5000}}\\qquad\n\\text{(C)}\\ \\dfrac{9999!}{2^{5000}}\\qquad \\text{(D)}\\ \\dfrac{10000!}{2^{5000} \\cdot 5000!}\\qquad\n\\text{(E)}\\ \\dfrac{5000!}{2^{5000}}$ Show that it is \\text{(D)} \\dfrac{10000!}{2^{5000} \\cdot 5000!}.-/\n", "name": "amc12_2001_p5", "split": "test"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- A rectangular patio has an area of $180$ square feet and a perimeter of $54$ feet. What is the length of the diagonal (in feet) squared? Show that it is 369.-/\ntheorem mathd_algebra_141 (a b : ℝ) (h₁ : a * b = 180) (h₂ : 2 * (a + b) = 54) :\n    a ^ 2 + b ^ 2 = 369 := by\n\n  sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem mathd_algebra_141 (a b : ℝ) (h₁ : a * b = 180) (h₂ : 2 * (a + b) = 54) :\n    a ^ 2 + b ^ 2 = 369 := by\n", "informal_prefix": "/-- A rectangular patio has an area of $180$ square feet and a perimeter of $54$ feet. What is the length of the diagonal (in feet) squared? Show that it is 369.-/\n", "name": "mathd_algebra_141", "split": "test"}
diff --git a/resources_servers/math_formal_lean/data/example_metrics.json b/resources_servers/math_formal_lean/data/example_metrics.json
@@ -0,0 +1,34 @@
+{
+    "name": "example",
+    "type": "example",
+    "jsonl_fpath": "resources_servers/math_formal_lean/data/example.jsonl",
+    "num_repeats": 1,
+    "gitlab_identifier": null,
+    "huggingface_identifier": null,
+    "license": "MIT",
+    "Number of examples": 5,
+    "Number of tools": {
+        "Total # non-null values": 0,
+        "Average": 0.0,
+        "Min": 0.0,
+        "Max": 0.0,
+        "Median": 0.0,
+        "Standard deviation": 0.0
+    },
+    "Json-dumped number of words (proxy for token count)": {
+        "Total # non-null values": 5,
+        "Average": 500.0,
+        "Min": 400.0,
+        "Max": 600.0,
+        "Median": 500.0,
+        "Standard deviation": 50.0
+    },
+    "Number of turns": {
+        "Total # non-null values": 5,
+        "Average": 1.0,
+        "Min": 1.0,
+        "Max": 1.0,
+        "Median": 1.0,
+        "Standard deviation": 0.0
+    }
+}