Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions resources_servers/math_formal_lean/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Math Formal Lean Resource Server

Lean4 formal proof verification environment for NeMo Gym.

## Overview

This environment verifies Lean4 mathematical proofs generated by language models. The model receives a theorem statement and must produce a valid proof that compiles successfully.

## Verification Logic

The `/verify` endpoint:
1. Extracts Lean4 code from the model's markdown output
2. Builds a complete proof by combining header imports + theorem statement + generated proof
3. Sends the code to a Lean4 sandbox container for compilation
4. Returns `reward=1.0` if compilation succeeds, `reward=0.0` otherwise

Failure modes detected:
- Syntax errors
- Type errors
- Timeout (default 30s)
- Use of `sorry` tactic

## Data Sources

**MiniF2F Dataset**: 244 formal mathematics problems from the [MiniF2F benchmark](https://github.com/openai/miniF2F), covering:
- Algebra
- Number theory
- Geometry (formalized)

The dataset uses the [NeMo-Skills](https://github.com/NVIDIA-NeMo/Skills) prompt format aligned with deepseek-prover-v2.

## Requirements

- Lean4 sandbox container running on configurable host:port (default `127.0.0.1:6000`)
- The sandbox must expose `/execute` endpoint accepting JSON with `generated_code`, `language`, `timeout`

## Configuration

Set sandbox location via environment variables or YAML config:
```yaml
sandbox_host: ${oc.env:NEMO_SKILLS_SANDBOX_HOST,127.0.0.1}
sandbox_port: ${oc.env:NEMO_SKILLS_SANDBOX_PORT,6000}
```

## License

- **Code**: Apache-2.0
- **MiniF2F Data**: MIT License (from OpenAI MiniF2F repository)
16 changes: 16 additions & 0 deletions resources_servers/math_formal_lean/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Lean4 formal proof verification resource server for NeMo Gym."""
244 changes: 244 additions & 0 deletions resources_servers/math_formal_lean/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Lean4 formal proof verification resource server."""

import logging
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional

from pydantic import BaseModel

from nemo_gym.base_resources_server import (
BaseResourcesServerConfig,
BaseRunRequest,
BaseVerifyRequest,
BaseVerifyResponse,
SimpleResourcesServer,
)
from resources_servers.math_formal_lean.sandbox_client import Lean4SandboxClient


LOG = logging.getLogger(__name__)


@dataclass
class ProofBuildConfig:
final_answer_key: Optional[str] = None
extract_code_mode: str = "last" # "first" or "last"
restate_formal_statement: bool = True
strip_theorem_from_proof: bool = True


def extract_code_block(text: str, languages: Optional[list] = None, extract_code_mode: str = "last") -> str:
"""Extract code from markdown code blocks."""
if languages is None:
languages = [""]
for language in languages:
matches = re.findall(rf"```{language}\s*\n?(.*?)\n?```", text, re.DOTALL)
if matches:
idx = 0 if extract_code_mode == "first" else -1
return matches[idx].strip()
return ""


def clean_formal_generation(
generation: str,
final_answer_key: Optional[str] = None,
extract_code_mode: str = "last",
) -> str:
"""Clean LLM generation to extract Lean code."""
if final_answer_key and final_answer_key in generation:
generation = generation.split(final_answer_key, 1)[1].strip()

languages = ["lean4", "lean3", "lean", ""]
extracted_code = extract_code_block(generation, languages, extract_code_mode=extract_code_mode)
if extracted_code:
return extracted_code

return re.sub(r"^\s*```(?:lean4|lean3|lean)?\s*|\s*```[\s]*$", "", generation).strip()


def extract_proof_only(lean_code: str) -> str:
"""Extract only the proof part from a Lean theorem/example."""
lines = lean_code.strip().splitlines()
if not lines:
return ""

header_start_pattern = re.compile(r"^\s*(theorem|example)\b")
header_start_idx = None

for i, line in enumerate(lines):
if header_start_pattern.match(line):
header_start_idx = i
break

if header_start_idx is None:
return lean_code.strip()

header_end_idx = None
for i in range(header_start_idx, len(lines)):
if ":=" in lines[i]:
header_end_idx = i
break

if header_end_idx is None:
return lean_code.strip()

header_line, after = lines[header_end_idx].split(":=", 1)
proof_first_line = after.strip()

if proof_first_line:
proof_lines = [proof_first_line] + lines[header_end_idx + 1 :]
else:
proof_lines = lines[header_end_idx + 1 :]

if proof_lines:
first = proof_lines[0].lstrip()
if first == "by":
proof_lines = proof_lines[1:]
elif first.startswith("by "):
proof_lines[0] = first[3:]

return "\n".join(proof_lines).rstrip()


def build_lean4_proof(generation: str, data_point: Dict[str, Any], config: ProofBuildConfig) -> str:
"""Build a complete Lean4 proof from generation and data point."""
cleaned_generation = clean_formal_generation(
generation, final_answer_key=config.final_answer_key, extract_code_mode=config.extract_code_mode
)

header = data_point.get("header", "")
formal_statement = data_point.get("formal_statement", "") if config.restate_formal_statement else ""

if config.strip_theorem_from_proof:
proof_part = extract_proof_only(cleaned_generation)
else:
proof_part = cleaned_generation

return header + formal_statement + proof_part


def determine_proof_status(compiler_output: Dict[str, Any]) -> str:
"""Determine proof status from compiler output."""
process_status = compiler_output.get("process_status", "unknown")

if process_status == "timeout":
return "timeout"
elif process_status != "completed":
return process_status

stdout = compiler_output.get("stdout", "").lower()
stderr = compiler_output.get("stderr", "").lower()
combined = stdout + "\n" + stderr

if re.search(r"\bsorry\b", combined) is not None:
return "has_sorry"

return "completed"


class MathFormalLeanResourcesServerConfig(BaseResourcesServerConfig):
sandbox_host: str = "127.0.0.1"
sandbox_port: int = 6000
compilation_timeout: float = 30.0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this too low? I guess we don't want to go aggressive with timeouts for training, but curious if you have an estimation of how much time we'd be hitting timeouts on valid proofs with 30 seconds

max_output_characters: int = 1000
extract_code_mode: str = "last"
restate_formal_statement: bool = True
strip_theorem_from_proof: bool = True


class MathFormalLeanRunRequest(BaseRunRequest):
header: str
formal_statement: str
informal_prefix: Optional[str] = None
name: Optional[str] = None


class MathFormalLeanVerifyRequest(MathFormalLeanRunRequest, BaseVerifyRequest):
pass


class CompilerOutput(BaseModel):
process_status: str
stdout: str
stderr: str


class MathFormalLeanVerifyResponse(BaseVerifyResponse):
proof_status: str
predicted_proof: str
compiler_output: Optional[CompilerOutput] = None


class MathFormalLeanResourcesServer(SimpleResourcesServer):
config: MathFormalLeanResourcesServerConfig

def model_post_init(self, context: Any) -> None:
super().model_post_init(context)
self._sandbox_client = Lean4SandboxClient(
host=self.config.sandbox_host,
port=self.config.sandbox_port,
max_output_characters=self.config.max_output_characters,
)
self._proof_build_config = ProofBuildConfig(
extract_code_mode=self.config.extract_code_mode,
restate_formal_statement=self.config.restate_formal_statement,
strip_theorem_from_proof=self.config.strip_theorem_from_proof,
)

async def verify(self, body: MathFormalLeanVerifyRequest) -> MathFormalLeanVerifyResponse:
generation = body.response.output_text
if not generation or not generation.strip():
LOG.warning("Empty generation received")
return MathFormalLeanVerifyResponse(
**body.model_dump(),
reward=0.0,
proof_status="empty_generation",
predicted_proof="",
)

data_point = {
"header": body.header,
"formal_statement": body.formal_statement,
}

predicted_proof = build_lean4_proof(
generation=generation,
data_point=data_point,
config=self._proof_build_config,
)

compiler_output = await self._sandbox_client.execute_lean4(
code=predicted_proof,
timeout=self.config.compilation_timeout,
)

proof_status = determine_proof_status(compiler_output)
reward = 1.0 if proof_status == "completed" else 0.0

return MathFormalLeanVerifyResponse(
**body.model_dump(),
reward=reward,
proof_status=proof_status,
predicted_proof=predicted_proof,
compiler_output=CompilerOutput(**compiler_output),
)


if __name__ == "__main__":
MathFormalLeanResourcesServer.run_webserver()
30 changes: 30 additions & 0 deletions resources_servers/math_formal_lean/configs/math_formal_lean.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
math_formal_lean:
resources_servers:
math_formal_lean:
entrypoint: app.py
sandbox_host: ${oc.env:NEMO_SKILLS_SANDBOX_HOST,127.0.0.1}
sandbox_port: ${oc.env:NEMO_SKILLS_SANDBOX_PORT,6000}
compilation_timeout: 30.0
domain: math
verified: false
description: Lean4 formal proof verification environment
value: Improve formal theorem proving capabilities

math_formal_lean_simple_agent:
responses_api_agents:
simple_agent:
entrypoint: app.py
resources_server:
type: resources_servers
name: math_formal_lean
model_server:
type: responses_api_models
name: policy_model
datasets:
- name: train
type: train
jsonl_fpath: resources_servers/math_formal_lean/data/minif2f_test.jsonl
license: MIT
- name: example
type: example
jsonl_fpath: resources_servers/math_formal_lean/data/minif2f_test.jsonl
5 changes: 5 additions & 0 deletions resources_servers/math_formal_lean/data/example.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units in its volume? Show that it is 65.-/\ntheorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 < b ∧ 0 < h ∧ 0 < v) (h₁ : v = 1 / 3 * (b * h))\n (h₂ : b = 30) (h₃ : h = 13 / 2) : v = 65 := by\n\n sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 < b ∧ 0 < h ∧ 0 < v) (h₁ : v = 1 / 3 * (b * h))\n (h₂ : b = 30) (h₃ : h = 13 / 2) : v = 65 := by\n", "informal_prefix": "/-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units in its volume? Show that it is 65.-/\n", "name": "mathd_algebra_478", "split": "test"}
{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- Show that there are no integers $x$ and $y$ such that $4x^3 - 7y^3 = 2003$.-/\ntheorem numbertheory_4x3m7y3neq2003 (x y : ℤ) : 4 * x ^ 3 - 7 * y ^ 3 ≠ 2003 := by\n\n sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem numbertheory_4x3m7y3neq2003 (x y : ℤ) : 4 * x ^ 3 - 7 * y ^ 3 ≠ 2003 := by\n", "informal_prefix": "/-- Show that there are no integers $x$ and $y$ such that $4x^3 - 7y^3 = 2003$.-/\n", "name": "numbertheory_4x3m7y3neq2003", "split": "test"}
{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- Let $x$, $y$ and $z$ all exceed $1$ and let $w$ be a positive number such that $\\log_x w = 24$, $\\log_y w = 40$ and $\\log_{xyz} w = 12$. Find $\\log_z w$. Show that it is 060.-/\ntheorem aime_1983_p1 (x y z w : ℕ) (ht : 1 < x ∧ 1 < y ∧ 1 < z) (hw : 0 ≤ w)\n (h0 : Real.log w / Real.log x = 24) (h1 : Real.log w / Real.log y = 40)\n (h2 : Real.log w / Real.log (x * y * z) = 12) : Real.log w / Real.log z = 60 := by\n\n sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem aime_1983_p1 (x y z w : ℕ) (ht : 1 < x ∧ 1 < y ∧ 1 < z) (hw : 0 ≤ w)\n (h0 : Real.log w / Real.log x = 24) (h1 : Real.log w / Real.log y = 40)\n (h2 : Real.log w / Real.log (x * y * z) = 12) : Real.log w / Real.log z = 60 := by\n", "informal_prefix": "/-- Let $x$, $y$ and $z$ all exceed $1$ and let $w$ be a positive number such that $\\log_x w = 24$, $\\log_y w = 40$ and $\\log_{xyz} w = 12$. Find $\\log_z w$. Show that it is 060.-/\n", "name": "aime_1983_p1", "split": "test"}
{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- What is the product of all positive odd integers less than $10000$?\n\n$\\text{(A)}\\ \\dfrac{10000!}{(5000!)^2}\\qquad \\text{(B)}\\ \\dfrac{10000!}{2^{5000}}\\qquad\n\\text{(C)}\\ \\dfrac{9999!}{2^{5000}}\\qquad \\text{(D)}\\ \\dfrac{10000!}{2^{5000} \\cdot 5000!}\\qquad\n\\text{(E)}\\ \\dfrac{5000!}{2^{5000}}$ Show that it is \\text{(D)} \\dfrac{10000!}{2^{5000} \\cdot 5000!}.-/\ntheorem amc12_2001_p5 :\n Finset.prod (Finset.filter (fun x => ¬Even x) (Finset.range 10000)) (id : ℕ → ℕ) =\n 10000! / (2 ^ 5000 * 5000!) := by\n\n sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem amc12_2001_p5 :\n Finset.prod (Finset.filter (fun x => ¬Even x) (Finset.range 10000)) (id : ℕ → ℕ) =\n 10000! / (2 ^ 5000 * 5000!) := by\n", "informal_prefix": "/-- What is the product of all positive odd integers less than $10000$?\n\n$\\text{(A)}\\ \\dfrac{10000!}{(5000!)^2}\\qquad \\text{(B)}\\ \\dfrac{10000!}{2^{5000}}\\qquad\n\\text{(C)}\\ \\dfrac{9999!}{2^{5000}}\\qquad \\text{(D)}\\ \\dfrac{10000!}{2^{5000} \\cdot 5000!}\\qquad\n\\text{(E)}\\ \\dfrac{5000!}{2^{5000}}$ Show that it is \\text{(D)} \\dfrac{10000!}{2^{5000} \\cdot 5000!}.-/\n", "name": "amc12_2001_p5", "split": "test"}
{"responses_create_params": {"input": [{"role": "user", "content": "Complete the following Lean 4 code:\n\n```lean4\nimport Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- A rectangular patio has an area of $180$ square feet and a perimeter of $54$ feet. What is the length of the diagonal (in feet) squared? Show that it is 369.-/\ntheorem mathd_algebra_141 (a b : ℝ) (h₁ : a * b = 180) (h₂ : 2 * (a + b) = 54) :\n a ^ 2 + b ^ 2 = 369 := by\n\n sorry\n```\n\nBefore producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.\nThe plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof."}]}, "header": "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n", "formal_statement": "theorem mathd_algebra_141 (a b : ℝ) (h₁ : a * b = 180) (h₂ : 2 * (a + b) = 54) :\n a ^ 2 + b ^ 2 = 369 := by\n", "informal_prefix": "/-- A rectangular patio has an area of $180$ square feet and a perimeter of $54$ feet. What is the length of the diagonal (in feet) squared? Show that it is 369.-/\n", "name": "mathd_algebra_141", "split": "test"}
34 changes: 34 additions & 0 deletions resources_servers/math_formal_lean/data/example_metrics.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"name": "example",
"type": "example",
"jsonl_fpath": "resources_servers/math_formal_lean/data/example.jsonl",
"num_repeats": 1,
"gitlab_identifier": null,
"huggingface_identifier": null,
"license": "MIT",
"Number of examples": 5,
"Number of tools": {
"Total # non-null values": 0,
"Average": 0.0,
"Min": 0.0,
"Max": 0.0,
"Median": 0.0,
"Standard deviation": 0.0
},
"Json-dumped number of words (proxy for token count)": {
"Total # non-null values": 5,
"Average": 500.0,
"Min": 400.0,
"Max": 600.0,
"Median": 500.0,
"Standard deviation": 50.0
},
"Number of turns": {
"Total # non-null values": 5,
"Average": 1.0,
"Min": 1.0,
"Max": 1.0,
"Median": 1.0,
"Standard deviation": 0.0
}
}
Loading