Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
cca96cb
migrate first commit
HeyyyyyyG Sep 5, 2025
e03e381
fix pyproject.toml
HeyyyyyyG Sep 5, 2025
fa7b54b
refactor and fix bugs
HeyyyyyyG Sep 6, 2025
02721b4
rewrite tests
HeyyyyyyG Sep 8, 2025
91d4bfb
license
HeyyyyyyG Sep 8, 2025
8e06abe
license
HeyyyyyyG Sep 8, 2025
f397cb3
fix format
HeyyyyyyG Sep 8, 2025
66d3f3a
add datasets
HeyyyyyyG Sep 9, 2025
f5f13b5
add example data
HeyyyyyyG Sep 9, 2025
f6bce8e
update readme
HeyyyyyyG Sep 9, 2025
a545253
swe changes
Oct 28, 2025
a7e3df4
minor changes
Oct 28, 2025
2e7eda9
token ids added
Nov 6, 2025
b39323f
migrate first commit
HeyyyyyyG Sep 5, 2025
1e32f18
fix pyproject.toml
HeyyyyyyG Sep 5, 2025
2db2001
refactor and fix bugs
HeyyyyyyG Sep 6, 2025
c4fa0d5
rewrite tests
HeyyyyyyG Sep 8, 2025
f96d52d
license
HeyyyyyyG Sep 8, 2025
532b4c4
license
HeyyyyyyG Sep 8, 2025
79120dc
fix format
HeyyyyyyG Sep 8, 2025
1729735
add datasets
HeyyyyyyG Sep 9, 2025
21481c9
add example data
HeyyyyyyG Sep 9, 2025
168d14c
update readme
HeyyyyyyG Sep 9, 2025
a465a4f
swe changes
Oct 28, 2025
cdd8e91
minor changes
Oct 28, 2025
b9f3840
token ids added
Nov 6, 2025
b82713c
add assetions
slikhite-1 Nov 8, 2025
74a431a
local changes before pull
slikhite-1 Nov 9, 2025
ab1f525
changes for merge
slikhite-1 Nov 10, 2025
1f9bbf1
version changes
slikhite-1 Nov 11, 2025
af2ad56
code for ray added
slikhite-1 Nov 12, 2025
186a228
concurrency added
slikhite-1 Nov 12, 2025
54449d7
ray global ip changes added
slikhite-1 Nov 13, 2025
8ba0116
feat: local openhands
sdevare-nv Nov 16, 2025
dc4f1a0
fix: wrong folder bug
sdevare-nv Nov 17, 2025
3305c3d
feat: add tokens to final traj
sdevare-nv Nov 17, 2025
2d296ee
feat: add instance dict
sdevare-nv Nov 17, 2025
77a2957
feat: add swebench eval local setup
sdevare-nv Nov 17, 2025
3386cca
cleanup
sdevare-nv Nov 17, 2025
52da10c
feat: max retries
sdevare-nv Nov 17, 2025
e940ad3
feat: update oh with cookie info
sdevare-nv Nov 17, 2025
11dcabc
fix: token ids
sdevare-nv Nov 17, 2025
e03d947
feat: add dataset path and bump version
sdevare-nv Nov 18, 2025
12ffaa6
feat: add cleanup
sdevare-nv Nov 25, 2025
77bb3c5
feat: disable logging
sdevare-nv Nov 30, 2025
b15bfb1
refactor
sdevare-nv Dec 7, 2025
7ffc15d
feat: add locking
sdevare-nv Dec 7, 2025
124c361
feat: internal data support
sdevare-nv Dec 8, 2025
2c6cd6f
fix: test verifier
sdevare-nv Dec 10, 2025
c393126
fix: test verifier
sdevare-nv Dec 10, 2025
7cf0335
feat: temp rollout
sdevare-nv Dec 10, 2025
9f1da35
feat: remove cryptography
sdevare-nv Dec 10, 2025
f4411f6
feat: venv readonly
sdevare-nv Dec 10, 2025
54d0bab
feat: readonly miniforge
sdevare-nv Dec 10, 2025
be61f8a
Add support for R2E gym
slikhite-1 Dec 17, 2025
84e60ad
refactor
sdevare-nv Dec 18, 2025
8fadd88
feat: bump oh version
sdevare-nv Dec 18, 2025
0dfefb3
feat: multiple container paths
sdevare-nv Dec 18, 2025
cda91e7
fix: uv sync in r2e setup
sdevare-nv Dec 20, 2025
e208b65
feat: venv rename
sdevare-nv Dec 20, 2025
de51991
feat: fix agent config
sdevare-nv Dec 20, 2025
1a8c081
fix: .venv check
sdevare-nv Dec 21, 2025
5b065ae
feat: update openhands
sdevare-nv Jan 5, 2026
ca6dbf3
Merge remote-tracking branch 'origin/main' into sdd/sl/swe
sdevare-nv Jan 5, 2026
bc2e9d6
feat: revert vllm model change
sdevare-nv Jan 5, 2026
284f5f5
cleanup
sdevare-nv Jan 8, 2026
9de0f75
fix: uv lock
sdevare-nv Jan 8, 2026
be4bba5
feat: add orjson in server utils
sdevare-nv Jan 9, 2026
1c5bc41
feat: bump oh
sdevare-nv Jan 9, 2026
9c665e5
feat: make everything ro
sdevare-nv Jan 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nemo_gym/global_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,4 +489,4 @@ def format_almost_server_warning(server_name: str, error: ValidationError) -> st
{error_str}

This server will NOT be started.
"""
"""
32 changes: 22 additions & 10 deletions nemo_gym/openai_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Union,
)

import orjson
from openai.types.chat import (
ChatCompletion,
ChatCompletionAssistantMessageParam,
Expand Down Expand Up @@ -474,33 +475,44 @@ async def _raise_for_status(self, response: ClientResponse, request_kwargs: Dict
async def create_chat_completion(self, **kwargs):
    """POST ``kwargs`` as a JSON body to ``{base_url}/chat/completions``.

    Args:
        **kwargs: Request payload; serialized as-is with ``orjson.dumps``.

    Returns:
        The response body decoded with ``orjson.loads``.

    Raises:
        Whatever ``self._raise_for_status`` raises for the response.
    """
    request_kwargs = dict(
        url=f"{self.base_url}/chat/completions",
        # Exactly one body argument and one ``headers`` argument. The body is
        # pre-serialized bytes, so the JSON content type is declared explicitly.
        data=orjson.dumps(kwargs),
        headers={
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        },
    )
    response = await self._request(method="POST", **request_kwargs)

    await self._raise_for_status(response, request_kwargs)
    # Decode the raw bytes with orjson rather than going through response.json().
    response_dict = orjson.loads(await response.read())
    return response_dict

async def create_response(self, **kwargs):
    """POST ``kwargs`` as a JSON body to ``{base_url}/responses``.

    Args:
        **kwargs: Request payload; serialized as-is with ``orjson.dumps``.

    Returns:
        The response body decoded with ``orjson.loads``.

    Raises:
        Whatever ``self._raise_for_status`` raises for the response.
    """
    request_kwargs = dict(
        url=f"{self.base_url}/responses",
        # Exactly one body argument and one ``headers`` argument. The body is
        # pre-serialized bytes, so the JSON content type is declared explicitly.
        data=orjson.dumps(kwargs),
        headers={
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        },
    )
    response = await self._request(method="POST", **request_kwargs)

    await self._raise_for_status(response, request_kwargs)
    # Decode the raw bytes with orjson rather than going through response.json().
    response_dict = orjson.loads(await response.read())
    return response_dict

async def create_tokenize(self, **kwargs):
    """POST ``kwargs`` as a JSON body to the ``/tokenize`` endpoint.

    The endpoint URL is built from ``self.base_url`` with a trailing ``/v1``
    removed, i.e. ``/tokenize`` is addressed at the server root rather than
    under the ``/v1`` prefix.

    Args:
        **kwargs: Request payload; serialized as-is with ``orjson.dumps``.

    Returns:
        The response body decoded with ``orjson.loads``.

    Raises:
        Whatever ``self._raise_for_status`` raises for the response.
    """
    base_url = self.base_url.removesuffix("/v1")
    request_kwargs = dict(
        url=f"{base_url}/tokenize",
        # Exactly one body argument and one ``headers`` argument. The body is
        # pre-serialized bytes, so the JSON content type is declared explicitly.
        data=orjson.dumps(kwargs),
        headers={
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        },
    )
    response = await self._request(method="POST", **request_kwargs)

    await self._raise_for_status(response, request_kwargs)
    # Decode the raw bytes with orjson rather than going through response.json().
    response_dict = orjson.loads(await response.read())
    return response_dict
2 changes: 1 addition & 1 deletion nemo_gym/rollout_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,4 +154,4 @@ def collect_rollouts(): # pragma: no cover
config = RolloutCollectionConfig.model_validate(get_global_config_dict())
rch = RolloutCollectionHelper()

asyncio.run(rch.run_from_config(config))
asyncio.run(rch.run_from_config(config))
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ dependencies = [
# Updated Thu Dec 04, 2025 with datasets==4.4.1
# License: Apache 2.0 https://github.com/huggingface/datasets/blob/main/LICENSE
"datasets",

# orjson: Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy
# Updated: Thu Jan 08, 2026 with orjson==3.11.3
# License: Apache 2.0 https://github.com/ijl/orjson/blob/master/LICENSE
"orjson",
]

[dependency-groups]
Expand Down
4 changes: 4 additions & 0 deletions responses_api_agents/swe_agents/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
swe_openhands_setup
swe_swebench_setup
swe_r2e_gym_setup
swebench_results_*
229 changes: 229 additions & 0 deletions responses_api_agents/swe_agents/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# Quick Start: Running SWE Agents

This guide shows how to run the SWE agents that use OpenAI GPT-4.1 (or any other model) to solve real-world GitHub issues.

## Prerequisites

1. **Install Apptainer** (for container execution):
```bash
# Install Apptainer on Ubuntu/Debian
apt install -y wget && \
cd /tmp && \
wget https://github.com/apptainer/apptainer/releases/download/v1.4.1/apptainer_1.4.1_amd64.deb && \
apt install -y ./apptainer_1.4.1_amd64.deb

# Verify installation
apptainer --version
```


## Step 1: Configure Your API Key

Create or update your `env.yaml` file in the NeMo-Gym root directory:

```yaml
# For OpenAI models
policy_base_url: https://api.openai.com/v1
policy_api_key: {your OpenAI API key}
policy_model_name: gpt-4.1-2025-04-14
```

You can also host a vLLM model.

Start the vLLM server (in a separate terminal):
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
vllm serve Qwen/Qwen3-Coder-30B-A3B-Instruct \
--max-model-len 131072 \
--enable-expert-parallel \
--tensor-parallel-size 4 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--port 8000 \
--enforce-eager
```
Then set
```yaml
policy_base_url: http://localhost:8000/v1
policy_api_key: dummy
policy_model_name: Qwen/Qwen3-Coder-30B-A3B-Instruct
```


## Step 2: Run the SWE Agents

Start the servers with SWE-agent configuration:

```bash
# Define config paths
# OpenAI model
config_paths="responses_api_agents/swe_agents/configs/swebench_swe_agent.yaml,\
responses_api_models/openai_model/configs/openai_model.yaml"

# or
# vLLM model
config_paths="responses_api_agents/swe_agents/configs/swebench_swe_agent.yaml,\
responses_api_models/vllm_model/configs/vllm_model.yaml"

# Run the servers
# If you have pre-downloaded images, you can set the path with container_formatter, e.g.
ng_run "+config_paths=[$config_paths]" \
+swe_agents.responses_api_agents.swe_agents.container_formatter=/lustre/xxx/images/swe-bench/swebench_sweb.eval.x86_64.\{instance_id\}.sif \
+swe_agents.responses_api_agents.swe_agents.model_server.name=vllm_model

```

To run the OpenHands server, simply replace the SWE-agent config path with the OpenHands config:
```bash
responses_api_agents/swe_agents/configs/swebench_openhands.yaml
```

To learn how to download images and convert them to `.sif`, refer to https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/swe-bench/dump_images.py


You should see output like:
```
INFO: Started server process [1815588]
INFO: Uvicorn running on http://127.0.0.1:25347 (Press CTRL+C to quit)
INFO: Started server process [1815587]
INFO: Uvicorn running on http://127.0.0.1:56809 (Press CTRL+C to quit)
```

## Step 3: Query the Agent

In a new terminal, run the client script:

```bash
python responses_api_agents/swe_agents/client.py
```


## Advanced usage: Run Batch Evaluation/Data Collection

For multiple problems, use rollout collection:

```
# Collect rollouts
ng_collect_rollouts +agent_name=swe_agents \
+input_jsonl_fpath=swebench-verified-converted.jsonl \
+output_jsonl_fpath=swebench-verified.openhands.qwen3-30b-coder.jsonl \
+model=Qwen/Qwen3-Coder-30B-A3B-Instruct \
+temperature=0.7 \
+top_p=0.8
```
By default, the concurrency of `ng_collect_rollouts` is 100. You may want to adjust it based on your hardware configuration.

## Step 4: View Results

View the collected results:

```bash
ng_viewer +jsonl_fpath=swebench-verified.openhands.qwen3-30b-coder.jsonl
```


## Expected Output

A successful run will show:
```json
{
"responses_create_params": {
"background": null,
"include": null,
"input": [
{
"content": "You are OpenHands agent, a helpful AI assistant...",
"role": "system",
"type": "message"
},
{
"content": "I've uploaded a python code repository...",
"role": "user",
"type": "message"
}
],
"instructions": null,
"max_output_tokens": null,
"max_tool_calls": null,
"metadata": {
"instance_id": "astropy__astropy-12907",
"base_commit": "d16bfe05a744909de4b27f5875fe0d4ed41ce607",
"dataset_name": "princeton-nlp/SWE-bench_Verified",
"split": "test",
"problem_statement": "Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels\nConsider the following model:\n\n```python\nfrom astropy.modeling import models as m\nfrom astropy.modeling.separable import separability_matrix\n\ncm = m.Linear1D(10) & m.Linear1D(5)\n```\n\nIt's separability matrix as you might expect is a diagonal:\n\n```python\n>>> separability_matrix(cm)\narray([[ True, False],\n[False, True]])\n```\n\nIf I make the model more complex:\n```python\n>>> separability_matrix(m.Pix2Sky_TAN() & m.Linear1D(10) & m.Linear1D(5))\narray([[ True, True, False, False],\n[ True, True, False, False],\n[False, False, True, False],\n[False, False, False, True]])\n```\n\nThe output matrix is again, as expected, the outputs and inputs to the linear models are separable and independent of each other.\n\nIf however, I nest these compound models:\n```python\n>>> separability_matrix(m.Pix2Sky_TAN() & cm)\narray([[ True, True, False, False],\n[ True, True, False, False],\n[False, False, True, True],\n[False, False, True, True]])\n```\nSuddenly the inputs and outputs are no longer separable?\n\nThis feels like a bug to me, but I might be missing something?"
},
"model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
"parallel_tool_calls": true,
"previous_response_id": null,
"prompt": null,
"reasoning": null,
"service_tier": null,
"store": null,
"temperature": 0.7,
"text": null,
"tool_choice": "auto",
"tools": [...]
},
"response": {
"id": "swebench-astropy__astropy-12907",
"created_at": 1757366053,
"error": null,
"incomplete_details": null,
"instructions": null,
"metadata": null,
"model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
"object": "response",
"output": [
{
"id": "msg-2",
"content": [
{
"annotations": [],
"text": "I'll help you implement the necessary changes...",
"type": "output_text",
"logprobs": null
}
],
"role": "assistant",
"status": "completed",
"type": "message"
}
],
"parallel_tool_calls": true,
"temperature": null,
"tool_choice": "auto",
"tools": [...],
"top_p": null,
"background": null,
"max_output_tokens": null,
"max_tool_calls": null,
"previous_response_id": null,
"prompt": null,
"reasoning": null,
"service_tier": null,
"status": null,
"text": null,
"top_logprobs": null,
"truncation": null,
"usage": null,
"user": null
},
"reward": 1.0,
"swebench_metrics": {
"patch_is_None": false,
"patch_exists": true,
"patch_successfully_applied": true,
"resolved": true
},
"resolved": 1,
"patch_exists": 1,
"patch_successfully_applied": 1,
"metadata": {
"instance_id": "astropy__astropy-12907",
"agent_framework": "openhands",
"patch_exists": true,
"patch_successfully_applied": true,
"resolved": true
}
}
```
22 changes: 22 additions & 0 deletions responses_api_agents/swe_agents/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""SWE-bench wrapper agent for NeMo-Gym.

This module provides integration between NeMo-Skills' SWE-bench evaluation
capabilities and NeMo-Gym's agent framework.
"""

from .app import (
SWEBenchRunRequest,
SWEBenchVerifyRequest,
SWEBenchVerifyResponse,
SWEBenchWrapper,
SWEBenchWrapperConfig,
)


# Public API of this package: the names imported from .app above.
__all__ = [
"SWEBenchWrapper",
"SWEBenchWrapperConfig",
"SWEBenchRunRequest",
"SWEBenchVerifyRequest",
"SWEBenchVerifyResponse",
]
Loading
Loading