Skip to content

Commit dcd5d27

Browse files
authored
release: v0.1.6
release: v0.1.6 with new features supported
2 parents c9e8a79 + edcf5c6 commit dcd5d27

File tree

6 files changed

+151
-54
lines changed

6 files changed

+151
-54
lines changed

README.md

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,13 @@ You are strongly recommended to use a sandbox such as [docker](https://docs.dock
225225
226226
```bash
227227
# Mount the current directory to the container
228-
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated
229-
# ...Or locally ⚠️
230-
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated
231-
# ...If the ground truth is working locally (due to some flaky tests)
232-
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated --no-gt
228+
# If you want to change the RAM address space limit (in MB, 128 GB by default): `--max-as-limit XXX`
229+
# If you want to change the RAM data segment limit (in MB, 4 GB by default): `--max-data-limit XXX`
230+
# If you want to change the RAM stack limit (in MB, 5 MB by default): `--max-stack-limit XXX`
231+
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
232+
233+
# If you only want to check the ground truths
234+
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --check-gt-only
233235
```
234236
235237
...Or if you want to try it locally regardless of the risks ⚠️:
@@ -245,8 +247,8 @@ Then, run the evaluation:
245247
```bash
246248
# ...Or locally ⚠️
247249
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
248-
# ...If the ground truth is not working locally
249-
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated --no-gt
250+
# ...If you really don't want to check the ground truths
251+
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --no-gt
250252
```
251253
252254
> [!Tip]
@@ -276,8 +278,9 @@ Reading samples...
276278
1140it [00:00, 1901.64it/s]
277279
Evaluating samples...
278280
100%|██████████████████████████████████████████| 1140/1140 [19:53<00:00, 6.75it/s]
279-
bigcodebench
280-
{'pass@1': 0.568}
281+
BigCodeBench-instruct-calibrated
282+
Groundtruth pass rate: 1.000
283+
pass@1: 0.568
281284
```
282285
283286
- The "k" includes `[1, 5, 10]` where k values `<=` the sample size will be used
@@ -330,9 +333,7 @@ We share pre-generated code samples from LLMs we have [evaluated](https://huggin
330333
331334
## Known Issues
332335
333-
- [ ] We notice that some tasks heavily use memory for scientific modeling during testing. This will lead to timeout issues for some machines. If you get an error message like `Check failed: ret == 0 (11 vs. 0)Thread creation via pthread_create() failed.` in Tensorflow, it is very likely due to the memory issue. Try to allocate more memory to the process or reduce the number of parallel processes.
334-
335-
- [ ] Due to the flakes in the evaluation, the execution results may vary slightly (~0.2%) between runs. We are working on improving the evaluation stability.
336+
- [ ] Due to the flakes in the evaluation, the execution results may vary slightly (~0.1%) between runs. We are working on improving the evaluation stability.
336337
337338
- [ ] We are aware of the issue of some users needing to use a proxy to access the internet. We are working on a subset of the tasks that do not require internet access to evaluate the code.
338339

bigcodebench/eval/__init__.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ def unsafe_execute(
110110
code: str,
111111
test_code: str,
112112
timeout: float,
113+
max_as_limit: float,
114+
max_data_limit: float,
115+
max_stack_limit: float,
113116
stat, # Value
114117
details, # Array
115118
):
@@ -123,9 +126,7 @@ def unsafe_execute(
123126
rmdir = os.rmdir
124127
chdir = os.chdir
125128
# Disable functionalities that can make destructive changes to the test.
126-
# allow only 128GB memory usage
127-
maximum_memory_bytes = 128 * 1024 * 1024 * 1024
128-
reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
129+
reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
129130
module_name = "__test__"
130131
new_module = types.ModuleType(module_name)
131132
# Set necessary attributes for the module
@@ -170,11 +171,14 @@ def untrusted_check(
170171
code: str,
171172
test_code: str,
172173
entry_point: str,
174+
max_as_limit: float,
175+
max_data_limit: float,
176+
max_stack_limit: float,
173177
min_time_limit: float = 10,
174178
gt_time_limit: float = 60
175179
) -> Tuple[str, np.ndarray]:
176180
time_limit = max(min_time_limit, gt_time_limit)
177-
timeout = max(os.getenv("EVALPLUS_TIMEOUT_PER_TASK", 120), time_limit) + 1
181+
timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", 120), time_limit) + 1
178182
# shared memory objects
179183
stat = Value("i", _UNKNOWN)
180184
manager = Manager()
@@ -187,6 +191,9 @@ def untrusted_check(
187191
code,
188192
test_code,
189193
timeout,
194+
max_as_limit,
195+
max_data_limit,
196+
max_stack_limit,
190197
stat,
191198
details,
192199
),

bigcodebench/eval/utils.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ class redirect_stdin(contextlib._RedirectStream): # type: ignore
258258
_stream = "stdin"
259259

260260

261-
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
261+
def reliability_guard(max_as_limit, max_data_limit, max_stack_limit):
262262
"""
263263
This disables various destructive functions and prevents the generated code
264264
from interfering with the test (e.g. fork bomb, killing other processes,
@@ -282,18 +282,22 @@ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
282282
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
283283
os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
284284

285-
if maximum_memory_bytes is not None:
285+
if max_as_limit and max_data_limit and max_stack_limit:
286286
import resource
287-
287+
288+
max_as_limit = max_as_limit * 1024 * 1024
289+
max_data_limit = max_data_limit * 1024 * 1024
290+
max_stack_limit = max_stack_limit * 1024 * 1024
291+
288292
resource.setrlimit(
289-
resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
293+
resource.RLIMIT_AS, (max_as_limit, max_as_limit)
290294
)
291295
resource.setrlimit(
292-
resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
296+
resource.RLIMIT_DATA, (max_data_limit, max_data_limit)
293297
)
294298
if not platform.uname().system == "Darwin":
295299
resource.setrlimit(
296-
resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
300+
resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit)
297301
)
298302

299303
faulthandler.disable()

bigcodebench/evaluate.py

Lines changed: 63 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,14 @@
2727
estimate_pass_at_k,
2828
untrusted_check,
2929
)
30-
from bigcodebench.gen.util import trusted_exec
30+
from bigcodebench.gen.util import trusted_check
3131

3232
# 1st item: the status
3333
# 2nd item (optional): the detailed pass/fail boolean for each input
3434
Result = Tuple[str, List[bool]]
3535

3636

37-
def get_groundtruth(problems, hashcode, check_gt_only):
37+
def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit):
3838
cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
3939
if os.path.exists(cache_file):
4040
if check_gt_only:
@@ -47,13 +47,29 @@ def get_groundtruth(problems, hashcode, check_gt_only):
4747
os.makedirs(CACHE_DIR, exist_ok=True)
4848
print("\nAsserting the groundtruth...")
4949
tbegin = time.time()
50-
expected_time = {}
51-
for task_id, problem in tqdm(problems.items()):
52-
expected_time[task_id] = trusted_exec(
53-
problem["complete_prompt"] + "\n" + problem["canonical_solution"],
54-
problem["test"],
55-
problem["task_id"],
56-
)
50+
51+
with ProcessPoolExecutor(max_workers=n_workers) as executor:
52+
futures = []
53+
n_samples = 0
54+
expected_time = dict()
55+
56+
for problem in problems.values():
57+
args = (
58+
problem["complete_prompt"] + "\n" + problem["canonical_solution"],
59+
problem["test"],
60+
problem["task_id"],
61+
max_as_limit,
62+
max_data_limit,
63+
max_stack_limit
64+
)
65+
66+
futures.append(executor.submit(trusted_check, *args))
67+
n_samples += 1
68+
69+
for future in tqdm(as_completed(futures), total=n_samples):
70+
result = future.result()
71+
expected_time[result["task_id"]] = result["time"]
72+
5773
print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")
5874

5975
with open(cache_file, "wb") as f:
@@ -65,9 +81,12 @@ def check_correctness(
6581
completion_id: int,
6682
problem: Dict[str, Any],
6783
solution: str,
84+
max_as_limit: float,
85+
max_data_limit: float,
86+
max_stack_limit: float,
6887
identifier=None,
6988
min_time_limit: float = 0.1,
70-
gt_time_limit: float = 2.0
89+
gt_time_limit: float = 2.0,
7190
) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details)
7291
ret = {
7392
"completion_id": completion_id,
@@ -79,8 +98,11 @@ def check_correctness(
7998
solution,
8099
problem["test"],
81100
problem["entry_point"],
101+
max_as_limit,
102+
max_data_limit,
103+
max_stack_limit,
82104
min_time_limit,
83-
gt_time_limit
105+
gt_time_limit,
84106
)
85107
return ret
86108

@@ -101,18 +123,21 @@ def evaluate(flags):
101123
assert flags.samples.endswith(".jsonl")
102124
result_path = flags.samples.replace(".jsonl", "_eval_results.json")
103125

126+
problems = get_bigcodebench()
127+
dataset_hash = get_bigcodebench_hash()
128+
129+
if not flags.no_gt:
130+
expected_time = get_groundtruth(n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit)
131+
else:
132+
expected_time = {task_id: None for task_id in problems}
133+
104134
if os.path.isfile(result_path):
105135
print(f"Load from previous results from {result_path}")
106136
with open(result_path, "r") as f:
107137
results = json.load(f)
108138

109139
results = compatible_eval_result(results)
110140
else:
111-
problems = get_bigcodebench()
112-
dataset_hash = get_bigcodebench_hash()
113-
expected_time = None
114-
if not flags.no_gt:
115-
expected_time = get_groundtruth(problems, dataset_hash, flags.check_gt_only)
116141

117142
if flags.check_gt_only:
118143
return
@@ -150,9 +175,12 @@ def evaluate(flags):
150175
completion_id[task_id],
151176
problems[task_id],
152177
solution,
178+
flags.max_as_limit,
179+
flags.max_data_limit,
180+
flags.max_stack_limit,
153181
sample["_identifier"],
154182
flags.min_time_limit,
155-
expected_time[task_id] if expected_time else 20
183+
expected_time[task_id] if expected_time[task_id] else 20
156184
)
157185
futures.append(executor.submit(check_correctness, *args))
158186
completion_id[task_id] += 1
@@ -208,7 +236,21 @@ def stucking_checker():
208236
for k in [1, 5, 10, 25, 100]
209237
if total.min() >= k
210238
}
211-
cprint(f"BigCodeBench-{flags.subset}", "green")
239+
240+
mode = "-calibrated" if "sanitized-calibrated" in flags.samples else ""
241+
flags.subset = flags.subset[0].upper() + flags.subset[1:]
242+
cprint(f"BigCodeBench-{flags.subset}{mode}", "green")
243+
244+
gt_pass_rate = np.mean([1 if v is not None else 0 for v in expected_time.values()])
245+
246+
if flags.no_gt:
247+
cprint(f"Groundtruth is not checked", "yellow")
248+
else:
249+
if gt_pass_rate > 0.95:
250+
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
251+
else:
252+
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
253+
212254
for k, v in pass_at_k.items():
213255
cprint(f"{k}:\t{v:.3f}", "green")
214256

@@ -240,6 +282,9 @@ def main():
240282
parser.add_argument("--samples", required=True, type=str)
241283
parser.add_argument("--parallel", default=None, type=int)
242284
parser.add_argument("--min-time-limit", default=1, type=float)
285+
parser.add_argument("--max-as-limit", default=128*1024, type=float)
286+
parser.add_argument("--max-data-limit", default=4*1024, type=float)
287+
parser.add_argument("--max-stack-limit", default=5, type=float)
243288
parser.add_argument(
244289
"--check-gt-only", action="store_true", help="Check the groundtruth"
245290
)

bigcodebench/gen/util/__init__.py

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import os
12
import time
23
import sys
34
import types
45
import unittest
5-
6+
import multiprocessing
7+
from multiprocessing import Array, Value, Manager
68
from bigcodebench.eval.utils import (
79
create_tempdir,
810
reliability_guard,
@@ -12,7 +14,7 @@
1214
)
1315

1416

15-
def trusted_exec(code, test_code, task_id):
17+
def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_stack_limit, times):
1618
"""Execute trusted code in place."""
1719

1820
with create_tempdir():
@@ -25,8 +27,7 @@ def trusted_exec(code, test_code, task_id):
2527
chdir = os.chdir
2628
module_name = "__test__"
2729
new_module = types.ModuleType(module_name)
28-
maximum_memory_bytes = 128 * 1024 * 1024 * 1024
29-
reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
30+
reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
3031
# Set necessary attributes for the module
3132
new_module.__dict__.update({
3233
'__builtins__': builtins,
@@ -50,19 +51,19 @@ def trusted_exec(code, test_code, task_id):
5051
suite = loader.loadTestsFromTestCase(TestCases)
5152
test_result = unittest.TestResult()
5253
start = time.time()
53-
with safe_environment(), swallow_io():
54+
with safe_environment(), swallow_io(), time_limit(seconds=120):
5455
suite.run(test_result)
55-
for test, trace in test_result.failures + test_result.errors:
56-
print(trace)
56+
57+
if len(test_result.failures + test_result.errors) > 0:
58+
times.value = -1
59+
else:
60+
times.value = time.time() - start
61+
5762
# Needed for cleaning up.
5863
shutil.rmtree = rmtree
5964
os.rmdir = rmdir
6065
os.chdir = chdir
61-
assert len(
62-
test_result.failures + test_result.errors
63-
) == 0, f"{task_id} failed with errors: {test_result.errors} and failures: {test_result.failures}"
6466

65-
return time.time() - start
6667

6768
def trusted_check_exec(code, inputs):
6869
"""Check trusted_exec success."""
@@ -72,3 +73,45 @@ def trusted_check_exec(code, inputs):
7273
except Exception:
7374
return False
7475
return True
76+
77+
78+
def trusted_check(
79+
code: str,
80+
test_code: str,
81+
task_id: str,
82+
max_as_limit: float,
83+
max_data_limit: float,
84+
max_stack_limit: float,
85+
):
86+
timeout = os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", 120) + 1
87+
# shared memory objects
88+
times = Value("d")
89+
manager = Manager()
90+
91+
p = multiprocessing.Process(
92+
target=trusted_exec,
93+
args=(
94+
code,
95+
test_code,
96+
task_id,
97+
max_as_limit,
98+
max_data_limit,
99+
max_stack_limit,
100+
times,
101+
),
102+
)
103+
p.start()
104+
p.join(timeout=timeout+1)
105+
if p.is_alive():
106+
p.terminate()
107+
time.sleep(0.1)
108+
if p.is_alive():
109+
p.kill()
110+
time.sleep(0.1)
111+
112+
if times.value == -1:
113+
times = -1
114+
else:
115+
times = times.value
116+
117+
return {"task_id": task_id, "time": times}

0 commit comments

Comments
 (0)