Skip to content

Commit a06bf00

Browse files
committed
feat: add trust check
1 parent 4902151 commit a06bf00

File tree

2 files changed

+51
-6
lines changed

2 files changed

+51
-6
lines changed

bigcodebench/evaluate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def get_groundtruth(problems, hashcode, check_gt_only, max_as_limit, max_data_li
4949
tbegin = time.time()
5050
expected_time = {}
5151
for task_id, problem in tqdm(problems.items()):
52-
expected_time[task_id] = trusted_exec(
52+
expected_time[task_id] = trusted_check(
5353
problem["complete_prompt"] + "\n" + problem["canonical_solution"],
5454
problem["test"],
5555
problem["task_id"],

bigcodebench/gen/util/__init__.py

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import sys
33
import types
44
import unittest
5-
5+
import multiprocessing
6+
from multiprocessing import Array, Value, Manager
67
from bigcodebench.eval.utils import (
78
create_tempdir,
89
reliability_guard,
@@ -12,7 +13,7 @@
1213
)
1314

1415

15-
def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_stack_limit):
16+
def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_stack_limit, times):
1617
"""Execute trusted code in place."""
1718

1819
with create_tempdir():
@@ -51,15 +52,17 @@ def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_sta
5152
start = time.time()
5253
with safe_environment(), swallow_io(), time_limit(seconds=120):
5354
suite.run(test_result)
55+
56+
if len(test_result.failures + test_result.errors) > 0:
57+
times.value = -1
58+
else:
59+
times.value = time.time() - start
5460

5561
# Needed for cleaning up.
5662
shutil.rmtree = rmtree
5763
os.rmdir = rmdir
5864
os.chdir = chdir
5965

60-
if len(test_result.failures + test_result.errors) > 0:
61-
return None
62-
return time.time() - start
6366

6467
def trusted_check_exec(code, inputs):
6568
"""Check trusted_exec success."""
@@ -69,3 +72,45 @@ def trusted_check_exec(code, inputs):
6972
except Exception:
7073
return False
7174
return True
75+
76+
77+
def trusted_check(
    code: str,
    test_code: str,
    task_id: str,
    max_as_limit: float,
    max_data_limit: float,
    max_stack_limit: float,
) -> float:
    """Run ``trusted_exec`` in a child process and report how long it took.

    The child receives a shared ``times`` slot as its last argument and is
    expected to store either the elapsed wall-clock seconds (all tests
    passed) or ``-1`` (at least one failure/error) in ``times.value``.

    Args:
        code: Source of the solution to execute.
        test_code: Source of the unit tests to run against ``code``.
        task_id: Identifier of the task, forwarded to ``trusted_exec``.
        max_as_limit: Forwarded unchanged to ``trusted_exec``.
        max_data_limit: Forwarded unchanged to ``trusted_exec``.
        max_stack_limit: Forwarded unchanged to ``trusted_exec``.

    Returns:
        Elapsed seconds as a float when the child reported success, or
        ``-1`` when the child reported failing tests, crashed, or had to be
        stopped because it exceeded the timeout.

    Raises:
        ValueError: If ``BIGCODEBENCH_TIMEOUT_PER_TASK`` is set to something
            that cannot be parsed as a number.
    """
    # Environment variables are always strings; convert before doing
    # arithmetic so a user-supplied value does not raise TypeError.
    timeout = float(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", 120)) + 1

    # Shared slot for the child's result. It must be a C double ("d"): the
    # child stores fractional seconds, which an int slot ("i") rejects.
    # Starting at -1 means "no result written" (timeout, crash) is reported
    # as a failure rather than as a zero-second success.
    times = Value("d", -1.0)

    p = multiprocessing.Process(
        target=trusted_exec,
        args=(
            code,
            test_code,
            task_id,
            max_as_limit,
            max_data_limit,
            max_stack_limit,
            times,
        ),
    )
    p.start()
    # One extra second of slack on top of the per-task timeout.
    p.join(timeout=timeout + 1)
    if p.is_alive():
        # Ask politely first ...
        p.terminate()
        time.sleep(0.1)
    if p.is_alive():
        # ... then force the issue if the child ignored SIGTERM.
        p.kill()
        time.sleep(0.1)

    elapsed = times.value
    # Keep the plain integer -1 sentinel for failed runs.
    if elapsed == -1:
        return -1
    return elapsed

0 commit comments

Comments
 (0)