Commit 7f529e8

feat: config default setup

1 parent 2f2e49b commit 7f529e8

2 files changed: +24 -3 lines changed

bigcodebench/eval/utils.py (5 additions, 1 deletion)

@@ -284,7 +284,11 @@ def reliability_guard(max_as_limit, max_data_limit, max_stack_limit):
 
     if maximum_memory_bytes:
         import resource
-
+
+        max_as_limit = max_as_limit * 1024 * 1024
+        max_data_limit = max_data_limit * 1024 * 1024
+        max_stack_limit = max_stack_limit * 1024 * 1024
+
         resource.setrlimit(
             resource.RLIMIT_AS, (max_as_limit, max_as_limit)
         )
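
The added lines treat the three limit flags as megabyte counts and convert them to bytes before they reach resource.setrlimit. Below is a minimal, self-contained sketch of that behaviour; the helper name apply_memory_limits is illustrative only, and it assumes RLIMIT_DATA and RLIMIT_STACK are applied the same way as the RLIMIT_AS call shown in the context.

import resource  # Unix-only; the same module imported inside reliability_guard

def apply_memory_limits(max_as_limit: int, max_data_limit: int, max_stack_limit: int) -> None:
    # Same conversion as the added lines: the flags are given in MB, setrlimit wants bytes.
    max_as_limit = max_as_limit * 1024 * 1024
    max_data_limit = max_data_limit * 1024 * 1024
    max_stack_limit = max_stack_limit * 1024 * 1024

    # The diff shows only the RLIMIT_AS call; the data and stack limits are
    # assumed to be set the same way in the unshown context below it.
    resource.setrlimit(resource.RLIMIT_AS, (max_as_limit, max_as_limit))
    resource.setrlimit(resource.RLIMIT_DATA, (max_data_limit, max_data_limit))
    resource.setrlimit(resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit))
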

bigcodebench/evaluate.py (19 additions, 2 deletions)

@@ -120,8 +120,11 @@ def evaluate(flags):
     problems = get_bigcodebench()
     dataset_hash = get_bigcodebench_hash()
     expected_time = None
+
     if not flags.no_gt:
         expected_time = get_groundtruth(problems, dataset_hash, flags.check_gt_only)
+    else:
+        expected_time = {task_id: None for task_id in problems}
 
     if flags.check_gt_only:
         return
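
After this hunk, expected_time is always a dict keyed by task_id: measured ground-truth runtimes when the ground truth is executed, and None placeholders when --no_gt skips it. A small illustration of the new else-branch, using made-up task IDs and prompts:

# Hypothetical stand-in for the loaded problem set; only the keys matter here.
problems = {"BigCodeBench/0": "prompt 0", "BigCodeBench/1": "prompt 1"}

# The added else-branch: keep one entry per task, but record "no measurement".
expected_time = {task_id: None for task_id in problems}
assert expected_time == {"BigCodeBench/0": None, "BigCodeBench/1": None}
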
@@ -164,7 +167,7 @@ def evaluate(flags):
                     flags.max_stack_limit,
                     sample["_identifier"],
                     flags.min_time_limit,
-                    expected_time[task_id] if expected_time else 20
+                    expected_time[task_id] if expected_time[task_id] else 20
                 )
                 futures.append(executor.submit(check_correctness, *args))
                 completion_id[task_id] += 1
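
Because expected_time now has an entry for every task, the fallback moves from the dict itself to the per-task value: a task without a measured ground-truth time gets a 20-second limit. A sketch with made-up values (the time_limit helper is illustrative only):

# Made-up measurements: one task was timed, one has no ground-truth timing.
expected_time = {"BigCodeBench/0": 3.2, "BigCodeBench/1": None}

def time_limit(task_id, default=20):
    # Mirrors the new expression: expected_time[task_id] if expected_time[task_id] else 20
    return expected_time[task_id] if expected_time[task_id] else default

assert time_limit("BigCodeBench/0") == 3.2
assert time_limit("BigCodeBench/1") == 20
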
@@ -220,7 +223,21 @@ def stucking_checker():
        for k in [1, 5, 10, 25, 100]
        if total.min() >= k
    }
-    cprint(f"BigCodeBench-{flags.subset}", "green")
+
+    mode = "-calibrated" if "sanitized-calibrated" in flags.samples else ""
+
+    cprint(f"BigCodeBench-{flags.subset}{mode}", "green")
+
+    gt_pass_rate = np.mean([1 if v is not None else 0 for v in expected_time.values()])
+
+    if flags.no_gt:
+        cprint(f"Groundtruth is not checked", "yellow")
+    else:
+        if gt_pass_rate > 0.95:
+            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
+        else:
+            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
+
     for k, v in pass_at_k.items():
         cprint(f"{k}:\t{v:.3f}", "green")
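
The new reporting block derives a ground-truth pass rate from the same dict: the fraction of tasks whose value is not None, printed in green above 0.95 and in red with a caution note otherwise (or a yellow "not checked" line when --no_gt was passed). A runnable illustration with made-up values:

import numpy as np

# Made-up results: two tasks produced a ground-truth timing, one did not.
expected_time = {"BigCodeBench/0": 3.2, "BigCodeBench/1": None, "BigCodeBench/2": 1.1}

# Same expression as in the diff: the share of tasks with a recorded timing.
gt_pass_rate = np.mean([1 if v is not None else 0 for v in expected_time.values()])
print(f"Groundtruth pass rate: {gt_pass_rate:.3f}")  # -> 0.667, below the 0.95 threshold
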

0 commit comments
