Skip to content

Commit 16ec422

Browse files
committed
fix(evaluate): update the calibration setup
1 parent 1d9ea6a commit 16ec422

File tree

1 file changed

+16
-16
lines changed

1 file changed

+16
-16
lines changed

bigcodebench/evaluate.py

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -233,7 +233,7 @@ def evaluate(
233233
if "solution" in sample
234234
else problems[task_id]["complete_prompt"] + sample["completion"]
235235
)
236-
if "sanitized-calibrated" in samples:
236+
if "sanitized_calibrated" in samples:
237237
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
238238
remainings.add(sample["_identifier"])
239239
args = (
@@ -254,22 +254,22 @@ def evaluate(
254254
assert n_samples == len(remainings), "Missing problems in unfinished"
255255
assert len(completion_id) == len(problems), "Missing problems in samples"
256256

257-
def stucking_checker():
258-
while remainings:
259-
last_size = len(remainings)
260-
time.sleep(240)
261-
if last_size != len(remainings) or len(remainings) == 0:
262-
continue
263-
# Potential stucking
264-
warn("No samples had finished testing in the last 240s")
265-
warn(f"{len(remainings)} samples to be tested: {remainings}")
257+
def stucking_checker():
258+
while remainings:
259+
last_size = len(remainings)
260+
time.sleep(240)
261+
if last_size != len(remainings) or len(remainings) == 0:
262+
continue
263+
# Potential stucking
264+
warn("No samples had finished testing in the last 240s")
265+
warn(f"{len(remainings)} samples to be tested: {remainings}")
266266

267-
threading.Thread(target=stucking_checker).start()
267+
threading.Thread(target=stucking_checker).start()
268268

269-
for future in tqdm(as_completed(futures), total=n_samples):
270-
result = future.result()
271-
remainings.remove(result["_identifier"])
272-
eval_results[result["task_id"]].append(result)
269+
for future in tqdm(as_completed(futures), total=n_samples):
270+
result = future.result()
271+
remainings.remove(result["_identifier"])
272+
eval_results[result["task_id"]].append(result)
273273

274274
# sort the results for each problem by completion_id
275275
for task_id, task_results in eval_results.items():
@@ -307,7 +307,7 @@ def stucking_checker():
307307
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
308308
pass_at_k["split"] = split
309309
pass_at_k["subset"] = subset
310-
pass_at_k["calibrated"] = "sanitized-calibrated" in samples
310+
pass_at_k["calibrated"] = "sanitized_calibrated" in samples
311311
pass_at_k["gt_pass_rate"] = gt_pass_rate
312312
pass_at_k["failed_tasks"] = failed_tasks
313313

0 commit comments

Comments (0)