@@ -233,7 +233,7 @@ def evaluate(
233233 if "solution" in sample
234234 else problems [task_id ]["complete_prompt" ] + sample ["completion" ]
235235 )
236- if "sanitized-calibrated " in samples :
236+ if "sanitized_calibrated " in samples :
237237 solution = problems [task_id ]["code_prompt" ] + "\n pass\n " + solution
238238 remainings .add (sample ["_identifier" ])
239239 args = (
@@ -254,22 +254,22 @@ def evaluate(
254254 assert n_samples == len (remainings ), "Missing problems in unfinished"
255255 assert len (completion_id ) == len (problems ), "Missing problems in samples"
256256
257- def stucking_checker ():
258- while remainings :
259- last_size = len (remainings )
260- time .sleep (240 )
261- if last_size != len (remainings ) or len (remainings ) == 0 :
262- continue
263- # Potential stucking
264- warn ("No samples had finished testing in the last 240s" )
265- warn (f"{ len (remainings )} samples to be tested: { remainings } " )
257+ def stucking_checker ():
258+ while remainings :
259+ last_size = len (remainings )
260+ time .sleep (240 )
261+ if last_size != len (remainings ) or len (remainings ) == 0 :
262+ continue
263+ # Potential stucking
264+ warn ("No samples had finished testing in the last 240s" )
265+ warn (f"{ len (remainings )} samples to be tested: { remainings } " )
266266
267- threading .Thread (target = stucking_checker ).start ()
267+ threading .Thread (target = stucking_checker ).start ()
268268
269- for future in tqdm (as_completed (futures ), total = n_samples ):
270- result = future .result ()
271- remainings .remove (result ["_identifier" ])
272- eval_results [result ["task_id" ]].append (result )
269+ for future in tqdm (as_completed (futures ), total = n_samples ):
270+ result = future .result ()
271+ remainings .remove (result ["_identifier" ])
272+ eval_results [result ["task_id" ]].append (result )
273273
274274 # sort the results for each problem by completion_id
275275 for task_id , task_results in eval_results .items ():
@@ -307,7 +307,7 @@ def stucking_checker():
307307 pass_at_k ["model" ] = os .path .basename (samples ).split ("--bigcodebench-" )[0 ]
308308 pass_at_k ["split" ] = split
309309 pass_at_k ["subset" ] = subset
310- pass_at_k ["calibrated" ] = "sanitized-calibrated " in samples
310+ pass_at_k ["calibrated" ] = "sanitized_calibrated " in samples
311311 pass_at_k ["gt_pass_rate" ] = gt_pass_rate
312312 pass_at_k ["failed_tasks" ] = failed_tasks
313313
0 commit comments