@@ -120,8 +120,11 @@ def evaluate(flags):
     problems = get_bigcodebench()
     dataset_hash = get_bigcodebench_hash()
     expected_time = None
+
     if not flags.no_gt:
         expected_time = get_groundtruth(problems, dataset_hash, flags.check_gt_only)
+    else:
+        expected_time = {task_id: None for task_id in problems}
 
     if flags.check_gt_only:
         return
@@ -164,7 +167,7 @@ def evaluate(flags):
                     flags.max_stack_limit,
                     sample["_identifier"],
                     flags.min_time_limit,
-                    expected_time[task_id] if expected_time else 20
+                    expected_time[task_id] if expected_time[task_id] else 20
                 )
                 futures.append(executor.submit(check_correctness, *args))
                 completion_id[task_id] += 1
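
The per-task timeout change above is the substantive fix in this hunk: the fallback to 20 seconds now applies whenever that task's own ground-truth time is None, not only when the whole expected_time mapping is absent. A minimal standalone sketch of the new guard, using made-up timing values for illustration:

    # Hypothetical values for illustration only: task "A" has a measured
    # ground-truth time, task "B" has None (its ground truth did not pass).
    expected_time = {"A": 3.2, "B": None}
    for task_id in expected_time:
        # New per-task fallback: use 20 s when no reference time exists.
        time_limit = expected_time[task_id] if expected_time[task_id] else 20
        print(task_id, time_limit)  # prints: A 3.2, then B 20
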
@@ -220,7 +223,21 @@ def stucking_checker():
         for k in [1, 5, 10, 25, 100]
         if total.min() >= k
     }
-    cprint(f"BigCodeBench-{flags.subset}", "green")
+
+    mode = "-calibrated" if "sanitized-calibrated" in flags.samples else ""
+
+    cprint(f"BigCodeBench-{flags.subset}{mode}", "green")
+
+    gt_pass_rate = np.mean([1 if v is not None else 0 for v in expected_time.values()])
+
+    if flags.no_gt:
+        cprint(f"Groundtruth is not checked", "yellow")
+    else:
+        if gt_pass_rate > 0.95:
+            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
+        else:
+            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
+
     for k, v in pass_at_k.items():
         cprint(f"{k}:\t{v:.3f}", "green")
 
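
The reporting block added above derives the ground-truth pass rate as the fraction of tasks whose expected time is not None; because the first hunk fills expected_time with None under --no_gt, that case prints the yellow notice instead of a rate. A small sketch of the rate computation, assuming numpy and a toy expected_time dict:

    import numpy as np

    # Toy mapping: two tasks have ground-truth times, one does not.
    expected_time = {"A": 1.5, "B": None, "C": 0.8}
    gt_pass_rate = np.mean([1 if v is not None else 0 for v in expected_time.values()])
    print(f"Groundtruth pass rate: {gt_pass_rate:.3f}")  # 0.667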