 import math
 from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
+from cuml.linear_model import LogisticRegression
+import cupy as cp

 def update_model_info(model_info):
     for model, info in model_info.items():
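The two new imports pull in cuML's GPU LogisticRegression and CuPy, so the script now needs a CUDA device and the RAPIDS stack just to start. A guarded import with a CPU fallback would keep it runnable on machines without a GPU; this is a sketch only, not part of the commit, and the `xp` alias and `HAS_GPU` flag are hypothetical names:

# Sketch, not in the commit: fall back to scikit-learn/NumPy when RAPIDS is absent.
try:
    from cuml.linear_model import LogisticRegression  # GPU solver
    import cupy as xp                                  # GPU arrays
    HAS_GPU = True
except ImportError:
    from sklearn.linear_model import LogisticRegression  # CPU solver
    import numpy as xp                                    # NumPy stands in for CuPy
    HAS_GPU = False

get_elo_mle below would then build X and Y through `xp` and skip the cp.asnumpy conversion on the CPU path.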
@@ -67,6 +69,8 @@ def get_results(tids):
             data = json.load(f)
         status = []

+        if len(data["eval"]) < len(tids):
+            continue
         for key, value in data["eval"].items():
             if key not in tids:
                 continue
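The added guard makes get_results skip any result file whose "eval" section contains fewer entries than the requested task set, so partially evaluated runs no longer feed into the aggregate numbers. A stricter variant (sketch only, not in the commit) would compare the task IDs themselves rather than the counts:

# Sketch, not in the commit: skip files that miss any requested task ID.
missing = set(tids) - set(data["eval"].keys())
if missing:
    continue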
@@ -163,23 +167,23 @@ def read_task_perf(tids, task="complete"):
         try:
             try:
                 try:
-                    if info["prompted"]:# and not info["direct_complete"]:
-                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                    if info["prompted"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
                         if files:
                             file = files[0]
                         else:
-                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
                     else:
-                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
                 except:
-                    if info["prompted"]:
-                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                    if info["prompted"]:# and not info["direct_complete"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
                         if files:
                             file = files[0]
                         else:
-                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
                     else:
-                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
             except:
                 try:
                     if info["prompted"]:# and not info["direct_complete"]:
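After this change the nested try/except levels probe the calibrated non-hard result files first and fall back to the hard-suffixed names, with further fallbacks in the outer levels. For a prompted model the effective priority order is roughly the list below; a flatter loop over candidate glob patterns (sketch only, not how the commit implements it, reusing the surrounding `model`, `task`, and `glob` names) expresses the same search:

# Sketch, not in the commit: same fallback order for a prompted model, as a loop.
patterns = [
    f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json",
    f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json",
    f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json",
    f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json",
]
file = None
for pattern in patterns:
    files = glob(pattern)
    if files:
        file = files[0]
        break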
@@ -205,6 +209,9 @@ def read_task_perf(tids, task="complete"):
         result_files.append(file)
         with open(file, "r") as f:
             data = json.load(f)
+
+        if len(data["eval"]) < len(tids):
+            continue
         for task_id, perfs in data["eval"].items():
             if task_id in tids:
                 status = 1 if perfs[0]["status"] == "pass" else 0
@@ -271,25 +278,26 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):


 def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
-    from sklearn.linear_model import LogisticRegression
+
+
     models = pd.concat([df["model_a"], df["model_b"]]).unique()
     models = pd.Series(np.arange(len(models)), index=models)
     p = len(models.index)
     n = df.shape[0]

-    X = np.zeros([n, p])
-    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
+    X = cp.zeros([n, p])
+    X[cp.arange(n), models[df["model_a"]]] = +math.log(BASE)
+    X[cp.arange(n), models[df["model_b"]]] = -math.log(BASE)

-    Y = np.zeros(n)
+    Y = cp.zeros(n)
     Y[df["winner"] == "model_a"] = 1.0

     lr = LogisticRegression(fit_intercept=False)
-    lr.fit(X,Y)
+    lr.fit(X, Y)

     elo_scores = SCALE * lr.coef_[0] + INIT_RATING

-    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+    return pd.Series(cp.asnumpy(elo_scores), index=models.index).sort_values(ascending=False)


 def update_elo_rating(results, elo_dict):
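get_elo_mle fits Elo scores as a Bradley-Terry model: each battle contributes a row with +log(BASE) in model_a's column and -log(BASE) in model_b's column, an intercept-free logistic regression is fit on the "model_a wins" indicator, and the learned coefficients are mapped to ratings via SCALE * coef + INIT_RATING (so with BASE=10 and SCALE=400, P(a beats b) = 1 / (1 + 10^-((E_a - E_b)/400))). The change only swaps NumPy/scikit-learn for CuPy/cuML so the fit runs on the GPU; cuML's LogisticRegression follows the scikit-learn interface, which is why nothing else changes apart from the array module and the final cp.asnumpy conversion. A tiny CPU-only illustration of the same computation, using the scikit-learn path this commit removes (toy data, sketch only):

# Sketch, not in the commit: the same Bradley-Terry fit on toy battles with NumPy/scikit-learn.
import math
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

battles = pd.DataFrame({
    "model_a": ["m1", "m1", "m2"],
    "model_b": ["m2", "m3", "m3"],
    "winner":  ["model_a", "model_a", "model_b"],
})
models = pd.Series(np.arange(3), index=["m1", "m2", "m3"])
n, p = len(battles), len(models)

X = np.zeros([n, p])
X[np.arange(n), models[battles["model_a"]]] = +math.log(10)   # BASE = 10
X[np.arange(n), models[battles["model_b"]]] = -math.log(10)
Y = (battles["winner"] == "model_a").astype(float).to_numpy()

lr = LogisticRegression(fit_intercept=False)
lr.fit(X, Y)
elo = 400 * lr.coef_[0] + 1000   # SCALE = 400, INIT_RATING = 1000
print(pd.Series(elo, index=models.index).sort_values(ascending=False))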
@@ -387,11 +395,10 @@ def get_perf_df(data_dict):

 if __name__ == "__main__":

-    # bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
-    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
-    # model_info = update_model_info(model_info)
+    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
+    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
     bcb_config = {
-        # "": bcb_orig,
+        "": bcb_orig,
         "-hard": bcb_hard,
     }
     for suffix, bcb in bcb_config.items():
@@ -401,9 +408,9 @@ def get_perf_df(data_dict):
         instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
         complete_df = get_perf_df(complete_data)
         instruct_df = get_perf_df(instruct_data)
+
         push_ds(DatasetDict({"complete": Dataset.from_pandas(complete_df), "instruct": Dataset.from_pandas(instruct_df)}), f"bigcode/bigcodebench{suffix}-perf")
-        assert len(model_info) == len(complete_data),\
-            f"Missing results for {set([val['name'] for val in model_info.values()]) - set([model for model in complete_data.keys()])}"
+
         with open("task2domain.json", "r") as f:
             task2domain = json.load(f)
         domain_complete = get_domain_perf(complete_data, task2domain)