 import pandas as pd
 import itertools
 import math
-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
 
-
 def update_model_info(model_info):
     for model, info in model_info.items():
         if "https://huggingface.co/" in info["link"]:
@@ -29,7 +28,7 @@ def update_model_info(model_info):
     return model_info
 
 
-def get_results():
+def get_results(tids):
     results = {}
     for model, info in model_info.items():
         results[info["name"]] = {
@@ -42,7 +41,9 @@ def get_results():
4241 "instruct-cal" : None ,
4342 },
4443 "prompted" : info ["prompted" ],
44+ "moe" : info ["moe" ],
4545 "size" : info ["size" ],
46+ "act_param" : info ["act_param" ],
4647 "direct_complete" : info ["direct_complete" ],
4748 }
4849
@@ -59,9 +60,9 @@ def get_results():
             status = []
             with open("results/" + model + "--bigcodebench-" + suffix, "r") as f:
                 data = json.load(f)
-                if len(data["eval"]) != 1140:
-                    continue
                 for key, value in data["eval"].items():
+                    if key not in tids:
+                        continue
                     if value[0]["status"] == "pass":
                         status.append(1)
                     else:
@@ -143,14 +144,14 @@ def split_gen():
             f.writelines(data)
 
 
-def read_task_perf(task="complete"):
+def read_task_perf(tids, task="complete"):
     model_results = dict()
     result_files = []
     for model, info in model_info.items():
         if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
             continue
 
-        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
+        task_perf = dict()
         model = model.replace("/", "--")
         # if info["link"].startswith("https://huggingface.co/"):
         #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
@@ -170,22 +171,22 @@ def read_task_perf(task="complete"):
         with open(file, "r") as f:
             data = json.load(f)
         for task_id, perfs in data["eval"].items():
-            status = 1 if perfs[0]["status"] == "pass" else 0
-            task_perf[task_id] = status
+            if task_id in tids:
+                status = 1 if perfs[0]["status"] == "pass" else 0
+                task_perf[task_id] = status
         model_results[info["name"]] = task_perf
     return model_results, result_files
 
 
-def get_winner_df(data_dict, task, task_level=True, no_tie=True):
+def get_winner_df(data_dict, tids, task, task_level=True, no_tie=True):
     winner_dict = {"task_id": [], "model_a": [], "model_b": [], "winner": []}
     if not task_level:
         file = f"{task}_winner_df.csv"
     else:
         file = f"{task}_winner_task_df.csv"
 
     if task_level:
-        for task_id in tqdm(range(1140)):
-            task_id = f"BigCodeBench/{task_id}"
+        for task_id in tqdm(tids):
             # pair without repetition (a, b) and (b, a) are the same
             for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
                 solve_rate_a = data_dict[model_a][task_id]
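For context, `get_winner_df` builds the pairwise "battles" fed to the Elo bootstrap: for every task in `tids` and every unordered pair of models, the two pass/fail statuses are compared and a winner is recorded. A schematic sketch of how one such row is presumably formed (not the exact implementation; with `no_tie=True`, even outcomes are assumed to be dropped):

# Schematic only: one assumed battle row for a single task and model pair.
def battle_row(task_id, model_a, model_b, data_dict):
    a = data_dict[model_a][task_id]  # 1 = pass, 0 = fail
    b = data_dict[model_b][task_id]
    if a == b:
        return None  # tie; assumed to be skipped when no_tie=True
    return {"task_id": task_id, "model_a": model_a, "model_b": model_b,
            "winner": model_a if a > b else model_b}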
@@ -264,23 +265,51 @@ def update_elo_rating(results, elo_dict):
     return results
 
 
+def get_domain_perf(data_dict, task2domain):
+    domain_perfs = {
+        "Model": [],
+        "Computation": [],
+        "General": [],
+        "Visualization": [],
+        "System": [],
+        "Time": [],
+        "Network": [],
+        "Cryptography": []
+    }
+    for model, task_perf in data_dict.items():
+        model_domain = {"Computation": [], "General": [], "Visualization": [], "System": [], "Time": [], "Network": [], "Cryptography": []}
+        for task_id, status in task_perf.items():
+            domains = task2domain[task_id]
+            for domain in domains:
+                model_domain[domain].append(status)
+        domain_perf = {domain: round(np.mean(perfs)*100, 1) for domain, perfs in model_domain.items()}
+        domain_perfs["Model"].append(model)
+        for domain in model_domain.keys():
+            domain_perfs[domain].append(domain_perf[domain])
+    return Dataset.from_dict(domain_perfs)
+
+
 def get_solve_rate(data_dict, task="complete"):
-    task_solve_count = {f"BigCodeBench/{task_id}": [] for task_id in range(1140)}
+    task_solve_count = dict()
     for model, task_perf in data_dict.items():
-        for task_id in range(1140):
-            task_solve_count[f"BigCodeBench/{task_id}"].append(task_perf[f"BigCodeBench/{task_id}"])
+        for task_id, score in task_perf.items():
+            if task_id not in task_solve_count:
+                task_solve_count[task_id] = []
+            task_solve_count[task_id].append(score)
     solve_rate = {task_id: round(np.mean(perfs) * 100, 1) for task_id, perfs in task_solve_count.items()}
     return Dataset.from_dict({"task_id": list(solve_rate.keys()), "solve_rate": list(solve_rate.values())})
 
 
 def get_hf_ds(results):
-    hf_dataset = {"model": [], "link": [], "size": [], "type": [], "lazy": [], "direct_complete": [],
+    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], "lazy": [], "direct_complete": [],
                   "complete": [], "instruct": [], "elo_mle": []}
 
     for model, result in results.items():
         hf_dataset["model"].append(model)
         hf_dataset["link"].append(result["link"])
+        hf_dataset["moe"].append(result["moe"])
         hf_dataset["size"].append(result["size"])
+        hf_dataset["act_param"].append(result["act_param"])
         hf_dataset["type"].append("🔶" if result["prompted"] else "🟢")
         hf_dataset["lazy"].append(result["lazy"])
         hf_dataset["complete"].append(result["pass@1"]["complete"])
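The new `get_domain_perf` helper assumes that `task2domain` maps every task ID to a list of library-domain labels drawn from the seven domains hard-coded above, and that each task contributes its 0/1 pass status to every domain it belongs to. A minimal sketch of that assumed layout (the IDs and labels below are illustrative, not part of the change):

# Assumed shape of the mapping loaded from task2domain.json in the main block;
# only the structure (task_id -> list of domain names) is implied by the diff.
task2domain = {
    "BigCodeBench/0": ["Computation"],
    "BigCodeBench/7": ["General", "Time"],
}
# A domain's score for a model is then the mean of these statuses, scaled to a
# percentage via round(np.mean(perfs) * 100, 1).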
@@ -311,52 +340,56 @@ def push_ds(ds, path, local=False):
 
 if __name__ == "__main__":
 
+    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
+    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
     model_info = update_model_info(model_info)
-    results = get_results()
-    files = []
-    complete_data, complete_files = read_task_perf("complete")
-    instruct_data, instruct_files = read_task_perf("instruct")
-    assert len(model_info) == len(complete_data)
-    # complete_map = {model.replace("-","_").replace("+","_plus").replace(" ","_"):
-    #     Dataset.from_dict({"task_id": list(task_perf.keys()), "status": list(task_perf.values())}) for model, task_perf in complete_data.items()}
-    # instruct_map = {model.replace("-","_").replace("+","_plus").replace(" ","_"):
-    #     Dataset.from_dict({"task_id": list(task_perf.keys()), "status": list(task_perf.values())}) for model, task_perf in instruct_data.items()}
-    # complete_ds = DatasetDict(complete_map)
-    # instruct_ds = DatasetDict(instruct_map)
-    # push_ds(complete_ds, "bigcode/bigcodebench-complete-perf")
-    # push_ds(instruct_ds, "bigcode/bigcodebench-instruct-perf")
-
-    files.extend(complete_files)
-    files.extend(instruct_files)
-    shutil.rmtree("eval_results", ignore_errors=True)
-    os.makedirs("eval_results", exist_ok=True)
-    for file in files:
-        shutil.copy(file, "eval_results")
-
-    complete_solve_rate = get_solve_rate(complete_data, task="complete")
-    instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
-    solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
-    push_ds(solve_rate_ds, "bigcode/bigcodebench-solve-rate")
-
-    elo_config = {
-        "task_no_tie": (True, True),
-        "benchmark_tie": (False, False),
+    bcb_config = {
+        "": bcb_orig,
+        "-hard": bcb_hard,
     }
-    elo_ds = dict()
-    for config, (task_level, no_tie) in elo_config.items():
-        battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
-        elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-        bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-        bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-        bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-        if config == "task_no_tie":
-            task_elo = bootstrap_lu_median_dict
-        elo = get_bootstrap_scores(elo_mle_bootstrap)
-        elo_ds[config] = elo
-    push_ds(DatasetDict(elo_ds), "bigcode/bigcodebench-elo")
-
-    results = update_elo_rating(results, task_elo)
-    with open("results.json", "w") as f:
-        json.dump(results, f, indent=4)
-    ds = get_hf_ds(results)
-    push_ds(ds, "bigcode/bigcodebench-results")
+    for suffix, bcb in bcb_config.items():
+        results = get_results(bcb["task_id"])
+        files = []
+        complete_data, complete_files = read_task_perf(bcb["task_id"], "complete")
+        instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
+        assert len(model_info) == len(complete_data)
+        with open("task2domain.json", "r") as f:
+            task2domain = json.load(f)
+        domain_complete = get_domain_perf(complete_data, task2domain)
+        domain_instruct = get_domain_perf(instruct_data, task2domain)
+        DatasetDict({"complete": domain_complete, "instruct": domain_instruct}).push_to_hub(f"bigcode/bigcodebench{suffix}-domain")
+
+        files.extend(complete_files)
+        files.extend(instruct_files)
+        shutil.rmtree("eval_results", ignore_errors=True)
+        os.makedirs("eval_results", exist_ok=True)
+        for file in files:
+            shutil.copy(file, "eval_results")
+
+        complete_solve_rate = get_solve_rate(complete_data, task="complete")
+        instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
+        solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
+        push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate")
+
+        elo_config = {
+            "task_no_tie": (True, True),
+            "benchmark_tie": (False, False),
+        }
+        elo_ds = dict()
+        for config, (task_level, no_tie) in elo_config.items():
+            battles = get_winner_df(complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
+            bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
+            bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
+            bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
+            if config == "task_no_tie":
+                task_elo = bootstrap_lu_median_dict
+            elo = get_bootstrap_scores(elo_mle_bootstrap)
+            elo_ds[config] = elo
+        push_ds(DatasetDict(elo_ds), f"bigcode/bigcodebench{suffix}-elo")
+
+        results = update_elo_rating(results, task_elo)
+        with open(f"results{suffix}.json", "w") as f:
+            json.dump(results, f, indent=4)
+        ds = get_hf_ds(results)
+        push_ds(ds, f"bigcode/bigcodebench{suffix}-results")
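Taken together, the `tids` argument threaded through `get_results`, `read_task_perf`, and `get_winner_df` restricts every metric to the task IDs of the split being processed, so the same loop produces both the full and the `-hard` variants of the datasets pushed above. A rough sketch of that filtering, under the same assumptions the diff makes about a results file (an "eval" dict keyed by task ID, with the first attempt carrying a "status" field); the helper name here is hypothetical:

import json

def subset_pass_at_1(result_file, tids):
    # tids would be e.g. set(bcb_hard["task_id"]); only those tasks are scored.
    with open(result_file, "r") as f:
        data = json.load(f)
    statuses = [
        1 if attempts[0]["status"] == "pass" else 0
        for task_id, attempts in data["eval"].items()
        if task_id in tids
    ]
    return round(100 * sum(statuses) / len(statuses), 1) if statuses else None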