import pandas as pd
import itertools
import math
-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer

-
def update_model_info(model_info):
    for model, info in model_info.items():
        if "https://huggingface.co/" in info["link"]:
            hf_model = info["link"].split("https://huggingface.co/")[-1]
+            print(hf_model)
            tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
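            # a tokenizer that ships no chat template signals a base model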
            if tokenizer.chat_template is None:
                model_info[model]["direct_complete"] = True
@@ -28,7 +28,7 @@ def update_model_info(model_info):
    return model_info


-def get_results():
+def get_results(tids):
    results = {}
    for model, info in model_info.items():
        results[info["name"]] = {
@@ -41,26 +41,28 @@ def get_results():
            "instruct-cal": None,
        },
        "prompted": info["prompted"],
+        "moe": info["moe"],
        "size": info["size"],
+        "act_param": info["act_param"],
        "direct_complete": info["direct_complete"],
    }

    for model, info in model_info.items():
        model = model.replace("/", "--")
        hf_model = ""
-        if "https://huggingface.co/" in info["link"]:
-            hf_model = info["link"].split("https://huggingface.co/")[-1]
-            model = hf_model.replace("/", "--")
        files = glob(f"results/{model}--bigcodebench-*.json")
        assert files, f"No files found for results/{model}--bigcodebench-*.json"
+        # if "https://huggingface.co/" in info["link"]:
+        #     hf_model = info["link"].split("https://huggingface.co/")[-1]
+        #     model = hf_model.replace("/", "--")
        for file in files:
            _, suffix = os.path.basename(file).split("--bigcodebench-")
            status = []
            with open("results/" + model + "--bigcodebench-" + suffix, "r") as f:
                data = json.load(f)
-            if len(data["eval"]) != 1140:
-                continue
            for key, value in data["eval"].items():
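+                # skip tasks outside the current split (full set vs. hard subset)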
+                if key not in tids:
+                    continue
                if value[0]["status"] == "pass":
                    status.append(1)
                else:
@@ -142,17 +144,17 @@ def split_gen():
            f.writelines(data)


-def read_task_perf(task="complete"):
+def read_task_perf(tids, task="complete"):
    model_results = dict()
    result_files = []
    for model, info in model_info.items():
        if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
            continue

-        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
+        task_perf = dict()
        model = model.replace("/", "--")
-        if info["link"].startswith("https://huggingface.co/"):
-            model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
+        # if info["link"].startswith("https://huggingface.co/"):
+        #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
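        # prompted models that aren't direct-complete read the calibrated result files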
        try:
            if info["prompted"] and not info["direct_complete"]:
                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
@@ -169,22 +171,22 @@ def read_task_perf(task="complete"):
        with open(file, "r") as f:
            data = json.load(f)
        for task_id, perfs in data["eval"].items():
-            status = 1 if perfs[0]["status"] == "pass" else 0
-            task_perf[task_id] = status
+            if task_id in tids:
+                status = 1 if perfs[0]["status"] == "pass" else 0
+                task_perf[task_id] = status
        model_results[info["name"]] = task_perf
    return model_results, result_files


-def get_winner_df(data_dict, task, task_level=True, no_tie=True):
+def get_winner_df(data_dict, tids, task, task_level=True, no_tie=True):
    winner_dict = {"task_id": [], "model_a": [], "model_b": [], "winner": []}
    if not task_level:
        file = f"{task}_winner_df.csv"
    else:
        file = f"{task}_winner_task_df.csv"

    if task_level:
-        for task_id in tqdm(range(1140)):
-            task_id = f"BigCodeBench/{task_id}"
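+        # compare each model pair on this task's outcome to record a battle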
+        for task_id in tqdm(tids):
            # pair without repetition (a, b) and (b, a) are the same
            for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
                solve_rate_a = data_dict[model_a][task_id]
@@ -263,23 +265,51 @@ def update_elo_rating(results, elo_dict):
    return results


+def get_domain_perf(data_dict, task2domain):
+    domain_perfs = {
+        "Model": [],
+        "Computation": [],
+        "General": [],
+        "Visualization": [],
+        "System": [],
+        "Time": [],
+        "Network": [],
+        "Cryptography": []
+    }
+    for model, task_perf in data_dict.items():
+        model_domain = {"Computation": [], "General": [], "Visualization": [], "System": [], "Time": [], "Network": [], "Cryptography": []}
+        for task_id, status in task_perf.items():
+            domains = task2domain[task_id]
+            for domain in domains:
+                model_domain[domain].append(status)
+        domain_perf = {domain: round(np.mean(perfs) * 100, 1) for domain, perfs in model_domain.items()}
+        domain_perfs["Model"].append(model)
+        for domain in model_domain.keys():
+            domain_perfs[domain].append(domain_perf[domain])
+    return Dataset.from_dict(domain_perfs)
+
+
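# per-task solve rate: percentage of evaluated models that pass each task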
def get_solve_rate(data_dict, task="complete"):
-    task_solve_count = {f"BigCodeBench/{task_id}": [] for task_id in range(1140)}
+    task_solve_count = dict()
    for model, task_perf in data_dict.items():
-        for task_id in range(1140):
-            task_solve_count[f"BigCodeBench/{task_id}"].append(task_perf[f"BigCodeBench/{task_id}"])
+        for task_id, score in task_perf.items():
+            if task_id not in task_solve_count:
+                task_solve_count[task_id] = []
+            task_solve_count[task_id].append(score)
    solve_rate = {task_id: round(np.mean(perfs) * 100, 1) for task_id, perfs in task_solve_count.items()}
    return Dataset.from_dict({"task_id": list(solve_rate.keys()), "solve_rate": list(solve_rate.values())})


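# flatten per-model results into the results dataset (new columns: MoE flag, active-parameter count)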
def get_hf_ds(results):
-    hf_dataset = {"model": [], "link": [], "size": [], "type": [], "lazy": [], "direct_complete": [],
+    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], "lazy": [], "direct_complete": [],
                  "complete": [], "instruct": [], "elo_mle": []}

    for model, result in results.items():
        hf_dataset["model"].append(model)
        hf_dataset["link"].append(result["link"])
+        hf_dataset["moe"].append(result["moe"])
        hf_dataset["size"].append(result["size"])
+        hf_dataset["act_param"].append(result["act_param"])
        hf_dataset["type"].append("🔶" if result["prompted"] else "🟢")
        hf_dataset["lazy"].append(result["lazy"])
        hf_dataset["complete"].append(result["pass@1"]["complete"])
@@ -310,42 +340,56 @@ def push_ds(ds, path, local=False):

if __name__ == "__main__":

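+    # pull the task lists for the full benchmark and its hard subset from the Hub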
+    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
+    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
    model_info = update_model_info(model_info)
-    results = get_results()
-    files = []
-    complete_data, complete_files = read_task_perf("complete")
-    instruct_data, instruct_files = read_task_perf("instruct")
-    files.extend(complete_files)
-    files.extend(instruct_files)
-    shutil.rmtree("eval_results", ignore_errors=True)
-    os.makedirs("eval_results", exist_ok=True)
-    for file in files:
-        shutil.copy(file, "eval_results")
-
-    complete_solve_rate = get_solve_rate(complete_data, task="complete")
-    instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
-    solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
-    push_ds(solve_rate_ds, "bigcode/bigcodebench-solve-rate")
-
-    elo_config = {
-        "task_no_tie": (True, True),
-        "benchmark_tie": (False, False),
+    bcb_config = {
+        "": bcb_orig,
+        "-hard": bcb_hard,
    }
-    elo_ds = dict()
-    for config, (task_level, no_tie) in elo_config.items():
-        battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
-        elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-        bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-        bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-        bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-        if config == "task_no_tie":
-            task_elo = bootstrap_lu_median_dict
-        elo = get_bootstrap_scores(elo_mle_bootstrap)
-        elo_ds[config] = elo
-    push_ds(DatasetDict(elo_ds), "bigcode/bigcodebench-elo")
-
-    results = update_elo_rating(results, task_elo)
-    with open("results.json", "w") as f:
-        json.dump(results, f, indent=4)
-    ds = get_hf_ds(results)
-    push_ds(ds, "bigcode/bigcodebench-results")
+    for suffix, bcb in bcb_config.items():
+        results = get_results(bcb["task_id"])
+        files = []
+        complete_data, complete_files = read_task_perf(bcb["task_id"], "complete")
+        instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
+        assert len(model_info) == len(complete_data)
+        with open("task2domain.json", "r") as f:
+            task2domain = json.load(f)
+        domain_complete = get_domain_perf(complete_data, task2domain)
+        domain_instruct = get_domain_perf(instruct_data, task2domain)
+        DatasetDict({"complete": domain_complete, "instruct": domain_instruct}).push_to_hub(f"bigcode/bigcodebench{suffix}-domain")
+
+        files.extend(complete_files)
+        files.extend(instruct_files)
+        shutil.rmtree("eval_results", ignore_errors=True)
+        os.makedirs("eval_results", exist_ok=True)
+        for file in files:
+            shutil.copy(file, "eval_results")
+
+        complete_solve_rate = get_solve_rate(complete_data, task="complete")
+        instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
+        solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
+        push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate")
+
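+        # bootstrap (500 resamples) a maximum-likelihood Elo rating over pairwise task battles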
+        elo_config = {
+            "task_no_tie": (True, True),
+            "benchmark_tie": (False, False),
+        }
+        elo_ds = dict()
+        for config, (task_level, no_tie) in elo_config.items():
+            battles = get_winner_df(complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
+            bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
+            bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
+            bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
+            if config == "task_no_tie":
+                task_elo = bootstrap_lu_median_dict
+            elo = get_bootstrap_scores(elo_mle_bootstrap)
+            elo_ds[config] = elo
+        push_ds(DatasetDict(elo_ds), f"bigcode/bigcodebench{suffix}-elo")
+
+        results = update_elo_rating(results, task_elo)
+        with open(f"results{suffix}.json", "w") as f:
+            json.dump(results, f, indent=4)
+        ds = get_hf_ds(results)
+        push_ds(ds, f"bigcode/bigcodebench{suffix}-results")