
Commit 087c202

update the result analysis
1 parent 170c9a0 commit 087c202

File tree

1 file changed (+96, -63 lines)


analysis/get_results.py

Lines changed: 96 additions & 63 deletions
@@ -9,10 +9,9 @@
 import pandas as pd
 import itertools
 import math
-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
 
-
 def update_model_info(model_info):
     for model, info in model_info.items():
         if "https://huggingface.co/" in info["link"]:
@@ -29,7 +28,7 @@ def update_model_info(model_info):
     return model_info
 
 
-def get_results():
+def get_results(tids):
     results = {}
     for model, info in model_info.items():
         results[info["name"]] = {
@@ -42,7 +41,9 @@ def get_results():
                 "instruct-cal": None,
             },
             "prompted": info["prompted"],
+            "moe": info["moe"],
             "size": info["size"],
+            "act_param": info["act_param"],
             "direct_complete": info["direct_complete"],
         }
 
@@ -59,9 +60,9 @@ def get_results():
             status = []
             with open("results/"+model+"--bigcodebench-"+suffix, "r") as f:
                 data = json.load(f)
-                if len(data["eval"]) != 1140:
-                    continue
                 for key, value in data["eval"].items():
+                    if key not in tids:
+                        continue
                     if value[0]["status"] == "pass":
                         status.append(1)
                     else:
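The change above replaces the hard-coded expectation of all 1,140 eval entries with an explicit task-ID filter, so the same scoring path works for the full benchmark and for the hard subset. A minimal sketch of that filtering on made-up data (the eval payload and IDs below are illustrative, not taken from real result files):

import numpy as np

# Hypothetical eval payload shaped like data["eval"] in the diff
eval_data = {
    "BigCodeBench/0": [{"status": "pass"}],
    "BigCodeBench/1": [{"status": "fail"}],
    "BigCodeBench/2": [{"status": "pass"}],
}
# Only tasks in the requested subset are scored
tids = {"BigCodeBench/0", "BigCodeBench/2"}

status = []
for key, value in eval_data.items():
    if key not in tids:
        continue
    status.append(1 if value[0]["status"] == "pass" else 0)

print(round(np.mean(status) * 100, 1))  # 100.0, i.e. pass rate over the subset only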
@@ -143,14 +144,14 @@ def split_gen():
         f.writelines(data)
 
 
-def read_task_perf(task="complete"):
+def read_task_perf(tids, task="complete"):
     model_results = dict()
     result_files = []
     for model, info in model_info.items():
         if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
             continue
 
-        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
+        task_perf = dict()
         model = model.replace("/", "--")
         # if info["link"].startswith("https://huggingface.co/"):
         #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
@@ -170,22 +171,22 @@ def read_task_perf(task="complete"):
         with open(file, "r") as f:
             data = json.load(f)
             for task_id, perfs in data["eval"].items():
-                status = 1 if perfs[0]["status"] == "pass" else 0
-                task_perf[task_id] = status
+                if task_id in tids:
+                    status = 1 if perfs[0]["status"] == "pass" else 0
+                    task_perf[task_id] = status
         model_results[info["name"]] = task_perf
     return model_results, result_files
 
 
-def get_winner_df(data_dict, task, task_level=True, no_tie=True):
+def get_winner_df(data_dict, tids, task, task_level=True, no_tie=True):
     winner_dict = {"task_id": [], "model_a": [], "model_b": [], "winner": []}
     if not task_level:
         file = f"{task}_winner_df.csv"
     else:
         file = f"{task}_winner_task_df.csv"
 
     if task_level:
-        for task_id in tqdm(range(1140)):
-            task_id = f"BigCodeBench/{task_id}"
+        for task_id in tqdm(tids):
             # pair without repetition (a, b) and (b, a) are the same
             for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
                 solve_rate_a = data_dict[model_a][task_id]
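get_winner_df now iterates over the supplied task IDs instead of range(1140). For orientation, here is a rough sketch of how per-task pairwise battles can be derived from that solve data; the models, tasks, and the skip-ties rule below are assumptions for illustration, since the rest of the function body falls outside this hunk:

import itertools

# Toy per-task status: 1 = solved, 0 = failed (hypothetical models and tasks)
data_dict = {
    "ModelA": {"BigCodeBench/0": 1, "BigCodeBench/1": 0},
    "ModelB": {"BigCodeBench/0": 0, "BigCodeBench/1": 0},
}
tids = ["BigCodeBench/0", "BigCodeBench/1"]

battles = []
for task_id in tids:
    for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
        a = data_dict[model_a][task_id]
        b = data_dict[model_b][task_id]
        if a == b:
            continue  # assumed no_tie behaviour: drop tied tasks
        battles.append((task_id, model_a, model_b, model_a if a > b else model_b))

print(battles)  # [('BigCodeBench/0', 'ModelA', 'ModelB', 'ModelA')]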
@@ -264,23 +265,51 @@ def update_elo_rating(results, elo_dict):
     return results
 
 
+def get_domain_perf(data_dict, task2domain):
+    domain_perfs = {
+        "Model": [],
+        "Computation": [],
+        "General": [],
+        "Visualization": [],
+        "System": [],
+        "Time": [],
+        "Network": [],
+        "Cryptography": []
+    }
+    for model, task_perf in data_dict.items():
+        model_domain = {"Computation": [], "General": [], "Visualization": [], "System": [], "Time": [], "Network": [], "Cryptography": []}
+        for task_id, status in task_perf.items():
+            domains = task2domain[task_id]
+            for domain in domains:
+                model_domain[domain].append(status)
+        domain_perf = {domain: round(np.mean(perfs)*100, 1) for domain, perfs in model_domain.items()}
+        domain_perfs["Model"].append(model)
+        for domain in model_domain.keys():
+            domain_perfs[domain].append(domain_perf[domain])
+    return Dataset.from_dict(domain_perfs)
+
+
 def get_solve_rate(data_dict, task="complete"):
-    task_solve_count = {f"BigCodeBench/{task_id}": [] for task_id in range(1140)}
+    task_solve_count = dict()
     for model, task_perf in data_dict.items():
-        for task_id in range(1140):
-            task_solve_count[f"BigCodeBench/{task_id}"].append(task_perf[f"BigCodeBench/{task_id}"])
+        for task_id, score in task_perf.items():
+            if task_id not in task_solve_count:
+                task_solve_count[task_id] = []
+            task_solve_count[task_id].append(score)
     solve_rate = {task_id: round(np.mean(perfs) * 100, 1) for task_id, perfs in task_solve_count.items()}
     return Dataset.from_dict({"task_id": list(solve_rate.keys()), "solve_rate": list(solve_rate.values())})
 
 
 def get_hf_ds(results):
-    hf_dataset = {"model": [], "link": [], "size": [], "type": [], "lazy": [], "direct_complete": [],
+    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], "lazy": [], "direct_complete": [],
                   "complete": [], "instruct": [], "elo_mle": []}
 
     for model, result in results.items():
         hf_dataset["model"].append(model)
         hf_dataset["link"].append(result["link"])
+        hf_dataset["moe"].append(result["moe"])
         hf_dataset["size"].append(result["size"])
+        hf_dataset["act_param"].append(result["act_param"])
         hf_dataset["type"].append("🔶" if result["prompted"] else "🟢")
         hf_dataset["lazy"].append(result["lazy"])
         hf_dataset["complete"].append(result["pass@1"]["complete"])
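Because get_solve_rate now builds its counts from whatever task IDs are present in each model's results, the solve rate is automatically computed over the chosen subset. A self-contained sketch with made-up scores (setdefault is used here purely for brevity; the diff spells out the same check explicitly):

import numpy as np

# Hypothetical per-model status, already restricted to the chosen subset
data_dict = {
    "ModelA": {"BigCodeBench/5": 1, "BigCodeBench/9": 0},
    "ModelB": {"BigCodeBench/5": 1, "BigCodeBench/9": 1},
}

task_solve_count = {}
for model, task_perf in data_dict.items():
    for task_id, score in task_perf.items():
        task_solve_count.setdefault(task_id, []).append(score)

solve_rate = {tid: round(np.mean(perfs) * 100, 1) for tid, perfs in task_solve_count.items()}
print(solve_rate)  # {'BigCodeBench/5': 100.0, 'BigCodeBench/9': 50.0}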
@@ -311,52 +340,56 @@ def push_ds(ds, path, local=False):
 
 if __name__ == "__main__":
 
+    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
+    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
     model_info = update_model_info(model_info)
-    results = get_results()
-    files = []
-    complete_data, complete_files = read_task_perf("complete")
-    instruct_data, instruct_files = read_task_perf("instruct")
-    assert len(model_info) == len(complete_data)
-    # complete_map = {model.replace("-","_").replace("+","_plus").replace(" ","_"):
-    #     Dataset.from_dict({"task_id": list(task_perf.keys()), "status": list(task_perf.values())}) for model, task_perf in complete_data.items()}
-    # instruct_map = {model.replace("-","_").replace("+","_plus").replace(" ","_"):
-    #     Dataset.from_dict({"task_id": list(task_perf.keys()), "status": list(task_perf.values())}) for model, task_perf in instruct_data.items()}
-    # complete_ds = DatasetDict(complete_map)
-    # instruct_ds = DatasetDict(instruct_map)
-    # push_ds(complete_ds, "bigcode/bigcodebench-complete-perf")
-    # push_ds(instruct_ds, "bigcode/bigcodebench-instruct-perf")
-
-    files.extend(complete_files)
-    files.extend(instruct_files)
-    shutil.rmtree("eval_results", ignore_errors=True)
-    os.makedirs("eval_results", exist_ok=True)
-    for file in files:
-        shutil.copy(file, "eval_results")
-
-    complete_solve_rate = get_solve_rate(complete_data, task="complete")
-    instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
-    solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
-    push_ds(solve_rate_ds, "bigcode/bigcodebench-solve-rate")
-
-    elo_config = {
-        "task_no_tie": (True, True),
-        "benchmark_tie": (False, False),
+    bcb_config = {
+        "": bcb_orig,
+        "-hard": bcb_hard,
     }
-    elo_ds = dict()
-    for config, (task_level, no_tie) in elo_config.items():
-        battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
-        elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-        bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-        bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-        bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-        if config == "task_no_tie":
-            task_elo = bootstrap_lu_median_dict
-        elo = get_bootstrap_scores(elo_mle_bootstrap)
-        elo_ds[config] = elo
-    push_ds(DatasetDict(elo_ds), "bigcode/bigcodebench-elo")
-
-    results = update_elo_rating(results, task_elo)
-    with open("results.json", "w") as f:
-        json.dump(results, f, indent=4)
-    ds = get_hf_ds(results)
-    push_ds(ds, "bigcode/bigcodebench-results")
+    for suffix, bcb in bcb_config.items():
+        results = get_results(bcb["task_id"])
+        files = []
+        complete_data, complete_files = read_task_perf(bcb["task_id"], "complete")
+        instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
+        assert len(model_info) == len(complete_data)
+        with open("task2domain.json", "r") as f:
+            task2domain = json.load(f)
+        domain_complete = get_domain_perf(complete_data, task2domain)
+        domain_instruct = get_domain_perf(instruct_data, task2domain)
+        DatasetDict({"complete": domain_complete, "instruct": domain_instruct}).push_to_hub(f"bigcode/bigcodebench{suffix}-domain")
+
+        files.extend(complete_files)
+        files.extend(instruct_files)
+        shutil.rmtree("eval_results", ignore_errors=True)
+        os.makedirs("eval_results", exist_ok=True)
+        for file in files:
+            shutil.copy(file, "eval_results")
+
+        complete_solve_rate = get_solve_rate(complete_data, task="complete")
+        instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
+        solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
+        push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate")
+
+        elo_config = {
+            "task_no_tie": (True, True),
+            "benchmark_tie": (False, False),
+        }
+        elo_ds = dict()
+        for config, (task_level, no_tie) in elo_config.items():
+            battles = get_winner_df(complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
+            bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
+            bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
+            bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
+            if config == "task_no_tie":
+                task_elo = bootstrap_lu_median_dict
+            elo = get_bootstrap_scores(elo_mle_bootstrap)
+            elo_ds[config] = elo
+        push_ds(DatasetDict(elo_ds), f"bigcode/bigcodebench{suffix}-elo")
+
+        results = update_elo_rating(results, task_elo)
+        with open(f"results{suffix}.json", "w") as f:
+            json.dump(results, f, indent=4)
+        ds = get_hf_ds(results)
+        push_ds(ds, f"bigcode/bigcodebench{suffix}-results")
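The __main__ block now runs the whole pipeline twice, keyed by the dataset suffix, and every Hub push derives its repo name from that suffix. A minimal sketch of where the task-ID lists come from and which repo names the loop targets, assuming the two source datasets load as written in the diff:

from datasets import load_dataset

# The `tids` passed around above are just the task_id column of the loaded split
bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
hard_tids = set(bcb_hard["task_id"])
print(len(hard_tids), "task ids in the hard subset")

# Repo names produced by the suffix loop
for suffix in ["", "-hard"]:
    for kind in ["domain", "solve-rate", "elo", "results"]:
        print(f"bigcode/bigcodebench{suffix}-{kind}")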
