
Commit 5b67995

v0.1.8 Release

2 parents bbe93d6 + 3978502 commit 5b67995

9 files changed, +437 -120 lines changed

analysis/get_results.py

Lines changed: 102 additions & 58 deletions
@@ -9,14 +9,14 @@
 import pandas as pd
 import itertools
 import math
-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
 
-
 def update_model_info(model_info):
     for model, info in model_info.items():
         if "https://huggingface.co/" in info["link"]:
             hf_model = info["link"].split("https://huggingface.co/")[-1]
+            print(hf_model)
             tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
             if tokenizer.chat_template is None:
                 model_info[model]["direct_complete"] = True
@@ -28,7 +28,7 @@ def update_model_info(model_info):
     return model_info
 
 
-def get_results():
+def get_results(tids):
     results = {}
     for model, info in model_info.items():
         results[info["name"]] = {
@@ -41,26 +41,28 @@ def get_results():
                 "instruct-cal": None,
             },
             "prompted": info["prompted"],
+            "moe": info["moe"],
             "size": info["size"],
+            "act_param": info["act_param"],
             "direct_complete": info["direct_complete"],
         }
 
     for model, info in model_info.items():
         model = model.replace("/", "--")
         hf_model = ""
-        if "https://huggingface.co/" in info["link"]:
-            hf_model = info["link"].split("https://huggingface.co/")[-1]
-            model = hf_model.replace("/", "--")
         files = glob(f"results/{model}--bigcodebench-*.json")
         assert files, f"No files found for results/{model}--bigcodebench-*.json"
+        # if "https://huggingface.co/" in info["link"]:
+        #     hf_model = info["link"].split("https://huggingface.co/")[-1]
+        #     model = hf_model.replace("/", "--")
         for file in files:
             _, suffix = os.path.basename(file).split("--bigcodebench-")
             status = []
             with open("results/"+model+"--bigcodebench-"+suffix, "r") as f:
                 data = json.load(f)
-            if len(data["eval"]) != 1140:
-                continue
             for key, value in data["eval"].items():
+                if key not in tids:
+                    continue
                 if value[0]["status"] == "pass":
                     status.append(1)
                 else:
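
Note: this hunk is the first of several that thread a `tids` collection through the analysis functions (also `read_task_perf` and `get_winner_df` below), replacing the hard-coded assumption of 1140 tasks with whatever task IDs the caller supplies. A minimal sketch of the filtering pattern, using made-up evaluation data and a hypothetical ID set rather than anything from the repository:

# Hedged sketch of the per-task filtering introduced above; the eval payload
# and the ID set are illustrative placeholders, not real benchmark data.
sample_eval = {
    "BigCodeBench/0": [{"status": "pass"}],
    "BigCodeBench/1": [{"status": "fail"}],
    "BigCodeBench/2": [{"status": "pass"}],
}
tids = {"BigCodeBench/0", "BigCodeBench/2"}  # e.g. the IDs of a harder subset

status = []
for key, value in sample_eval.items():
    if key not in tids:  # skip tasks outside the chosen subset
        continue
    status.append(1 if value[0]["status"] == "pass" else 0)

print(sum(status) / len(status))  # pass rate over the filtered subset -> 1.0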
@@ -142,17 +144,17 @@ def split_gen():
                 f.writelines(data)
 
 
-def read_task_perf(task="complete"):
+def read_task_perf(tids, task="complete"):
     model_results = dict()
     result_files = []
     for model, info in model_info.items():
         if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
             continue
 
-        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
+        task_perf = dict()
         model = model.replace("/", "--")
-        if info["link"].startswith("https://huggingface.co/"):
-            model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
+        # if info["link"].startswith("https://huggingface.co/"):
+        #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
         try:
             if info["prompted"] and not info["direct_complete"]:
                 files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
@@ -169,22 +171,22 @@ def read_task_perf(task="complete"):
         with open(file, "r") as f:
             data = json.load(f)
         for task_id, perfs in data["eval"].items():
-            status = 1 if perfs[0]["status"] == "pass" else 0
-            task_perf[task_id] = status
+            if task_id in tids:
+                status = 1 if perfs[0]["status"] == "pass" else 0
+                task_perf[task_id] = status
         model_results[info["name"]] = task_perf
     return model_results, result_files
 
 
-def get_winner_df(data_dict, task, task_level=True, no_tie=True):
+def get_winner_df(data_dict, tids, task, task_level=True, no_tie=True):
     winner_dict = {"task_id": [], "model_a": [], "model_b": [], "winner": []}
     if not task_level:
         file = f"{task}_winner_df.csv"
     else:
         file = f"{task}_winner_task_df.csv"
 
     if task_level:
-        for task_id in tqdm(range(1140)):
-            task_id = f"BigCodeBench/{task_id}"
+        for task_id in tqdm(tids):
            # pair without repetition (a, b) and (b, a) are the same
            for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
                solve_rate_a = data_dict[model_a][task_id]
@@ -263,23 +265,51 @@ def update_elo_rating(results, elo_dict):
     return results
 
 
+def get_domain_perf(data_dict, task2domain):
+    domain_perfs = {
+        "Model": [],
+        "Computation": [],
+        "General": [],
+        "Visualization": [],
+        "System": [],
+        "Time": [],
+        "Network": [],
+        "Cryptography": []
+    }
+    for model, task_perf in data_dict.items():
+        model_domain = {"Computation": [], "General": [], "Visualization": [], "System": [], "Time": [], "Network": [], "Cryptography": []}
+        for task_id, status in task_perf.items():
+            domains = task2domain[task_id]
+            for domain in domains:
+                model_domain[domain].append(status)
+        domain_perf = {domain: round(np.mean(perfs)*100, 1) for domain, perfs in model_domain.items()}
+        domain_perfs["Model"].append(model)
+        for domain in model_domain.keys():
+            domain_perfs[domain].append(domain_perf[domain])
+    return Dataset.from_dict(domain_perfs)
+
+
 def get_solve_rate(data_dict, task="complete"):
-    task_solve_count = {f"BigCodeBench/{task_id}": [] for task_id in range(1140)}
+    task_solve_count = dict()
     for model, task_perf in data_dict.items():
-        for task_id in range(1140):
-            task_solve_count[f"BigCodeBench/{task_id}"].append(task_perf[f"BigCodeBench/{task_id}"])
+        for task_id, score in task_perf.items():
+            if task_id not in task_solve_count:
+                task_solve_count[task_id] = []
+            task_solve_count[task_id].append(score)
     solve_rate = {task_id: round(np.mean(perfs) * 100, 1) for task_id, perfs in task_solve_count.items()}
     return Dataset.from_dict({"task_id": list(solve_rate.keys()), "solve_rate": list(solve_rate.values())})
 
 
 def get_hf_ds(results):
-    hf_dataset = {"model": [], "link": [], "size": [], "type": [], "lazy": [], "direct_complete": [],
+    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], "lazy": [], "direct_complete": [],
                   "complete": [], "instruct": [], "elo_mle": []}
 
     for model, result in results.items():
         hf_dataset["model"].append(model)
         hf_dataset["link"].append(result["link"])
+        hf_dataset["moe"].append(result["moe"])
         hf_dataset["size"].append(result["size"])
+        hf_dataset["act_param"].append(result["act_param"])
         hf_dataset["type"].append("🔶" if result["prompted"] else "🟢")
         hf_dataset["lazy"].append(result["lazy"])
         hf_dataset["complete"].append(result["pass@1"]["complete"])
@@ -310,42 +340,56 @@ def push_ds(ds, path, local=False):
 
 if __name__ == "__main__":
 
+    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
+    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
     model_info = update_model_info(model_info)
-    results = get_results()
-    files = []
-    complete_data, complete_files = read_task_perf("complete")
-    instruct_data, instruct_files = read_task_perf("instruct")
-    files.extend(complete_files)
-    files.extend(instruct_files)
-    shutil.rmtree("eval_results", ignore_errors=True)
-    os.makedirs("eval_results", exist_ok=True)
-    for file in files:
-        shutil.copy(file, "eval_results")
-
-    complete_solve_rate = get_solve_rate(complete_data, task="complete")
-    instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
-    solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
-    push_ds(solve_rate_ds, "bigcode/bigcodebench-solve-rate")
-
-    elo_config = {
-        "task_no_tie": (True, True),
-        "benchmark_tie": (False, False),
+    bcb_config = {
+        "": bcb_orig,
+        "-hard": bcb_hard,
     }
-    elo_ds = dict()
-    for config, (task_level, no_tie) in elo_config.items():
-        battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
-        elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-        bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-        bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-        bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-        if config == "task_no_tie":
-            task_elo = bootstrap_lu_median_dict
-        elo = get_bootstrap_scores(elo_mle_bootstrap)
-        elo_ds[config] = elo
-    push_ds(DatasetDict(elo_ds), "bigcode/bigcodebench-elo")
-
-    results = update_elo_rating(results, task_elo)
-    with open("results.json", "w") as f:
-        json.dump(results, f, indent=4)
-    ds = get_hf_ds(results)
-    push_ds(ds, "bigcode/bigcodebench-results")
+    for suffix, bcb in bcb_config.items():
+        results = get_results(bcb["task_id"])
+        files = []
+        complete_data, complete_files = read_task_perf(bcb["task_id"], "complete")
+        instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
+        assert len(model_info) == len(complete_data)
+        with open("task2domain.json", "r") as f:
+            task2domain = json.load(f)
+        domain_complete = get_domain_perf(complete_data, task2domain)
+        domain_instruct = get_domain_perf(instruct_data, task2domain)
+        DatasetDict({"complete": domain_complete, "instruct": domain_instruct}).push_to_hub(f"bigcode/bigcodebench{suffix}-domain")
+
+        files.extend(complete_files)
+        files.extend(instruct_files)
+        shutil.rmtree("eval_results", ignore_errors=True)
+        os.makedirs("eval_results", exist_ok=True)
+        for file in files:
+            shutil.copy(file, "eval_results")
+
+        complete_solve_rate = get_solve_rate(complete_data, task="complete")
+        instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
+        solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
+        push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate")
+
+        elo_config = {
+            "task_no_tie": (True, True),
+            "benchmark_tie": (False, False),
+        }
+        elo_ds = dict()
+        for config, (task_level, no_tie) in elo_config.items():
+            battles = get_winner_df(complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
+            bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
+            bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
+            bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
+            if config == "task_no_tie":
+                task_elo = bootstrap_lu_median_dict
+            elo = get_bootstrap_scores(elo_mle_bootstrap)
+            elo_ds[config] = elo
+        push_ds(DatasetDict(elo_ds), f"bigcode/bigcodebench{suffix}-elo")
+
+        results = update_elo_rating(results, task_elo)
+        with open(f"results{suffix}.json", "w") as f:
+            json.dump(results, f, indent=4)
+        ds = get_hf_ds(results)
+        push_ds(ds, f"bigcode/bigcodebench{suffix}-results")
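
Note: the `__main__` block now runs the whole pipeline twice, once for the full benchmark and once for the `-hard` subset, and suffixes every pushed dataset and results file accordingly. A compact sketch of how the per-subset task IDs that feed `get_results`/`read_task_perf` would be obtained (split names follow the diff; the rest is illustrative and needs network access to the Hub):

# Hedged sketch of deriving the per-subset task IDs used as `tids` above.
from datasets import load_dataset

bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")

for suffix, bcb in {"": bcb_orig, "-hard": bcb_hard}.items():
    tids = set(bcb["task_id"])  # e.g. {"BigCodeBench/0", ...}
    print(suffix or "full", len(tids))
    # results = get_results(tids)  # then push under f"bigcode/bigcodebench{suffix}-results"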
