Skip to content

Commit f2cf707

Browse files
committed
fix: merge hf dataset
1 parent 3768bd0 commit f2cf707

File tree

1 file changed

+21
-15
lines changed

1 file changed

+21
-15
lines changed

analysis/get_results.py

Lines changed: 21 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import pandas as pd
1010
import itertools
1111
import math
12-
from datasets import Dataset
12+
from datasets import Dataset, DatasetDict
1313
from transformers import AutoTokenizer
1414

1515

@@ -317,21 +317,27 @@ def push_ds(ds, path, local=False):
317317
instruct_data = read_task_perf("instruct")
318318
complete_solve_rate = get_solve_rate(complete_data, task="complete")
319319
instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
320-
push_ds(complete_solve_rate, "bigcode/bigcodebench-complete-solve-rate")
321-
push_ds(instruct_solve_rate, "bigcode/bigcodebench-instruct-solve-rate")
320+
solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
321+
push_ds(solve_rate_ds, "bigcode/bigcodebench-solve-rate")
322322

323-
task_level = True
324-
no_tie = True
325-
battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
326-
elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
327-
bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
328-
bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
329-
bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
330-
elo = get_bootstrap_scores(elo_mle_bootstrap)
331-
push_ds(elo, "bigcode/bigcodebench-elo")
332-
# push_ds(elo, "bigcode/bigcodebench-elo-model-with-tie")
333-
334-
results = update_elo_rating(results, bootstrap_lu_median_dict)
323+
elo_config = {
324+
"task_no_tie": (True, True),
325+
"benchmark_tie": (False, False),
326+
}
327+
elo_ds = dict()
328+
for config, (task_level, no_tie) in elo_config.items():
329+
battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
330+
elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
331+
bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
332+
bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
333+
bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
334+
if config == "task_no_tie":
335+
task_elo = bootstrap_lu_median_dict
336+
elo = get_bootstrap_scores(elo_mle_bootstrap)
337+
elo_ds[config] = elo
338+
push_ds(DatasetDict(elo_ds), "bigcode/bigcodebench-elo")
339+
340+
results = update_elo_rating(results, task_elo)
335341
with open("results.json", "w") as f:
336342
json.dump(results, f, indent=4)
337343
ds = get_hf_ds(results)

0 commit comments

Comments (0)