```diff
@@ -9,7 +9,7 @@
 import pandas as pd
 import itertools
 import math
-from datasets import Dataset
+from datasets import Dataset, DatasetDict
 from transformers import AutoTokenizer
 
 
```
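The new `DatasetDict` import is what enables the consolidation in the hunk below: a `DatasetDict` groups several `Dataset` objects under named splits, so a single `push_to_hub` call publishes all of them to one Hub repo. A minimal sketch of the pattern, with made-up records standing in for the real solve-rate tables and a hypothetical repo name (the repo's `push_ds` helper is assumed to wrap `push_to_hub`):

```python
from datasets import Dataset, DatasetDict

# Hypothetical per-split records standing in for the real solve-rate tables.
complete = Dataset.from_dict({"model": ["m1", "m2"], "solve_rate": [61.0, 48.0]})
instruct = Dataset.from_dict({"model": ["m1", "m2"], "solve_rate": [55.0, 41.0]})

# One repo with two named splits replaces two separate repos;
# load_dataset("user/solve-rate-demo") later returns both splits at once.
ds = DatasetDict({"complete": complete, "instruct": instruct})
ds.push_to_hub("user/solve-rate-demo")  # requires a Hugging Face auth token
```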
```diff
@@ -317,21 +317,27 @@ def push_ds(ds, path, local=False):
     instruct_data = read_task_perf("instruct")
     complete_solve_rate = get_solve_rate(complete_data, task="complete")
     instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
-    push_ds(complete_solve_rate, "bigcode/bigcodebench-complete-solve-rate")
-    push_ds(instruct_solve_rate, "bigcode/bigcodebench-instruct-solve-rate")
+    solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
+    push_ds(solve_rate_ds, "bigcode/bigcodebench-solve-rate")
 
-    task_level = True
-    no_tie = True
-    battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
-    elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-    bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-    bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-    bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-    elo = get_bootstrap_scores(elo_mle_bootstrap)
-    push_ds(elo, "bigcode/bigcodebench-elo")
-    # push_ds(elo, "bigcode/bigcodebench-elo-model-with-tie")
-
-    results = update_elo_rating(results, bootstrap_lu_median_dict)
+    elo_config = {
+        "task_no_tie": (True, True),
+        "benchmark_tie": (False, False),
+    }
+    elo_ds = dict()
+    for config, (task_level, no_tie) in elo_config.items():
+        battles = get_winner_df(complete_data, "complete", task_level=task_level, no_tie=no_tie)
+        elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
+        bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
+        bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
+        bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
+        if config == "task_no_tie":
+            task_elo = bootstrap_lu_median_dict
+        elo = get_bootstrap_scores(elo_mle_bootstrap)
+        elo_ds[config] = elo
+    push_ds(DatasetDict(elo_ds), "bigcode/bigcodebench-elo")
+
+    results = update_elo_rating(results, task_elo)
     with open("results.json", "w") as f:
         json.dump(results, f, indent=4)
     ds = get_hf_ds(results)
```
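One detail in the loop is easy to misread: `astype(int)` truncates toward zero, so adding 0.5 first rounds the (positive) median ratings to the nearest integer. A self-contained sketch of the median-and-round step, using invented bootstrap ratings in place of the real `elo_mle_bootstrap` frame:

```python
import pandas as pd

# Invented bootstrap results: one row per bootstrap round, one column per model.
elo_mle_bootstrap = pd.DataFrame({"m1": [1012.4, 1018.9, 1015.1],
                                  "m2": [987.6, 981.2, 984.8]})

median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
# +0.5 then astype(int) is round-half-up for positive values;
# astype(int) alone would truncate 984.8 down to 984.
median["Elo rating"] = (median["Elo rating"] + 0.5).astype(int)
print(median.set_index("model")["Elo rating"].to_dict())  # {'m1': 1015, 'm2': 985}
```

Taking the median across bootstrap rounds (rather than the mean) keeps the reported rating robust to the occasional extreme resample.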