
Commit 49e3b3c

refactor(analysis): update get results
1 parent c436061 commit 49e3b3c

File tree

1 file changed (+28, -21 lines)


analysis/get_results.py

Lines changed: 28 additions & 21 deletions
@@ -11,6 +11,8 @@
 import math
 from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
+from cuml.linear_model import LogisticRegression
+import cupy as cp
 
 def update_model_info(model_info):
     for model, info in model_info.items():
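The new imports swap scikit-learn for RAPIDS cuML and NumPy for CuPy in the Elo fit further down. A minimal sketch (not part of this commit) of a guarded import that falls back to the CPU stack when no RAPIDS install is available:

    # Hypothetical fallback, assuming cuML/CuPy may be absent on CPU-only machines.
    try:
        from cuml.linear_model import LogisticRegression  # GPU estimator with a scikit-learn-like API
        import cupy as cp
    except ImportError:
        from sklearn.linear_model import LogisticRegression
        import numpy as cp  # NumPy stand-in; any cp.asnumpy() call would still need a shim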
@@ -67,6 +69,8 @@ def get_results(tids):
         data = json.load(f)
         status = []
 
+        if len(data["eval"]) < len(tids):
+            continue
         for key, value in data["eval"].items():
             if key not in tids:
                 continue
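The new guard skips any result file that does not cover every requested task ID. An illustrative helper (not in the commit) that makes the same check explicit and shows what is missing:

    # Illustrative only: which requested task IDs are absent from a result file?
    def missing_tasks(eval_results, tids):
        return set(tids) - set(eval_results)

    # Usage sketch, mirroring the `continue` above:
    # if missing_tasks(data["eval"], tids):
    #     continue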
@@ -163,23 +167,23 @@ def read_task_perf(tids, task="complete"):
         try:
             try:
                 try:
-                    if info["prompted"]:# and not info["direct_complete"]:
-                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                    if info["prompted"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
                         if files:
                             file = files[0]
                         else:
-                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
                     else:
-                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
                 except:
-                    if info["prompted"]:
-                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                    if info["prompted"]:# and not info["direct_complete"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
                         if files:
                             file = files[0]
                         else:
-                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
                     else:
-                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
             except:
                 try:
                     if info["prompted"]:# and not info["direct_complete"]:
@@ -205,6 +209,9 @@ def read_task_perf(tids, task="complete"):
         result_files.append(file)
         with open(file, "r") as f:
             data = json.load(f)
+
+        if len(data["eval"]) < len(tids):
+            continue
         for task_id, perfs in data["eval"].items():
             if task_id in tids:
                 status = 1 if perfs[0]["status"] == "pass" else 0
@@ -271,25 +278,26 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
 
 
 def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
-    from sklearn.linear_model import LogisticRegression
+
+
     models = pd.concat([df["model_a"], df["model_b"]]).unique()
     models = pd.Series(np.arange(len(models)), index=models)
     p = len(models.index)
     n = df.shape[0]
 
-    X = np.zeros([n, p])
-    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
+    X = cp.zeros([n, p])
+    X[cp.arange(n), models[df["model_a"]]] = +math.log(BASE)
+    X[cp.arange(n), models[df["model_b"]]] = -math.log(BASE)
 
-    Y = np.zeros(n)
+    Y = cp.zeros(n)
     Y[df["winner"] == "model_a"] = 1.0
 
     lr = LogisticRegression(fit_intercept=False)
-    lr.fit(X,Y)
+    lr.fit(X, Y)
 
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
 
-    return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
+    return pd.Series(cp.asnumpy(elo_scores), index=models.index).sort_values(ascending=False)
 
 
 def update_elo_rating(results, elo_dict):
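`get_elo_mle` fits a Bradley-Terry-style logistic regression over pairwise battles; this hunk only moves the arrays onto the GPU (CuPy plus cuML), with `cp.asnumpy` converting the scores back for pandas. A usage sketch under the battle format the function assumes (columns `model_a`, `model_b`, and `winner` equal to `"model_a"` or `"model_b"`); the model names here are made up:

    import pandas as pd

    battles = pd.DataFrame({
        "model_a": ["model-x", "model-x", "model-y"],
        "model_b": ["model-y", "model-z", "model-z"],
        "winner":  ["model_a", "model_a", "model_b"],
    })
    elo = get_elo_mle(battles)  # pd.Series of Elo scores, highest first
    print(elo)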
@@ -387,11 +395,10 @@ def get_perf_df(data_dict):
 
 if __name__ == "__main__":
 
-    # bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
-    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
-    # model_info = update_model_info(model_info)
+    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
+    bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
     bcb_config = {
-        # "": bcb_orig,
+        "": bcb_orig,
         "-hard": bcb_hard,
     }
     for suffix, bcb in bcb_config.items():
@@ -401,9 +408,9 @@ def get_perf_df(data_dict):
         instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
         complete_df = get_perf_df(complete_data)
         instruct_df = get_perf_df(instruct_data)
+
         push_ds(DatasetDict({"complete": Dataset.from_pandas(complete_df), "instruct": Dataset.from_pandas(instruct_df)}), f"bigcode/bigcodebench{suffix}-perf")
-        assert len(model_info) == len(complete_data),\
-            f"Missing results for {set([val['name'] for val in model_info.values()]) - set([model for model in complete_data.keys()])}"
+
         with open("task2domain.json", "r") as f:
             task2domain = json.load(f)
         domain_complete = get_domain_perf(complete_data, task2domain)
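With the `""` config re-enabled, `push_ds` uploads a `complete`/`instruct` DatasetDict for both the full and the hard split (repo names come from the f-string above). A sketch of reading one back, assuming the push succeeded and the repo is accessible:

    from datasets import load_dataset

    # Name taken from f"bigcode/bigcodebench{suffix}-perf" with suffix="-hard".
    perf = load_dataset("bigcode/bigcodebench-hard-perf")
    print(perf["complete"].to_pandas().head())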
