Skip to content

Commit 2498259

Browse files
committed
2 parents 312321d + fce1f38 commit 2498259

File tree

2 files changed

+136
-21
lines changed

2 files changed

+136
-21
lines changed

analysis/get_results.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import math
1212
from datasets import Dataset, DatasetDict, load_dataset
1313
from transformers import AutoTokenizer
14+
from cuml.linear_model import LogisticRegression
15+
import cupy as cp
1416

1517
def update_model_info(model_info):
1618
for model, info in model_info.items():
@@ -67,6 +69,8 @@ def get_results(tids):
6769
data = json.load(f)
6870
status = []
6971

72+
if len(data["eval"]) < len(tids):
73+
continue
7074
for key, value in data["eval"].items():
7175
if key not in tids:
7276
continue
@@ -163,23 +167,23 @@ def read_task_perf(tids, task="complete"):
163167
try:
164168
try:
165169
try:
166-
if info["prompted"]:# and not info["direct_complete"]:
167-
files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
170+
if info["prompted"]:
171+
files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
168172
if files:
169173
file = files[0]
170174
else:
171-
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
175+
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
172176
else:
173-
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
177+
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
174178
except:
175-
if info["prompted"]:
176-
files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
179+
if info["prompted"]:# and not info["direct_complete"]:
180+
files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
177181
if files:
178182
file = files[0]
179183
else:
180-
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
184+
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
181185
else:
182-
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
186+
file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
183187
except:
184188
try:
185189
if info["prompted"]:# and not info["direct_complete"]:
@@ -205,6 +209,9 @@ def read_task_perf(tids, task="complete"):
205209
result_files.append(file)
206210
with open(file, "r") as f:
207211
data = json.load(f)
212+
213+
if len(data["eval"]) < len(tids):
214+
continue
208215
for task_id, perfs in data["eval"].items():
209216
if task_id in tids:
210217
status = 1 if perfs[0]["status"] == "pass" else 0
@@ -271,25 +278,26 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
271278

272279

273280
def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Elo-style ratings with a Bradley-Terry logistic regression on GPU.

    Each battle row contributes one design-matrix row: +log(BASE) in the
    column of ``model_a`` and -log(BASE) in the column of ``model_b``; the
    label is 1 when ``model_a`` won. The fitted coefficient for each model,
    rescaled by SCALE and shifted by INIT_RATING, is its Elo score.

    Args:
        df: DataFrame with columns "model_a", "model_b", "winner"
            (winner == "model_a" marks an a-side win; anything else is
            treated as a loss for model_a).
        SCALE: Elo scale factor (default 400).
        BASE: Elo logarithm base (default 10).
        INIT_RATING: rating offset added to every score (default 1000).

    Returns:
        pd.Series of Elo scores indexed by model name, descending.
    """
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)
    p = len(models.index)
    n = df.shape[0]

    X = cp.zeros([n, p])
    # CuPy fancy indexing does not accept pandas Series — convert the
    # model-index lookups to device arrays first.
    idx_a = cp.asarray(models[df["model_a"]].to_numpy())
    idx_b = cp.asarray(models[df["model_b"]].to_numpy())
    X[cp.arange(n), idx_a] = +math.log(BASE)
    X[cp.arange(n), idx_b] = -math.log(BASE)

    Y = cp.zeros(n)
    # Boolean mask must likewise be a CuPy array, not a pandas Series.
    Y[cp.asarray((df["winner"] == "model_a").to_numpy())] = 1.0

    # cuML LogisticRegression (imported at module top) runs the fit on GPU.
    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # Bring scores back to host memory before handing them to pandas.
    return pd.Series(cp.asnumpy(elo_scores), index=models.index).sort_values(ascending=False)
293301

294302

295303
def update_elo_rating(results, elo_dict):
@@ -387,11 +395,10 @@ def get_perf_df(data_dict):
387395

388396
if __name__ == "__main__":
389397

390-
# bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
391-
bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
392-
# model_info = update_model_info(model_info)
398+
bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
399+
bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
393400
bcb_config = {
394-
# "": bcb_orig,
401+
"": bcb_orig,
395402
"-hard": bcb_hard,
396403
}
397404
for suffix, bcb in bcb_config.items():
@@ -401,9 +408,9 @@ def get_perf_df(data_dict):
401408
instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
402409
complete_df = get_perf_df(complete_data)
403410
instruct_df = get_perf_df(instruct_data)
411+
404412
push_ds(DatasetDict({"complete": Dataset.from_pandas(complete_df), "instruct": Dataset.from_pandas(instruct_df)}), f"bigcode/bigcodebench{suffix}-perf")
405-
assert len(model_info) == len(complete_data),\
406-
f"Missing results for {set([val['name'] for val in model_info.values()]) - set([model for model in complete_data.keys()])}"
413+
407414
with open("task2domain.json", "r") as f:
408415
task2domain = json.load(f)
409416
domain_complete = get_domain_perf(complete_data, task2domain)

analysis/utils.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,4 +1169,112 @@
11691169
"act_param": None,
11701170
"open-data": "None",
11711171
},
1172+
"Qwen/Qwen2.5-Coder-1.5B-Instruct": {
1173+
"name": "Qwen2.5-Coder-1.5B-Instruct",
1174+
"link": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct",
1175+
"prompted": True,
1176+
"moe": False,
1177+
"size": 1.5,
1178+
"act_param": 1.5,
1179+
"open-data": "None",
1180+
},
1181+
"Qwen/Qwen2.5-Coder-7B-Instruct": {
1182+
"name": "Qwen2.5-Coder-7B-Instruct",
1183+
"link": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
1184+
"prompted": True,
1185+
"moe": False,
1186+
"size": 7,
1187+
"act_param": 7,
1188+
"open-data": "None",
1189+
},
1190+
"gemini-1.5-pro-002": {
1191+
"name": "Gemini-1.5-Pro-002",
1192+
"link": "https://deepmind.google/technologies/gemini/pro",
1193+
"prompted": True,
1194+
"moe": False,
1195+
"size": None,
1196+
"act_param": None,
1197+
"open-data": "None",
1198+
},
1199+
"mistralai/Mistral-Small-Instruct-2409": {
1200+
"name": "Mistral-Small-Instruct-2409",
1201+
"link": "https://huggingface.co/mistralai/Mistral-Small-Instruct-2409",
1202+
"prompted": True,
1203+
"moe": False,
1204+
"size": 22.2,
1205+
"act_param": 22.2,
1206+
"open-data": "None",
1207+
},
1208+
"Qwen/Qwen2.5-0.5B-Instruct": {
1209+
"name": "Qwen2.5-0.5B-Instruct",
1210+
"link": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct",
1211+
"prompted": True,
1212+
"moe": False,
1213+
"size": 0.5,
1214+
"act_param": 0.5,
1215+
"open-data": "None",
1216+
},
1217+
"Qwen/Qwen2.5-1.5B-Instruct": {
1218+
"name": "Qwen2.5-1.5B-Instruct",
1219+
"link": "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct",
1220+
"prompted": True,
1221+
"moe": False,
1222+
"size": 1.5,
1223+
"act_param": 1.5,
1224+
"open-data": "None",
1225+
},
1226+
"Qwen/Qwen2.5-7B-Instruct": {
1227+
"name": "Qwen2.5-7B-Instruct",
1228+
"link": "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct",
1229+
"prompted": True,
1230+
"moe": False,
1231+
"size": 7,
1232+
"act_param": 7,
1233+
"open-data": "None",
1234+
},
1235+
"Qwen/Qwen2.5-14B-Instruct": {
1236+
"name": "Qwen2.5-14B-Instruct",
1237+
"link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
1238+
"prompted": True,
1239+
"moe": False,
1240+
"size": 14,
1241+
"act_param": 14,
1242+
"open-data": "None",
1243+
},
1244+
"Qwen/Qwen2.5-32B-Instruct": {
1245+
"name": "Qwen2.5-32B-Instruct",
1246+
"link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
1247+
"prompted": True,
1248+
"moe": False,
1249+
"size": 32,
1250+
"act_param": 32,
1251+
"open-data": "None",
1252+
},
1253+
"Qwen/Qwen2.5-72B-Instruct": {
1254+
"name": "Qwen2.5-72B-Instruct",
1255+
"link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
1256+
"prompted": True,
1257+
"moe": False,
1258+
"size": 72,
1259+
"act_param": 72,
1260+
"open-data": "None",
1261+
},
1262+
"meta-llama/Llama-3.2-1B-Instruct": {
1263+
"name": "Llama-3.2-1B-Instruct",
1264+
"link": "https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct",
1265+
"prompted": True,
1266+
"moe": False,
1267+
"size": 1,
1268+
"act_param": 1,
1269+
"open-data": "None",
1270+
},
1271+
"meta-llama/Llama-3.2-3B-Instruct": {
1272+
"name": "Llama-3.2-3B-Instruct",
1273+
"link": "https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct",
1274+
"prompted": True,
1275+
"moe": False,
1276+
"size": 3,
1277+
"act_param": 3,
1278+
"open-data": "None",
1279+
},
11721280
}

0 commit comments

Comments
 (0)