Skip to content

Commit bc98b3f

Browse files
authored
Merge pull request #2 from bigcode-project/hf
Release BigCodeBench v0.1.5
2 parents 0a26c44 + 0f0ea6e commit bc98b3f

File tree

10 files changed

+71
-43
lines changed

10 files changed

+71
-43
lines changed

CITATION.cff

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
1-
@article{bigcodebench,
2-
  title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
3-
  author={Zhuo, Terry Yue and Vu, Min Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and Brunner, Simon and Gong, Chen and Hoang, Thong and Zebaze, Armel Randy and Hong, Xiaoheng and Li, Wen-Ding and Kaddour, Jean and Xu, Ming and Zhang, Zhihan and Yadav, Prateek and Jain, Naman and Gu, Alex and Cheng, Zhoujun and Liu, Jiawei and Liu, Qian and Wang, Zijian and Lo, David and Hui, Binyuan and Muennighoff, Niklas and Fried, Daniel and Du, Xiaoning and de Vries, Harm and Von Werra, Leandro},
4-
  year={2024}
5-
}
1+
cff-version: 1.2.0
2+
message: "If you use this work and love it, consider citing it as below \U0001F917"
3+
title: BigCodeBench
4+
authors:
5+
- family-names: BigCodeBench Team
6+
url: https://github.com/bigcode-project/bigcodebench
7+
doi:
8+
date-released: 2024-06-18
9+
license: Apache-2.0
10+
preferred-citation:
11+
type: article
12+
title: "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
13+
authors:
14+
- family-names: BigCodeBench Team
15+
year: 2024
16+
journal:
17+
doi:
18+
url:

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,8 @@ Following which, you can run the built container as shown above.
160160
161161
* `task_id` is the identifier string for the task
162162
* `entry_point` is the name of the function
163-
* `prompt` is the prompt for BigCodeBench-Complete
164-
* `instruction` is the prompt for BigCodeBench-Instruct
163+
* `complete_prompt` is the prompt for BigCodeBench-Complete
164+
* `instruct_prompt` is the prompt for BigCodeBench-Instruct
165165
+ `canonical_solution` is the ground-truth implementation
166166
+ `test` is the `unittest.TestCase` class
167167

bigcodebench/data/bigcodebench.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010
make_cache,
1111
stream_jsonl,
1212
)
13+
from datasets import load_dataset
1314

1415
BIGCODEBENCH_OVERRIDE_PATH = os.environ.get("BIGCODEBENCH_OVERRIDE_PATH", None)
15-
BIGCODEBENCH_VERSION = "v0.1.0"
16+
BIGCODEBENCH_HF = "bigcode/bigcodebench"
17+
BIGCODEBENCH_VERSION = "v0.1.0_hf"
1618

1719
def _ready_bigcodebench_path(mini=False, noextreme=False, version="default") -> str:
1820
if BIGCODEBENCH_OVERRIDE_PATH:
@@ -22,7 +24,12 @@ def _ready_bigcodebench_path(mini=False, noextreme=False, version="default") ->
2224
url, path = get_dataset_metadata(
2325
"BigCodeBench", BIGCODEBENCH_VERSION, mini, noextreme
2426
)
25-
make_cache(url, path)
27+
28+
try:
29+
dataset = load_dataset(BIGCODEBENCH_HF, split=BIGCODEBENCH_VERSION)
30+
make_cache(url, dataset, path)
31+
except:
32+
make_cache(url, None, path, gh=True)
2633

2734
return path
2835

@@ -33,12 +40,14 @@ def get_bigcodebench(
3340
"""Get BigCodeBench from BigCode's github repo and return as a list of parsed dicts.
3441
3542
Returns:
36-
List[Dict[str, str]]: List of dicts with keys "prompt", "test", "entry_point"
43+
List[Dict[str, str]]: List of dicts with keys "complete_prompt", "instruct_prompt", "canonical_solution", "test", "entry_point"
3744
3845
Notes:
3946
"task_id" is the identifier string for the task.
40-
"prompt" is the prompt to be used for the task (function signature with docstrings).
41-
"test" is test-cases wrapped in a `check` function.
47+
"complete_prompt" is the prompt to be used for BigCodeBench-Complete.
48+
"instruct_prompt" is the prompt to be used for BigCodeBench-Instruct.
49+
"canonical_solution" is the ground-truth implementation
50+
"test" is the `unittest.TestCase` class.
4251
"entry_point" is the name of the function.
4352
"""
4453
# Check if open eval file exists in CACHE_DIR

bigcodebench/data/utils.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,29 @@ def get_dataset_metadata(name: str, version: str, mini: bool, noextreme: bool =
2424
return url, cache_path
2525

2626

27-
def make_cache(gzip_url, cache_path):
27+
def make_cache(gzip_url, hf_data, cache_path, gh=False):
2828
# Check if open eval file exists in CACHE_DIR
2929
if not os.path.exists(cache_path):
30-
# Install BigCodeBench dataset and parse as jsonl
31-
print(f"Downloading dataset from {gzip_url}")
32-
with tempdir.TempDir() as tmpdir:
33-
gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
34-
wget.download(gzip_url, gz_path)
30+
31+
if gh:
32+
# Install BigCodeBench dataset and parse as jsonl
33+
print(f"Downloading dataset from {gzip_url}")
34+
with tempdir.TempDir() as tmpdir:
35+
gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
36+
wget.download(gzip_url, gz_path)
3537

36-
with gzip.open(gz_path, "rb") as f:
37-
data = f.read().decode("utf-8")
38+
with gzip.open(gz_path, "rb") as f:
39+
data = f.read().decode("utf-8")
3840

39-
# create CACHE_DIR if not exists
40-
if not os.path.exists(CACHE_DIR):
41-
os.makedirs(CACHE_DIR)
41+
# create CACHE_DIR if not exists
42+
if not os.path.exists(CACHE_DIR):
43+
os.makedirs(CACHE_DIR)
4244

43-
# Write the original open eval file to CACHE_DIR
44-
with open(cache_path, "w") as f:
45-
f.write(data)
45+
# Write the original open eval file to CACHE_DIR
46+
with open(cache_path, "w") as f:
47+
f.write(data)
48+
else:
49+
hf_data.to_json(cache_path)
4650

4751

4852
def write_jsonl(
@@ -152,10 +156,12 @@ def write_directory(directory: PathLike, data: Iterable[Dict]):
152156
def completeness_check(name, data):
153157
for task_id, task in data.items():
154158
for key in [
155-
"prompt",
159+
"complete_prompt",
160+
"instruct_prompt",
156161
"canonical_solution",
162+
"code_prompt",
157163
"test",
158-
"instruction"
164+
"entry_point"
159165
]:
160166
assert key in task, f"{key} not found in {name} #{task_id}!"
161167

bigcodebench/evaluate.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def get_groundtruth(problems, hashcode, check_gt_only):
5050
expected_time = {}
5151
for task_id, problem in tqdm(problems.items()):
5252
expected_time[task_id] = trusted_exec(
53-
problem["prompt"] + "\n" + problem["clean_canonical_solution"],
53+
problem["complete_prompt"] + "\n" + problem["canonical_solution"],
5454
problem["test"],
5555
problem["task_id"],
5656
)
@@ -141,18 +141,18 @@ def evaluate(flags):
141141
solution = (
142142
sample["solution"]
143143
if "solution" in sample
144-
else problems[task_id]["prompt"] + sample["completion"]
144+
else problems[task_id]["complete_prompt"] + sample["completion"]
145145
)
146146
if "sanitized-calibrated" in flags.samples:
147-
solution = problems[task_id]["prompt_wo_doc"] + "\n pass\n" + solution
147+
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
148148
remainings.add(sample["_identifier"])
149149
args = (
150150
completion_id[task_id],
151151
problems[task_id],
152152
solution,
153153
sample["_identifier"],
154154
flags.min_time_limit,
155-
expected_time[task_id] if not flags.no_gt else 20
155+
expected_time[task_id] if expected_time else 20
156156
)
157157
futures.append(executor.submit(check_correctness, *args))
158158
completion_id[task_id] += 1

bigcodebench/generate.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,9 @@ def codegen(
6969

7070
sidx = n_samples - nsamples
7171
while sidx < n_samples:
72-
if subset == "instruct":
73-
prompt = task["instruction"]
74-
elif subset == "complete":
75-
prompt = task["prompt"]
76-
else:
72+
try:
73+
prompt = task[f"{subset}_prompt"]
74+
except:
7775
raise Exception(f"Invalid subset {subset}")
7876
if strip_newlines:
7977
prompt = prompt.strip("\n")
@@ -87,7 +85,7 @@ def codegen(
8785
samples = [
8886
dict(
8987
task_id=task_id,
90-
solution=task["prompt"]+completion
88+
solution=task["complete_prompt"]+completion
9189
)
9290
for task_id, completion in zip([task_id]*len(outputs), outputs)
9391
]

bigcodebench/inspect.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def inspection(args):
3030
os.makedirs(task_path)
3131
task_id_data = problems[task_id]
3232
with open(os.path.join(task_path, "ground_truth.py"), "w") as f:
33-
f.write(task_id_data["prompt"] + "\n\n" + task_id_data["canonical_solution"])
33+
f.write(task_id_data[f"{args.subset}_prompt"] + "\n\n" + task_id_data["canonical_solution"])
3434

3535
# write test
3636
with open(os.path.join(task_path, "test_case.py"), "w") as f:
@@ -49,6 +49,7 @@ def inspection(args):
4949
def main():
5050
parser = argparse.ArgumentParser()
5151
parser.add_argument("--eval-results", required=True, type=str)
52+
parser.add_argument("--subset", required=True, type=str)
5253
parser.add_argument("--in-place", action="store_true")
5354
args = parser.parse_args()
5455

bigcodebench/sanitize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,10 @@ def script(
228228
if "solution" in solution:
229229
old_code = solution["solution"]
230230
if calibrate:
231-
old_code = solution["solution"].replace("```python\n ", "```python\n"+dataset[task_id]["prompt"]+" ")
231+
old_code = solution["solution"].replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ")
232232
else:
233233
assert "completion" in solution
234-
old_code = dataset[task_id]["prompt"] + "\n" + solution["completion"]
234+
old_code = dataset[task_id]["complete_prompt"] + "\n" + solution["completion"]
235235

236236
new_code = sanitize(code=old_code, entrypoint=function_name)
237237
# if changed, print the message

bigcodebench/syncheck.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def script(
4141
id2solutions[task_id] = []
4242
if "solution" not in solution:
4343
assert "completion" in solution, "solution or completion must exist!"
44-
solution["solution"] = dataset[task_id]["prompt"] + solution["completion"]
44+
solution["solution"] = dataset[task_id]["complete_prompt"] + solution["completion"]
4545
id2solutions[task_id].append(solution)
4646

4747
print(colored("==============================", "blue"))

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = bigcodebench
3-
description = "A viable evaluation package for BigCodeBench"
3+
description = "Evaluation package for BigCodeBench"
44
long_description = file: README.md
55
long_description_content_type = text/markdown
66
url = https://github.com/bigcode-project/bigcodebench
@@ -26,6 +26,7 @@ install_requires =
2626
tree_sitter_languages>=1.10.2
2727
tree-sitter==0.21.3
2828
wget>=3.2
29+
datasets
2930

3031
[options.extras_require]
3132
generate =

0 commit comments

Comments
 (0)