Skip to content

Commit bc98b3f

Browse files
authored
Merge pull request #2 from bigcode-project/hf
Release BigCodeBench v0.1.5
2 parents 0a26c44 + 0f0ea6e commit bc98b3f

File tree

10 files changed

+71
-43
lines changed

10 files changed

+71
-43
lines changed

CITATION.cff

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
1-
@article{bigcodebench,
2-
  title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
3-
  author={Zhuo, Terry Yue and Vu, Min Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and Brunner, Simon and Gong, Chen and Hoang, Thong and Zebaze, Armel Randy and Hong, Xiaoheng and Li, Wen-Ding and Kaddour, Jean and Xu, Ming and Zhang, Zhihan and Yadav, Prateek and Jain, Naman and Gu, Alex and Cheng, Zhoujun and Liu, Jiawei and Liu, Qian and Wang, Zijian and Lo, David and Hui, Binyuan and Muennighoff, Niklas and Fried, Daniel and Du, Xiaoning and de Vries, Harm and Von Werra, Leandro},
4-
  year={2024}
5-
}
1+
cff-version: 1.2.0
2+
message: "If you use this work and love it, consider citing it as below \U0001F917"
3+
title: BigCodeBench
4+
authors:
5+
- family-names: BigCodeBench Team
6+
url: https://github.com/bigcode-project/bigcodebench
7+
doi:
8+
date-released: 2024-06-18
9+
license: Apache-2.0
10+
preferred-citation:
11+
type: article
12+
title: "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
13+
authors:
14+
- family-names: BigCodeBench Team
15+
year: 2024
16+
journal:
17+
doi:
18+
url:

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,8 @@ Following which, you can run the built container as shown above.
160160
161161
* `task_id` is the identifier string for the task
162162
* `entry_point` is the name of the function
163-
* `prompt` is the prompt for BigCodeBench-Complete
164-
* `instruction` is the prompt for BigCodeBench-Instruct
163+
* `complete_prompt` is the prompt for BigCodeBench-Complete
164+
* `instruct_prompt` is the prompt for BigCodeBench-Instruct
165165
+ `canonical_solution` is the ground-truth implementation
166166
+ `test` is the `unittest.TestCase` class
167167

bigcodebench/data/bigcodebench.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010
make_cache,
1111
stream_jsonl,
1212
)
13+
from datasets import load_dataset
1314

1415
BIGCODEBENCH_OVERRIDE_PATH = os.environ.get("BIGCODEBENCH_OVERRIDE_PATH", None)
15-
BIGCODEBENCH_VERSION = "v0.1.0"
16+
BIGCODEBENCH_HF = "bigcode/bigcodebench"
17+
BIGCODEBENCH_VERSION = "v0.1.0_hf"
1618

1719
def _ready_bigcodebench_path(mini=False, noextreme=False, version="default") -> str:
1820
if BIGCODEBENCH_OVERRIDE_PATH:
@@ -22,7 +24,12 @@ def _ready_bigcodebench_path(mini=False, noextreme=False, version="default") ->
2224
url, path = get_dataset_metadata(
2325
"BigCodeBench", BIGCODEBENCH_VERSION, mini, noextreme
2426
)
25-
make_cache(url, path)
27+
28+
try:
29+
dataset = load_dataset(BIGCODEBENCH_HF, split=BIGCODEBENCH_VERSION)
30+
make_cache(url, dataset, path)
31+
except:
32+
make_cache(url, None, path, gh=True)
2633

2734
return path
2835

@@ -33,12 +40,14 @@ def get_bigcodebench(
3340
"""Get BigCodeBench from BigCode's github repo and return as a list of parsed dicts.
3441
3542
Returns:
36-
List[Dict[str, str]]: List of dicts with keys "prompt", "test", "entry_point"
43+
List[Dict[str, str]]: List of dicts with keys "complete_prompt", "instruct_prompt", "canonical_solution", "test", "entry_point"
3744
3845
Notes:
3946
"task_id" is the identifier string for the task.
40-
"prompt" is the prompt to be used for the task (function signature with docstrings).
41-
"test" is test-cases wrapped in a `check` function.
47+
"complete_prompt" is the prompt to be used for BigCodeBench-Complete.
48+
"instruct_prompt" is the prompt to be used for BigCodeBench-Instruct.
49+
"canonical_solution" is the ground-truth implementation
50+
"test" is the `unittest.TestCase` class.
4251
"entry_point" is the name of the function.
4352
"""
4453
# Check if open eval file exists in CACHE_DIR

bigcodebench/data/utils.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,29 @@ def get_dataset_metadata(name: str, version: str, mini: bool, noextreme: bool =
2424
return url, cache_path
2525

2626

27-
def make_cache(gzip_url, cache_path):
27+
def make_cache(gzip_url, hf_data, cache_path, gh=False):
2828
# Check if open eval file exists in CACHE_DIR
2929
if not os.path.exists(cache_path):
30-
# Install BigCodeBench dataset and parse as jsonl
31-
print(f"Downloading dataset from {gzip_url}")
32-
with tempdir.TempDir() as tmpdir:
33-
gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
34-
wget.download(gzip_url, gz_path)
30+
31+
if gh:
32+
# Install BigCodeBench dataset and parse as jsonl
33+
print(f"Downloading dataset from {gzip_url}")
34+
with tempdir.TempDir() as tmpdir:
35+
gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
36+
wget.download(gzip_url, gz_path)
3537

36-
with gzip.open(gz_path, "rb") as f:
37-
data = f.read().decode("utf-8")
38+
with gzip.open(gz_path, "rb") as f:
39+
data = f.read().decode("utf-8")
3840

39-
# create CACHE_DIR if not exists
40-
if not os.path.exists(CACHE_DIR):
41-
os.makedirs(CACHE_DIR)
41+
# create CACHE_DIR if not exists
42+
if not os.path.exists(CACHE_DIR):
43+
os.makedirs(CACHE_DIR)
4244

43-
# Write the original open eval file to CACHE_DIR
44-
with open(cache_path, "w") as f:
45-
f.write(data)
45+
# Write the original open eval file to CACHE_DIR
46+
with open(cache_path, "w") as f:
47+
f.write(data)
48+
else:
49+
hf_data.to_json(cache_path)
4650

4751

4852
def write_jsonl(
@@ -152,10 +156,12 @@ def write_directory(directory: PathLike, data: Iterable[Dict]):
152156
def completeness_check(name, data):
153157
for task_id, task in data.items():
154158
for key in [
155-
"prompt",
159+
"complete_prompt",
160+
"instruct_prompt",
156161
"canonical_solution",
162+
"code_prompt",
157163
"test",
158-
"instruction"
164+
"entry_point"
159165
]:
160166
assert key in task, f"{key} not found in {name} #{task_id}!"
161167

bigcodebench/evaluate.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def get_groundtruth(problems, hashcode, check_gt_only):
5050
expected_time = {}
5151
for task_id, problem in tqdm(problems.items()):
5252
expected_time[task_id] = trusted_exec(
53-
problem["prompt"] + "\n" + problem["clean_canonical_solution"],
53+
problem["complete_prompt"] + "\n" + problem["canonical_solution"],
5454
problem["test"],
5555
problem["task_id"],
5656
)
@@ -141,18 +141,18 @@ def evaluate(flags):
141141
solution = (
142142
sample["solution"]
143143
if "solution" in sample
144-
else problems[task_id]["prompt"] + sample["completion"]
144+
else problems[task_id]["complete_prompt"] + sample["completion"]
145145
)
146146
if "sanitized-calibrated" in flags.samples:
147-
solution = problems[task_id]["prompt_wo_doc"] + "\n pass\n" + solution
147+
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
148148
remainings.add(sample["_identifier"])
149149
args = (
150150
completion_id[task_id],
151151
problems[task_id],
152152
solution,
153153
sample["_identifier"],
154154
flags.min_time_limit,
155-
expected_time[task_id] if not flags.no_gt else 20
155+
expected_time[task_id] if expected_time else 20
156156
)
157157
futures.append(executor.submit(check_correctness, *args))
158158
completion_id[task_id] += 1

bigcodebench/generate.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,9 @@ def codegen(
6969

7070
sidx = n_samples - nsamples
7171
while sidx < n_samples:
72-
if subset == "instruct":
73-
prompt = task["instruction"]
74-
elif subset == "complete":
75-
prompt = task["prompt"]
76-
else:
72+
try:
73+
prompt = task[f"{subset}_prompt"]
74+
except:
7775
raise Exception(f"Invalid subset {subset}")
7876
if strip_newlines:
7977
prompt = prompt.strip("\n")
@@ -87,7 +85,7 @@ def codegen(
8785
samples = [
8886
dict(
8987
task_id=task_id,
90-
solution=task["prompt"]+completion
88+
solution=task["complete_prompt"]+completion
9189
)
9290
for task_id, completion in zip([task_id]*len(outputs), outputs)
9391
]

bigcodebench/inspect.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def inspection(args):
3030
os.makedirs(task_path)
3131
task_id_data = problems[task_id]
3232
with open(os.path.join(task_path, "ground_truth.py"), "w") as f:
33-
f.write(task_id_data["prompt"] + "\n\n" + task_id_data["canonical_solution"])
33+
f.write(task_id_data[f"{args.subset}_prompt"] + "\n\n" + task_id_data["canonical_solution"])
3434

3535
# write test
3636
with open(os.path.join(task_path, "test_case.py"), "w") as f:
@@ -49,6 +49,7 @@ def inspection(args):
4949
def main():
5050
parser = argparse.ArgumentParser()
5151
parser.add_argument("--eval-results", required=True, type=str)
52+
parser.add_argument("--subset", required=True, type=str)
5253
parser.add_argument("--in-place", action="store_true")
5354
args = parser.parse_args()
5455

bigcodebench/sanitize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,10 @@ def script(
228228
if "solution" in solution:
229229
old_code = solution["solution"]
230230
if calibrate:
231-
old_code = solution["solution"].replace("```python\n ", "```python\n"+dataset[task_id]["prompt"]+" ")
231+
old_code = solution["solution"].replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ")
232232
else:
233233
assert "completion" in solution
234-
old_code = dataset[task_id]["prompt"] + "\n" + solution["completion"]
234+
old_code = dataset[task_id]["complete_prompt"] + "\n" + solution["completion"]
235235

236236
new_code = sanitize(code=old_code, entrypoint=function_name)
237237
# if changed, print the message

bigcodebench/syncheck.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def script(
4141
id2solutions[task_id] = []
4242
if "solution" not in solution:
4343
assert "completion" in solution, "solution or completion must exist!"
44-
solution["solution"] = dataset[task_id]["prompt"] + solution["completion"]
44+
solution["solution"] = dataset[task_id]["complete_prompt"] + solution["completion"]
4545
id2solutions[task_id].append(solution)
4646

4747
print(colored("==============================", "blue"))

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = bigcodebench
3-
description = "A viable evaluation package for BigCodeBench"
3+
description = "Evaluation package for BigCodeBench"
44
long_description = file: README.md
55
long_description_content_type = text/markdown
66
url = https://github.com/bigcode-project/bigcodebench
@@ -26,6 +26,7 @@ install_requires =
2626
tree_sitter_languages>=1.10.2
2727
tree-sitter==0.21.3
2828
wget>=3.2
29+
datasets
2930

3031
[options.extras_require]
3132
generate =

0 commit comments

Comments
 (0)