Skip to content

Commit dcd5d27

Browse files
authored
release: v0.1.6
release: v0.1.6 with new features supported
2 parents c9e8a79 + edcf5c6 commit dcd5d27

File tree

6 files changed

+151
-54
lines changed

6 files changed

+151
-54
lines changed

README.md

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,13 @@ You are strongly recommended to use a sandbox such as [docker](https://docs.dock
225225
226226
```bash
227227
# Mount the current directory to the container
228-
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated
229-
# ...Or locally ⚠️
230-
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated
231-
# ...If the ground truth is working locally (due to some flaky tests)
232-
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated --no-gt
228+
# If you want to change the RAM address space limit (in MB, 128 GB by default): `--max-as-limit XXX`
229+
# If you want to change the RAM data segment limit (in MB, 4 GB by default): `--max-data-limit XXX`
230+
# If you want to change the RAM stack limit (in MB, 5 MB by default): `--max-stack-limit XXX`
231+
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
232+
233+
# If you only want to check the ground truths
234+
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --check-gt-only
233235
```
234236
235237
...Or if you want to try it locally regardless of the risks ⚠️:
@@ -245,8 +247,8 @@ Then, run the evaluation:
245247
```bash
246248
# ...Or locally ⚠️
247249
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
248-
# ...If the ground truth is not working locally
249-
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated --no-gt
250+
# ...If you really don't want to check the ground truths
251+
bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --no-gt
250252
```
251253
252254
> [!Tip]
@@ -276,8 +278,9 @@ Reading samples...
276278
1140it [00:00, 1901.64it/s]
277279
Evaluating samples...
278280
100%|██████████████████████████████████████████| 1140/1140 [19:53<00:00, 6.75it/s]
279-
bigcodebench
280-
{'pass@1': 0.568}
281+
BigCodeBench-instruct-calibrated
282+
Groundtruth pass rate: 1.000
283+
pass@1: 0.568
281284
```
282285
283286
- The "k" includes `[1, 5, 10]` where k values `<=` the sample size will be used
@@ -330,9 +333,7 @@ We share pre-generated code samples from LLMs we have [evaluated](https://huggin
330333
331334
## Known Issues
332335
333-
- [ ] We notice that some tasks heavily use memory for scientific modeling during testing. This will lead to timeout issues for some machines. If you get an error message like `Check failed: ret == 0 (11 vs. 0)Thread creation via pthread_create() failed.` in Tensorflow, it is very likely due to the memory issue. Try to allocate more memory to the process or reduce the number of parallel processes.
334-
335-
- [ ] Due to the flakes in the evaluation, the execution results may vary slightly (~0.2%) between runs. We are working on improving the evaluation stability.
336+
- [ ] Due to the flakes in the evaluation, the execution results may vary slightly (~0.1%) between runs. We are working on improving the evaluation stability.
336337
337338
- [ ] We are aware of the issue of some users needing to use a proxy to access the internet. We are working on a subset of the tasks that do not require internet access to evaluate the code.
338339

bigcodebench/eval/__init__.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ def unsafe_execute(
110110
code: str,
111111
test_code: str,
112112
timeout: float,
113+
max_as_limit: float,
114+
max_data_limit: float,
115+
max_stack_limit: float,
113116
stat, # Value
114117
details, # Array
115118
):
@@ -123,9 +126,7 @@ def unsafe_execute(
123126
rmdir = os.rmdir
124127
chdir = os.chdir
125128
# Disable functionalities that can make destructive changes to the test.
126-
# allow only 128GB memory usage
127-
maximum_memory_bytes = 128 * 1024 * 1024 * 1024
128-
reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
129+
reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
129130
module_name = "__test__"
130131
new_module = types.ModuleType(module_name)
131132
# Set necessary attributes for the module
@@ -170,11 +171,14 @@ def untrusted_check(
170171
code: str,
171172
test_code: str,
172173
entry_point: str,
174+
max_as_limit: float,
175+
max_data_limit: float,
176+
max_stack_limit: float,
173177
min_time_limit: float = 10,
174178
gt_time_limit: float = 60
175179
) -> Tuple[str, np.ndarray]:
176180
time_limit = max(min_time_limit, gt_time_limit)
177-
timeout = max(os.getenv("EVALPLUS_TIMEOUT_PER_TASK", 120), time_limit) + 1
181+
timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", 120), time_limit) + 1
178182
# shared memory objects
179183
stat = Value("i", _UNKNOWN)
180184
manager = Manager()
@@ -187,6 +191,9 @@ def untrusted_check(
187191
code,
188192
test_code,
189193
timeout,
194+
max_as_limit,
195+
max_data_limit,
196+
max_stack_limit,
190197
stat,
191198
details,
192199
),

bigcodebench/eval/utils.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ class redirect_stdin(contextlib._RedirectStream): # type: ignore
258258
_stream = "stdin"
259259

260260

261-
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
261+
def reliability_guard(max_as_limit, max_data_limit, max_stack_limit):
262262
"""
263263
This disables various destructive functions and prevents the generated code
264264
from interfering with the test (e.g. fork bomb, killing other processes,
@@ -282,18 +282,22 @@ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
282282
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
283283
os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
284284

285-
if maximum_memory_bytes is not None:
285+
if max_as_limit and max_data_limit and max_stack_limit:
286286
import resource
287-
287+
288+
max_as_limit = max_as_limit * 1024 * 1024
289+
max_data_limit = max_data_limit * 1024 * 1024
290+
max_stack_limit = max_stack_limit * 1024 * 1024
291+
288292
resource.setrlimit(
289-
resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
293+
resource.RLIMIT_AS, (max_as_limit, max_as_limit)
290294
)
291295
resource.setrlimit(
292-
resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
296+
resource.RLIMIT_DATA, (max_data_limit, max_data_limit)
293297
)
294298
if not platform.uname().system == "Darwin":
295299
resource.setrlimit(
296-
resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
300+
resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit)
297301
)
298302

299303
faulthandler.disable()

bigcodebench/evaluate.py

Lines changed: 63 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,14 @@
2727
estimate_pass_at_k,
2828
untrusted_check,
2929
)
30-
from bigcodebench.gen.util import trusted_exec
30+
from bigcodebench.gen.util import trusted_check
3131

3232
# 1st item: the status
3333
# 2nd item (optional): the detailed pass/fail boolean for each input
3434
Result = Tuple[str, List[bool]]
3535

3636

37-
def get_groundtruth(problems, hashcode, check_gt_only):
37+
def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit):
3838
cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
3939
if os.path.exists(cache_file):
4040
if check_gt_only:
@@ -47,13 +47,29 @@ def get_groundtruth(problems, hashcode, check_gt_only):
4747
os.makedirs(CACHE_DIR, exist_ok=True)
4848
print("\nAsserting the groundtruth...")
4949
tbegin = time.time()
50-
expected_time = {}
51-
for task_id, problem in tqdm(problems.items()):
52-
expected_time[task_id] = trusted_exec(
53-
problem["complete_prompt"] + "\n" + problem["canonical_solution"],
54-
problem["test"],
55-
problem["task_id"],
56-
)
50+
51+
with ProcessPoolExecutor(max_workers=n_workers) as executor:
52+
futures = []
53+
n_samples = 0
54+
expected_time = dict()
55+
56+
for problem in problems.values():
57+
args = (
58+
problem["complete_prompt"] + "\n" + problem["canonical_solution"],
59+
problem["test"],
60+
problem["task_id"],
61+
max_as_limit,
62+
max_data_limit,
63+
max_stack_limit
64+
)
65+
66+
futures.append(executor.submit(trusted_check, *args))
67+
n_samples += 1
68+
69+
for future in tqdm(as_completed(futures), total=n_samples):
70+
result = future.result()
71+
expected_time[result["task_id"]] = result["time"]
72+
5773
print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")
5874

5975
with open(cache_file, "wb") as f:
@@ -65,9 +81,12 @@ def check_correctness(
6581
completion_id: int,
6682
problem: Dict[str, Any],
6783
solution: str,
84+
max_as_limit: float,
85+
max_data_limit: float,
86+
max_stack_limit: float,
6887
identifier=None,
6988
min_time_limit: float = 0.1,
70-
gt_time_limit: float = 2.0
89+
gt_time_limit: float = 2.0,
7190
) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details)
7291
ret = {
7392
"completion_id": completion_id,
@@ -79,8 +98,11 @@ def check_correctness(
7998
solution,
8099
problem["test"],
81100
problem["entry_point"],
101+
max_as_limit,
102+
max_data_limit,
103+
max_stack_limit,
82104
min_time_limit,
83-
gt_time_limit
105+
gt_time_limit,
84106
)
85107
return ret
86108

@@ -101,18 +123,21 @@ def evaluate(flags):
101123
assert flags.samples.endswith(".jsonl")
102124
result_path = flags.samples.replace(".jsonl", "_eval_results.json")
103125

126+
problems = get_bigcodebench()
127+
dataset_hash = get_bigcodebench_hash()
128+
129+
if not flags.no_gt:
130+
expected_time = get_groundtruth(n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit)
131+
else:
132+
expected_time = {task_id: None for task_id in problems}
133+
104134
if os.path.isfile(result_path):
105135
print(f"Load from previous results from {result_path}")
106136
with open(result_path, "r") as f:
107137
results = json.load(f)
108138

109139
results = compatible_eval_result(results)
110140
else:
111-
problems = get_bigcodebench()
112-
dataset_hash = get_bigcodebench_hash()
113-
expected_time = None
114-
if not flags.no_gt:
115-
expected_time = get_groundtruth(problems, dataset_hash, flags.check_gt_only)
116141

117142
if flags.check_gt_only:
118143
return
@@ -150,9 +175,12 @@ def evaluate(flags):
150175
completion_id[task_id],
151176
problems[task_id],
152177
solution,
178+
flags.max_as_limit,
179+
flags.max_data_limit,
180+
flags.max_stack_limit,
153181
sample["_identifier"],
154182
flags.min_time_limit,
155-
expected_time[task_id] if expected_time else 20
183+
expected_time[task_id] if expected_time[task_id] else 20
156184
)
157185
futures.append(executor.submit(check_correctness, *args))
158186
completion_id[task_id] += 1
@@ -208,7 +236,21 @@ def stucking_checker():
208236
for k in [1, 5, 10, 25, 100]
209237
if total.min() >= k
210238
}
211-
cprint(f"BigCodeBench-{flags.subset}", "green")
239+
240+
mode = "-calibrated" if "sanitized-calibrated" in flags.samples else ""
241+
flags.subset = flags.subset[0].upper() + flags.subset[1:]
242+
cprint(f"BigCodeBench-{flags.subset}{mode}", "green")
243+
244+
gt_pass_rate = np.mean([1 if v is not None else 0 for v in expected_time.values()])
245+
246+
if flags.no_gt:
247+
cprint(f"Groundtruth is not checked", "yellow")
248+
else:
249+
if gt_pass_rate > 0.95:
250+
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
251+
else:
252+
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
253+
212254
for k, v in pass_at_k.items():
213255
cprint(f"{k}:\t{v:.3f}", "green")
214256

@@ -240,6 +282,9 @@ def main():
240282
parser.add_argument("--samples", required=True, type=str)
241283
parser.add_argument("--parallel", default=None, type=int)
242284
parser.add_argument("--min-time-limit", default=1, type=float)
285+
parser.add_argument("--max-as-limit", default=128*1024, type=float)
286+
parser.add_argument("--max-data-limit", default=4*1024, type=float)
287+
parser.add_argument("--max-stack-limit", default=5, type=float)
243288
parser.add_argument(
244289
"--check-gt-only", action="store_true", help="Check the groundtruth"
245290
)

bigcodebench/gen/util/__init__.py

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import os
12
import time
23
import sys
34
import types
45
import unittest
5-
6+
import multiprocessing
7+
from multiprocessing import Array, Value, Manager
68
from bigcodebench.eval.utils import (
79
create_tempdir,
810
reliability_guard,
@@ -12,7 +14,7 @@
1214
)
1315

1416

15-
def trusted_exec(code, test_code, task_id):
17+
def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_stack_limit, times):
1618
"""Execute trusted code in place."""
1719

1820
with create_tempdir():
@@ -25,8 +27,7 @@ def trusted_exec(code, test_code, task_id):
2527
chdir = os.chdir
2628
module_name = "__test__"
2729
new_module = types.ModuleType(module_name)
28-
maximum_memory_bytes = 128 * 1024 * 1024 * 1024
29-
reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
30+
reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
3031
# Set necessary attributes for the module
3132
new_module.__dict__.update({
3233
'__builtins__': builtins,
@@ -50,19 +51,19 @@ def trusted_exec(code, test_code, task_id):
5051
suite = loader.loadTestsFromTestCase(TestCases)
5152
test_result = unittest.TestResult()
5253
start = time.time()
53-
with safe_environment(), swallow_io():
54+
with safe_environment(), swallow_io(), time_limit(seconds=120):
5455
suite.run(test_result)
55-
for test, trace in test_result.failures + test_result.errors:
56-
print(trace)
56+
57+
if len(test_result.failures + test_result.errors) > 0:
58+
times.value = -1
59+
else:
60+
times.value = time.time() - start
61+
5762
# Needed for cleaning up.
5863
shutil.rmtree = rmtree
5964
os.rmdir = rmdir
6065
os.chdir = chdir
61-
assert len(
62-
test_result.failures + test_result.errors
63-
) == 0, f"{task_id} failed with errors: {test_result.errors} and failures: {test_result.failures}"
6466

65-
return time.time() - start
6667

6768
def trusted_check_exec(code, inputs):
6869
"""Check trusted_exec success."""
@@ -72,3 +73,45 @@ def trusted_check_exec(code, inputs):
7273
except Exception:
7374
return False
7475
return True
76+
77+
78+
def trusted_check(
79+
code: str,
80+
test_code: str,
81+
task_id: str,
82+
max_as_limit: float,
83+
max_data_limit: float,
84+
max_stack_limit: float,
85+
):
86+
timeout = os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", 120) + 1
87+
# shared memory objects
88+
times = Value("d")
89+
manager = Manager()
90+
91+
p = multiprocessing.Process(
92+
target=trusted_exec,
93+
args=(
94+
code,
95+
test_code,
96+
task_id,
97+
max_as_limit,
98+
max_data_limit,
99+
max_stack_limit,
100+
times,
101+
),
102+
)
103+
p.start()
104+
p.join(timeout=timeout+1)
105+
if p.is_alive():
106+
p.terminate()
107+
time.sleep(0.1)
108+
if p.is_alive():
109+
p.kill()
110+
time.sleep(0.1)
111+
112+
if times.value == -1:
113+
times = -1
114+
else:
115+
times = times.value
116+
117+
return {"task_id": task_id, "time": times}

0 commit comments

Comments
 (0)