feat: add arguments for passing RAM limits

terryyz · terryyz · commit 2f2e49bfeb3c · 2024-06-27T02:29:32.000+08:00
diff --git a/bigcodebench/eval/__init__.py b/bigcodebench/eval/__init__.py
@@ -110,6 +110,9 @@ def unsafe_execute(
     code: str,
     test_code: str,
     timeout: float,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
     stat,  # Value
     details,  # Array
 ):
@@ -123,9 +126,7 @@ def unsafe_execute(
         rmdir = os.rmdir
         chdir = os.chdir
         # Disable functionalities that can make destructive changes to the test.
-        # allow only 128GB memory usage
-        maximum_memory_bytes = 128 * 1024 * 1024 * 1024
-        reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
+        reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
         module_name = "__test__"
         new_module = types.ModuleType(module_name)
         # Set necessary attributes for the module
@@ -170,11 +171,14 @@ def untrusted_check(
     code: str,
     test_code: str,
     entry_point: str,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
     min_time_limit: float = 10,
     gt_time_limit: float = 60
 ) -> Tuple[str, np.ndarray]:
     time_limit = max(min_time_limit, gt_time_limit)
-    timeout = max(os.getenv("EVALPLUS_TIMEOUT_PER_TASK", 120), time_limit) + 1
+    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", 120), time_limit) + 1
     # shared memory objects
     stat = Value("i", _UNKNOWN)
     manager = Manager()
@@ -187,6 +191,9 @@ def untrusted_check(
             code,
             test_code,
             timeout,
+            max_as_limit,
+            max_data_limit,
+            max_stack_limit,
             stat,
             details,
         ),
diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py
@@ -258,7 +258,7 @@ class redirect_stdin(contextlib._RedirectStream):  # type: ignore
     _stream = "stdin"
 
 
-def reliability_guard(maximum_memory_bytes: Optional[int] = None):
+def reliability_guard(max_as_limit, max_data_limit, max_stack_limit):
     """
     This disables various destructive functions and prevents the generated code
     from interfering with the test (e.g. fork bomb, killing other processes,
@@ -282,18 +282,18 @@ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 
     os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
     
-    if maximum_memory_bytes is not None:
+    if maximum_memory_bytes:
         import resource
 
         resource.setrlimit(
-            resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
+            resource.RLIMIT_AS, (max_as_limit, max_as_limit)
         )
         resource.setrlimit(
-            resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
+            resource.RLIMIT_DATA, (max_data_limit, max_data_limit)
         )
         if not platform.uname().system == "Darwin":
             resource.setrlimit(
-                resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
+                resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit)
             )
 
     faulthandler.disable()
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
@@ -34,7 +34,7 @@
 Result = Tuple[str, List[bool]]
 
 
-def get_groundtruth(problems, hashcode, check_gt_only):
+def get_groundtruth(problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit):
     cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
     if os.path.exists(cache_file):
         if check_gt_only:
@@ -53,6 +53,9 @@ def get_groundtruth(problems, hashcode, check_gt_only):
             problem["complete_prompt"] + "\n" + problem["canonical_solution"],
             problem["test"],
             problem["task_id"],
+            max_as_limit,
+            max_data_limit,
+            max_stack_limit
         )
     print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")
     
@@ -65,9 +68,12 @@ def check_correctness(
     completion_id: int,
     problem: Dict[str, Any],
     solution: str,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
     identifier=None,
     min_time_limit: float = 0.1,
-    gt_time_limit: float = 2.0
+    gt_time_limit: float = 2.0,
 ) -> Dict[str, Result]:  # {...}, "base" | "plus" -> (status, details)
     ret = {
         "completion_id": completion_id,
@@ -79,8 +85,11 @@ def check_correctness(
         solution,
         problem["test"],
         problem["entry_point"],
+        max_as_limit,
+        max_data_limit,
+        max_stack_limit,
         min_time_limit,
-        gt_time_limit
+        gt_time_limit,
     )
     return ret
 
@@ -150,6 +159,9 @@ def evaluate(flags):
                     completion_id[task_id],
                     problems[task_id],
                     solution,
+                    flags.max_as_limit,
+                    flags.max_data_limit,
+                    flags.max_stack_limit,
                     sample["_identifier"],
                     flags.min_time_limit,
                     expected_time[task_id] if expected_time else 20
@@ -240,6 +252,9 @@ def main():
     parser.add_argument("--samples", required=True, type=str)
     parser.add_argument("--parallel", default=None, type=int)
     parser.add_argument("--min-time-limit", default=1, type=float)
+    parser.add_argument("--max-as-limit", default=128*1024, type=float)
+    parser.add_argument("--max-data-limit", default=4*1024, type=float)
+    parser.add_argument("--max-stack-limit", default=5, type=float)
     parser.add_argument(
         "--check-gt-only", action="store_true", help="Check the groundtruth"
     )
diff --git a/bigcodebench/gen/util/__init__.py b/bigcodebench/gen/util/__init__.py
@@ -12,7 +12,7 @@
 )
 
 
-def trusted_exec(code, test_code, task_id):
+def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_stack_limit):
     """Execute trusted code in place."""
 
     with create_tempdir():
@@ -25,8 +25,7 @@ def trusted_exec(code, test_code, task_id):
         chdir = os.chdir
         module_name = "__test__"
         new_module = types.ModuleType(module_name)
-        maximum_memory_bytes = 128 * 1024 * 1024 * 1024
-        reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
+        reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
         # Set necessary attributes for the module
         new_module.__dict__.update({
             '__builtins__': builtins,