5 changes: 3 additions & 2 deletions benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl
@@ -1,3 +1,4 @@
{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
import sys
from typing import Dict

from oracle_artifact_build import OracleArtifactBuild
from oracle_env_setup import OracleEnvSetup
from oracle_benchmark_prep import OracleBenchmarkPrep
from oracle_experiment_runs import OracleExperimentRuns

from utils import logger

def main():
    results: Dict[str, int] = {}

    score = 0
    for cls in (OracleEnvSetup, OracleArtifactBuild, OracleBenchmarkPrep, OracleExperimentRuns):
        checker = cls()
        ok = checker.run()
        name = cls.__name__
        logger.info(f"{name}: {'PASS' if ok else 'FAIL'}")
        if ok:
            results[name] = 1
            score += 1
        else:
            results[name] = 0

    logger.info(f"Agent scores: {results}")
    return score


if __name__ == "__main__":
    main()
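
The loop in main() relies only on each Oracle* class being constructible with no arguments and exposing run() -> bool. A minimal sketch of that implied interface (the Protocol and its name are mine, not part of the PR):

# Interface implied by main(): zero-argument construction plus run() -> bool.
# This Protocol is illustrative only; the real checkers are the Oracle*
# classes imported above.
from typing import Protocol

class OracleCheck(Protocol):
    def run(self) -> bool: ...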
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
import os
import subprocess
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple
from pathlib import Path

from utils import REPO_DIR
from utils import logger


@dataclass(frozen=True)
class BuildTarget:
    name: str
    repo_key: str
    cmd: List[str]


BUILD_TARGETS: List[BuildTarget] = [
    BuildTarget(
        name="artifact-core",
        repo_key="artifact-core",
        cmd=[
            "make",
            "-j8",
            "tools/diamond-types/target/release/dt",
            "tools/crdt-converter/target/release/crdt-converter",
            "tools/diamond-types/target/release/paper-stats",
            "tools/paper-benchmarks/target/memusage/paper-benchmarks",
            "tools/paper-benchmarks/target/release/paper-benchmarks",
            "tools/ot-bench/target/memusage/ot-bench",
            "tools/ot-bench/target/release/ot-bench",
        ],
    ),
]


class OracleArtifactBuild:

    def __init__(self) -> None:
        self.repo_dir = REPO_DIR

    def run_shell_command(
        self,
        cmd: Iterable[str],
        cwd: Optional[Path] = None,
    ) -> Tuple[int, str, str]:
        """
        Run a command and return a (rc, stdout, stderr) tuple.
        """
        try:
            cp = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                cwd=str(cwd) if cwd is not None else None,
            )
            return cp.returncode, cp.stdout or "", cp.stderr or ""
        except FileNotFoundError:
            return 127, "", ""

    def build_target(self, target: BuildTarget) -> Optional[str]:
        """
        Build a single target using its configured repository and command.
        """
        repo_path = Path(os.path.expanduser(self.repo_dir))
        if not repo_path.exists():
            return f"{target.name} repo directory missing"

        rc, out, err = self.run_shell_command(target.cmd, cwd=repo_path)
        if rc != 0:
            return f"{target.name} build failed (error code: {rc}; error message: {err})"

        return None

    def build_check(self):
        """
        Run builds for all configured targets and collect failures.
        """
        problems: List[str] = []
        for target in BUILD_TARGETS:
            msg = self.build_target(target)
            if msg:
                problems.append(msg)
        if problems:
            return False, "; ".join(problems)
        return True, ""

    def run(self):
        ok, why = self.build_check()
        logger.info(f"Build: {'PASS' if ok else 'FAIL' + (' - ' + why if why else '')}")
        return ok
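
Since BUILD_TARGETS is plain data, additional build steps can be registered without touching OracleArtifactBuild. A hypothetical extra entry; the "artifact-docs" name and the docs make goal are invented for illustration:

# Hypothetical extra entry -- only the BuildTarget fields (name, repo_key, cmd)
# come from the code above; the concrete values here are invented.
BUILD_TARGETS.append(
    BuildTarget(
        name="artifact-docs",
        repo_key="artifact-docs",
        cmd=["make", "-j8", "docs"],
    )
)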
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List, Optional, Tuple

from utils import HOME
from utils import REPO_DIR
from utils import REFERENCE_BENCHMARK_FILE
from utils import logger


@dataclass(frozen=True)
class DatasetRef:
    filepath: str
    sizeinbytes: int


class OracleBenchmarkPrep:

    def __init__(self) -> None:
        self.home = Path(os.path.expanduser(str(HOME)))
        self.repo_path = Path(os.path.expanduser(str(REPO_DIR)))
        self.ref_json = Path(os.path.expanduser(str(REFERENCE_BENCHMARK_FILE)))

    def load_json(self, path: Path) -> Tuple[Optional[Any], str]:
        """
        Load JSON from disk and return (obj, err).
        """
        if not path.exists():
            return None, f"ref json missing: {path}"
        try:
            with path.open("r", encoding="utf-8") as f:
                return json.load(f), ""
        except Exception as e:
            return None, f"ref json unreadable: {e}"

    def iter_ref_entries(self, obj: Any) -> List[dict]:
        """
        Extract benchmark entries from a reference JSON.
        """
        if isinstance(obj, list):
            return [x for x in obj if isinstance(x, dict)]
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list) and v and all(isinstance(x, dict) for x in v):
                    return v
        return []

    def parse_entry(self, d: dict) -> Tuple[Optional[DatasetRef], str]:
        """
        Parse a single JSON entry into DatasetRef.
        """
        if "filepath" not in d:
            return None, "missing filepath"
        if "sizeinbytes" not in d:
            return None, "missing sizeinbytes"

        fp = d.get("filepath", "")
        sz = d.get("sizeinbytes", None)

        if not isinstance(fp, str) or not fp:
            return None, "invalid filepath"
        if not isinstance(sz, int) or sz < 0:
            return None, "invalid sizeinbytes"

        return DatasetRef(filepath=fp, sizeinbytes=sz), ""

    def check_entry(self, ref: DatasetRef) -> Optional[str]:
        """
        Validate that the referenced dataset file exists and matches the expected size (in bytes).
        """
        rel = Path(ref.filepath)

        if rel.is_absolute():
            return f"{ref.filepath}: absolute paths not allowed"

        p = self.repo_path / rel
        if not p.exists():
            return f"{ref.filepath}: missing"
        if not p.is_file():
            return f"{ref.filepath}: not a file"

        try:
            actual = p.stat().st_size
        except OSError as e:
            return f"{ref.filepath}: stat failed ({e})"

        if actual != ref.sizeinbytes:
            return f"{ref.filepath}: size mismatch (expected {ref.sizeinbytes}, got {actual})"

        return None

    def datasets_check(self) -> Tuple[bool, str]:
        """
        Check all referenced dataset files are present and match expected sizes.
        """
        obj, err = self.load_json(self.ref_json)
        if err:
            return False, err

        entries = self.iter_ref_entries(obj)
        if not entries:
            return False, "no entries found in ref json"

        problems: List[str] = []
        for d in entries:
            ref, perr = self.parse_entry(d)
            if perr or ref is None:
                problems.append(perr or "invalid entry")
                continue

            msg = self.check_entry(ref)
            if msg:
                problems.append(msg)

        if problems:
            return False, "; ".join(problems)
        return True, ""

    def run(self) -> bool:
        ok, why = self.datasets_check()
        logger.info(f"Datasets: {'PASS' if ok else 'FAIL' + (' - ' + why if why else '')}")
        return ok
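
Taken together, load_json, iter_ref_entries, and parse_entry accept either a top-level JSON list of entries or a JSON object containing one such list, where every entry provides a repo-relative filepath and an integer sizeinbytes. A sketch of an accepted shape, with invented paths and sizes:

# Reference-file shape inferred from iter_ref_entries/parse_entry; the concrete
# paths and sizes below are invented examples, not real benchmark data.
example_reference = {
    "datasets": [  # any key whose value is a list of dicts is picked up
        {"filepath": "datasets/example_trace.json", "sizeinbytes": 1048576},
        {"filepath": "datasets/example_doc.dt", "sizeinbytes": 2048},
    ]
}
# A bare top-level list of the same dicts works too; check_entry resolves each
# filepath relative to REPO_DIR and compares the on-disk size in bytes.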