5 changes: 3 additions & 2 deletions benchmarks/arteval_bench/data/benchmark/arteval_tasks.jsonl
@@ -1,3 +1,4 @@
{"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
import sys
from typing import Dict

from oracle_artifact_build import OracleArtifactBuild
from oracle_env_setup import OracleEnvSetup
from oracle_benchmark_prep import OracleBenchmarkPrep
from oracle_experiment_runs import OracleExperimentRuns

from utils import logger

def main():
    results: Dict[str, int] = {}

    score = 0
    for cls in (OracleEnvSetup, OracleArtifactBuild, OracleBenchmarkPrep, OracleExperimentRuns):
        checker = cls()
        ok = checker.run()
        name = cls.__name__
        logger.info(f"{name}: {'PASS' if ok else 'FAIL'}")
        if ok:
            results[name] = 1
            score += 1
        else:
            results[name] = 0

    logger.info(f"Agent scores: {results}")
    return score


if __name__ == "__main__":
    main()
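
The loop in main() relies only on each Oracle* class being constructible with no arguments and exposing run() -> bool. A minimal sketch of that implied interface (the Protocol and its name are mine, not part of the PR):

# Interface implied by main(): zero-argument construction plus run() -> bool.
# This Protocol is illustrative only; the real checkers are the Oracle*
# classes imported above.
from typing import Protocol

class OracleCheck(Protocol):
    def run(self) -> bool: ...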
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
import os
import subprocess
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple
from pathlib import Path

from utils import REPO_DIR
from utils import logger


@dataclass(frozen=True)
class BuildTarget:
    name: str
    repo_key: str
    cmd: List[str]


BUILD_TARGETS: List[BuildTarget] = [
    BuildTarget(
        name="artifact-core",
        repo_key="artifact-core",
        cmd=[
            "make",
            "-j8",
            "tools/diamond-types/target/release/dt",
            "tools/crdt-converter/target/release/crdt-converter",
            "tools/diamond-types/target/release/paper-stats",
            "tools/paper-benchmarks/target/memusage/paper-benchmarks",
            "tools/paper-benchmarks/target/release/paper-benchmarks",
            "tools/ot-bench/target/memusage/ot-bench",
            "tools/ot-bench/target/release/ot-bench",
        ],
    ),
]


class OracleArtifactBuild:

    def __init__(self) -> None:
        self.repo_dir = REPO_DIR

    def run_shell_command(
        self,
        cmd: Iterable[str],
        cwd: Optional[Path] = None,
    ) -> Tuple[int, str, str]:
        """
        Run a command and return a (rc, stdout, stderr) tuple.
        """
        try:
            cp = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                cwd=str(cwd) if cwd is not None else None,
            )
            return cp.returncode, cp.stdout or "", cp.stderr or ""
        except FileNotFoundError:
            return 127, "", ""

    def build_target(self, target: BuildTarget) -> Optional[str]:
        """
        Build a single target using its configured repository and command.
        """
        repo_path = Path(os.path.expanduser(self.repo_dir))
        if not repo_path.exists():
            return f"{target.name} repo directory missing"

        rc, out, err = self.run_shell_command(target.cmd, cwd=repo_path)
        if rc != 0:
            return f"{target.name} build failed (error code: {rc}; error message: {err})"

        return None

    def build_check(self):
        """
        Run builds for all configured targets and collect failures.
        """
        problems: List[str] = []
        for target in BUILD_TARGETS:
            msg = self.build_target(target)
            if msg:
                problems.append(msg)
        if problems:
            return False, "; ".join(problems)
        return True, ""

    def run(self):
        ok, why = self.build_check()
        logger.info(f"Build: {'PASS' if ok else 'FAIL' + (' - ' + why if why else '')}")
        return ok
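
Since BUILD_TARGETS is plain data, additional build steps can be registered without touching OracleArtifactBuild. A hypothetical extra entry; the "artifact-docs" name and the docs make goal are invented for illustration:

# Hypothetical extra entry -- only the BuildTarget fields (name, repo_key, cmd)
# come from the code above; the concrete values here are invented.
BUILD_TARGETS.append(
    BuildTarget(
        name="artifact-docs",
        repo_key="artifact-docs",
        cmd=["make", "-j8", "docs"],
    )
)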
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List, Optional, Tuple

from utils import HOME
from utils import REPO_DIR
from utils import REFERENCE_BENCHMARK_FILE
from utils import logger


@dataclass(frozen=True)
class DatasetRef:
    filepath: str
    sizeinbytes: int


class OracleBenchmarkPrep:

    def __init__(self) -> None:
        self.home = Path(os.path.expanduser(str(HOME)))
        self.repo_path = Path(os.path.expanduser(str(REPO_DIR)))
        self.ref_json = Path(os.path.expanduser(str(REFERENCE_BENCHMARK_FILE)))

    def load_json(self, path: Path) -> Tuple[Optional[Any], str]:
        """
        Load JSON from disk and return (obj, err).
        """
        if not path.exists():
            return None, f"ref json missing: {path}"
        try:
            with path.open("r", encoding="utf-8") as f:
                return json.load(f), ""
        except Exception as e:
            return None, f"ref json unreadable: {e}"

    def iter_ref_entries(self, obj: Any) -> List[dict]:
        """
        Extract benchmark entries from a reference JSON.
        """
        if isinstance(obj, list):
            return [x for x in obj if isinstance(x, dict)]
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list) and v and all(isinstance(x, dict) for x in v):
                    return v
        return []

    def parse_entry(self, d: dict) -> Tuple[Optional[DatasetRef], str]:
        """
        Parse a single JSON entry into DatasetRef.
        """
        if "filepath" not in d:
            return None, "missing filepath"
        if "sizeinbytes" not in d:
            return None, "missing sizeinbytes"

        fp = d.get("filepath", "")
        sz = d.get("sizeinbytes", None)

        if not isinstance(fp, str) or not fp:
            return None, "invalid filepath"
        if not isinstance(sz, int) or sz < 0:
            return None, "invalid sizeinbytes"

        return DatasetRef(filepath=fp, sizeinbytes=sz), ""

    def check_entry(self, ref: DatasetRef) -> Optional[str]:
        """
        Validate that the referenced dataset file exists and matches the expected size (in bytes).
        """
        rel = Path(ref.filepath)

        if rel.is_absolute():
            return f"{ref.filepath}: absolute paths not allowed"

        p = self.repo_path / rel
        if not p.exists():
            return f"{ref.filepath}: missing"
        if not p.is_file():
            return f"{ref.filepath}: not a file"

        try:
            actual = p.stat().st_size
        except OSError as e:
            return f"{ref.filepath}: stat failed ({e})"

        if actual != ref.sizeinbytes:
            return f"{ref.filepath}: size mismatch (expected {ref.sizeinbytes}, got {actual})"

        return None

    def datasets_check(self) -> Tuple[bool, str]:
        """
        Check all referenced dataset files are present and match expected sizes.
        """
        obj, err = self.load_json(self.ref_json)
        if err:
            return False, err

        entries = self.iter_ref_entries(obj)
        if not entries:
            return False, "no entries found in ref json"

        problems: List[str] = []
        for d in entries:
            ref, perr = self.parse_entry(d)
            if perr or ref is None:
                problems.append(perr or "invalid entry")
                continue

            msg = self.check_entry(ref)
            if msg:
                problems.append(msg)

        if problems:
            return False, "; ".join(problems)
        return True, ""

    def run(self) -> bool:
        ok, why = self.datasets_check()
        logger.info(f"Datasets: {'PASS' if ok else 'FAIL' + (' - ' + why if why else '')}")
        return ok
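
Taken together, load_json, iter_ref_entries, and parse_entry accept either a top-level JSON list of entries or a JSON object containing one such list, where every entry provides a repo-relative filepath and an integer sizeinbytes. A sketch of an accepted shape, with invented paths and sizes:

# Reference-file shape inferred from iter_ref_entries/parse_entry; the concrete
# paths and sizes below are invented examples, not real benchmark data.
example_reference = {
    "datasets": [  # any key whose value is a list of dicts is picked up
        {"filepath": "datasets/example_trace.json", "sizeinbytes": 1048576},
        {"filepath": "datasets/example_doc.dt", "sizeinbytes": 2048},
    ]
}
# A bare top-level list of the same dicts works too; check_entry resolves each
# filepath relative to REPO_DIR and compares the on-disk size in bytes.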