SREGym · Jackcuii · Dec 4, 2025 · Nov 28, 2025 · Nov 28, 2025 · Nov 28, 2025
diff --git a/benchmarks/sregym/.gitignore b/benchmarks/sregym/.gitignore
@@ -0,0 +1,5 @@
+# Ignore HTML files. A pattern without a slash matches at every level below
+# this .gitignore, not only in this directory.
+*.html
+agent_graph.png
+*.csv
+
diff --git a/benchmarks/sregym/Dockerfile b/benchmarks/sregym/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:24.04
+
+WORKDIR /usr/src
+
+# OS packages are installed before the sources are copied so that editing the
+# sources does not invalidate this layer.
+# - curl and sudo are invoked by install.sh (Homebrew/kubectl/uv downloads and
+#   `sudo install` for kubectl); ca-certificates is needed for HTTPS downloads.
+# - file and procps are prerequisites of Homebrew on Linux.
+# Recommended packages are kept on purpose: install.sh runs third-party
+# installers whose exact build requirements are not visible here.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    build-essential \
+    ca-certificates \
+    curl \
+    file \
+    git \
+    procps \
+    python3-pip \
+    python3-venv \
+    sudo \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# NOTE(review): this copies the whole build context, including a local `.env`
+# with API keys if one exists -- confirm a .dockerignore excludes it.
+COPY . .
+
+RUN chmod +x install.sh test.sh && ./install.sh
+
+ENTRYPOINT ["./test.sh"]
diff --git a/benchmarks/sregym/README.md b/benchmarks/sregym/README.md
@@ -0,0 +1,69 @@
+# SREGym Quick Guide
+
+This README briefly explains how to run SREGym within the System Intelligence Framework.
+
+For advanced use of *System Intelligence* and *SREGym*, please refer to the docs of [*System Intelligence*](https://github.com/sys-intelligence/system-intelligence-benchmark/tree/main/doc) and [*SREGym*](https://sregym.com/docs).
+
+## Architecture Explanation
+
+SREGym has a decoupled design which complies with *System Intelligence* philosophy.
+Here is the correspondence of the components in *System Intelligence* and *SREGym*:
+
+The `Executor` is the agent in *SREGym*, which is decoupled from the framework functionality. We have a baseline agent implementation in `sregym_core/clients/stratus/stratus_agent/` and it is run by default. If you want to bring your own agent, please follow the [Running Your Own Agent](https://sregym.com/docs/running-your-own-agent) guide.
+
+The `Evaluator` corresponds to the evaluation oracles in *SREGym*, which are decoupled from the agent implementation.
+
+The *SREGym* `Conductor` serves as the `Environment` in *System Intelligence*.
+
+## Run SREGym
+
+1. Prepare `.env` for the configurations. You can make a copy of `.env.example` into `.env` and set the keys in the `.env` file. For System Intelligence, you need to set the API keys for the models you want to test, like below:
+
+``` shell
+PROVIDER_TOOLS="litellm"
+PROVIDER="litellm"
+
+GEMINI_API_KEY="XXXXXX"
+OPENAI_API_KEY="XXXXXX"
+ANTHROPIC_API_KEY="XXXXXX"
+MOONSHOT_API_KEY="XXXXXX"
+```
+> You do not need to set `MODEL_TOOLS` in the `.env` file; the System Intelligence Framework sets it automatically. That variable is intended for standalone runs of SREGym.
+
+2. You need to create an `inventory.yml` file in the `sregym_core/scripts/ansible` directory. You can copy `inventory.yml.example` to `inventory.yml` and set the hosts in it. Follow the instructions [here](https://github.com/SREGym/SREGym?tab=readme-ov-file#a-kubernetes-cluster-recommended) to get a cluster and set up the inventory file.
+
+3. Install the dependencies
+``` shell
+cd benchmarks/sregym
+./install.sh
+```
+
+4. Run the benchmark
+``` shell
+cd benchmarks/sregym
+./run.sh <model_name>
+```
+> Some tested available names are: "gemini/gemini-2.5-flash", "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514", "moonshot/moonshot-v1-32k".
+
+The wrapper executes `python src/main.py --agent "stratus" --model_name "${MODEL_NAME}"` to run the benchmark.
+
+The results will be saved in the `outputs/` directory.
+``` shell
+outputs/sregym__<model>__<agent>__<timestamp>/
+├── avg_score.json     # Average score
+└── result.jsonl       # Detailed results
+```
+
+## Use the System Intelligence CLI (optional)
+
+To orchestrate SREGym alongside other benchmarks:
+
+```bash
+cd cli
+./run_all_local.sh <model_name>
+```
+
+## Contribution
+
+We strongly welcome contributions to SREGym.   
+You can report bugs, suggest features, or contribute code to SREGym in the upstream repository [SREGym](https://github.com/SREGym/SREGym).
diff --git a/benchmarks/sregym/data/benchmark/tasks.jsonl b/benchmarks/sregym/data/benchmark/tasks.jsonl
@@ -0,0 +1,89 @@
+{"id": "sregym_001", "task_name": "faulty_image_correlated"}
+{"id": "sregym_002", "task_name": "update_incompatible_correlated"}
+{"id": "sregym_003", "task_name": "incorrect_image"}
+{"id": "sregym_004", "task_name": "incorrect_port_assignment"}
+{"id": "sregym_005", "task_name": "misconfig_app_hotel_res"}
+{"id": "sregym_006", "task_name": "missing_env_variable_astronomy_shop"}
+{"id": "sregym_007", "task_name": "revoke_auth_mongodb-1"}
+{"id": "sregym_008", "task_name": "revoke_auth_mongodb-2"}
+{"id": "sregym_009", "task_name": "storage_user_unregistered-1"}
+{"id": "sregym_010", "task_name": "storage_user_unregistered-2"}
+{"id": "sregym_011", "task_name": "valkey_auth_disruption"}
+{"id": "sregym_012", "task_name": "valkey_memory_disruption"}
+{"id": "sregym_013", "task_name": "capacity_decrease_rpc_retry_storm"}
+{"id": "sregym_014", "task_name": "gc_capacity_degradation"}
+{"id": "sregym_015", "task_name": "load_spike_rpc_retry_storm"}
+{"id": "sregym_016", "task_name": "assign_to_non_existent_node"}
+{"id": "sregym_017", "task_name": "auth_miss_mongodb"}
+{"id": "sregym_018", "task_name": "configmap_drift_hotel_reservation"}
+{"id": "sregym_019", "task_name": "duplicate_pvc_mounts_astronomy_shop"}
+{"id": "sregym_020", "task_name": "duplicate_pvc_mounts_hotel_reservation"}
+{"id": "sregym_021", "task_name": "duplicate_pvc_mounts_social_network"}
+{"id": "sregym_022", "task_name": "env_variable_shadowing_astronomy_shop"}
+{"id": "sregym_023", "task_name": "k8s_target_port-misconfig"}
+{"id": "sregym_024", "task_name": "liveness_probe_misconfiguration_astronomy_shop"}
+{"id": "sregym_025", "task_name": "liveness_probe_misconfiguration_hotel_reservation"}
+{"id": "sregym_026", "task_name": "liveness_probe_misconfiguration_social_network"}
+{"id": "sregym_027", "task_name": "liveness_probe_too_aggressive_astronomy_shop"}
+{"id": "sregym_028", "task_name": "liveness_probe_too_aggressive_hotel_reservation"}
+{"id": "sregym_029", "task_name": "liveness_probe_too_aggressive_social_network"}
+{"id": "sregym_030", "task_name": "missing_configmap_hotel_reservation"}
+{"id": "sregym_031", "task_name": "missing_configmap_social_network"}
+{"id": "sregym_032", "task_name": "missing_service_astronomy_shop"}
+{"id": "sregym_033", "task_name": "missing_service_hotel_reservation"}
+{"id": "sregym_034", "task_name": "missing_service_social_network"}
+{"id": "sregym_035", "task_name": "namespace_memory_limit"}
+{"id": "sregym_036", "task_name": "pod_anti_affinity_deadlock"}
+{"id": "sregym_037", "task_name": "persistent_volume_affinity_violation"}
+{"id": "sregym_038", "task_name": "pvc_claim_mismatch"}
+{"id": "sregym_039", "task_name": "rbac_misconfiguration"}
+{"id": "sregym_040", "task_name": "readiness_probe_misconfiguration_astronomy_shop"}
+{"id": "sregym_041", "task_name": "readiness_probe_misconfiguration_hotel_reservation"}
+{"id": "sregym_042", "task_name": "readiness_probe_misconfiguration_social_network"}
+{"id": "sregym_043", "task_name": "resource_request_too_large"}
+{"id": "sregym_044", "task_name": "resource_request_too_small"}
+{"id": "sregym_045", "task_name": "rolling_update_misconfigured_hotel_reservation"}
+{"id": "sregym_046", "task_name": "rolling_update_misconfigured_social_network"}
+{"id": "sregym_047", "task_name": "scale_pod_zero_social_net"}
+{"id": "sregym_048", "task_name": "service_dns_resolution_failure_astronomy_shop"}
+{"id": "sregym_049", "task_name": "service_dns_resolution_failure_social_network"}
+{"id": "sregym_050", "task_name": "sidecar_port_conflict_astronomy_shop"}
+{"id": "sregym_051", "task_name": "sidecar_port_conflict_hotel_reservation"}
+{"id": "sregym_052", "task_name": "sidecar_port_conflict_social_network"}
+{"id": "sregym_053", "task_name": "stale_coredns_config_astronomy_shop"}
+{"id": "sregym_054", "task_name": "stale_coredns_config_social_network"}
+{"id": "sregym_055", "task_name": "taint_no_toleration_social_network"}
+{"id": "sregym_056", "task_name": "wrong_bin_usage"}
+{"id": "sregym_057", "task_name": "wrong_dns_policy_astronomy_shop"}
+{"id": "sregym_058", "task_name": "wrong_dns_policy_hotel_reservation"}
+{"id": "sregym_059", "task_name": "wrong_dns_policy_social_network"}
+{"id": "sregym_060", "task_name": "wrong_service_selector_astronomy_shop"}
+{"id": "sregym_061", "task_name": "wrong_service_selector_hotel_reservation"}
+{"id": "sregym_062", "task_name": "wrong_service_selector_social_network"}
+{"id": "sregym_063", "task_name": "astronomy_shop_ad_service_failure"}
+{"id": "sregym_064", "task_name": "astronomy_shop_ad_service_high_cpu"}
+{"id": "sregym_065", "task_name": "astronomy_shop_ad_service_manual_gc"}
+{"id": "sregym_066", "task_name": "astronomy_shop_cart_service_failure"}
+{"id": "sregym_067", "task_name": "astronomy_shop_ad_service_image_slow_load"}
+{"id": "sregym_068", "task_name": "astronomy_shop_payment_service_failure"}
+{"id": "sregym_069", "task_name": "astronomy_shop_payment_service_unreachable"}
+{"id": "sregym_070", "task_name": "astronomy_shop_product_catalog_service_failure"}
+{"id": "sregym_071", "task_name": "astronomy_shop_recommendation_service_cache_failure"}
+{"id": "sregym_072", "task_name": "kafka_queue_problems"}
+{"id": "sregym_073", "task_name": "loadgenerator_flood_homepage"}
+{"id": "sregym_074", "task_name": "trainticket_f17_nested_sql_select_clause_error"}
+{"id": "sregym_075", "task_name": "trainticket_f22_sql_column_name_mismatch_error"}
+{"id": "sregym_076", "task_name": "read_error"}
+{"id": "sregym_077", "task_name": "latent_sector_error"}
+{"id": "sregym_078", "task_name": "silent_data_corruption"}
+{"id": "sregym_079", "task_name": "ingress_misroute"}
+{"id": "sregym_080", "task_name": "network_policy_block"}
+{"id": "sregym_081", "task_name": "social_net_hotel_res_astro_shop_concurrent_failures"}
+{"id": "sregym_082", "task_name": "kubelet_crash"}
+{"id": "sregym_083", "task_name": "workload_imbalance"}
+{"id": "sregym_084", "task_name": "operator_overload_replicas"}
+{"id": "sregym_085", "task_name": "operator_non_existent_storage"}
+{"id": "sregym_086", "task_name": "operator_invalid_affinity_toleration"}
+{"id": "sregym_087", "task_name": "operator_security_context_fault"}
+{"id": "sregym_088", "task_name": "operator_wrong_update_strategy_fault"}
+
diff --git a/benchmarks/sregym/data/pretrain/example_bench_pretrain_timestamp.jsonl b/benchmarks/sregym/data/pretrain/example_bench_pretrain_timestamp.jsonl
@@ -0,0 +1 @@
+{"text": "text of one doc", "metadata": {"scenarios": "XXX", "subtask": "XXXX", "description": "xx", "link": "XXX", "XXX": "XXX"}}
diff --git a/benchmarks/sregym/data/sft/example_bench_sft_timestamp.jsonl b/benchmarks/sregym/data/sft/example_bench_sft_timestamp.jsonl
@@ -0,0 +1,2 @@
+{"sys_prompt": "You are XXX", "user_prompt": "what", "thinking": "chain of thought", "response": "XXX", "metadata": {"scenario": "XX", "subtask": "XXX", "data_quality":"high", "XXX": "XXX"}}
+
diff --git a/benchmarks/sregym/env.toml b/benchmarks/sregym/env.toml
@@ -0,0 +1,15 @@
+# Credentials for the LLM providers. The values below are placeholders.
+[llm]
+AZURE_API_KEY = "XXX"
+AZURE_API_BASE = "XXXX"
+AZURE_API_VERSION = "XXX"
+ANTHROPIC_API_KEY = "sk-XXXX"
+
+[hardware]
+use_gpu = false
+
+# NOTE(review): the Dockerfile in this directory sets ENTRYPOINT to ./test.sh,
+# while this entry names ./run.sh -- confirm which one is intended.
+[env-docker]
+image = "default"
+entrypoint = "./run.sh"
+
+[sregym_mode]
+mode = "remote" # where to run the Kubernetes Cluster {kind, remote}
diff --git a/benchmarks/sregym/install.sh b/benchmarks/sregym/install.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+
+set -e
+
+# Clone submodule to SREGym-applications if no file in sregym_core/SREGym-applications
+if [ ! -f "sregym_core/SREGym-applications/README.md" ]; then
+    echo "==> Cloning SREGym-applications..."
+    git clone https://github.com/SREGym/SREGym-applications.git sregym_core/SREGym-applications --recurse-submodules
+else
+    echo "==> SREGym-applications already exists. Skipping clone."
+fi
+
+# Homebrew installation
+if ! command -v brew >/dev/null 2>&1; then
+    echo "==> Homebrew not found. Installing Homebrew..."
+    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+
+    # The installer does not change the PATH of this running shell, so load the
+    # Homebrew environment from whichever standard prefix it was installed into.
+    for BREW_BIN in \
+        /home/linuxbrew/.linuxbrew/bin/brew \
+        "$HOME/.linuxbrew/bin/brew" \
+        /opt/homebrew/bin/brew \
+        /usr/local/bin/brew; do
+        if [ -x "$BREW_BIN" ]; then
+            eval "$("$BREW_BIN" shellenv)"
+            break
+        fi
+    done
+
+    # Fail here with a clear message rather than at the first `brew` call below.
+    if ! command -v brew >/dev/null 2>&1; then
+        echo "==> Error: Homebrew installation finished but brew is not available on PATH."
+        exit 1
+    fi
+fi
+
+# Helm installation
+if ! command -v helm >/dev/null 2>&1; then
+    echo "==> Helm not found. Installing Helm..."
+    brew install helm
+fi
+
+# Detect architecture and set kubectl architecture
+ARCH=$(uname -m)
+if [ "$ARCH" == "x86_64" ]; then
+    KUBECTL_ARCH="amd64"
+    echo "==> x86_64 machine detected. Using amd64 kubectl."
+elif [ "$ARCH" == "aarch64" ]; then
+    KUBECTL_ARCH="arm64"
+    echo "==> aarch64 machine detected. Using arm64 kubectl."
+else
+    echo "==> Unknown machine type detected: $ARCH"
+    exit 1
+fi
+
+# kubectl installation
+# Downloads the latest stable Linux kubectl for ${KUBECTL_ARCH}, verifies it
+# against the published SHA-256 checksum and installs it to /usr/local/bin.
+if ! command -v kubectl >/dev/null 2>&1; then
+    echo "==> kubectl not found. Installing kubectl..."
+
+    # Get the latest stable version (-f makes HTTP errors fail instead of
+    # returning the error page as the "version").
+    KUBECTL_VERSION=$(curl -fsSL https://dl.k8s.io/release/stable.txt)
+    if [ -z "$KUBECTL_VERSION" ]; then
+        echo "==> Error: could not determine the latest stable kubectl version."
+        exit 1
+    fi
+    echo "==> Installing kubectl version: $KUBECTL_VERSION"
+
+    # Download kubectl binary
+    curl -fLO "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${KUBECTL_ARCH}/kubectl"
+
+    # Download kubectl checksum file
+    curl -fLO "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${KUBECTL_ARCH}/kubectl.sha256"
+
+    # Validate the binary against the checksum file. The pipeline is tested
+    # directly by `if`: with `set -e` active, a separate `$?` check afterwards
+    # would never run on failure, skipping the message and the cleanup.
+    if ! echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check; then
+        echo "==> Error: kubectl checksum validation failed!"
+        rm -f kubectl kubectl.sha256
+        exit 1
+    fi
+
+    # Install kubectl. sudo is only needed when not already running as root
+    # (and may not be installed at all, e.g. in a minimal container image).
+    if [ "$(id -u)" -eq 0 ]; then
+        install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+    else
+        sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+    fi
+
+    # Clean up installation files
+    rm -f kubectl kubectl.sha256
+
+    # Verify installation
+    kubectl version --client
+    echo "==> kubectl installed successfully."
+else
+    echo "==> kubectl is already installed."
+    kubectl version --client
+fi
+
+# Make sure uv (Python package manager) is available, installing it on demand.
+if command -v uv >/dev/null 2>&1; then
+    echo "==> uv is already installed."
+    uv --version
+else
+    echo "==> uv not found. Installing uv..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+
+    # Prepend the candidate install directories so this shell can find uv
+    PATH="$HOME/.cargo/bin:$HOME/.local/bin:$PATH"
+    export PATH
+
+    # Confirm that uv is now reachable; only warn if it is not
+    if ! command -v uv >/dev/null 2>&1; then
+        echo "==> Warning: uv installation may have failed. Please check your PATH."
+        echo "==> You may need to add ~/.cargo/bin or ~/.local/bin to your PATH."
+    else
+        uv --version
+        echo "==> uv installed successfully."
+    fi
+fi
+
+# Synchronize the dependencies
+cd sregym_core
+uv sync
+uv run pre-commit install
+
+# Check for inventory.yml and configure remote cluster if present.
+# Runs from inside sregym_core. When kubectl already reports a cluster whose
+# nodes are all Ready, the Ansible setup is skipped; otherwise the playbook is
+# run and the resulting cluster state is reported.
+if [ -f "scripts/ansible/inventory.yml" ]; then
+    echo "==> Found inventory.yml. Checking if cluster already exists..."
+
+    # Check if cluster is already configured
+    CLUSTER_EXISTS=false
+    if kubectl get nodes >/dev/null 2>&1; then
+        # `|| true` keeps `set -e` from aborting when grep finds no Ready node
+        READY_NODES=$(kubectl get nodes --no-headers 2>/dev/null | grep -c " Ready " || true)
+        TOTAL_NODES=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || true)
+
+        if [ "$TOTAL_NODES" -gt 0 ] && [ "$READY_NODES" -eq "$TOTAL_NODES" ]; then
+            CLUSTER_EXISTS=true
+            echo "==> Cluster already exists with $TOTAL_NODES node(s) in Ready state."
+            echo "==> Cluster nodes:"
+            kubectl get nodes
+            echo ""
+            echo "==> ✓ Skipping cluster setup. Cluster is already configured correctly."
+        fi
+    fi
+
+    # Only run ansible-playbook if cluster doesn't exist or is not fully ready
+    if [ "$CLUSTER_EXISTS" = false ]; then
+        echo "==> Cluster not found or not fully ready. Configuring remote cluster with Ansible..."
+
+        # The playbook runs in a subshell so the directory change stays local:
+        # PWD and OLDPWD of this script are untouched, and the later `cd -`
+        # still returns to the directory the script was in before entering
+        # sregym_core. Testing the command directly in `if` also keeps `set -e`
+        # from aborting before the failure message below can be printed.
+        # NOTE(review): ansible-playbook is expected on PATH here; the
+        # virtualenv is not activated at this point -- confirm.
+        if (cd scripts/ansible && ansible-playbook -i inventory.yml setup_cluster.yml); then
+            echo "==> Ansible playbook completed successfully."
+
+            # Verify cluster configuration with kubectl
+            echo "==> Verifying cluster configuration..."
+            if kubectl get nodes >/dev/null 2>&1; then
+                echo "==> Cluster nodes:"
+                kubectl get nodes
+                echo ""
+
+                # Check if nodes are in Ready state
+                READY_NODES=$(kubectl get nodes --no-headers 2>/dev/null | grep -c " Ready " || true)
+                TOTAL_NODES=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || true)
+
+                if [ "$READY_NODES" -gt 0 ] && [ "$READY_NODES" -eq "$TOTAL_NODES" ]; then
+                    echo "==> ✓ All nodes are in Ready state. Cluster configuration is correct."
+                elif [ "$READY_NODES" -gt 0 ]; then
+                    echo "==> ⚠ Warning: Only $READY_NODES out of $TOTAL_NODES nodes are Ready."
+                    echo "==> Please check the cluster status manually."
+                else
+                    echo "==> ⚠ Warning: No nodes are in Ready state."
+                    echo "==> Please check the cluster status manually."
+                fi
+            else
+                echo "==> ⚠ Warning: kubectl cannot connect to the cluster."
+                echo "==> Please verify the cluster configuration manually."
+            fi
+        else
+            echo "==> Error: Ansible playbook failed."
+            exit 1
+        fi
+    fi
+else
+    echo "==> No inventory.yml found at scripts/ansible/inventory.yml. Skipping remote cluster setup."
+    echo "==> To set up a remote cluster, create inventory.yml from inventory.yml.example"
+fi
+
+cd -
+
+source sregym_core/.venv/bin/activate
+# uv pip install -r requirements.txt
+deactivate
diff --git a/benchmarks/sregym/requirements.txt b/benchmarks/sregym/requirements.txt
@@ -0,0 +1,5 @@
+# NOTE(review): requests and azure-identity carry no version pin, unlike the
+# other entries -- consider pinning them for reproducible installs.
+sentence-transformers==4.0.1
+scikit-learn==1.6.1
+requests
+azure-identity
+litellm==1.77.5
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"text": "text of one doc", "metadata": {"scenarios": "XXX", "subtask": "XXXX", "description": "xx", "link": "XXX", "XXX": "XXX"}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"sys_prompt": "You are XXX", "user_prompt": "what", "thinking": "chain of thought", "response": "XXX", "metadata": {"scenario": "XX", "subtask": "XXX", "data_quality":"high", "XXX": "XXX"}}