From b2fa8ac65fbf3683f2f2dbb3a71bd1e3e2e3e25b Mon Sep 17 00:00:00 2001 From: houqiii Date: Thu, 11 Dec 2025 12:24:23 +0800 Subject: [PATCH 1/3] feat: Add TopoSense-Bench (HuggingFace integration) --- benchmarks/toposense_bench/README.md | 90 +++++++++ benchmarks/toposense_bench/env.toml.example | 5 + benchmarks/toposense_bench/requirements.txt | 11 ++ benchmarks/toposense_bench/run.sh | 23 +++ benchmarks/toposense_bench/src/__init__.py | 0 benchmarks/toposense_bench/src/evaluator.py | 77 ++++++++ benchmarks/toposense_bench/src/main.py | 172 ++++++++++++++++++ .../toposense_bench/src/topology_loader.py | 96 ++++++++++ benchmarks/toposense_bench/tests/__init__.py | 0 .../toposense_bench/tests/test_benchmark.py | 40 ++++ 10 files changed, 514 insertions(+) create mode 100644 benchmarks/toposense_bench/README.md create mode 100644 benchmarks/toposense_bench/env.toml.example create mode 100644 benchmarks/toposense_bench/requirements.txt create mode 100755 benchmarks/toposense_bench/run.sh create mode 100644 benchmarks/toposense_bench/src/__init__.py create mode 100644 benchmarks/toposense_bench/src/evaluator.py create mode 100644 benchmarks/toposense_bench/src/main.py create mode 100644 benchmarks/toposense_bench/src/topology_loader.py create mode 100644 benchmarks/toposense_bench/tests/__init__.py create mode 100644 benchmarks/toposense_bench/tests/test_benchmark.py diff --git a/benchmarks/toposense_bench/README.md b/benchmarks/toposense_bench/README.md new file mode 100644 index 0000000..c5c8979 --- /dev/null +++ b/benchmarks/toposense_bench/README.md @@ -0,0 +1,90 @@ +# TopoSense-Bench: Semantic-Spatial Sensor Scheduling + +**TopoSense-Bench** is a large-scale, rigorous benchmark designed to evaluate Large Language Models (LLMs) on the **Semantic-Spatial Sensor Scheduling (SΒ³)** problem. + +Originating from the **ACM MobiCom '26** paper *"IoT-Brain: Grounding LLMs for Semantic-Spatial Sensor Scheduling"*, this benchmark tests an agent's ability to translate high-level natural language user intents (e.g., *"Find my backpack lost between the library and the gym"*) into precise physical sensor activation plans within a large-scale digital twin. + +## πŸ“Š Overview + +- **Source**: Hosted on [Hugging Face](https://huggingface.co/datasets/IoT-Brain-Project/TopoSense-Bench) (Seamlessly integrated via the `datasets` library). +- **Scale**: + - **5,250** Natural Language Queries. + - **2,510** Sensors (Cameras). + - **161** Floor Plans across **33** Buildings. +- **Problem Domain**: Embodied AI, IoT, Spatial Reasoning, and RAG (Retrieval-Augmented Generation). + +## 🎯 Task Taxonomy + +The benchmark categorizes queries into three tiers of complexity based on the spatial scope and reasoning difficulty: + +- **Tier 1: Intra-Zone Perception** + - Simple queries focused on specific rooms or focal areas (e.g., *"Check the entrance of the conference hall"*). +- **Tier 2: Intra-Building Coordination** + - Complex queries requiring navigation across multiple floors within a single building (e.g., *"Track the path from the 4th-floor lab to the ground floor exit"*). +- **Tier 3: Inter-Building Coordination** + - Long-horizon queries involving transitions between outdoor spaces and multiple buildings (e.g., *"I walked from the Library to the Gym, check cameras along the way"*). + +## βš™οΈ Evaluation Methodology + +Unlike standard QA benchmarks, TopoSense-Bench employs a **Retrieval-Augmented Generation (RAG)** workflow to simulate realistic sensor scheduling: + +1. 
**Context Retrieval**: The system dynamically retrieves the relevant topological map data (textual representation of buildings/floors) based on the user's query using a heuristic `TopologyManager`. +2. **Reasoning**: The LLM acts as a scheduler. It must analyze the provided map data and the user's intent to identify the specific sensor node ID that best satisfies the request. +3. **Scoring**: The evaluation uses a parsing-based exact match metric. It compares the core identifier in the LLM's output against the ground truth sensor ID (e.g., `teaching_building_1_camera_03`). + +## πŸš€ Quick Start + +### 1. Installation + +Ensure you are in the `benchmarks/toposense_bench` directory, then install the required dependencies: + +```bash +pip install -r requirements.txt +``` + +### 2. Configuration + +Create or edit `env.toml` to configure your LLM provider. This benchmark uses `litellm` for model calls. + +```toml +[llm] +# Example for OpenAI +OPENAI_API_KEY = "sk-..." + +# Example for DeepSeek (OpenAI-Compatible) +# OPENAI_API_KEY = "sk-..." +# OPENAI_API_BASE = "https://api.deepseek.com" +``` + +### 3. Run Evaluation + +Run the evaluation script. You must specify the model name. + +> **Note**: If using a non-OpenAI provider (like DeepSeek or Qwen) via the OpenAI-compatible endpoint, please add the `openai/` prefix to the model name. + +```bash +# Run with GPT-4o +bash run.sh "gpt-4o" + +# Run with DeepSeek-Chat +bash run.sh "openai/deepseek-chat" +``` + +### 4. Results + +After the run completes, results will be saved in the `outputs/` directory: +- `summary.json`: Overall accuracy and breakdown by task tier. +- `results.jsonl`: Detailed logs including retrieval status, model input/output, and correctness for every query. + +## πŸ“š Citation + +If you use this benchmark in your research, please cite our MobiCom '26 paper: + +```bibtex +@inproceedings{iotbrain2026, + title={IoT-Brain: Grounding LLMs for Semantic-Spatial Sensor Scheduling}, + author={Anonymous Author(s)}, + booktitle={Proceedings of the 32nd Annual International Conference on Mobile Computing and Networking (MobiCom '26)}, + year={2026} +} +``` diff --git a/benchmarks/toposense_bench/env.toml.example b/benchmarks/toposense_bench/env.toml.example new file mode 100644 index 0000000..b014379 --- /dev/null +++ b/benchmarks/toposense_bench/env.toml.example @@ -0,0 +1,5 @@ +[llm] + +OPENAI_API_KEY = "your_key_here" + +OPENAI_API_BASE = "your_url_here" \ No newline at end of file diff --git a/benchmarks/toposense_bench/requirements.txt b/benchmarks/toposense_bench/requirements.txt new file mode 100644 index 0000000..28aeae1 --- /dev/null +++ b/benchmarks/toposense_bench/requirements.txt @@ -0,0 +1,11 @@ +# Hugging Face Ecosystem +datasets>=2.14.0 +huggingface_hub>=0.16.0 + +# Data Processing & Utilities +pandas>=1.5.0 +tqdm +loguru + +# Configuration parsing (for compatibility with older Python versions) +tomli>=2.0.1; python_version < "3.11" \ No newline at end of file diff --git a/benchmarks/toposense_bench/run.sh b/benchmarks/toposense_bench/run.sh new file mode 100755 index 0000000..d0ea063 --- /dev/null +++ b/benchmarks/toposense_bench/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# ============================================================================== +# TopoSense-Bench Execution Script +# +# Usage: +# ./run.sh [model_name] +# +# Examples: +# ./run.sh "gpt-4o" # Run with OpenAI GPT-4o (Default) +# ./run.sh "openai/deepseek-chat" # Run with DeepSeek (via OpenAI-compatible endpoint) +# +# Note: Ensure that API keys are correctly 
configured in 'env.toml'. +# ============================================================================== + +# Set default model to "gpt-4o" if no argument is provided +MODEL_NAME=${1:-"gpt-4o"} + +echo "πŸš€ Starting TopoSense-Bench evaluation..." +echo "πŸ€– Model: $MODEL_NAME" + +# Run the main evaluation script +python src/main.py --model_name "$MODEL_NAME" \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/__init__.py b/benchmarks/toposense_bench/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/toposense_bench/src/evaluator.py b/benchmarks/toposense_bench/src/evaluator.py new file mode 100644 index 0000000..6de5b7d --- /dev/null +++ b/benchmarks/toposense_bench/src/evaluator.py @@ -0,0 +1,77 @@ +"""Evaluator for TopoSense Benchmark.""" + +import re +import ast +from loguru import logger + + +class TopoSenseEvaluator: + """Evaluator class for Semantic-Spatial Sensor Scheduling tasks.""" + + def __init__(self): + pass + + def parse_node_info(self, text): + """ + Parses the Node string representation to extract the critical 'name' tag. + + Input format example: + "Node(223, 307, Tags: {'man_made': 'surveillance', 'name': 'camera_1'})" + + Args: + text (str): The raw ground truth string from the dataset. + + Returns: + str: The extracted sensor name (e.g., "camera_1") or the original text if parsing fails. + """ + try: + # 1. Attempt to extract the Tags dictionary part using regex + tags_match = re.search(r"Tags:\s*(\{.*?\})", text) + if tags_match: + tags_str = tags_match.group(1) + # Safely evaluate the string as a Python dictionary + tags = ast.literal_eval(tags_str) + # Return the 'name' tag converted to lowercase + return tags.get('name', '').lower() + + # 2. Fallback: If it's a pure ID format or regex fails, return normalized text + return text.strip().lower() + except Exception: + return text.strip().lower() + + def eval(self, llm_response_json, ground_truth_str): + """ + Evaluate the LLM's response against the ground truth. + + Args: + llm_response_json (dict): The JSON output from the LLM. + Expected format: {"answer": "...", "explanation": "..."} + ground_truth_str (str): The raw answer string from the dataset. + + Returns: + dict: Evaluation result containing status, score, and parsed ground truth. + """ + # 1. Extract the core answer from the LLM response + llm_answer = str(llm_response_json.get("answer", "")).lower() + + # 2. Parse the unique identifier (Target Name) from the Ground Truth + gt_target_name = self.parse_node_info(ground_truth_str) + + # 3. Evaluation Logic + # Requirement: The LLM's answer must contain the core identifier of the GT. + # Example: + # GT: "fire_fighting_access_1_camera_1" + # LLM: "I suggest using fire_fighting_access_1_camera_1" -> Correct + + # Normalize strings by replacing underscores and hyphens with spaces for robust matching + clean_llm = llm_answer.replace("_", " ").replace("-", " ") + clean_gt = gt_target_name.replace("_", " ").replace("-", " ") + + # Perform containment check + is_correct = clean_gt in clean_llm + + return { + "status": "correct" if is_correct else "incorrect", + "score": 1 if is_correct else 0, + "parsed_gt": gt_target_name + } \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/main.py b/benchmarks/toposense_bench/src/main.py new file mode 100644 index 0000000..8673b46 --- /dev/null +++ b/benchmarks/toposense_bench/src/main.py @@ -0,0 +1,172 @@ +""" +Run TopoSense-Bench Evaluation. 
+ +This script executes the benchmark by loading queries from Hugging Face, +retrieving relevant topological contexts, and evaluating the LLM's response. +""" + +import argparse +import json +import os +import sys +from datetime import datetime + +import pandas as pd +from datasets import load_dataset +from tqdm import tqdm + +# Add parent directory to path to import the shared SDK +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../"))) + +from loguru import logger +from sdk.executor import SimpleExecutor +from sdk.utils import set_llm_endpoint_from_config +from evaluator import TopoSenseEvaluator +from topology_loader import TopologyManager + +# System Prompt emphasizes reasoning based on the provided Map Data +SYSTEM_PROMPT_TEMPLATE = """You are an intelligent sensor scheduling agent (IoT-Brain). +You will be provided with a specific TOPOLOGICAL MAP of a building floor and a USER QUERY. + +Your Task: +1. Analyze the user's intent and location description in the query. +2. Search the provided MAP DATA to find the specific sensor node that best answers the query (e.g., covers the mentioned area). +3. Return the exact 'name' of the sensor node. + +Output Format: +Please return a JSON object: +```json +{ + "answer": "sensor_name_here", + "explanation": "Brief reasoning based on map tags" +} +Output ONLY the JSON code block. +""" + +def main(model_name, output_dir): + """ + Main evaluation loop. + Args: + model_name (str): The name of the LLM to evaluate. + output_dir (str): Directory to save results. + """ + + # 1. Setup Configuration Path + # Prioritize finding env.toml in the current benchmark directory (src/../env.toml) + current_dir = os.path.dirname(os.path.abspath(__file__)) + local_config = os.path.abspath(os.path.join(current_dir, "../env.toml")) + global_config = os.path.abspath(os.path.join(current_dir, "../../env.toml")) + + if os.path.exists(local_config): + config_path = local_config + elif os.path.exists(global_config): + config_path = global_config + else: + config_path = local_config # Default to local path for clearer error messages + + if os.path.exists(config_path): + logger.info(f"Loading config from: {config_path}") + set_llm_endpoint_from_config(config_path) + else: + logger.warning(f"⚠️ Config file not found at {config_path}, relying on environment variables.") + + # Initialize components + executor = SimpleExecutor(model_name, SYSTEM_PROMPT_TEMPLATE) + evaluator = TopoSenseEvaluator() + topo_manager = TopologyManager() # Handles loading and indexing of topology data + + # 2. Load Queries from Hugging Face + logger.info("πŸ“₯ Loading Queries...") + # Hugging Face defaults uploaded JSONL files to the 'train' split if not specified in YAML + dataset = load_dataset("IoT-Brain-Project/TopoSense-Bench", "queries", split="train") + + results = [] + + # 3. 
Evaluation Loop + for item in tqdm(dataset): + query = item['query'] + ground_truth = item['answer'] + category = item['category'] + + # --- Context Retrieval --- + # Retrieve the relevant map/floor plan based on the query + context_map = topo_manager.retrieve_context(query) + + if context_map: + # Oracle Context Mode: Provide the specific map data + user_prompt = f"{context_map}\n\n[User Query]\n{query}" + else: + # Zero-context Fallback: If retrieval fails (e.g., complex cross-floor queries) + user_prompt = ( + f"[Map Data]\n(No specific map retrieved, please rely on common knowledge if possible)\n\n" + f"[User Query]\n{query}" + ) + + try: + # Call LLM using SimpleExecutor (which extracts JSON from markdown blocks) + response_str = executor.run(user_prompt, lang='json') + + # Parse JSON Response + try: + response_json = json.loads(response_str) + llm_answer = response_json.get("answer", "") + except json.JSONDecodeError: + # Fallback if response is not valid JSON + llm_answer = response_str + response_json = {"answer": response_str} + + # Evaluate Answer + eval_res = evaluator.eval(response_json, ground_truth) + + results.append({ + "category": category, + "query": query, + "ground_truth": ground_truth, + "retrieved_context": bool(context_map), # Track retrieval success + "llm_answer": llm_answer, + "status": eval_res["status"], + "score": eval_res["score"] + }) + + except Exception as e: + logger.error(f"Error processing query: {e}") + results.append({ + "query": query, + "status": "error", + "error": str(e) + }) + + # 4. Save Results & Summarize + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + df = pd.DataFrame(results) + df.to_json( + os.path.join(output_dir, "results.jsonl"), + orient="records", + lines=True, + force_ascii=False + ) + + # Calculate overall accuracy + if len(df) > 0: + acc = df[df["status"] == "correct"].shape[0] / len(df) + retrieval_rate = df["retrieved_context"].mean() if "retrieved_context" in df.columns else 0 + else: + acc = 0 + retrieval_rate = 0 + + logger.info(f"βœ… Eval Done. Accuracy: {acc:.2%}") + logger.info(f" Context Retrieval Rate: {retrieval_rate:.2%}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="TopoSense-Bench Evaluation") + parser.add_argument("-m", "--model_name", default="gpt-4o", help="Model name to evaluate") + parser.add_argument("-o", "--output_dir", default=None, help="Directory to save results") + args = parser.parse_args() + if args.output_dir is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + safe_model_name = args.model_name.replace('/', '_') + args.output_dir = f"./outputs/toposense_{safe_model_name}_{timestamp}" + + main(args.model_name, args.output_dir) \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/topology_loader.py b/benchmarks/toposense_bench/src/topology_loader.py new file mode 100644 index 0000000..a289a10 --- /dev/null +++ b/benchmarks/toposense_bench/src/topology_loader.py @@ -0,0 +1,96 @@ +"""Helper to load and index topology data from Hugging Face.""" + +from datasets import load_dataset +from loguru import logger + + +class TopologyManager: + """ + Manages the loading and indexing of the topological knowledge base. + """ + + def __init__(self): + # Index structure: { "building_name": { "floor": "content..." 
} } + self.topo_index = {} + self._load_data() + + def _load_data(self): + """Loads the topology dataset from Hugging Face and builds an in-memory index.""" + logger.info("πŸ—ΊοΈ Loading Topological Knowledgebase from Hugging Face...") + try: + # Load the 'topology' configuration. + # Hugging Face defaults uploaded JSONL files to the 'train' split. + ds = load_dataset("IoT-Brain/TopoSense-Bench", "topology", split="train") + + for item in ds: + # Normalize keys for easier matching (snake_case for building names) + b_name = item['building'].lower().replace(" ", "_") + floor = item['floor'].lower() + content = item['content'] + + if b_name not in self.topo_index: + self.topo_index[b_name] = {} + + self.topo_index[b_name][floor] = content + + logger.info(f"βœ… Indexed {len(self.topo_index)} buildings.") + except Exception as e: + logger.error(f"❌ Failed to load topology: {e}") + + def retrieve_context(self, query): + """ + A simple heuristic retriever. + Identifies the relevant map file based on keywords in the query. + This simulates the 'Topological Anchor' step in the IoT-Brain architecture. + + Args: + query (str): The user's natural language query. + + Returns: + str or None: The content of the specific floor plan if found, else None. + """ + query_lower = query.lower() + + target_building = None + target_floor = None + + # 1. Building Matching Logic + # Iterate through all known building names in the index + for b_name in self.topo_index.keys(): + # Replace underscores with spaces for natural language matching + # (e.g., teaching_building_1 -> "teaching building 1") + natural_name = b_name.replace("_", " ") + if natural_name in query_lower: + target_building = b_name + break + + # 2. Floor Matching Logic + # Handle common short formats: "1f", "2f"... + floors = ["1f", "2f", "3f", "4f", "5f", "6f", "7f", "8f", "9f", "10f"] + for f in floors: + # Match variations like "1st floor", "2nd floor", "10th floor" + digit = f[:-1] + if (f in query_lower or + f"{digit}st floor" in query_lower or + f"{digit}nd floor" in query_lower or + f"{digit}rd floor" in query_lower or + f"{digit}th floor" in query_lower): + target_floor = f.upper() # Standardize to "1F" + break + + # Map explicit natural language floor descriptions to standard format + if "first floor" in query_lower: target_floor = "1F" + if "second floor" in query_lower: target_floor = "2F" + if "third floor" in query_lower: target_floor = "3F" + if "fourth floor" in query_lower: target_floor = "4F" + + # 3. 
Retrieve and Return Map Content + if target_building and target_floor: + # Retrieve specific floor map from index + floors_map = self.topo_index[target_building] + # Try to match the key (case-insensitive) + for key, content in floors_map.items(): + if key.lower() == target_floor.lower(): + return f"Building: {target_building}, Floor: {target_floor}\n\n[Map Data]\n{content}" + + return None \ No newline at end of file diff --git a/benchmarks/toposense_bench/tests/__init__.py b/benchmarks/toposense_bench/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/toposense_bench/tests/test_benchmark.py b/benchmarks/toposense_bench/tests/test_benchmark.py new file mode 100644 index 0000000..fd565e4 --- /dev/null +++ b/benchmarks/toposense_bench/tests/test_benchmark.py @@ -0,0 +1,40 @@ +"""Unit tests for TopoSense-Bench.""" + +import unittest +from datasets import load_dataset + + +class TestTopoSenseBench(unittest.TestCase): + """Test suite for TopoSense-Bench data connectivity and integrity.""" + + def test_hf_connection(self): + """ + Test if we can connect to Hugging Face, authenticate (if needed), + and stream the first data sample successfully. + """ + try: + # Load the dataset in streaming mode to avoid downloading the entire file. + # Note: Using 'train' split as per default Hugging Face JSONL behavior. + dataset = load_dataset( + "IoT-Brain-Project/TopoSense-Bench", + "queries", + split="train", + streaming=True + ) + + # Retrieve the first item to verify data access + first_item = next(iter(dataset)) + + print(f"Successfully loaded item category: {first_item.get('category', 'Unknown')}") + + # Assert that essential fields are present in the data + self.assertTrue('query' in first_item, "Field 'query' is missing.") + self.assertTrue('answer' in first_item, "Field 'answer' is missing.") + self.assertTrue('category' in first_item, "Field 'category' is missing.") + + except Exception as e: + self.fail(f"Failed to load dataset from Hugging Face: {e}") + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 7321ffc857af2bcfbbc4957ba76e733cc4859a3d Mon Sep 17 00:00:00 2001 From: houqiii Date: Sun, 14 Dec 2025 13:32:11 +0800 Subject: [PATCH 2/3] fix: Finalize main.py with robust error handling and output format --- benchmarks/toposense_bench/src/main.py | 232 +++++++++++++++---------- 1 file changed, 142 insertions(+), 90 deletions(-) diff --git a/benchmarks/toposense_bench/src/main.py b/benchmarks/toposense_bench/src/main.py index 8673b46..55b866c 100644 --- a/benchmarks/toposense_bench/src/main.py +++ b/benchmarks/toposense_bench/src/main.py @@ -1,8 +1,5 @@ """ Run TopoSense-Bench Evaluation. - -This script executes the benchmark by loading queries from Hugging Face, -retrieving relevant topological contexts, and evaluating the LLM's response. """ import argparse @@ -24,7 +21,7 @@ from evaluator import TopoSenseEvaluator from topology_loader import TopologyManager -# System Prompt emphasizes reasoning based on the provided Map Data +# System Prompt SYSTEM_PROMPT_TEMPLATE = """You are an intelligent sensor scheduling agent (IoT-Brain). You will be provided with a specific TOPOLOGICAL MAP of a building floor and a USER QUERY. @@ -42,17 +39,41 @@ } Output ONLY the JSON code block. 
""" +def compute_summary(results_df): + """Compute summary statistics.""" + total_questions = len(results_df) + answered = len(results_df[results_df["status"] != "error"]) + correct = len(results_df[results_df["status"] == "correct"]) + incorrect = len(results_df[results_df["status"] == "incorrect"]) + + summary = { + "overall": { + "total_questions": total_questions, + "answered": answered, + "correct": correct, + "incorrect": incorrect, + "accuracy": round(correct / answered, 4) if answered > 0 else 0, + }, + "by_category": [] + } + + if "category" in results_df.columns: + for category in results_df["category"].unique(): + cat_df = results_df[results_df["category"] == category] + cat_total = len(cat_df) + cat_correct = len(cat_df[cat_df["status"] == "correct"]) + + summary["by_category"].append({ + "category": category, + "total": cat_total, + "correct": cat_correct, + "accuracy": round(cat_correct / cat_total, 4) if cat_total > 0 else 0 + }) + return summary def main(model_name, output_dir): - """ - Main evaluation loop. - Args: - model_name (str): The name of the LLM to evaluate. - output_dir (str): Directory to save results. - """ - + """Main evaluation loop.""" # 1. Setup Configuration Path - # Prioritize finding env.toml in the current benchmark directory (src/../env.toml) current_dir = os.path.dirname(os.path.abspath(__file__)) local_config = os.path.abspath(os.path.join(current_dir, "../env.toml")) global_config = os.path.abspath(os.path.join(current_dir, "../../env.toml")) @@ -62,108 +83,139 @@ def main(model_name, output_dir): elif os.path.exists(global_config): config_path = global_config else: - config_path = local_config # Default to local path for clearer error messages + config_path = local_config if os.path.exists(config_path): logger.info(f"Loading config from: {config_path}") set_llm_endpoint_from_config(config_path) else: - logger.warning(f"⚠️ Config file not found at {config_path}, relying on environment variables.") + logger.warning(f"Config file not found at {config_path}, relying on environment variables.") # Initialize components - executor = SimpleExecutor(model_name, SYSTEM_PROMPT_TEMPLATE) + try: + executor = SimpleExecutor(model_name, SYSTEM_PROMPT_TEMPLATE) + except Exception as e: + logger.error(f"Failed to initialize Executor: {e}") + return + evaluator = TopoSenseEvaluator() - topo_manager = TopologyManager() # Handles loading and indexing of topology data + topo_manager = TopologyManager() - # 2. Load Queries from Hugging Face - logger.info("πŸ“₯ Loading Queries...") - # Hugging Face defaults uploaded JSONL files to the 'train' split if not specified in YAML - dataset = load_dataset("IoT-Brain-Project/TopoSense-Bench", "queries", split="train") + # 2. Load Queries + logger.info("πŸ“₯ Loading Queries from Hugging Face...") + try: + dataset = load_dataset("IoT-Brain/TopoSense-Bench", "queries", split="train") + logger.info(f"βœ… Loaded {len(dataset)} queries.") + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + return - results = [] + minimal_results = [] + detailed_results = [] # 3. 
Evaluation Loop - for item in tqdm(dataset): - query = item['query'] - ground_truth = item['answer'] - category = item['category'] - - # --- Context Retrieval --- - # Retrieve the relevant map/floor plan based on the query - context_map = topo_manager.retrieve_context(query) - - if context_map: - # Oracle Context Mode: Provide the specific map data - user_prompt = f"{context_map}\n\n[User Query]\n{query}" - else: - # Zero-context Fallback: If retrieval fails (e.g., complex cross-floor queries) - user_prompt = ( - f"[Map Data]\n(No specific map retrieved, please rely on common knowledge if possible)\n\n" - f"[User Query]\n{query}" - ) - - try: - # Call LLM using SimpleExecutor (which extracts JSON from markdown blocks) - response_str = executor.run(user_prompt, lang='json') - - # Parse JSON Response + try: + for item in tqdm(dataset): + query = item['query'] + ground_truth = item['answer'] + category = item['category'] + + # Context Retrieval + context_map = topo_manager.retrieve_context(query) + + if context_map: + user_prompt = f"{context_map}\n\n[User Query]\n{query}" + else: + user_prompt = ( + f"[Map Data]\n(No specific map retrieved, relying on common knowledge)\n\n" + f"[User Query]\n{query}" + ) + try: - response_json = json.loads(response_str) - llm_answer = response_json.get("answer", "") - except json.JSONDecodeError: - # Fallback if response is not valid JSON - llm_answer = response_str - response_json = {"answer": response_str} + response_str = executor.run(user_prompt, lang='json') + + try: + response_json = json.loads(response_str) + llm_answer = response_json.get("answer", "") + llm_explanation = response_json.get("explanation", "") + except json.JSONDecodeError: + llm_answer = response_str + llm_explanation = "Failed to parse JSON" + response_json = {"answer": response_str} + + # Evaluate Answer + eval_res = evaluator.eval(response_json, ground_truth) + + # Construct Result Objects + minimal_result = { + "category": category, + "query": query, + "ground_truth": ground_truth, + "llm_answer": llm_answer, + "status": eval_res["status"], + "score": eval_res["score"] + } + + detailed_result = { + **minimal_result, + "llm_explanation": llm_explanation, + "retrieved_context": bool(context_map), + "full_prompt": user_prompt, + "raw_response": response_str + } + + minimal_results.append(minimal_result) + detailed_results.append(detailed_result) + + except Exception as e: + logger.error(f"Error processing query: {e}") + error_result = { + "category": category, + "query": query, + "status": "error", + "error": str(e) + } + minimal_results.append(error_result) + detailed_results.append(error_result) + + except KeyboardInterrupt: + logger.warning("Evaluation interrupted by user. Saving partial results...") + + # 4. 
Save Results + if not os.path.exists(output_dir): + os.makedirs(output_dir) - # Evaluate Answer - eval_res = evaluator.eval(response_json, ground_truth) + # 4.1 Save Detailed Results (JSONL) + with open(os.path.join(output_dir, "results_detailed.jsonl"), "w", encoding="utf-8") as f: + for res in detailed_results: + f.write(json.dumps(res, ensure_ascii=False) + "\n") - results.append({ - "category": category, - "query": query, - "ground_truth": ground_truth, - "retrieved_context": bool(context_map), # Track retrieval success - "llm_answer": llm_answer, - "status": eval_res["status"], - "score": eval_res["score"] - }) + # 4.2 Save Minimal Results (JSONL) + with open(os.path.join(output_dir, "results.jsonl"), "w", encoding="utf-8") as f: + for res in minimal_results: + f.write(json.dumps(res, ensure_ascii=False) + "\n") - except Exception as e: - logger.error(f"Error processing query: {e}") - results.append({ - "query": query, - "status": "error", - "error": str(e) - }) + # 4.3 Generate Summary + results_df = pd.DataFrame(minimal_results) + if not results_df.empty: + summary = compute_summary(results_df) + summary["model"] = model_name + summary["timestamp"] = datetime.now().isoformat() - # 4. Save Results & Summarize - if not os.path.exists(output_dir): - os.makedirs(output_dir) + with open(os.path.join(output_dir, "summary.json"), "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, ensure_ascii=False) - df = pd.DataFrame(results) - df.to_json( - os.path.join(output_dir, "results.jsonl"), - orient="records", - lines=True, - force_ascii=False - ) - - # Calculate overall accuracy - if len(df) > 0: - acc = df[df["status"] == "correct"].shape[0] / len(df) - retrieval_rate = df["retrieved_context"].mean() if "retrieved_context" in df.columns else 0 + logger.info(f"βœ… Eval Done. Accuracy: {summary['overall']['accuracy']:.2%}") + logger.info(f"πŸ“‚ Results saved to: {output_dir}") else: - acc = 0 - retrieval_rate = 0 - - logger.info(f"βœ… Eval Done. 
Accuracy: {acc:.2%}") - logger.info(f" Context Retrieval Rate: {retrieval_rate:.2%}") + logger.warning("No results to save.") -if __name__ == "__main__": +if __name__ == "__main__": parser = argparse.ArgumentParser(description="TopoSense-Bench Evaluation") parser.add_argument("-m", "--model_name", default="gpt-4o", help="Model name to evaluate") parser.add_argument("-o", "--output_dir", default=None, help="Directory to save results") args = parser.parse_args() + if args.output_dir is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") safe_model_name = args.model_name.replace('/', '_') From c1dd37f6258560622d6a8ca11de2de39cb8b3559 Mon Sep 17 00:00:00 2001 From: houqiii Date: Sun, 14 Dec 2025 13:43:40 +0800 Subject: [PATCH 3/3] fix: Finalize codebase (docs, CI, install script, and output format) --- .github/workflows/test.yml | 1 + README.md | 1 + benchmarks/toposense_bench/README.md | 2 +- benchmarks/toposense_bench/Why.md | 15 +++++++++++++++ benchmarks/toposense_bench/install.sh | 16 ++++++++++++++++ benchmarks/toposense_bench/src/main.py | 1 + .../toposense_bench/tests/test_benchmark.py | 2 +- 7 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 benchmarks/toposense_bench/Why.md create mode 100755 benchmarks/toposense_bench/install.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a348b86..32dd8b9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,6 +19,7 @@ jobs: benchmark: - example_bench - course_exam_bench + - toposense_bench # TODO: For now, we comment out other benchmarks as they have no tests # - arteval_bench # - cache_bench diff --git a/README.md b/README.md index 8ea3103..47300b5 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ System Intelligence Benchmark currently includes the following example benchmark - **System Lab Benchmark** ([benchmarks/course_lab_bench/](benchmarks/course_lab_bench/)) - Assesses AI capability on practical system course labs and projects - **System Artifact Benchmark** ([benchmarks/arteval_bench/](benchmarks/arteval_bench/)) - Evaluates AI performance on artifact evaluation - **System Modeling Benchmark** ([benchmarks/sysmobench/](benchmarks/sysmobench/)) - Evaluates an agent's ability to produce correct TLA+ models for real-world concurrent and distributed systems, covering system capabilities across system comprehension, abstraction, and potentially tool fluency. +- **TopoSense Benchmark** ([benchmarks/toposense_bench/](benchmarks/toposense_bench/)) - Evaluates Semantic-Spatial Sensor Scheduling (SΒ³) capabilities in large-scale IoT digital twins (5,250 queries across 2,510 cameras) - **Example Benchmark** ([benchmarks/example_bench/](benchmarks/example_bench/)) - Template and reference implementation for creating new benchmarks ## Quick Start diff --git a/benchmarks/toposense_bench/README.md b/benchmarks/toposense_bench/README.md index c5c8979..13d3214 100644 --- a/benchmarks/toposense_bench/README.md +++ b/benchmarks/toposense_bench/README.md @@ -6,7 +6,7 @@ Originating from the **ACM MobiCom '26** paper *"IoT-Brain: Grounding LLMs for S ## πŸ“Š Overview -- **Source**: Hosted on [Hugging Face](https://huggingface.co/datasets/IoT-Brain-Project/TopoSense-Bench) (Seamlessly integrated via the `datasets` library). +- **Source**: Hosted on [Hugging Face](https://huggingface.co/datasets/IoT-Brain/TopoSense-Bench) (Seamlessly integrated via the `datasets` library). - **Scale**: - **5,250** Natural Language Queries. - **2,510** Sensors (Cameras). 
diff --git a/benchmarks/toposense_bench/Why.md b/benchmarks/toposense_bench/Why.md new file mode 100644 index 0000000..f096e7c --- /dev/null +++ b/benchmarks/toposense_bench/Why.md @@ -0,0 +1,15 @@ +# Why TopoSense-Bench? + +## The Problem: The Semantic-Physical Mapping Gap +Modern IoT systems are transitioning from passive monitoring to intent-driven operation. However, a critical gap exists between high-level human intent (e.g., *"Find my backpack lost between the library and the gym"*) and the precise physical sensor actions required to fulfill it. + +Existing benchmarks often focus on pure QA or code generation, overlooking the **embodied** and **spatial** reasoning capabilities required for real-world cyber-physical systems. + +## The Solution: Semantic-Spatial Sensor Scheduling (SΒ³) +TopoSense-Bench introduces the SΒ³ challenge, requiring LLMs to: +1. **Reason Spatially**: Understand complex topological relationships (connectivity, floor transitions) in a large-scale digital twin. +2. **Act Proactively**: Select the optimal subset of sensors from a massive network (2,510 cameras) to satisfy a query, rather than just answering a text question. +3. **Ground in Reality**: Map vague natural language to concrete sensor identifiers (e.g., `teaching_building_1_camera_03`). + +## Impact +By mastering this benchmark, LLMs demonstrate the capability to serve as the "brain" for large-scale smart city and smart campus infrastructures, moving beyond chatbots to actionable physical agents. \ No newline at end of file diff --git a/benchmarks/toposense_bench/install.sh b/benchmarks/toposense_bench/install.sh new file mode 100755 index 0000000..8526a25 --- /dev/null +++ b/benchmarks/toposense_bench/install.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Create virtual environment +python3 -m venv .venv + +# Activate virtual environment +source .venv/bin/activate + +# Upgrade pip +pip install --upgrade pip + +# Install requirements +pip install -r requirements.txt + +echo "βœ… Installation complete. Virtual environment created in .venv/" +echo "πŸ‘‰ To activate: source .venv/bin/activate" \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/main.py b/benchmarks/toposense_bench/src/main.py index 55b866c..243ab87 100644 --- a/benchmarks/toposense_bench/src/main.py +++ b/benchmarks/toposense_bench/src/main.py @@ -37,6 +37,7 @@ "answer": "sensor_name_here", "explanation": "Brief reasoning based on map tags" } +``` Output ONLY the JSON code block. """ def compute_summary(results_df): diff --git a/benchmarks/toposense_bench/tests/test_benchmark.py b/benchmarks/toposense_bench/tests/test_benchmark.py index fd565e4..a23e100 100644 --- a/benchmarks/toposense_bench/tests/test_benchmark.py +++ b/benchmarks/toposense_bench/tests/test_benchmark.py @@ -16,7 +16,7 @@ def test_hf_connection(self): # Load the dataset in streaming mode to avoid downloading the entire file. # Note: Using 'train' split as per default Hugging Face JSONL behavior. dataset = load_dataset( - "IoT-Brain-Project/TopoSense-Bench", + "IoT-Brain/TopoSense-Bench", "queries", split="train", streaming=True
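
For reviewers: the snippet below is a minimal, self-contained sketch of the scoring rule introduced in `benchmarks/toposense_bench/src/evaluator.py` (extract the `name` tag from the `Node(...)` ground-truth string, normalize underscores/hyphens to spaces, then check containment). It is illustrative only: the function names and candidate answers are hypothetical, and the ground-truth string is the example from the evaluator's docstring.

```python
# Minimal sketch of the containment metric from evaluator.py.
# Function names and candidate answers are illustrative, not part of the patch.
import ast
import re


def parse_node_name(gt: str) -> str:
    """Extract the 'name' tag from a Node(...) ground-truth string."""
    match = re.search(r"Tags:\s*(\{.*?\})", gt)
    if match:
        tags = ast.literal_eval(match.group(1))
        return tags.get("name", "").lower()
    return gt.strip().lower()


def is_correct(llm_answer: str, gt: str) -> bool:
    """Normalize underscores/hyphens to spaces, then check that the GT name is contained in the answer."""
    clean_llm = llm_answer.lower().replace("_", " ").replace("-", " ")
    clean_gt = parse_node_name(gt).replace("_", " ").replace("-", " ")
    return clean_gt in clean_llm


ground_truth = "Node(223, 307, Tags: {'man_made': 'surveillance', 'name': 'camera_1'})"
print(is_correct("I suggest using camera_1", ground_truth))       # True
print(is_correct("Use camera_2 at the entrance", ground_truth))   # False
```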