From b2fa8ac65fbf3683f2f2dbb3a71bd1e3e2e3e25b Mon Sep 17 00:00:00 2001 From: houqiii Date: Thu, 11 Dec 2025 12:24:23 +0800 Subject: [PATCH 1/3] feat: Add TopoSense-Bench (HuggingFace integration) --- benchmarks/toposense_bench/README.md | 90 +++++++++ benchmarks/toposense_bench/env.toml.example | 5 + benchmarks/toposense_bench/requirements.txt | 11 ++ benchmarks/toposense_bench/run.sh | 23 +++ benchmarks/toposense_bench/src/__init__.py | 0 benchmarks/toposense_bench/src/evaluator.py | 77 ++++++++ benchmarks/toposense_bench/src/main.py | 172 ++++++++++++++++++ .../toposense_bench/src/topology_loader.py | 96 ++++++++++ benchmarks/toposense_bench/tests/__init__.py | 0 .../toposense_bench/tests/test_benchmark.py | 40 ++++ 10 files changed, 514 insertions(+) create mode 100644 benchmarks/toposense_bench/README.md create mode 100644 benchmarks/toposense_bench/env.toml.example create mode 100644 benchmarks/toposense_bench/requirements.txt create mode 100755 benchmarks/toposense_bench/run.sh create mode 100644 benchmarks/toposense_bench/src/__init__.py create mode 100644 benchmarks/toposense_bench/src/evaluator.py create mode 100644 benchmarks/toposense_bench/src/main.py create mode 100644 benchmarks/toposense_bench/src/topology_loader.py create mode 100644 benchmarks/toposense_bench/tests/__init__.py create mode 100644 benchmarks/toposense_bench/tests/test_benchmark.py diff --git a/benchmarks/toposense_bench/README.md b/benchmarks/toposense_bench/README.md new file mode 100644 index 0000000..c5c8979 --- /dev/null +++ b/benchmarks/toposense_bench/README.md @@ -0,0 +1,90 @@ +# TopoSense-Bench: Semantic-Spatial Sensor Scheduling + +**TopoSense-Bench** is a large-scale, rigorous benchmark designed to evaluate Large Language Models (LLMs) on the **Semantic-Spatial Sensor Scheduling (SΒ³)** problem. + +Originating from the **ACM MobiCom '26** paper *"IoT-Brain: Grounding LLMs for Semantic-Spatial Sensor Scheduling"*, this benchmark tests an agent's ability to translate high-level natural language user intents (e.g., *"Find my backpack lost between the library and the gym"*) into precise physical sensor activation plans within a large-scale digital twin. + +## πŸ“Š Overview + +- **Source**: Hosted on [Hugging Face](https://huggingface.co/datasets/IoT-Brain-Project/TopoSense-Bench) (Seamlessly integrated via the `datasets` library). +- **Scale**: + - **5,250** Natural Language Queries. + - **2,510** Sensors (Cameras). + - **161** Floor Plans across **33** Buildings. +- **Problem Domain**: Embodied AI, IoT, Spatial Reasoning, and RAG (Retrieval-Augmented Generation). + +## 🎯 Task Taxonomy + +The benchmark categorizes queries into three tiers of complexity based on the spatial scope and reasoning difficulty: + +- **Tier 1: Intra-Zone Perception** + - Simple queries focused on specific rooms or focal areas (e.g., *"Check the entrance of the conference hall"*). +- **Tier 2: Intra-Building Coordination** + - Complex queries requiring navigation across multiple floors within a single building (e.g., *"Track the path from the 4th-floor lab to the ground floor exit"*). +- **Tier 3: Inter-Building Coordination** + - Long-horizon queries involving transitions between outdoor spaces and multiple buildings (e.g., *"I walked from the Library to the Gym, check cameras along the way"*). + +## βš™οΈ Evaluation Methodology + +Unlike standard QA benchmarks, TopoSense-Bench employs a **Retrieval-Augmented Generation (RAG)** workflow to simulate realistic sensor scheduling: + +1. 
**Context Retrieval**: The system dynamically retrieves the relevant topological map data (textual representation of buildings/floors) based on the user's query using a heuristic `TopologyManager`. +2. **Reasoning**: The LLM acts as a scheduler. It must analyze the provided map data and the user's intent to identify the specific sensor node ID that best satisfies the request. +3. **Scoring**: The evaluation uses a parsing-based exact match metric. It compares the core identifier in the LLM's output against the ground truth sensor ID (e.g., `teaching_building_1_camera_03`). + +## πŸš€ Quick Start + +### 1. Installation + +Ensure you are in the `benchmarks/toposense_bench` directory, then install the required dependencies: + +```bash +pip install -r requirements.txt +``` + +### 2. Configuration + +Create or edit `env.toml` to configure your LLM provider. This benchmark uses `litellm` for model calls. + +```toml +[llm] +# Example for OpenAI +OPENAI_API_KEY = "sk-..." + +# Example for DeepSeek (OpenAI-Compatible) +# OPENAI_API_KEY = "sk-..." +# OPENAI_API_BASE = "https://api.deepseek.com" +``` + +### 3. Run Evaluation + +Run the evaluation script. You must specify the model name. + +> **Note**: If using a non-OpenAI provider (like DeepSeek or Qwen) via the OpenAI-compatible endpoint, please add the `openai/` prefix to the model name. + +```bash +# Run with GPT-4o +bash run.sh "gpt-4o" + +# Run with DeepSeek-Chat +bash run.sh "openai/deepseek-chat" +``` + +### 4. Results + +After the run completes, results will be saved in the `outputs/` directory: +- `summary.json`: Overall accuracy and breakdown by task tier. +- `results.jsonl`: Detailed logs including retrieval status, model input/output, and correctness for every query. + +## πŸ“š Citation + +If you use this benchmark in your research, please cite our MobiCom '26 paper: + +```bibtex +@inproceedings{iotbrain2026, + title={IoT-Brain: Grounding LLMs for Semantic-Spatial Sensor Scheduling}, + author={Anonymous Author(s)}, + booktitle={Proceedings of the 32nd Annual International Conference on Mobile Computing and Networking (MobiCom '26)}, + year={2026} +} +``` diff --git a/benchmarks/toposense_bench/env.toml.example b/benchmarks/toposense_bench/env.toml.example new file mode 100644 index 0000000..b014379 --- /dev/null +++ b/benchmarks/toposense_bench/env.toml.example @@ -0,0 +1,5 @@ +[llm] + +OPENAI_API_KEY = "your_key_here" + +OPENAI_API_BASE = "your_url_here" \ No newline at end of file diff --git a/benchmarks/toposense_bench/requirements.txt b/benchmarks/toposense_bench/requirements.txt new file mode 100644 index 0000000..28aeae1 --- /dev/null +++ b/benchmarks/toposense_bench/requirements.txt @@ -0,0 +1,11 @@ +# Hugging Face Ecosystem +datasets>=2.14.0 +huggingface_hub>=0.16.0 + +# Data Processing & Utilities +pandas>=1.5.0 +tqdm +loguru + +# Configuration parsing (for compatibility with older Python versions) +tomli>=2.0.1; python_version < "3.11" \ No newline at end of file diff --git a/benchmarks/toposense_bench/run.sh b/benchmarks/toposense_bench/run.sh new file mode 100755 index 0000000..d0ea063 --- /dev/null +++ b/benchmarks/toposense_bench/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# ============================================================================== +# TopoSense-Bench Execution Script +# +# Usage: +# ./run.sh [model_name] +# +# Examples: +# ./run.sh "gpt-4o" # Run with OpenAI GPT-4o (Default) +# ./run.sh "openai/deepseek-chat" # Run with DeepSeek (via OpenAI-compatible endpoint) +# +# Note: Ensure that API keys are correctly 
configured in 'env.toml'. +# ============================================================================== + +# Set default model to "gpt-4o" if no argument is provided +MODEL_NAME=${1:-"gpt-4o"} + +echo "πŸš€ Starting TopoSense-Bench evaluation..." +echo "πŸ€– Model: $MODEL_NAME" + +# Run the main evaluation script +python src/main.py --model_name "$MODEL_NAME" \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/__init__.py b/benchmarks/toposense_bench/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/toposense_bench/src/evaluator.py b/benchmarks/toposense_bench/src/evaluator.py new file mode 100644 index 0000000..6de5b7d --- /dev/null +++ b/benchmarks/toposense_bench/src/evaluator.py @@ -0,0 +1,77 @@ +"""Evaluator for TopoSense Benchmark.""" + +import re +import ast +from loguru import logger + + +class TopoSenseEvaluator: + """Evaluator class for Semantic-Spatial Sensor Scheduling tasks.""" + + def __init__(self): + pass + + def parse_node_info(self, text): + """ + Parses the Node string representation to extract the critical 'name' tag. + + Input format example: + "Node(223, 307, Tags: {'man_made': 'surveillance', 'name': 'camera_1'})" + + Args: + text (str): The raw ground truth string from the dataset. + + Returns: + str: The extracted sensor name (e.g., "camera_1") or the original text if parsing fails. + """ + try: + # 1. Attempt to extract the Tags dictionary part using regex + tags_match = re.search(r"Tags:\s*(\{.*?\})", text) + if tags_match: + tags_str = tags_match.group(1) + # Safely evaluate the string as a Python dictionary + tags = ast.literal_eval(tags_str) + # Return the 'name' tag converted to lowercase + return tags.get('name', '').lower() + + # 2. Fallback: If it's a pure ID format or regex fails, return normalized text + return text.strip().lower() + except Exception: + return text.strip().lower() + + def eval(self, llm_response_json, ground_truth_str): + """ + Evaluate the LLM's response against the ground truth. + + Args: + llm_response_json (dict): The JSON output from the LLM. + Expected format: {"answer": "...", "explanation": "..."} + ground_truth_str (str): The raw answer string from the dataset. + + Returns: + dict: Evaluation result containing status, score, and parsed ground truth. + """ + # 1. Extract the core answer from the LLM response + llm_answer = str(llm_response_json.get("answer", "")).lower() + + # 2. Parse the unique identifier (Target Name) from the Ground Truth + gt_target_name = self.parse_node_info(ground_truth_str) + + # 3. Evaluation Logic + # Requirement: The LLM's answer must contain the core identifier of the GT. + # Example: + # GT: "fire_fighting_access_1_camera_1" + # LLM: "I suggest using fire_fighting_access_1_camera_1" -> Correct + + # Normalize strings by replacing underscores and hyphens with spaces for robust matching + clean_llm = llm_answer.replace("_", " ").replace("-", " ") + clean_gt = gt_target_name.replace("_", " ").replace("-", " ") + + # Perform containment check + is_correct = clean_gt in clean_llm + + return { + "status": "correct" if is_correct else "incorrect", + "score": 1 if is_correct else 0, + "parsed_gt": gt_target_name + } \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/main.py b/benchmarks/toposense_bench/src/main.py new file mode 100644 index 0000000..8673b46 --- /dev/null +++ b/benchmarks/toposense_bench/src/main.py @@ -0,0 +1,172 @@ +""" +Run TopoSense-Bench Evaluation. 
+ +This script executes the benchmark by loading queries from Hugging Face, +retrieving relevant topological contexts, and evaluating the LLM's response. +""" + +import argparse +import json +import os +import sys +from datetime import datetime + +import pandas as pd +from datasets import load_dataset +from tqdm import tqdm + +# Add parent directory to path to import the shared SDK +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../"))) + +from loguru import logger +from sdk.executor import SimpleExecutor +from sdk.utils import set_llm_endpoint_from_config +from evaluator import TopoSenseEvaluator +from topology_loader import TopologyManager + +# System Prompt emphasizes reasoning based on the provided Map Data +SYSTEM_PROMPT_TEMPLATE = """You are an intelligent sensor scheduling agent (IoT-Brain). +You will be provided with a specific TOPOLOGICAL MAP of a building floor and a USER QUERY. + +Your Task: +1. Analyze the user's intent and location description in the query. +2. Search the provided MAP DATA to find the specific sensor node that best answers the query (e.g., covers the mentioned area). +3. Return the exact 'name' of the sensor node. + +Output Format: +Please return a JSON object: +```json +{ + "answer": "sensor_name_here", + "explanation": "Brief reasoning based on map tags" +} +Output ONLY the JSON code block. +""" + +def main(model_name, output_dir): + """ + Main evaluation loop. + Args: + model_name (str): The name of the LLM to evaluate. + output_dir (str): Directory to save results. + """ + + # 1. Setup Configuration Path + # Prioritize finding env.toml in the current benchmark directory (src/../env.toml) + current_dir = os.path.dirname(os.path.abspath(__file__)) + local_config = os.path.abspath(os.path.join(current_dir, "../env.toml")) + global_config = os.path.abspath(os.path.join(current_dir, "../../env.toml")) + + if os.path.exists(local_config): + config_path = local_config + elif os.path.exists(global_config): + config_path = global_config + else: + config_path = local_config # Default to local path for clearer error messages + + if os.path.exists(config_path): + logger.info(f"Loading config from: {config_path}") + set_llm_endpoint_from_config(config_path) + else: + logger.warning(f"⚠️ Config file not found at {config_path}, relying on environment variables.") + + # Initialize components + executor = SimpleExecutor(model_name, SYSTEM_PROMPT_TEMPLATE) + evaluator = TopoSenseEvaluator() + topo_manager = TopologyManager() # Handles loading and indexing of topology data + + # 2. Load Queries from Hugging Face + logger.info("πŸ“₯ Loading Queries...") + # Hugging Face defaults uploaded JSONL files to the 'train' split if not specified in YAML + dataset = load_dataset("IoT-Brain-Project/TopoSense-Bench", "queries", split="train") + + results = [] + + # 3. 
Evaluation Loop + for item in tqdm(dataset): + query = item['query'] + ground_truth = item['answer'] + category = item['category'] + + # --- Context Retrieval --- + # Retrieve the relevant map/floor plan based on the query + context_map = topo_manager.retrieve_context(query) + + if context_map: + # Oracle Context Mode: Provide the specific map data + user_prompt = f"{context_map}\n\n[User Query]\n{query}" + else: + # Zero-context Fallback: If retrieval fails (e.g., complex cross-floor queries) + user_prompt = ( + f"[Map Data]\n(No specific map retrieved, please rely on common knowledge if possible)\n\n" + f"[User Query]\n{query}" + ) + + try: + # Call LLM using SimpleExecutor (which extracts JSON from markdown blocks) + response_str = executor.run(user_prompt, lang='json') + + # Parse JSON Response + try: + response_json = json.loads(response_str) + llm_answer = response_json.get("answer", "") + except json.JSONDecodeError: + # Fallback if response is not valid JSON + llm_answer = response_str + response_json = {"answer": response_str} + + # Evaluate Answer + eval_res = evaluator.eval(response_json, ground_truth) + + results.append({ + "category": category, + "query": query, + "ground_truth": ground_truth, + "retrieved_context": bool(context_map), # Track retrieval success + "llm_answer": llm_answer, + "status": eval_res["status"], + "score": eval_res["score"] + }) + + except Exception as e: + logger.error(f"Error processing query: {e}") + results.append({ + "query": query, + "status": "error", + "error": str(e) + }) + + # 4. Save Results & Summarize + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + df = pd.DataFrame(results) + df.to_json( + os.path.join(output_dir, "results.jsonl"), + orient="records", + lines=True, + force_ascii=False + ) + + # Calculate overall accuracy + if len(df) > 0: + acc = df[df["status"] == "correct"].shape[0] / len(df) + retrieval_rate = df["retrieved_context"].mean() if "retrieved_context" in df.columns else 0 + else: + acc = 0 + retrieval_rate = 0 + + logger.info(f"βœ… Eval Done. Accuracy: {acc:.2%}") + logger.info(f" Context Retrieval Rate: {retrieval_rate:.2%}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="TopoSense-Bench Evaluation") + parser.add_argument("-m", "--model_name", default="gpt-4o", help="Model name to evaluate") + parser.add_argument("-o", "--output_dir", default=None, help="Directory to save results") + args = parser.parse_args() + if args.output_dir is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + safe_model_name = args.model_name.replace('/', '_') + args.output_dir = f"./outputs/toposense_{safe_model_name}_{timestamp}" + + main(args.model_name, args.output_dir) \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/topology_loader.py b/benchmarks/toposense_bench/src/topology_loader.py new file mode 100644 index 0000000..a289a10 --- /dev/null +++ b/benchmarks/toposense_bench/src/topology_loader.py @@ -0,0 +1,96 @@ +"""Helper to load and index topology data from Hugging Face.""" + +from datasets import load_dataset +from loguru import logger + + +class TopologyManager: + """ + Manages the loading and indexing of the topological knowledge base. + """ + + def __init__(self): + # Index structure: { "building_name": { "floor": "content..." 
} } + self.topo_index = {} + self._load_data() + + def _load_data(self): + """Loads the topology dataset from Hugging Face and builds an in-memory index.""" + logger.info("πŸ—ΊοΈ Loading Topological Knowledgebase from Hugging Face...") + try: + # Load the 'topology' configuration. + # Hugging Face defaults uploaded JSONL files to the 'train' split. + ds = load_dataset("IoT-Brain/TopoSense-Bench", "topology", split="train") + + for item in ds: + # Normalize keys for easier matching (snake_case for building names) + b_name = item['building'].lower().replace(" ", "_") + floor = item['floor'].lower() + content = item['content'] + + if b_name not in self.topo_index: + self.topo_index[b_name] = {} + + self.topo_index[b_name][floor] = content + + logger.info(f"βœ… Indexed {len(self.topo_index)} buildings.") + except Exception as e: + logger.error(f"❌ Failed to load topology: {e}") + + def retrieve_context(self, query): + """ + A simple heuristic retriever. + Identifies the relevant map file based on keywords in the query. + This simulates the 'Topological Anchor' step in the IoT-Brain architecture. + + Args: + query (str): The user's natural language query. + + Returns: + str or None: The content of the specific floor plan if found, else None. + """ + query_lower = query.lower() + + target_building = None + target_floor = None + + # 1. Building Matching Logic + # Iterate through all known building names in the index + for b_name in self.topo_index.keys(): + # Replace underscores with spaces for natural language matching + # (e.g., teaching_building_1 -> "teaching building 1") + natural_name = b_name.replace("_", " ") + if natural_name in query_lower: + target_building = b_name + break + + # 2. Floor Matching Logic + # Handle common short formats: "1f", "2f"... + floors = ["1f", "2f", "3f", "4f", "5f", "6f", "7f", "8f", "9f", "10f"] + for f in floors: + # Match variations like "1st floor", "2nd floor", "10th floor" + digit = f[:-1] + if (f in query_lower or + f"{digit}st floor" in query_lower or + f"{digit}nd floor" in query_lower or + f"{digit}rd floor" in query_lower or + f"{digit}th floor" in query_lower): + target_floor = f.upper() # Standardize to "1F" + break + + # Map explicit natural language floor descriptions to standard format + if "first floor" in query_lower: target_floor = "1F" + if "second floor" in query_lower: target_floor = "2F" + if "third floor" in query_lower: target_floor = "3F" + if "fourth floor" in query_lower: target_floor = "4F" + + # 3. 
Retrieve and Return Map Content + if target_building and target_floor: + # Retrieve specific floor map from index + floors_map = self.topo_index[target_building] + # Try to match the key (case-insensitive) + for key, content in floors_map.items(): + if key.lower() == target_floor.lower(): + return f"Building: {target_building}, Floor: {target_floor}\n\n[Map Data]\n{content}" + + return None \ No newline at end of file diff --git a/benchmarks/toposense_bench/tests/__init__.py b/benchmarks/toposense_bench/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/toposense_bench/tests/test_benchmark.py b/benchmarks/toposense_bench/tests/test_benchmark.py new file mode 100644 index 0000000..fd565e4 --- /dev/null +++ b/benchmarks/toposense_bench/tests/test_benchmark.py @@ -0,0 +1,40 @@ +"""Unit tests for TopoSense-Bench.""" + +import unittest +from datasets import load_dataset + + +class TestTopoSenseBench(unittest.TestCase): + """Test suite for TopoSense-Bench data connectivity and integrity.""" + + def test_hf_connection(self): + """ + Test if we can connect to Hugging Face, authenticate (if needed), + and stream the first data sample successfully. + """ + try: + # Load the dataset in streaming mode to avoid downloading the entire file. + # Note: Using 'train' split as per default Hugging Face JSONL behavior. + dataset = load_dataset( + "IoT-Brain-Project/TopoSense-Bench", + "queries", + split="train", + streaming=True + ) + + # Retrieve the first item to verify data access + first_item = next(iter(dataset)) + + print(f"Successfully loaded item category: {first_item.get('category', 'Unknown')}") + + # Assert that essential fields are present in the data + self.assertTrue('query' in first_item, "Field 'query' is missing.") + self.assertTrue('answer' in first_item, "Field 'answer' is missing.") + self.assertTrue('category' in first_item, "Field 'category' is missing.") + + except Exception as e: + self.fail(f"Failed to load dataset from Hugging Face: {e}") + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 7321ffc857af2bcfbbc4957ba76e733cc4859a3d Mon Sep 17 00:00:00 2001 From: houqiii Date: Sun, 14 Dec 2025 13:32:11 +0800 Subject: [PATCH 2/3] fix: Finalize main.py with robust error handling and output format --- benchmarks/toposense_bench/src/main.py | 232 +++++++++++++++---------- 1 file changed, 142 insertions(+), 90 deletions(-) diff --git a/benchmarks/toposense_bench/src/main.py b/benchmarks/toposense_bench/src/main.py index 8673b46..55b866c 100644 --- a/benchmarks/toposense_bench/src/main.py +++ b/benchmarks/toposense_bench/src/main.py @@ -1,8 +1,5 @@ """ Run TopoSense-Bench Evaluation. - -This script executes the benchmark by loading queries from Hugging Face, -retrieving relevant topological contexts, and evaluating the LLM's response. """ import argparse @@ -24,7 +21,7 @@ from evaluator import TopoSenseEvaluator from topology_loader import TopologyManager -# System Prompt emphasizes reasoning based on the provided Map Data +# System Prompt SYSTEM_PROMPT_TEMPLATE = """You are an intelligent sensor scheduling agent (IoT-Brain). You will be provided with a specific TOPOLOGICAL MAP of a building floor and a USER QUERY. @@ -42,17 +39,41 @@ } Output ONLY the JSON code block. 
""" +def compute_summary(results_df): + """Compute summary statistics.""" + total_questions = len(results_df) + answered = len(results_df[results_df["status"] != "error"]) + correct = len(results_df[results_df["status"] == "correct"]) + incorrect = len(results_df[results_df["status"] == "incorrect"]) + + summary = { + "overall": { + "total_questions": total_questions, + "answered": answered, + "correct": correct, + "incorrect": incorrect, + "accuracy": round(correct / answered, 4) if answered > 0 else 0, + }, + "by_category": [] + } + + if "category" in results_df.columns: + for category in results_df["category"].unique(): + cat_df = results_df[results_df["category"] == category] + cat_total = len(cat_df) + cat_correct = len(cat_df[cat_df["status"] == "correct"]) + + summary["by_category"].append({ + "category": category, + "total": cat_total, + "correct": cat_correct, + "accuracy": round(cat_correct / cat_total, 4) if cat_total > 0 else 0 + }) + return summary def main(model_name, output_dir): - """ - Main evaluation loop. - Args: - model_name (str): The name of the LLM to evaluate. - output_dir (str): Directory to save results. - """ - + """Main evaluation loop.""" # 1. Setup Configuration Path - # Prioritize finding env.toml in the current benchmark directory (src/../env.toml) current_dir = os.path.dirname(os.path.abspath(__file__)) local_config = os.path.abspath(os.path.join(current_dir, "../env.toml")) global_config = os.path.abspath(os.path.join(current_dir, "../../env.toml")) @@ -62,108 +83,139 @@ def main(model_name, output_dir): elif os.path.exists(global_config): config_path = global_config else: - config_path = local_config # Default to local path for clearer error messages + config_path = local_config if os.path.exists(config_path): logger.info(f"Loading config from: {config_path}") set_llm_endpoint_from_config(config_path) else: - logger.warning(f"⚠️ Config file not found at {config_path}, relying on environment variables.") + logger.warning(f"Config file not found at {config_path}, relying on environment variables.") # Initialize components - executor = SimpleExecutor(model_name, SYSTEM_PROMPT_TEMPLATE) + try: + executor = SimpleExecutor(model_name, SYSTEM_PROMPT_TEMPLATE) + except Exception as e: + logger.error(f"Failed to initialize Executor: {e}") + return + evaluator = TopoSenseEvaluator() - topo_manager = TopologyManager() # Handles loading and indexing of topology data + topo_manager = TopologyManager() - # 2. Load Queries from Hugging Face - logger.info("πŸ“₯ Loading Queries...") - # Hugging Face defaults uploaded JSONL files to the 'train' split if not specified in YAML - dataset = load_dataset("IoT-Brain-Project/TopoSense-Bench", "queries", split="train") + # 2. Load Queries + logger.info("πŸ“₯ Loading Queries from Hugging Face...") + try: + dataset = load_dataset("IoT-Brain/TopoSense-Bench", "queries", split="train") + logger.info(f"βœ… Loaded {len(dataset)} queries.") + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + return - results = [] + minimal_results = [] + detailed_results = [] # 3. 
Evaluation Loop - for item in tqdm(dataset): - query = item['query'] - ground_truth = item['answer'] - category = item['category'] - - # --- Context Retrieval --- - # Retrieve the relevant map/floor plan based on the query - context_map = topo_manager.retrieve_context(query) - - if context_map: - # Oracle Context Mode: Provide the specific map data - user_prompt = f"{context_map}\n\n[User Query]\n{query}" - else: - # Zero-context Fallback: If retrieval fails (e.g., complex cross-floor queries) - user_prompt = ( - f"[Map Data]\n(No specific map retrieved, please rely on common knowledge if possible)\n\n" - f"[User Query]\n{query}" - ) - - try: - # Call LLM using SimpleExecutor (which extracts JSON from markdown blocks) - response_str = executor.run(user_prompt, lang='json') - - # Parse JSON Response + try: + for item in tqdm(dataset): + query = item['query'] + ground_truth = item['answer'] + category = item['category'] + + # Context Retrieval + context_map = topo_manager.retrieve_context(query) + + if context_map: + user_prompt = f"{context_map}\n\n[User Query]\n{query}" + else: + user_prompt = ( + f"[Map Data]\n(No specific map retrieved, relying on common knowledge)\n\n" + f"[User Query]\n{query}" + ) + try: - response_json = json.loads(response_str) - llm_answer = response_json.get("answer", "") - except json.JSONDecodeError: - # Fallback if response is not valid JSON - llm_answer = response_str - response_json = {"answer": response_str} + response_str = executor.run(user_prompt, lang='json') + + try: + response_json = json.loads(response_str) + llm_answer = response_json.get("answer", "") + llm_explanation = response_json.get("explanation", "") + except json.JSONDecodeError: + llm_answer = response_str + llm_explanation = "Failed to parse JSON" + response_json = {"answer": response_str} + + # Evaluate Answer + eval_res = evaluator.eval(response_json, ground_truth) + + # Construct Result Objects + minimal_result = { + "category": category, + "query": query, + "ground_truth": ground_truth, + "llm_answer": llm_answer, + "status": eval_res["status"], + "score": eval_res["score"] + } + + detailed_result = { + **minimal_result, + "llm_explanation": llm_explanation, + "retrieved_context": bool(context_map), + "full_prompt": user_prompt, + "raw_response": response_str + } + + minimal_results.append(minimal_result) + detailed_results.append(detailed_result) + + except Exception as e: + logger.error(f"Error processing query: {e}") + error_result = { + "category": category, + "query": query, + "status": "error", + "error": str(e) + } + minimal_results.append(error_result) + detailed_results.append(error_result) + + except KeyboardInterrupt: + logger.warning("Evaluation interrupted by user. Saving partial results...") + + # 4. 
Save Results + if not os.path.exists(output_dir): + os.makedirs(output_dir) - # Evaluate Answer - eval_res = evaluator.eval(response_json, ground_truth) + # 4.1 Save Detailed Results (JSONL) + with open(os.path.join(output_dir, "results_detailed.jsonl"), "w", encoding="utf-8") as f: + for res in detailed_results: + f.write(json.dumps(res, ensure_ascii=False) + "\n") - results.append({ - "category": category, - "query": query, - "ground_truth": ground_truth, - "retrieved_context": bool(context_map), # Track retrieval success - "llm_answer": llm_answer, - "status": eval_res["status"], - "score": eval_res["score"] - }) + # 4.2 Save Minimal Results (JSONL) + with open(os.path.join(output_dir, "results.jsonl"), "w", encoding="utf-8") as f: + for res in minimal_results: + f.write(json.dumps(res, ensure_ascii=False) + "\n") - except Exception as e: - logger.error(f"Error processing query: {e}") - results.append({ - "query": query, - "status": "error", - "error": str(e) - }) + # 4.3 Generate Summary + results_df = pd.DataFrame(minimal_results) + if not results_df.empty: + summary = compute_summary(results_df) + summary["model"] = model_name + summary["timestamp"] = datetime.now().isoformat() - # 4. Save Results & Summarize - if not os.path.exists(output_dir): - os.makedirs(output_dir) + with open(os.path.join(output_dir, "summary.json"), "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, ensure_ascii=False) - df = pd.DataFrame(results) - df.to_json( - os.path.join(output_dir, "results.jsonl"), - orient="records", - lines=True, - force_ascii=False - ) - - # Calculate overall accuracy - if len(df) > 0: - acc = df[df["status"] == "correct"].shape[0] / len(df) - retrieval_rate = df["retrieved_context"].mean() if "retrieved_context" in df.columns else 0 + logger.info(f"βœ… Eval Done. Accuracy: {summary['overall']['accuracy']:.2%}") + logger.info(f"πŸ“‚ Results saved to: {output_dir}") else: - acc = 0 - retrieval_rate = 0 - - logger.info(f"βœ… Eval Done. 
Accuracy: {acc:.2%}") - logger.info(f" Context Retrieval Rate: {retrieval_rate:.2%}") + logger.warning("No results to save.") -if __name__ == "__main__": +if __name__ == "__main__": parser = argparse.ArgumentParser(description="TopoSense-Bench Evaluation") parser.add_argument("-m", "--model_name", default="gpt-4o", help="Model name to evaluate") parser.add_argument("-o", "--output_dir", default=None, help="Directory to save results") args = parser.parse_args() + if args.output_dir is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") safe_model_name = args.model_name.replace('/', '_') From c1dd37f6258560622d6a8ca11de2de39cb8b3559 Mon Sep 17 00:00:00 2001 From: houqiii Date: Sun, 14 Dec 2025 13:43:40 +0800 Subject: [PATCH 3/3] fix: Finalize codebase (docs, CI, install script, and output format) --- .github/workflows/test.yml | 1 + README.md | 1 + benchmarks/toposense_bench/README.md | 2 +- benchmarks/toposense_bench/Why.md | 15 +++++++++++++++ benchmarks/toposense_bench/install.sh | 16 ++++++++++++++++ benchmarks/toposense_bench/src/main.py | 1 + .../toposense_bench/tests/test_benchmark.py | 2 +- 7 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 benchmarks/toposense_bench/Why.md create mode 100755 benchmarks/toposense_bench/install.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a348b86..32dd8b9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,6 +19,7 @@ jobs: benchmark: - example_bench - course_exam_bench + - toposense_bench # TODO: For now, we comment out other benchmarks as they have no tests # - arteval_bench # - cache_bench diff --git a/README.md b/README.md index 8ea3103..47300b5 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ System Intelligence Benchmark currently includes the following example benchmark - **System Lab Benchmark** ([benchmarks/course_lab_bench/](benchmarks/course_lab_bench/)) - Assesses AI capability on practical system course labs and projects - **System Artifact Benchmark** ([benchmarks/arteval_bench/](benchmarks/arteval_bench/)) - Evaluates AI performance on artifact evaluation - **System Modeling Benchmark** ([benchmarks/sysmobench/](benchmarks/sysmobench/)) - Evaluates an agent's ability to produce correct TLA+ models for real-world concurrent and distributed systems, covering system capabilities across system comprehension, abstraction, and potentially tool fluency. +- **TopoSense Benchmark** ([benchmarks/toposense_bench/](benchmarks/toposense_bench/)) - Evaluates Semantic-Spatial Sensor Scheduling (SΒ³) capabilities in large-scale IoT digital twins (5,250 queries across 2,510 cameras) - **Example Benchmark** ([benchmarks/example_bench/](benchmarks/example_bench/)) - Template and reference implementation for creating new benchmarks ## Quick Start diff --git a/benchmarks/toposense_bench/README.md b/benchmarks/toposense_bench/README.md index c5c8979..13d3214 100644 --- a/benchmarks/toposense_bench/README.md +++ b/benchmarks/toposense_bench/README.md @@ -6,7 +6,7 @@ Originating from the **ACM MobiCom '26** paper *"IoT-Brain: Grounding LLMs for S ## πŸ“Š Overview -- **Source**: Hosted on [Hugging Face](https://huggingface.co/datasets/IoT-Brain-Project/TopoSense-Bench) (Seamlessly integrated via the `datasets` library). +- **Source**: Hosted on [Hugging Face](https://huggingface.co/datasets/IoT-Brain/TopoSense-Bench) (Seamlessly integrated via the `datasets` library). - **Scale**: - **5,250** Natural Language Queries. - **2,510** Sensors (Cameras). 
diff --git a/benchmarks/toposense_bench/Why.md b/benchmarks/toposense_bench/Why.md new file mode 100644 index 0000000..f096e7c --- /dev/null +++ b/benchmarks/toposense_bench/Why.md @@ -0,0 +1,15 @@ +# Why TopoSense-Bench? + +## The Problem: The Semantic-Physical Mapping Gap +Modern IoT systems are transitioning from passive monitoring to intent-driven operation. However, a critical gap exists between high-level human intent (e.g., *"Find my backpack lost between the library and the gym"*) and the precise physical sensor actions required to fulfill it. + +Existing benchmarks often focus on pure QA or code generation, overlooking the **embodied** and **spatial** reasoning capabilities required for real-world cyber-physical systems. + +## The Solution: Semantic-Spatial Sensor Scheduling (SΒ³) +TopoSense-Bench introduces the SΒ³ challenge, requiring LLMs to: +1. **Reason Spatially**: Understand complex topological relationships (connectivity, floor transitions) in a large-scale digital twin. +2. **Act Proactively**: Select the optimal subset of sensors from a massive network (2,510 cameras) to satisfy a query, rather than just answering a text question. +3. **Ground in Reality**: Map vague natural language to concrete sensor identifiers (e.g., `teaching_building_1_camera_03`). + +## Impact +By mastering this benchmark, LLMs demonstrate the capability to serve as the "brain" for large-scale smart city and smart campus infrastructures, moving beyond chatbots to actionable physical agents. \ No newline at end of file diff --git a/benchmarks/toposense_bench/install.sh b/benchmarks/toposense_bench/install.sh new file mode 100755 index 0000000..8526a25 --- /dev/null +++ b/benchmarks/toposense_bench/install.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Create virtual environment +python3 -m venv .venv + +# Activate virtual environment +source .venv/bin/activate + +# Upgrade pip +pip install --upgrade pip + +# Install requirements +pip install -r requirements.txt + +echo "βœ… Installation complete. Virtual environment created in .venv/" +echo "πŸ‘‰ To activate: source .venv/bin/activate" \ No newline at end of file diff --git a/benchmarks/toposense_bench/src/main.py b/benchmarks/toposense_bench/src/main.py index 55b866c..243ab87 100644 --- a/benchmarks/toposense_bench/src/main.py +++ b/benchmarks/toposense_bench/src/main.py @@ -37,6 +37,7 @@ "answer": "sensor_name_here", "explanation": "Brief reasoning based on map tags" } +``` Output ONLY the JSON code block. """ def compute_summary(results_df): diff --git a/benchmarks/toposense_bench/tests/test_benchmark.py b/benchmarks/toposense_bench/tests/test_benchmark.py index fd565e4..a23e100 100644 --- a/benchmarks/toposense_bench/tests/test_benchmark.py +++ b/benchmarks/toposense_bench/tests/test_benchmark.py @@ -16,7 +16,7 @@ def test_hf_connection(self): # Load the dataset in streaming mode to avoid downloading the entire file. # Note: Using 'train' split as per default Hugging Face JSONL behavior. dataset = load_dataset( - "IoT-Brain-Project/TopoSense-Bench", + "IoT-Brain/TopoSense-Bench", "queries", split="train", streaming=True
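
For reviewers: the snippet below is a minimal, self-contained sketch of the scoring rule introduced in `benchmarks/toposense_bench/src/evaluator.py` (extract the `name` tag from the `Node(...)` ground-truth string, normalize underscores/hyphens to spaces, then check containment). It is illustrative only: the function names and candidate answers are hypothetical, and the ground-truth string is the example from the evaluator's docstring.

```python
# Minimal sketch of the containment metric from evaluator.py.
# Function names and candidate answers are illustrative, not part of the patch.
import ast
import re


def parse_node_name(gt: str) -> str:
    """Extract the 'name' tag from a Node(...) ground-truth string."""
    match = re.search(r"Tags:\s*(\{.*?\})", gt)
    if match:
        tags = ast.literal_eval(match.group(1))
        return tags.get("name", "").lower()
    return gt.strip().lower()


def is_correct(llm_answer: str, gt: str) -> bool:
    """Normalize underscores/hyphens to spaces, then check that the GT name is contained in the answer."""
    clean_llm = llm_answer.lower().replace("_", " ").replace("-", " ")
    clean_gt = parse_node_name(gt).replace("_", " ").replace("-", " ")
    return clean_gt in clean_llm


ground_truth = "Node(223, 307, Tags: {'man_made': 'surveillance', 'name': 'camera_1'})"
print(is_correct("I suggest using camera_1", ground_truth))       # True
print(is_correct("Use camera_2 at the entrance", ground_truth))   # False
```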