diff --git a/experimental/apps-mcp/evals/README.md b/experimental/apps-mcp/evals/README.md new file mode 100644 index 0000000000..a464afc13d --- /dev/null +++ b/experimental/apps-mcp/evals/README.md @@ -0,0 +1,129 @@ +# Apps-MCP Evals + +Databricks Asset Bundle for generating and evaluating apps using the Apps-MCP system with klaudbiusz framework. + +## Overview + +This bundle provides two jobs: +1. **Generation Job** - Generates apps using klaudbiusz with the Databricks CLI as MCP server +2. **Evaluation Job** - Evaluates generated apps and logs results to MLflow + +## Prerequisites + +1. **Databricks Secrets** - Create secret scope and add tokens: + ```bash + databricks secrets create-scope apps-mcp-evals + databricks secrets put-secret apps-mcp-evals anthropic-api-key + databricks secrets put-secret apps-mcp-evals databricks-token + ``` + +2. **UC Volumes** - Create volumes for artifacts: + ```bash + databricks volumes create main.default.apps_mcp_artifacts + databricks volumes create main.default.apps_mcp_generated + ``` + +3. **CLI Binary** - Build and upload Linux CLI binary: + ```bash + GOOS=linux GOARCH=amd64 go build -o databricks-linux + databricks fs cp databricks-linux /Volumes/main/default/apps_mcp_artifacts/ + ``` + +## Quick Start + +```bash +cd experimental/apps-mcp/evals + +# Validate bundle +databricks bundle validate -t dev + +# Deploy +databricks bundle deploy -t dev + +# Run generation (creates apps in UC Volume) +databricks bundle run -t dev apps_generation_job + +# Run evaluation (evaluates apps, logs to MLflow) +databricks bundle run -t dev apps_eval_job +``` + +## Jobs + +### Generation Job (`apps_generation_job`) + +Generates apps using klaudbiusz's local_run with LiteLLM backend. 
+ +**Parameters:** +- `prompts` - Prompt set: `databricks`, `databricks_v2`, or `test` (default: `test`) +- `cli_binary_volume` - Path to CLI binary volume +- `apps_volume` - Output volume for generated apps + +**Cluster:** Jobs cluster with Spark 16.2.x (Python 3.12) + +### Evaluation Job (`apps_eval_job`) + +Evaluates generated apps using klaudbiusz's Docker-based evaluation. + +**Parameters:** +- `apps_volume` - Volume containing apps to evaluate +- `mlflow_experiment` - MLflow experiment for logging results +- `parallelism` - Number of parallel evaluations + +**Cluster:** Jobs cluster with Spark 16.2.x, Docker installed via init script + +**Schedule:** Nightly at 2am UTC + +## Configuration + +### Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `prompts` | Prompt set for generation | `test` | +| `cli_binary_volume` | UC Volume for CLI binary | `/Volumes/main/default/apps_mcp_artifacts` | +| `apps_volume` | UC Volume for generated apps | `/Volumes/main/default/apps_mcp_generated` | +| `mlflow_experiment` | MLflow experiment path | `/Shared/apps-mcp-evaluations` | +| `eval_parallelism` | Parallel eval workers | `4` | +| `evals_git_url` | klaudbiusz repo URL | `https://github.com/neondatabase/appdotbuild-agent.git` | + +### Targets + +- **dev** - Development mode, staging MLflow experiment +- **prod** - Production mode, service principal identity + +## Monitoring + +- **MLflow** - View metrics at the configured experiment path +- **Health Alerts** - Eval job alerts if runtime exceeds 2 hours +- **Logs** - Check job run output for detailed evaluation results + +## Architecture + +``` +evals/ +├── databricks.yml # Bundle configuration +├── resources/ +│ ├── apps_generation_job.job.yml # Generation job +│ └── apps_eval_job.job.yml # Evaluation job +├── init/ +│ ├── setup_generation.sh # Generation cluster init +│ └── setup_eval.sh # Eval cluster init (Docker) +├── src/ +│ ├── generate_apps.py # App generation orchestrator +│ 
└── run_evals.py # Evaluation orchestrator +└── pyproject.toml # Python package config +``` + +## Prompt Sets + +Available prompt sets (configured via `prompts` variable): + +- `test` - Simple test prompts (1 app) for quick validation +- `databricks` - 5 Databricks-focused dashboard prompts +- `databricks_v2` - 20 realistic human-style prompts + +## Known Limitations + +- Docker containers require `--privileged` flag on Databricks clusters +- Generation uses LiteLLM backend (Claude Agent SDK has root user restriction) +- UC Volumes don't support symlinks, uses `latest.txt` file instead diff --git a/experimental/apps-mcp/evals/databricks.yml b/experimental/apps-mcp/evals/databricks.yml new file mode 100644 index 0000000000..2ea444d420 --- /dev/null +++ b/experimental/apps-mcp/evals/databricks.yml @@ -0,0 +1,62 @@ +# Databricks Asset Bundle for Apps-MCP Continuous Evals +# See https://docs.databricks.com/dev-tools/bundles/index.html +bundle: + name: apps-mcp-evals + uuid: 80e50a10-c2da-4b59-99d6-e101b1bcf485 + +include: + - resources/*.yml + +artifacts: + apps_mcp_evals: + type: whl + build: uv build --wheel + path: . 
+ +variables: + catalog: + description: Unity Catalog for eval results + default: main + schema: + description: Schema for eval tables + mlflow_experiment: + description: MLflow experiment path for tracking + default: /Shared/apps-mcp-evaluations + evals_git_url: + description: Git URL for appdotbuild-agent eval framework + default: https://github.com/neondatabase/appdotbuild-agent.git + eval_parallelism: + description: Number of parallel eval workers + default: "4" + cli_binary_volume: + description: UC Volume path for CLI binary + default: /Volumes/main/default/apps_mcp_artifacts + apps_volume: + description: UC Volume path for generated apps + default: /Volumes/main/default/apps_mcp_generated + generation_parallelism: + description: Number of parallel app generations + default: "4" + prompts: + description: Prompt set for generation (databricks, databricks_v2, test) + default: test + +targets: + dev: + mode: development + default: true + workspace: + host: https://6177827686947384.4.gcp.databricks.com + variables: + schema: ${workspace.current_user.short_name} + mlflow_experiment: /Shared/apps-mcp-evaluations-staging + + prod: + mode: production + workspace: + host: https://6177827686947384.4.gcp.databricks.com + root_path: /Workspace/Users/${workspace.current_user.user_name}/.bundle/${bundle.name}/${bundle.target} + variables: + schema: evals + run_as: + service_principal_name: apps-mcp-eval-sp diff --git a/experimental/apps-mcp/evals/init/setup_eval.sh b/experimental/apps-mcp/evals/init/setup_eval.sh new file mode 100644 index 0000000000..53d059aab6 --- /dev/null +++ b/experimental/apps-mcp/evals/init/setup_eval.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +echo "=== Apps-MCP Eval Setup ===" +echo "Python version: $(python --version)" + +# Install Node.js (required for local npm install/build/test) +echo "Installing Node.js..." 
+curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash - +sudo apt-get install -y nodejs + +echo "Node version: $(node --version)" +echo "npm version: $(npm --version)" + +# Install Python dependencies +pip install fire mlflow + +echo "=== Setup complete ===" diff --git a/experimental/apps-mcp/evals/init/setup_generation.sh b/experimental/apps-mcp/evals/init/setup_generation.sh new file mode 100644 index 0000000000..5cdce5dcf6 --- /dev/null +++ b/experimental/apps-mcp/evals/init/setup_generation.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +echo "=== Setting up generation environment ===" + +# Install Dagger (required for klaudbiusz container orchestration) +echo "Installing Dagger..." +curl -fsSL https://dl.dagger.io/dagger/install.sh | sh +export PATH=$PATH:/root/.local/bin + +# Install Python dependencies for klaudbiusz +echo "Installing Python dependencies..." +pip install --quiet dagger-io fire tqdm python-dotenv claude-agent-sdk litellm joblib tenacity + +echo "=== Setup complete ===" diff --git a/experimental/apps-mcp/evals/pyproject.toml b/experimental/apps-mcp/evals/pyproject.toml new file mode 100644 index 0000000000..02f47339e0 --- /dev/null +++ b/experimental/apps-mcp/evals/pyproject.toml @@ -0,0 +1,24 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[project] +name = "apps_mcp_evals" +version = "0.1.0" +description = "Continuous evaluation framework for Apps-MCP code generation" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "mlflow>=2.15.0", + "fire>=0.7.1", +] + +[project.scripts] +main = "src.run_evals:cli" + +[tool.ruff] +line-length = 120 +target-version = "py310" diff --git a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml new file mode 100644 index 0000000000..c5c82c9e10 --- /dev/null +++ b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml @@ 
-0,0 +1,68 @@ +# Apps-MCP Evaluation Job +# Runs nightly + supports manual trigger via: databricks bundle run -t dev apps_eval_job + +resources: + jobs: + apps_eval_job: + name: "[${bundle.target}] Apps-MCP Continuous Evals" + + # Nightly schedule (2am UTC) + trigger: + periodic: + interval: 1 + unit: DAYS + + # Health monitoring - alert if eval takes > 2 hours + health: + rules: + - metric: RUN_DURATION_SECONDS + op: GREATER_THAN + value: 7200 + + email_notifications: + on_failure: + - apps-mcp-team@databricks.com + + parameters: + - name: mlflow_experiment + default: ${var.mlflow_experiment} + - name: parallelism + default: ${var.eval_parallelism} + - name: evals_git_url + default: ${var.evals_git_url} + - name: apps_volume + default: ${var.apps_volume} + + job_clusters: + - job_cluster_key: eval_cluster + new_cluster: + spark_version: "16.2.x-scala2.12" + node_type_id: "n2-standard-4" + num_workers: 0 + data_security_mode: SINGLE_USER + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: "local[*]" + custom_tags: + ResourceClass: SingleNode + spark_env_vars: + DATABRICKS_HOST: ${workspace.host} + DATABRICKS_TOKEN: "{{secrets/apps-mcp-evals/databricks-token}}" + init_scripts: + - workspace: + destination: ${workspace.file_path}/init/setup_eval.sh + + tasks: + - task_key: run_evals + job_cluster_key: eval_cluster + spark_python_task: + python_file: ${workspace.file_path}/src/run_evals.py + parameters: + - --mlflow-experiment + - ${var.mlflow_experiment} + - --parallelism + - ${var.eval_parallelism} + - --evals-git-url + - ${var.evals_git_url} + - --apps-volume + - ${var.apps_volume} diff --git a/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml b/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml new file mode 100644 index 0000000000..ce62d0fcde --- /dev/null +++ b/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml @@ -0,0 +1,39 @@ +resources: + jobs: + apps_generation_job: + name: 
"[${bundle.target}] Apps-MCP Generation" + + job_clusters: + - job_cluster_key: generation_cluster + new_cluster: + spark_version: "16.2.x-scala2.12" + node_type_id: "n2-standard-8" + num_workers: 0 + data_security_mode: SINGLE_USER + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: "local[*]" + custom_tags: + ResourceClass: SingleNode + spark_env_vars: + ANTHROPIC_API_KEY: "{{secrets/apps-mcp-evals/anthropic-api-key}}" + DATABRICKS_HOST: ${workspace.host} + DATABRICKS_TOKEN: "{{secrets/apps-mcp-evals/databricks-token}}" + init_scripts: + - workspace: + destination: ${workspace.file_path}/init/setup_generation.sh + + tasks: + - task_key: generate_apps + job_cluster_key: generation_cluster + spark_python_task: + python_file: ${workspace.file_path}/src/generate_apps.py + parameters: + - --mcp-binary + - ${var.cli_binary_volume}/databricks-linux + - --output-volume + - ${var.apps_volume} + - --prompts + - ${var.prompts} + - --max-concurrency + - ${var.generation_parallelism} diff --git a/experimental/apps-mcp/evals/src/__init__.py b/experimental/apps-mcp/evals/src/__init__.py new file mode 100644 index 0000000000..0a8e6c04aa --- /dev/null +++ b/experimental/apps-mcp/evals/src/__init__.py @@ -0,0 +1 @@ +"""Apps-MCP Evaluation Framework.""" diff --git a/experimental/apps-mcp/evals/src/generate_apps.py b/experimental/apps-mcp/evals/src/generate_apps.py new file mode 100644 index 0000000000..313ed732c1 --- /dev/null +++ b/experimental/apps-mcp/evals/src/generate_apps.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +"""Generate apps using klaudbiusz with CLI-built MCP server.""" + +import os +import shutil +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +import fire + + +def clone_klaudbiusz(work_dir: Path) -> Path: + """Clone the klaudbiusz generation framework.""" + repo_dir = work_dir / "appdotbuild-agent" + if repo_dir.exists(): + shutil.rmtree(repo_dir) + + print("Cloning appdotbuild-agent 
repository...") + subprocess.run( + [ + "git", + "clone", + "--depth", + "1", + "https://github.com/neondatabase/appdotbuild-agent.git", + str(repo_dir), + ], + check=True, + ) + return repo_dir + + +def install_klaudbiusz_deps(klaudbiusz_dir: Path) -> None: + """Install klaudbiusz Python dependencies.""" + print("Installing klaudbiusz dependencies...") + result = subprocess.run( + [sys.executable, "-m", "pip", "install", "-e", str(klaudbiusz_dir)], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f"Warning: pip install had issues: {result.stderr[:500]}") + + +def get_prompts(prompts_name: str) -> dict: + """Load prompts from klaudbiusz.""" + from cli.prompts import get_prompts as klaudbiusz_get_prompts + + return klaudbiusz_get_prompts(prompts_name) + + +def run_generation( + klaudbiusz_dir: Path, + mcp_binary: str, + output_dir: Path, + prompts: str, + max_concurrency: int, +) -> int: + """Run app generation using local_run (no Dagger required).""" + print(f"\nStarting app generation (local mode, no Dagger)...") + print(f" MCP binary: {mcp_binary}") + print(f" Prompts: {prompts}") + print(f" Output dir: {output_dir}") + + env = os.environ.copy() + env["PYTHONPATH"] = str(klaudbiusz_dir) + + prompt_dict = get_prompts(prompts) + print(f" Total prompts: {len(prompt_dict)}") + + success_count = 0 + fail_count = 0 + + for app_name, prompt in prompt_dict.items(): + print(f"\n{'=' * 60}") + print(f"Generating: {app_name}") + print(f"Prompt: {prompt[:100]}...") + print("=" * 60) + + # Use LiteLLM backend to avoid Claude Agent SDK root user restriction + # (Databricks clusters run as root, Claude Agent SDK refuses to run as root) + cmd = [ + sys.executable, + "-m", + "cli.generation.local_run", + prompt, + f"--app_name={app_name}", + "--backend=litellm", + "--model=anthropic/claude-sonnet-4-20250514", + f"--mcp_binary={mcp_binary}", + '--mcp_args=["experimental", "apps-mcp"]', + f"--output_dir={output_dir}", + ] + + result = subprocess.run(cmd, 
cwd=klaudbiusz_dir, env=env)
+
+        # A nonzero exit marks this app as failed but does not abort the run;
+        # remaining prompts still get a chance to generate.
+        if result.returncode == 0:
+            success_count += 1
+            print(f"SUCCESS: {app_name}")
+        else:
+            fail_count += 1
+            print(f"FAILED: {app_name} (return code: {result.returncode})")
+
+    print(f"\nGeneration summary: {success_count} succeeded, {fail_count} failed")
+    return success_count
+
+
+def upload_to_volume(local_dir: Path, volume_path: str) -> int:
+    """Upload generated apps to UC Volume.
+
+    Copies the whole local output tree into a timestamped ``run_<ts>``
+    directory under ``volume_path`` and records that destination path in
+    ``latest.txt`` so the evaluation job can find the newest run.
+
+    Args:
+        local_dir: Local directory holding the generated apps.
+        volume_path: UC Volume destination directory.
+
+    Returns:
+        Number of entries uploaded (0 when there was nothing to upload).
+    """
+    if not local_dir.exists():
+        print(f"No apps directory found at {local_dir}")
+        return 0
+
+    apps = list(local_dir.iterdir())
+    if not apps:
+        print("No apps generated")
+        return 0
+
+    print(f"\nUploading {len(apps)} apps to {volume_path}...")
+
+    volume_dir = Path(volume_path)
+    volume_dir.mkdir(parents=True, exist_ok=True)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    dest_dir = volume_dir / f"run_{timestamp}"
+
+    shutil.copytree(local_dir, dest_dir)
+    print(f"Uploaded to {dest_dir}")
+
+    # Write latest run path to a file (symlinks not supported on UC Volumes)
+    latest_file = volume_dir / "latest.txt"
+    latest_file.write_text(str(dest_dir))
+    print(f"Latest run recorded in {latest_file}")
+
+    return len(apps)
+
+
+def main(
+    mcp_binary: str,
+    output_volume: str,
+    prompts: str = "databricks",
+    max_concurrency: int = 4,
+) -> None:
+    """
+    Generate apps using klaudbiusz with the Databricks CLI as MCP server.
+
+    Args:
+        mcp_binary: Path to databricks-linux binary in UC Volume
+        output_volume: UC Volume path for generated apps
+        prompts: Prompt set (databricks, databricks_v2, test)
+        max_concurrency: Number of parallel generations
+    """
+    print("=" * 60)
+    print("Apps-MCP Generation")
+    print("=" * 60)
+    print(f" MCP Binary: {mcp_binary}")
+    print(f" Output Volume: {output_volume}")
+    print(f" Prompts: {prompts}")
+    print(f" Max Concurrency: {max_concurrency}")
+
+    # Fail fast with a clear message rather than letting the generation loop
+    # error out on a missing binary.
+    if not Path(mcp_binary).exists():
+        print(f"\nError: MCP binary not found at {mcp_binary}")
+        print("Please upload the databricks-linux binary to the UC Volume first.")
+        sys.exit(1)
+
+    # Presumably the copy into the UC Volume does not preserve the exec bit,
+    # so re-set it before the MCP server is launched -- TODO confirm.
+    subprocess.run(["chmod", "+x", mcp_binary], check=True)
+
+    # Scratch space on local disk; generated apps are copied to the UC Volume
+    # only after generation finishes.
+    work_dir = Path("/tmp/apps-generation")
+    work_dir.mkdir(exist_ok=True)
+
+    repo_dir = clone_klaudbiusz(work_dir)
+    klaudbiusz_dir = repo_dir / "klaudbiusz"
+
+    install_klaudbiusz_deps(klaudbiusz_dir)
+
+    local_output = work_dir / "generated_apps"
+    local_output.mkdir(exist_ok=True)
+
+    # NOTE(review): max_concurrency is forwarded to run_generation, but the
+    # generation loop there runs prompts one at a time -- the value is not
+    # yet honored. Confirm before relying on parallel generation.
+    run_generation(
+        klaudbiusz_dir=klaudbiusz_dir,
+        mcp_binary=mcp_binary,
+        output_dir=local_output,
+        prompts=prompts,
+        max_concurrency=max_concurrency,
+    )
+
+    app_count = upload_to_volume(local_output, output_volume)
+
+    print("\n" + "=" * 60)
+    print("Generation Complete")
+    print("=" * 60)
+    print(f" Apps generated: {app_count}")
+    print(f" Output location: {output_volume}")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
new file mode 100644
index 0000000000..bdc41856de
--- /dev/null
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""Apps-MCP Evaluation Runner for Databricks Jobs."""
+
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+import fire
+
+
+def clone_and_install_klaudbiusz(work_dir: Path, git_url: str) -> Path:
+    """Clone klaudbiusz and install dependencies."""
+ print(f"Cloning {git_url}...") + repo_dir = work_dir / "appdotbuild-agent" + subprocess.run(["git", "clone", "--depth", "1", git_url, str(repo_dir)], check=True) + klaudbiusz_dir = repo_dir / "klaudbiusz" + print("Installing klaudbiusz...") + subprocess.run([sys.executable, "-m", "pip", "install", "-e", str(klaudbiusz_dir)], check=True) + sys.path.insert(0, str(klaudbiusz_dir)) + return klaudbiusz_dir + + +def find_apps_dir(apps_volume: str) -> Optional[Path]: + """Find apps directory from UC Volume.""" + volume_path = Path(apps_volume) + latest_file = volume_path / "latest.txt" + if latest_file.exists(): + return Path(latest_file.read_text().strip()) + if volume_path.exists(): + run_dirs = [d for d in volume_path.iterdir() if d.is_dir() and d.name.startswith("run_")] + if run_dirs: + return max(run_dirs, key=lambda d: d.name) + return None + + +def run_local_evaluation(apps_dir: Path, mlflow_experiment: str) -> dict: + """Run local evaluation using shell scripts (no Docker/Dagger).""" + import time + from dataclasses import asdict + + from cli.evaluation.evaluate_app import evaluate_app + from cli.evaluation.evaluate_all import generate_summary_report + from cli.utils.apps_discovery import list_apps_in_dir + + app_dirs = list_apps_in_dir(apps_dir) + if not app_dirs: + raise ValueError(f"No apps found in: {apps_dir}") + + print(f"Evaluating {len(app_dirs)} apps locally...") + + results = [] + eval_start = time.time() + + for i, app_dir in enumerate(app_dirs, 1): + print(f"\n[{i}/{len(app_dirs)}] {app_dir.name}") + try: + result = evaluate_app(app_dir, prompt=None, port=8000 + i) + results.append(asdict(result)) + except Exception as e: + print(f" Error: {e}") + + eval_duration = time.time() - eval_start + print(f"\nEvaluated {len(results)}/{len(app_dirs)} apps in {eval_duration:.1f}s") + + summary = generate_summary_report(results) + report = {"summary": summary, "apps": results} + + if mlflow_experiment: + from cli.evaluation.tracking import 
log_evaluation_to_mlflow, setup_mlflow + if setup_mlflow(mlflow_experiment): + run_id = log_evaluation_to_mlflow(report) + if run_id: + print(f"MLflow run logged: {run_id}") + + return report + + +def main( + mlflow_experiment: str = "/Shared/apps-mcp-evaluations", + parallelism: int = 4, + apps_volume: Optional[str] = None, + evals_git_url: str = "https://github.com/neondatabase/appdotbuild-agent.git", +) -> None: + """Run Apps-MCP evaluations using klaudbiusz (local mode).""" + print("=" * 60) + print("Apps-MCP Evaluation (Local Mode)") + print("=" * 60) + print(f" MLflow Experiment: {mlflow_experiment}") + print(f" Apps Volume: {apps_volume or 'not specified'}") + + work_dir = Path(tempfile.mkdtemp(prefix="apps-mcp-evals-")) + clone_and_install_klaudbiusz(work_dir, evals_git_url) + + apps_dir = find_apps_dir(apps_volume) if apps_volume else None + if apps_dir: + print(f" Apps Dir: {apps_dir}") + else: + print(" Apps Dir: not found, will use default") + apps_dir = work_dir / "appdotbuild-agent" / "klaudbiusz" / "app" + + print("\n" + "=" * 60) + print("Running local evaluation...") + print("=" * 60) + + report = run_local_evaluation( + apps_dir=apps_dir, + mlflow_experiment=mlflow_experiment, + ) + + summary = report.get("summary", {}) + metrics = summary.get("metrics_summary", {}) + + print("\n" + "=" * 60) + print("EVALUATION SUMMARY") + print("=" * 60) + print(f"Total Apps: {summary.get('total_apps', 0)}") + print(f"Avg AppEval Score: {metrics.get('avg_appeval_100', 0):.1f}/100") + print(f"Build Success: {metrics.get('build_success', 0)}") + print(f"Runtime Success: {metrics.get('runtime_success', 0)}") + print(f"Type Safety: {metrics.get('type_safety_pass', 0)}") + print(f"Tests Pass: {metrics.get('tests_pass', 0)}") + print("\nEvaluation complete!") + + +def cli(): + """CLI entry point.""" + fire.Fire(main) + + +if __name__ == "__main__": + cli()