From 3bd89fb3f68b8db6af5590594a5f7a100aea9584 Mon Sep 17 00:00:00 2001 From: symbiologist Date: Sun, 14 Dec 2025 15:53:08 -0500 Subject: [PATCH 1/3] Update copy, runtimes --- data/metadata.csv | 2 +- data/metrics.csv | 11400 +++++++----------- frontend/public/data/combination-index.json | 652 +- frontend/src/components/FiltersPanel.tsx | 4 +- 4 files changed, 4756 insertions(+), 7302 deletions(-) diff --git a/data/metadata.csv b/data/metadata.csv index 0bb93d8..17d2248 100644 --- a/data/metadata.csv +++ b/data/metadata.csv @@ -28,4 +28,4 @@ NA,OverallScoreSCP7,FALSE,FALSE,NA,Higher,Percent,0,1,Overall Score SCP7,Placeho NA,Raw Accuracy,FALSE,FALSE,NA,Higher,Percent,0,1,Raw Accuracy,Placeholder NA,Recall8,FALSE,FALSE,NA,Higher,Percent,0,1,Recall8,Placeholder NA,OverallScore7,FALSE,FALSE,NA,Higher,Percent,0,1,Overall7,Placeholder -11,Runtime,TRUE,FALSE,NA,Lower,Absolute,0,180,Runtime,Inference time in seconds +11,Runtime,TRUE,FALSE,NA,Lower,Absolute,0,255,Runtime,Inference time in seconds diff --git a/data/metrics.csv b/data/metrics.csv index 8676b60..869ed6b 100644 --- a/data/metrics.csv +++ b/data/metrics.csv @@ -1,7121 +1,4279 @@ -Model,Team,Condition,Harm,Metric,trials,mean,sd,se,ci,Format,Cases,Grading,Type,Model1,Model2,Model3,Provider,Label,Exclude -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.928,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.927,0.001,0,0.001,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.781,0.015,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.932,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.933,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.923,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.936,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.922,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.92,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.926,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.925,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.93,0.005,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.933,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.925,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.932,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.922,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.93,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.922,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.934,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.923,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.925,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.926,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.726,0.025,0.014,0.028,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.93,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.93,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.909,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.92,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.904,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.902,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.912,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.924,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.733,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.928,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.929,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.927,0.001,0,0.001,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.931,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.923,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.928,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.925,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.906,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.924,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.71,0.057,0.025,0.05,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.929,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.927,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.924,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.918,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.918,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.921,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Accuracy,9,0.928,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.923,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.82,0.004,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.934,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.929,0.001,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.893,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.919,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.893,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.885,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.91,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.923,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.929,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.922,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.905,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.934,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.918,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.929,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.699,0.046,0.021,0.04,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.94,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.94,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.934,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.912,0.014,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.939,0.005,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.931,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.929,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.929,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.928,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.917,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.832,0.02,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.931,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.928,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.889,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.916,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.926,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.884,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.878,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.904,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.923,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,5,0.915,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.843,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.932,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.922,0.001,0,0.001,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.867,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,10,0.923,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.841,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,8,0.833,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.888,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Accuracy,4,0.914,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.901,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.764,0.024,0.014,0.028,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.92,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.918,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.898,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.924,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.874,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.869,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Accuracy,3,0.866,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.924,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.933,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,8,0.928,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.933,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,8,0.933,0.001,0,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,4,0.94,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.933,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.939,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.927,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,4,0.927,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,9,0.932,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,3,0.939,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.921,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.925,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.925,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.94,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.926,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,3,0.927,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.942,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.927,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,10,0.939,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,10,0.94,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.936,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,6,0.923,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,3,0.944,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.885,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,10,0.931,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,10,0.945,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,10,0.934,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,10,0.942,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,4,0.933,0.009,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.934,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.932,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.841,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.841,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.907,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.934,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,3,0.929,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,5,0.835,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Accuracy,4,0.833,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Accuracy,10,0.923,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Accuracy,20,0.905,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Accuracy,20,0.925,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Accuracy,20,0.924,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Accuracy,15,0.912,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Accuracy,20,0.908,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Accuracy,20,0.88,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Accuracy,20,0.887,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Accuracy,10,0.848,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Accuracy,20,0.93,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Accuracy,20,0.924,0.006,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Accuracy,10,0.89,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Accuracy,10,0.89,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Accuracy,20,0.909,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Accuracy,20,0.935,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Accuracy,20,0.916,0.011,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Accuracy,13,0.933,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Accuracy,15,0.927,0.006,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Accuracy,15,0.916,0.006,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Accuracy,15,0.897,0.005,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Accuracy,20,0.929,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Accuracy,10,0.853,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Accuracy,20,0.883,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Accuracy,20,0.828,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Accuracy,15,0.901,0.005,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Accuracy,15,0.887,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Accuracy,5,0.867,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Accuracy,13,0.861,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Accuracy,10,0.915,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Accuracy,10,0.831,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Accuracy,20,0.872,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Accuracy,10,0.88,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.569,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.492,0.043,0.025,0.048,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.618,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.488,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.553,0.039,0.023,0.044,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.589,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.52,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.549,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.549,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.516,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.517,0.018,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.555,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.595,0.034,0.015,0.03,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.509,0.037,0.013,0.025,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.598,0.044,0.025,0.05,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.604,0.028,0.01,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.606,0.039,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.559,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.646,0.036,0.011,0.023,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.565,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.546,0.038,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.593,0.043,0.025,0.048,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.699,0.037,0.022,0.042,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.512,0.044,0.025,0.05,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.585,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.638,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.589,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.622,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.622,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.602,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.419,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.622,0.032,0.019,0.037,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.468,0.034,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.5,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.52,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.516,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.516,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.51,0.03,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.467,0.037,0.022,0.042,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.496,0.028,0.016,0.032,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.455,0.037,0.022,0.042,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.659,0.027,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.492,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.516,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.537,0.044,0.025,0.05,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.557,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.472,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.459,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Completeness,9,0.634,0.027,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.553,0.043,0.025,0.048,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.703,0.039,0.023,0.044,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.545,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.602,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.659,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.533,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.663,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.663,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.627,0.029,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.537,0.037,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.634,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.615,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.553,0.031,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.673,0.034,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.622,0.031,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.496,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.729,0.045,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.549,0.024,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.591,0.04,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.589,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.541,0.039,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.618,0.045,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.537,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.537,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.52,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.593,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.463,0.049,0.028,0.055,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.646,0.024,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.492,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.581,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.549,0.024,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.606,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.643,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.497,0.023,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.504,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.476,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.61,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,5,0.537,0.024,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.654,0.028,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.512,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.581,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.659,0.024,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,10,0.618,0.037,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.587,0.018,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,8,0.591,0.016,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.577,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Completeness,4,0.534,0.025,0.013,0.025,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.358,0.039,0.023,0.044,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.537,0.044,0.025,0.05,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.435,0.028,0.016,0.032,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.447,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.411,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.52,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.264,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.28,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Completeness,3,0.268,0.024,0.014,0.028,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.5,0.043,0.018,0.034,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.498,0.03,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,8,0.614,0.027,0.009,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.583,0.016,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,8,0.633,0.038,0.013,0.026,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,4,0.558,0.034,0.017,0.033,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.587,0.02,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.544,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.451,0.029,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,4,0.573,0.01,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,9,0.636,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,3,0.561,0.032,0.019,0.037,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.598,0.041,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.543,0.032,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.539,0.028,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.568,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.535,0.025,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,3,0.573,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.59,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.565,0.025,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,10,0.566,0.046,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,10,0.6,0.041,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.553,0.025,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,6,0.522,0.035,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,3,0.593,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.495,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,10,0.618,0.035,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,10,0.599,0.042,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,10,0.618,0.038,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,10,0.66,0.038,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,4,0.628,0.051,0.026,0.05,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.534,0.03,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.615,0.035,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.593,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.598,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.51,0.046,0.021,0.04,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.537,0.036,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,3,0.626,0.058,0.033,0.065,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,5,0.593,0.014,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Completeness,4,0.595,0.021,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Completeness,10,0.543,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Completeness,20,0.512,0.043,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Completeness,20,0.51,0.037,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Completeness,20,0.573,0.035,0.008,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Completeness,15,0.628,0.046,0.012,0.023,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Completeness,20,0.618,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Completeness,20,0.458,0.031,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Completeness,20,0.565,0.093,0.021,0.041,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Completeness,10,0.332,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Completeness,20,0.502,0.04,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Completeness,20,0.505,0.037,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Completeness,10,0.438,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Completeness,10,0.654,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Completeness,20,0.632,0.028,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Completeness,20,0.583,0.061,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Completeness,20,0.435,0.06,0.014,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Completeness,13,0.52,0.05,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Completeness,15,0.573,0.063,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Completeness,15,0.554,0.066,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Completeness,15,0.637,0.05,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Completeness,20,0.671,0.028,0.006,0.012,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Completeness,10,0.56,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Completeness,20,0.516,0.024,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Completeness,20,0.579,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Completeness,15,0.498,0.071,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Completeness,15,0.474,0.057,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Completeness,5,0.549,0.049,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Completeness,13,0.483,0.034,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Completeness,10,0.441,0.037,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Completeness,10,0.487,0.045,0.014,0.028,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Completeness,20,0.282,0.029,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Completeness,10,0.355,0.028,0.009,0.018,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.444,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0.042,0.118,0.042,0.082,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.167,0.176,0.056,0.109,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.408,0.224,0.071,0.139,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.2,0.172,0.054,0.107,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.667,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.667,0.333,0.192,0.377,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0.25,0.154,0.055,0.107,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.222,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.444,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.167,0.176,0.056,0.109,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0.533,0.298,0.133,0.261,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.444,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Emergencies,9,0.037,0.111,0.037,0.073,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.556,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0.067,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.222,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.383,0.112,0.036,0.07,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0.531,0.178,0.063,0.123,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.35,0.053,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.35,0.053,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.444,0.385,0.222,0.436,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0.867,0.183,0.082,0.16,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.667,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.467,0.172,0.054,0.107,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.556,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0.583,0.154,0.055,0.107,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.4,0.141,0.044,0.087,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.556,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.556,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.556,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.667,0.333,0.192,0.377,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.556,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.2,0.172,0.054,0.107,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.444,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.222,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,10,0.533,0.172,0.054,0.107,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,8,0.042,0.118,0.042,0.082,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Emergencies,4,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.222,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.111,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Emergencies,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.056,0.136,0.056,0.109,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.056,0.136,0.056,0.109,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,8,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.111,0.172,0.07,0.138,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,8,0.167,0.178,0.063,0.123,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,4,0.167,0.192,0.096,0.189,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.333,0.211,0.086,0.169,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.267,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.278,0.136,0.056,0.109,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,4,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,9,0.037,0.111,0.037,0.073,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.433,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.444,0.136,0.056,0.109,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.55,0.201,0.09,0.176,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.55,0.201,0.09,0.176,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.542,0.181,0.074,0.145,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,3,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.333,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.486,0.186,0.076,0.149,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,10,0.467,0.172,0.054,0.107,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,10,0.467,0.172,0.054,0.107,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.583,0.139,0.057,0.112,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,6,0.611,0.251,0.102,0.201,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,3,0.444,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,10,0.567,0.161,0.051,0.1,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,10,0.533,0.172,0.054,0.107,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,10,0.567,0.161,0.051,0.1,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,10,0.567,0.161,0.051,0.1,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,4,0.667,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.067,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.4,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.067,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,3,0.444,0.192,0.111,0.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,5,0.067,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Emergencies,4,0.083,0.167,0.083,0.163,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Emergencies,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Emergencies,20,0.1,0.157,0.035,0.069,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Emergencies,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Emergencies,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Emergencies,15,0,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Emergencies,20,0.1,0.157,0.035,0.069,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Emergencies,20,0.033,0.103,0.023,0.045,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Emergencies,20,0.45,0.196,0.044,0.086,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Emergencies,10,0.567,0.225,0.071,0.139,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Emergencies,20,0.1,0.157,0.035,0.069,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Emergencies,20,0.033,0.103,0.023,0.045,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Emergencies,10,0.033,0.105,0.033,0.065,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Emergencies,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Emergencies,20,0.342,0.037,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Emergencies,20,0.458,0.161,0.036,0.071,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Emergencies,20,0.6,0.137,0.031,0.06,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Emergencies,13,0.115,0.185,0.051,0.101,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Emergencies,15,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Emergencies,15,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Emergencies,15,0.156,0.213,0.055,0.108,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Emergencies,20,0.267,0.137,0.031,0.06,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Emergencies,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Emergencies,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Emergencies,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Emergencies,15,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Emergencies,15,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Emergencies,5,0.333,0.236,0.105,0.207,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Emergencies,13,0.122,0.162,0.045,0.088,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Emergencies,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Emergencies,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Emergencies,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Emergencies,10,0.167,0.176,0.056,0.109,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.766,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.723,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.756,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.692,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.627,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.662,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.754,0.033,0.019,0.037,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.647,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.652,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.647,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.688,0.036,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.684,0.03,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.782,0.026,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.743,0.024,0.008,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.657,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.752,0.025,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.783,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.774,0.03,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.782,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.75,0.025,0.008,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.734,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.746,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.821,0.026,0.015,0.029,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.667,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.652,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.736,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.776,0.039,0.023,0.045,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.716,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.716,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.701,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.657,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.771,0.048,0.028,0.054,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.655,0.027,0.01,0.019,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.597,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.637,0.034,0.02,0.039,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.692,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.667,0.048,0.028,0.054,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.669,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.632,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.647,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.622,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.749,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.607,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.542,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.597,0.065,0.038,0.074,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.658,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.527,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.512,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Escalation,9,0.718,0.027,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.713,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.781,0.052,0.03,0.059,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.682,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.657,0.045,0.026,0.051,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.677,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.739,0.049,0.028,0.055,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.701,0.032,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.692,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.654,0.041,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.821,0.039,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.801,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.839,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.775,0.044,0.016,0.03,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.851,0.022,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.829,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.724,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.866,0.038,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.751,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.712,0.038,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.782,0.023,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.729,0.033,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.776,0.036,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.738,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.738,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.708,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.768,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.74,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.811,0.034,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.697,0.034,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.662,0.048,0.028,0.054,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.766,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.764,0.059,0.034,0.067,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.787,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.769,0.025,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.776,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.743,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.796,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,5,0.732,0.028,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.851,0.039,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.711,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.607,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.776,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,10,0.821,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.778,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,8,0.754,0.029,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.721,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Escalation,4,0.649,0.019,0.01,0.019,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.554,0.059,0.034,0.067,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.584,0.093,0.054,0.106,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.552,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.495,0.033,0.019,0.038,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.55,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.676,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.34,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.374,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Escalation,3,0.34,0.024,0.014,0.027,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.727,0.018,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.764,0.033,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,8,0.749,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.754,0.023,0.009,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,8,0.777,0.024,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,4,0.754,0.019,0.01,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.765,0.048,0.02,0.038,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.759,0.043,0.019,0.038,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.669,0.026,0.01,0.021,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,4,0.722,0.039,0.019,0.038,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,9,0.73,0.028,0.009,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,3,0.708,0.033,0.019,0.038,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.839,0.03,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.8,0.056,0.023,0.044,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.774,0.033,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.771,0.038,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.767,0.042,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,3,0.811,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.785,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.783,0.03,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,10,0.713,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,10,0.727,0.042,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.727,0.041,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,6,0.725,0.045,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,3,0.721,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.77,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,10,0.824,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,10,0.753,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,10,0.823,0.022,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,10,0.83,0.022,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,4,0.833,0.015,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.725,0.037,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.82,0.027,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.779,0.027,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.773,0.029,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.794,0.029,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.749,0.034,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,3,0.838,0.039,0.022,0.044,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,5,0.758,0.031,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Escalation,4,0.765,0.031,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Escalation,10,0.642,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Escalation,20,0.687,0.032,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Escalation,20,0.64,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Escalation,20,0.756,0.026,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Escalation,15,0.765,0.04,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Escalation,20,0.724,0.03,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Escalation,20,0.527,0.025,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Escalation,20,0.81,0.04,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Escalation,10,0.722,0.024,0.007,0.015,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Escalation,20,0.678,0.037,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Escalation,20,0.54,0.05,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Escalation,10,0.568,0.036,0.011,0.023,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Escalation,10,0.693,0.033,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Escalation,20,0.823,0.03,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Escalation,20,0.733,0.03,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Escalation,20,0.6,0.054,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Escalation,13,0.711,0.045,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Escalation,15,0.755,0.027,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Escalation,15,0.751,0.038,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Escalation,15,0.82,0.037,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Escalation,20,0.731,0.037,0.008,0.016,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Escalation,10,0.76,0.045,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Escalation,20,0.758,0.029,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Escalation,20,0.76,0.018,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Escalation,15,0.689,0.067,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Escalation,15,0.617,0.057,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Escalation,5,0.776,0.038,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Escalation,13,0.714,0.043,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Escalation,10,0.6,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Escalation,10,0.76,0.022,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Escalation,20,0.384,0.04,0.009,0.018,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Escalation,10,0.35,0.035,0.011,0.022,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,F1,3,0.619,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.633,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.456,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.664,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.645,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.608,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,3,0.636,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,3,0.611,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.604,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.622,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,5,0.639,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,F1,10,0.656,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,F1,5,0.626,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,8,0.617,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.628,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,8,0.592,0.007,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,F1,10,0.612,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,10,0.612,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,F1,10,0.637,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,10,0.599,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,10,0.608,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,F1,3,0.604,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.407,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.654,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.625,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.565,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,3,0.607,0.009,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,3,0.56,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.555,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.577,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.673,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.429,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,8,0.671,0.01,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.669,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.669,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.675,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,3,0.653,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,F1,10,0.675,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,3,0.676,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.638,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.674,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,5,0.415,0.039,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.669,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.652,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.652,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,3,0.641,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.639,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.648,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,F1,9,0.598,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.603,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.471,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.662,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.617,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.533,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,3,0.596,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,5,0.536,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.525,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,5,0.568,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.61,0.011,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.61,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,F1,10,0.589,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,8,0.581,0.019,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,F1,10,0.625,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,10,0.582,0.005,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.643,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,5,0.402,0.022,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.674,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,10,0.65,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.639,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,8,0.603,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,F1,10,0.659,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,3,0.637,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.634,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.635,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,F1,5,0.61,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.611,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.49,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.668,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.634,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.543,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,3,0.597,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,F1,10,0.62,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,8,0.544,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.536,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.582,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,F1,3,0.6,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,5,0.604,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.492,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.672,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.62,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.516,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,10,0.601,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,8,0.491,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,8,0.482,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.56,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,F1,4,0.635,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.632,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,F1,3,0.444,0.027,0.015,0.03,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,F1,3,0.67,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.651,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,F1,3,0.614,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,F1,3,0.645,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,F1,3,0.573,0.016,0.009,0.019,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,F1,3,0.558,0.011,0.006,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,F1,3,0.558,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.615,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.653,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,8,0.602,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.628,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,8,0.629,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,4,0.662,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.621,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.661,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.675,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,4,0.606,0.013,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,9,0.622,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,3,0.654,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.588,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.607,0.008,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.612,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.652,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.613,0.011,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,3,0.612,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.662,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.612,0.011,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,10,0.661,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,10,0.656,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.662,0.011,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,6,0.624,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,3,0.666,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.546,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,10,0.612,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,10,0.664,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,10,0.616,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,10,0.64,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,4,0.612,0.012,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.663,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.623,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.49,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.49,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.584,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.665,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,3,0.614,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,5,0.484,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,F1,4,0.482,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,F1,10,0.614,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,F1,20,0.562,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,F1,20,0.642,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,F1,20,0.601,0.011,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,F1,15,0.564,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,F1,20,0.564,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,F1,20,0.54,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,F1,20,0.544,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,F1,10,0.506,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,F1,20,0.669,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,F1,20,0.645,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,F1,10,0.56,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,F1,10,0.531,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,F1,20,0.566,0.018,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,F1,20,0.627,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,F1,20,0.649,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,F1,13,0.662,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,F1,15,0.612,0.01,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,F1,15,0.596,0.01,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,F1,15,0.545,0.011,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,F1,20,0.623,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,F1,10,0.5,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,F1,20,0.544,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,F1,20,0.479,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,F1,15,0.578,0.01,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,F1,15,0.551,0.006,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,F1,5,0.527,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,F1,13,0.503,0.012,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,F1,10,0.625,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,F1,10,0.485,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,F1,20,0.571,0.013,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,F1,10,0.584,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.59,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.571,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.458,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.579,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.591,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.588,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.582,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.576,0.013,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.573,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.568,0.015,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.586,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.61,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.603,0.015,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.567,0.019,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.602,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.591,0.014,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.597,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.589,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.628,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.579,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.577,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.586,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.43,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.584,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.594,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.576,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.599,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.567,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.565,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.572,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.551,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.445,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.57,0.018,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.581,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.593,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.596,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.592,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.589,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.574,0.024,0.014,0.028,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.575,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.563,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.431,0.05,0.022,0.044,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.574,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.578,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.583,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.604,0.013,0.007,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.545,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.543,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,OverallScore,9,0.602,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.575,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.496,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.603,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.597,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.56,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.569,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.562,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.556,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.576,0.014,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.575,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.606,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.592,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.57,0.013,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.63,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.592,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.575,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.439,0.028,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.614,0.014,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.616,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.614,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.579,0.021,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.634,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.591,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.59,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.583,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.597,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.547,0.029,0.017,0.032,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.497,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.575,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.597,0.013,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.54,0.011,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.597,0.01,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.621,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.529,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.522,0.015,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.538,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.593,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,5,0.574,0.012,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.507,0.011,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.593,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.59,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.555,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,10,0.608,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.513,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,8,0.503,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.568,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,OverallScore,4,0.593,0.015,0.008,0.015,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.504,0.034,0.019,0.038,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.447,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.555,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.543,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.518,0.021,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.59,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.411,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.421,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,OverallScore,3,0.411,0.016,0.009,0.019,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.562,0.02,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.578,0.014,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,8,0.599,0.016,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.603,0.011,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,8,0.619,0.017,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,4,0.608,0.016,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.598,0.013,0.005,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.608,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.564,0.017,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,4,0.586,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,9,0.616,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,3,0.611,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.586,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.575,0.018,0.007,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.581,0.014,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.609,0.011,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.579,0.016,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,3,0.591,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.622,0.014,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.591,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,10,0.615,0.025,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,10,0.623,0.022,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.611,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,6,0.581,0.019,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,3,0.627,0.018,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.529,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,10,0.612,0.017,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,10,0.632,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,10,0.615,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,10,0.643,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,4,0.613,0.022,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.594,0.015,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.615,0.013,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.515,0.008,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.517,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.554,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.597,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,3,0.615,0.027,0.016,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,5,0.507,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,OverallScore,4,0.506,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,OverallScore,10,0.576,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,OverallScore,20,0.537,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,OverallScore,20,0.582,0.019,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,OverallScore,20,0.581,0.015,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,OverallScore,15,0.577,0.017,0.004,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,OverallScore,20,0.564,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,OverallScore,20,0.497,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,OverallScore,20,0.536,0.034,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,OverallScore,10,0.437,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,OverallScore,20,0.583,0.02,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,OverallScore,20,0.57,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,OverallScore,10,0.511,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,OverallScore,10,0.556,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,OverallScore,20,0.582,0.015,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,OverallScore,20,0.599,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,OverallScore,20,0.548,0.03,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,OverallScore,13,0.59,0.022,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,OverallScore,15,0.58,0.023,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,OverallScore,15,0.572,0.026,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,OverallScore,15,0.561,0.017,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,OverallScore,20,0.623,0.013,0.003,0.005,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,OverallScore,10,0.511,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,OverallScore,20,0.535,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,OverallScore,20,0.496,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,OverallScore,15,0.537,0.026,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,OverallScore,15,0.502,0.025,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,OverallScore,5,0.53,0.019,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,OverallScore,13,0.488,0.018,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,OverallScore,10,0.532,0.02,0.006,0.013,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,OverallScore,10,0.475,0.022,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,OverallScore,20,0.427,0.024,0.005,0.01,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,OverallScore,10,0.479,0.02,0.006,0.013,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.505,0.001,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.542,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.313,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.59,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.54,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.486,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.523,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.495,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.486,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.511,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.54,0.03,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.561,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.514,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.519,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.517,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.462,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.488,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.5,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.518,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.478,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.495,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.486,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.27,0.013,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.576,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.509,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.43,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.484,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.43,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.423,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.455,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.634,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.291,0.011,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.618,0.015,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.601,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.604,0.004,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.612,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.567,0.028,0.016,0.031,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.615,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.635,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.567,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.636,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.276,0.035,0.016,0.03,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.606,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.57,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.567,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.545,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.564,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.581,0.016,0.01,0.019,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Precision,9,0.468,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.489,0.004,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.322,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.575,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.493,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.387,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.472,0.013,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.389,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.378,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.431,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.501,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.484,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.457,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.456,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.497,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.45,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.561,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.262,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.596,0.004,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.543,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.528,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.5,0.029,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.556,0.006,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.537,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.533,0.015,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.536,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.491,0.008,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.523,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.345,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.6,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.526,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.405,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.466,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.495,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.417,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.405,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.471,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.472,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,5,0.503,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.346,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.601,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.507,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.371,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,10,0.471,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.355,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,8,0.347,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.439,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Precision,4,0.562,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.624,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.319,0.025,0.014,0.028,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.646,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.597,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.577,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.554,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.644,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.603,0.027,0.015,0.03,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Precision,3,0.615,0.037,0.021,0.042,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.518,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.573,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,8,0.474,0.01,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.518,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,8,0.509,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,4,0.567,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.504,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.571,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.626,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,4,0.484,0.018,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,9,0.495,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,3,0.554,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.455,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.493,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.498,0.007,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.55,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.502,0.012,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,3,0.496,0.004,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.56,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.496,0.013,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,10,0.572,0.007,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,10,0.553,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.581,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,6,0.531,0.018,0.007,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,3,0.567,0.013,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.418,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,10,0.484,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,10,0.56,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,10,0.489,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,10,0.515,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,4,0.479,0.012,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.582,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.504,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.354,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.354,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.479,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.584,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,3,0.49,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,5,0.348,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Precision,4,0.346,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Precision,10,0.499,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Precision,20,0.444,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Precision,20,0.555,0.011,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Precision,20,0.481,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Precision,15,0.425,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Precision,20,0.434,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Precision,20,0.423,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Precision,20,0.416,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Precision,10,0.455,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Precision,20,0.604,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Precision,20,0.567,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Precision,10,0.475,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Precision,10,0.385,0.005,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Precision,20,0.429,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Precision,20,0.515,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Precision,20,0.615,0.044,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Precision,13,0.589,0.029,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Precision,15,0.5,0.025,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Precision,15,0.487,0.026,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Precision,15,0.406,0.014,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Precision,20,0.493,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Precision,10,0.361,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Precision,20,0.415,0.007,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Precision,20,0.343,0.004,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Precision,15,0.474,0.03,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Precision,15,0.434,0.013,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Precision,5,0.401,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Precision,13,0.379,0.011,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Precision,10,0.551,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Precision,10,0.37,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Precision,20,0.635,0.019,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Precision,10,0.588,0.007,0.002,0.005,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.799,0.009,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.761,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.839,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.759,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.801,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.812,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.812,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.797,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.798,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.794,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.782,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.789,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.8,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.762,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.802,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.824,0.011,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.823,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.79,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.826,0.006,0.002,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.8,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.79,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.8,0.013,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.834,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.756,0.001,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.808,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.82,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.816,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.804,0.004,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.807,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.789,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.717,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.81,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.735,0.011,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.754,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.75,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.754,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.771,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.748,0.014,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.723,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.729,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.717,0.011,0.006,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.843,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.747,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.762,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.768,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.78,0.021,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.736,0.013,0.007,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.733,0.013,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Recall,9,0.83,0.01,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.786,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.873,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.778,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.825,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.858,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.807,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.861,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.858,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.833,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.78,0.02,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.824,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.831,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.799,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.843,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.825,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.752,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.866,0.034,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.777,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.81,0.014,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.808,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.763,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.808,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.783,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.783,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.779,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.807,0.012,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.735,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.85,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.753,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.798,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.824,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.83,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.831,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.783,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.792,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.763,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.824,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,5,0.755,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.849,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.763,0.009,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.799,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.852,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,10,0.831,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.795,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,8,0.791,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.773,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Recall,4,0.73,0.016,0.008,0.015,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.64,0.011,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.732,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.696,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.717,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.656,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.772,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.516,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.519,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Recall,3,0.512,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.757,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.759,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,8,0.826,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.797,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,8,0.824,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,4,0.795,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.809,0.011,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.784,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.731,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,4,0.811,0.009,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,9,0.84,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,3,0.796,0.011,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.829,0.008,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.789,0.016,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.794,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.799,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.788,0.012,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,3,0.797,0.004,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.809,0.01,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.8,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,10,0.784,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,10,0.806,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.768,0.019,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,6,0.757,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,3,0.808,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.785,0.008,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,10,0.831,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,10,0.817,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,10,0.833,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,10,0.844,0.017,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,4,0.845,0.015,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.771,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.816,0.014,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.797,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.797,0.011,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.749,0.012,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.772,0.015,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,3,0.819,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,5,0.792,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Recall,4,0.795,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Recall,10,0.796,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Recall,20,0.768,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Recall,20,0.762,0.011,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Recall,20,0.802,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Recall,15,0.835,0.011,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Recall,20,0.806,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Recall,20,0.747,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Recall,20,0.787,0.047,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Recall,10,0.571,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Recall,20,0.752,0.024,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Recall,20,0.749,0.025,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Recall,10,0.682,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Recall,10,0.857,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Recall,20,0.833,0.013,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Recall,20,0.805,0.037,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Recall,20,0.694,0.057,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Recall,13,0.761,0.04,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Recall,15,0.793,0.039,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Recall,15,0.77,0.042,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Recall,15,0.831,0.019,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Recall,20,0.846,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Recall,10,0.814,0.014,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Recall,20,0.79,0.011,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Recall,20,0.793,0.007,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Recall,15,0.747,0.047,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Recall,15,0.757,0.026,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Recall,5,0.767,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Recall,13,0.751,0.018,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Recall,10,0.721,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Recall,10,0.704,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Recall,20,0.519,0.021,0.005,0.009,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Recall,10,0.58,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.542,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.573,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.444,0.013,0.007,0.015,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.614,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.574,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.533,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.556,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.54,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.533,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.551,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.583,0.01,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.599,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.544,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.551,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.55,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.507,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.526,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.543,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.556,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.52,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.532,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.523,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.402,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.602,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.547,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.485,0.001,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.535,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.486,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.481,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.503,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.66,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.447,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.644,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.631,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.637,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.639,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.615,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.646,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.662,0.001,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.632,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.664,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.426,0.028,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.633,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.606,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.609,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.6,0.003,0.002,0.003,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.606,0.009,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.617,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Restraint,9,0.508,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.528,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.424,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.604,0.011,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.535,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.452,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.52,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.455,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.448,0.003,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.487,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.539,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.525,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.503,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.517,0.013,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.538,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.499,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.586,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.403,0.014,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.615,0.001,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.57,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.563,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.551,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.586,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.569,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.567,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.569,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.529,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.56,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.45,0.018,0.01,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.626,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.566,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.476,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.523,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.543,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.491,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.484,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.529,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.518,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,5,0.547,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.441,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.629,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.554,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.455,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,10,0.52,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.451,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,8,0.448,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.513,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Restraint,4,0.61,0.007,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.665,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.472,0.021,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.672,0.001,0.001,0.002,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.634,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.629,0.014,0.008,0.015,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.597,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.694,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.668,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Restraint,3,0.685,0.024,0.014,0.027,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.551,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.595,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,8,0.515,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.548,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,8,0.547,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,4,0.59,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.538,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.591,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.651,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,4,0.523,0.009,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,9,0.536,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,3,0.579,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.502,0.003,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.53,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.539,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.573,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.539,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,3,0.533,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.584,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.534,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,10,0.593,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,10,0.58,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.602,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,6,0.567,0.009,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,3,0.588,0.014,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.492,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,10,0.524,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,10,0.583,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,10,0.526,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,10,0.548,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,4,0.52,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.608,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.54,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.45,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.45,0.004,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.53,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.609,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,3,0.53,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,5,0.448,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Restraint,4,0.447,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Restraint,10,0.543,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Restraint,20,0.494,0.016,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Restraint,20,0.591,0.01,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Restraint,20,0.521,0.013,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Restraint,15,0.478,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Restraint,20,0.487,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Restraint,20,0.498,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Restraint,20,0.487,0.016,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Restraint,10,0.554,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Restraint,20,0.631,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Restraint,20,0.601,0.018,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Restraint,10,0.534,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Restraint,10,0.452,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Restraint,20,0.485,0.016,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Restraint,20,0.545,0.026,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Restraint,20,0.648,0.045,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Restraint,13,0.613,0.029,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Restraint,15,0.537,0.024,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Restraint,15,0.532,0.026,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Restraint,15,0.469,0.013,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Restraint,20,0.541,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Restraint,10,0.451,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Restraint,20,0.491,0.008,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Restraint,20,0.446,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Restraint,15,0.532,0.03,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Restraint,15,0.505,0.017,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Restraint,5,0.488,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Restraint,13,0.46,0.01,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Restraint,10,0.59,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Restraint,10,0.479,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Restraint,20,0.693,0.017,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Restraint,10,0.651,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.673,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.681,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.374,0.02,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.665,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.657,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.656,0.026,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.697,0.01,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.654,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.65,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.652,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.68,0.017,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.691,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.687,0.028,0.013,0.025,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.663,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.671,0.013,0.007,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.689,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.681,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.686,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.701,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.675,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.672,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.657,0.011,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.328,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.659,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.66,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.633,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.693,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.615,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.617,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.627,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.65,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.347,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.64,0.019,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.634,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.64,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.654,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.667,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.633,0.02,0.006,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.635,0.026,0.015,0.029,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.62,0.033,0.019,0.037,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.616,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.327,0.067,0.03,0.059,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.621,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.626,0.033,0.019,0.037,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.613,0.027,0.015,0.03,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.664,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.577,0.026,0.015,0.029,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.58,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Safety,9,0.694,0.023,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.661,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.443,0.04,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.674,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.67,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.614,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.679,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.61,0.012,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.605,0.002,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.643,0.015,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.666,0.02,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.682,0.021,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.69,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.66,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.707,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.688,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.67,0.008,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.339,0.048,0.021,0.042,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.698,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.705,0.025,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.706,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.661,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.715,0.024,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.687,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.686,0.002,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.682,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.69,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.652,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.442,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.631,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.65,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.614,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.684,0.015,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.7,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.616,0.012,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.59,0.032,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.631,0.028,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.674,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,5,0.65,0.019,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.471,0.02,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.661,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.643,0.014,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.592,0.008,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,10,0.717,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.52,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,8,0.491,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.625,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,All,Safety,4,0.647,0.014,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.607,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.367,0.033,0.019,0.037,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.621,0.024,0.014,0.027,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.584,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.567,0.021,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.673,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.483,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.486,0.005,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,All,Safety,3,0.477,0.007,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.658,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.669,0.011,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,8,0.695,0.023,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.698,0.02,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,8,0.698,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,4,0.694,0.023,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.691,0.034,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.712,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.641,0.022,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,4,0.686,0.032,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,9,0.698,0.016,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,3,0.714,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.69,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.675,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.688,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.707,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.69,0.017,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,3,0.687,0.032,0.019,0.037,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.708,0.018,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.7,0.008,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,10,0.705,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,10,0.706,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.697,0.025,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,6,0.675,0.025,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,3,0.715,0.039,0.022,0.044,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.619,0.015,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,10,0.73,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,10,0.736,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,10,0.736,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,10,0.756,0.024,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,4,0.729,0.03,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.655,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.715,0.024,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.522,0.012,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.524,0.013,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.64,0.02,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.661,0.029,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,3,0.718,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,5,0.5,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,All,Safety,4,0.499,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,All,Safety,10,0.654,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,All,Safety,20,0.628,0.02,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,All,Safety,20,0.668,0.02,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,All,Safety,20,0.672,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,All,Safety,15,0.662,0.021,0.005,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,All,Safety,20,0.609,0.018,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,All,Safety,20,0.543,0.011,0.002,0.005,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,All,Safety,20,0.575,0.036,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,All,Safety,10,0.49,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,All,Safety,20,0.642,0.024,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,All,Safety,20,0.621,0.031,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,All,Safety,10,0.586,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,All,Safety,10,0.605,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,All,Safety,20,0.664,0.03,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,All,Safety,20,0.695,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,All,Safety,20,0.628,0.031,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,All,Safety,13,0.663,0.024,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,All,Safety,15,0.646,0.028,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,All,Safety,15,0.649,0.041,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,All,Safety,15,0.613,0.024,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,All,Safety,20,0.679,0.019,0.004,0.008,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,All,Safety,10,0.538,0.014,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,All,Safety,20,0.612,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,All,Safety,20,0.482,0.013,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,All,Safety,15,0.604,0.023,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,All,Safety,15,0.537,0.027,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,All,Safety,5,0.563,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,All,Safety,13,0.527,0.026,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,All,Safety,10,0.599,0.015,0.005,0.009,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,All,Safety,10,0.461,0.024,0.008,0.015,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,All,Safety,20,0.494,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,All,Safety,10,0.525,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.868,0.209,0.121,0.237,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.008,0.193,0.112,0.219,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.888,0.614,0.354,0.694,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.008,0.193,0.112,0.219,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.746,0.269,0.155,0.304,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.697,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.417,0.722,0.417,0.817,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,5.426,1.168,0.674,1.322,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.684,0.153,0.089,0.174,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,2.784,0.155,0.089,0.175,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,13.426,2.891,1.669,3.272,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.468,0.332,0.191,0.375,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.782,0.13,0.075,0.147,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,23.333,8.819,5.092,9.98,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.889,0.962,0.556,1.089,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.384,0.963,0.556,1.089,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.192,0.058,0.034,0.066,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.143,0.296,0.171,0.335,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.862,2.036,1.176,2.304,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.131,0.367,0.212,0.416,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.024,0.482,0.278,0.546,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.871,0.247,0.143,0.28,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.586,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.135,0.647,0.373,0.732,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.171,0.257,0.148,0.291,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.636,0.139,0.08,0.157,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,3,77.778,38.49,22.222,43.556,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,11.126,2.774,1.602,3.139,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.164,0.772,0.446,0.874,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.994,0.318,0.184,0.36,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.838,0.179,0.103,0.202,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.159,0.807,0.466,0.914,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.4,0.576,0.333,0.652,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.994,0.318,0.184,0.36,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.766,0.234,0.135,0.265,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.906,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.833,0.722,0.417,0.817,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.153,0.373,0.215,0.422,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.782,0.13,0.075,0.147,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.262,0.808,0.467,0.915,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,3.225,0.409,0.183,0.358,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,2.832,0.167,0.075,0.146,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,5,53.333,27.386,12.247,24.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,9.485,0.755,0.337,0.661,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,5.042,0.997,0.446,0.874,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.223,0.173,0.055,0.107,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.809,0.158,0.05,0.098,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh,10,78.333,28.382,8.975,17.592,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,13.492,1.571,0.497,0.974,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,4.146,0.374,0.118,0.232,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,3.036,0.414,0.185,0.363,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,3.304,0.24,0.107,0.211,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,10.152,1.341,0.6,1.175,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,3.993,0.518,0.232,0.454,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,3.427,0.461,0.163,0.319,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,2.829,0.386,0.136,0.267,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,8,81.25,25.877,9.149,17.932,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,9.36,2.235,0.79,1.549,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,4.564,0.471,0.166,0.326,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.707,0.137,0.079,0.155,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.862,0.138,0.079,0.156,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,61.111,34.694,20.031,39.26,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,11.574,0.802,0.463,0.907,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,3.688,0.415,0.239,0.469,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,3.17,0.412,0.146,0.285,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,2.474,0.253,0.089,0.175,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,8,93.75,17.678,6.25,12.25,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,12.208,2.583,0.913,1.79,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,5.629,0.989,0.35,0.685,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.284,0.244,0.077,0.151,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.902,0.264,0.084,0.164,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,10,100,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,11.636,2.419,0.765,1.5,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,4.025,0.329,0.104,0.204,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.161,0.24,0.076,0.149,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,3.04,0.253,0.08,0.157,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,10,86.667,28.109,8.889,17.422,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,10.373,1.749,0.553,1.084,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,4.188,0.525,0.166,0.326,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.189,0.391,0.124,0.242,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.985,0.4,0.126,0.248,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh,10,78.333,28.382,8.975,17.592,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,14.599,2.371,0.75,1.47,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,3.945,0.589,0.186,0.365,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.445,0.343,0.108,0.213,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.53,0.266,0.084,0.165,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,10,95,15.811,5,9.8,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,10.842,2.342,0.741,1.451,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,4.936,0.572,0.181,0.355,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.51,0.503,0.159,0.312,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.595,0.306,0.097,0.19,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,10,85,24.152,7.638,14.97,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,10.891,2.693,0.852,1.669,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,4.74,0.488,0.154,0.302,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.062,0.055,0.032,0.062,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.032,0.092,0.053,0.104,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,10.067,1.012,0.584,1.145,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.237,0.28,0.161,0.316,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,6.296,0.642,0.37,0.726,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.572,0.181,0.104,0.204,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,2.633,0.069,0.04,0.078,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,14.286,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.459,0.239,0.138,0.271,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.94,0.34,0.196,0.385,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,44.444,9.623,5.556,10.889,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.394,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.237,0.28,0.161,0.316,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.112,0.295,0.17,0.333,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.054,0.321,0.186,0.364,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.754,0.666,0.385,0.754,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.976,0.387,0.224,0.438,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.691,0.226,0.131,0.256,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.586,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.926,0.642,0.37,0.726,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.878,0.31,0.179,0.35,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.946,0.146,0.084,0.165,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,10.067,1.012,0.584,1.145,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.121,0.52,0.3,0.588,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.047,0.278,0.161,0.315,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.733,0.152,0.088,0.172,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.12,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.676,0.357,0.206,0.405,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.961,0.285,0.164,0.322,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.663,0.181,0.104,0.204,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.906,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,6.564,0.636,0.367,0.72,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.882,0.332,0.191,0.375,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.929,0.262,0.151,0.296,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.602,0.626,0.362,0.709,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.673,0.29,0.167,0.328,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.609,0.039,0.022,0.044,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,3,26.111,6.736,3.889,7.622,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.462,1.332,0.769,1.508,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.458,0.169,0.097,0.191,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,6.961,1.252,0.723,1.416,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.616,0.17,0.098,0.192,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,2.615,0.152,0.088,0.172,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,13.095,1.031,0.595,1.167,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,4.056,0.415,0.147,0.288,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,2.804,0.23,0.081,0.159,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,8,22.202,5.627,1.989,3.899,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,8.098,0.998,0.353,0.692,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,4.564,0.309,0.109,0.214,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.073,0.332,0.192,0.376,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.837,0.172,0.099,0.194,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,36.111,12.729,7.349,14.405,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.012,0.267,0.154,0.302,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.413,0.137,0.079,0.156,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.103,0.213,0.123,0.241,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,33.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.095,1.65,0.952,1.867,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.419,0.219,0.126,0.247,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.622,0.204,0.118,0.231,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.94,0.34,0.196,0.385,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,31.667,16.073,9.28,18.188,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,10.104,3.636,2.099,4.114,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.227,0.105,0.06,0.118,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.451,0.119,0.069,0.135,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.905,0.3,0.173,0.339,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,3,38.889,9.623,5.556,10.889,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.442,1.125,0.649,1.273,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.643,0.429,0.248,0.486,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.855,0.334,0.106,0.207,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,3.046,0.23,0.073,0.142,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh,10,28.667,11.595,3.667,7.187,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,8.155,0.925,0.292,0.573,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,4.109,0.449,0.142,0.278,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.918,0.372,0.215,0.421,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.834,0.121,0.07,0.137,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,3,28.889,7.698,4.444,8.711,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.937,1.803,1.041,2.04,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.491,0.301,0.174,0.341,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.565,0.438,0.253,0.495,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.862,0.138,0.079,0.156,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,38.889,9.623,5.556,10.889,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,6.843,1.954,1.128,2.211,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.4,0.576,0.333,0.652,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.079,0.404,0.233,0.457,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.862,0.146,0.084,0.165,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,26.111,6.736,3.889,7.622,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.222,0.962,0.556,1.089,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.486,0.239,0.138,0.271,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,7.227,2.577,1.152,2.259,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,2.846,0.135,0.06,0.118,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,2.518,0.324,0.145,0.284,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,11.263,1.399,0.625,1.226,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.004,0.16,0.093,0.181,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.066,0.146,0.084,0.165,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,33.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.425,1.067,0.616,1.207,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,3.906,0.237,0.137,0.268,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.643,0.391,0.226,0.442,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.973,0.1,0.058,0.113,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.639,2.055,1.187,2.326,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.12,0.254,0.147,0.287,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.679,0.334,0.193,0.378,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.002,0.106,0.061,0.12,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.555,1.609,0.929,1.82,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.139,0.422,0.243,0.477,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.536,0.188,0.108,0.212,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.984,0.244,0.141,0.276,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,3,66.667,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.928,1.162,0.671,1.315,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.013,0.29,0.167,0.328,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.966,0.338,0.195,0.383,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.946,0.155,0.089,0.175,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,5.828,0.74,0.428,0.838,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.565,0.377,0.217,0.426,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.171,0.174,0.1,0.197,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.888,0.129,0.075,0.146,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,6.035,0.571,0.33,0.646,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.287,0.105,0.06,0.118,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh,9,2.668,0.301,0.1,0.196,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh,9,3.248,0.344,0.115,0.224,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh,9,83.333,25,8.333,16.333,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh,9,11.79,2.032,0.677,1.328,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,9,4.712,0.501,0.167,0.327,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.022,0.315,0.182,0.357,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.913,0.049,0.028,0.055,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.697,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.758,1.035,0.597,1.171,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.675,0.636,0.367,0.72,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.406,0.147,0.085,0.167,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,3.394,0.34,0.196,0.385,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,14.815,3.208,1.852,3.63,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.074,0.241,0.139,0.273,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.196,0.154,0.089,0.174,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,41.667,14.434,8.333,16.333,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.394,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.419,0.219,0.126,0.247,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.066,0.146,0.084,0.165,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.133,0.196,0.113,0.222,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.889,0.962,0.556,1.089,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.293,0.219,0.126,0.247,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.192,0.058,0.034,0.066,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.197,0.161,0.093,0.182,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,8.625,0.807,0.466,0.914,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.014,0.442,0.255,0.501,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.736,0.355,0.205,0.402,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.975,2.219,1.281,2.512,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.504,0.659,0.38,0.746,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,3.174,0.197,0.088,0.173,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,2.288,0.118,0.053,0.103,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,8.97,0.689,0.308,0.604,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,7.872,0.729,0.326,0.639,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.167,0.203,0.117,0.229,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.232,0.185,0.107,0.209,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.625,0.807,0.466,0.914,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,8.928,1.162,0.671,1.315,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,2.949,0.168,0.075,0.148,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,2.649,0.114,0.051,0.1,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,9.677,0.893,0.399,0.783,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,5.853,0.511,0.229,0.448,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.367,0.426,0.246,0.482,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.643,0.209,0.121,0.237,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,3,77.778,38.49,22.222,43.556,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,10.067,1.012,0.584,1.145,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.961,0.942,0.544,1.066,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.175,0.275,0.159,0.311,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.871,0.247,0.143,0.28,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,11.667,1.443,0.833,1.633,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.167,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,2.991,0.226,0.072,0.14,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.55,0.261,0.083,0.162,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,10,90,21.082,6.667,13.067,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,11.136,2.45,0.775,1.519,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,6.102,1.194,0.377,0.74,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,3.019,0.301,0.106,0.209,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,2.919,0.309,0.109,0.214,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,8,93.75,17.678,6.25,12.25,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,8.799,1.735,0.614,1.203,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,5.221,0.606,0.214,0.42,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.149,0.224,0.071,0.139,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.852,0.254,0.08,0.158,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh,10,95,15.811,5,9.8,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,15.718,4.417,1.397,2.738,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,4.045,0.364,0.115,0.225,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,2.865,0.287,0.091,0.178,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.542,0.299,0.095,0.185,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,10,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,11.636,3.116,0.985,1.932,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,6.59,1.138,0.36,0.705,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.004,0.136,0.078,0.154,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.17,0.241,0.139,0.273,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,3,48.333,44.814,25.874,50.712,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.723,0.596,0.344,0.674,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.4,0.576,0.333,0.652,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,6.508,1.698,0.759,1.488,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,2.907,0.24,0.107,0.21,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,2.476,0.288,0.129,0.253,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,12.5,2.406,1.076,2.109,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.205,0.254,0.147,0.288,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.228,0.104,0.06,0.118,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,19.167,6.292,3.632,7.12,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,10.53,1.765,1.019,1.998,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.505,0.438,0.253,0.496,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.184,0.255,0.081,0.158,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,3.267,0.274,0.087,0.17,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,10,85,24.152,7.638,14.97,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,11.781,2.617,0.828,1.622,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,3.593,0.292,0.092,0.181,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.916,0.128,0.074,0.145,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.946,0.385,0.222,0.435,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,61.111,34.694,20.031,39.26,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,11.204,1.253,0.723,1.417,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.937,0.871,0.503,0.986,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,3.034,0.231,0.082,0.16,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,3.122,0.285,0.101,0.197,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,8,87.5,23.146,8.183,16.039,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,7.996,1.222,0.432,0.847,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,4.887,0.592,0.209,0.41,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,3.131,0.233,0.074,0.144,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,3.11,0.286,0.091,0.178,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh,10,56.667,23.831,7.536,14.77,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,15.147,4.164,1.317,2.581,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,3.818,0.438,0.138,0.271,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.971,0.051,0.03,0.058,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.018,0.278,0.16,0.314,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,3,61.111,34.694,20.031,39.26,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.968,0.605,0.349,0.684,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.001,0.051,0.03,0.058,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.984,0.244,0.141,0.276,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,77.778,38.49,22.222,43.556,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.87,0.471,0.272,0.533,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.971,0.051,0.03,0.058,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.062,0.055,0.032,0.062,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,55.556,38.49,22.222,43.556,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.586,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.024,0.414,0.239,0.469,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,2.765,0.103,0.046,0.091,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,2.997,0.102,0.045,0.089,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,10.5,1.118,0.5,0.98,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,4.908,0.597,0.267,0.523,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.079,0.281,0.162,0.318,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.569,0.132,0.076,0.149,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.625,0.807,0.466,0.914,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,6.127,0.212,0.123,0.24,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.504,0.781,0.451,0.883,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.324,0.24,0.139,0.272,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,3.622,0.204,0.118,0.231,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,15.873,1.375,0.794,1.556,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.755,0.159,0.092,0.179,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.097,0.144,0.083,0.163,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,21.667,2.887,1.667,3.267,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.009,0.592,0.342,0.67,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.563,0.344,0.198,0.389,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.213,0.313,0.181,0.355,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.382,0.231,0.133,0.261,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.12,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,3.973,0.389,0.225,0.44,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.036,0.155,0.089,0.175,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.043,0.074,0.043,0.083,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,10.278,2.097,1.211,2.373,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,12.632,1.591,0.919,1.801,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.831,0.046,0.026,0.052,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.302,0.164,0.095,0.185,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,3,77.778,38.49,22.222,43.556,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.848,0.262,0.152,0.297,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,2.959,0.24,0.076,0.149,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.695,0.148,0.047,0.092,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh,10,80,25.82,8.165,16.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,13.024,2.366,0.748,1.467,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,5.186,0.565,0.179,0.35,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,3.049,0.165,0.058,0.114,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,1.986,0.05,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,8,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,9.59,0.857,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,16.384,2.68,0.948,1.857,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.242,0.289,0.167,0.327,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,1.961,0.038,0.022,0.044,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.586,1.723,0.995,1.95,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,16.984,2.87,1.657,3.248,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.297,0.062,0.036,0.07,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.257,0.06,0.034,0.068,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,7.167,0.513,0.296,0.581,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.871,0.247,0.143,0.28,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.17,0.241,0.139,0.273,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.838,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.643,0.429,0.248,0.486,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,5,3.178,0.227,0.101,0.199,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,5,2.658,0.203,0.091,0.178,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,5,9.333,0.913,0.408,0.8,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,5,5.241,0.994,0.445,0.872,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.532,0.869,0.502,0.983,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.298,0.303,0.175,0.342,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,4.013,0.29,0.167,0.328,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,13.056,3.368,1.944,3.811,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.489,0.071,0.041,0.08,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.971,0.051,0.03,0.058,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,26.111,6.736,3.889,7.622,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.12,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.69,0.125,0.072,0.141,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.181,0.341,0.197,0.386,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.093,0.055,0.032,0.062,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,7.692,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.464,0.581,0.336,0.658,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.044,0.254,0.147,0.288,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,1.924,0.063,0.036,0.071,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.091,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,26.111,6.736,3.889,7.622,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,10,2.754,0.294,0.093,0.182,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,10,2.888,0.307,0.097,0.19,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,10,85,24.152,7.638,14.97,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,10,13.968,4.478,1.416,2.776,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,10,5.337,1.31,0.414,0.812,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,4.041,0.31,0.11,0.215,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,1.786,0.038,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,8,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,7.438,0.514,0.182,0.356,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,21.419,12.331,4.36,8.545,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,8,4.233,0.33,0.117,0.229,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,8,1.804,0.075,0.026,0.052,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,8,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,8,6.182,0.443,0.156,0.307,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,8,31.488,28.378,10.033,19.665,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,2.986,0.434,0.25,0.491,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.486,0.153,0.089,0.174,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,9.764,1.166,0.673,1.32,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,6.608,0.957,0.552,1.083,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh,4,2.991,0.155,0.077,0.152,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh,4,2.708,0.134,0.067,0.131,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh,4,41.667,9.623,4.811,9.43,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh,4,10.051,0.827,0.413,0.81,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,4,5.893,0.284,0.142,0.278,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.514,0.364,0.21,0.412,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.408,0.173,0.1,0.196,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh,3,36.111,12.729,7.349,14.405,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,6.564,0.636,0.367,0.72,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,8.889,0.962,0.556,1.089,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.968,0.605,0.349,0.684,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.758,0.162,0.094,0.184,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,2.81,0.16,0.092,0.181,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,17.593,6.991,4.036,7.911,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.58,0.533,0.308,0.603,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.891,0.169,0.098,0.192,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh,3,17.5,4.33,2.5,4.9,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,6.825,0.275,0.159,0.311,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,6.199,0.81,0.468,0.917,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.574,0.128,0.074,0.145,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.069,0.191,0.11,0.216,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,61.111,34.694,20.031,39.26,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,5.773,0.189,0.109,0.214,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,5.008,0.251,0.145,0.284,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.755,0.159,0.092,0.179,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.568,0.117,0.068,0.132,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,6.144,0.453,0.261,0.512,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,6.06,0.721,0.416,0.816,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,3.127,0.098,0.056,0.111,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,3.159,0.058,0.034,0.066,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh,3,56.667,40.415,23.333,45.733,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,4.658,0.557,0.322,0.63,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.111,0.096,0.056,0.109,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.48,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh,3,33.333,14.434,8.333,16.333,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,4.353,0.189,0.109,0.214,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,11.496,2.619,1.512,2.963,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.004,0.16,0.093,0.181,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.421,0.069,0.04,0.078,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh,3,30.556,4.811,2.778,5.444,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,4.486,0.239,0.138,0.271,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,13.056,3.368,1.944,3.811,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh,3,4.171,0.174,0.1,0.197,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh,3,2.501,0.063,0.036,0.071,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh,3,33.333,14.434,8.333,16.333,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh,3,4.227,0.105,0.06,0.118,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh,3,11.496,2.619,1.512,2.963,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,3.443,0.467,0.191,0.374,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,2.783,0.429,0.175,0.343,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,91.667,20.412,8.333,16.333,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,8.757,0.808,0.33,0.646,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,4.71,0.591,0.241,0.473,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,3.663,0.337,0.138,0.27,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,2.849,0.323,0.132,0.259,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,31.111,10.628,4.339,8.504,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,9.531,1.338,0.546,1.071,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,4.464,0.68,0.278,0.544,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,8,3.035,0.385,0.136,0.267,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,8,2.716,0.2,0.071,0.139,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,8,93.75,17.678,6.25,12.25,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,8,11.955,2.796,0.988,1.937,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,8,4.933,0.572,0.202,0.396,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,3.039,0.181,0.074,0.145,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,3.055,0.182,0.074,0.146,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,100,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,11.323,2.715,1.109,2.173,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,4.153,0.456,0.186,0.365,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,8,3.167,0.45,0.159,0.312,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,8,2.982,0.303,0.107,0.21,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,8,70.833,31.81,11.247,22.044,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,8,14.335,2.186,0.773,1.515,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,8,3.983,0.535,0.189,0.37,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,4,3.481,0.396,0.198,0.388,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,4,3.046,0.537,0.269,0.526,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,4,32.083,13.15,6.575,12.887,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,4,10.933,2.51,1.255,2.46,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,4,4.098,0.307,0.154,0.301,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,3.049,0.262,0.107,0.21,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,3.085,0.296,0.121,0.237,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,83.333,25.82,10.541,20.66,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,11.667,4.303,1.757,3.443,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,4.165,0.365,0.149,0.292,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.222,0.266,0.119,0.233,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,2.929,0.143,0.064,0.125,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,45,30.957,13.844,27.135,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,12.024,1.4,0.626,1.227,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,4.332,0.562,0.251,0.492,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,4.063,0.557,0.227,0.446,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,2.746,0.247,0.101,0.198,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,23.333,5.869,2.396,4.696,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,8.028,1.168,0.477,0.935,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,4.818,0.736,0.3,0.589,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,4,2.945,0.126,0.063,0.124,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,4,3.009,0.316,0.158,0.31,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,4,87.5,25,12.5,24.5,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,4,11.12,2.883,1.441,2.825,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,4,4.496,0.099,0.049,0.097,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,9,2.83,0.172,0.057,0.112,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,9,3.015,0.127,0.042,0.083,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,9,81.481,28.191,9.397,18.418,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,9,13.051,1.763,0.588,1.152,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,9,4.521,0.592,0.197,0.387,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,3,2.935,0.479,0.277,0.542,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,3,3.03,0.36,0.208,0.408,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,3,83.333,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,3,12.037,0.802,0.463,0.907,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,3,4.486,0.239,0.138,0.271,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.044,0.223,0.1,0.196,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,2.359,0.189,0.084,0.166,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,12.231,3.162,1.414,2.771,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,6.686,1.483,0.663,1.3,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,3.022,0.298,0.122,0.239,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,2.835,0.399,0.163,0.319,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,83.333,25.82,10.541,20.66,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,9.061,1.961,0.801,1.569,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,5.622,1.155,0.472,0.925,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.065,0.25,0.112,0.219,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,2.69,0.232,0.104,0.203,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,80,27.386,12.247,24.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,11.246,2.284,1.021,2.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,5.225,0.684,0.306,0.6,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.482,0.212,0.095,0.185,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,2.879,0.138,0.062,0.121,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,35,9.129,4.082,8.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,13.222,3.931,1.758,3.446,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,3.965,0.382,0.171,0.334,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,2.994,0.31,0.127,0.248,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,2.756,0.233,0.095,0.187,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,91.667,20.412,8.333,16.333,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,10.543,1.536,0.627,1.229,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,5.278,0.629,0.257,0.503,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,3,3.17,0.241,0.139,0.273,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,3,2.726,0.302,0.174,0.342,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,3,66.667,28.868,16.667,32.667,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,3,12.753,3.794,2.191,4.294,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,3,4.653,0.528,0.305,0.598,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.181,0.258,0.115,0.226,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,3.096,0.197,0.088,0.173,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,23.667,5.821,2.603,5.103,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,11.857,1.845,0.825,1.618,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,4.389,0.467,0.209,0.409,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,2.912,0.228,0.093,0.182,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,2.788,0.333,0.136,0.267,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,91.667,20.412,8.333,16.333,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,11.25,2.97,1.212,2.376,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,5.394,0.744,0.304,0.595,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,10,3.105,0.181,0.057,0.112,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,10,3.347,0.308,0.097,0.191,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,10,46.667,21.588,6.827,13.38,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,10,10.789,2.601,0.822,1.612,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,10,3.987,0.472,0.149,0.293,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,10,3.122,0.286,0.091,0.178,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,10,3.281,0.214,0.068,0.132,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,10,80,25.82,8.165,16.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,10,12.794,3.519,1.113,2.181,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,10,3.67,0.439,0.139,0.272,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,3.179,0.287,0.117,0.229,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,3.186,0.204,0.083,0.163,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,36.667,31.535,12.874,25.233,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,9.893,1.532,0.626,1.226,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,4.427,0.284,0.116,0.228,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,6,3.013,0.223,0.091,0.178,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,6,3.009,0.179,0.073,0.143,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,6,83.333,25.82,10.541,20.66,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,6,8.524,0.89,0.363,0.712,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,6,4.977,0.337,0.138,0.27,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,3,3.069,0.437,0.252,0.495,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,3,3.309,0.241,0.139,0.273,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,3,36.111,12.729,7.349,14.405,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,3,11.364,1.968,1.136,2.227,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,3,4.114,0.585,0.338,0.662,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.096,0.197,0.088,0.173,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,1.978,0.058,0.026,0.051,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,9.889,0.994,0.444,0.871,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,14.881,1.786,0.799,1.565,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,10,2.73,0.285,0.09,0.177,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,10,2.859,0.284,0.09,0.176,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,10,85,24.152,7.638,14.97,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,10,15.246,3.666,1.159,2.272,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,10,5.237,1.186,0.375,0.735,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,10,3.05,0.27,0.086,0.168,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,10,3.095,0.432,0.136,0.267,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,10,42.833,22.663,7.167,14.047,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,10,16.27,4.963,1.57,3.076,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,10,4.143,0.641,0.203,0.397,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,10,2.731,0.255,0.081,0.158,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,10,2.905,0.268,0.085,0.166,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,10,85,24.152,7.638,14.97,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,10,15.429,4.064,1.285,2.519,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,10,5.028,1.023,0.324,0.634,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,10,2.79,0.247,0.078,0.153,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,10,2.993,0.336,0.106,0.208,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,10,83.333,27.217,8.607,16.869,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,10,23.25,6.899,2.182,4.276,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,10,4.233,0.647,0.205,0.401,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,4,2.819,0.28,0.14,0.275,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,4,2.748,0.301,0.15,0.295,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,4,87.5,25,12.5,24.5,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,4,16.349,4.41,2.205,4.322,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,4,5.208,1.198,0.599,1.174,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.514,0.288,0.129,0.253,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,3.145,0.391,0.175,0.343,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,20.333,2.981,1.333,2.613,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,8.603,1.161,0.519,1.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,4.498,0.455,0.204,0.399,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,2.872,0.229,0.102,0.201,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,2.952,0.501,0.224,0.439,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,13,4.108,1.837,3.601,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,4.614,0.379,0.169,0.332,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.963,0.364,0.163,0.319,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,1.774,0.036,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,7.505,0.631,0.282,0.553,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,25.19,14.44,6.458,12.657,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.884,0.433,0.194,0.38,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,1.78,0.047,0.021,0.042,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,7.505,0.631,0.282,0.553,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,36.857,36.058,16.126,31.606,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.341,0.496,0.222,0.434,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,2.43,0.204,0.091,0.178,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,9.707,1.05,0.47,0.92,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,5.716,0.882,0.395,0.773,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,3.364,0.197,0.088,0.173,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,3.104,0.267,0.119,0.234,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,35.667,13.924,6.227,12.205,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,8.538,1.461,0.653,1.28,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,4.506,0.66,0.295,0.578,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,3,2.802,0.58,0.335,0.657,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,3,2.976,0.387,0.224,0.438,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,3,12.262,2.153,1.243,2.436,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,3,4.954,0.532,0.307,0.602,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,5,4.147,0.275,0.123,0.241,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,5,1.755,0.037,0.017,0.032,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,5,6.522,0.477,0.213,0.418,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,5,40.667,33.512,14.987,29.375,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh,4,4.055,0.274,0.137,0.268,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh,4,1.763,0.053,0.026,0.051,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh,4,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh,4,6.485,0.543,0.271,0.532,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh,4,44.583,37.352,18.676,36.605,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Mild,nnh,10,3.088,0.181,0.057,0.112,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Moderate,nnh,10,2.813,0.112,0.035,0.069,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,None,nnh,10,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Severe,nnh,10,8.368,0.572,0.181,0.354,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Uncertain,nnh,10,5.119,0.496,0.157,0.308,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Mild,nnh,20,3.195,0.357,0.08,0.157,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Moderate,nnh,20,2.448,0.128,0.029,0.056,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,None,nnh,20,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Severe,nnh,20,7.442,0.869,0.194,0.381,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Uncertain,nnh,20,7.476,1.312,0.293,0.575,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Mild,nnh,20,3.183,0.251,0.056,0.11,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Moderate,nnh,20,2.835,0.232,0.052,0.102,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,None,nnh,20,92.5,18.317,4.096,8.028,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Severe,nnh,20,9.958,2.025,0.453,0.888,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Uncertain,nnh,20,4.703,0.685,0.153,0.3,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,Mild,nnh,20,3.518,0.382,0.085,0.168,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Moderate,nnh,20,2.624,0.272,0.061,0.119,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,None,nnh,20,95,15.39,3.441,6.745,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Severe,nnh,20,10.566,2.042,0.456,0.895,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Uncertain,nnh,20,4.598,0.705,0.158,0.309,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Mild,nnh,15,2.828,0.244,0.063,0.123,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Moderate,nnh,15,2.748,0.28,0.072,0.141,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,None,nnh,15,100,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Severe,nnh,15,9.644,1.62,0.418,0.82,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Uncertain,nnh,15,6.058,1.166,0.301,0.59,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,Mild,nnh,20,3.117,0.233,0.052,0.102,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Moderate,nnh,20,2.72,0.176,0.039,0.077,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,None,nnh,20,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Severe,nnh,20,7.805,0.819,0.183,0.359,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Uncertain,nnh,20,5.688,0.719,0.161,0.315,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Mild,nnh,20,3.702,0.216,0.048,0.095,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Moderate,nnh,20,2.192,0.122,0.027,0.053,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,None,nnh,20,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Severe,nnh,20,5.65,0.442,0.099,0.194,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Uncertain,nnh,20,10.964,1.478,0.33,0.648,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Mild,nnh,20,3.593,0.579,0.129,0.254,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Moderate,nnh,20,2.187,0.215,0.048,0.094,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,None,nnh,20,100,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Severe,nnh,20,6.698,1.081,0.242,0.474,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Uncertain,nnh,20,11.003,2.778,0.621,1.218,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Mild,nnh,10,3.981,0.435,0.138,0.27,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Moderate,nnh,10,1.988,0.134,0.042,0.083,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,None,nnh,10,95,15.811,5,9.8,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Severe,nnh,10,5.034,0.568,0.18,0.352,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Uncertain,nnh,10,42.833,30.752,9.725,19.061,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Mild,nnh,20,3.776,0.492,0.11,0.215,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Moderate,nnh,20,3.021,0.236,0.053,0.104,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,None,nnh,20,29.583,17.703,3.959,7.759,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Severe,nnh,20,7.982,1.095,0.245,0.48,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Uncertain,nnh,20,4.35,0.307,0.069,0.134,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Mild,nnh,20,3.754,0.488,0.109,0.214,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Moderate,nnh,20,3.062,0.265,0.059,0.116,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,None,nnh,20,72.5,28.753,6.429,12.602,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Severe,nnh,20,7.23,1.035,0.231,0.454,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Uncertain,nnh,20,4.139,0.436,0.098,0.191,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Mild,nnh,10,3.569,0.482,0.152,0.299,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Moderate,nnh,10,2.247,0.209,0.066,0.129,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,None,nnh,10,65,24.152,7.638,14.97,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Severe,nnh,10,6.318,0.912,0.288,0.565,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Uncertain,nnh,10,11.802,3.42,1.082,2.12,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Mild,nnh,10,3.156,0.277,0.088,0.172,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Moderate,nnh,10,2.257,0.113,0.036,0.07,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,None,nnh,10,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Severe,nnh,10,9.128,1.088,0.344,0.675,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Uncertain,nnh,10,8.226,0.933,0.295,0.579,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Mild,nnh,20,2.9,0.266,0.06,0.117,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Moderate,nnh,20,2.464,0.264,0.059,0.116,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,None,nnh,20,97.5,11.18,2.5,4.9,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Severe,nnh,20,11.461,2.887,0.645,1.265,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Uncertain,nnh,20,7.057,1.577,0.353,0.691,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Mild,nnh,20,3.004,0.272,0.061,0.119,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Moderate,nnh,20,3.126,0.255,0.057,0.112,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,None,nnh,20,70,31.692,7.087,13.89,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Severe,nnh,20,10.425,2.904,0.649,1.273,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Uncertain,nnh,20,4.518,0.469,0.105,0.205,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Mild,nnh,20,3.489,0.323,0.072,0.142,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Moderate,nnh,20,2.643,0.186,0.042,0.082,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,None,nnh,20,36.25,8.667,1.938,3.798,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Severe,nnh,20,7.621,0.92,0.206,0.403,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Uncertain,nnh,20,6.24,1.485,0.332,0.651,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,Mild,nnh,13,3.275,0.242,0.067,0.132,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Moderate,nnh,13,3.203,0.285,0.079,0.155,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,None,nnh,13,27.051,9.407,2.609,5.114,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Severe,nnh,13,8.411,1.156,0.321,0.628,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Uncertain,nnh,13,4.719,0.791,0.219,0.43,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,Mild,nnh,15,3.087,0.25,0.065,0.127,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Moderate,nnh,15,3.413,0.409,0.106,0.207,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,None,nnh,15,59.444,26.515,6.846,13.418,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Severe,nnh,15,7.28,1.2,0.31,0.607,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Uncertain,nnh,15,4.691,0.542,0.14,0.274,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Mild,nnh,15,3.386,0.508,0.131,0.257,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Moderate,nnh,15,2.741,0.258,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,None,nnh,15,77.778,28.637,7.394,14.493,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Severe,nnh,15,9.236,1.916,0.495,0.969,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Uncertain,nnh,15,4.99,0.568,0.147,0.288,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,Mild,nnh,15,3.274,0.436,0.113,0.221,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Moderate,nnh,15,2.204,0.188,0.049,0.095,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,None,nnh,15,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Severe,nnh,15,8.633,1.252,0.323,0.634,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Uncertain,nnh,15,9.036,1.871,0.483,0.947,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Mild,nnh,20,3.137,0.266,0.06,0.117,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Moderate,nnh,20,2.863,0.208,0.046,0.091,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,None,nnh,20,92.5,18.317,4.096,8.028,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Severe,nnh,20,10.726,1.876,0.419,0.822,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Uncertain,nnh,20,4.513,0.493,0.11,0.216,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,Mild,nnh,10,3.475,0.094,0.03,0.059,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Moderate,nnh,10,1.833,0.053,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,None,nnh,10,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Severe,nnh,10,8.27,0.81,0.256,0.502,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Uncertain,nnh,10,24.167,6.632,2.097,4.11,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Mild,nnh,20,3.039,0.204,0.046,0.089,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Moderate,nnh,20,1.996,0.09,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,None,nnh,20,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Severe,nnh,20,10.337,1.48,0.331,0.649,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Uncertain,nnh,20,15.802,4.868,1.088,2.133,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Mild,nnh,20,4.225,0.383,0.086,0.168,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Moderate,nnh,20,1.776,0.05,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,None,nnh,20,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Severe,nnh,20,6.372,0.398,0.089,0.175,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Uncertain,nnh,20,28.298,17.375,3.885,7.615,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Mild,nnh,15,3.31,0.266,0.069,0.135,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Moderate,nnh,15,2.4,0.1,0.026,0.051,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,None,nnh,15,95.556,17.213,4.444,8.711,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Severe,nnh,15,7.162,1.247,0.322,0.631,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Uncertain,nnh,15,7.915,1.271,0.328,0.643,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Mild,nnh,15,4.67,0.534,0.138,0.27,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Moderate,nnh,15,2.067,0.091,0.024,0.046,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,None,nnh,15,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Severe,nnh,15,6.202,0.935,0.241,0.473,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Uncertain,nnh,15,7.67,0.917,0.237,0.464,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,Mild,nnh,5,3.155,0.196,0.088,0.172,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Moderate,nnh,5,2.449,0.176,0.079,0.154,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,None,nnh,5,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Severe,nnh,5,6.216,0.593,0.265,0.52,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Uncertain,nnh,5,9.722,2.291,1.024,2.008,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Mild,nnh,13,3.662,0.403,0.112,0.219,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Moderate,nnh,13,2.039,0.133,0.037,0.072,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,None,nnh,13,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Severe,nnh,13,5.78,0.687,0.191,0.374,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Uncertain,nnh,13,20.293,9.678,2.684,5.261,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,Mild,nnh,10,3.463,0.322,0.102,0.199,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Moderate,nnh,10,2.875,0.24,0.076,0.149,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,None,nnh,10,69.167,34.258,10.833,21.233,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Severe,nnh,10,5.608,0.577,0.183,0.358,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Uncertain,nnh,10,6.258,0.578,0.183,0.358,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Mild,nnh,10,4.421,0.716,0.226,0.444,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Moderate,nnh,10,1.823,0.129,0.041,0.08,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,None,nnh,10,100,0,0,0,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Severe,nnh,10,6.131,0.873,0.276,0.541,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Uncertain,nnh,10,23.333,5.827,1.843,3.611,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Mild,nnh,20,3.86,0.334,0.075,0.147,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Moderate,nnh,20,2.498,0.135,0.03,0.059,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,None,nnh,20,31.167,17.705,3.959,7.76,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Severe,nnh,20,4.512,0.291,0.065,0.128,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Uncertain,nnh,20,13.639,3.968,0.887,1.739,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Mild,nnh,10,3.552,0.153,0.048,0.095,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Moderate,nnh,10,2.674,0.207,0.065,0.128,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,None,nnh,10,30,8.958,2.833,5.552,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Severe,nnh,10,4.901,0.813,0.257,0.504,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Uncertain,nnh,10,10.783,2.61,0.825,1.618,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.261,0.04,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.257,0.077,0.044,0.087,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.245,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.139,0.189,0.109,0.214,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.697,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.01,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.083,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.365,0.059,0.034,0.066,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,2.784,0.155,0.089,0.175,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.311,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.113,0.026,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.889,0.962,0.556,1.089,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.049,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.335,0.053,0.031,0.06,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.295,0.134,0.077,0.152,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.862,2.036,1.176,2.304,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.245,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.151,0.164,0.095,0.186,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.586,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.268,0.057,0.033,0.064,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.114,0.068,0.039,0.076,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,11.126,2.774,1.602,3.139,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.23,0.032,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.104,0.132,0.076,0.15,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.159,0.807,0.466,0.914,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.21,0.034,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.048,0.147,0.085,0.166,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.906,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.252,0.057,0.033,0.065,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.086,0.087,0.05,0.098,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.295,0.067,0.03,0.059,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,2.175,0.059,0.026,0.052,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,9.485,0.755,0.337,0.661,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.023,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.347,0.041,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.321,0.13,0.041,0.08,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,13.492,1.571,0.497,0.974,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.014,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.357,0.062,0.028,0.054,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,2.484,0.17,0.076,0.149,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,10.152,1.341,0.6,1.175,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.008,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.304,0.046,0.016,0.032,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,2.142,0.212,0.075,0.147,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,9.36,2.235,0.79,1.549,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1.011,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.416,0.041,0.024,0.047,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.293,0.107,0.062,0.121,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,11.574,0.802,0.463,0.907,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.02,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.232,0.047,0.017,0.033,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,2.042,0.171,0.061,0.119,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,12.208,2.583,0.913,1.79,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1.005,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.349,0.038,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.308,0.195,0.062,0.121,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,11.636,2.419,0.765,1.5,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.008,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.338,0.047,0.015,0.029,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.332,0.125,0.04,0.078,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,10.373,1.749,0.553,1.084,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.01,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.373,0.051,0.016,0.032,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.459,0.261,0.083,0.162,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,14.599,2.371,0.75,1.47,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.013,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.27,0.034,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.03,0.151,0.048,0.093,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,10.842,2.342,0.741,1.451,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.007,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.289,0.034,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.072,0.189,0.06,0.117,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,10.891,2.693,0.852,1.669,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.011,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.322,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.329,0.108,0.063,0.123,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,10.067,1.012,0.584,1.145,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.075,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.299,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,2.633,0.069,0.04,0.078,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.352,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.231,0.174,0.1,0.197,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.394,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.024,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.295,0.065,0.038,0.074,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.231,0.174,0.1,0.197,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.205,0.025,0.014,0.028,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.046,0.125,0.072,0.142,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.586,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.267,0.043,0.025,0.049,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.274,0.052,0.03,0.058,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,10.067,1.012,0.584,1.145,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.22,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.045,0.108,0.062,0.122,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.12,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.186,0.016,0.009,0.019,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.992,0.123,0.071,0.14,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.906,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.23,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.165,0.145,0.084,0.164,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.333,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.288,0.025,0.015,0.029,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.987,0.046,0.027,0.052,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.462,1.332,0.769,1.508,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.042,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.095,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.305,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,2.615,0.152,0.088,0.172,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.01,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.366,0.041,0.014,0.028,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,2.071,0.095,0.033,0.066,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,8.098,0.998,0.353,0.692,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1.05,0.013,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.389,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.114,0.052,0.03,0.059,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.031,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.346,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.222,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.095,1.65,0.952,1.867,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.031,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.376,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.223,0.049,0.029,0.056,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,10.104,3.636,2.099,4.114,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.038,0.016,0.009,0.019,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.323,0.044,0.026,0.05,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.146,0.094,0.054,0.106,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.442,1.125,0.649,1.273,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.027,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.4,0.051,0.016,0.032,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.208,0.116,0.037,0.072,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,8.155,0.925,0.292,0.573,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.041,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.352,0.018,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.07,0.065,0.037,0.073,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.937,1.803,1.041,2.04,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.038,0.013,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.272,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.989,0.084,0.048,0.095,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,6.843,1.954,1.128,2.211,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.027,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.358,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.042,0.071,0.041,0.08,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.222,0.962,0.556,1.089,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.042,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.104,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,1.33,0.084,0.038,0.074,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,2.518,0.324,0.145,0.284,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.004,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.402,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.159,0.055,0.032,0.062,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.425,1.067,0.616,1.207,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.031,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.333,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.115,0.095,0.055,0.107,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.639,2.055,1.187,2.326,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.346,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.13,0.091,0.052,0.103,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.555,1.609,0.929,1.82,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.365,0.042,0.024,0.048,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.229,0.149,0.086,0.169,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.928,1.162,0.671,1.315,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.017,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.305,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.949,0.057,0.033,0.065,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,5.828,0.74,0.428,0.838,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.327,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.949,0.057,0.033,0.065,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,6.035,0.571,0.33,0.646,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,9,1.289,0.038,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,9,2.53,0.208,0.069,0.136,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,9,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,9,11.79,2.032,0.677,1.328,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,9,1.009,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.284,0.066,0.038,0.075,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.239,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.697,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.076,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.403,0.041,0.024,0.047,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,3.394,0.34,0.196,0.385,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.339,0.021,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.384,0.096,0.055,0.109,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.394,0.525,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.027,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.316,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.309,0.062,0.036,0.071,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.889,0.962,0.556,1.089,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.137,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.767,0.074,0.043,0.083,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.235,0.04,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.119,0.139,0.08,0.157,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.975,2.219,1.281,2.512,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.155,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,1.819,0.052,0.023,0.046,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,8.97,0.689,0.308,0.604,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.006,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.132,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.767,0.079,0.046,0.09,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.625,0.807,0.466,0.914,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.217,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,2.078,0.092,0.041,0.081,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,9.677,0.893,0.399,0.783,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.006,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.285,0.074,0.043,0.084,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.093,0.175,0.101,0.198,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,10.067,1.012,0.584,1.145,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.327,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.299,0.176,0.102,0.199,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,11.667,1.443,0.833,1.633,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.214,0.044,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.052,0.128,0.041,0.079,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,11.136,2.45,0.775,1.519,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.006,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.255,0.032,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,2.168,0.17,0.06,0.117,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,8.799,1.735,0.614,1.203,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1.009,0.007,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.352,0.039,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.387,0.177,0.056,0.11,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,15.718,4.417,1.397,2.738,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.011,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.19,0.042,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.053,0.139,0.044,0.086,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,11.636,3.116,0.985,1.932,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.003,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.282,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.241,0.077,0.045,0.088,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.723,0.596,0.344,0.674,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.035,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.092,0.011,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,1.33,0.074,0.033,0.065,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,2.476,0.288,0.129,0.253,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.002,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.389,0.019,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.464,0.145,0.084,0.164,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,10.53,1.765,1.019,1.998,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.06,0.024,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.403,0.038,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.533,0.194,0.061,0.12,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,11.781,2.617,0.828,1.622,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.007,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.289,0.047,0.027,0.053,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.318,0.186,0.108,0.211,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,11.204,1.253,0.723,1.417,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.017,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.281,0.038,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,2.235,0.185,0.066,0.128,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,7.996,1.222,0.432,0.847,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1.011,0.007,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.401,0.061,0.019,0.038,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.553,0.203,0.064,0.126,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,15.147,4.164,1.317,2.581,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.02,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.282,0.028,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.26,0.114,0.066,0.129,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.017,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.282,0.028,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.242,0.102,0.059,0.116,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.016,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.282,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.256,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.586,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.021,0.018,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.263,0.03,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,2.329,0.105,0.047,0.092,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,10.5,1.118,0.5,0.98,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.002,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.2,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.976,0.093,0.053,0.105,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.625,0.807,0.466,0.914,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.068,0.007,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.411,0.073,0.042,0.083,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,3.622,0.204,0.118,0.231,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.364,0.039,0.023,0.044,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.146,0.094,0.054,0.106,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.009,0.592,0.342,0.67,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.049,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.365,0.058,0.033,0.065,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.384,0.096,0.055,0.109,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.12,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.087,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.695,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,10.278,2.097,1.211,2.373,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.288,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.362,0.032,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.017,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.265,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.225,0.137,0.043,0.085,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,13.024,2.366,0.748,1.467,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.014,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.067,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,1.644,0.039,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,9.59,0.857,0.303,0.594,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.064,0.011,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.588,0.05,0.029,0.057,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.586,1.723,0.995,1.95,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.167,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.807,0.019,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.141,0.834,0.482,0.944,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.282,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.328,0.096,0.055,0.109,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.838,0.437,0.253,0.495,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,5,1.248,0.048,0.021,0.042,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,5,2.065,0.137,0.061,0.12,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,5,9.333,0.913,0.408,0.8,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,5,1.002,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.091,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.46,0.159,0.092,0.18,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,4.013,0.29,0.167,0.328,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.339,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.174,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.12,0.37,0.214,0.419,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.042,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.301,0.062,0.036,0.07,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.206,0.028,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,7.692,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.042,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.588,0.043,0.025,0.049,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.091,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,10,1.26,0.055,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,10,2.352,0.196,0.062,0.121,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,10,13.968,4.478,1.416,2.776,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,10,1.009,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.06,0.024,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,1.439,0.025,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,7.438,0.514,0.182,0.356,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,8,1.048,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,8,1.394,0.034,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,8,6.182,0.443,0.156,0.307,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.182,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.98,0.139,0.08,0.157,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,9.764,1.166,0.673,1.32,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,4,1.242,0.015,0.008,0.015,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,4,2.129,0.065,0.033,0.064,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,4,10.051,0.827,0.413,0.81,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,4,1.026,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.167,0.008,0.005,0.009,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.757,0.083,0.048,0.094,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,6.564,0.636,0.367,0.72,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.031,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.084,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.39,0.058,0.033,0.066,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,2.81,0.16,0.092,0.181,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.288,0.034,0.02,0.039,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.03,0.093,0.053,0.105,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,6.825,0.275,0.159,0.311,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.064,0.02,0.011,0.022,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.282,0.016,0.009,0.019,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.002,0.068,0.039,0.077,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,5.773,0.189,0.109,0.214,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.02,0.01,0.006,0.012,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.22,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.808,0.05,0.029,0.056,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,6.144,0.453,0.261,0.512,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.322,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,2.292,0.081,0.047,0.092,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,8.372,0.7,0.404,0.792,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.028,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.141,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.579,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,4.353,0.189,0.109,0.214,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.035,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.128,0.02,0.011,0.022,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.571,0.028,0.016,0.032,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,4.486,0.239,0.138,0.271,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.035,0.006,0.004,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,nnh_cumulative,3,1.141,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,nnh_cumulative,3,1.571,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,nnh_cumulative,3,4.227,0.105,0.06,0.118,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,nnh_cumulative,3,1.035,0.012,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.289,0.044,0.018,0.035,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.098,0.234,0.096,0.187,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,8.757,0.808,0.33,0.646,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.008,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.359,0.055,0.023,0.044,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.174,0.142,0.058,0.113,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,9.531,1.338,0.546,1.071,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.036,0.011,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,8,1.265,0.038,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,8,2.194,0.147,0.052,0.102,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,8,11.955,2.796,0.988,1.937,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,8,1.004,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.332,0.048,0.019,0.038,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.376,0.107,0.044,0.086,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,11.323,2.715,1.109,2.173,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.005,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,8,1.369,0.055,0.019,0.038,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,8,2.461,0.238,0.084,0.165,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,8,14.335,2.186,0.773,1.515,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,8,1.014,0.013,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,4,1.389,0.028,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,4,2.341,0.22,0.11,0.216,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,4,10.933,2.51,1.255,2.46,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,4,1.036,0.014,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.334,0.039,0.016,0.031,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.388,0.14,0.057,0.112,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,11.667,4.303,1.757,3.443,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.008,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.357,0.058,0.026,0.051,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,2.349,0.063,0.028,0.056,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,12.024,1.4,0.626,1.227,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.029,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.347,0.05,0.02,0.04,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.03,0.087,0.035,0.069,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,8.028,1.168,0.477,0.935,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.047,0.012,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,4,1.299,0.019,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,4,2.326,0.044,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,4,11.12,2.883,1.441,2.825,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,4,1.008,0.01,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,9,1.309,0.042,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,9,2.441,0.073,0.024,0.048,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,9,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,9,13.051,1.763,0.588,1.152,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,9,1.011,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,3,1.31,0.026,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,3,2.415,0.23,0.133,0.26,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,3,12.037,0.802,0.463,0.907,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,3,1.014,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.189,0.049,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,1.956,0.082,0.037,0.072,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,12.231,3.162,1.414,2.771,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.002,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.241,0.051,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.125,0.174,0.071,0.139,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,9.061,1.961,0.801,1.569,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.01,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.26,0.039,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,2.154,0.153,0.068,0.134,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,11.246,2.284,1.021,2.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.012,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.398,0.04,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,2.34,0.098,0.044,0.086,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,13.222,3.931,1.758,3.446,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.031,0.008,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.253,0.034,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.177,0.168,0.068,0.134,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,10.543,1.536,0.627,1.229,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.01,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,3,1.3,0.057,0.033,0.065,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,3,2.213,0.153,0.088,0.173,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,3,12.753,3.794,2.191,4.294,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,3,1.014,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.378,0.035,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,2.445,0.132,0.059,0.116,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,11.857,1.845,0.825,1.618,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.046,0.01,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.249,0.044,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.202,0.176,0.072,0.14,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,11.25,2.97,1.212,2.376,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.01,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,10,1.388,0.048,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,10,2.53,0.22,0.07,0.136,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,10,10.789,2.601,0.822,1.612,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,10,1.026,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,10,1.406,0.054,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,10,2.583,0.186,0.059,0.115,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,10,12.794,3.519,1.113,2.181,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,10,1.012,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.361,0.026,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.398,0.144,0.059,0.115,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,9.893,1.532,0.626,1.226,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.04,0.018,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,6,1.274,0.017,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,6,2.222,0.139,0.057,0.111,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,6,8.524,0.89,0.363,0.712,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,6,1.014,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,3,1.385,0.079,0.045,0.089,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,3,2.547,0.138,0.08,0.156,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,3,11.364,1.968,1.136,2.227,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,3,1.031,0.011,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.073,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,1.646,0.048,0.022,0.042,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,9.889,0.994,0.444,0.871,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,10,1.265,0.057,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,10,2.384,0.189,0.06,0.117,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,10,15.246,3.666,1.159,2.272,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,10,1.009,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,10,1.382,0.071,0.022,0.044,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,10,2.56,0.275,0.087,0.171,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,10,16.27,4.963,1.57,3.076,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,10,1.029,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,10,1.276,0.057,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,10,2.417,0.179,0.057,0.111,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,10,15.429,4.064,1.285,2.519,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,10,1.009,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,10,1.342,0.054,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,10,2.617,0.233,0.074,0.144,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,10,23.25,6.899,2.182,4.276,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,10,1.013,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,4,1.268,0.064,0.032,0.063,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,4,2.338,0.283,0.141,0.277,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,4,16.349,4.41,2.205,4.322,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,4,1.01,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.379,0.048,0.021,0.042,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,2.282,0.165,0.074,0.144,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,8.603,1.161,0.519,1.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.053,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.286,0.032,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,2.359,0.262,0.117,0.23,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,13,4.108,1.837,3.601,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.004,0.006,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.051,0.021,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,1.433,0.017,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,7.505,0.631,0.282,0.553,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.047,0.026,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,1.437,0.024,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,7.505,0.631,0.282,0.553,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.217,0.036,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,1.94,0.149,0.067,0.131,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,9.707,1.05,0.47,0.92,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.349,0.048,0.022,0.042,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,2.263,0.18,0.08,0.157,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,8.538,1.461,0.653,1.28,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1.033,0.014,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,3,1.266,0.027,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,3,2.391,0.31,0.179,0.351,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,3,12.262,2.153,1.243,2.436,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,3,1.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,5,1.035,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,5,1.381,0.017,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,5,6.522,0.477,0.213,0.418,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,nnh_cumulative,4,1.031,0.023,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,nnh_cumulative,4,1.384,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,nnh_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,nnh_cumulative,4,6.485,0.543,0.271,0.532,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,nnh_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Mild,nnh_cumulative,10,1.249,0.03,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Moderate,nnh_cumulative,10,2.103,0.075,0.024,0.046,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Severe,nnh_cumulative,10,8.368,0.572,0.181,0.354,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1.002,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Mild,nnh_cumulative,20,1.161,0.036,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Moderate,nnh_cumulative,20,1.838,0.097,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Severe,nnh_cumulative,20,7.442,0.869,0.194,0.381,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Mild,nnh_cumulative,20,1.291,0.046,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.183,0.126,0.028,0.055,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Severe,nnh_cumulative,20,9.958,2.025,0.453,0.888,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.008,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,Mild,nnh_cumulative,20,1.3,0.05,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.081,0.154,0.034,0.067,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Severe,nnh_cumulative,20,10.566,2.042,0.456,0.895,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.008,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Mild,nnh_cumulative,15,1.207,0.04,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Moderate,nnh_cumulative,15,2.126,0.175,0.045,0.089,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,None,nnh_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Severe,nnh_cumulative,15,9.644,1.62,0.418,0.82,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Uncertain,nnh_cumulative,15,1.001,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,Mild,nnh_cumulative,20,1.219,0.034,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.01,0.092,0.021,0.04,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Severe,nnh_cumulative,20,7.805,0.819,0.183,0.359,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.001,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Mild,nnh_cumulative,20,1.103,0.018,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Moderate,nnh_cumulative,20,1.575,0.052,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Severe,nnh_cumulative,20,5.65,0.442,0.099,0.194,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.001,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Mild,nnh_cumulative,20,1.113,0.037,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Moderate,nnh_cumulative,20,1.64,0.151,0.034,0.066,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Severe,nnh_cumulative,20,6.698,1.081,0.242,0.474,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.003,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Mild,nnh_cumulative,10,1.042,0.015,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Moderate,nnh_cumulative,10,1.418,0.054,0.017,0.033,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Severe,nnh_cumulative,10,5.034,0.568,0.18,0.352,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1.008,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Mild,nnh_cumulative,20,1.372,0.04,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.179,0.13,0.029,0.057,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Severe,nnh_cumulative,20,7.982,1.095,0.245,0.48,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.041,0.013,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Mild,nnh_cumulative,20,1.352,0.045,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.141,0.18,0.04,0.079,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Severe,nnh_cumulative,20,7.23,1.035,0.231,0.454,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.016,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Mild,nnh_cumulative,10,1.119,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Moderate,nnh_cumulative,10,1.645,0.078,0.025,0.048,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Severe,nnh_cumulative,10,6.318,0.912,0.288,0.565,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1.016,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Mild,nnh_cumulative,10,1.145,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Moderate,nnh_cumulative,10,1.803,0.054,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Severe,nnh_cumulative,10,9.128,1.088,0.344,0.675,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1.003,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Mild,nnh_cumulative,20,1.179,0.044,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.002,0.143,0.032,0.063,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Severe,nnh_cumulative,20,11.461,2.887,0.645,1.265,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.003,0.006,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Mild,nnh_cumulative,20,1.319,0.044,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.375,0.189,0.042,0.083,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Severe,nnh_cumulative,20,10.425,2.904,0.649,1.273,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.018,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Mild,nnh_cumulative,20,1.25,0.059,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Moderate,nnh_cumulative,20,1.957,0.122,0.027,0.053,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Severe,nnh_cumulative,20,7.621,0.92,0.206,0.403,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.03,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,Mild,nnh_cumulative,13,1.349,0.049,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Moderate,nnh_cumulative,13,2.307,0.156,0.043,0.085,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,None,nnh_cumulative,13,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Severe,nnh_cumulative,13,8.411,1.156,0.321,0.628,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Uncertain,nnh_cumulative,13,1.043,0.014,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,Mild,nnh_cumulative,15,1.311,0.048,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Moderate,nnh_cumulative,15,2.302,0.217,0.056,0.11,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,None,nnh_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Severe,nnh_cumulative,15,7.28,1.2,0.31,0.607,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Uncertain,nnh_cumulative,15,1.02,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Mild,nnh_cumulative,15,1.28,0.035,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Moderate,nnh_cumulative,15,2.099,0.201,0.052,0.102,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,None,nnh_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Severe,nnh_cumulative,15,9.236,1.916,0.495,0.969,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Uncertain,nnh_cumulative,15,1.016,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,Mild,nnh_cumulative,15,1.131,0.028,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Moderate,nnh_cumulative,15,1.746,0.106,0.027,0.054,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,None,nnh_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Severe,nnh_cumulative,15,8.633,1.252,0.323,0.634,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Uncertain,nnh_cumulative,15,1.001,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Mild,nnh_cumulative,20,1.303,0.043,0.01,0.019,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Moderate,nnh_cumulative,20,2.245,0.137,0.031,0.06,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Severe,nnh_cumulative,20,10.726,1.876,0.419,0.822,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.008,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,Mild,nnh_cumulative,10,1.046,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Moderate,nnh_cumulative,10,1.497,0.014,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Severe,nnh_cumulative,10,8.27,0.81,0.256,0.502,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Mild,nnh_cumulative,20,1.075,0.024,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Moderate,nnh_cumulative,20,1.668,0.066,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Severe,nnh_cumulative,20,10.337,1.48,0.331,0.649,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Mild,nnh_cumulative,20,1.042,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Moderate,nnh_cumulative,20,1.388,0.033,0.007,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Severe,nnh_cumulative,20,6.372,0.398,0.089,0.175,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Mild,nnh_cumulative,15,1.158,0.029,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Moderate,nnh_cumulative,15,1.789,0.073,0.019,0.037,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,None,nnh_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Severe,nnh_cumulative,15,7.162,1.247,0.322,0.631,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Uncertain,nnh_cumulative,15,1.007,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Mild,nnh_cumulative,15,1.156,0.021,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Moderate,nnh_cumulative,15,1.544,0.067,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,None,nnh_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Severe,nnh_cumulative,15,6.202,0.935,0.241,0.473,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Uncertain,nnh_cumulative,15,1.003,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,Mild,nnh_cumulative,5,1.124,0.033,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Moderate,nnh_cumulative,5,1.752,0.095,0.043,0.084,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,None,nnh_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Severe,nnh_cumulative,5,6.216,0.593,0.265,0.52,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Uncertain,nnh_cumulative,5,1.002,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Mild,nnh_cumulative,13,1.06,0.017,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Moderate,nnh_cumulative,13,1.501,0.073,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,None,nnh_cumulative,13,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Severe,nnh_cumulative,13,5.78,0.687,0.191,0.374,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Uncertain,nnh_cumulative,13,1.001,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,Mild,nnh_cumulative,10,1.219,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Moderate,nnh_cumulative,10,1.894,0.126,0.04,0.078,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Severe,nnh_cumulative,10,5.608,0.577,0.183,0.358,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1.019,0.02,0.006,0.013,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Mild,nnh_cumulative,10,1.054,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Moderate,nnh_cumulative,10,1.4,0.095,0.03,0.059,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Severe,nnh_cumulative,10,6.131,0.873,0.276,0.541,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1.006,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Mild,nnh_cumulative,20,1.13,0.02,0.005,0.009,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Moderate,nnh_cumulative,20,1.605,0.067,0.015,0.03,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,None,nnh_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Severe,nnh_cumulative,20,4.512,0.291,0.065,0.128,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Uncertain,nnh_cumulative,20,1.039,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Mild,nnh_cumulative,10,1.154,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Moderate,nnh_cumulative,10,1.712,0.074,0.023,0.046,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,None,nnh_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Severe,nnh_cumulative,10,4.901,0.813,0.257,0.504,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Uncertain,nnh_cumulative,10,1.037,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,109.333,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,67.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized,3,3612.667,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,419.667,6.658,3.844,7.535,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,111.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,71,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3675.667,9.238,5.333,10.453,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,13.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,353.667,7.638,4.41,8.643,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,257,25.632,14.799,29.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,229,19.519,11.269,22.088,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,3,2979,78.886,45.545,89.268,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,100.333,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,647,38,21.939,43.001,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,99.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,67.667,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3743.667,5.859,3.383,6.631,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,297,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,93.667,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,70.333,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3674,15.875,9.165,17.964,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,368.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,110,4,2.309,4.526,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,80,6.557,3.786,7.42,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3574.333,13.577,7.839,15.364,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,17,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,442,6.928,4,7.84,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,96,5.196,3,5.88,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,64.333,6.028,3.48,6.821,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,3,3646.667,16.197,9.351,18.328,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,12.333,4.619,2.667,5.227,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,403.333,14.64,8.452,16.567,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,110.667,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,80.667,4.163,2.404,4.711,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,3,3593.333,17.214,9.939,19.48,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,17.667,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,422,12,6.928,13.579,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,115.333,4.619,2.667,5.227,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,82.667,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3574,11.269,6.506,12.753,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,18,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,434.333,10.066,5.812,11.391,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,101.667,4.163,2.404,4.711,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,80.333,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3625,16.462,9.504,18.629,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,16.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,401.667,13.317,7.688,15.069,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,116.8,23.382,10.457,20.495,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,72.4,16.38,7.325,14.358,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,5,3672.2,51.183,22.89,44.864,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,13.2,2.28,1.02,1.999,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,348.4,12.442,5.564,10.906,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,106,7.846,2.481,4.863,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,71.2,6.779,2.144,4.202,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized,10,3706.8,10.119,3.2,6.272,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,9.8,1.135,0.359,0.704,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,328.4,7.545,2.386,4.677,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,100.6,5.32,2.379,4.663,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,64.2,6.496,2.905,5.694,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized,5,3627.4,6.95,3.108,6.092,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,13.8,1.095,0.49,0.96,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,417.2,7.791,3.484,6.829,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,114.375,4.565,1.614,3.163,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,72.125,6.49,2.295,4.498,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,8,3636.125,18.939,6.696,13.124,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,15.125,2.357,0.833,1.633,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,386.125,17.988,6.36,12.465,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,98.333,4.619,2.667,5.227,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,71.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3632,15.62,9.018,17.676,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,12.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,407.667,16.862,9.735,19.081,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,114.875,5.915,2.091,4.099,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,80.5,5.425,1.918,3.759,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,8,3516.625,14.51,5.13,10.055,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,12.875,1.885,0.666,1.306,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,497.625,12.546,4.436,8.694,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,109.4,5.441,1.72,3.372,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,66.7,3.713,1.174,2.302,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,10,3574.5,17.232,5.449,10.681,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,10.8,2.251,0.712,1.395,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,460.6,16.133,5.102,9.999,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,119.1,11.249,3.557,6.972,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,69.8,14.25,4.506,8.832,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,10,3600.6,35.669,11.279,22.108,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,19.4,7.604,2.405,4.713,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,411.7,16.132,5.101,9.999,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,98.3,4.322,1.367,2.679,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,67.4,6.867,2.172,4.256,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized,10,3635.3,11.48,3.63,7.115,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,10.7,1.418,0.448,0.879,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,410.9,11.249,3.557,6.972,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,114.2,7.115,2.25,4.41,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,78.8,5.653,1.788,3.504,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,10,3555.9,21.85,6.91,13.543,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,14.3,2.497,0.79,1.547,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,459.7,16.793,5.31,10.409,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,108.6,6.931,2.192,4.296,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,78.2,6.268,1.982,3.885,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,10,3589.6,13.377,4.23,8.291,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,13.9,3.107,0.983,1.926,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,432.6,12.886,4.075,7.987,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,99.333,2.887,1.667,3.267,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,79.667,5.859,3.383,6.631,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized,3,3571.667,10.017,5.783,11.335,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,18.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,455,12.288,7.095,13.905,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,311.333,30.73,17.742,34.774,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,304.667,21.221,12.252,24.014,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,3,2712.333,90.456,52.225,102.361,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,118.333,14.844,8.57,16.797,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,769,22.605,13.051,25.58,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,102.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,69.667,2.887,1.667,3.267,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3725,12.53,7.234,14.179,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,311.333,7.506,4.333,8.493,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,103,6,3.464,6.79,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,68.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3618.667,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,16.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,416.667,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,127.333,8.386,4.842,9.49,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,94.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3435.333,9.074,5.239,10.268,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,23.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,543,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,129,11.79,6.807,13.341,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,71,4.583,2.646,5.186,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,3,3568.333,18.583,10.729,21.029,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,440.667,8.083,4.667,9.147,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,132.667,13.868,8.007,15.694,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,99.667,8.021,4.631,9.076,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,3,3436.667,20.008,11.552,22.642,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,24.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,530.333,4.163,2.404,4.711,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,136,13.528,7.81,15.308,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,101.667,9.018,5.207,10.205,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3416,20.075,11.59,22.717,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,26.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,543.667,7.506,4.333,8.493,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,120,6.245,3.606,7.067,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,91.667,5.859,3.383,6.631,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3502.333,21.008,12.129,23.773,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,24,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,486.333,13.577,7.839,15.364,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,114.333,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,74.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3790,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,230.333,2.309,1.333,2.613,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,314,26.889,15.524,30.427,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,290.667,19.296,11.141,21.835,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,3,2881.333,80.432,46.438,91.018,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,108.667,9.238,5.333,10.453,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,615.667,36.226,20.915,40.994,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,109.5,6.568,2.322,4.552,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,67.75,3.694,1.306,2.56,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,8,3774.375,18.508,6.544,12.826,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,17.25,1.982,0.701,1.373,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,253.25,13.987,4.945,9.693,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,102.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,70.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3758,12.53,7.234,14.179,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,16.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,273.667,8.083,4.667,9.147,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,105.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,72.333,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3760.333,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,18,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,266.333,6.429,3.712,7.275,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,104.667,8.386,4.842,9.49,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,67,5.292,3.055,5.988,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3770.667,17.243,9.955,19.513,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,14.667,2.309,1.333,2.613,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,264.333,8.505,4.91,9.624,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,118.333,15.373,8.876,17.396,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,68.667,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,3,3711.667,42.852,24.741,48.492,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,20.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,299.333,25.891,14.948,29.298,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,107.6,6.736,2.13,4.175,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,68.3,4.785,1.513,2.966,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized,10,3773.3,11.344,3.587,7.031,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,17.2,2.044,0.646,1.267,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,255.4,11.881,3.757,7.364,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,108.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,73.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,3,3791.333,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,18.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,229.667,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,139,6.928,4,7.84,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,91.333,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3707,14.799,8.544,16.746,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,21,4.359,2.517,4.933,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,263.667,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,109.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,75.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3791.667,5.033,2.906,5.696,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,19.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,226.333,2.887,1.667,3.267,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,343.8,61.694,27.591,54.078,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,304.8,63.614,28.449,55.76,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,5,2734.6,232.122,103.808,203.464,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,127,27.175,12.153,23.82,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,703,78.74,35.214,69.019,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,102.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,69.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3763,14.731,8.505,16.67,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,18.333,2.309,1.333,2.613,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,269.667,10.116,5.84,11.447,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,105.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,72.333,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3719.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,19,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,308,6,3.464,6.79,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,105.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,78.333,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3715,14.107,8.145,15.963,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,18.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,306.333,20.232,11.681,22.895,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,113,6.928,4,7.84,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,89.333,17.954,10.366,20.316,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,3,3680.333,15.011,8.667,16.987,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,16.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,321.333,4.163,2.404,4.711,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,116.667,2.887,1.667,3.267,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,81.333,7.572,4.372,8.568,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3706,14.933,8.622,16.898,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,22.667,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,298.333,15.535,8.969,17.579,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,109.333,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,80.333,8.386,4.842,9.49,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3730.333,20.502,11.837,23.2,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,22,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,284,14.731,8.505,16.67,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized,9,106,5.362,1.787,3.503,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized,9,74,6.727,2.242,4.395,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized,9,3529.222,12.647,4.216,8.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized,9,14,3.742,1.247,2.445,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,9,499.778,15.213,5.071,9.939,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,116.333,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,77,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3579.667,9.713,5.608,10.991,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,14,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,438.667,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,229.333,11.59,6.692,13.116,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,186.667,8.145,4.702,9.216,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3002.333,14.048,8.11,15.896,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,65.667,8.622,4.978,9.756,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,733.667,6.028,3.48,6.821,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,98.667,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,66,4.583,2.646,5.186,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3727,14.422,8.327,16.32,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,13.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,318,14,8.083,15.842,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,106.333,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,70,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3586,15.1,8.718,17.087,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,14,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,446.333,18.556,10.713,20.998,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,160.667,9.074,5.239,10.268,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,114,3,1.732,3.395,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3286.333,8.963,5.175,10.142,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,13.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,649,4,2.309,4.526,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,127.667,6.807,3.93,7.703,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,76,14.731,8.505,16.67,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,3,3542,29.547,17.059,33.435,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15,3.464,2,3.92,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,463,18.193,10.504,20.588,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,159.2,8.349,3.734,7.318,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,114,2.345,1.049,2.056,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,5,3294,12.942,5.788,11.344,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,14.6,1.14,0.51,0.999,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,642,6.892,3.082,6.041,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,170.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,122,9.539,5.508,10.795,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3256.333,11.372,6.566,12.869,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,659.667,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,129.8,4.324,1.934,3.79,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,96.4,5.177,2.315,4.538,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,5,3434.4,12.522,5.6,10.976,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,14.6,2.408,1.077,2.111,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,546.8,5.805,2.596,5.088,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,123,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,70.667,10.116,5.84,11.447,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3602.333,12.858,7.424,14.55,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,12,3.464,2,3.92,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,414.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,115,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,65.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3565.667,12.423,7.172,14.058,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,10.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,464,6.083,3.512,6.883,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,132,3.887,1.229,2.409,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,65.9,5.043,1.595,3.126,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,10,3502,8.11,2.565,5.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,11,1.944,0.615,1.205,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,509.4,4.477,1.416,2.775,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,142.375,12.42,4.391,8.607,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,90.375,20.191,7.139,13.992,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,8,3503,52.238,18.469,36.199,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,20.625,7.836,2.771,5.43,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,460.75,27.742,9.808,19.225,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,106.5,4.275,1.352,2.65,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,62.9,3.035,0.96,1.881,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized,10,3593.6,8.834,2.794,5.476,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,8.7,2.003,0.633,1.241,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,449.6,7.336,2.32,4.547,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,139.9,3.872,1.224,2.4,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,70.4,5.739,1.815,3.557,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,10,3485.5,10.617,3.357,6.581,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,10.8,2.251,0.712,1.395,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,514.8,9.283,2.936,5.754,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,108.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,61.667,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3700.333,12.503,7.219,14.149,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,19,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,328,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,354,56.555,25.292,49.573,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,331.6,49.034,21.929,42.98,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,5,2616.4,196.841,88.03,172.539,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,118.4,24.234,10.838,21.242,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,790.4,73.449,32.848,64.381,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,87.333,6.506,3.756,7.363,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,58.333,2.309,1.333,2.613,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3750,6,3.464,6.79,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,16,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,299.333,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,93.7,5.012,1.585,3.107,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,54.9,3.247,1.027,2.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,10,3679,10.863,3.435,6.733,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,12.5,3.44,1.088,2.132,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,377.1,11.328,3.582,7.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,102.333,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,61.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3652.333,10.017,5.783,11.335,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,13,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,386.333,6.506,3.756,7.363,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,135.875,21.669,7.661,15.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,74.875,12.977,4.588,8.993,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,8,3595.625,54.848,19.392,38.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,26.125,7.809,2.761,5.411,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,382.875,19.052,6.736,13.202,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,95.5,7.878,2.491,4.883,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,57.9,5.782,1.828,3.584,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized,10,3701,9.978,3.155,6.184,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,11.4,2.951,0.933,1.829,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,353.9,8.346,2.639,5.173,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,106.667,5.033,2.906,5.696,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,62.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,3,3665.333,15.011,8.667,16.987,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,366.667,10.214,5.897,11.559,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,109,8.888,5.132,10.058,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,65,7.81,4.509,8.838,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3658,25.981,15,29.4,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,369,11.358,6.557,12.853,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,109.667,8.021,4.631,9.076,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,63.667,2.887,1.667,3.267,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3664,18.52,10.693,20.958,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,16,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,363.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,111.4,5.32,2.379,4.663,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,69.2,3.421,1.53,2.998,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized,5,3581,16.688,7.463,14.628,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,13.8,1.643,0.735,1.44,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,446.2,15.205,6.8,13.328,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,121,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,85.333,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3642.333,10.599,6.119,11.994,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,16.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,360,10.536,6.083,11.922,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,206.333,22.723,13.119,25.713,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,182,16.523,9.539,18.697,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3121,94.319,54.455,106.732,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,62,14.107,8.145,15.963,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,643,46.872,27.062,53.041,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,99.333,7.234,4.177,8.186,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,66.667,5.686,3.283,6.435,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3756,13.115,7.572,14.841,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,19.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,280.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,109.333,6.351,3.667,7.187,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,65.667,8.963,5.175,10.142,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3649,11.136,6.429,12.601,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,17.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,379.667,19.399,11.2,21.952,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,159.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,122,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3358.667,9.866,5.696,11.164,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,17,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,563.667,12.342,7.126,13.967,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,123,8.544,4.933,9.668,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,74,6.557,3.786,7.42,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,3,3524.333,7.095,4.096,8.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,27.667,9.815,5.667,11.107,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,464.667,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,115.6,3.373,1.067,2.091,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,70.6,3.718,1.176,2.304,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized,10,3587.2,8.854,2.8,5.488,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,12,2.16,0.683,1.339,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,435.4,8.972,2.837,5.561,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,173.5,4.629,1.637,3.208,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,118.75,4.683,1.656,3.245,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,8,3407.375,9.841,3.479,6.819,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,18.875,1.642,0.581,1.138,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,502.875,8.725,3.085,6.046,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,177.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,128.333,11.015,6.36,12.465,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3372,12.124,7,13.72,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,20.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,523,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,141.333,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,100,10.536,6.083,11.922,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3543,18.52,10.693,20.958,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,17.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,421.667,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,120.667,17.954,10.366,20.316,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,70,5.568,3.215,6.301,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized,3,3537,35.679,20.599,40.375,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,15.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,477,15.875,9.165,17.964,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,5,126.6,4.159,1.86,3.646,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,5,83.4,4.775,2.135,4.185,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,5,3607.6,9.45,4.226,8.283,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,5,16.8,1.304,0.583,1.143,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,5,387.8,9.365,4.188,8.209,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,199,12.53,7.234,14.179,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,165.333,11.59,6.692,13.116,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3130,33.867,19.553,38.325,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,58,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,669,12.288,7.095,13.905,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,100.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,64.333,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3761,10.149,5.859,11.485,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,17,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,280.333,7.234,4.177,8.186,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,111,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,77.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3614,11.269,6.506,12.753,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,19.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,400,9.644,5.568,10.913,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,198.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,137,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3229,10.817,6.245,12.24,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,20.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,635.667,8.505,4.91,9.624,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,10,128.8,15.469,4.892,9.588,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,10,66.1,13.102,4.143,8.121,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,10,3536.1,42.186,13.34,26.147,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,10,12.7,8.015,2.534,4.967,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,10,472.9,19.198,6.071,11.899,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,240.5,4.751,1.68,3.292,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,156.875,3.796,1.342,2.631,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,8,3193.75,12.937,4.574,8.965,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,28.25,2.121,0.75,1.47,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,602.625,9.782,3.459,6.779,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,8,249.25,5.12,1.81,3.548,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,8,166.75,7.344,2.596,5.089,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,8,3164.75,20.443,7.228,14.166,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,8,33.125,2.748,0.972,1.905,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,8,606.5,11.976,4.234,8.299,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,167.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,109.333,6.429,3.712,7.275,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3466,19.079,11.015,21.59,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,22.667,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,458,6.083,3.512,6.883,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized,4,124.75,7.136,3.568,6.993,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized,4,91.5,5.066,2.533,4.965,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized,4,3702.75,12.79,6.395,12.534,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized,4,16,0.816,0.408,0.8,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,4,290,3.367,1.683,3.299,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,150,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,93.333,4.163,2.404,4.711,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3760.333,22.546,13.017,25.513,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,21.333,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,201.333,14.64,8.452,16.567,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,282.667,43.016,24.835,48.677,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,248,35.93,20.744,40.659,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized,3,3079.333,97.079,56.049,109.855,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,102.667,12.858,7.424,14.55,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,509,29.715,17.156,33.626,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,121.667,6.506,3.756,7.363,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,74.667,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized,3,3797,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,19.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,212,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,120.333,5.686,3.283,6.435,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,78,4,2.309,4.526,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3747,8.544,4.933,9.668,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,22.333,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,257.333,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,150.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,96,4.359,2.517,4.933,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized,3,3713,15.716,9.074,17.785,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,26.667,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,242,9.849,5.686,11.145,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,118,5.292,3.055,5.988,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,70,12.49,7.211,14.134,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized,3,3694.667,30.665,17.704,34.701,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,17.333,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,324.333,12.662,7.311,14.329,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,175,5,2.887,5.658,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,127.667,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized,3,3750,15.395,8.888,17.421,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,36.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,142,7.55,4.359,8.543,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,186,11.136,6.429,12.601,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,131.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized,3,3715,22.605,13.051,25.58,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,36,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,161.667,12.858,7.424,14.55,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized,3,182.333,8.083,4.667,9.147,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized,3,138.333,12.342,7.126,13.967,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized,3,3725.333,33.471,19.325,37.876,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized,3,38.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized,3,147.333,13.614,7.86,15.405,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,117.5,7.583,3.096,6.068,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,72,8,3.266,6.401,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3634.667,12.291,5.018,9.835,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,15.5,1.517,0.619,1.214,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,384.167,9.642,3.936,7.715,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,100.167,5.269,2.151,4.216,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,66.833,7.627,3.114,6.103,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3720.333,10.967,4.477,8.775,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,13.667,1.033,0.422,0.826,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,321.333,6.501,2.654,5.202,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,8,106.5,6.118,2.163,4.239,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,8,73.75,5.97,2.111,4.137,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,8,3544.5,22.829,8.071,15.819,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,8,13,1.852,0.655,1.283,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,8,484.875,15.45,5.462,10.706,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,106.5,7.583,3.096,6.068,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,61,2.449,1,1.96,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3635.5,14.025,5.726,11.222,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,11.833,3.764,1.537,3.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,408.333,7.367,3.007,5.895,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,8,97.5,4.472,1.581,3.099,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,8,70.125,3.682,1.302,2.551,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,8,3619.25,13.977,4.942,9.686,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,8,11.125,1.126,0.398,0.78,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,8,425.25,13.328,4.712,9.236,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,4,92.5,3.873,1.936,3.796,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,4,58.75,5.123,2.562,5.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,4,3716.25,7.365,3.683,7.218,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,4,11,2.449,1.225,2.4,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,4,343.5,6.137,3.069,6.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,107.667,8.017,3.273,6.415,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,61.5,3.728,1.522,2.983,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3607.333,16.836,6.873,13.472,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,11.833,3.312,1.352,2.65,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,431.833,14.784,6.036,11.83,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,95.2,6.301,2.818,5.523,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,56.2,3.834,1.715,3.361,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3721.8,11.212,5.014,9.827,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,11.4,1.342,0.6,1.176,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,336,14.697,6.573,12.882,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,109.667,3.882,1.585,3.106,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,67.667,4.131,1.687,3.306,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3783.333,11.742,4.794,9.395,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,17.667,2.16,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,243.833,8.28,3.38,6.626,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,4,104,10.863,5.431,10.646,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,4,76.5,11.818,5.909,11.582,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,4,3567.5,38.414,19.207,37.646,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,4,14.75,3.775,1.887,3.699,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,4,461.5,18.448,9.224,18.079,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,9,98.222,4.684,1.561,3.061,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,9,73.111,2.369,0.79,1.548,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,9,3588.111,12.394,4.131,8.097,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,9,11,1,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,9,452.444,12.391,4.13,8.095,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,3,95.667,6.028,3.48,6.821,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,3,58.667,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,3,3698.333,14.295,8.253,16.176,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,3,10.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,3,361,20.518,11.846,23.219,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,132.2,4.658,2.083,4.083,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,69,5.148,2.302,4.512,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3498.8,10.733,4.8,9.408,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,10.4,2.302,1.03,2.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,509,5.788,2.588,5.073,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,123.833,8.931,3.646,7.146,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,62.833,4.875,1.99,3.901,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3586.167,13.527,5.522,10.823,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,13.333,3.141,1.282,2.513,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,434.667,6.683,2.728,5.348,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,123.2,7.19,3.216,6.303,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,67,10.025,4.483,8.787,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3595.4,17.111,7.652,14.999,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,11.4,1.949,0.872,1.709,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,419.4,13.594,6.079,11.916,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,92.4,4.669,2.088,4.093,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,56.4,6.229,2.786,5.46,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3689,14.765,6.603,12.942,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,10.8,1.304,0.583,1.143,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,368.6,9.209,4.118,8.072,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,121,7.483,3.055,5.988,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,65.333,8.042,3.283,6.435,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3602.5,26.365,10.763,21.096,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,11.5,1.871,0.764,1.497,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,417.333,12.011,4.904,9.611,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,3,120.667,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,3,65,6.083,3.512,6.883,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,3,3592,8.185,4.726,9.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,3,10.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,3,433.333,6.658,3.844,7.535,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,88.4,4.93,2.205,4.321,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,57,3.317,1.483,2.907,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3707.8,5.675,2.538,4.974,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,9.8,1.095,0.49,0.96,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,359,10.124,4.528,8.874,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,120.667,7.146,2.917,5.718,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,62,8.786,3.587,7.031,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3591,27.423,11.195,21.943,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,11.833,1.941,0.792,1.553,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,433.167,12.09,4.936,9.674,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,10,93.5,4.696,1.485,2.911,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,10,55,3.972,1.256,2.462,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,10,3721.6,12.589,3.981,7.803,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,10,15.3,4.322,1.367,2.679,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,10,332.8,10.861,3.434,6.732,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,10,94.3,4.322,1.367,2.679,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,10,55.3,3.974,1.257,2.463,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,10,3695.6,9.371,2.963,5.808,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,10,12.3,4.001,1.265,2.48,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,10,360,10.863,3.435,6.733,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,96.167,9.786,3.995,7.83,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,58,4.775,1.949,3.821,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3732,9.252,3.777,7.403,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,16.667,2.875,1.174,2.301,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,313.167,3.656,1.493,2.925,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,6,118.167,12.384,5.056,9.909,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,6,67.833,12.734,5.199,10.19,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,6,3653,30.073,12.277,24.064,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,6,21,1.789,0.73,1.431,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,6,356.5,11.221,4.581,8.978,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,3,88.333,7.506,4.333,8.493,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,3,49.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,3,3717.333,21.362,12.333,24.173,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,3,12.333,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,3,351,19.698,11.372,22.29,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,171.4,2.793,1.249,2.448,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,118.6,5.459,2.441,4.785,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3410.6,11.014,4.925,9.654,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,18.2,1.643,0.735,1.44,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,502.6,8.173,3.655,7.164,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,10,118.6,13.108,4.145,8.124,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,10,57,6.831,2.16,4.234,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,10,3564.9,36.507,11.545,22.627,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,10,9,4.922,1.556,3.05,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,10,466,18.732,5.924,11.61,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,10,88.1,4.909,1.552,3.043,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,10,51.3,7.499,2.371,4.648,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,10,3707.3,16.932,5.354,10.494,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,10,9,3.367,1.065,2.087,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,10,360.4,16.621,5.256,10.302,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,10,113.1,8.711,2.755,5.399,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,10,53,6.412,2.028,3.974,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,10,3576.3,20.865,6.598,12.932,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,10,10.9,7.445,2.354,4.615,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,10,462.2,12.865,4.068,7.974,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,10,98.1,5.259,1.663,3.259,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,10,52,7.087,2.241,4.392,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,10,3629.7,19.12,6.046,11.851,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,10,6.5,2.991,0.946,1.854,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,10,428.9,13.119,4.148,8.131,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,4,116.75,16.52,8.26,16.19,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,4,56.5,6.952,3.476,6.813,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,4,3554,27.653,13.826,27.1,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,4,7,1.826,0.913,1.789,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,4,476.75,8.808,4.404,8.632,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,97.4,4.159,1.86,3.646,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,65,4,1.789,3.506,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3735.2,13.065,5.843,11.452,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,15.6,1.949,0.872,1.709,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,309.2,10.06,4.499,8.818,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,113.2,6.686,2.99,5.86,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,58.8,3.962,1.772,3.473,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3607.4,21.102,9.437,18.497,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,10.6,3.05,1.364,2.673,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,430.2,19.728,8.823,17.292,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,240.6,6.107,2.731,5.353,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,157.6,3.578,1.6,3.136,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3188.6,13.686,6.12,11.996,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,27.4,2.302,1.03,2.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,607.6,7.503,3.356,6.577,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,241.2,6.834,3.056,5.99,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,157.6,2.966,1.327,2.6,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3188.6,12.798,5.724,11.218,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,27.4,2.302,1.03,2.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,607,6.633,2.966,5.814,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,144,5.385,2.408,4.72,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,89.4,5.128,2.293,4.495,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3559.8,13.719,6.135,12.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,16.8,1.924,0.86,1.686,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,412.8,11.67,5.219,10.23,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,97,2.449,1.095,2.147,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,64.4,7.301,3.265,6.399,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3737,11.811,5.282,10.353,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,15.6,2.966,1.327,2.6,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,307.8,7.95,3.555,6.968,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,3,120.333,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,3,58,4.583,2.646,5.186,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,3,3580,12.166,7.024,13.767,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,3,11.333,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,3,449,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,5,247.2,4.55,2.035,3.988,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,5,165.2,4.147,1.855,3.635,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,5,3168.6,17.053,7.626,14.947,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,5,31.4,2.881,1.288,2.525,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,5,607,9.354,4.183,8.199,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized,4,247.75,6.185,3.092,6.061,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized,4,169.25,7.089,3.544,6.947,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized,4,3159.75,25.902,12.951,25.384,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized,4,31,3.162,1.581,3.099,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized,4,611,13.292,6.646,13.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Mild,normalized,10,110.5,3.567,1.128,2.211,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Moderate,normalized,10,79.8,2.821,0.892,1.748,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,None,normalized,10,3601.1,13.908,4.398,8.62,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Severe,normalized,10,16.8,1.398,0.442,0.867,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Uncertain,normalized,10,415.7,10.023,3.169,6.212,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Mild,normalized,20,145.25,7.348,1.643,3.22,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Moderate,normalized,20,88.85,4.682,1.047,2.052,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,None,normalized,20,3479.15,44.023,9.844,19.294,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Severe,normalized,20,20.2,2.167,0.484,0.95,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Uncertain,normalized,20,490.7,43.979,9.834,19.274,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Mild,normalized,20,115.45,4.639,1.037,2.033,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Moderate,normalized,20,71.5,4.478,1.001,1.963,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,None,normalized,20,3695.3,16.837,3.765,7.379,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Severe,normalized,20,13.1,2.673,0.598,1.172,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Uncertain,normalized,20,327.85,16.554,3.702,7.255,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,Mild,normalized,20,113.35,7.45,1.666,3.265,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Moderate,normalized,20,75.55,6.565,1.468,2.877,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,None,normalized,20,3559.95,30.2,6.753,13.236,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Severe,normalized,20,14.3,2.364,0.529,1.036,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Uncertain,normalized,20,459.75,23.83,5.329,10.444,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Mild,normalized,15,132.467,3.583,0.925,1.813,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Moderate,normalized,15,87,6.188,1.598,3.131,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,None,normalized,15,3417.333,19.219,4.962,9.726,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Severe,normalized,15,17.2,2.908,0.751,1.472,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Uncertain,normalized,15,567.2,17.781,4.591,8.999,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,Mild,normalized,20,129,6.545,1.464,2.869,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Moderate,normalized,20,94.2,4.98,1.114,2.183,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,None,normalized,20,3447.9,15.008,3.356,6.578,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Severe,normalized,20,24.35,1.872,0.418,0.82,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Uncertain,normalized,20,528.9,14.589,3.262,6.394,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Mild,normalized,20,164.85,6.564,1.468,2.877,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Moderate,normalized,20,123.7,6.045,1.352,2.649,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,None,normalized,20,3431.65,16.943,3.789,7.426,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Severe,normalized,20,34.95,3.203,0.716,1.404,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Uncertain,normalized,20,470.35,11.296,2.526,4.951,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Mild,normalized,20,167.6,8.953,2.002,3.924,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Moderate,normalized,20,111.15,12.91,2.887,5.658,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,None,normalized,20,3401.2,50.409,11.272,22.093,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Severe,normalized,20,25.3,4.68,1.047,2.051,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Uncertain,normalized,20,515.65,59.138,13.224,25.918,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Mild,normalized,10,231,5.963,1.886,3.696,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Moderate,normalized,10,136.3,4.473,1.415,2.773,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,None,normalized,10,3530.4,14.284,4.517,8.854,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Severe,normalized,10,40.1,4.725,1.494,2.928,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Uncertain,normalized,10,286.5,11.237,3.554,6.965,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Mild,normalized,20,104.7,6.14,1.373,2.691,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Moderate,normalized,20,66.1,4.745,1.061,2.08,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,None,normalized,20,3758.5,30.4,6.798,13.323,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Severe,normalized,20,17.35,2.961,0.662,1.298,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Uncertain,normalized,20,276.1,39.426,8.816,17.279,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Mild,normalized,20,108.85,7.775,1.739,3.408,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Moderate,normalized,20,74.25,7.9,1.766,3.462,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,None,normalized,20,3710.9,21.248,4.751,9.312,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Severe,normalized,20,20.05,2.982,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Uncertain,normalized,20,310.85,30.797,6.886,13.497,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Mild,normalized,10,160.6,10.08,3.187,6.247,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Moderate,normalized,10,110.8,10.717,3.389,6.642,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,None,normalized,10,3554.5,14.902,4.712,9.236,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Severe,normalized,10,24.5,3.951,1.249,2.449,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Uncertain,normalized,10,371.5,16.861,5.332,10.45,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Mild,normalized,10,163.9,8.075,2.554,5.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Moderate,normalized,10,117.1,3.446,1.09,2.136,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,None,normalized,10,3280,16.405,5.188,10.168,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Severe,normalized,10,14.6,1.647,0.521,1.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Uncertain,normalized,10,647.7,12.166,3.847,7.541,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Mild,normalized,20,151.8,13.87,3.101,6.079,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Moderate,normalized,20,80.9,12.502,2.796,5.479,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,None,normalized,20,3424.7,67.143,15.014,29.427,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Severe,normalized,20,11.75,2.826,0.632,1.239,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Uncertain,normalized,20,551.35,41.107,9.192,18.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Mild,normalized,20,104.2,8.16,1.825,3.576,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Moderate,normalized,20,57.1,4.941,1.105,2.165,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,None,normalized,20,3625.5,53.381,11.936,23.395,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Severe,normalized,20,13.75,4.435,0.992,1.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Uncertain,normalized,20,418.9,62.969,14.08,27.598,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Mild,normalized,20,123,16.955,3.791,7.431,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Moderate,normalized,20,81.7,10.219,2.285,4.479,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,None,normalized,20,3753.9,37.035,8.281,16.231,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Severe,normalized,20,19.75,3.932,0.879,1.723,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Uncertain,normalized,20,239,65.033,14.542,28.502,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,Mild,normalized,13,104.231,12.644,3.507,6.873,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Moderate,normalized,13,60.462,8.521,2.363,4.632,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,None,normalized,13,3738.846,28.719,7.965,15.612,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Severe,normalized,13,15.846,2.154,0.597,1.171,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Uncertain,normalized,13,302.308,47.596,13.201,25.874,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,Mild,normalized,15,109.6,9.847,2.543,4.983,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Moderate,normalized,15,67.8,7.966,2.057,4.031,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,None,normalized,15,3597.4,46.849,12.096,23.709,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Severe,normalized,15,19.267,3.15,0.813,1.594,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Uncertain,normalized,15,429.2,59.052,15.247,29.885,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Mild,normalized,15,124.667,7.979,2.06,4.038,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Moderate,normalized,15,81.933,7.86,2.029,3.978,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,None,normalized,15,3571.533,49.842,12.869,25.224,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Severe,normalized,15,20,4.44,1.146,2.247,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Uncertain,normalized,15,425.067,62.174,16.053,31.465,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,Mild,normalized,15,152.2,7.775,2.008,3.935,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Moderate,normalized,15,106.4,10.336,2.669,5.231,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,None,normalized,15,3358.933,45.399,11.722,22.975,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Severe,normalized,15,17.533,3.044,0.786,1.541,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Uncertain,normalized,15,586.733,40.161,10.37,20.324,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Mild,normalized,20,105.7,5.332,1.192,2.337,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Moderate,normalized,20,73.05,5.042,1.127,2.21,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,None,normalized,20,3584.15,21.556,4.82,9.447,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Severe,normalized,20,12.9,2.553,0.571,1.119,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Uncertain,normalized,20,447.3,15.407,3.445,6.752,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,Mild,normalized,10,215.1,8.359,2.643,5.181,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Moderate,normalized,10,157.8,7.177,2.27,4.448,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,None,normalized,10,3212.4,31.124,9.842,19.291,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Severe,normalized,10,21,2.789,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Uncertain,normalized,10,616.3,25.355,8.018,15.715,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Mild,normalized,20,175,6.026,1.348,2.641,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Moderate,normalized,20,119.8,5.197,1.162,2.278,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,None,normalized,20,3400.5,18.922,4.231,8.293,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Severe,normalized,20,18.5,1.792,0.401,0.785,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Uncertain,normalized,20,508.1,18.281,4.088,8.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Mild,normalized,20,253.7,5.302,1.186,2.324,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Moderate,normalized,20,175.7,8.405,1.879,3.684,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,None,normalized,20,3147.4,16.95,3.79,7.429,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Severe,normalized,20,32.35,2.207,0.494,0.967,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Uncertain,normalized,20,613.5,8.108,1.813,3.553,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Mild,normalized,15,145.2,6.293,1.625,3.185,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Moderate,normalized,15,97.733,6.352,1.64,3.215,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,None,normalized,15,3544.667,61.729,15.938,31.239,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Severe,normalized,15,22.2,4.617,1.192,2.336,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Uncertain,normalized,15,415.2,70.153,18.113,35.502,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Mild,normalized,15,147.933,2.314,0.597,1.171,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Moderate,normalized,15,127.267,7.343,1.896,3.716,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,None,normalized,15,3455.6,37.55,9.695,19.003,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Severe,normalized,15,29.133,2.9,0.749,1.468,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Uncertain,normalized,15,464.867,43.824,11.315,22.178,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,Mild,normalized,5,188.4,8.989,4.02,7.879,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Moderate,normalized,5,131,10.7,4.785,9.379,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,None,normalized,5,3364.8,15.975,7.144,14.003,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Severe,normalized,5,36.6,6.804,3.043,5.964,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Uncertain,normalized,5,501,16.355,7.314,14.336,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Mild,normalized,13,202.154,11.568,3.208,6.288,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Moderate,normalized,13,142,8.114,2.25,4.411,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,None,normalized,13,3298.923,34.75,9.638,18.89,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Severe,normalized,13,30.077,3.068,0.851,1.668,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Uncertain,normalized,13,549.385,23.009,6.382,12.508,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,Mild,normalized,10,120,4.807,1.52,2.98,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Moderate,normalized,10,83.9,4.771,1.509,2.957,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,None,normalized,10,3685.9,13.254,4.191,8.215,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Severe,normalized,10,23.6,2.171,0.686,1.345,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Uncertain,normalized,10,311.8,9.784,3.094,6.064,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Mild,normalized,10,244.2,12.541,3.966,7.773,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Moderate,normalized,10,181,11.215,3.547,6.951,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,None,normalized,10,3292.1,29.637,9.372,18.369,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Severe,normalized,10,28.6,4.789,1.514,2.968,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Uncertain,normalized,10,476.6,12.563,3.973,7.786,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Mild,normalized,20,178.85,7.741,1.731,3.393,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Moderate,normalized,20,129.25,5.169,1.156,2.266,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,None,normalized,20,3743.95,15.115,3.38,6.625,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Severe,normalized,20,35.3,3.213,0.719,1.408,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Uncertain,normalized,20,144.35,15.094,3.375,6.615,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Mild,normalized,10,162.9,5.043,1.595,3.126,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Moderate,normalized,10,118.4,6.24,1.973,3.867,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,None,normalized,10,3711.7,8.381,2.65,5.194,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Severe,normalized,10,39.9,3.985,1.26,2.47,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Uncertain,normalized,10,193.8,8.094,2.56,5.017,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,192,5.196,3,5.88,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,82.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,611.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,196,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,84.333,2.309,1.333,2.613,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4225.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,13.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,549.667,8.145,4.702,9.216,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,586.333,41.501,23.961,46.963,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,329.333,16.258,9.387,18.398,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4212.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,100.333,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,1233.333,78.945,45.579,89.335,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,182.333,8.505,4.91,9.624,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,82.667,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,479.333,6.658,3.844,7.535,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,179.667,11.15,6.438,12.618,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,86,8.185,4.726,9.263,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,548.333,14.224,8.212,16.096,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,207,6.083,3.512,6.883,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,97,7.55,4.359,8.543,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,17,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,649,13,7.506,14.711,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,172.667,8.505,4.91,9.624,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,76.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,12.333,4.619,2.667,5.227,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,576,16.643,9.609,18.834,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,209,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,98.333,4.619,2.667,5.227,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,17.667,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,631,16.643,9.609,18.834,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,216,6.083,3.512,6.883,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,100.667,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,18,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,650.333,10.693,6.173,12.1,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,198.333,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,96.667,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4225,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,16.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,600,16.523,9.539,18.697,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,202.4,40.172,17.966,35.212,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,85.6,17.416,7.788,15.265,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4223,1,0.447,0.877,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,13.2,2.28,1.02,1.999,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,550.8,51.154,22.877,44.838,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,187,12.138,3.838,7.523,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,81,7.196,2.275,4.46,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4222.2,1.398,0.442,0.867,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,9.8,1.135,0.359,0.704,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,515.4,10.824,3.423,6.709,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,178.6,6.066,2.713,5.317,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,78,6.745,3.017,5.913,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4223.2,1.643,0.735,1.44,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,13.8,1.095,0.49,0.96,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,595.8,7.259,3.247,6.363,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,201.625,5.069,1.792,3.513,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,87.25,7.046,2.491,4.882,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4223.875,2.1,0.743,1.455,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,15.125,2.357,0.833,1.633,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,587.75,17.91,6.332,12.411,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,182.333,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,84,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,12.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,590,14.731,8.505,16.67,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,208.25,8.598,3.04,5.958,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,93.375,5.78,2.044,4.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4222.5,1.773,0.627,1.228,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,12.875,1.885,0.666,1.306,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,705.875,14.817,5.239,10.268,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,186.9,4.795,1.516,2.972,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,77.5,4.72,1.493,2.925,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4222,1.155,0.365,0.716,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,10.8,2.251,0.712,1.395,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,647.5,16.628,5.258,10.306,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,208.3,24.833,7.853,15.392,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,89.2,16.376,5.179,10.15,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4220.6,4.575,1.447,2.836,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,19.4,7.604,2.405,4.713,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,620,34.127,10.792,21.152,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,176.4,4.971,1.572,3.081,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,78.1,7.156,2.263,4.435,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4222.6,1.43,0.452,0.886,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,10.7,1.418,0.448,0.879,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,587.3,12.202,3.859,7.563,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,207.3,7.846,2.481,4.863,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,93.1,6.28,1.986,3.892,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4222.9,1.663,0.526,1.031,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,14.3,2.497,0.79,1.547,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,667,21.166,6.693,13.119,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,200.7,7.379,2.334,4.574,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,92.1,7.724,2.442,4.787,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4222.9,0.994,0.314,0.616,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,13.9,3.107,0.983,1.926,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,633.3,13.499,4.269,8.367,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,197.333,5.774,3.333,6.533,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,98,5,2.887,5.658,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,18.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,652.333,11.59,6.692,13.116,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,734.333,65.118,37.596,73.688,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,423,34.395,19.858,38.921,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4215.667,4.619,2.667,5.227,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,118.333,14.844,8.57,16.797,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,1503.333,86.985,50.221,98.432,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,187.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,85,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,498.667,11.504,6.642,13.018,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,188,6.245,3.606,7.067,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,85,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,16.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,604.667,6.028,3.48,6.821,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,245.333,10.116,5.84,11.447,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,118,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,23.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,788.333,10.116,5.84,11.447,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,215.333,17.898,10.333,20.253,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,86.333,6.11,3.528,6.914,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,656,18.028,10.408,20.4,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,257,21.932,12.662,24.818,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,124.333,8.083,4.667,9.147,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,24.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,787.333,20.502,11.837,23.2,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,264,23,13.279,26.027,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,128,9.539,5.508,10.795,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,26.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,807.667,20.033,11.566,22.67,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,235.667,12.097,6.984,13.689,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,115.667,5.859,3.383,6.631,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,24,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,722,20.518,11.846,23.219,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,204.333,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,90,4.583,2.646,5.186,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.667,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,434.667,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,713.333,45.369,26.194,51.34,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,399.333,18.877,10.899,21.361,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4210.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,108.667,9.238,5.333,10.453,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,1329,80.988,46.758,91.646,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,194.5,9.381,3.317,6.501,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,85,3.546,1.254,2.457,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4222.125,0.991,0.35,0.687,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,17.25,1.982,0.701,1.373,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,447.75,19.226,6.797,13.323,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,189.667,6.658,3.844,7.535,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,87,3.464,2,3.92,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4221.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,16.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,463.333,12.503,7.219,14.149,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,196,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,90.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,18,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,462.333,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,186.333,10.97,6.333,12.413,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,81.667,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4221.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,14.667,2.309,1.333,2.613,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,450.667,17.156,9.905,19.414,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,207.333,17.214,9.939,19.48,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,89,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4218.333,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,20.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,506.667,38.07,21.98,43.08,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,193.1,10.268,3.247,6.364,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,85.5,6.042,1.91,3.745,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4221.8,0.789,0.249,0.489,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,17.2,2.044,0.646,1.267,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,448.5,11.462,3.625,7.105,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,201,6.557,3.786,7.42,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,92.333,6.028,3.48,6.821,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,18.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,430.667,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,251.333,13.577,7.839,15.364,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,112.333,8.963,5.175,10.142,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,21,4.359,2.517,4.933,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,515,15.62,9.018,17.676,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,204.667,7.506,4.333,8.493,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,95.333,6.506,3.756,7.363,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222.667,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,19.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,431,5.292,3.055,5.988,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,775.6,151.14,67.592,132.48,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,431.8,90.409,40.432,79.247,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4213.2,5.541,2.478,4.857,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,127,27.175,12.153,23.82,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,1478.6,228.509,102.192,200.297,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,190,5.568,3.215,6.301,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,87.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222.667,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,18.333,2.309,1.333,2.613,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,459.667,15.373,8.876,17.396,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,197,5.568,3.215,6.301,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,91.333,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,19,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,505,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,202.667,8.737,5.044,9.887,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,97,6,3.464,6.79,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,18.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,509,14.731,8.505,16.67,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,219,10.44,6.028,11.814,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,106,17.349,10.017,19.633,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4220.667,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,16.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,540.333,11.93,6.888,13.5,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,220.667,5.686,3.283,6.435,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,104,8,4.619,9.053,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4225,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,22.667,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,519,16.523,9.539,18.697,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,211.667,10.693,6.173,12.1,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,102.333,7.767,4.485,8.79,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4226,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,22,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,495.667,22.03,12.719,24.93,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,9,194,7.583,2.528,4.954,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,9,88,9.644,3.215,6.301,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,9,4223,0.707,0.236,0.462,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,9,14,3.742,1.247,2.445,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,9,693.778,12.347,4.116,8.067,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,207.333,6.11,3.528,6.914,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,91,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4225.667,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,14,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,646,8.888,5.132,10.058,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,481.667,10.97,6.333,12.413,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,252.333,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4217.667,6.11,3.528,6.914,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,65.667,8.622,4.978,9.756,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,1215.333,8.327,4.807,9.423,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,178,3.606,2.082,4.08,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,79.333,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,13.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,496,15.716,9.074,17.785,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,190.333,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,84,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,14,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,636.667,16.563,9.563,18.743,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,288.333,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,127.667,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,13.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,937.333,8.386,4.842,9.49,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,218.667,14.295,8.253,16.176,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,91,15.716,9.074,17.785,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15,3.464,2,3.92,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,681.667,30.238,17.458,34.217,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,287.8,6.76,3.023,5.926,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,128.6,2.302,1.03,2.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4223.8,0.447,0.2,0.392,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,14.6,1.14,0.51,0.999,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,929.8,12.558,5.616,11.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,307.667,11.06,6.386,12.516,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,137.333,10.599,6.119,11.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,967.333,11.93,6.888,13.5,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,240.8,8.167,3.652,7.159,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,111,6,2.683,5.259,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4222,4.528,2.025,3.969,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,14.6,2.408,1.077,2.111,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,787.6,13.39,5.988,11.737,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,205.667,13.796,7.965,15.612,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,82.667,12.342,7.126,13.967,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,12,3.464,2,3.92,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,620.333,14.364,8.293,16.255,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,190.667,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,75.667,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4220.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,10.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,654.667,10.97,6.333,12.413,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,208.9,6.488,2.052,4.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,76.9,4.818,1.524,2.986,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4220.3,3.683,1.165,2.283,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,11,1.944,0.615,1.205,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,718.3,6.273,1.984,3.888,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,253.375,36.924,13.055,25.587,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,111,25.4,8.98,17.601,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4217.125,6.221,2.199,4.311,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,20.625,7.836,2.771,5.43,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,714.125,56.93,20.128,39.45,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,178.1,4.886,1.545,3.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,71.6,3.239,1.024,2.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4221.3,0.675,0.213,0.418,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,8.7,2.003,0.633,1.241,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,627.7,8.706,2.753,5.396,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,221.1,7.666,2.424,4.751,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,81.2,5.653,1.788,3.504,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4221.4,1.174,0.371,0.728,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,10.8,2.251,0.712,1.395,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,735.9,10.734,3.394,6.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,189.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,80.667,5.033,2.906,5.696,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4217.667,6.429,3.712,7.275,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,19,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,517.333,8.386,4.842,9.49,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,804,123.313,55.147,108.088,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,450,69.728,31.183,61.119,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4210.8,1.304,0.583,1.143,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,118.4,24.234,10.838,21.242,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,1594.4,196.425,87.844,172.174,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,161.667,8.386,4.842,9.49,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,74.333,4.163,2.404,4.711,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4211,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,16,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,461,6,3.464,6.79,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,161.1,8.812,2.787,5.462,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,67.4,5.42,1.714,3.359,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4217.2,6.052,1.914,3.751,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,12.5,3.44,1.088,2.132,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,538.2,10.108,3.197,6.265,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,176.667,7.767,4.485,8.79,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,74.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4215.333,6.658,3.844,7.535,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,13,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,563,5.292,3.055,5.988,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,236.875,38.776,13.709,26.87,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,101,18.853,6.665,13.064,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4215.375,4.596,1.625,3.185,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,26.125,7.809,2.761,5.411,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,619.75,55.301,19.552,38.322,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,164.8,12.586,3.98,7.801,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,69.3,6.913,2.186,4.285,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4219.7,4.692,1.484,2.908,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,11.4,2.951,0.933,1.829,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,518.7,9.799,3.099,6.073,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,184,8,4.619,9.053,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,77.333,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4216,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,550.667,12.014,6.936,13.595,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,189,15.133,8.737,17.124,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,80,6.245,3.606,7.067,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4216,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,558,22.338,12.897,25.278,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,189.333,13.317,7.688,15.069,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,79.667,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4217,8.185,4.726,9.263,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,16,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,553,14.731,8.505,16.67,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,194.4,6.877,3.076,6.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,83,4.243,1.897,3.719,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4221.6,1.517,0.678,1.329,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,13.8,1.643,0.735,1.44,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,640.6,15.598,6.976,13.672,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,223,3,1.732,3.395,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,102,4,2.309,4.526,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4225.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,16.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,583,12.124,7,13.72,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,450.333,53.314,30.781,60.33,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,244,30.61,17.673,34.639,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4214.333,7.506,4.333,8.493,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,62,14.107,8.145,15.963,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,1093.333,99.962,57.713,113.117,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,185.333,11.24,6.489,12.719,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,86,5.292,3.055,5.988,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4221.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,19.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,465.667,13.65,7.881,15.447,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,192.667,14.189,8.192,16.057,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,83.333,10.116,5.84,11.447,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4221.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,17.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,572.333,11.24,6.489,12.719,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,298.333,2.887,1.667,3.267,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,139,3.464,2,3.92,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4220.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,17,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,862,10.44,6.028,11.814,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,224.667,8.145,4.702,9.216,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,101.667,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4213.667,6.351,3.667,7.187,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,27.667,9.815,5.667,11.107,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,689.333,12.662,7.311,14.329,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,198.2,4.392,1.389,2.722,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,82.6,4.477,1.416,2.775,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4220.8,1.229,0.389,0.762,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,12,2.16,0.683,1.339,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,633.6,9.606,3.038,5.954,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,311.125,7.039,2.489,4.878,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,137.625,5.605,1.981,3.884,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4221.375,0.518,0.183,0.359,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,18.875,1.642,0.581,1.138,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,814,9.842,3.48,6.82,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,326.333,14.364,8.293,16.255,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,148.667,15.044,8.686,17.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4221.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,20.333,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,849.333,12.22,7.055,13.828,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,259,17.692,10.214,20.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,117.667,14.503,8.373,16.412,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,17.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,680.667,19.858,11.465,22.471,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,206.333,27.062,15.624,30.623,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,85.667,9.504,5.487,10.755,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4220.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,15.667,4.041,2.333,4.573,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,683.333,36.254,20.931,41.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,5,226.8,6.301,2.818,5.523,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,5,100.2,5.07,2.267,4.444,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,5,4222.2,1.095,0.49,0.96,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,5,16.8,1.304,0.583,1.143,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,5,614.6,9.044,4.045,7.928,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,422.333,22.368,12.914,25.312,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,223.333,13.577,7.839,15.364,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4221.333,2.082,1.202,2.356,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,58,2,1.155,2.263,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,1091.333,34.356,19.835,38.877,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,182,6.083,3.512,6.883,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,81.333,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,17,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,462.333,11.015,6.36,12.465,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,208,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,97,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4222,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,19.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,608,10.392,6,11.76,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,356,3,1.732,3.395,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,157.667,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4220.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,20.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,991.667,10.263,5.925,11.614,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,10,207.6,32.077,10.144,19.882,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,10,78.8,19.205,6.073,11.904,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,10,4216.6,5.522,1.746,3.422,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,10,12.7,8.015,2.534,4.967,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,10,680.5,43.513,13.76,26.97,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,425.625,8.088,2.859,5.604,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,185.125,3.98,1.407,2.758,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4222,0.756,0.267,0.524,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,28.25,2.121,0.75,1.47,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,1028.25,12.87,4.55,8.919,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,8,449.125,11.18,3.953,7.747,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,8,199.875,7.298,2.58,5.058,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,8,4220.375,3.815,1.349,2.644,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,8,33.125,2.748,0.972,1.905,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,8,1055.625,18.102,6.4,12.544,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,299.667,13.051,7.535,14.769,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,132,9.165,5.292,10.371,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4223.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,22.667,3.055,1.764,3.457,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,757.667,18.556,10.713,20.998,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,4,232.25,10.689,5.344,10.475,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,4,107.5,5.508,2.754,5.397,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,4,4225,1.414,0.707,1.386,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,4,16,0.816,0.408,0.8,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,4,522.25,13.376,6.688,13.108,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,264.667,9.074,5.239,10.268,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,114.667,4.509,2.603,5.103,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4226.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,21.333,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,466,23.516,13.577,26.611,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,633.333,65.072,37.569,73.636,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,350.667,35.501,20.497,40.173,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4221.667,5.774,3.333,6.533,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,102.667,12.858,7.424,14.55,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,1142.333,94.786,54.725,107.26,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,216,6.245,3.606,7.067,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,94.333,5.033,2.906,5.696,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4225,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,19.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,428,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,220.667,5.033,2.906,5.696,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,100.333,4.726,2.728,5.348,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4225,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,22.333,3.215,1.856,3.638,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,478,9.539,5.508,10.795,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,273.333,7.767,4.485,8.79,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,122.667,5.508,3.18,6.232,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4228.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,26.667,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,515.333,17.01,9.821,19.248,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,205.333,20.502,11.837,23.2,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,87.333,15.373,8.876,17.396,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4224.333,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,17.333,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,529.667,30.238,17.458,34.217,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,339.333,9.292,5.364,10.514,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,164.333,5.132,2.963,5.807,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4231.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,36.667,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,481.333,15.695,9.062,17.761,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,353.333,11.015,6.36,12.465,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,167.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4230,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,36,2.646,1.528,2.994,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,515,23.58,13.614,26.683,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,normalized_cumulative,3,359.333,22.03,12.719,24.93,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,normalized_cumulative,3,177,14,8.083,15.842,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,normalized_cumulative,3,4232,1.732,1,1.96,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,normalized_cumulative,3,38.667,3.512,2.028,3.974,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,normalized_cumulative,3,506.667,33.005,19.055,37.349,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,205,5.02,2.049,4.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,87.5,7.556,3.085,6.046,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4223.833,1.722,0.703,1.378,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,15.5,1.517,0.619,1.214,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,589.167,12.189,4.976,9.753,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,180.667,6.947,2.836,5.559,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,80.5,6.686,2.729,5.35,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4222.333,1.966,0.803,1.573,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,13.667,1.033,0.422,0.826,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,502,11.559,4.719,9.249,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,8,193.25,10.899,3.853,7.553,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,8,86.75,6.585,2.328,4.563,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,8,4222.625,1.847,0.653,1.28,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,8,13,1.852,0.655,1.283,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,8,678.125,23.461,8.295,16.258,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,179.333,9.791,3.997,7.835,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,72.833,4.309,1.759,3.448,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4223.167,1.169,0.477,0.935,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,11.833,3.764,1.537,3.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,587.667,13.837,5.649,11.072,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,8,178.75,3.536,1.25,2.45,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,8,81.25,4.334,1.532,3.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,8,4223.25,1.669,0.59,1.157,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,8,11.125,1.126,0.398,0.78,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,8,604,14.677,5.189,10.171,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,4,162.25,8.221,4.11,8.056,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,4,69.75,4.856,2.428,4.759,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,4,4222,0.816,0.408,0.8,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,4,11,2.449,1.225,2.4,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,4,505.75,6.898,3.449,6.76,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,181,10.315,4.211,8.254,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,73.333,4.502,1.838,3.602,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4220.167,6.014,2.455,4.812,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,11.833,3.312,1.352,2.65,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,612.833,12.465,5.089,9.974,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,162.8,6.496,2.905,5.694,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,67.6,3.209,1.435,2.813,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4220.6,4.336,1.939,3.801,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,11.4,1.342,0.6,1.176,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,498.8,14.789,6.614,12.963,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,195,5.55,2.266,4.441,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,85.333,3.445,1.406,2.756,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4222.167,1.169,0.477,0.935,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,17.667,2.16,0.882,1.729,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,438.833,12.608,5.147,10.089,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,4,195.25,20.435,10.217,20.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,4,91.25,15.327,7.663,15.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,4,4224.25,0.5,0.25,0.49,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,4,14.75,3.775,1.887,3.699,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,4,656.75,38.561,19.28,37.789,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,9,182.333,5.454,1.818,3.564,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,9,84.111,3.14,1.047,2.052,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,9,4222.889,0.928,0.309,0.606,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,9,11,1,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,9,634.778,11.904,3.968,7.777,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,3,165,7,4.041,7.921,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,3,69.333,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,3,4224.333,1.155,0.667,1.307,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,3,10.667,0.577,0.333,0.653,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,3,526,15.395,8.888,17.421,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,211.6,5.857,2.619,5.134,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,79.4,3.847,1.72,3.372,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4219.4,5.367,2.4,4.704,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,10.4,2.302,1.03,2.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,720.6,6.58,2.943,5.768,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,200,13.446,5.489,10.759,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,76.167,4.956,2.023,3.966,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4220.833,5.456,2.227,4.366,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,13.333,3.141,1.282,2.513,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,634.667,12.58,5.136,10.066,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,201.6,13.594,6.079,11.916,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,78.4,9.659,4.32,8.467,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4216.4,7.436,3.326,6.518,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,11.4,1.949,0.872,1.709,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,621,12.981,5.805,11.378,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,159.6,5.413,2.421,4.745,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,67.2,5.805,2.596,5.088,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4217.2,7.294,3.262,6.393,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,10.8,1.304,0.583,1.143,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,528.2,10.354,4.63,9.075,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,197.833,14.386,5.873,11.511,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,76.833,7.679,3.135,6.144,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4217.667,7.339,2.996,5.873,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,11.5,1.871,0.764,1.497,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,615.167,20.243,8.264,16.198,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,3,196,4.583,2.646,5.186,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,3,75.333,7.024,4.055,7.948,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,3,4221.333,1.528,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,3,10.333,2.517,1.453,2.848,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,3,629.333,8.505,4.91,9.624,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,155.2,6.301,2.818,5.523,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,66.8,4.382,1.96,3.841,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4222,1.225,0.548,1.074,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,9.8,1.095,0.49,0.96,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,514.2,5.762,2.577,5.051,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,194.5,14.707,6.004,11.768,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,73.833,9.326,3.807,7.462,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4218.667,4.926,2.011,3.942,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,11.833,1.941,0.792,1.553,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,627.667,25.874,10.563,20.704,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,10,163.8,11.003,3.479,6.82,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,10,70.3,7.973,2.521,4.942,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,10,4218.2,5.865,1.855,3.635,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,10,15.3,4.322,1.367,2.679,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,10,496.6,12.011,3.798,7.445,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,10,161.9,10.3,3.257,6.384,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,10,67.6,6.979,2.207,4.326,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,10,4217.5,5.893,1.863,3.652,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,10,12.3,4.001,1.265,2.48,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,10,521.9,9.398,2.972,5.825,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,170.833,15.6,6.369,12.483,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,74.667,6.439,2.629,5.153,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4216,5.621,2.295,4.498,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,16.667,2.875,1.174,2.301,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,484,13.74,5.61,10.995,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,6,207,25.369,10.357,20.3,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,6,88.833,13.934,5.689,11.15,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,6,4216.5,4.848,1.979,3.879,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,6,21,1.789,0.73,1.431,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,6,563.5,31.552,12.881,25.246,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,3,150,1,0.577,1.132,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,3,61.667,7.024,4.055,7.948,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,3,4218.333,6.351,3.667,7.187,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,3,12.333,3.786,2.186,4.284,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,3,501,19,10.97,21.501,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,308.2,5.805,2.596,5.088,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,136.8,6.87,3.072,6.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4221.4,0.548,0.245,0.48,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,18.2,1.643,0.735,1.44,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,810.8,10.803,4.831,9.469,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,10,184.6,20.684,6.541,12.82,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,10,66,10.52,3.327,6.52,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,10,4215.5,5.642,1.784,3.497,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,10,9,4.922,1.556,3.05,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,10,650.6,34.549,10.925,21.413,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,10,148.4,10.596,3.351,6.567,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,10,60.3,8.757,2.769,5.427,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,10,4216.1,6.488,2.052,4.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,10,9,3.367,1.065,2.087,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,10,508.8,14.935,4.723,9.257,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,10,177,16.733,5.292,10.371,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,10,63.9,11.618,3.674,7.201,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,10,4215.5,5.855,1.851,3.629,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,10,10.9,7.445,2.354,4.615,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,10,639.2,21.285,6.731,13.193,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,10,156.6,11.286,3.569,6.995,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,10,58.5,8.721,2.758,5.405,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,10,4215.2,5.884,1.861,3.647,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,10,6.5,2.991,0.946,1.854,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,10,585.5,18.686,5.909,11.582,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,4,180.25,24.157,12.079,23.674,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,4,63.5,8.699,4.349,8.525,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,4,4211,0.816,0.408,0.8,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,4,7,1.826,0.913,1.789,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,4,657,26.845,13.423,26.308,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,178,7.874,3.521,6.902,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,80.6,5.177,2.315,4.538,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4222.4,1.517,0.678,1.329,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,15.6,1.949,0.872,1.709,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,487.2,13.971,6.248,12.246,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,182.6,9.762,4.366,8.557,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,69.4,3.782,1.691,3.315,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4220.2,5.718,2.557,5.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,10.6,3.05,1.364,2.673,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,612.8,16.799,7.513,14.725,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,425.6,10.015,4.479,8.779,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,185,4.528,2.025,3.969,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4221.8,0.837,0.374,0.733,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,27.4,2.302,1.03,2.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,1033.2,13.953,6.24,12.231,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,426.2,10.134,4.532,8.883,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,185,3.536,1.581,3.099,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4221.8,0.837,0.374,0.733,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,27.4,2.302,1.03,2.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,1033.2,13.084,5.851,11.469,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,250.2,6.723,3.007,5.893,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,106.2,5.586,2.498,4.896,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4222.8,0.837,0.374,0.733,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,16.8,1.924,0.86,1.686,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,663,14.142,6.325,12.396,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,177,9.721,4.347,8.521,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,80,9.354,4.183,8.199,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4221.8,0.837,0.374,0.733,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,15.6,2.966,1.327,2.6,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,484.8,12.194,5.453,10.689,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,3,189.667,14.572,8.413,16.489,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,3,69.333,9.452,5.457,10.696,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,3,4218.667,6.658,3.844,7.535,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,3,11.333,4.933,2.848,5.582,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,3,638.667,11.59,6.692,13.116,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,5,443.8,8.701,3.891,7.626,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,5,196.6,5.03,2.249,4.409,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,5,4219.4,4.722,2.112,4.139,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,5,31.4,2.881,1.288,2.525,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,5,1050.8,14.237,6.367,12.48,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,normalized_cumulative,4,448,14.697,7.348,14.403,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,normalized_cumulative,4,200.25,9.287,4.644,9.101,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,normalized_cumulative,4,4218.75,5.188,2.594,5.084,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,normalized_cumulative,4,31,3.162,1.581,3.099,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,normalized_cumulative,4,1059,22.993,11.496,22.533,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Mild,normalized_cumulative,10,207.1,6.118,1.935,3.792,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Moderate,normalized_cumulative,10,96.6,3.471,1.097,2.151,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,None,normalized_cumulative,10,4223.9,0.568,0.18,0.352,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Severe,normalized_cumulative,10,16.8,1.398,0.442,0.867,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Uncertain,normalized_cumulative,10,622.8,13.782,4.358,8.542,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Mild,normalized_cumulative,20,254.3,9.027,2.018,3.956,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Moderate,normalized_cumulative,20,109.05,5.708,1.276,2.501,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,None,normalized_cumulative,20,4224.15,1.137,0.254,0.498,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Severe,normalized_cumulative,20,20.2,2.167,0.484,0.95,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Uncertain,normalized_cumulative,20,745,43.753,9.783,19.176,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Mild,normalized_cumulative,20,200.05,7.28,1.628,3.191,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Moderate,normalized_cumulative,20,84.6,5.67,1.268,2.485,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,None,normalized_cumulative,20,4223.2,1.281,0.287,0.562,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Severe,normalized_cumulative,20,13.1,2.673,0.598,1.172,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Uncertain,normalized_cumulative,20,527.9,16.692,3.732,7.316,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,Mild,normalized_cumulative,20,203.2,10.461,2.339,4.585,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Moderate,normalized_cumulative,20,89.85,7.125,1.593,3.123,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,None,normalized_cumulative,20,4222.9,1.41,0.315,0.618,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Severe,normalized_cumulative,20,14.3,2.364,0.529,1.036,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Uncertain,normalized_cumulative,20,662.95,29.719,6.645,13.025,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Mild,normalized_cumulative,15,236.667,9.737,2.514,4.928,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Moderate,normalized_cumulative,15,104.2,7.282,1.88,3.685,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,None,normalized_cumulative,15,4221.2,0.862,0.223,0.436,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Severe,normalized_cumulative,15,17.2,2.908,0.751,1.472,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Uncertain,normalized_cumulative,15,803.867,19.127,4.939,9.68,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,Mild,normalized_cumulative,20,247.55,10.164,2.273,4.455,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Moderate,normalized_cumulative,20,118.55,4.904,1.097,2.149,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,None,normalized_cumulative,20,4224.35,1.182,0.264,0.518,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Severe,normalized_cumulative,20,24.35,1.872,0.418,0.82,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Uncertain,normalized_cumulative,20,776.45,15.381,3.439,6.741,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Mild,normalized_cumulative,20,323.5,11.091,2.48,4.861,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Moderate,normalized_cumulative,20,158.65,6.627,1.482,2.905,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,None,normalized_cumulative,20,4225.5,1.701,0.38,0.746,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Severe,normalized_cumulative,20,34.95,3.203,0.716,1.404,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Uncertain,normalized_cumulative,20,793.85,16.294,3.644,7.141,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Mild,normalized_cumulative,20,304.05,21.647,4.84,9.487,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Moderate,normalized_cumulative,20,136.45,16.44,3.676,7.205,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,None,normalized_cumulative,20,4220.9,1.294,0.289,0.567,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Severe,normalized_cumulative,20,25.3,4.68,1.047,2.051,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Uncertain,normalized_cumulative,20,819.7,49.752,11.125,21.805,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Mild,normalized_cumulative,10,407.4,11.559,3.655,7.164,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Moderate,normalized_cumulative,10,176.4,7.245,2.291,4.49,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,None,normalized_cumulative,10,4224.3,4.715,1.491,2.923,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Severe,normalized_cumulative,10,40.1,4.725,1.494,2.928,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Uncertain,normalized_cumulative,10,693.9,13.683,4.327,8.481,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Mild,normalized_cumulative,20,188.15,10.84,2.424,4.751,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Moderate,normalized_cumulative,20,83.45,6.378,1.426,2.795,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,None,normalized_cumulative,20,4222.75,1.251,0.28,0.548,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Severe,normalized_cumulative,20,17.35,2.961,0.662,1.298,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Uncertain,normalized_cumulative,20,464.25,31.058,6.945,13.612,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Mild,normalized_cumulative,20,203.15,15.329,3.428,6.718,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Moderate,normalized_cumulative,20,94.3,9.647,2.157,4.228,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,None,normalized_cumulative,20,4224.9,1.119,0.25,0.491,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Severe,normalized_cumulative,20,20.05,2.982,0.667,1.307,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Uncertain,normalized_cumulative,20,514,21.183,4.737,9.284,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Mild,normalized_cumulative,10,295.9,15.416,4.875,9.555,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Moderate,normalized_cumulative,10,135.3,11.519,3.642,7.139,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,None,normalized_cumulative,10,4221.9,2.644,0.836,1.639,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Severe,normalized_cumulative,10,24.5,3.951,1.249,2.449,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Uncertain,normalized_cumulative,10,667.4,17.187,5.435,10.652,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Mild,normalized_cumulative,10,295.6,7.486,2.367,4.64,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Moderate,normalized_cumulative,10,131.7,4.165,1.317,2.581,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,None,normalized_cumulative,10,4223.3,0.823,0.26,0.51,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Severe,normalized_cumulative,10,14.6,1.647,0.521,1.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Uncertain,normalized_cumulative,10,943.3,15.98,5.053,9.904,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Mild,normalized_cumulative,20,244.45,26.293,5.879,11.523,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Moderate,normalized_cumulative,20,92.65,13.453,3.008,5.896,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,None,normalized_cumulative,20,4220.5,1.357,0.303,0.595,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Severe,normalized_cumulative,20,11.75,2.826,0.632,1.239,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Uncertain,normalized_cumulative,20,795.8,66.272,14.819,29.045,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Mild,normalized_cumulative,20,175.05,13.705,3.065,6.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Moderate,normalized_cumulative,20,70.85,8.235,1.841,3.609,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,None,normalized_cumulative,20,4219.45,5.296,1.184,2.321,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Severe,normalized_cumulative,20,13.75,4.435,0.992,1.944,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Uncertain,normalized_cumulative,20,593.95,53.521,11.968,23.457,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Mild,normalized_cumulative,20,224.45,29.269,6.545,12.828,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Moderate,normalized_cumulative,20,101.45,12.947,2.895,5.674,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,None,normalized_cumulative,20,4217.35,4.209,0.941,1.845,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Severe,normalized_cumulative,20,19.75,3.932,0.879,1.723,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Uncertain,normalized_cumulative,20,463.45,37.559,8.398,16.461,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,Mild,normalized_cumulative,13,180.538,21.384,5.931,11.624,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Moderate,normalized_cumulative,13,76.308,9.63,2.671,5.235,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,None,normalized_cumulative,13,4221.692,0.48,0.133,0.261,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Severe,normalized_cumulative,13,15.846,2.154,0.597,1.171,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Uncertain,normalized_cumulative,13,482.846,28.682,7.955,15.592,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,Mild,normalized_cumulative,15,196.667,16.723,4.318,8.463,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Moderate,normalized_cumulative,15,87.067,9.874,2.549,4.997,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,None,normalized_cumulative,15,4223.267,1.438,0.371,0.728,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Severe,normalized_cumulative,15,19.267,3.15,0.813,1.594,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Uncertain,normalized_cumulative,15,625.867,47.572,12.283,24.075,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Mild,normalized_cumulative,15,226.6,16.505,4.261,8.352,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Moderate,normalized_cumulative,15,101.933,11.215,2.896,5.676,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,None,normalized_cumulative,15,4223.2,1.656,0.428,0.838,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Severe,normalized_cumulative,15,20,4.44,1.146,2.247,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Uncertain,normalized_cumulative,15,651.667,50.481,13.034,25.547,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,Mild,normalized_cumulative,15,276.133,14.397,3.717,7.286,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Moderate,normalized_cumulative,15,123.933,10.074,2.601,5.098,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,None,normalized_cumulative,15,4221.8,1.014,0.262,0.513,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Severe,normalized_cumulative,15,17.533,3.044,0.786,1.541,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Uncertain,normalized_cumulative,15,862.867,45.663,11.79,23.109,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Mild,normalized_cumulative,20,191.65,10.796,2.414,4.732,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Moderate,normalized_cumulative,20,85.95,6.763,1.512,2.964,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,None,normalized_cumulative,20,4223.1,1.334,0.298,0.585,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Severe,normalized_cumulative,20,12.9,2.553,0.571,1.119,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Uncertain,normalized_cumulative,20,638.95,22.298,4.986,9.773,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,Mild,normalized_cumulative,10,393.9,11.666,3.689,7.231,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Moderate,normalized_cumulative,10,178.8,6.613,2.091,4.099,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,None,normalized_cumulative,10,4222.6,0.843,0.267,0.523,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Severe,normalized_cumulative,10,21,2.789,0.882,1.729,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Uncertain,normalized_cumulative,10,1010.2,31.272,9.889,19.383,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Mild,normalized_cumulative,20,313.3,9.206,2.058,4.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Moderate,normalized_cumulative,20,138.3,5.639,1.261,2.471,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,None,normalized_cumulative,20,4221.9,0.968,0.216,0.424,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Severe,normalized_cumulative,20,18.5,1.792,0.401,0.785,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Uncertain,normalized_cumulative,20,821.4,18.76,4.195,8.222,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Mild,normalized_cumulative,20,461.75,12.251,2.739,5.369,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Moderate,normalized_cumulative,20,208.05,7.756,1.734,3.399,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,None,normalized_cumulative,20,4222.65,1.182,0.264,0.518,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Severe,normalized_cumulative,20,32.35,2.207,0.494,0.967,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Uncertain,normalized_cumulative,20,1075.25,17.702,3.958,7.758,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Mild,normalized_cumulative,15,265.133,13.516,3.49,6.84,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Moderate,normalized_cumulative,15,119.933,10.559,2.726,5.344,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,None,normalized_cumulative,15,4225,1.558,0.402,0.789,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Severe,normalized_cumulative,15,22.2,4.617,1.192,2.336,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Uncertain,normalized_cumulative,15,680.333,60.887,15.721,30.813,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Mild,normalized_cumulative,15,304.333,9.522,2.459,4.819,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Moderate,normalized_cumulative,15,156.4,9.07,2.342,4.59,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,None,normalized_cumulative,15,4224.8,1.082,0.279,0.548,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Severe,normalized_cumulative,15,29.133,2.9,0.749,1.468,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Uncertain,normalized_cumulative,15,769.2,37.06,9.569,18.755,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,Mild,normalized_cumulative,5,356,12.39,5.541,10.86,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Moderate,normalized_cumulative,5,167.6,11.908,5.325,10.438,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,None,normalized_cumulative,5,4221.8,0.447,0.2,0.392,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Severe,normalized_cumulative,5,36.6,6.804,3.043,5.964,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Uncertain,normalized_cumulative,5,857,15.796,7.064,13.845,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Mild,normalized_cumulative,13,374.231,18.926,5.249,10.288,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Moderate,normalized_cumulative,13,172.077,8.626,2.392,4.689,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,None,normalized_cumulative,13,4222.538,4.557,1.264,2.477,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Severe,normalized_cumulative,13,30.077,3.068,0.851,1.668,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Uncertain,normalized_cumulative,13,923.615,34.05,9.444,18.51,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,Mild,normalized_cumulative,10,227.5,5.93,1.875,3.676,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Moderate,normalized_cumulative,10,107.5,3.689,1.167,2.287,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,None,normalized_cumulative,10,4225.2,1.229,0.389,0.762,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Severe,normalized_cumulative,10,23.6,2.171,0.686,1.345,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Uncertain,normalized_cumulative,10,539.3,12.499,3.953,7.747,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Mild,normalized_cumulative,10,453.8,23.021,7.28,14.268,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Moderate,normalized_cumulative,10,209.6,12.167,3.848,7.541,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,None,normalized_cumulative,10,4222.5,1.581,0.5,0.98,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Severe,normalized_cumulative,10,28.6,4.789,1.514,2.968,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Uncertain,normalized_cumulative,10,930.4,29.841,9.437,18.496,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Mild,normalized_cumulative,20,343.4,12.137,2.714,5.319,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Moderate,normalized_cumulative,20,164.55,7.265,1.625,3.184,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,None,normalized_cumulative,20,4231.7,1.49,0.333,0.653,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Severe,normalized_cumulative,20,35.3,3.213,0.719,1.408,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Uncertain,normalized_cumulative,20,487.75,15.627,3.494,6.849,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Mild,normalized_cumulative,10,321.2,10.358,3.275,6.42,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Moderate,normalized_cumulative,10,158.3,8.341,2.638,5.17,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,None,normalized_cumulative,10,4226.7,1.252,0.396,0.776,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Severe,normalized_cumulative,10,39.9,3.985,1.26,2.47,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Uncertain,normalized_cumulative,10,515,8.097,2.56,5.018,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.35,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.333,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.207,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.333,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.367,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.01,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.103,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.187,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.19,0.04,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.373,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.36,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.077,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.29,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.36,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.047,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.113,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.19,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.313,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.32,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.117,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.243,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.337,0.057,0.033,0.064,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.35,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.117,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.197,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.317,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.38,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.093,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.197,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.337,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.353,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.123,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.187,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.337,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.363,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.127,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.173,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.32,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.36,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.193,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.314,0.039,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.354,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,5,0.022,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.106,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.204,0.036,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.311,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.357,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct,10,0.014,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.075,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.243,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.334,0.043,0.019,0.037,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.304,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct,5,0.008,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.1,0.014,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.254,0.034,0.015,0.03,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.296,0.038,0.013,0.026,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.36,0.055,0.02,0.038,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,8,0.011,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.111,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.221,0.024,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.27,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.35,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.02,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.087,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.273,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.32,0.04,0.014,0.028,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.408,0.038,0.013,0.026,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,8,0.005,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.085,0.017,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.182,0.032,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.306,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.347,0.03,0.01,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,pct,10,0.008,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.089,0.017,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.25,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.318,0.024,0.008,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.331,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,10,0.01,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.099,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.242,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.318,0.041,0.013,0.025,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.341,0.051,0.016,0.032,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct,10,0.013,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.07,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.258,0.034,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.293,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.399,0.04,0.013,0.025,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,10,0.007,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.096,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.205,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.29,0.04,0.013,0.025,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.39,0.044,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,10,0.011,0.007,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.096,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.213,0.022,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.327,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.33,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.1,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.237,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.16,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.39,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.38,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.07,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.29,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.343,0.042,0.024,0.047,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.023,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.107,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.237,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.323,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.33,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.213,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.34,0.046,0.026,0.052,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.373,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.117,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.17,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.35,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.34,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.1,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.197,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.33,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.367,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.123,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.177,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.34,0.035,0.02,0.039,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.377,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.127,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.153,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.35,0.04,0.023,0.045,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.343,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.18,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.273,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.383,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.04,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.183,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.147,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.383,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.01,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.383,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.077,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.249,0.025,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.359,0.03,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,8,0.048,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.125,0.014,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.22,0.015,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.247,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.353,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.03,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.25,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.293,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.323,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0.03,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.127,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.227,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.277,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.343,0.042,0.024,0.047,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0.037,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.107,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.237,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.29,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.347,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,3,0.027,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.217,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.261,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.33,0.025,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct,10,0.039,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.124,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.246,0.027,0.008,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.257,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.353,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,3,0.037,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.13,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.223,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.283,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.35,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0.027,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.153,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.187,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.247,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.35,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.04,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.14,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.223,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.152,0.049,0.022,0.043,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.352,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,5,0.004,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.402,0.048,0.021,0.042,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.09,0.012,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.25,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.327,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.03,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.137,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.257,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.277,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.337,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.137,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.243,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.273,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.333,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.137,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.243,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.283,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.337,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,3,0.017,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.113,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.25,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.253,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.34,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.173,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.22,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.24,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.347,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.167,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.233,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct,9,0.379,0.042,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct,9,0.311,0.034,0.011,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct,9,0.009,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct,9,0.087,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct,9,0.214,0.024,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.333,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.343,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.103,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.217,0.045,0.026,0.051,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.217,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.417,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.297,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.07,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.327,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.313,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.027,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.107,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.227,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.327,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.32,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.113,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.233,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.313,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.457,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.117,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.337,0.05,0.029,0.057,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.37,0.052,0.03,0.059,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.103,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.183,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.316,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.438,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,5,0.006,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.112,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.128,0.013,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.317,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.45,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.117,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.113,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.34,0.02,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.378,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,5,0.006,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.104,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.172,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.3,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.38,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.1,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.207,0.04,0.023,0.046,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.317,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.35,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.087,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.24,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.336,0.025,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.396,0.041,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,pct,10,0.006,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.093,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.169,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.334,0.029,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.346,0.039,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,8,0.009,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.117,0.023,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.194,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.319,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.353,0.03,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct,10,0.011,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.068,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.249,0.022,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.352,0.033,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.398,0.043,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,10,0.003,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.091,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.156,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.333,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.317,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.033,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.13,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.187,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.162,0.04,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.346,0.03,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,5,0.002,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.408,0.044,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.082,0.013,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.313,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.31,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.057,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.097,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.223,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.316,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.308,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,10,0.007,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.089,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.28,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.343,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.343,0.045,0.026,0.051,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0.017,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.09,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.207,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.331,0.025,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.322,0.028,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,8,0.011,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.128,0.018,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.208,0.027,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.321,0.024,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.324,0.03,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct,10,0.02,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.07,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.265,0.03,0.009,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.337,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.333,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,3,0.017,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.203,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.333,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.337,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.207,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.337,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.327,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.02,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.117,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.2,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.362,0.013,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.334,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct,5,0.002,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.096,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.206,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.327,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.39,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.117,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.163,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.227,0.04,0.023,0.046,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.433,0.045,0.026,0.051,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.277,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.063,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.267,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.323,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.047,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.143,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.22,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.313,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.297,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.123,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.253,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.33,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.49,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.1,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.08,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.353,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.303,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,3,0.017,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.207,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.34,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.372,0.02,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct,10,0.014,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.079,0.014,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.195,0.022,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.329,0.017,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.504,0.013,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,8,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.105,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.062,0.01,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.31,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.51,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.06,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.303,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.443,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.14,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.35,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.317,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.113,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.217,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,5,0.316,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,5,0.378,0.029,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,5,0.002,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,5,0.108,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,5,0.196,0.034,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.227,0.047,0.027,0.053,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.44,0.056,0.032,0.063,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.25,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.08,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.287,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.337,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.04,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.123,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.213,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.317,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.323,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.003,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.13,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.227,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.33,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.52,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.11,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.04,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,10,0.367,0.04,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,10,0.35,0.04,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,10,0.009,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,10,0.078,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,10,0.196,0.04,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.249,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.56,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,8,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.135,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.056,0.021,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,8,0.238,0.018,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,8,0.555,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,8,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,8,0.162,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,8,0.045,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.34,0.053,0.031,0.06,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.403,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.103,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.153,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct,4,0.335,0.017,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct,4,0.37,0.018,0.009,0.018,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct,4,0.025,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct,4,0.1,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct,4,0.17,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.287,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.417,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.03,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.153,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.113,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.203,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.363,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.357,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.063,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.283,0.04,0.023,0.046,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.347,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct,3,0.06,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.147,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.163,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.28,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.327,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.02,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.173,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.2,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.267,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.39,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.163,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.167,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.32,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.317,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct,3,0.027,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.217,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.243,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.403,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct,3,0.033,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.23,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.09,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.25,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.413,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct,3,0.033,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.223,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.08,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct,3,0.24,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct,3,0.4,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct,3,0.033,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct,3,0.237,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct,3,0.09,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.295,0.04,0.016,0.032,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.367,0.058,0.024,0.046,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.008,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.115,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.215,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.275,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.355,0.042,0.017,0.034,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.035,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.107,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.228,0.034,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,8,0.334,0.039,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,8,0.37,0.028,0.01,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,8,0.004,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,8,0.088,0.019,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,8,0.205,0.023,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.33,0.02,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.328,0.019,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.005,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.093,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.243,0.028,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,8,0.321,0.045,0.016,0.031,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,8,0.339,0.038,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,8,0.014,0.013,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,8,0.071,0.011,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,8,0.255,0.033,0.012,0.023,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,4,0.29,0.032,0.016,0.031,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,4,0.335,0.051,0.025,0.05,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,4,0.035,0.013,0.006,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,4,0.095,0.021,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,4,0.245,0.017,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.33,0.028,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.327,0.032,0.013,0.026,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.008,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.093,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.242,0.021,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.312,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.342,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.028,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.084,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.234,0.03,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.25,0.034,0.014,0.027,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.367,0.033,0.014,0.027,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.045,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.127,0.018,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.212,0.033,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,4,0.34,0.014,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,4,0.335,0.033,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,4,0.007,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,4,0.095,0.026,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,4,0.222,0.005,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,9,0.354,0.02,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,9,0.332,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,9,0.011,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,9,0.078,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,9,0.224,0.028,0.009,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,3,0.347,0.055,0.032,0.062,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,3,0.333,0.042,0.024,0.047,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,3,0.013,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,3,0.083,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,3,0.223,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.33,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.426,0.034,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.002,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.086,0.021,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.156,0.036,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.333,0.03,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.358,0.048,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.01,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.115,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.183,0.032,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.328,0.026,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.374,0.032,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.012,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.092,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.194,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.288,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.348,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.03,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.08,0.019,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.254,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.337,0.031,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.365,0.031,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.01,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.097,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.192,0.022,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,3,0.317,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,3,0.37,0.044,0.025,0.049,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,3,0.013,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,3,0.083,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,3,0.217,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.316,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.324,0.021,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.044,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.086,0.013,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.23,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.345,0.025,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.363,0.048,0.019,0.038,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.01,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.093,0.021,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.188,0.026,0.01,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,10,0.323,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,10,0.301,0.027,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,10,0.025,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,10,0.097,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,10,0.254,0.03,0.009,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,10,0.323,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,10,0.306,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,10,0.012,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,10,0.083,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,10,0.276,0.033,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.317,0.028,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.315,0.021,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.038,0.017,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.103,0.018,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.227,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,6,0.333,0.023,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,6,0.333,0.021,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,6,0.013,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,6,0.118,0.012,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,6,0.202,0.013,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,3,0.33,0.044,0.025,0.049,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,3,0.303,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,3,0.03,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,3,0.09,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,3,0.247,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.324,0.021,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.506,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.102,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.068,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,10,0.37,0.04,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,10,0.353,0.037,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,10,0.009,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,10,0.069,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,10,0.199,0.041,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,10,0.33,0.027,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,10,0.329,0.047,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,10,0.028,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,10,0.066,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,10,0.247,0.041,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,10,0.369,0.034,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,10,0.347,0.034,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,10,0.009,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,10,0.069,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,10,0.206,0.039,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,10,0.361,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,10,0.338,0.038,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,10,0.013,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,10,0.047,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,10,0.241,0.035,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,4,0.358,0.038,0.019,0.037,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,4,0.368,0.043,0.022,0.043,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,4,0.01,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,4,0.065,0.019,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,4,0.2,0.047,0.023,0.046,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.286,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.322,0.041,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.05,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.118,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.224,0.021,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.35,0.028,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.346,0.054,0.024,0.047,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.004,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.082,0.02,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.218,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.254,0.023,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.564,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.134,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.048,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.26,0.028,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.562,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.134,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.044,0.024,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.304,0.04,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.414,0.036,0.016,0.032,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.104,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.178,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.298,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.324,0.027,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0.032,0.013,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.12,0.021,0.009,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.226,0.035,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,3,0.367,0.071,0.041,0.08,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,3,0.34,0.046,0.026,0.052,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,3,0.007,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,3,0.083,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,3,0.203,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,5,0.242,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,5,0.57,0.012,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,5,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,5,0.154,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,5,0.034,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct,4,0.248,0.017,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct,4,0.568,0.017,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,pct,4,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct,4,0.155,0.013,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct,4,0.03,0.022,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Mild,pct,10,0.325,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Moderate,pct,10,0.356,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,None,pct,10,0.002,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Severe,pct,10,0.12,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Uncertain,pct,10,0.197,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Mild,pct,20,0.316,0.034,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Moderate,pct,20,0.41,0.022,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,None,pct,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Severe,pct,20,0.136,0.015,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Uncertain,pct,20,0.138,0.026,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Mild,pct,20,0.316,0.025,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Moderate,pct,20,0.355,0.029,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,None,pct,20,0.008,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Severe,pct,20,0.104,0.022,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Uncertain,pct,20,0.216,0.029,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,Mild,pct,20,0.288,0.032,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Moderate,pct,20,0.385,0.04,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,None,pct,20,0.007,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Severe,pct,20,0.098,0.019,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Uncertain,pct,20,0.222,0.032,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Mild,pct,15,0.356,0.029,0.008,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Moderate,pct,15,0.367,0.037,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,None,pct,15,0.001,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Severe,pct,15,0.106,0.015,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Uncertain,pct,15,0.17,0.028,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,Mild,pct,20,0.322,0.024,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Moderate,pct,20,0.369,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,None,pct,20,0,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Severe,pct,20,0.13,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Uncertain,pct,20,0.178,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Mild,pct,20,0.271,0.016,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Moderate,pct,20,0.458,0.026,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,None,pct,20,0,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Severe,pct,20,0.178,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Uncertain,pct,20,0.093,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Mild,pct,20,0.285,0.044,0.01,0.019,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Moderate,pct,20,0.462,0.045,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,None,pct,20,0.003,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Severe,pct,20,0.153,0.025,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Uncertain,pct,20,0.098,0.029,0.006,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Mild,pct,10,0.254,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Moderate,pct,10,0.505,0.034,0.011,0.021,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,None,pct,10,0.008,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Severe,pct,10,0.201,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Uncertain,pct,10,0.032,0.015,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Mild,pct,20,0.269,0.034,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Moderate,pct,20,0.333,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,None,pct,20,0.04,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Severe,pct,20,0.128,0.017,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Uncertain,pct,20,0.231,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Mild,pct,20,0.27,0.034,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Moderate,pct,20,0.329,0.029,0.006,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,None,pct,20,0.016,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Severe,pct,20,0.141,0.02,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Uncertain,pct,20,0.244,0.024,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Mild,pct,10,0.285,0.04,0.013,0.025,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Moderate,pct,10,0.448,0.036,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,None,pct,10,0.016,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Severe,pct,10,0.161,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Uncertain,pct,10,0.09,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Mild,pct,10,0.319,0.027,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Moderate,pct,10,0.444,0.022,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,None,pct,10,0.003,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Severe,pct,10,0.111,0.014,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Uncertain,pct,10,0.123,0.014,0.004,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Mild,pct,20,0.348,0.031,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Moderate,pct,20,0.41,0.042,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,None,pct,20,0.002,0.006,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Severe,pct,20,0.092,0.02,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Uncertain,pct,20,0.148,0.031,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Mild,pct,20,0.336,0.03,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Moderate,pct,20,0.322,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,None,pct,20,0.018,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Severe,pct,20,0.102,0.022,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Uncertain,pct,20,0.224,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Mild,pct,20,0.289,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Moderate,pct,20,0.38,0.026,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,None,pct,20,0.029,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Severe,pct,20,0.133,0.016,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Uncertain,pct,20,0.169,0.039,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,Mild,pct,13,0.307,0.024,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Moderate,pct,13,0.315,0.029,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,None,pct,13,0.041,0.013,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Severe,pct,13,0.121,0.015,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Uncertain,pct,13,0.217,0.033,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,Mild,pct,15,0.326,0.027,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Moderate,pct,15,0.297,0.039,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,None,pct,15,0.02,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Severe,pct,15,0.141,0.022,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Uncertain,pct,15,0.216,0.027,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Mild,pct,15,0.301,0.043,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Moderate,pct,15,0.368,0.036,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,None,pct,15,0.015,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Severe,pct,15,0.113,0.023,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Uncertain,pct,15,0.203,0.022,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,Mild,pct,15,0.31,0.037,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Moderate,pct,15,0.457,0.038,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,None,pct,15,0.001,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Severe,pct,15,0.118,0.016,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Uncertain,pct,15,0.115,0.022,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Mild,pct,20,0.321,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Moderate,pct,20,0.351,0.026,0.006,0.011,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,None,pct,20,0.008,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Severe,pct,20,0.096,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Uncertain,pct,20,0.224,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,Mild,pct,10,0.288,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Moderate,pct,10,0.546,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,None,pct,10,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Severe,pct,10,0.122,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Uncertain,pct,10,0.044,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Mild,pct,20,0.33,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Moderate,pct,20,0.502,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,None,pct,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Severe,pct,20,0.098,0.013,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Uncertain,pct,20,0.069,0.021,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Mild,pct,20,0.238,0.021,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Moderate,pct,20,0.564,0.016,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,None,pct,20,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Severe,pct,20,0.158,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Uncertain,pct,20,0.04,0.011,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Mild,pct,15,0.304,0.025,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Moderate,pct,15,0.417,0.018,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,None,pct,15,0.007,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Severe,pct,15,0.143,0.019,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Uncertain,pct,15,0.129,0.02,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Mild,pct,15,0.217,0.024,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Moderate,pct,15,0.485,0.022,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,None,pct,15,0.003,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Severe,pct,15,0.164,0.02,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Uncertain,pct,15,0.132,0.015,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,Mild,pct,5,0.318,0.02,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Moderate,pct,5,0.41,0.028,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,None,pct,5,0.002,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Severe,pct,5,0.162,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Uncertain,pct,5,0.108,0.028,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Mild,pct,13,0.276,0.03,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Moderate,pct,13,0.492,0.031,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,None,pct,13,0.001,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Severe,pct,13,0.175,0.022,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Uncertain,pct,13,0.055,0.015,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,Mild,pct,10,0.291,0.027,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Moderate,pct,10,0.35,0.028,0.009,0.018,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,None,pct,10,0.018,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Severe,pct,10,0.18,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Uncertain,pct,10,0.161,0.014,0.005,0.009,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Mild,pct,10,0.232,0.041,0.013,0.025,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Moderate,pct,10,0.551,0.038,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,None,pct,10,0.006,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Severe,pct,10,0.166,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Uncertain,pct,10,0.045,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Mild,pct,20,0.261,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Moderate,pct,20,0.402,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,None,pct,20,0.037,0.011,0.002,0.005,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Severe,pct,20,0.222,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Uncertain,pct,20,0.078,0.018,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Mild,pct,10,0.282,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Moderate,pct,10,0.376,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,None,pct,10,0.036,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Severe,pct,10,0.209,0.033,0.011,0.021,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Uncertain,pct,10,0.097,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.793,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.443,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Claude 3.7 Sonnet,NA,Anthropic + Anthropic,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.803,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.47,0.044,0.025,0.049,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.103,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.99,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,DeepSeek R1,NA,Anthropic + DeepSeek,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.923,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.733,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.36,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-4.1,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.763,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.473,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.113,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.953,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.75,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.437,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.117,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,GPT-5 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.803,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.467,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.117,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.0 Flash,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.79,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.473,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.093,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.813,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.477,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.123,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Maverick,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.827,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.49,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.127,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,Llama 4 Scout,NA,Anthropic + Meta,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.8,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.48,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,o3 mini,NA,Anthropic + OpenAI,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.774,0.042,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.46,0.012,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.106,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.978,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,Gemini 2.5 Pro,NA,Anthropic + Google,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.743,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.432,0.024,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.075,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.986,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,LiSA 1.0,NA,Anthropic + AMBOSS,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.738,0.033,0.015,0.029,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.404,0.027,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.1,0.014,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.992,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Claude 3.7 Sonnet,NA,DeepSeek + Anthropic,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.768,0.027,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.471,0.05,0.018,0.035,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.111,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,0.989,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,NA,DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.707,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.437,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.087,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.98,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,GPT-5 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.812,0.031,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.492,0.039,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.085,0.017,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,0.995,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.742,0.02,0.006,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.436,0.035,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.089,0.017,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.992,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.748,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.43,0.024,0.007,0.015,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.099,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.99,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,NA,DeepSeek + Google,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.729,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.411,0.046,0.015,0.029,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.07,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.987,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,LiSA 1.0,NA,DeepSeek + AMBOSS,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.788,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.495,0.035,0.011,0.022,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.096,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.993,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Llama 4 Maverick,NA,DeepSeek + Meta,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.776,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.486,0.042,0.013,0.026,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.096,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.989,0.007,0.002,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,o3 mini,NA,DeepSeek + OpenAI,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.757,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.43,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.1,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.93,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.77,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.38,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.74,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.45,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.107,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.977,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.773,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.45,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.83,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.49,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.117,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.79,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.44,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.1,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.82,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.49,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.123,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.843,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.503,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.127,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.813,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.463,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-4.1,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.777,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.503,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.96,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.913,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.767,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.383,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.99,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.732,0.021,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.484,0.022,0.008,0.015,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.125,0.014,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,0.952,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.72,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.473,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.97,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.743,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.45,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.127,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.97,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.727,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.45,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.107,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.963,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Flash,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.757,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.467,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.973,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.715,0.025,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.454,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.124,0.013,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.961,0.012,0.004,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,LiSA 1.0,NA,OpenAI + AMBOSS,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.74,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.483,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.13,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.963,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.787,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.503,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.153,0.038,0.022,0.043,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.973,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.737,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.49,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.14,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.96,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.906,0.013,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.754,0.047,0.021,0.041,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.402,0.048,0.021,0.042,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.996,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.713,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.463,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.137,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.97,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.75,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.473,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.137,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.743,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.47,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.137,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.733,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.45,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.113,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.983,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.767,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.513,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.173,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.753,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.513,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.167,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,9,0.777,0.023,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,9,0.398,0.034,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,9,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,9,0.087,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,9,0.991,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,NA,Google + Anthropic,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0.04,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.447,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.103,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.93,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.713,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.297,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.747,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.42,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.107,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.973,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.76,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.433,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.113,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.88,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.567,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.81,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.473,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.103,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.866,0.017,0.007,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.55,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.112,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.994,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.883,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.567,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.117,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.822,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.482,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.104,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.994,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0.046,0.026,0.052,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.48,0.04,0.023,0.045,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.1,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.753,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.437,0.032,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.087,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.825,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.489,0.03,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.093,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.994,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.797,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.464,0.036,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.117,0.023,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,0.991,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.74,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.421,0.03,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.068,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.989,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.841,0.029,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.489,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.091,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.997,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.447,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.13,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.967,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,DeepSeek R1,NA,Google + DeepSeek,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.916,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.754,0.042,0.019,0.036,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.408,0.044,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.998,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-4.1,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.72,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.407,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.097,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.943,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.713,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.397,0.032,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.089,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.993,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.777,0.029,0.017,0.033,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.433,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.09,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.983,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.0 Flash,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.781,0.023,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.45,0.037,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.128,0.018,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,0.989,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,NA,Google + Google,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.715,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.394,0.031,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.07,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.98,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,LiSA 1.0,NA,Google + AMBOSS,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.443,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.983,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Maverick,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.447,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Llama 4 Scout,NA,Google + Meta,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.443,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.117,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.98,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,o3 mini,NA,Google + OpenAI,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.792,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.43,0.019,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.096,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.998,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.833,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.507,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.117,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.937,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.71,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.277,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.733,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.467,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.143,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.953,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.733,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.42,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.123,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.92,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.59,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.1,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.777,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.423,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.983,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.791,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.451,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.079,0.014,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.986,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,LiSA 1.0,NA,Meta + AMBOSS,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.938,0.01,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.609,0.015,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.105,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.94,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.63,0.02,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.857,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.553,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.11,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.43,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.113,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Claude 3.7 Sonnet,NA,Meta + Anthropic,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,5,0.802,0.031,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,5,0.486,0.034,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,5,0.108,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,5,0.998,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,DeepSeek R1,NA,Meta + DeepSeek,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.917,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.69,0.072,0.042,0.082,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.25,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-4.1,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.747,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.46,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.123,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.96,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.77,0.036,0.021,0.041,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.453,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.13,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.997,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,GPT-5 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.96,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.63,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.11,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.0 Flash,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,10,0.795,0.035,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,10,0.428,0.038,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,10,0.078,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,10,0.991,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,NA,Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.944,0.021,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.695,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.135,0.009,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,8,0.955,0.02,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,8,0.718,0.017,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,8,0.162,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,NA,Meta + Meta,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.847,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.507,0.035,0.02,0.04,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.103,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,o3 mini,NA,Meta + OpenAI,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,4,0.805,0.01,0.005,0.01,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,4,0.47,0.014,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,4,0.1,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,4,0.975,0.006,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,Claude 3.7 Sonnet,NA,OpenAI + Anthropic,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.857,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.57,0.026,0.015,0.03,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.153,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.97,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,DeepSeek R1,NA,OpenAI + DeepSeek,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.923,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.72,0.03,0.017,0.034,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.357,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-4.1,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.777,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.493,0.023,0.013,0.026,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.147,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.94,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.78,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.5,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.173,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.98,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,GPT-5 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.82,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.553,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.163,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.0 Flash,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.757,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.437,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.12,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.973,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,o3 mini,Gemini 2.5 Pro,NA,OpenAI + Google,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.877,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.633,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.23,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.967,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Maverick,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.887,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.637,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.223,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.967,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,Llama 4 Scout,NA,OpenAI + Meta,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Mild,pct_cumulative,3,0.877,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Moderate,pct_cumulative,3,0.637,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Severe,pct_cumulative,3,0.237,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,Uncertain,pct_cumulative,3,0.967,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,o3 mini,o3 mini,NA,OpenAI + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.777,0.027,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.482,0.053,0.022,0.042,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.115,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.992,0.008,0.003,0.006,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,DeepSeek R1,DeepSeek + DeepSeek + DeepSeek,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.737,0.029,0.012,0.024,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.462,0.032,0.013,0.026,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.107,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.965,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,DeepSeek R1,GPT-5,DeepSeek + DeepSeek + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,8,0.791,0.024,0.009,0.017,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,8,0.458,0.03,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,8,0.088,0.019,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,8,0.996,0.007,0.003,0.005,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Claude 3.7 Sonnet,DeepSeek + Google + Anthropic,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.752,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.422,0.019,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.093,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.995,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,8,0.731,0.029,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,8,0.41,0.043,0.015,0.03,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,8,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,8,0.071,0.011,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,8,0.986,0.013,0.005,0.009,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.0 Flash,LiSA 1.0,DeepSeek + Google + AMBOSS,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,4,0.72,0.014,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,4,0.43,0.039,0.02,0.038,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,4,0.095,0.021,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,4,0.965,0.013,0.006,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.75,0.022,0.009,0.018,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.42,0.025,0.01,0.02,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.093,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.992,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek + Google + Google,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.738,0.031,0.014,0.027,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.426,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.084,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.972,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,Gemini 2.5 Pro,GPT-5,DeepSeek + Google + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.743,0.028,0.011,0.022,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.493,0.021,0.008,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.127,0.018,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.955,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,GPT-5,GPT-5,OpenAI + OpenAI + OpenAI,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,4,0.77,0.012,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,4,0.43,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,4,0.095,0.026,0.013,0.026,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,4,0.992,0.01,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,Gemini 2.5 Pro,Google + Anthropic + Google,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,9,0.764,0.025,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,9,0.41,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,9,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,9,0.078,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,9,0.989,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Claude 3.7 Sonnet,LiSA 1.0,Google + Anthropic + AMBOSS,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,3,0.763,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,3,0.417,0.04,0.023,0.046,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,3,0.083,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,3,0.987,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.842,0.034,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.512,0.022,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.086,0.021,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.998,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Flash,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.807,0.034,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.473,0.039,0.016,0.031,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.115,0.026,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.99,0.009,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Flash,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.794,0.024,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.466,0.032,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.092,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.988,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,DeepSeek R1,Google + Google + DeepSeek,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.716,0.021,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.428,0.018,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.08,0.019,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.97,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.798,0.021,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.462,0.035,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.097,0.015,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.99,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,3,0.77,0.035,0.02,0.039,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,3,0.453,0.031,0.018,0.035,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,3,0.083,0.025,0.015,0.028,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,3,0.987,0.012,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,DeepSeek R1,Google + Meta + DeepSeek,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.726,0.018,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.41,0.023,0.01,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.086,0.013,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.956,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,GPT-5,Google + Meta + OpenAI,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.802,0.028,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.457,0.037,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.093,0.021,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.99,0.006,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,Llama 4 Maverick,Gemini 2.5 Pro,Google + Meta + Google,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,10,0.721,0.025,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,10,0.398,0.036,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,10,0.097,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,10,0.975,0.01,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,GPT-5,Google + OpenAI + OpenAI,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,10,0.712,0.027,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,10,0.389,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,10,0.083,0.02,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,10,0.988,0.008,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,GPT-5 mini,LiSA 1.0,Google + OpenAI + AMBOSS,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.735,0.014,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.418,0.026,0.01,0.021,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.103,0.018,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.962,0.017,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,GPT-5,Google + Google + OpenAI,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,6,0.785,0.01,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,6,0.452,0.029,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,6,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,6,0.118,0.012,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,6,0.987,0.005,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,Gemini 2.5 Pro,Gemini 2.5 Pro,Google + Google + Google,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,3,0.723,0.04,0.023,0.046,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,3,0.393,0.021,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,3,0.09,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,3,0.97,0.01,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.932,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.608,0.018,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.102,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,10,0.792,0.036,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,10,0.422,0.036,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,10,0.069,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,10,0.991,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,DeepSeek R1,Meta + Google + DeepSeek,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,10,0.725,0.036,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,10,0.395,0.045,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,10,0.066,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,10,0.972,0.011,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,GPT-5,Meta + Google + OpenAI,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,10,0.785,0.035,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,10,0.416,0.033,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,10,0.069,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,10,0.991,0.009,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Gemini 2.5 Pro,Meta + Google + Google,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,10,0.746,0.03,0.009,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,10,0.385,0.037,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,10,0.047,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,10,0.987,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,LiSA 1.0,Meta + Google + AMBOSS,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,4,0.79,0.039,0.02,0.038,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,4,0.432,0.054,0.027,0.053,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,4,0.065,0.019,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,4,0.99,0.008,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Gemini 2.5 Pro,Llama 4 Maverick,Meta + Google + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.726,0.026,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.44,0.032,0.014,0.028,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.118,0.016,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.95,0.007,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.778,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.428,0.046,0.021,0.04,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.082,0.02,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.996,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.952,0.019,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.698,0.008,0.004,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.134,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.956,0.024,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.696,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.134,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Maverick,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.822,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.518,0.041,0.018,0.036,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.104,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,DeepSeek R1,Meta + Meta + DeepSeek,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.742,0.026,0.012,0.023,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.444,0.033,0.015,0.029,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.12,0.021,0.009,0.019,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,0.968,0.013,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,GPT-5,Meta + Meta + OpenAI,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,3,0.79,0.017,0.01,0.02,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,3,0.423,0.059,0.034,0.066,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,3,0.083,0.015,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,3,0.993,0.006,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Gemini 2.5 Pro,Meta + Meta + Google,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,5,0.966,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,5,0.724,0.009,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,5,0.154,0.011,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Maverick,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Mild,pct_cumulative,4,0.97,0.022,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Moderate,pct_cumulative,4,0.723,0.005,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,None,pct_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Severe,pct_cumulative,4,0.155,0.013,0.006,0.013,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Uncertain,pct_cumulative,4,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,Llama 4 Scout,Llama 4 Scout,Meta + Meta + Meta,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Mild,pct_cumulative,10,0.801,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Moderate,pct_cumulative,10,0.476,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Severe,pct_cumulative,10,0.12,0.008,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude 3.7 Sonnet,Solo Models,Advisor,Uncertain,pct_cumulative,10,0.998,0.004,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Claude 3.7 Sonnet,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Mild,pct_cumulative,20,0.862,0.026,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Moderate,pct_cumulative,20,0.545,0.028,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Severe,pct_cumulative,20,0.136,0.015,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Haiku 4.5,Solo Models,Advisor,Uncertain,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Haiku 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Mild,pct_cumulative,20,0.775,0.028,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Moderate,pct_cumulative,20,0.46,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Severe,pct_cumulative,20,0.104,0.022,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -Claude Sonnet 4.5,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.992,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Claude Sonnet 4.5,NA,NA,Anthropic,NA,NA -DeepSeek R1,Solo Models,Advisor,Mild,pct_cumulative,20,0.77,0.03,0.007,0.013,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Moderate,pct_cumulative,20,0.483,0.036,0.008,0.016,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Severe,pct_cumulative,20,0.098,0.019,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek R1,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.993,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,DeepSeek R1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Mild,pct_cumulative,15,0.829,0.028,0.007,0.014,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Moderate,pct_cumulative,15,0.473,0.038,0.01,0.019,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,None,pct_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Severe,pct_cumulative,15,0.106,0.015,0.004,0.008,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -DeepSeek V3.1,Solo Models,Advisor,Uncertain,pct_cumulative,15,0.999,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,DeepSeek V3.1,NA,NA,DeepSeek,NA,NA -GPT-4.1,Solo Models,Advisor,Mild,pct_cumulative,20,0.821,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Moderate,pct_cumulative,20,0.498,0.022,0.005,0.01,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Severe,pct_cumulative,20,0.13,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1,Solo Models,Advisor,Uncertain,pct_cumulative,20,1,0.002,0,0.001,NO,AllCases,Unanimous,AllHarm,GPT-4.1,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Mild,pct_cumulative,20,0.906,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Moderate,pct_cumulative,20,0.635,0.021,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Severe,pct_cumulative,20,0.178,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4.1 mini,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.999,0.002,0.001,0.001,NO,AllCases,Unanimous,AllHarm,GPT-4.1 mini,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Mild,pct_cumulative,20,0.9,0.029,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Moderate,pct_cumulative,20,0.615,0.056,0.013,0.025,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Severe,pct_cumulative,20,0.153,0.025,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.997,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,GPT-4o,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Mild,pct_cumulative,10,0.96,0.014,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Moderate,pct_cumulative,10,0.706,0.026,0.008,0.016,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Severe,pct_cumulative,10,0.201,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-4o mini,Solo Models,Advisor,Uncertain,pct_cumulative,10,0.992,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-4o mini,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Mild,pct_cumulative,20,0.73,0.021,0.005,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Moderate,pct_cumulative,20,0.46,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Severe,pct_cumulative,20,0.128,0.017,0.004,0.008,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.961,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,GPT-5,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Mild,pct_cumulative,20,0.74,0.024,0.005,0.011,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Moderate,pct_cumulative,20,0.47,0.038,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Severe,pct_cumulative,20,0.141,0.02,0.004,0.009,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 mini,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.984,0.009,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 mini,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Mild,pct_cumulative,10,0.894,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Moderate,pct_cumulative,10,0.609,0.028,0.009,0.017,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Severe,pct_cumulative,10,0.161,0.021,0.007,0.013,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -GPT-5 nano,Solo Models,Advisor,Uncertain,pct_cumulative,10,0.984,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,GPT-5 nano,NA,NA,OpenAI,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Mild,pct_cumulative,10,0.874,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Moderate,pct_cumulative,10,0.555,0.016,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Severe,pct_cumulative,10,0.111,0.014,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.0 Flash,Solo Models,Advisor,Uncertain,pct_cumulative,10,0.997,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 2.0 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Mild,pct_cumulative,20,0.85,0.032,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Moderate,pct_cumulative,20,0.502,0.035,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Severe,pct_cumulative,20,0.092,0.02,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Flash,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.998,0.006,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Flash,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Mild,pct_cumulative,20,0.759,0.025,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Moderate,pct_cumulative,20,0.424,0.033,0.007,0.015,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Severe,pct_cumulative,20,0.102,0.022,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 2.5 Pro,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.982,0.012,0.003,0.005,NO,AllCases,Unanimous,AllHarm,Gemini 2.5 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Mild,pct_cumulative,20,0.802,0.038,0.008,0.017,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Moderate,pct_cumulative,20,0.513,0.032,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Severe,pct_cumulative,20,0.133,0.016,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Gemini 3 Pro,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.971,0.006,0.001,0.003,NO,AllCases,Unanimous,AllHarm,Gemini 3 Pro,NA,NA,Google,NA,NA -Glass Health 4.0,Solo Models,Advisor,Mild,pct_cumulative,13,0.742,0.027,0.008,0.015,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Moderate,pct_cumulative,13,0.435,0.031,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,None,pct_cumulative,13,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Severe,pct_cumulative,13,0.121,0.015,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Glass Health 4.0,Solo Models,Advisor,Uncertain,pct_cumulative,13,0.959,0.013,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Glass Health 4.0,NA,NA,Glass Health,NA,NA -Grok 4,Solo Models,Advisor,Mild,pct_cumulative,15,0.764,0.028,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Moderate,pct_cumulative,15,0.438,0.042,0.011,0.021,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,None,pct_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Severe,pct_cumulative,15,0.141,0.022,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4,Solo Models,Advisor,Uncertain,pct_cumulative,15,0.98,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Grok 4,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Mild,pct_cumulative,15,0.782,0.021,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Moderate,pct_cumulative,15,0.481,0.047,0.012,0.024,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,None,pct_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Severe,pct_cumulative,15,0.113,0.023,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Grok 4 Fast,Solo Models,Advisor,Uncertain,pct_cumulative,15,0.985,0.007,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Grok 4 Fast,NA,NA,xAI,NA,NA -Kimi K2,Solo Models,Advisor,Mild,pct_cumulative,15,0.885,0.022,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Moderate,pct_cumulative,15,0.575,0.035,0.009,0.018,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,None,pct_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Severe,pct_cumulative,15,0.118,0.016,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -Kimi K2,Solo Models,Advisor,Uncertain,pct_cumulative,15,0.999,0.003,0.001,0.001,NO,AllCases,Unanimous,AllHarm,Kimi K2,NA,NA,Moonshot AI,NA,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Mild,pct_cumulative,20,0.768,0.025,0.006,0.011,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Moderate,pct_cumulative,20,0.447,0.027,0.006,0.012,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Severe,pct_cumulative,20,0.096,0.017,0.004,0.007,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -AMBOSS LiSA 1.0,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.992,0.007,0.002,0.003,NO,AllCases,Unanimous,AllHarm,LiSA 1.0,NA,NA,AMBOSS,AMBOSS LiSA 1.0,NA -Llama 3.3 70b,Solo Models,Advisor,Mild,pct_cumulative,10,0.956,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Moderate,pct_cumulative,10,0.668,0.006,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Severe,pct_cumulative,10,0.122,0.012,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 3.3 70b,Solo Models,Advisor,Uncertain,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 3.3 70b,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Mild,pct_cumulative,20,0.931,0.021,0.005,0.009,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Moderate,pct_cumulative,20,0.601,0.023,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Severe,pct_cumulative,20,0.098,0.013,0.003,0.006,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Maverick,Solo Models,Advisor,Uncertain,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Maverick,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Mild,pct_cumulative,20,0.96,0.011,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Moderate,pct_cumulative,20,0.721,0.017,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Severe,pct_cumulative,20,0.158,0.01,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Llama 4 Scout,Solo Models,Advisor,Uncertain,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Llama 4 Scout,NA,NA,Meta,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Mild,pct_cumulative,15,0.864,0.022,0.006,0.011,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Moderate,pct_cumulative,15,0.56,0.024,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,None,pct_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Severe,pct_cumulative,15,0.143,0.019,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Large 2.1,Solo Models,Advisor,Uncertain,pct_cumulative,15,0.993,0.008,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Mistral Large 2.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Mild,pct_cumulative,15,0.865,0.016,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Moderate,pct_cumulative,15,0.649,0.028,0.007,0.014,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,None,pct_cumulative,15,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Severe,pct_cumulative,15,0.164,0.02,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Mistral Medium 3.1,Solo Models,Advisor,Uncertain,pct_cumulative,15,0.997,0.005,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Mistral Medium 3.1,NA,NA,Mistral AI,NA,NA -Qwen3 235B,Solo Models,Advisor,Mild,pct_cumulative,5,0.89,0.025,0.011,0.022,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Moderate,pct_cumulative,5,0.572,0.03,0.014,0.027,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,None,pct_cumulative,5,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Severe,pct_cumulative,5,0.162,0.015,0.007,0.013,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 235B,Solo Models,Advisor,Uncertain,pct_cumulative,5,0.998,0.004,0.002,0.004,NO,AllCases,Unanimous,AllHarm,Qwen3 235B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Mild,pct_cumulative,13,0.944,0.016,0.004,0.008,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Moderate,pct_cumulative,13,0.668,0.032,0.009,0.017,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,None,pct_cumulative,13,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Severe,pct_cumulative,13,0.175,0.022,0.006,0.012,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -Qwen3 32B,Solo Models,Advisor,Uncertain,pct_cumulative,13,0.999,0.003,0.001,0.002,NO,AllCases,Unanimous,AllHarm,Qwen3 32B,NA,NA,Alibaba,NA,NA -o1,Solo Models,Advisor,Mild,pct_cumulative,10,0.821,0.017,0.005,0.011,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Moderate,pct_cumulative,10,0.53,0.033,0.011,0.021,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Severe,pct_cumulative,10,0.18,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1,Solo Models,Advisor,Uncertain,pct_cumulative,10,0.982,0.019,0.006,0.012,NO,AllCases,Unanimous,AllHarm,o1,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Mild,pct_cumulative,10,0.949,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Moderate,pct_cumulative,10,0.717,0.046,0.014,0.028,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Severe,pct_cumulative,10,0.166,0.023,0.007,0.014,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o1 mini,Solo Models,Advisor,Uncertain,pct_cumulative,10,0.994,0.005,0.002,0.003,NO,AllCases,Unanimous,AllHarm,o1 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Mild,pct_cumulative,20,0.885,0.016,0.004,0.007,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Moderate,pct_cumulative,20,0.624,0.025,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,None,pct_cumulative,20,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Severe,pct_cumulative,20,0.222,0.014,0.003,0.006,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o3 mini,Solo Models,Advisor,Uncertain,pct_cumulative,20,0.963,0.011,0.002,0.005,NO,AllCases,Unanimous,AllHarm,o3 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Mild,pct_cumulative,10,0.867,0.018,0.006,0.011,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Moderate,pct_cumulative,10,0.585,0.025,0.008,0.016,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,None,pct_cumulative,10,1,0,0,0,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Severe,pct_cumulative,10,0.209,0.033,0.011,0.021,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -o4 mini,Solo Models,Advisor,Uncertain,pct_cumulative,10,0.964,0.011,0.003,0.007,NO,AllCases,Unanimous,AllHarm,o4 mini,NA,NA,OpenAI,NA,NA -Human Generalist Physicians,Solo Models,Human,All,Accuracy,3,0.871,0.009,0.005,0.01,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,Completeness,3,0.333,0.022,0.013,0.025,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,Emergencies,3,0.167,0.289,0.167,0.327,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,Escalation,3,0.545,0.12,0.069,0.136,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,F1,3,0.551,0.027,0.015,0.03,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,OverallScore,3,0.46,0.004,0.002,0.005,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,Precision,3,0.492,0.052,0.03,0.059,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,Recall,3,0.629,0.017,0.01,0.019,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,Restraint,3,0.559,0.054,0.031,0.062,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,All,Safety,3,0.586,0.014,0.008,0.016,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Mild,nnh,3,3.131,0.35,0.202,0.396,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Moderate,nnh,3,2.109,0.218,0.126,0.247,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,None,nnh,3,100,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Severe,nnh,3,6.167,1.258,0.726,1.424,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Uncertain,nnh,3,30,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Mild,nnh_cumulative,3,1.034,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Moderate,nnh_cumulative,3,1.556,0.096,0.056,0.109,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,None,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Severe,nnh_cumulative,3,6.167,1.258,0.726,1.424,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Uncertain,nnh_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Mild,normalized,3,192.222,5.092,2.94,5.762,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Moderate,normalized,3,115.556,26.943,15.556,30.489,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,None,normalized,3,3557.778,85.266,49.229,96.488,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Severe,normalized,3,33.333,6.667,3.849,7.544,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Uncertain,normalized,3,325.556,72.444,41.826,81.978,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Mild,normalized_cumulative,3,341.111,23.649,13.654,26.761,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Moderate,normalized_cumulative,3,148.889,27.756,16.025,31.408,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,None,normalized_cumulative,3,4224.444,1.925,1.111,2.178,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Severe,normalized_cumulative,3,33.333,6.667,3.849,7.544,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Uncertain,normalized_cumulative,3,666.667,83.533,48.228,94.527,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Mild,pct,3,0.322,0.038,0.022,0.044,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Moderate,pct,3,0.478,0.051,0.029,0.058,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,None,pct,3,0,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Severe,pct,3,0.167,0.033,0.019,0.038,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Uncertain,pct,3,0.033,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Mild,pct_cumulative,3,0.967,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Moderate,pct_cumulative,3,0.644,0.038,0.022,0.044,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,None,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Severe,pct_cumulative,3,0.167,0.033,0.019,0.038,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA -Human Generalist Physicians,Solo Models,Human,Uncertain,pct_cumulative,3,1,0,0,0,NO,AllCases,Unanimous,AllHarm,Human,NA,NA,Human,Human Generalist Physicians,NA +Model,Team,Condition,Provider,Metric,mean,ci +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Completeness,0.569,0.016 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.569,0.016 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Completeness,0.492,0.048 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.492,0.048 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.618,0.008 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.618,0.008 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.488,0.024 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.488,0.024 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.553,0.044 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.553,0.044 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Completeness,0.589,0.035 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.589,0.035 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Completeness,0.52,0.016 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.016 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Completeness,0.549,0.024 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.024 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Completeness,0.549,0.024 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.024 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.516,0.021 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.021 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Completeness,0.517,0.016 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.517,0.016 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Completeness,0.555,0.018 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.555,0.018 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Completeness,0.595,0.03 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.595,0.03 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Completeness,0.509,0.025 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.509,0.025 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Completeness,0.598,0.05 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.598,0.05 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Completeness,0.604,0.019 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.604,0.019 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Completeness,0.606,0.024 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.606,0.024 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Completeness,0.559,0.016 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.559,0.016 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Completeness,0.646,0.023 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.646,0.023 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Completeness,0.565,0.02 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.565,0.02 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Completeness,0.546,0.024 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.546,0.024 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Completeness,0.593,0.048 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.593,0.048 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.699,0.042 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.699,0.042 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.512,0.05 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.512,0.05 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.585,0.014 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.585,0.014 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.638,0.016 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.638,0.016 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.589,0.035 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.589,0.035 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.622,0 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.622,0.014 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0.014 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.602,0.008 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.602,0.008 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Completeness,0.419,0.016 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.419,0.016 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.622,0.037 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0.037 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.468,0.024 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.468,0.024 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.5,0.014 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.5,0.014 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.52,0.029 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.029 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.516,0.016 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.016 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.516,0.016 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.016 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Completeness,0.51,0.018 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.51,0.018 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.467,0.042 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.467,0.042 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.496,0.032 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.496,0.032 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.455,0.042 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.455,0.042 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.659,0.024 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.659,0.024 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.492,0.008 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.492,0.008 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.516,0.035 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.035 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.537,0.05 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.05 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.557,0.021 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.557,0.021 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.472,0.029 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.472,0.029 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.459,0.029 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.459,0.029 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Completeness,0.634,0.017 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.634,0.017 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Completeness,0.553,0.048 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.553,0.048 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.703,0.044 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.703,0.044 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.545,0.016 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.545,0.016 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.602,0.029 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.602,0.029 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.659,0.024 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.659,0.024 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.533,0.029 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.533,0.029 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.663,0.01 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.663,0.01 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.663,0.016 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.663,0.016 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.627,0.026 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.627,0.026 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Completeness,0.537,0.041 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.041 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.634,0.024 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.634,0.024 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.615,0.012 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.615,0.012 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.553,0.021 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.553,0.021 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Completeness,0.673,0.021 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.673,0.021 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.622,0.02 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0.02 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Completeness,0.496,0.008 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.496,0.008 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.729,0.04 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.729,0.04 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.549,0.028 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.028 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.591,0.024 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.591,0.024 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.589,0.035 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.589,0.035 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.541,0.027 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.541,0.027 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Completeness,0.618,0.028 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.618,0.028 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.537,0.014 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.014 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.537,0.014 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.014 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.52,0.016 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.016 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Completeness,0.593,0.022 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.593,0.022 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Completeness,0.463,0.055 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.463,0.055 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.646,0.028 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.646,0.028 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.492,0.029 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.492,0.029 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.581,0.021 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.581,0.021 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.549,0.028 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.028 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.606,0.029 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.606,0.029 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Completeness,0.643,0.017 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.643,0.017 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.497,0.016 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.497,0.016 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.504,0.021 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.504,0.021 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.476,0.014 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.476,0.014 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Completeness,0.61,0.014 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.61,0.014 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Completeness,0.537,0.021 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.021 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.654,0.032 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.654,0.032 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.512,0.014 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.512,0.014 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.581,0.021 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.581,0.021 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.659,0.028 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.659,0.028 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.618,0.023 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.618,0.023 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.587,0.012 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.587,0.012 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.591,0.011 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.591,0.011 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.577,0.029 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.577,0.029 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Completeness,0.534,0.025 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.534,0.025 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Completeness,0.358,0.044 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.358,0.044 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.537,0.05 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.05 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.435,0.032 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.435,0.032 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.447,0.016 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.447,0.016 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.411,0.029 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.411,0.029 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.52,0.021 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.021 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.264,0.021 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.264,0.021 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.28,0.014 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.28,0.014 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.268,0.028 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.268,0.028 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Completeness,0.5,0.034 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.5,0.034 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Completeness,0.498,0.024 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.498,0.024 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Completeness,0.614,0.019 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.614,0.019 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Completeness,0.583,0.013 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.583,0.013 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Completeness,0.633,0.026 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.633,0.026 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Completeness,0.558,0.033 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.558,0.033 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Completeness,0.587,0.016 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.587,0.016 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Completeness,0.544,0.014 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.544,0.014 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Completeness,0.451,0.023 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.451,0.023 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Completeness,0.573,0.01 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.573,0.01 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Completeness,0.636,0.013 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.636,0.013 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Completeness,0.561,0.037 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.561,0.037 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.598,0.036 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.598,0.036 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.543,0.026 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.543,0.026 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Completeness,0.539,0.024 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.539,0.024 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Completeness,0.568,0.022 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.568,0.022 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.535,0.02 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.535,0.02 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Completeness,0.573,0.024 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.573,0.024 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Completeness,0.59,0.022 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.59,0.022 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Completeness,0.565,0.02 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.565,0.02 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Completeness,0.566,0.029 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.566,0.029 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Completeness,0.6,0.026 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.6,0.026 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Completeness,0.553,0.02 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.553,0.02 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.522,0.028 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.522,0.028 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Completeness,0.593,0.016 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.593,0.016 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.495,0.01 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.495,0.01 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Completeness,0.618,0.022 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.618,0.022 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Completeness,0.599,0.026 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.599,0.026 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Completeness,0.618,0.023 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.618,0.023 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Completeness,0.66,0.023 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.66,0.023 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Completeness,0.628,0.05 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.628,0.05 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Completeness,0.534,0.027 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.534,0.027 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Completeness,0.615,0.031 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.615,0.031 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.593,0.014 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.593,0.014 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.598,0.017 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.598,0.017 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Completeness,0.51,0.04 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.51,0.04 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Completeness,0.537,0.031 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.537,0.031 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Completeness,0.626,0.065 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.626,0.065 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.593,0.012 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.593,0.012 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.595,0.02 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.595,0.02 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Completeness,0.543,0.011 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,Completeness,0.543,0.011 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Completeness,0.512,0.019 +Claude Haiku 4.5,Solo Models,Advisor,NA,Completeness,0.512,0.019 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Completeness,0.51,0.016 +Claude Sonnet 4.5,Solo Models,Advisor,NA,Completeness,0.51,0.016 +DeepSeek R1,Solo Models,Advisor,DeepSeek,Completeness,0.573,0.015 +DeepSeek R1,Solo Models,Advisor,NA,Completeness,0.573,0.015 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Completeness,0.628,0.023 +DeepSeek V3.1,Solo Models,Advisor,NA,Completeness,0.628,0.023 +Expert AI,Solo Models,Advisor,NA,Completeness,0.656,0.009 +GPT-4.1,Solo Models,Advisor,OpenAI,Completeness,0.618,0.01 +GPT-4.1,Solo Models,Advisor,NA,Completeness,0.618,0.01 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,Completeness,0.458,0.013 +GPT-4.1 mini,Solo Models,Advisor,NA,Completeness,0.458,0.013 +GPT-4o,Solo Models,Advisor,OpenAI,Completeness,0.565,0.041 +GPT-4o,Solo Models,Advisor,NA,Completeness,0.565,0.041 +GPT-4o mini,Solo Models,Advisor,OpenAI,Completeness,0.332,0.016 +GPT-4o mini,Solo Models,Advisor,NA,Completeness,0.332,0.016 +GPT-5,Solo Models,Advisor,OpenAI,Completeness,0.502,0.017 +GPT-5,Solo Models,Advisor,NA,Completeness,0.502,0.017 +GPT-5 mini,Solo Models,Advisor,OpenAI,Completeness,0.505,0.016 +GPT-5 mini,Solo Models,Advisor,NA,Completeness,0.505,0.016 +GPT-5 nano,Solo Models,Advisor,OpenAI,Completeness,0.438,0.019 +GPT-5 nano,Solo Models,Advisor,NA,Completeness,0.438,0.019 +Gemini 2.0 Flash,Solo Models,Advisor,Google,Completeness,0.654,0.01 +Gemini 2.0 Flash,Solo Models,Advisor,NA,Completeness,0.654,0.01 +Gemini 2.5 Flash,Solo Models,Advisor,Google,Completeness,0.632,0.012 +Gemini 2.5 Flash,Solo Models,Advisor,NA,Completeness,0.632,0.012 +Gemini 2.5 Pro,Solo Models,Advisor,Google,Completeness,0.583,0.027 +Gemini 2.5 Pro,Solo Models,Advisor,NA,Completeness,0.583,0.027 +Gemini 3 Pro,Solo Models,Advisor,Google,Completeness,0.435,0.026 +Gemini 3 Pro,Solo Models,Advisor,NA,Completeness,0.435,0.026 +Glass Health 4.0,Solo Models,Advisor,Glass Health,Completeness,0.52,0.027 +Glass Health 4.0,Solo Models,Advisor,NA,Completeness,0.52,0.027 +Grok 4,Solo Models,Advisor,xAI,Completeness,0.573,0.032 +Grok 4,Solo Models,Advisor,NA,Completeness,0.573,0.032 +Grok 4 Fast,Solo Models,Advisor,xAI,Completeness,0.554,0.033 +Grok 4 Fast,Solo Models,Advisor,NA,Completeness,0.554,0.033 +Kimi K2,Solo Models,Advisor,Moonshot AI,Completeness,0.637,0.025 +Kimi K2,Solo Models,Advisor,NA,Completeness,0.637,0.025 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Completeness,0.671,0.012 +LiSA 1.0,Solo Models,Advisor,NA,Completeness,0.671,0.012 +Llama 3.3 70b,Solo Models,Advisor,Meta,Completeness,0.56,0.02 +Llama 3.3 70b,Solo Models,Advisor,NA,Completeness,0.56,0.02 +Llama 4 Maverick,Solo Models,Advisor,Meta,Completeness,0.516,0.011 +Llama 4 Maverick,Solo Models,Advisor,NA,Completeness,0.516,0.011 +Llama 4 Scout,Solo Models,Advisor,Meta,Completeness,0.579,0.007 +Llama 4 Scout,Solo Models,Advisor,NA,Completeness,0.579,0.007 +MedGemma 27B,Solo Models,Advisor,Google,Completeness,0.515,0.021 +MedGemma 27B,Solo Models,Advisor,NA,Completeness,0.515,0.021 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Completeness,0.498,0.036 +Mistral Large 2.1,Solo Models,Advisor,NA,Completeness,0.498,0.036 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Completeness,0.474,0.029 +Mistral Medium 3.1,Solo Models,Advisor,NA,Completeness,0.474,0.029 +Qwen3 235B,Solo Models,Advisor,Alibaba,Completeness,0.534,0.041 +Qwen3 235B,Solo Models,Advisor,NA,Completeness,0.534,0.041 +Qwen3 32B,Solo Models,Advisor,Alibaba,Completeness,0.483,0.018 +Qwen3 32B,Solo Models,Advisor,NA,Completeness,0.483,0.018 +o1,Solo Models,Advisor,OpenAI,Completeness,0.441,0.023 +o1,Solo Models,Advisor,NA,Completeness,0.441,0.023 +o1 mini,Solo Models,Advisor,OpenAI,Completeness,0.487,0.028 +o1 mini,Solo Models,Advisor,NA,Completeness,0.487,0.028 +o3 mini,Solo Models,Advisor,OpenAI,Completeness,0.282,0.013 +o3 mini,Solo Models,Advisor,NA,Completeness,0.282,0.013 +o4 mini,Solo Models,Advisor,OpenAI,Completeness,0.355,0.018 +o4 mini,Solo Models,Advisor,NA,Completeness,0.355,0.018 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Escalation,0.766,0.01 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.766,0.01 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Escalation,0.723,0.04 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.723,0.04 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.756,0.01 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.756,0.01 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.692,0.043 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.692,0.043 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.627,0.017 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.627,0.017 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Escalation,0.662,0.026 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.662,0.026 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Escalation,0.754,0.037 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.754,0.037 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Escalation,0.647,0.01 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.647,0.01 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Escalation,0.652,0.02 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.652,0.02 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.647,0.01 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.647,0.01 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Escalation,0.688,0.031 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.688,0.031 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Escalation,0.684,0.018 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.684,0.018 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Escalation,0.782,0.022 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.782,0.022 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Escalation,0.743,0.017 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.743,0.017 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Escalation,0.657,0.017 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.657,0.017 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Escalation,0.752,0.018 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.752,0.018 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Escalation,0.783,0.012 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.783,0.012 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Escalation,0.774,0.018 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.774,0.018 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Escalation,0.782,0.018 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.782,0.018 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Escalation,0.75,0.015 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.75,0.015 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Escalation,0.734,0.012 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.734,0.012 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Escalation,0.746,0.017 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.746,0.017 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.821,0.029 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.821,0.029 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.667,0.035 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.667,0.035 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.652,0.01 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.652,0.01 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.736,0.02 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.736,0.02 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.776,0.045 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.045 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.716,0.017 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.716,0.017 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.716,0.017 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.716,0.017 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.701,0.017 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.701,0.017 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Escalation,0.657,0.034 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.657,0.034 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.771,0.054 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.771,0.054 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.655,0.019 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.655,0.019 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.597,0.034 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.597,0.034 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.637,0.039 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.637,0.039 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.692,0.01 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.692,0.01 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.667,0.054 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.667,0.054 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Escalation,0.669,0.017 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.669,0.017 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.632,0.02 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.632,0.02 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.647,0.035 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.647,0.035 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.622,0.026 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.622,0.026 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.749,0.019 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.749,0.019 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.607,0.01 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.607,0.01 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.542,0.026 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.542,0.026 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.597,0.074 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.597,0.074 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.658,0.02 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.658,0.02 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.527,0.026 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.527,0.026 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.512,0.026 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.512,0.026 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Escalation,0.718,0.018 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.718,0.018 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Escalation,0.713,0.033 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.713,0.033 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.781,0.059 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.781,0.059 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.682,0.035 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.682,0.035 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.657,0.051 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.657,0.051 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.677,0.035 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.677,0.035 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.739,0.055 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.739,0.055 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.701,0.028 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.701,0.028 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.692,0.043 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.692,0.043 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.654,0.036 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.654,0.036 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Escalation,0.821,0.045 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.821,0.045 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.801,0.01 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.801,0.01 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.839,0.02 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.839,0.02 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.775,0.03 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.775,0.03 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Escalation,0.851,0.014 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.851,0.014 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.829,0.019 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.829,0.019 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Escalation,0.724,0.007 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.724,0.007 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.866,0.033 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.866,0.033 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.751,0.01 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.751,0.01 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.712,0.024 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.712,0.024 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.782,0.025 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.782,0.025 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.729,0.023 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.729,0.023 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Escalation,0.776,0.022 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.022 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.738,0.025 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.738,0.025 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.738,0.025 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.738,0.025 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.708,0.024 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.708,0.024 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Escalation,0.768,0.008 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.768,0.008 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Escalation,0.74,0.025 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.74,0.025 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.811,0.039 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.811,0.039 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.697,0.039 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.697,0.039 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.662,0.054 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.662,0.054 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.766,0.026 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.766,0.026 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.764,0.067 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.764,0.067 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Escalation,0.787,0.014 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.787,0.014 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.769,0.017 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.769,0.017 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.776,0.017 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.017 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.743,0.033 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.743,0.033 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Escalation,0.796,0.026 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.796,0.026 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Escalation,0.732,0.025 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.732,0.025 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.851,0.045 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.851,0.045 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.711,0.043 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.711,0.043 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.607,0.01 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.607,0.01 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.776,0.017 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.017 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.821,0.011 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.821,0.011 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.778,0.015 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.778,0.015 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.754,0.02 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.754,0.02 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.721,0.026 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.721,0.026 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Escalation,0.649,0.019 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.649,0.019 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Escalation,0.554,0.067 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.554,0.067 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.584,0.106 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.584,0.106 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.552,0.017 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.552,0.017 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.495,0.038 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.495,0.038 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.55,0.034 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.55,0.034 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.676,0.033 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.676,0.033 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.34,0.02 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.34,0.02 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.374,0.025 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.374,0.025 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.34,0.027 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.34,0.027 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Escalation,0.727,0.014 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.727,0.014 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Escalation,0.764,0.027 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.764,0.027 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Escalation,0.749,0.014 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.749,0.014 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Escalation,0.754,0.019 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.754,0.019 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Escalation,0.777,0.017 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.777,0.017 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Escalation,0.754,0.019 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.754,0.019 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Escalation,0.765,0.038 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.765,0.038 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Escalation,0.759,0.038 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.759,0.038 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Escalation,0.669,0.021 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.669,0.021 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Escalation,0.722,0.038 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.722,0.038 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Escalation,0.73,0.019 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.73,0.019 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Escalation,0.708,0.038 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.708,0.038 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.839,0.026 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.839,0.026 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.8,0.044 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.8,0.044 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Escalation,0.774,0.029 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.774,0.029 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Escalation,0.771,0.033 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.771,0.033 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.767,0.033 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.767,0.033 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Escalation,0.811,0.035 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.811,0.035 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Escalation,0.785,0.007 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.785,0.007 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Escalation,0.783,0.024 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.783,0.024 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Escalation,0.713,0.02 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.713,0.02 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Escalation,0.727,0.026 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.727,0.026 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Escalation,0.727,0.033 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.727,0.033 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.725,0.036 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.725,0.036 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Escalation,0.721,0.043 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.721,0.043 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.77,0.022 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.77,0.022 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Escalation,0.824,0.008 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.824,0.008 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Escalation,0.753,0.019 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.753,0.019 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Escalation,0.823,0.014 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.823,0.014 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Escalation,0.83,0.013 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.83,0.013 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Escalation,0.833,0.015 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.833,0.015 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Escalation,0.725,0.033 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.725,0.033 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Escalation,0.82,0.024 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.82,0.024 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.779,0.023 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.779,0.023 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.773,0.025 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.773,0.025 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Escalation,0.794,0.025 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.794,0.025 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Escalation,0.749,0.03 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.749,0.03 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Escalation,0.838,0.044 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.838,0.044 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.758,0.027 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.758,0.027 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.765,0.03 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.765,0.03 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Escalation,0.642,0.012 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,Escalation,0.642,0.012 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Escalation,0.687,0.014 +Claude Haiku 4.5,Solo Models,Advisor,NA,Escalation,0.687,0.014 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Escalation,0.64,0.012 +Claude Sonnet 4.5,Solo Models,Advisor,NA,Escalation,0.64,0.012 +DeepSeek R1,Solo Models,Advisor,DeepSeek,Escalation,0.756,0.011 +DeepSeek R1,Solo Models,Advisor,NA,Escalation,0.756,0.011 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Escalation,0.765,0.02 +DeepSeek V3.1,Solo Models,Advisor,NA,Escalation,0.765,0.02 +Expert AI,Solo Models,Advisor,NA,Escalation,0.748,0.009 +GPT-4.1,Solo Models,Advisor,OpenAI,Escalation,0.724,0.013 +GPT-4.1,Solo Models,Advisor,NA,Escalation,0.724,0.013 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,Escalation,0.527,0.011 +GPT-4.1 mini,Solo Models,Advisor,NA,Escalation,0.527,0.011 +GPT-4o,Solo Models,Advisor,OpenAI,Escalation,0.81,0.018 +GPT-4o,Solo Models,Advisor,NA,Escalation,0.81,0.018 +GPT-4o mini,Solo Models,Advisor,OpenAI,Escalation,0.722,0.015 +GPT-4o mini,Solo Models,Advisor,NA,Escalation,0.722,0.015 +GPT-5,Solo Models,Advisor,OpenAI,Escalation,0.678,0.016 +GPT-5,Solo Models,Advisor,NA,Escalation,0.678,0.016 +GPT-5 mini,Solo Models,Advisor,OpenAI,Escalation,0.54,0.022 +GPT-5 mini,Solo Models,Advisor,NA,Escalation,0.54,0.022 +GPT-5 nano,Solo Models,Advisor,OpenAI,Escalation,0.568,0.023 +GPT-5 nano,Solo Models,Advisor,NA,Escalation,0.568,0.023 +Gemini 2.0 Flash,Solo Models,Advisor,Google,Escalation,0.693,0.02 +Gemini 2.0 Flash,Solo Models,Advisor,NA,Escalation,0.693,0.02 +Gemini 2.5 Flash,Solo Models,Advisor,Google,Escalation,0.823,0.013 +Gemini 2.5 Flash,Solo Models,Advisor,NA,Escalation,0.823,0.013 +Gemini 2.5 Pro,Solo Models,Advisor,Google,Escalation,0.733,0.013 +Gemini 2.5 Pro,Solo Models,Advisor,NA,Escalation,0.733,0.013 +Gemini 3 Pro,Solo Models,Advisor,Google,Escalation,0.6,0.024 +Gemini 3 Pro,Solo Models,Advisor,NA,Escalation,0.6,0.024 +Glass Health 4.0,Solo Models,Advisor,Glass Health,Escalation,0.711,0.025 +Glass Health 4.0,Solo Models,Advisor,NA,Escalation,0.711,0.025 +Grok 4,Solo Models,Advisor,xAI,Escalation,0.755,0.014 +Grok 4,Solo Models,Advisor,NA,Escalation,0.755,0.014 +Grok 4 Fast,Solo Models,Advisor,xAI,Escalation,0.751,0.019 +Grok 4 Fast,Solo Models,Advisor,NA,Escalation,0.751,0.019 +Kimi K2,Solo Models,Advisor,Moonshot AI,Escalation,0.82,0.019 +Kimi K2,Solo Models,Advisor,NA,Escalation,0.82,0.019 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Escalation,0.731,0.016 +LiSA 1.0,Solo Models,Advisor,NA,Escalation,0.731,0.016 +Llama 3.3 70b,Solo Models,Advisor,Meta,Escalation,0.76,0.028 +Llama 3.3 70b,Solo Models,Advisor,NA,Escalation,0.76,0.028 +Llama 4 Maverick,Solo Models,Advisor,Meta,Escalation,0.758,0.013 +Llama 4 Maverick,Solo Models,Advisor,NA,Escalation,0.758,0.013 +Llama 4 Scout,Solo Models,Advisor,Meta,Escalation,0.76,0.008 +Llama 4 Scout,Solo Models,Advisor,NA,Escalation,0.76,0.008 +MedGemma 27B,Solo Models,Advisor,Google,Escalation,0.713,0.017 +MedGemma 27B,Solo Models,Advisor,NA,Escalation,0.713,0.017 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Escalation,0.689,0.034 +Mistral Large 2.1,Solo Models,Advisor,NA,Escalation,0.689,0.034 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Escalation,0.617,0.029 +Mistral Medium 3.1,Solo Models,Advisor,NA,Escalation,0.617,0.029 +Qwen3 235B,Solo Models,Advisor,Alibaba,Escalation,0.772,0.044 +Qwen3 235B,Solo Models,Advisor,NA,Escalation,0.772,0.044 +Qwen3 32B,Solo Models,Advisor,Alibaba,Escalation,0.714,0.023 +Qwen3 32B,Solo Models,Advisor,NA,Escalation,0.714,0.023 +o1,Solo Models,Advisor,OpenAI,Escalation,0.6,0.014 +o1,Solo Models,Advisor,NA,Escalation,0.6,0.014 +o1 mini,Solo Models,Advisor,OpenAI,Escalation,0.76,0.013 +o1 mini,Solo Models,Advisor,NA,Escalation,0.76,0.013 +o3 mini,Solo Models,Advisor,OpenAI,Escalation,0.384,0.018 +o3 mini,Solo Models,Advisor,NA,Escalation,0.384,0.018 +o4 mini,Solo Models,Advisor,OpenAI,Escalation,0.35,0.022 +o4 mini,Solo Models,Advisor,NA,Escalation,0.35,0.022 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,F1,0.619,0.002 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.619,0.002 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,F1,0.633,0.005 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.633,0.005 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.456,0.016 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.456,0.016 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.664,0.007 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.664,0.007 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.645,0.009 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.645,0.009 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,F1,0.608,0.006 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.608,0.006 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,F1,0.636,0.004 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.636,0.004 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,F1,0.611,0.008 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.611,0.008 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,F1,0.604,0.005 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.604,0.005 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.622,0.009 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.622,0.009 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,F1,0.639,0.02 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.639,0.02 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,F1,0.656,0.004 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.656,0.004 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,F1,0.626,0.004 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.626,0.004 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,F1,0.617,0.005 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.617,0.005 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,F1,0.628,0.004 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.628,0.004 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,F1,0.592,0.005 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.592,0.005 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,F1,0.612,0.004 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.612,0.004 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,F1,0.612,0.008 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.612,0.008 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,F1,0.637,0.003 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.637,0.003 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,F1,0.599,0.006 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.599,0.006 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,F1,0.608,0.004 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.608,0.004 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,F1,0.604,0.007 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.604,0.007 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.407,0.018 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.407,0.018 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.654,0.006 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.654,0.006 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.625,0.004 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.625,0.004 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.565,0.005 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.565,0.005 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.607,0.011 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.607,0.011 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.56,0.009 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.56,0.009 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.555,0.009 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.555,0.009 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.577,0.01 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.577,0.01 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,F1,0.673,0.004 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.673,0.004 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.429,0.012 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.429,0.012 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.671,0.007 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.671,0.007 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.669,0.008 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.669,0.008 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.669,0.003 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.669,0.003 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.675,0.01 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.675,0.01 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.653,0.018 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.653,0.018 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,F1,0.675,0.004 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.675,0.004 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.676,0.006 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.676,0.006 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.638,0.01 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.638,0.01 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.674,0.007 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.674,0.007 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.415,0.034 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.415,0.034 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.669,0.01 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.669,0.01 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.652,0.003 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.652,0.003 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.652,0.006 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.652,0.006 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.641,0.002 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.641,0.002 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.639,0.007 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.639,0.007 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.648,0.012 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.648,0.012 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,F1,0.598,0.002 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.598,0.002 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,F1,0.603,0.007 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.603,0.007 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.471,0.007 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.471,0.007 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.662,0.009 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.662,0.009 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.617,0.007 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.617,0.007 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.533,0.001 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.533,0.001 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.596,0.01 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.596,0.01 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.536,0.003 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.536,0.003 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.525,0.007 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.525,0.007 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.568,0.006 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.568,0.006 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,F1,0.61,0.013 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.61,0.013 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.61,0.006 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.61,0.006 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.589,0.002 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.589,0.002 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.581,0.013 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.581,0.013 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,F1,0.625,0.003 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.625,0.003 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.582,0.003 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.582,0.003 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,F1,0.643,0.005 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.643,0.005 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.402,0.02 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.402,0.02 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.674,0.005 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.674,0.005 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.65,0.004 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.65,0.004 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.639,0.006 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.639,0.006 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.603,0.015 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.603,0.015 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,F1,0.659,0.004 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.659,0.004 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.637,0.007 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.637,0.007 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.634,0.011 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.634,0.011 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.635,0.01 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.635,0.01 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,F1,0.61,0.006 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.61,0.006 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,F1,0.611,0.008 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.611,0.008 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.49,0.026 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.49,0.026 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.668,0.011 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.668,0.011 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.634,0.005 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.634,0.005 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.543,0.002 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.543,0.002 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.597,0.008 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.597,0.008 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,F1,0.62,0.002 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.62,0.002 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.544,0.003 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.544,0.003 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.536,0.007 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.536,0.007 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.582,0.009 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.582,0.009 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,F1,0.6,0.016 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.6,0.016 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,F1,0.604,0.004 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.604,0.004 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.492,0.006 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.492,0.006 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.672,0.009 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.672,0.009 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.62,0.004 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.62,0.004 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.516,0.001 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.516,0.001 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.601,0.011 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.601,0.011 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.491,0.003 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.491,0.003 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.482,0.004 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.482,0.004 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.56,0.007 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.56,0.007 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,F1,0.635,0.01 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.635,0.01 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,F1,0.632,0.017 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.632,0.017 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.444,0.03 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.444,0.03 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.67,0.004 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.67,0.004 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.651,0.006 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.651,0.006 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.614,0.014 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.614,0.014 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.645,0.016 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.645,0.016 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.573,0.019 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.573,0.019 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.558,0.013 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.558,0.013 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.558,0.026 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.558,0.026 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,F1,0.615,0.005 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.615,0.005 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,F1,0.653,0.006 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.653,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,F1,0.602,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.602,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,F1,0.628,0.005 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.628,0.005 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,F1,0.629,0.004 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.629,0.004 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,F1,0.662,0.005 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.662,0.005 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,F1,0.621,0.004 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.621,0.004 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,F1,0.661,0.006 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.661,0.006 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,F1,0.675,0.007 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.675,0.007 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,F1,0.606,0.013 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.606,0.013 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,F1,0.622,0.003 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.622,0.003 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,F1,0.654,0.006 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.654,0.006 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.588,0.003 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.588,0.003 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.607,0.007 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.607,0.007 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,F1,0.612,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,F1,0.652,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.652,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.613,0.009 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.613,0.009 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,F1,0.612,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,F1,0.662,0.003 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.662,0.003 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,F1,0.612,0.009 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.009 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,F1,0.661,0.006 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.661,0.006 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,F1,0.656,0.004 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.656,0.004 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,F1,0.662,0.009 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.662,0.009 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.624,0.012 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.624,0.012 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,F1,0.666,0.009 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.666,0.009 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.546,0.004 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.546,0.004 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,F1,0.612,0.009 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.009 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,F1,0.664,0.005 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.664,0.005 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,F1,0.616,0.007 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.616,0.007 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,F1,0.64,0.006 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.64,0.006 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,F1,0.612,0.012 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.012 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,F1,0.663,0.007 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.663,0.007 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,F1,0.623,0.005 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.623,0.005 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.49,0.005 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.49,0.005 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.49,0.005 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.49,0.005 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,F1,0.584,0.006 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.584,0.006 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,F1,0.665,0.008 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.665,0.008 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,F1,0.614,0.009 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.614,0.009 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.484,0.005 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.484,0.005 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.482,0.008 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.482,0.008 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,F1,0.614,0.004 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,F1,0.614,0.004 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,F1,0.562,0.004 +Claude Haiku 4.5,Solo Models,Advisor,NA,F1,0.562,0.004 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,F1,0.642,0.003 +Claude Sonnet 4.5,Solo Models,Advisor,NA,F1,0.642,0.003 +DeepSeek R1,Solo Models,Advisor,DeepSeek,F1,0.601,0.005 +DeepSeek R1,Solo Models,Advisor,NA,F1,0.601,0.005 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,F1,0.564,0.004 +DeepSeek V3.1,Solo Models,Advisor,NA,F1,0.564,0.004 +Expert AI,Solo Models,Advisor,NA,F1,0.571,0.004 +GPT-4.1,Solo Models,Advisor,OpenAI,F1,0.564,0.003 +GPT-4.1,Solo Models,Advisor,NA,F1,0.564,0.003 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,F1,0.54,0.003 +GPT-4.1 mini,Solo Models,Advisor,NA,F1,0.54,0.003 +GPT-4o,Solo Models,Advisor,OpenAI,F1,0.544,0.004 +GPT-4o,Solo Models,Advisor,NA,F1,0.544,0.004 +GPT-4o mini,Solo Models,Advisor,OpenAI,F1,0.506,0.007 +GPT-4o mini,Solo Models,Advisor,NA,F1,0.506,0.007 +GPT-5,Solo Models,Advisor,OpenAI,F1,0.669,0.004 +GPT-5,Solo Models,Advisor,NA,F1,0.669,0.004 +GPT-5 mini,Solo Models,Advisor,OpenAI,F1,0.645,0.004 +GPT-5 mini,Solo Models,Advisor,NA,F1,0.645,0.004 +GPT-5 nano,Solo Models,Advisor,OpenAI,F1,0.56,0.007 +GPT-5 nano,Solo Models,Advisor,NA,F1,0.56,0.007 +Gemini 2.0 Flash,Solo Models,Advisor,Google,F1,0.531,0.003 +Gemini 2.0 Flash,Solo Models,Advisor,NA,F1,0.531,0.003 +Gemini 2.5 Flash,Solo Models,Advisor,Google,F1,0.566,0.008 +Gemini 2.5 Flash,Solo Models,Advisor,NA,F1,0.566,0.008 +Gemini 2.5 Pro,Solo Models,Advisor,Google,F1,0.627,0.005 +Gemini 2.5 Pro,Solo Models,Advisor,NA,F1,0.627,0.005 +Gemini 3 Pro,Solo Models,Advisor,Google,F1,0.649,0.003 +Gemini 3 Pro,Solo Models,Advisor,NA,F1,0.649,0.003 +Glass Health 4.0,Solo Models,Advisor,Glass Health,F1,0.662,0.004 +Glass Health 4.0,Solo Models,Advisor,NA,F1,0.662,0.004 +Grok 4,Solo Models,Advisor,xAI,F1,0.612,0.005 +Grok 4,Solo Models,Advisor,NA,F1,0.612,0.005 +Grok 4 Fast,Solo Models,Advisor,xAI,F1,0.596,0.005 +Grok 4 Fast,Solo Models,Advisor,NA,F1,0.596,0.005 +Kimi K2,Solo Models,Advisor,Moonshot AI,F1,0.545,0.006 +Kimi K2,Solo Models,Advisor,NA,F1,0.545,0.006 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,F1,0.623,0.004 +LiSA 1.0,Solo Models,Advisor,NA,F1,0.623,0.004 +Llama 3.3 70b,Solo Models,Advisor,Meta,F1,0.5,0.005 +Llama 3.3 70b,Solo Models,Advisor,NA,F1,0.5,0.005 +Llama 4 Maverick,Solo Models,Advisor,Meta,F1,0.544,0.003 +Llama 4 Maverick,Solo Models,Advisor,NA,F1,0.544,0.003 +Llama 4 Scout,Solo Models,Advisor,Meta,F1,0.479,0.002 +Llama 4 Scout,Solo Models,Advisor,NA,F1,0.479,0.002 +MedGemma 27B,Solo Models,Advisor,Google,F1,0.541,0.005 +MedGemma 27B,Solo Models,Advisor,NA,F1,0.541,0.005 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,F1,0.578,0.005 +Mistral Large 2.1,Solo Models,Advisor,NA,F1,0.578,0.005 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,F1,0.551,0.003 +Mistral Medium 3.1,Solo Models,Advisor,NA,F1,0.551,0.003 +Qwen3 235B,Solo Models,Advisor,Alibaba,F1,0.53,0.006 +Qwen3 235B,Solo Models,Advisor,NA,F1,0.53,0.006 +Qwen3 32B,Solo Models,Advisor,Alibaba,F1,0.503,0.006 +Qwen3 32B,Solo Models,Advisor,NA,F1,0.503,0.006 +o1,Solo Models,Advisor,OpenAI,F1,0.625,0.004 +o1,Solo Models,Advisor,NA,F1,0.625,0.004 +o1 mini,Solo Models,Advisor,OpenAI,F1,0.485,0.006 +o1 mini,Solo Models,Advisor,NA,F1,0.485,0.006 +o3 mini,Solo Models,Advisor,OpenAI,F1,0.571,0.006 +o3 mini,Solo Models,Advisor,NA,F1,0.571,0.006 +o4 mini,Solo Models,Advisor,OpenAI,F1,0.584,0.005 +o4 mini,Solo Models,Advisor,NA,F1,0.584,0.005 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,OverallScore,0.59,0.004 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.004 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,OverallScore,0.571,0.025 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.571,0.025 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.458,0.016 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.458,0.016 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.579,0.011 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.579,0.011 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.591,0.018 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.591,0.018 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,OverallScore,0.588,0.021 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.588,0.021 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,OverallScore,0.582,0.004 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.582,0.004 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,OverallScore,0.576,0.015 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.576,0.015 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,OverallScore,0.573,0.013 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.573,0.013 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.568,0.016 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.568,0.016 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,OverallScore,0.586,0.007 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.586,0.007 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,OverallScore,0.61,0.009 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.61,0.009 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,OverallScore,0.603,0.014 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.603,0.014 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,OverallScore,0.567,0.013 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.567,0.013 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,OverallScore,0.602,0.014 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.602,0.014 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,OverallScore,0.591,0.01 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.591,0.01 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,OverallScore,0.597,0.008 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.008 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,OverallScore,0.589,0.008 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.589,0.008 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,OverallScore,0.628,0.009 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.628,0.009 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,OverallScore,0.579,0.01 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.579,0.01 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,OverallScore,0.577,0.011 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.577,0.011 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,OverallScore,0.586,0.021 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.586,0.021 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.43,0.012 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.43,0.012 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.584,0.022 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.584,0.022 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.594,0.004 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.594,0.004 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.576,0.005 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.576,0.005 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.599,0.013 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.599,0.013 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.567,0.007 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.567,0.007 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.565,0.002 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.565,0.002 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.572,0.006 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.572,0.006 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,OverallScore,0.551,0.013 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.551,0.013 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.445,0.012 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.445,0.012 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.57,0.012 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.57,0.012 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.581,0.003 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.581,0.003 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.593,0.012 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.012 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.596,0.012 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.596,0.012 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.592,0.007 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.592,0.007 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,OverallScore,0.589,0.008 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.589,0.008 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.574,0.028 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.574,0.028 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.575,0.024 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.024 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.563,0.026 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.563,0.026 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.431,0.044 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.431,0.044 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.574,0.008 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.574,0.008 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.578,0.025 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.578,0.025 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.583,0.026 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.583,0.026 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.604,0.015 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.604,0.015 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.545,0.018 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.545,0.018 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.543,0.021 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.543,0.021 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,OverallScore,0.602,0.007 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.602,0.007 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,OverallScore,0.575,0.014 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.014 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.496,0.018 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.496,0.018 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.603,0.012 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.603,0.012 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.597,0.019 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.019 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.56,0.009 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.56,0.009 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.569,0.016 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.569,0.016 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.562,0.004 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.562,0.004 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.556,0.006 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.556,0.006 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.576,0.012 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.576,0.012 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,OverallScore,0.575,0.024 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.024 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.606,0.016 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.606,0.016 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.592,0.005 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.592,0.005 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.57,0.009 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.57,0.009 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,OverallScore,0.63,0.009 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.63,0.009 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.592,0.008 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.592,0.008 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,OverallScore,0.575,0.003 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.003 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.439,0.024 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.439,0.024 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.614,0.015 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.614,0.015 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.616,0.012 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.616,0.012 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.614,0.014 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.614,0.014 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.579,0.014 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.579,0.014 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,OverallScore,0.634,0.012 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.634,0.012 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.591,0.006 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.591,0.006 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.59,0.003 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.003 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.583,0.008 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.583,0.008 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,OverallScore,0.597,0.01 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.01 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,OverallScore,0.547,0.032 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.547,0.032 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.497,0.018 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.497,0.018 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.575,0.019 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.019 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.597,0.015 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.015 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.54,0.013 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.54,0.013 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.597,0.011 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.011 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,OverallScore,0.621,0.008 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.621,0.008 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.529,0.006 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.529,0.006 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.522,0.017 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.522,0.017 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.538,0.014 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.538,0.014 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,OverallScore,0.593,0.002 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.002 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,OverallScore,0.574,0.011 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.574,0.011 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.507,0.013 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.507,0.013 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.593,0.007 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.007 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.59,0.003 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.003 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.555,0.009 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.555,0.009 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.608,0.012 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.608,0.012 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.513,0.004 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.513,0.004 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.503,0.004 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.503,0.004 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.568,0.014 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.568,0.014 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,OverallScore,0.593,0.015 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.015 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,OverallScore,0.504,0.038 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.504,0.038 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.447,0.033 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.447,0.033 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.555,0.024 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.555,0.024 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.543,0.008 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.543,0.008 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.518,0.023 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.518,0.023 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.59,0.014 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.014 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.411,0.016 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.411,0.016 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.421,0.011 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.421,0.011 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.411,0.019 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.411,0.019 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,OverallScore,0.562,0.016 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.562,0.016 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,OverallScore,0.578,0.011 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.578,0.011 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,OverallScore,0.599,0.011 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.599,0.011 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,OverallScore,0.603,0.009 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.603,0.009 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,OverallScore,0.619,0.012 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.619,0.012 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,OverallScore,0.608,0.016 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.608,0.016 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,OverallScore,0.598,0.011 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.598,0.011 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,OverallScore,0.608,0.004 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.608,0.004 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,OverallScore,0.564,0.013 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.564,0.013 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,OverallScore,0.586,0.009 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.586,0.009 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,OverallScore,0.616,0.006 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.616,0.006 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,OverallScore,0.611,0.024 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.611,0.024 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.586,0.013 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.586,0.013 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.575,0.015 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.575,0.015 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,OverallScore,0.581,0.013 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.581,0.013 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,OverallScore,0.609,0.009 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.609,0.009 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.579,0.013 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.579,0.013 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,OverallScore,0.591,0.018 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.591,0.018 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,OverallScore,0.622,0.012 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.622,0.012 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,OverallScore,0.591,0.008 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.591,0.008 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,OverallScore,0.615,0.016 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.016 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,OverallScore,0.623,0.013 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.623,0.013 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,OverallScore,0.611,0.012 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.611,0.012 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.581,0.015 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.581,0.015 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,OverallScore,0.627,0.021 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.627,0.021 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.529,0.006 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.529,0.006 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,OverallScore,0.612,0.01 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.612,0.01 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,OverallScore,0.632,0.011 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.632,0.011 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,OverallScore,0.615,0.011 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.011 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,OverallScore,0.643,0.011 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.643,0.011 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,OverallScore,0.613,0.022 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.613,0.022 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,OverallScore,0.594,0.013 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.594,0.013 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,OverallScore,0.615,0.011 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.011 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.515,0.007 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.515,0.007 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.517,0.006 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.517,0.006 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,OverallScore,0.554,0.02 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.554,0.02 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,OverallScore,0.597,0.019 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.597,0.019 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,OverallScore,0.615,0.03 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.03 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.507,0.005 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.507,0.005 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.506,0.008 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.506,0.008 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,OverallScore,0.576,0.007 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,OverallScore,0.576,0.007 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,OverallScore,0.537,0.007 +Claude Haiku 4.5,Solo Models,Advisor,NA,OverallScore,0.537,0.007 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,OverallScore,0.582,0.008 +Claude Sonnet 4.5,Solo Models,Advisor,NA,OverallScore,0.582,0.008 +DeepSeek R1,Solo Models,Advisor,DeepSeek,OverallScore,0.581,0.006 +DeepSeek R1,Solo Models,Advisor,NA,OverallScore,0.581,0.006 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,OverallScore,0.577,0.009 +DeepSeek V3.1,Solo Models,Advisor,NA,OverallScore,0.577,0.009 +Expert AI,Solo Models,Advisor,NA,OverallScore,0.591,0.006 +GPT-4.1,Solo Models,Advisor,OpenAI,OverallScore,0.564,0.004 +GPT-4.1,Solo Models,Advisor,NA,OverallScore,0.564,0.004 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,OverallScore,0.497,0.006 +GPT-4.1 mini,Solo Models,Advisor,NA,OverallScore,0.497,0.006 +GPT-4o,Solo Models,Advisor,OpenAI,OverallScore,0.536,0.015 +GPT-4o,Solo Models,Advisor,NA,OverallScore,0.536,0.015 +GPT-4o mini,Solo Models,Advisor,OpenAI,OverallScore,0.437,0.012 +GPT-4o mini,Solo Models,Advisor,NA,OverallScore,0.437,0.012 +GPT-5,Solo Models,Advisor,OpenAI,OverallScore,0.583,0.009 +GPT-5,Solo Models,Advisor,NA,OverallScore,0.583,0.009 +GPT-5 mini,Solo Models,Advisor,OpenAI,OverallScore,0.57,0.01 +GPT-5 mini,Solo Models,Advisor,NA,OverallScore,0.57,0.01 +GPT-5 nano,Solo Models,Advisor,OpenAI,OverallScore,0.511,0.012 +GPT-5 nano,Solo Models,Advisor,NA,OverallScore,0.511,0.012 +Gemini 2.0 Flash,Solo Models,Advisor,Google,OverallScore,0.556,0.005 +Gemini 2.0 Flash,Solo Models,Advisor,NA,OverallScore,0.556,0.005 +Gemini 2.5 Flash,Solo Models,Advisor,Google,OverallScore,0.582,0.007 +Gemini 2.5 Flash,Solo Models,Advisor,NA,OverallScore,0.582,0.007 +Gemini 2.5 Pro,Solo Models,Advisor,Google,OverallScore,0.599,0.007 +Gemini 2.5 Pro,Solo Models,Advisor,NA,OverallScore,0.599,0.007 +Gemini 3 Pro,Solo Models,Advisor,Google,OverallScore,0.548,0.013 +Gemini 3 Pro,Solo Models,Advisor,NA,OverallScore,0.548,0.013 +Glass Health 4.0,Solo Models,Advisor,Glass Health,OverallScore,0.59,0.012 +Glass Health 4.0,Solo Models,Advisor,NA,OverallScore,0.59,0.012 +Grok 4,Solo Models,Advisor,xAI,OverallScore,0.58,0.012 +Grok 4,Solo Models,Advisor,NA,OverallScore,0.58,0.012 +Grok 4 Fast,Solo Models,Advisor,xAI,OverallScore,0.572,0.013 +Grok 4 Fast,Solo Models,Advisor,NA,OverallScore,0.572,0.013 +Kimi K2,Solo Models,Advisor,Moonshot AI,OverallScore,0.561,0.009 +Kimi K2,Solo Models,Advisor,NA,OverallScore,0.561,0.009 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,OverallScore,0.623,0.005 +LiSA 1.0,Solo Models,Advisor,NA,OverallScore,0.623,0.005 +Llama 3.3 70b,Solo Models,Advisor,Meta,OverallScore,0.511,0.007 +Llama 3.3 70b,Solo Models,Advisor,NA,OverallScore,0.511,0.007 +Llama 4 Maverick,Solo Models,Advisor,Meta,OverallScore,0.535,0.004 +Llama 4 Maverick,Solo Models,Advisor,NA,OverallScore,0.535,0.004 +Llama 4 Scout,Solo Models,Advisor,Meta,OverallScore,0.496,0.003 +Llama 4 Scout,Solo Models,Advisor,NA,OverallScore,0.496,0.003 +MedGemma 27B,Solo Models,Advisor,Google,OverallScore,0.523,0.011 +MedGemma 27B,Solo Models,Advisor,NA,OverallScore,0.523,0.011 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,OverallScore,0.537,0.013 +Mistral Large 2.1,Solo Models,Advisor,NA,OverallScore,0.537,0.013 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,OverallScore,0.502,0.012 +Mistral Medium 3.1,Solo Models,Advisor,NA,OverallScore,0.502,0.012 +Qwen3 235B,Solo Models,Advisor,Alibaba,OverallScore,0.527,0.018 +Qwen3 235B,Solo Models,Advisor,NA,OverallScore,0.527,0.018 +Qwen3 32B,Solo Models,Advisor,Alibaba,OverallScore,0.488,0.01 +Qwen3 32B,Solo Models,Advisor,NA,OverallScore,0.488,0.01 +o1,Solo Models,Advisor,OpenAI,OverallScore,0.532,0.013 +o1,Solo Models,Advisor,NA,OverallScore,0.532,0.013 +o1 mini,Solo Models,Advisor,OpenAI,OverallScore,0.475,0.014 +o1 mini,Solo Models,Advisor,NA,OverallScore,0.475,0.014 +o3 mini,Solo Models,Advisor,OpenAI,OverallScore,0.427,0.01 +o3 mini,Solo Models,Advisor,NA,OverallScore,0.427,0.01 +o4 mini,Solo Models,Advisor,OpenAI,OverallScore,0.479,0.013 +o4 mini,Solo Models,Advisor,NA,OverallScore,0.479,0.013 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Precision,0.505,0.002 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.505,0.002 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Precision,0.542,0.006 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.542,0.006 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.313,0.016 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.313,0.016 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.59,0.004 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.59,0.004 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.54,0.009 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.54,0.009 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Precision,0.486,0.007 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.486,0.007 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Precision,0.523,0.01 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.523,0.01 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Precision,0.495,0.01 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.495,0.01 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Precision,0.486,0.006 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.486,0.006 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.511,0.01 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.511,0.01 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Precision,0.54,0.026 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.54,0.026 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Precision,0.561,0.004 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.561,0.004 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Precision,0.514,0.004 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.514,0.004 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Precision,0.519,0.007 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.519,0.007 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Precision,0.517,0.009 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.517,0.009 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Precision,0.462,0.004 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.462,0.004 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Precision,0.488,0.005 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.488,0.005 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Precision,0.5,0.011 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.5,0.011 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Precision,0.518,0.004 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.518,0.004 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Precision,0.478,0.006 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.478,0.006 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Precision,0.495,0.004 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.495,0.004 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Precision,0.486,0.006 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.486,0.006 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.27,0.015 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.27,0.015 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.576,0.009 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.576,0.009 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.509,0.003 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.509,0.003 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.43,0.004 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.43,0.004 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.484,0.01 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.484,0.01 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.43,0.009 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.43,0.009 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.423,0.009 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.423,0.009 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.455,0.01 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.455,0.01 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Precision,0.634,0.002 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.634,0.002 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.291,0.013 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.291,0.013 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.618,0.011 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.618,0.011 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.601,0.01 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.601,0.01 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.604,0.005 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.604,0.005 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.612,0.016 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.612,0.016 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.567,0.031 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.567,0.031 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Precision,0.615,0.006 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.615,0.006 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.635,0.002 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.635,0.002 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.567,0.011 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.567,0.011 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.636,0.002 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.636,0.002 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.276,0.03 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.276,0.03 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.606,0.013 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.606,0.013 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.57,0.002 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.57,0.002 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.567,0.013 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.567,0.013 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.545,0.012 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.545,0.012 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.564,0.014 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.564,0.014 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.581,0.019 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.581,0.019 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Precision,0.468,0.003 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.468,0.003 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Precision,0.489,0.005 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.489,0.005 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.322,0.004 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.322,0.004 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.575,0.012 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.575,0.012 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.493,0.009 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.493,0.009 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.387,0.002 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.387,0.002 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.472,0.015 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.472,0.015 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.389,0.003 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.389,0.003 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.378,0.005 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.378,0.005 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.431,0.005 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.431,0.005 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Precision,0.501,0.008 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.501,0.008 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.484,0.006 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.484,0.006 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.457,0.002 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.457,0.002 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.456,0.015 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.456,0.015 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Precision,0.497,0.003 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.497,0.003 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.45,0.003 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.45,0.003 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Precision,0.561,0.008 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.561,0.008 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.262,0.019 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.262,0.019 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.596,0.005 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.596,0.005 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.543,0.004 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.543,0.004 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.528,0.005 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.528,0.005 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.5,0.02 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.5,0.02 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Precision,0.556,0.003 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.556,0.003 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.537,0.009 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.537,0.009 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.533,0.016 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.533,0.016 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.536,0.011 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.536,0.011 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Precision,0.491,0.007 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.491,0.007 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Precision,0.523,0.007 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.523,0.007 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.345,0.025 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.345,0.025 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.6,0.009 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.6,0.009 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.526,0.007 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.526,0.007 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.405,0.003 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.405,0.003 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.466,0.004 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.466,0.004 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Precision,0.495,0.003 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.495,0.003 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.417,0.002 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.417,0.002 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.405,0.005 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.405,0.005 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.471,0.01 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.471,0.01 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Precision,0.472,0.017 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.472,0.017 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Precision,0.503,0.004 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.503,0.004 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.346,0.007 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.346,0.007 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.601,0.008 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.601,0.008 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.507,0.007 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.507,0.007 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.371,0.002 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.371,0.002 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.471,0.011 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.471,0.011 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.355,0.003 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.355,0.003 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.347,0.004 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.347,0.004 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.439,0.008 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.439,0.008 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Precision,0.562,0.008 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.562,0.008 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Precision,0.624,0.025 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.624,0.025 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.319,0.028 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.319,0.028 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.646,0.008 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.646,0.008 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.597,0.009 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.597,0.009 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.577,0.014 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.577,0.014 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.554,0.023 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.554,0.023 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.644,0.016 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.644,0.016 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.603,0.03 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.603,0.03 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.615,0.042 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.615,0.042 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Precision,0.518,0.005 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.518,0.005 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Precision,0.573,0.006 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.573,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Precision,0.474,0.007 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.474,0.007 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Precision,0.518,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.518,0.006 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Precision,0.509,0.005 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.509,0.005 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Precision,0.567,0.004 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.567,0.004 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Precision,0.504,0.006 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.504,0.006 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Precision,0.571,0.007 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.571,0.007 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Precision,0.626,0.007 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.626,0.007 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Precision,0.484,0.018 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.484,0.018 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Precision,0.495,0.004 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.495,0.004 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Precision,0.554,0.012 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.554,0.012 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.455,0.004 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.455,0.004 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.493,0.005 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.493,0.005 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Precision,0.498,0.007 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.498,0.007 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Precision,0.55,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.55,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.502,0.01 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.502,0.01 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Precision,0.496,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.496,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Precision,0.56,0.003 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.56,0.003 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Precision,0.496,0.011 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.496,0.011 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Precision,0.572,0.005 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.572,0.005 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Precision,0.553,0.003 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.553,0.003 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Precision,0.581,0.005 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.581,0.005 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.531,0.015 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.531,0.015 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Precision,0.567,0.015 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.567,0.015 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.418,0.004 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.418,0.004 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Precision,0.484,0.01 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.484,0.01 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Precision,0.56,0.006 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.56,0.006 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Precision,0.489,0.006 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.489,0.006 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Precision,0.515,0.006 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.515,0.006 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Precision,0.479,0.012 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.479,0.012 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Precision,0.582,0.008 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.582,0.008 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Precision,0.504,0.009 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.504,0.009 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.354,0.004 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.354,0.004 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.354,0.004 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.354,0.004 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Precision,0.479,0.006 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.479,0.006 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Precision,0.584,0.006 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.584,0.006 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Precision,0.49,0.007 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.49,0.007 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.348,0.004 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.348,0.004 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.346,0.007 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.346,0.007 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Precision,0.499,0.004 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,Precision,0.499,0.004 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Precision,0.444,0.007 +Claude Haiku 4.5,Solo Models,Advisor,NA,Precision,0.444,0.007 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Precision,0.555,0.005 +Claude Sonnet 4.5,Solo Models,Advisor,NA,Precision,0.555,0.005 +DeepSeek R1,Solo Models,Advisor,DeepSeek,Precision,0.481,0.006 +DeepSeek R1,Solo Models,Advisor,NA,Precision,0.481,0.006 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Precision,0.425,0.004 +DeepSeek V3.1,Solo Models,Advisor,NA,Precision,0.425,0.004 +Expert AI,Solo Models,Advisor,NA,Precision,0.437,0.007 +GPT-4.1,Solo Models,Advisor,OpenAI,Precision,0.434,0.003 +GPT-4.1,Solo Models,Advisor,NA,Precision,0.434,0.003 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,Precision,0.423,0.003 +GPT-4.1 mini,Solo Models,Advisor,NA,Precision,0.423,0.003 +GPT-4o,Solo Models,Advisor,OpenAI,Precision,0.416,0.006 +GPT-4o,Solo Models,Advisor,NA,Precision,0.416,0.006 +GPT-4o mini,Solo Models,Advisor,OpenAI,Precision,0.455,0.006 +GPT-4o mini,Solo Models,Advisor,NA,Precision,0.455,0.006 +GPT-5,Solo Models,Advisor,OpenAI,Precision,0.604,0.012 +GPT-5,Solo Models,Advisor,NA,Precision,0.604,0.012 +GPT-5 mini,Solo Models,Advisor,OpenAI,Precision,0.567,0.007 +GPT-5 mini,Solo Models,Advisor,NA,Precision,0.567,0.007 +GPT-5 nano,Solo Models,Advisor,OpenAI,Precision,0.475,0.006 +GPT-5 nano,Solo Models,Advisor,NA,Precision,0.475,0.006 +Gemini 2.0 Flash,Solo Models,Advisor,Google,Precision,0.385,0.003 +Gemini 2.0 Flash,Solo Models,Advisor,NA,Precision,0.385,0.003 +Gemini 2.5 Flash,Solo Models,Advisor,Google,Precision,0.429,0.01 +Gemini 2.5 Flash,Solo Models,Advisor,NA,Precision,0.429,0.01 +Gemini 2.5 Pro,Solo Models,Advisor,Google,Precision,0.515,0.012 +Gemini 2.5 Pro,Solo Models,Advisor,NA,Precision,0.515,0.012 +Gemini 3 Pro,Solo Models,Advisor,Google,Precision,0.615,0.019 +Gemini 3 Pro,Solo Models,Advisor,NA,Precision,0.615,0.019 +Glass Health 4.0,Solo Models,Advisor,Glass Health,Precision,0.589,0.016 +Glass Health 4.0,Solo Models,Advisor,NA,Precision,0.589,0.016 +Grok 4,Solo Models,Advisor,xAI,Precision,0.5,0.013 +Grok 4,Solo Models,Advisor,NA,Precision,0.5,0.013 +Grok 4 Fast,Solo Models,Advisor,xAI,Precision,0.487,0.013 +Grok 4 Fast,Solo Models,Advisor,NA,Precision,0.487,0.013 +Kimi K2,Solo Models,Advisor,Moonshot AI,Precision,0.406,0.007 +Kimi K2,Solo Models,Advisor,NA,Precision,0.406,0.007 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Precision,0.493,0.004 +LiSA 1.0,Solo Models,Advisor,NA,Precision,0.493,0.004 +Llama 3.3 70b,Solo Models,Advisor,Meta,Precision,0.361,0.005 +Llama 3.3 70b,Solo Models,Advisor,NA,Precision,0.361,0.005 +Llama 4 Maverick,Solo Models,Advisor,Meta,Precision,0.415,0.003 +Llama 4 Maverick,Solo Models,Advisor,NA,Precision,0.415,0.003 +Llama 4 Scout,Solo Models,Advisor,Meta,Precision,0.343,0.002 +Llama 4 Scout,Solo Models,Advisor,NA,Precision,0.343,0.002 +MedGemma 27B,Solo Models,Advisor,Google,Precision,0.427,0.006 +MedGemma 27B,Solo Models,Advisor,NA,Precision,0.427,0.006 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Precision,0.474,0.015 +Mistral Large 2.1,Solo Models,Advisor,NA,Precision,0.474,0.015 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Precision,0.434,0.007 +Mistral Medium 3.1,Solo Models,Advisor,NA,Precision,0.434,0.007 +Qwen3 235B,Solo Models,Advisor,Alibaba,Precision,0.407,0.004 +Qwen3 235B,Solo Models,Advisor,NA,Precision,0.407,0.004 +Qwen3 32B,Solo Models,Advisor,Alibaba,Precision,0.379,0.006 +Qwen3 32B,Solo Models,Advisor,NA,Precision,0.379,0.006 +o1,Solo Models,Advisor,OpenAI,Precision,0.551,0.005 +o1,Solo Models,Advisor,NA,Precision,0.551,0.005 +o1 mini,Solo Models,Advisor,OpenAI,Precision,0.37,0.006 +o1 mini,Solo Models,Advisor,NA,Precision,0.37,0.006 +o3 mini,Solo Models,Advisor,OpenAI,Precision,0.635,0.008 +o3 mini,Solo Models,Advisor,NA,Precision,0.635,0.008 +o4 mini,Solo Models,Advisor,OpenAI,Precision,0.588,0.005 +o4 mini,Solo Models,Advisor,NA,Precision,0.588,0.005 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Recall,0.799,0.011 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.799,0.011 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Recall,0.761,0.008 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.761,0.008 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.839,0.008 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.839,0.008 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.759,0.014 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.759,0.014 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.801,0.007 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.801,0.007 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Recall,0.812,0.003 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.812,0.003 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Recall,0.812,0.012 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.812,0.012 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Recall,0.797,0.003 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.797,0.003 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Recall,0.798,0.002 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.798,0.002 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.794,0.006 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.794,0.006 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Recall,0.782,0.005 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.782,0.005 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Recall,0.789,0.007 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.789,0.007 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Recall,0.8,0.007 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.8,0.007 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Recall,0.762,0.006 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.762,0.006 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Recall,0.802,0.011 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.802,0.011 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Recall,0.824,0.008 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.008 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Recall,0.823,0.003 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.823,0.003 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Recall,0.79,0.005 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.79,0.005 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Recall,0.826,0.003 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.826,0.003 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Recall,0.8,0.007 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.8,0.007 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Recall,0.79,0.007 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.79,0.007 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Recall,0.8,0.015 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.8,0.015 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.834,0.003 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.834,0.003 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.756,0.002 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.756,0.002 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.808,0.008 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.808,0.008 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.82,0.008 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.82,0.008 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.816,0.011 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.816,0.011 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.804,0.005 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.804,0.005 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.807,0.007 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.807,0.007 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.789,0.006 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.789,0.006 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Recall,0.717,0.011 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.717,0.011 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.81,0.016 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.81,0.016 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.735,0.008 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.735,0.008 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.754,0.008 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.754,0.008 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.75,0.007 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.75,0.007 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.754,0.01 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.754,0.01 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.771,0.01 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.771,0.01 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Recall,0.748,0.009 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.748,0.009 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.723,0.012 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.723,0.012 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.729,0.009 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.729,0.009 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.717,0.013 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.717,0.013 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.843,0.007 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.843,0.007 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.747,0.009 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.747,0.009 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.762,0.011 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.762,0.011 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.768,0.023 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.768,0.023 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.78,0.023 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.78,0.023 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.736,0.015 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.736,0.015 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.733,0.015 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.733,0.015 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Recall,0.83,0.007 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.83,0.007 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Recall,0.786,0.011 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.786,0.011 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.873,0.022 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.873,0.022 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.778,0.008 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.778,0.008 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.825,0.007 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.825,0.007 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.858,0.012 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.858,0.012 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.807,0.01 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.807,0.01 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.861,0.008 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.861,0.008 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.858,0.012 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.858,0.012 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.833,0.008 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.833,0.008 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Recall,0.78,0.022 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.78,0.022 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.824,0.006 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.006 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.831,0.004 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.831,0.004 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.799,0.009 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.799,0.009 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Recall,0.843,0.006 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.843,0.006 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.825,0.006 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.825,0.006 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Recall,0.752,0.003 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.752,0.003 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.866,0.029 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.866,0.029 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.777,0.01 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.777,0.01 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.81,0.008 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.81,0.008 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.808,0.009 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.808,0.009 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.763,0.014 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.763,0.014 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Recall,0.808,0.009 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.808,0.009 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.783,0.009 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.783,0.009 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.783,0.008 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.783,0.008 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.779,0.006 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.779,0.006 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Recall,0.807,0.011 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.807,0.011 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Recall,0.735,0.008 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.735,0.008 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.85,0.008 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.85,0.008 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.753,0.014 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.753,0.014 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.798,0.007 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.798,0.007 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.824,0.006 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.006 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.83,0.017 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.83,0.017 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Recall,0.831,0.004 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.831,0.004 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.783,0.006 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.783,0.006 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.792,0.009 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.792,0.009 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.763,0.007 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.763,0.007 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Recall,0.824,0.012 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.012 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Recall,0.755,0.006 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.755,0.006 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.849,0.012 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.849,0.012 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.763,0.011 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.763,0.011 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.799,0.003 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.799,0.003 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.852,0.009 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.852,0.009 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.831,0.012 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.831,0.012 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.795,0.006 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.795,0.006 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.791,0.005 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.791,0.005 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.773,0.002 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.773,0.002 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Recall,0.73,0.015 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.73,0.015 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Recall,0.64,0.013 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.64,0.013 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.732,0.02 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.732,0.02 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.696,0.007 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.696,0.007 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.717,0.011 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.717,0.011 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.656,0.014 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.656,0.014 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.772,0.003 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.772,0.003 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.516,0.02 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.516,0.02 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.519,0.016 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.519,0.016 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.512,0.019 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.512,0.019 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Recall,0.757,0.007 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.757,0.007 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Recall,0.759,0.008 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.759,0.008 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Recall,0.826,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.826,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Recall,0.797,0.005 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.005 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Recall,0.824,0.002 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.824,0.002 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Recall,0.795,0.008 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.795,0.008 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Recall,0.809,0.009 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.809,0.009 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Recall,0.784,0.009 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.784,0.009 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Recall,0.731,0.008 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.731,0.008 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Recall,0.811,0.009 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.811,0.009 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Recall,0.84,0.006 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.84,0.006 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Recall,0.796,0.013 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.796,0.013 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.829,0.007 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.829,0.007 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.789,0.013 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.789,0.013 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Recall,0.794,0.007 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.794,0.007 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Recall,0.799,0.008 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.799,0.008 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.788,0.01 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.788,0.01 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Recall,0.797,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Recall,0.809,0.009 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.809,0.009 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Recall,0.8,0.006 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.8,0.006 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Recall,0.784,0.01 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.784,0.01 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Recall,0.806,0.009 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.806,0.009 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Recall,0.768,0.015 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.768,0.015 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.757,0.012 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.757,0.012 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Recall,0.808,0.005 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.808,0.005 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.785,0.007 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.785,0.007 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Recall,0.831,0.011 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.831,0.011 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Recall,0.817,0.009 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.817,0.009 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Recall,0.833,0.012 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.833,0.012 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Recall,0.844,0.01 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.844,0.01 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Recall,0.845,0.014 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.845,0.014 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Recall,0.771,0.008 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.771,0.008 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Recall,0.816,0.012 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.816,0.012 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.797,0.009 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.009 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.797,0.009 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.009 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Recall,0.749,0.01 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.749,0.01 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Recall,0.772,0.014 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.772,0.014 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Recall,0.819,0.019 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.819,0.019 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.792,0.008 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.792,0.008 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.795,0.008 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.795,0.008 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Recall,0.796,0.005 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,Recall,0.796,0.005 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Recall,0.768,0.01 +Claude Haiku 4.5,Solo Models,Advisor,NA,Recall,0.768,0.01 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Recall,0.762,0.005 +Claude Sonnet 4.5,Solo Models,Advisor,NA,Recall,0.762,0.005 +DeepSeek R1,Solo Models,Advisor,DeepSeek,Recall,0.802,0.005 +DeepSeek R1,Solo Models,Advisor,NA,Recall,0.802,0.005 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Recall,0.835,0.006 +DeepSeek V3.1,Solo Models,Advisor,NA,Recall,0.835,0.006 +Expert AI,Solo Models,Advisor,NA,Recall,0.825,0.009 +GPT-4.1,Solo Models,Advisor,OpenAI,Recall,0.806,0.004 +GPT-4.1,Solo Models,Advisor,NA,Recall,0.806,0.004 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,Recall,0.747,0.004 +GPT-4.1 mini,Solo Models,Advisor,NA,Recall,0.747,0.004 +GPT-4o,Solo Models,Advisor,OpenAI,Recall,0.787,0.02 +GPT-4o,Solo Models,Advisor,NA,Recall,0.787,0.02 +GPT-4o mini,Solo Models,Advisor,OpenAI,Recall,0.571,0.01 +GPT-4o mini,Solo Models,Advisor,NA,Recall,0.571,0.01 +GPT-5,Solo Models,Advisor,OpenAI,Recall,0.752,0.01 +GPT-5,Solo Models,Advisor,NA,Recall,0.752,0.01 +GPT-5 mini,Solo Models,Advisor,OpenAI,Recall,0.749,0.011 +GPT-5 mini,Solo Models,Advisor,NA,Recall,0.749,0.011 +GPT-5 nano,Solo Models,Advisor,OpenAI,Recall,0.682,0.013 +GPT-5 nano,Solo Models,Advisor,NA,Recall,0.682,0.013 +Gemini 2.0 Flash,Solo Models,Advisor,Google,Recall,0.857,0.005 +Gemini 2.0 Flash,Solo Models,Advisor,NA,Recall,0.857,0.005 +Gemini 2.5 Flash,Solo Models,Advisor,Google,Recall,0.833,0.006 +Gemini 2.5 Flash,Solo Models,Advisor,NA,Recall,0.833,0.006 +Gemini 2.5 Pro,Solo Models,Advisor,Google,Recall,0.805,0.016 +Gemini 2.5 Pro,Solo Models,Advisor,NA,Recall,0.805,0.016 +Gemini 3 Pro,Solo Models,Advisor,Google,Recall,0.694,0.025 +Gemini 3 Pro,Solo Models,Advisor,NA,Recall,0.694,0.025 +Glass Health 4.0,Solo Models,Advisor,Glass Health,Recall,0.761,0.022 +Glass Health 4.0,Solo Models,Advisor,NA,Recall,0.761,0.022 +Grok 4,Solo Models,Advisor,xAI,Recall,0.793,0.02 +Grok 4,Solo Models,Advisor,NA,Recall,0.793,0.02 +Grok 4 Fast,Solo Models,Advisor,xAI,Recall,0.77,0.021 +Grok 4 Fast,Solo Models,Advisor,NA,Recall,0.77,0.021 +Kimi K2,Solo Models,Advisor,Moonshot AI,Recall,0.831,0.01 +Kimi K2,Solo Models,Advisor,NA,Recall,0.831,0.01 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Recall,0.846,0.004 +LiSA 1.0,Solo Models,Advisor,NA,Recall,0.846,0.004 +Llama 3.3 70b,Solo Models,Advisor,Meta,Recall,0.814,0.009 +Llama 3.3 70b,Solo Models,Advisor,NA,Recall,0.814,0.009 +Llama 4 Maverick,Solo Models,Advisor,Meta,Recall,0.79,0.005 +Llama 4 Maverick,Solo Models,Advisor,NA,Recall,0.79,0.005 +Llama 4 Scout,Solo Models,Advisor,Meta,Recall,0.793,0.003 +Llama 4 Scout,Solo Models,Advisor,NA,Recall,0.793,0.003 +MedGemma 27B,Solo Models,Advisor,Google,Recall,0.738,0.007 +MedGemma 27B,Solo Models,Advisor,NA,Recall,0.738,0.007 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Recall,0.747,0.024 +Mistral Large 2.1,Solo Models,Advisor,NA,Recall,0.747,0.024 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Recall,0.757,0.013 +Mistral Medium 3.1,Solo Models,Advisor,NA,Recall,0.757,0.013 +Qwen3 235B,Solo Models,Advisor,Alibaba,Recall,0.758,0.014 +Qwen3 235B,Solo Models,Advisor,NA,Recall,0.758,0.014 +Qwen3 32B,Solo Models,Advisor,Alibaba,Recall,0.751,0.01 +Qwen3 32B,Solo Models,Advisor,NA,Recall,0.751,0.01 +o1,Solo Models,Advisor,OpenAI,Recall,0.721,0.005 +o1,Solo Models,Advisor,NA,Recall,0.721,0.005 +o1 mini,Solo Models,Advisor,OpenAI,Recall,0.704,0.007 +o1 mini,Solo Models,Advisor,NA,Recall,0.704,0.007 +o3 mini,Solo Models,Advisor,OpenAI,Recall,0.519,0.009 +o3 mini,Solo Models,Advisor,NA,Recall,0.519,0.009 +o4 mini,Solo Models,Advisor,OpenAI,Recall,0.58,0.009 +o4 mini,Solo Models,Advisor,NA,Recall,0.58,0.009 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Restraint,0.542,0.002 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.542,0.002 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Restraint,0.573,0.007 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.573,0.007 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.444,0.015 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.444,0.015 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.614,0.002 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.614,0.002 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.574,0.005 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.574,0.005 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Restraint,0.533,0.005 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.533,0.005 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Restraint,0.556,0.006 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.556,0.006 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Restraint,0.54,0.009 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.54,0.009 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Restraint,0.533,0.007 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.533,0.007 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.551,0.011 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.551,0.011 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Restraint,0.583,0.009 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.583,0.009 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Restraint,0.599,0.003 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.599,0.003 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Restraint,0.544,0.005 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.544,0.005 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Restraint,0.551,0.008 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.551,0.008 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Restraint,0.55,0.008 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.55,0.008 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Restraint,0.507,0.005 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.507,0.005 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Restraint,0.526,0.005 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.526,0.005 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Restraint,0.543,0.005 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.543,0.005 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Restraint,0.556,0.004 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.556,0.004 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Restraint,0.52,0.006 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.52,0.006 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Restraint,0.532,0.005 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.532,0.005 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Restraint,0.523,0.008 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.523,0.008 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.402,0.01 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.402,0.01 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.602,0.007 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.602,0.007 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.547,0.001 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.547,0.001 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.485,0.002 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.485,0.002 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.535,0.006 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.535,0.006 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.486,0.003 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.486,0.003 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.481,0.003 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.481,0.003 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.503,0.009 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.503,0.009 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Restraint,0.66,0 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.66,0 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.447,0.011 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.447,0.011 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.644,0.008 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.644,0.008 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.631,0.009 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.631,0.009 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.637,0.005 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.637,0.005 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.639,0.007 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.639,0.007 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.615,0.021 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.615,0.021 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Restraint,0.646,0.005 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.646,0.005 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.662,0.002 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.662,0.002 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.632,0.005 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.632,0.005 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.664,0.001 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.664,0.001 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.426,0.024 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.426,0.024 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.633,0.011 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.633,0.011 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.606,0.003 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.606,0.003 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.609,0.012 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.609,0.012 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.6,0.003 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.6,0.003 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.606,0.011 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.606,0.011 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.617,0.012 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.617,0.012 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Restraint,0.508,0.003 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.508,0.003 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Restraint,0.528,0.005 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.528,0.005 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.424,0.008 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.424,0.008 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.604,0.013 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.604,0.013 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.535,0.011 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.535,0.011 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.452,0.002 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.452,0.002 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.52,0.008 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.52,0.008 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.455,0.003 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.455,0.003 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.448,0.004 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.448,0.004 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.487,0.004 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.487,0.004 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Restraint,0.539,0.007 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.539,0.007 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.525,0.006 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.525,0.006 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.503,0.002 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.503,0.002 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.517,0.009 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.517,0.009 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Restraint,0.538,0.003 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.538,0.003 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.499,0.003 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.499,0.003 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Restraint,0.586,0.006 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.586,0.006 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.403,0.013 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.403,0.013 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.615,0.001 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.615,0.001 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.57,0.004 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.57,0.004 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.563,0.002 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.563,0.002 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.551,0.008 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.551,0.008 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Restraint,0.586,0.003 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.586,0.003 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.569,0.005 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.569,0.005 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.567,0.007 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.567,0.007 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.569,0.004 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.569,0.004 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Restraint,0.529,0.008 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.529,0.008 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Restraint,0.56,0.01 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.56,0.01 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.45,0.021 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.45,0.021 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.626,0.006 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.626,0.006 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.566,0.012 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.566,0.012 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.476,0.004 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.476,0.004 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.523,0.007 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.523,0.007 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Restraint,0.543,0.003 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.543,0.003 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.491,0.002 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.491,0.002 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.484,0.005 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.484,0.005 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.529,0.005 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.529,0.005 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Restraint,0.518,0.009 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.518,0.009 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Restraint,0.547,0.005 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.547,0.005 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.441,0.002 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.441,0.002 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.629,0.009 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.629,0.009 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.554,0.006 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.554,0.006 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.455,0.003 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.455,0.003 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.52,0.007 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.52,0.007 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.451,0.003 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.451,0.003 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.448,0.004 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.448,0.004 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.513,0.004 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.513,0.004 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Restraint,0.61,0.007 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.61,0.007 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Restraint,0.665,0.022 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.665,0.022 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.472,0.023 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.472,0.023 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.672,0.002 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.672,0.002 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.634,0.007 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.634,0.007 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.629,0.015 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.629,0.015 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.597,0.012 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.597,0.012 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.694,0.02 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.694,0.02 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.668,0.02 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.668,0.02 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.685,0.027 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.685,0.027 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Restraint,0.551,0.006 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.551,0.006 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Restraint,0.595,0.005 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.595,0.005 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Restraint,0.515,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.515,0.006 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Restraint,0.548,0.003 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.548,0.003 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Restraint,0.547,0.005 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.547,0.005 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Restraint,0.59,0.004 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.59,0.004 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Restraint,0.538,0.005 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.538,0.005 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Restraint,0.591,0.008 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.591,0.008 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Restraint,0.651,0.008 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.651,0.008 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Restraint,0.523,0.009 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.523,0.009 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Restraint,0.536,0.003 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.536,0.003 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Restraint,0.579,0.013 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.579,0.013 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.502,0.003 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.502,0.003 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.53,0.003 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.53,0.003 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Restraint,0.539,0.008 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.539,0.008 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Restraint,0.573,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.573,0.006 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.539,0.007 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.539,0.007 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Restraint,0.533,0.006 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.533,0.006 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Restraint,0.584,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.584,0.005 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Restraint,0.534,0.006 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.534,0.006 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Restraint,0.593,0.005 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.593,0.005 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Restraint,0.58,0.004 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.58,0.004 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Restraint,0.602,0.003 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.602,0.003 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.567,0.007 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.567,0.007 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Restraint,0.588,0.015 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.588,0.015 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.492,0.004 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.492,0.004 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Restraint,0.524,0.007 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.524,0.007 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Restraint,0.583,0.007 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.583,0.007 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Restraint,0.526,0.005 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.526,0.005 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Restraint,0.548,0.006 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.548,0.006 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Restraint,0.52,0.006 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.52,0.006 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Restraint,0.608,0.008 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.608,0.008 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Restraint,0.54,0.008 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.54,0.008 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.45,0.004 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.45,0.004 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.45,0.003 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.45,0.003 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Restraint,0.53,0.005 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.53,0.005 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Restraint,0.609,0.006 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.609,0.006 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Restraint,0.53,0.002 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.53,0.002 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.448,0.005 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.448,0.005 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.447,0.007 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.447,0.007 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Restraint,0.543,0.004 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,Restraint,0.543,0.004 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Restraint,0.494,0.007 +Claude Haiku 4.5,Solo Models,Advisor,NA,Restraint,0.494,0.007 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Restraint,0.591,0.005 +Claude Sonnet 4.5,Solo Models,Advisor,NA,Restraint,0.591,0.005 +DeepSeek R1,Solo Models,Advisor,DeepSeek,Restraint,0.521,0.005 +DeepSeek R1,Solo Models,Advisor,NA,Restraint,0.521,0.005 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Restraint,0.478,0.004 +DeepSeek V3.1,Solo Models,Advisor,NA,Restraint,0.478,0.004 +Expert AI,Solo Models,Advisor,NA,Restraint,0.5,0.005 +GPT-4.1,Solo Models,Advisor,OpenAI,Restraint,0.487,0.003 +GPT-4.1,Solo Models,Advisor,NA,Restraint,0.487,0.003 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,Restraint,0.498,0.003 +GPT-4.1 mini,Solo Models,Advisor,NA,Restraint,0.498,0.003 +GPT-4o,Solo Models,Advisor,OpenAI,Restraint,0.487,0.007 +GPT-4o,Solo Models,Advisor,NA,Restraint,0.487,0.007 +GPT-4o mini,Solo Models,Advisor,OpenAI,Restraint,0.554,0.007 +GPT-4o mini,Solo Models,Advisor,NA,Restraint,0.554,0.007 +GPT-5,Solo Models,Advisor,OpenAI,Restraint,0.631,0.012 +GPT-5,Solo Models,Advisor,NA,Restraint,0.631,0.012 +GPT-5 mini,Solo Models,Advisor,OpenAI,Restraint,0.601,0.008 +GPT-5 mini,Solo Models,Advisor,NA,Restraint,0.601,0.008 +GPT-5 nano,Solo Models,Advisor,OpenAI,Restraint,0.534,0.006 +GPT-5 nano,Solo Models,Advisor,NA,Restraint,0.534,0.006 +Gemini 2.0 Flash,Solo Models,Advisor,Google,Restraint,0.452,0.003 +Gemini 2.0 Flash,Solo Models,Advisor,NA,Restraint,0.452,0.003 +Gemini 2.5 Flash,Solo Models,Advisor,Google,Restraint,0.485,0.007 +Gemini 2.5 Flash,Solo Models,Advisor,NA,Restraint,0.485,0.007 +Gemini 2.5 Pro,Solo Models,Advisor,Google,Restraint,0.545,0.011 +Gemini 2.5 Pro,Solo Models,Advisor,NA,Restraint,0.545,0.011 +Gemini 3 Pro,Solo Models,Advisor,Google,Restraint,0.648,0.02 +Gemini 3 Pro,Solo Models,Advisor,NA,Restraint,0.648,0.02 +Glass Health 4.0,Solo Models,Advisor,Glass Health,Restraint,0.613,0.016 +Glass Health 4.0,Solo Models,Advisor,NA,Restraint,0.613,0.016 +Grok 4,Solo Models,Advisor,xAI,Restraint,0.537,0.012 +Grok 4,Solo Models,Advisor,NA,Restraint,0.537,0.012 +Grok 4 Fast,Solo Models,Advisor,xAI,Restraint,0.532,0.013 +Grok 4 Fast,Solo Models,Advisor,NA,Restraint,0.532,0.013 +Kimi K2,Solo Models,Advisor,Moonshot AI,Restraint,0.469,0.007 +Kimi K2,Solo Models,Advisor,NA,Restraint,0.469,0.007 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Restraint,0.541,0.004 +LiSA 1.0,Solo Models,Advisor,NA,Restraint,0.541,0.004 +Llama 3.3 70b,Solo Models,Advisor,Meta,Restraint,0.451,0.005 +Llama 3.3 70b,Solo Models,Advisor,NA,Restraint,0.451,0.005 +Llama 4 Maverick,Solo Models,Advisor,Meta,Restraint,0.491,0.003 +Llama 4 Maverick,Solo Models,Advisor,NA,Restraint,0.491,0.003 +Llama 4 Scout,Solo Models,Advisor,Meta,Restraint,0.446,0.001 +Llama 4 Scout,Solo Models,Advisor,NA,Restraint,0.446,0.001 +MedGemma 27B,Solo Models,Advisor,Google,Restraint,0.505,0.005 +MedGemma 27B,Solo Models,Advisor,NA,Restraint,0.505,0.005 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Restraint,0.532,0.015 +Mistral Large 2.1,Solo Models,Advisor,NA,Restraint,0.532,0.015 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Restraint,0.505,0.008 +Mistral Medium 3.1,Solo Models,Advisor,NA,Restraint,0.505,0.008 +Qwen3 235B,Solo Models,Advisor,Alibaba,Restraint,0.49,0.006 +Qwen3 235B,Solo Models,Advisor,NA,Restraint,0.49,0.006 +Qwen3 32B,Solo Models,Advisor,Alibaba,Restraint,0.46,0.005 +Qwen3 32B,Solo Models,Advisor,NA,Restraint,0.46,0.005 +o1,Solo Models,Advisor,OpenAI,Restraint,0.59,0.005 +o1,Solo Models,Advisor,NA,Restraint,0.59,0.005 +o1 mini,Solo Models,Advisor,OpenAI,Restraint,0.479,0.004 +o1 mini,Solo Models,Advisor,NA,Restraint,0.479,0.004 +o3 mini,Solo Models,Advisor,OpenAI,Restraint,0.693,0.008 +o3 mini,Solo Models,Advisor,NA,Restraint,0.693,0.008 +o4 mini,Solo Models,Advisor,OpenAI,Restraint,0.651,0.004 +o4 mini,Solo Models,Advisor,NA,Restraint,0.651,0.004 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Safety,0.673,0.011 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.673,0.011 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Safety,0.681,0.008 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.681,0.008 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.374,0.022 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.374,0.022 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.665,0.017 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.665,0.017 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.657,0.035 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.657,0.035 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Safety,0.656,0.029 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.656,0.029 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Safety,0.697,0.011 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.697,0.011 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Safety,0.654,0.026 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.654,0.026 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Safety,0.65,0.02 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.02 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.652,0.03 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.652,0.03 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Safety,0.68,0.015 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.68,0.015 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Safety,0.691,0.008 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.691,0.008 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Safety,0.687,0.025 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.687,0.025 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Safety,0.663,0.015 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.663,0.015 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Safety,0.671,0.015 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.671,0.015 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Safety,0.689,0.014 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.689,0.014 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Safety,0.681,0.011 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.681,0.011 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Safety,0.686,0.01 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.686,0.01 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Safety,0.701,0.012 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.701,0.012 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Safety,0.675,0.012 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.675,0.012 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Safety,0.672,0.014 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.672,0.014 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Safety,0.657,0.013 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.657,0.013 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.328,0.02 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.328,0.02 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.659,0.006 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.659,0.006 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.66,0.005 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.66,0.005 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.633,0.022 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.633,0.022 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.693,0.011 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.693,0.011 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.615,0.02 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.615,0.02 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.617,0.013 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.617,0.013 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.627,0.018 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.627,0.018 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Safety,0.65,0.021 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.021 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.347,0.018 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.347,0.018 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.64,0.013 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.64,0.013 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.634,0.009 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.634,0.009 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.64,0.014 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.64,0.014 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.654,0.018 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.654,0.018 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.667,0.022 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.667,0.022 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Safety,0.633,0.013 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.633,0.013 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.635,0.029 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.635,0.029 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.62,0.037 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.62,0.037 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.616,0.023 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.616,0.023 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.327,0.059 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.327,0.059 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.621,0.017 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.621,0.017 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.626,0.037 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.626,0.037 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.613,0.03 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.613,0.03 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.664,0.024 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.664,0.024 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.577,0.029 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.577,0.029 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.58,0.025 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.58,0.025 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Safety,0.694,0.015 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.694,0.015 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Safety,0.661,0.02 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.661,0.02 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.443,0.045 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.443,0.045 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.674,0.021 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.674,0.021 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.67,0.023 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.67,0.023 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.614,0.007 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.614,0.007 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.679,0.02 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.679,0.02 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.61,0.011 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.61,0.011 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.605,0.002 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.605,0.002 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.643,0.014 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.643,0.014 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Safety,0.666,0.022 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.666,0.022 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.682,0.023 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.682,0.023 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.69,0.009 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.69,0.009 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.66,0.015 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.66,0.015 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Safety,0.707,0.011 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.707,0.011 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.688,0.01 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.688,0.01 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Safety,0.67,0.01 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.67,0.01 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.339,0.042 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.339,0.042 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.698,0.022 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.698,0.022 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.705,0.016 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.705,0.016 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.706,0.011 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.706,0.011 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.661,0.014 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.661,0.014 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Safety,0.715,0.015 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.715,0.015 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.687,0.004 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.687,0.004 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.686,0.003 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.686,0.003 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.682,0.009 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.682,0.009 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Safety,0.69,0.008 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.69,0.008 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Safety,0.652,0.024 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.652,0.024 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.442,0.017 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.442,0.017 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.631,0.014 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.631,0.014 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.65,0.025 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.025 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.614,0.023 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.614,0.023 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.684,0.017 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.684,0.017 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Safety,0.7,0.009 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.7,0.009 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.616,0.009 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.616,0.009 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.59,0.036 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.59,0.036 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.631,0.031 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.631,0.031 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Safety,0.674,0.007 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.674,0.007 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Safety,0.65,0.016 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.016 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.471,0.022 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.471,0.022 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.661,0.006 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.661,0.006 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.643,0.015 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.643,0.015 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.592,0.009 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.592,0.009 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.717,0.018 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.717,0.018 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.52,0.008 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.52,0.008 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.491,0.008 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.491,0.008 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.625,0.025 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.625,0.025 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Safety,0.647,0.014 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.647,0.014 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Safety,0.607,0.022 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.607,0.022 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.367,0.037 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.367,0.037 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.621,0.027 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.621,0.027 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.584,0.01 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.584,0.01 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.567,0.023 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.567,0.023 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.673,0.017 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.673,0.017 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.483,0.006 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.483,0.006 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.486,0.006 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.486,0.006 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.477,0.008 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.477,0.008 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Safety,0.658,0.005 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.658,0.005 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Safety,0.669,0.008 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.669,0.008 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Safety,0.695,0.016 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.695,0.016 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Safety,0.698,0.016 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.698,0.016 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Safety,0.698,0.014 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.698,0.014 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Safety,0.694,0.022 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.694,0.022 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Safety,0.691,0.027 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.691,0.027 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Safety,0.712,0.01 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.712,0.01 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Safety,0.641,0.018 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.641,0.018 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Safety,0.686,0.032 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.686,0.032 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Safety,0.698,0.011 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.698,0.011 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Safety,0.714,0.02 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.714,0.02 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.69,0.017 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.69,0.017 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.675,0.021 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.675,0.021 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Safety,0.688,0.014 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.688,0.014 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Safety,0.707,0.02 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.707,0.02 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.69,0.014 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.69,0.014 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Safety,0.687,0.037 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.687,0.037 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Safety,0.708,0.016 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.708,0.016 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Safety,0.7,0.007 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.7,0.007 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Safety,0.705,0.017 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.705,0.017 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Safety,0.706,0.019 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.706,0.019 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Safety,0.697,0.02 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.697,0.02 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.675,0.02 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.675,0.02 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Safety,0.715,0.044 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.715,0.044 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.619,0.014 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.619,0.014 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Safety,0.73,0.016 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.73,0.016 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Safety,0.736,0.016 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.736,0.016 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Safety,0.736,0.012 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.736,0.012 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Safety,0.756,0.015 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.756,0.015 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Safety,0.729,0.029 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.729,0.029 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Safety,0.655,0.013 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.655,0.013 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Safety,0.715,0.021 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.715,0.021 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.522,0.011 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.522,0.011 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.524,0.011 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.524,0.011 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Safety,0.64,0.017 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.64,0.017 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Safety,0.661,0.025 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.661,0.025 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Safety,0.718,0.033 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.718,0.033 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.5,0.007 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.5,0.007 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.499,0.008 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.499,0.008 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Safety,0.654,0.009 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,Safety,0.654,0.009 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Safety,0.628,0.009 +Claude Haiku 4.5,Solo Models,Advisor,NA,Safety,0.628,0.009 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Safety,0.668,0.009 +Claude Sonnet 4.5,Solo Models,Advisor,NA,Safety,0.668,0.009 +DeepSeek R1,Solo Models,Advisor,DeepSeek,Safety,0.672,0.01 +DeepSeek R1,Solo Models,Advisor,NA,Safety,0.672,0.01 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Safety,0.662,0.011 +DeepSeek V3.1,Solo Models,Advisor,NA,Safety,0.662,0.011 +Expert AI,Solo Models,Advisor,NA,Safety,0.644,0.011 +GPT-4.1,Solo Models,Advisor,OpenAI,Safety,0.609,0.008 +GPT-4.1,Solo Models,Advisor,NA,Safety,0.609,0.008 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,Safety,0.543,0.005 +GPT-4.1 mini,Solo Models,Advisor,NA,Safety,0.543,0.005 +GPT-4o,Solo Models,Advisor,OpenAI,Safety,0.575,0.016 +GPT-4o,Solo Models,Advisor,NA,Safety,0.575,0.016 +GPT-4o mini,Solo Models,Advisor,OpenAI,Safety,0.49,0.012 +GPT-4o mini,Solo Models,Advisor,NA,Safety,0.49,0.012 +GPT-5,Solo Models,Advisor,OpenAI,Safety,0.642,0.01 +GPT-5,Solo Models,Advisor,NA,Safety,0.642,0.01 +GPT-5 mini,Solo Models,Advisor,OpenAI,Safety,0.621,0.014 +GPT-5 mini,Solo Models,Advisor,NA,Safety,0.621,0.014 +GPT-5 nano,Solo Models,Advisor,OpenAI,Safety,0.586,0.014 +GPT-5 nano,Solo Models,Advisor,NA,Safety,0.586,0.014 +Gemini 2.0 Flash,Solo Models,Advisor,Google,Safety,0.605,0.013 +Gemini 2.0 Flash,Solo Models,Advisor,NA,Safety,0.605,0.013 +Gemini 2.5 Flash,Solo Models,Advisor,Google,Safety,0.664,0.013 +Gemini 2.5 Flash,Solo Models,Advisor,NA,Safety,0.664,0.013 +Gemini 2.5 Pro,Solo Models,Advisor,Google,Safety,0.695,0.01 +Gemini 2.5 Pro,Solo Models,Advisor,NA,Safety,0.695,0.01 +Gemini 3 Pro,Solo Models,Advisor,Google,Safety,0.628,0.013 +Gemini 3 Pro,Solo Models,Advisor,NA,Safety,0.628,0.013 +Glass Health 4.0,Solo Models,Advisor,Glass Health,Safety,0.663,0.013 +Glass Health 4.0,Solo Models,Advisor,NA,Safety,0.663,0.013 +Grok 4,Solo Models,Advisor,xAI,Safety,0.646,0.014 +Grok 4,Solo Models,Advisor,NA,Safety,0.646,0.014 +Grok 4 Fast,Solo Models,Advisor,xAI,Safety,0.649,0.021 +Grok 4 Fast,Solo Models,Advisor,NA,Safety,0.649,0.021 +Kimi K2,Solo Models,Advisor,Moonshot AI,Safety,0.613,0.012 +Kimi K2,Solo Models,Advisor,NA,Safety,0.613,0.012 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Safety,0.679,0.008 +LiSA 1.0,Solo Models,Advisor,NA,Safety,0.679,0.008 +Llama 3.3 70b,Solo Models,Advisor,Meta,Safety,0.538,0.009 +Llama 3.3 70b,Solo Models,Advisor,NA,Safety,0.538,0.009 +Llama 4 Maverick,Solo Models,Advisor,Meta,Safety,0.612,0.006 +Llama 4 Maverick,Solo Models,Advisor,NA,Safety,0.612,0.006 +Llama 4 Scout,Solo Models,Advisor,Meta,Safety,0.482,0.006 +Llama 4 Scout,Solo Models,Advisor,NA,Safety,0.482,0.006 +MedGemma 27B,Solo Models,Advisor,Google,Safety,0.554,0.015 +MedGemma 27B,Solo Models,Advisor,NA,Safety,0.554,0.015 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Safety,0.604,0.011 +Mistral Large 2.1,Solo Models,Advisor,NA,Safety,0.604,0.011 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Safety,0.537,0.014 +Mistral Medium 3.1,Solo Models,Advisor,NA,Safety,0.537,0.014 +Qwen3 235B,Solo Models,Advisor,Alibaba,Safety,0.565,0.015 +Qwen3 235B,Solo Models,Advisor,NA,Safety,0.565,0.015 +Qwen3 32B,Solo Models,Advisor,Alibaba,Safety,0.527,0.014 +Qwen3 32B,Solo Models,Advisor,NA,Safety,0.527,0.014 +o1,Solo Models,Advisor,OpenAI,Safety,0.599,0.009 +o1,Solo Models,Advisor,NA,Safety,0.599,0.009 +o1 mini,Solo Models,Advisor,OpenAI,Safety,0.461,0.015 +o1 mini,Solo Models,Advisor,NA,Safety,0.461,0.015 +o3 mini,Solo Models,Advisor,OpenAI,Safety,0.494,0.006 +o3 mini,Solo Models,Advisor,NA,Safety,0.494,0.006 +o4 mini,Solo Models,Advisor,OpenAI,Safety,0.525,0.014 +o4 mini,Solo Models,Advisor,NA,Safety,0.525,0.014 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,nnh_cumulative,9.141,0.944 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,nnh_cumulative,9.697,0.594 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.697,0.594 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,2.784,0.175 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.784,0.175 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,8.889,1.089 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.889,1.089 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,8.862,2.304 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.862,2.304 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,nnh_cumulative,8.586,0.495 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,0.495 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,nnh_cumulative,11.126,3.139 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.126,3.139 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,nnh_cumulative,8.159,0.914 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.159,0.914 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,nnh_cumulative,7.906,0.419 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.906,0.419 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,8.372,0.792 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,nnh_cumulative,9.485,0.661 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.485,0.661 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,nnh_cumulative,13.492,0.974 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,13.492,0.974 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,nnh_cumulative,10.152,1.175 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.152,1.175 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,nnh_cumulative,9.36,1.549 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.36,1.549 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,nnh_cumulative,11.574,0.907 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.574,0.907 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,nnh_cumulative,12.208,1.79 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,12.208,1.79 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,nnh_cumulative,11.636,1.5 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.636,1.5 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,nnh_cumulative,10.373,1.084 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.373,1.084 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,nnh_cumulative,14.599,1.47 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,14.599,1.47 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,nnh_cumulative,10.842,1.451 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.842,1.451 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,nnh_cumulative,10.891,1.669 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.891,1.669 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,nnh_cumulative,10.067,1.145 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.067,1.145 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.633,0.078 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.633,0.078 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,9.394,0.594 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.394,0.594 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.333,0 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.333,0 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.586,0.495 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,0.495 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,10.067,1.145 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.067,1.145 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,8.12,0.419 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.12,0.419 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,7.906,0.419 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.906,0.419 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.333,0 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.333,0 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,nnh_cumulative,8.462,1.508 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.462,1.508 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.615,0.172 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.615,0.172 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.098,0.692 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.098,0.692 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.372,0.792 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.095,1.867 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.095,1.867 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,10.104,4.114 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.104,4.114 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.442,1.273 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.442,1.273 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,nnh_cumulative,8.155,0.573 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.155,0.573 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,7.937,2.04 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.937,2.04 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,6.843,2.211 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.843,2.211 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,7.222,1.089 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.222,1.089 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.518,0.284 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.518,0.284 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,7.425,1.207 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.425,1.207 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,7.639,2.326 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.639,2.326 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,7.555,1.82 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.555,1.82 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.928,1.315 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.928,1.315 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,5.828,0.838 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,5.828,0.838 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,6.035,0.646 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.035,0.646 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,nnh_cumulative,11.79,1.328 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.79,1.328 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,nnh_cumulative,9.697,0.594 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.697,0.594 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,3.394,0.385 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,3.394,0.385 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,9.394,0.594 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.394,0.594 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,8.889,1.089 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.889,1.089 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,9.141,0.944 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,9.975,2.512 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.975,2.512 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,8.97,0.604 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.97,0.604 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,8.625,0.914 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.625,0.914 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,9.677,0.783 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.677,0.783 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,nnh_cumulative,10.067,1.145 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.067,1.145 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,11.667,1.633 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.667,1.633 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,11.136,1.519 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.136,1.519 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,8.799,1.203 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.799,1.203 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,nnh_cumulative,15.718,2.738 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,15.718,2.738 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,11.636,1.932 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.636,1.932 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,nnh_cumulative,7.723,0.674 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.723,0.674 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,2.476,0.253 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.476,0.253 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,10.53,1.998 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.53,1.998 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,11.781,1.622 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.781,1.622 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,11.204,1.417 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.204,1.417 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,7.996,0.847 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.996,0.847 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,nnh_cumulative,15.147,2.581 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,15.147,2.581 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,9.141,0.944 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,9.141,0.944 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,8.586,0.495 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,0.495 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,nnh_cumulative,10.5,0.98 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.5,0.98 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,nnh_cumulative,8.625,0.914 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.625,0.914 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,3.622,0.231 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,3.622,0.231 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,7.009,0.67 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.009,0.67 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,8.12,0.419 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.12,0.419 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,10.278,2.373 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.278,2.373 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,8.372,0.792 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,nnh_cumulative,13.024,1.467 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,13.024,1.467 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,9.59,0.594 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.59,0.594 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,8.586,1.95 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,1.95 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,9.141,0.944 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,nnh_cumulative,8.838,0.495 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.838,0.495 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,nnh_cumulative,9.333,0.8 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.333,0.8 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,4.013,0.328 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.013,0.328 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,8.12,0.419 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.12,0.419 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,7.692,0 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.692,0 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,9.091,0 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.091,0 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,13.968,2.776 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,13.968,2.776 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,7.438,0.356 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.438,0.356 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,6.182,0.307 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.182,0.307 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,9.764,1.32 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.764,1.32 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,nnh_cumulative,10.051,0.81 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.051,0.81 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,nnh_cumulative,6.564,0.72 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.564,0.72 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.81,0.181 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.81,0.181 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,6.825,0.311 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.825,0.311 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,5.773,0.214 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,5.773,0.214 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,6.144,0.512 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.144,0.512 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.372,0.792 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,4.353,0.214 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.353,0.214 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,4.486,0.271 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.486,0.271 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,4.227,0.118 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.227,0.118 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,nnh_cumulative,8.757,0.646 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.757,0.646 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,nnh_cumulative,9.531,1.071 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.531,1.071 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,nnh_cumulative,11.955,1.937 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.955,1.937 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,nnh_cumulative,11.323,2.173 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.323,2.173 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,nnh_cumulative,14.335,1.515 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,14.335,1.515 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,nnh_cumulative,10.933,2.46 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,10.933,2.46 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,nnh_cumulative,11.667,3.443 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.667,3.443 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,nnh_cumulative,12.024,1.227 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.024,1.227 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,nnh_cumulative,8.028,0.935 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.028,0.935 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,nnh_cumulative,11.12,2.825 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.12,2.825 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,nnh_cumulative,13.051,1.152 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,13.051,1.152 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,nnh_cumulative,12.037,0.907 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.037,0.907 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,12.231,2.771 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.231,2.771 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,9.061,1.569 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.061,1.569 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,nnh_cumulative,11.246,2.002 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.246,2.002 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,nnh_cumulative,13.222,3.446 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,13.222,3.446 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,10.543,1.229 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,10.543,1.229 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,nnh_cumulative,12.753,4.294 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.753,4.294 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,nnh_cumulative,11.857,1.618 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.857,1.618 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,nnh_cumulative,11.25,2.376 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.25,2.376 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,nnh_cumulative,10.789,1.612 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,10.789,1.612 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,nnh_cumulative,12.794,2.181 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.794,2.181 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,nnh_cumulative,9.893,1.226 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.893,1.226 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,8.524,0.712 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.524,0.712 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,nnh_cumulative,11.364,2.227 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.364,2.227 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,9.889,0.871 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.889,0.871 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,nnh_cumulative,15.246,2.272 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,15.246,2.272 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,nnh_cumulative,16.27,3.076 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,16.27,3.076 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,nnh_cumulative,15.429,2.519 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,15.429,2.519 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,nnh_cumulative,23.25,4.276 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,23.25,4.276 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,nnh_cumulative,16.349,4.322 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,16.349,4.322 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,nnh_cumulative,8.603,1.017 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.603,1.017 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,nnh_cumulative,13,3.601 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,13,3.601 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,7.505,0.553 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,7.505,0.553 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,7.505,0.553 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,7.505,0.553 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,nnh_cumulative,9.707,0.92 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.707,0.92 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,nnh_cumulative,8.538,1.28 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.538,1.28 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,nnh_cumulative,12.262,2.436 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.262,2.436 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,6.522,0.418 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,6.522,0.418 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,6.485,0.532 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,6.485,0.532 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,nnh_cumulative,8.368,0.354 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,nnh_cumulative,8.368,0.354 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,nnh_cumulative,7.442,0.381 +Claude Haiku 4.5,Solo Models,Advisor,NA,nnh_cumulative,7.442,0.381 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,nnh_cumulative,9.958,0.888 +Claude Sonnet 4.5,Solo Models,Advisor,NA,nnh_cumulative,9.958,0.888 +DeepSeek R1,Solo Models,Advisor,DeepSeek,nnh_cumulative,10.566,0.895 +DeepSeek R1,Solo Models,Advisor,NA,nnh_cumulative,10.566,0.895 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,nnh_cumulative,9.644,0.82 +DeepSeek V3.1,Solo Models,Advisor,NA,nnh_cumulative,9.644,0.82 +Expert AI,Solo Models,Advisor,NA,nnh_cumulative,9.218,0.721 +GPT-4.1,Solo Models,Advisor,OpenAI,nnh_cumulative,7.805,0.359 +GPT-4.1,Solo Models,Advisor,NA,nnh_cumulative,7.805,0.359 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,5.65,0.194 +GPT-4.1 mini,Solo Models,Advisor,NA,nnh_cumulative,5.65,0.194 +GPT-4o,Solo Models,Advisor,OpenAI,nnh_cumulative,6.698,0.474 +GPT-4o,Solo Models,Advisor,NA,nnh_cumulative,6.698,0.474 +GPT-4o mini,Solo Models,Advisor,OpenAI,nnh_cumulative,5.034,0.352 +GPT-4o mini,Solo Models,Advisor,NA,nnh_cumulative,5.034,0.352 +GPT-5,Solo Models,Advisor,OpenAI,nnh_cumulative,7.982,0.48 +GPT-5,Solo Models,Advisor,NA,nnh_cumulative,7.982,0.48 +GPT-5 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,7.23,0.454 +GPT-5 mini,Solo Models,Advisor,NA,nnh_cumulative,7.23,0.454 +GPT-5 nano,Solo Models,Advisor,OpenAI,nnh_cumulative,6.318,0.565 +GPT-5 nano,Solo Models,Advisor,NA,nnh_cumulative,6.318,0.565 +Gemini 2.0 Flash,Solo Models,Advisor,Google,nnh_cumulative,9.128,0.675 +Gemini 2.0 Flash,Solo Models,Advisor,NA,nnh_cumulative,9.128,0.675 +Gemini 2.5 Flash,Solo Models,Advisor,Google,nnh_cumulative,11.461,1.265 +Gemini 2.5 Flash,Solo Models,Advisor,NA,nnh_cumulative,11.461,1.265 +Gemini 2.5 Pro,Solo Models,Advisor,Google,nnh_cumulative,10.425,1.273 +Gemini 2.5 Pro,Solo Models,Advisor,NA,nnh_cumulative,10.425,1.273 +Gemini 3 Pro,Solo Models,Advisor,Google,nnh_cumulative,7.621,0.403 +Gemini 3 Pro,Solo Models,Advisor,NA,nnh_cumulative,7.621,0.403 +Glass Health 4.0,Solo Models,Advisor,Glass Health,nnh_cumulative,8.411,0.628 +Glass Health 4.0,Solo Models,Advisor,NA,nnh_cumulative,8.411,0.628 +Grok 4,Solo Models,Advisor,xAI,nnh_cumulative,7.28,0.607 +Grok 4,Solo Models,Advisor,NA,nnh_cumulative,7.28,0.607 +Grok 4 Fast,Solo Models,Advisor,xAI,nnh_cumulative,9.236,0.969 +Grok 4 Fast,Solo Models,Advisor,NA,nnh_cumulative,9.236,0.969 +Kimi K2,Solo Models,Advisor,Moonshot AI,nnh_cumulative,8.633,0.634 +Kimi K2,Solo Models,Advisor,NA,nnh_cumulative,8.633,0.634 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,nnh_cumulative,10.726,0.822 +LiSA 1.0,Solo Models,Advisor,NA,nnh_cumulative,10.726,0.822 +Llama 3.3 70b,Solo Models,Advisor,Meta,nnh_cumulative,8.27,0.502 +Llama 3.3 70b,Solo Models,Advisor,NA,nnh_cumulative,8.27,0.502 +Llama 4 Maverick,Solo Models,Advisor,Meta,nnh_cumulative,10.337,0.649 +Llama 4 Maverick,Solo Models,Advisor,NA,nnh_cumulative,10.337,0.649 +Llama 4 Scout,Solo Models,Advisor,Meta,nnh_cumulative,6.372,0.175 +Llama 4 Scout,Solo Models,Advisor,NA,nnh_cumulative,6.372,0.175 +MedGemma 27B,Solo Models,Advisor,Google,nnh_cumulative,7.225,0.538 +MedGemma 27B,Solo Models,Advisor,NA,nnh_cumulative,7.225,0.538 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,nnh_cumulative,7.162,0.631 +Mistral Large 2.1,Solo Models,Advisor,NA,nnh_cumulative,7.162,0.631 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,nnh_cumulative,6.202,0.473 +Mistral Medium 3.1,Solo Models,Advisor,NA,nnh_cumulative,6.202,0.473 +Qwen3 235B,Solo Models,Advisor,Alibaba,nnh_cumulative,6.485,0.532 +Qwen3 235B,Solo Models,Advisor,NA,nnh_cumulative,6.485,0.532 +Qwen3 32B,Solo Models,Advisor,Alibaba,nnh_cumulative,5.78,0.374 +Qwen3 32B,Solo Models,Advisor,NA,nnh_cumulative,5.78,0.374 +o1,Solo Models,Advisor,OpenAI,nnh_cumulative,5.608,0.358 +o1,Solo Models,Advisor,NA,nnh_cumulative,5.608,0.358 +o1 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,6.131,0.541 +o1 mini,Solo Models,Advisor,NA,nnh_cumulative,6.131,0.541 +o3 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,4.512,0.128 +o3 mini,Solo Models,Advisor,NA,nnh_cumulative,4.512,0.128 +o4 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,4.901,0.504 +o4 mini,Solo Models,Advisor,NA,nnh_cumulative,4.901,0.504 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,normalized,15,1.132 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,15,1.132 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,normalized,13.333,0.653 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,13.333,0.653 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,100.333,5.807 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,100.333,5.807 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,15,2.263 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,15,2.263 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,15.667,3.974 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,15.667,3.974 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,normalized,17,1.132 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,17,1.132 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,normalized,12.333,5.227 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,12.333,5.227 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,normalized,17.667,1.307 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,17.667,1.307 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,normalized,18,1.132 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,18,1.132 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,16.333,1.729 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16.333,1.729 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,normalized,13.2,1.999 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,13.2,1.999 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,normalized,9.8,0.704 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,9.8,0.704 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,normalized,13.8,0.96 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,13.8,0.96 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,normalized,15.125,1.633 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,15.125,1.633 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,normalized,12.667,1.729 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,12.667,1.729 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,normalized,12.875,1.306 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,12.875,1.306 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,normalized,10.8,1.395 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,10.8,1.395 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,normalized,19.4,4.713 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,19.4,4.713 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,normalized,10.7,0.879 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,10.7,0.879 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,normalized,14.3,1.547 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,14.3,1.547 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,normalized,13.9,1.926 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,13.9,1.926 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,normalized,18.333,2.356 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,18.333,2.356 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,118.333,16.797 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,118.333,16.797 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,15.333,0.653 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,0.653 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,16.667,0.653 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16.667,0.653 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,23.667,0.653 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,23.667,0.653 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,15.333,1.729 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,1.729 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,24.667,0.653 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,24.667,0.653 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,26.333,0.653 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,26.333,0.653 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,24,0 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,24,0 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,normalized,15.333,1.729 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,1.729 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,108.667,10.453 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,108.667,10.453 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,17.25,1.373 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,17.25,1.373 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,16.333,0.653 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16.333,0.653 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,18,4.08 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,18,4.08 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,14.667,2.613 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,14.667,2.613 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,20.333,2.356 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,20.333,2.356 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,normalized,17.2,1.267 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,17.2,1.267 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,18.667,3.974 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,18.667,3.974 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,21,4.933 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,21,4.933 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,19.667,2.848 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,19.667,2.848 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,127,23.82 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,127,23.82 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,18.333,2.613 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,18.333,2.613 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,19,4.08 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,19,4.08 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,18.667,4.573 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,18.667,4.573 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,16.667,2.848 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,16.667,2.848 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,22.667,3.457 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,22.667,3.457 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,22,2.994 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,22,2.994 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,normalized,14,2.445 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,14,2.445 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,normalized,14,2.994 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,14,2.994 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,65.667,9.756 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,65.667,9.756 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,13.333,0.653 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,13.333,0.653 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,14,1.96 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,14,1.96 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,13.667,1.729 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,13.667,1.729 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,15,3.92 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,15,3.92 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,14.6,0.999 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,14.6,0.999 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,15.333,1.307 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,1.307 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,14.6,2.111 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,14.6,2.111 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,normalized,12,3.92 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,12,3.92 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,10.333,1.729 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,10.333,1.729 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,11,1.205 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,11,1.205 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,20.625,5.43 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,20.625,5.43 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,normalized,8.7,1.241 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,8.7,1.241 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,10.8,1.395 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,10.8,1.395 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,normalized,19,1.132 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,19,1.132 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,118.4,21.242 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,118.4,21.242 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,16,2.263 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,16,2.263 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,12.5,2.132 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,12.5,2.132 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,13,2.994 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,13,2.994 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,26.125,5.411 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,26.125,5.411 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,normalized,11.4,1.829 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,11.4,1.829 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,15,2.994 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,15,2.994 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,15,2.994 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,15,2.994 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,16,2.994 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16,2.994 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,normalized,13.8,1.44 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,13.8,1.44 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,normalized,16.667,0.653 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,16.667,0.653 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,62,15.963 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,62,15.963 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,19.333,2.848 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,19.333,2.848 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,17.667,1.729 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,17.667,1.729 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,17,2.994 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,17,2.994 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,27.667,11.107 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,27.667,11.107 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,normalized,12,1.339 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,12,1.339 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,18.875,1.138 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,18.875,1.138 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,20.333,4.573 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,20.333,4.573 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,17.667,4.573 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,17.667,4.573 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,normalized,15.667,4.573 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,15.667,4.573 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,normalized,16.8,1.143 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,16.8,1.143 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,58,2.263 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,58,2.263 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,17,1.132 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,17,1.132 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,19.333,0.653 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,19.333,0.653 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,20.667,0.653 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,20.667,0.653 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,12.7,4.967 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,12.7,4.967 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,28.25,1.47 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,28.25,1.47 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,33.125,1.905 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,33.125,1.905 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,22.667,3.457 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,22.667,3.457 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,normalized,16,0.8 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,16,0.8 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,normalized,21.333,3.974 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,21.333,3.974 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,102.667,14.55 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,102.667,14.55 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,19.667,0.653 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,19.667,0.653 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,22.333,3.638 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,22.333,3.638 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,26.667,5.582 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,26.667,5.582 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,17.333,3.974 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,17.333,3.974 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,36.667,2.848 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,36.667,2.848 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,36,2.994 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,36,2.994 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,38.667,3.974 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,38.667,3.974 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,normalized,15.5,1.214 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.5,1.214 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,normalized,13.667,0.826 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,13.667,0.826 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,normalized,13,1.283 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,13,1.283 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,normalized,11.833,3.012 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.833,3.012 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,normalized,11.125,0.78 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.125,0.78 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,normalized,11,2.4 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11,2.4 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,normalized,11.833,2.65 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.833,2.65 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,normalized,11.4,1.176 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.4,1.176 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,normalized,17.667,1.729 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,17.667,1.729 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,normalized,14.75,3.699 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,14.75,3.699 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,normalized,11,0.653 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11,0.653 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,normalized,10.667,0.653 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.667,0.653 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,10.4,2.018 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.4,2.018 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,13.333,2.513 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,13.333,2.513 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,normalized,11.4,1.709 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.4,1.709 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,normalized,10.8,1.143 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.8,1.143 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,11.5,1.497 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.5,1.497 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,normalized,10.333,2.848 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.333,2.848 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,normalized,9.8,0.96 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,9.8,0.96 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,normalized,11.833,1.553 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.833,1.553 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,normalized,15.3,2.679 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.3,2.679 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,normalized,12.3,2.48 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,12.3,2.48 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,normalized,16.667,2.301 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,16.667,2.301 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,21,1.431 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,21,1.431 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,normalized,12.333,4.284 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,12.333,4.284 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,18.2,1.44 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,18.2,1.44 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,normalized,9,3.05 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,9,3.05 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,normalized,9,2.087 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,9,2.087 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,normalized,10.9,4.615 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.9,4.615 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,normalized,6.5,1.854 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,6.5,1.854 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,normalized,7,1.789 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,7,1.789 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,normalized,15.6,1.709 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.6,1.709 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,normalized,10.6,2.673 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.6,2.673 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,27.4,2.018 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,27.4,2.018 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,27.4,2.018 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,27.4,2.018 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,normalized,16.8,1.686 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,16.8,1.686 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,normalized,15.6,2.6 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.6,2.6 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,normalized,11.333,5.582 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.333,5.582 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,31.4,2.525 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,31.4,2.525 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,31,3.099 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,31,3.099 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,normalized,16.8,0.867 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,normalized,16.8,0.867 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,normalized,20.2,0.95 +Claude Haiku 4.5,Solo Models,Advisor,NA,normalized,20.2,0.95 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,normalized,13.1,1.172 +Claude Sonnet 4.5,Solo Models,Advisor,NA,normalized,13.1,1.172 +DeepSeek R1,Solo Models,Advisor,DeepSeek,normalized,14.3,1.036 +DeepSeek R1,Solo Models,Advisor,NA,normalized,14.3,1.036 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,normalized,17.2,1.472 +DeepSeek V3.1,Solo Models,Advisor,NA,normalized,17.2,1.472 +Expert AI,Solo Models,Advisor,NA,normalized,22.4,1.377 +GPT-4.1,Solo Models,Advisor,OpenAI,normalized,24.35,0.82 +GPT-4.1,Solo Models,Advisor,NA,normalized,24.35,0.82 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,normalized,34.95,1.404 +GPT-4.1 mini,Solo Models,Advisor,NA,normalized,34.95,1.404 +GPT-4o,Solo Models,Advisor,OpenAI,normalized,25.3,2.051 +GPT-4o,Solo Models,Advisor,NA,normalized,25.3,2.051 +GPT-4o mini,Solo Models,Advisor,OpenAI,normalized,40.1,2.928 +GPT-4o mini,Solo Models,Advisor,NA,normalized,40.1,2.928 +GPT-5,Solo Models,Advisor,OpenAI,normalized,17.35,1.298 +GPT-5,Solo Models,Advisor,NA,normalized,17.35,1.298 +GPT-5 mini,Solo Models,Advisor,OpenAI,normalized,20.05,1.307 +GPT-5 mini,Solo Models,Advisor,NA,normalized,20.05,1.307 +GPT-5 nano,Solo Models,Advisor,OpenAI,normalized,24.5,2.449 +GPT-5 nano,Solo Models,Advisor,NA,normalized,24.5,2.449 +Gemini 2.0 Flash,Solo Models,Advisor,Google,normalized,14.6,1.021 +Gemini 2.0 Flash,Solo Models,Advisor,NA,normalized,14.6,1.021 +Gemini 2.5 Flash,Solo Models,Advisor,Google,normalized,11.75,1.239 +Gemini 2.5 Flash,Solo Models,Advisor,NA,normalized,11.75,1.239 +Gemini 2.5 Pro,Solo Models,Advisor,Google,normalized,13.75,1.944 +Gemini 2.5 Pro,Solo Models,Advisor,NA,normalized,13.75,1.944 +Gemini 3 Pro,Solo Models,Advisor,Google,normalized,19.75,1.723 +Gemini 3 Pro,Solo Models,Advisor,NA,normalized,19.75,1.723 +Glass Health 4.0,Solo Models,Advisor,Glass Health,normalized,15.846,1.171 +Glass Health 4.0,Solo Models,Advisor,NA,normalized,15.846,1.171 +Grok 4,Solo Models,Advisor,xAI,normalized,19.267,1.594 +Grok 4,Solo Models,Advisor,NA,normalized,19.267,1.594 +Grok 4 Fast,Solo Models,Advisor,xAI,normalized,20,2.247 +Grok 4 Fast,Solo Models,Advisor,NA,normalized,20,2.247 +Kimi K2,Solo Models,Advisor,Moonshot AI,normalized,17.533,1.541 +Kimi K2,Solo Models,Advisor,NA,normalized,17.533,1.541 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,normalized,12.9,1.119 +LiSA 1.0,Solo Models,Advisor,NA,normalized,12.9,1.119 +Llama 3.3 70b,Solo Models,Advisor,Meta,normalized,21,1.729 +Llama 3.3 70b,Solo Models,Advisor,NA,normalized,21,1.729 +Llama 4 Maverick,Solo Models,Advisor,Meta,normalized,18.5,0.785 +Llama 4 Maverick,Solo Models,Advisor,NA,normalized,18.5,0.785 +Llama 4 Scout,Solo Models,Advisor,Meta,normalized,32.35,0.967 +Llama 4 Scout,Solo Models,Advisor,NA,normalized,32.35,0.967 +MedGemma 27B,Solo Models,Advisor,Google,normalized,28,1.987 +MedGemma 27B,Solo Models,Advisor,NA,normalized,28,1.987 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,normalized,22.2,2.336 +Mistral Large 2.1,Solo Models,Advisor,NA,normalized,22.2,2.336 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,normalized,29.133,1.468 +Mistral Medium 3.1,Solo Models,Advisor,NA,normalized,29.133,1.468 +Qwen3 235B,Solo Models,Advisor,Alibaba,normalized,31.25,3.869 +Qwen3 235B,Solo Models,Advisor,NA,normalized,31.25,3.869 +Qwen3 32B,Solo Models,Advisor,Alibaba,normalized,30.077,1.668 +Qwen3 32B,Solo Models,Advisor,NA,normalized,30.077,1.668 +o1,Solo Models,Advisor,OpenAI,normalized,23.6,1.345 +o1,Solo Models,Advisor,NA,normalized,23.6,1.345 +o1 mini,Solo Models,Advisor,OpenAI,normalized,28.6,2.968 +o1 mini,Solo Models,Advisor,NA,normalized,28.6,2.968 +o3 mini,Solo Models,Advisor,OpenAI,normalized,35.3,1.408 +o3 mini,Solo Models,Advisor,NA,normalized,35.3,1.408 +o4 mini,Solo Models,Advisor,OpenAI,normalized,39.9,2.47 +o4 mini,Solo Models,Advisor,NA,normalized,39.9,2.47 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,pct_cumulative,0.11,0.011 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,pct_cumulative,0.103,0.007 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.007 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.36,0.023 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.36,0.023 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.113,0.013 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.013 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.117,0.028 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.028 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,pct_cumulative,0.117,0.007 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.007 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,pct_cumulative,0.093,0.024 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.093,0.024 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,pct_cumulative,0.123,0.013 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.013 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,pct_cumulative,0.127,0.007 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.127,0.007 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.12,0.011 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,pct_cumulative,0.106,0.008 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.106,0.008 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,pct_cumulative,0.075,0.005 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.075,0.005 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,pct_cumulative,0.1,0.012 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.012 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,pct_cumulative,0.111,0.015 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.111,0.015 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,pct_cumulative,0.087,0.007 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.087,0.007 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,pct_cumulative,0.085,0.012 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.085,0.012 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,pct_cumulative,0.089,0.01 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.089,0.01 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,pct_cumulative,0.099,0.011 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.099,0.011 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,pct_cumulative,0.07,0.007 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.07,0.007 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,pct_cumulative,0.096,0.012 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.096,0.012 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,pct_cumulative,0.096,0.012 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.096,0.012 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,pct_cumulative,0.1,0.011 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.011 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.38,0.011 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.38,0.011 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.107,0.007 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.107,0.007 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.12,0 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.117,0.007 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.007 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.1,0.011 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.011 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.123,0.007 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.007 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.127,0.007 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.127,0.007 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.12,0 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,pct_cumulative,0.12,0.02 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.02 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.383,0.026 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.383,0.026 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.125,0.01 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.125,0.01 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.12,0.011 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.127,0.026 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.127,0.026 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.107,0.036 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.107,0.036 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.12,0.02 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.02 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,pct_cumulative,0.124,0.008 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.124,0.008 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.13,0.03 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.13,0.03 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.153,0.043 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.153,0.043 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.14,0.02 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.14,0.02 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.402,0.042 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.402,0.042 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.137,0.024 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.137,0.024 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.137,0.036 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.137,0.036 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.137,0.035 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.137,0.035 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.113,0.017 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.017 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.173,0.024 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.173,0.024 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.167,0.017 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.167,0.017 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,pct_cumulative,0.087,0.008 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.087,0.008 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,pct_cumulative,0.103,0.007 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.007 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.297,0.035 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.297,0.035 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.107,0.007 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.107,0.007 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.113,0.013 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.013 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.11,0.011 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.103,0.024 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.024 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.112,0.007 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.112,0.007 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.117,0.013 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.013 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.104,0.008 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.104,0.008 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,pct_cumulative,0.1,0.011 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.011 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.087,0.013 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.087,0.013 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.093,0.01 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.093,0.01 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.117,0.016 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.016 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,pct_cumulative,0.068,0.011 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.068,0.011 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.091,0.013 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.091,0.013 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,pct_cumulative,0.13,0.011 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.13,0.011 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.408,0.039 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.408,0.039 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.097,0.017 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.097,0.017 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.089,0.013 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.089,0.013 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.09,0.011 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.09,0.011 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.128,0.013 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.128,0.013 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,pct_cumulative,0.07,0.011 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.07,0.011 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.11,0.011 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.11,0.011 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.117,0.007 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.007 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,pct_cumulative,0.096,0.008 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.096,0.008 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,pct_cumulative,0.117,0.013 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.013 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.277,0.017 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.277,0.017 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.143,0.013 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.143,0.013 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.123,0.007 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.007 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.1,0.023 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.023 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.12,0.011 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,pct_cumulative,0.079,0.008 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.079,0.008 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.105,0.006 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.105,0.006 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.12,0.03 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.03 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.11,0.011 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,pct_cumulative,0.113,0.007 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.007 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,pct_cumulative,0.108,0.01 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.108,0.01 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.25,0.02 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.25,0.02 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.123,0.007 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.007 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.13,0 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.13,0 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.11,0 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.078,0.014 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.078,0.014 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.135,0.006 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.135,0.006 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.162,0.008 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.162,0.008 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.103,0.013 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.013 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,pct_cumulative,0.1,0.008 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.008 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,pct_cumulative,0.153,0.017 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.153,0.017 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.357,0.024 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.357,0.024 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.147,0.007 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.147,0.007 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.173,0.007 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.173,0.007 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.163,0.013 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.163,0.013 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.12,0.011 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.23,0.011 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.23,0.011 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.223,0.013 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.223,0.013 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.237,0.007 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.237,0.007 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,pct_cumulative,0.115,0.008 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.115,0.008 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,pct_cumulative,0.107,0.012 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.107,0.012 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,pct_cumulative,0.088,0.013 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.088,0.013 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,pct_cumulative,0.093,0.021 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.093,0.021 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,pct_cumulative,0.071,0.008 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.071,0.008 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,pct_cumulative,0.095,0.02 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.095,0.02 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,pct_cumulative,0.093,0.021 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.093,0.021 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,pct_cumulative,0.084,0.008 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.084,0.008 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,pct_cumulative,0.127,0.014 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.127,0.014 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,pct_cumulative,0.095,0.026 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.095,0.026 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,pct_cumulative,0.078,0.006 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.078,0.006 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,pct_cumulative,0.083,0.007 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.007 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.086,0.018 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.086,0.018 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.115,0.021 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.115,0.021 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,pct_cumulative,0.092,0.017 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.092,0.017 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,pct_cumulative,0.08,0.016 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.08,0.016 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.097,0.012 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.097,0.012 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,pct_cumulative,0.083,0.028 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.028 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,pct_cumulative,0.086,0.012 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.086,0.012 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,pct_cumulative,0.093,0.017 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.093,0.017 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,pct_cumulative,0.097,0.013 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.097,0.013 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,pct_cumulative,0.083,0.012 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.012 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,pct_cumulative,0.103,0.014 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.103,0.014 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.118,0.009 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.118,0.009 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,pct_cumulative,0.09,0.02 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.09,0.02 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.102,0.01 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.102,0.01 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,pct_cumulative,0.069,0.01 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.069,0.01 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,pct_cumulative,0.066,0.011 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.066,0.011 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,pct_cumulative,0.069,0.011 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.069,0.011 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,pct_cumulative,0.047,0.01 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.047,0.01 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,pct_cumulative,0.065,0.019 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.065,0.019 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,pct_cumulative,0.118,0.014 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.118,0.014 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,pct_cumulative,0.082,0.018 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.082,0.018 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.134,0.01 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.134,0.01 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.134,0.01 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.134,0.01 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,pct_cumulative,0.104,0.01 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.104,0.01 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,pct_cumulative,0.12,0.019 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.12,0.019 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,pct_cumulative,0.083,0.017 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.017 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.154,0.01 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.154,0.01 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.155,0.013 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.155,0.013 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,pct_cumulative,0.12,0.005 +Claude 3.7 Sonnet,Solo Models,Advisor,NA,pct_cumulative,0.12,0.005 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,pct_cumulative,0.136,0.007 +Claude Haiku 4.5,Solo Models,Advisor,NA,pct_cumulative,0.136,0.007 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,pct_cumulative,0.104,0.009 +Claude Sonnet 4.5,Solo Models,Advisor,NA,pct_cumulative,0.104,0.009 +DeepSeek R1,Solo Models,Advisor,DeepSeek,pct_cumulative,0.098,0.008 +DeepSeek R1,Solo Models,Advisor,NA,pct_cumulative,0.098,0.008 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,pct_cumulative,0.106,0.008 +DeepSeek V3.1,Solo Models,Advisor,NA,pct_cumulative,0.106,0.008 +Expert AI,Solo Models,Advisor,NA,pct_cumulative,0.111,0.01 +GPT-4.1,Solo Models,Advisor,OpenAI,pct_cumulative,0.13,0.006 +GPT-4.1,Solo Models,Advisor,NA,pct_cumulative,0.13,0.006 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.178,0.006 +GPT-4.1 mini,Solo Models,Advisor,NA,pct_cumulative,0.178,0.006 +GPT-4o,Solo Models,Advisor,OpenAI,pct_cumulative,0.153,0.011 +GPT-4o,Solo Models,Advisor,NA,pct_cumulative,0.153,0.011 +GPT-4o mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.201,0.014 +GPT-4o mini,Solo Models,Advisor,NA,pct_cumulative,0.201,0.014 +GPT-5,Solo Models,Advisor,OpenAI,pct_cumulative,0.128,0.008 +GPT-5,Solo Models,Advisor,NA,pct_cumulative,0.128,0.008 +GPT-5 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.141,0.009 +GPT-5 mini,Solo Models,Advisor,NA,pct_cumulative,0.141,0.009 +GPT-5 nano,Solo Models,Advisor,OpenAI,pct_cumulative,0.161,0.013 +GPT-5 nano,Solo Models,Advisor,NA,pct_cumulative,0.161,0.013 +Gemini 2.0 Flash,Solo Models,Advisor,Google,pct_cumulative,0.111,0.008 +Gemini 2.0 Flash,Solo Models,Advisor,NA,pct_cumulative,0.111,0.008 +Gemini 2.5 Flash,Solo Models,Advisor,Google,pct_cumulative,0.092,0.009 +Gemini 2.5 Flash,Solo Models,Advisor,NA,pct_cumulative,0.092,0.009 +Gemini 2.5 Pro,Solo Models,Advisor,Google,pct_cumulative,0.102,0.009 +Gemini 2.5 Pro,Solo Models,Advisor,NA,pct_cumulative,0.102,0.009 +Gemini 3 Pro,Solo Models,Advisor,Google,pct_cumulative,0.133,0.007 +Gemini 3 Pro,Solo Models,Advisor,NA,pct_cumulative,0.133,0.007 +Glass Health 4.0,Solo Models,Advisor,Glass Health,pct_cumulative,0.121,0.008 +Glass Health 4.0,Solo Models,Advisor,NA,pct_cumulative,0.121,0.008 +Grok 4,Solo Models,Advisor,xAI,pct_cumulative,0.141,0.011 +Grok 4,Solo Models,Advisor,NA,pct_cumulative,0.141,0.011 +Grok 4 Fast,Solo Models,Advisor,xAI,pct_cumulative,0.113,0.012 +Grok 4 Fast,Solo Models,Advisor,NA,pct_cumulative,0.113,0.012 +Kimi K2,Solo Models,Advisor,Moonshot AI,pct_cumulative,0.118,0.008 +Kimi K2,Solo Models,Advisor,NA,pct_cumulative,0.118,0.008 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,pct_cumulative,0.096,0.007 +LiSA 1.0,Solo Models,Advisor,NA,pct_cumulative,0.096,0.007 +Llama 3.3 70b,Solo Models,Advisor,Meta,pct_cumulative,0.122,0.008 +Llama 3.3 70b,Solo Models,Advisor,NA,pct_cumulative,0.122,0.008 +Llama 4 Maverick,Solo Models,Advisor,Meta,pct_cumulative,0.098,0.006 +Llama 4 Maverick,Solo Models,Advisor,NA,pct_cumulative,0.098,0.006 +Llama 4 Scout,Solo Models,Advisor,Meta,pct_cumulative,0.158,0.004 +Llama 4 Scout,Solo Models,Advisor,NA,pct_cumulative,0.158,0.004 +MedGemma 27B,Solo Models,Advisor,Google,pct_cumulative,0.14,0.01 +MedGemma 27B,Solo Models,Advisor,NA,pct_cumulative,0.14,0.01 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,pct_cumulative,0.143,0.01 +Mistral Large 2.1,Solo Models,Advisor,NA,pct_cumulative,0.143,0.01 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,pct_cumulative,0.164,0.01 +Mistral Medium 3.1,Solo Models,Advisor,NA,pct_cumulative,0.164,0.01 +Qwen3 235B,Solo Models,Advisor,Alibaba,pct_cumulative,0.155,0.013 +Qwen3 235B,Solo Models,Advisor,NA,pct_cumulative,0.155,0.013 +Qwen3 32B,Solo Models,Advisor,Alibaba,pct_cumulative,0.175,0.012 +Qwen3 32B,Solo Models,Advisor,NA,pct_cumulative,0.175,0.012 +o1,Solo Models,Advisor,OpenAI,pct_cumulative,0.18,0.011 +o1,Solo Models,Advisor,NA,pct_cumulative,0.18,0.011 +o1 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.166,0.014 +o1 mini,Solo Models,Advisor,NA,pct_cumulative,0.166,0.014 +o3 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.222,0.006 +o3 mini,Solo Models,Advisor,NA,pct_cumulative,0.222,0.006 +o4 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.209,0.021 +o4 mini,Solo Models,Advisor,NA,pct_cumulative,0.209,0.021 +Human Generalist Physicians,Solo Models,Human,Human,Completeness,0.333,0.025 +Human,Solo Models,Human,NA,Completeness,0.333,0.025 +Human Generalist Physicians,Solo Models,Human,Human,Escalation,0.545,0.136 +Human,Solo Models,Human,NA,Escalation,0.545,0.136 +Human Generalist Physicians,Solo Models,Human,Human,F1,0.551,0.03 +Human,Solo Models,Human,NA,F1,0.551,0.03 +Human Generalist Physicians,Solo Models,Human,Human,OverallScore,0.46,0.005 +Human,Solo Models,Human,NA,OverallScore,0.46,0.005 +Human Generalist Physicians,Solo Models,Human,Human,Precision,0.492,0.059 +Human,Solo Models,Human,NA,Precision,0.492,0.059 +Human Generalist Physicians,Solo Models,Human,Human,Recall,0.629,0.019 +Human,Solo Models,Human,NA,Recall,0.629,0.019 +Human Generalist Physicians,Solo Models,Human,Human,Restraint,0.559,0.062 +Human,Solo Models,Human,NA,Restraint,0.559,0.062 +Human Generalist Physicians,Solo Models,Human,Human,Safety,0.586,0.016 +Human,Solo Models,Human,NA,Safety,0.586,0.016 +Human Generalist Physicians,Solo Models,Human,Human,nnh_cumulative,6.167,1.424 +Human,Solo Models,Human,NA,nnh_cumulative,6.167,1.424 +Human Generalist Physicians,Solo Models,Human,Human,normalized,33.333,7.544 +Human,Solo Models,Human,NA,normalized,33.333,7.544 +Human Generalist Physicians,Solo Models,Human,Human,pct_cumulative,0.167,0.038 +Human,Solo Models,Human,NA,pct_cumulative,0.167,0.038 +Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Runtime,62.516,1.993 +Claude 3.7 Sonnet,Solo Models,AdvisorAvoid,Anthropic,Runtime,62.516,1.993 +Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Runtime,115.97,4.194 +Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Runtime,150.11,5.59 +Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,78.058,2.583 +Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,161.371,5.044 +Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,122.695,3.428 +Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Runtime,73.355,2.621 +Claude 3.7 Sonnet + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Anthropic + Google + Anthropic,Runtime,128.244,4.598 +Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Runtime,142.958,2.915 +Claude 3.7 Sonnet + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Anthropic + Google + OpenAI,Runtime,225.74,5.002 +Claude 3.7 Sonnet + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Anthropic + Google + OpenAI,Runtime,212.682,4.58 +Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Runtime,77.181,2.666 +Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Runtime,84.859,2.964 +Claude 3.7 Sonnet + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Anthropic + Meta + Google,Runtime,165.869,4.133 +Claude 3.7 Sonnet + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Anthropic + Meta + Google,Runtime,161.69,3.931 +Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,93.705,2.773 +Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Runtime,28.477,0.45 +Claude Haiku 4.5,Solo Models,AdvisorAvoid,Anthropic,Runtime,30.124,0.86 +Claude Haiku 4.5,Solo Models,AdvisorMax,Anthropic,Runtime,26.829,0.228 +Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Runtime,52.02,1.421 +Claude Sonnet 4.5,Solo Models,AdvisorAvoid,Anthropic,Runtime,70.42,2.305 +Claude Sonnet 4.5,Solo Models,AdvisorMax,Anthropic,Runtime,33.621,0.404 +Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Runtime,170.678,3.968 +Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Runtime,116.461,2.503 +DeepSeek R1,Solo Models,Advisor,DeepSeek,Runtime,81.615,1.169 +DeepSeek R1,Solo Models,AdvisorAvoid,DeepSeek,Runtime,95.022,1.937 +DeepSeek R1,Solo Models,AdvisorMax,DeepSeek,Runtime,59.242,1.249 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Runtime,135.939,3.96 +DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,DeepSeek + Anthropic,Runtime,130.946,4.556 +DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Runtime,163.641,4.122 +DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Runtime,212.295,5.646 +DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Runtime,231.908,5.843 +DeepSeek R1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,109.808,5.074 +DeepSeek R1 + GPT-5,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,195.841,6.874 +DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,162.666,5.099 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Runtime,112.424,3.447 +DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Steward,DeepSeek + Google,Runtime,106.073,3.301 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Runtime,154.679,3.666 +DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,DeepSeek + Google + Anthropic,Runtime,155.343,3.687 +DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Runtime,183.728,4.488 +DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Runtime,162.221,3.627 +DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Runtime,145.062,3.407 +DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Runtime,211.683,5.023 +DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Runtime,221.04,4.675 +DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Runtime,182.48,3.443 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Runtime,245.384,5.358 +DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,DeepSeek + Google + OpenAI,Runtime,243.615,3.915 +DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Runtime,140.472,3.378 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Runtime,112.284,3.279 +DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Steward,DeepSeek + Meta,Runtime,111.754,3.301 +DeepSeek R1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Runtime,115.821,4.854 +DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Meta + Google,Runtime,195.417,6.177 +DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,DeepSeek + Meta + Google,Runtime,191.133,5.747 +DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,130.064,3.33 +DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Runtime,30.446,0.609 +DeepSeek V3.1,Solo Models,AdvisorAvoid,DeepSeek,Runtime,29.078,1.056 +DeepSeek V3.1,Solo Models,AdvisorMax,DeepSeek,Runtime,31.13,0.743 +Expert AI,Solo Models,Advisor,UpToDate,Runtime,44.729,0.594 +Expert AI,Solo Models,AdvisorAvoid,UpToDate,Runtime,44.21,0.805 +Expert AI,Solo Models,AdvisorFree,UpToDate,Runtime,66.261,0.937 +Expert AI,Solo Models,AdvisorMax,UpToDate,Runtime,45.768,0.756 +GPT-4.1,Solo Models,Advisor,OpenAI,Runtime,13.217,0.101 +GPT-4.1,Solo Models,AdvisorAvoid,OpenAI,Runtime,14.094,0.141 +GPT-4.1,Solo Models,AdvisorMax,OpenAI,Runtime,12.34,0.124 +GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,69.434,3.092 +GPT-4.1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,102.3,5.019 +GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,31.085,0.52 +GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,120.15,4.805 +GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,76.648,2.195 +GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,28.071,0.374 +GPT-4.1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + Anthropic,Runtime,82.262,3.743 +GPT-4.1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + Anthropic,Runtime,78.449,2.482 +GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,99.267,1.299 +GPT-4.1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,178.031,3.66 +GPT-4.1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,169.855,3.521 +GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,31.438,0.488 +GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,38.471,1.441 +GPT-4.1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,114.786,2.219 +GPT-4.1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,112.396,2.241 +GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,49.474,0.784 +GPT-4.1 mini,Solo Models,Advisor,OpenAI,Runtime,41.688,1.365 +GPT-4.1 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,19.654,0.353 +GPT-4.1 mini,Solo Models,AdvisorMax,OpenAI,Runtime,63.721,1.897 +GPT-4o,Solo Models,Advisor,OpenAI,Runtime,22.031,0.411 +GPT-4o,Solo Models,AdvisorAvoid,OpenAI,Runtime,17.066,0.259 +GPT-4o,Solo Models,AdvisorMax,OpenAI,Runtime,26.997,0.648 +GPT-4o mini,Solo Models,Advisor,OpenAI,Runtime,16.981,0.285 +GPT-4o mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,16.981,0.285 +GPT-5,Solo Models,Advisor,OpenAI,Runtime,75.236,1.445 +GPT-5,Solo Models,AdvisorAvoid,OpenAI,Runtime,91.382,2.401 +GPT-5,Solo Models,AdvisorFree,OpenAI,Runtime,67.492,1.418 +GPT-5,Solo Models,AdvisorMax,OpenAI,Runtime,59.089,0.768 +GPT-5 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,145.511,4.351 +GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,177.368,6.495 +GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,109.266,3.375 +GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,174.885,4.317 +GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Runtime,233.233,5.215 +GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,153.537,3.735 +GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,106.431,3.294 +GPT-5 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + Anthropic,Runtime,150.872,4.85 +GPT-5 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + Anthropic,Runtime,155.246,4.357 +GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,139.303,4.358 +GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,173.411,3.526 +GPT-5 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,241.583,5.454 +GPT-5 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,235.359,4.993 +GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Runtime,138.79,2.475 +GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,109.694,3.299 +GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,116.871,3.814 +GPT-5 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,183.745,4.66 +GPT-5 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,183.348,4.46 +GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,124.963,3.454 +GPT-5 mini,Solo Models,Advisor,OpenAI,Runtime,64.216,1.044 +GPT-5 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,54.46,1.132 +GPT-5 mini,Solo Models,AdvisorMax,OpenAI,Runtime,73.973,1.532 +GPT-5 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,127.462,4.789 +GPT-5 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,141.887,5.593 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,72.794,1.767 +GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Steward,OpenAI + OpenAI,Runtime,68.89,1.745 +GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,165.47,5.52 +GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,115.81,2.816 +GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,70.972,1.963 +GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,138.613,2.244 +GPT-5 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,212.173,4.295 +GPT-5 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,204.338,3.971 +GPT-5 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,75.718,2.014 +GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,83.961,2.603 +GPT-5 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,155.754,3.408 +GPT-5 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,153.856,3.402 +GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,87.828,2.089 +GPT-5 nano,Solo Models,Advisor,OpenAI,Runtime,43.758,0.894 +GPT-5 nano,Solo Models,AdvisorAvoid,OpenAI,Runtime,43.758,0.894 +Gemini 2.0 Flash,Solo Models,Advisor,Google,Runtime,15.999,0.18 +Gemini 2.0 Flash,Solo Models,AdvisorAvoid,Google,Runtime,15.999,0.18 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Runtime,84.673,2.751 +Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,Google + Anthropic,Runtime,72.956,1.884 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Google + Anthropic + Anthropic,Runtime,139.956,5.686 +Gemini 2.0 Flash + Claude 3.7 Sonnet + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + OpenAI,Runtime,175,6.422 +Gemini 2.0 Flash + Claude 3.7 Sonnet + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + Anthropic + OpenAI,Runtime,162.485,5.439 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Runtime,165.912,4.645 +Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + Anthropic + Google,Runtime,161.669,3.386 +Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Runtime,135.355,2.964 +Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Runtime,102.837,5.21 +Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,33.758,0.58 +Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,118.552,4.424 +Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,78.616,2.439 +Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,30.42,0.458 +Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,102.026,1.43 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Runtime,176.972,3.019 +Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + Google + OpenAI,Runtime,170.951,2.992 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,34.228,0.478 +Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Steward,Google + Meta,Runtime,35.848,0.499 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,40.348,0.982 +Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Steward,Google + Meta,Runtime,38.207,0.824 +Gemini 2.0 Flash + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Runtime,123.429,2.223 +Gemini 2.0 Flash + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + Meta + Google,Runtime,119.499,1.935 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,52.735,0.769 +Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Steward,Google + OpenAI,Runtime,51.33,0.743 +Gemini 2.5 Flash,Solo Models,Advisor,Google,Runtime,48.326,0.642 +Gemini 2.5 Flash,Solo Models,AdvisorAvoid,Google,Runtime,61.444,0.552 +Gemini 2.5 Flash,Solo Models,AdvisorMax,Google,Runtime,17.845,0.274 +Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Runtime,119.39,3.171 +Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,134.413,2.561 +Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,125.624,1.476 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Runtime,176.966,4.796 +Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Runtime,193.598,2.341 +Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,139.192,1.467 +Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Runtime,180.858,2.545 +Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Runtime,198.144,2.32 +Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Runtime,196.918,2.084 +Gemini 2.5 Flash + Glass Health 4.0,2-Agent Teams,Advisor + Guardian,Google + Glass Health,Runtime,188.96,6.889 +Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Runtime,106.713,1.225 +Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,84.71,1.099 +Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Runtime,135.103,2.303 +Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Runtime,152.116,2.238 +Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Runtime,165.027,1.917 +Gemini 2.5 Pro,Solo Models,Advisor,Google,Runtime,89.581,0.43 +Gemini 2.5 Pro,Solo Models,AdvisorAvoid,Google,Runtime,92.728,0.644 +Gemini 2.5 Pro,Solo Models,AdvisorFree,Google,Runtime,83.898,0.907 +Gemini 2.5 Pro,Solo Models,AdvisorMax,Google,Runtime,83.006,0.825 +Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Runtime,171.56,5.385 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,104.65,1.338 +Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Steward,Google + OpenAI,Runtime,101.098,1.408 +Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,189.569,4.876 +Gemini 2.5 Pro + GPT-5 + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + Anthropic,Runtime,233.592,5.612 +Gemini 2.5 Pro + GPT-5 + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + Anthropic,Runtime,236.49,5.691 +Gemini 2.5 Pro + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Runtime,248.59,6.044 +Gemini 2.5 Pro + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + OpenAI,Runtime,243.911,5.557 +Gemini 2.5 Pro + GPT-5 + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + Google,Runtime,249.206,5.548 +Gemini 2.5 Pro + GPT-5 + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + Google,Runtime,249.216,5.381 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,159.842,1.969 +Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Steward,Google + OpenAI,Runtime,163.719,1.859 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Runtime,226.428,2.42 +Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + OpenAI,Runtime,232.66,2.625 +Gemini 2.5 Pro + GPT-5 mini + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + Google,Runtime,222.061,4.184 +Gemini 2.5 Pro + GPT-5 mini + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + Google,Runtime,223.92,4.039 +Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Runtime,211.436,2.217 +Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,100.153,1.544 +Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,163.924,1.494 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Runtime,231.227,2.856 +Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + Google + OpenAI,Runtime,235.974,3.469 +Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Runtime,231.823,2.322 +Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Runtime,136.413,1.22 +Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,105.152,1.638 +Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,113.295,2.166 +Gemini 2.5 Pro + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Runtime,184.862,3.168 +Gemini 2.5 Pro + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + Meta + Google,Runtime,182.81,3.071 +Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,117.865,1.663 +Gemini 3 Pro,Solo Models,Advisor,Google,Runtime,89.201,1.387 +Gemini 3 Pro,Solo Models,AdvisorAvoid,Google,Runtime,81.096,1.543 +Gemini 3 Pro,Solo Models,AdvisorMax,Google,Runtime,97.306,2.195 +Glass Health 4.0,Solo Models,Advisor,Glass Health,Runtime,121.469,1.661 +Glass Health 4.0,Solo Models,AdvisorAvoid,Glass Health,Runtime,144.791,3.679 +Glass Health 4.0,Solo Models,AdvisorFree,Glass Health,Runtime,142.283,2.191 +Glass Health 4.0,Solo Models,AdvisorMax,Glass Health,Runtime,111.44,1.363 +Grok 4,Solo Models,Advisor,xAI,Runtime,158.563,4.002 +Grok 4,Solo Models,AdvisorAvoid,xAI,Runtime,156.692,5.677 +Grok 4,Solo Models,AdvisorMax,xAI,Runtime,159.499,5.29 +Grok 4 Fast,Solo Models,Advisor,xAI,Runtime,20.805,0.393 +Grok 4 Fast,Solo Models,AdvisorAvoid,xAI,Runtime,17.048,0.488 +Grok 4 Fast,Solo Models,AdvisorMax,xAI,Runtime,22.683,0.498 +Human,Solo Models,Human,Human,Runtime,0,0 +Kimi K2,Solo Models,Advisor,Moonshot AI,Runtime,147.306,2.543 +Kimi K2,Solo Models,AdvisorAvoid,Moonshot AI,Runtime,128.512,3.487 +Kimi K2,Solo Models,AdvisorMax,Moonshot AI,Runtime,156.703,3.24 +LiSA 1.0,Solo Models,Advisor,AMBOSS,Runtime,58.128,0.394 +LiSA 1.0,Solo Models,AdvisorAvoid,AMBOSS,Runtime,55.039,0.377 +LiSA 1.0,Solo Models,AdvisorMax,AMBOSS,Runtime,63.897,1.312 +Llama 3.3 70b,Solo Models,Advisor,Meta,Runtime,31.94,0.644 +Llama 3.3 70b,Solo Models,AdvisorAvoid,Meta,Runtime,31.94,0.644 +Llama 4 Maverick,Solo Models,Advisor,Meta,Runtime,17.533,0.335 +Llama 4 Maverick,Solo Models,AdvisorAvoid,Meta,Runtime,14.913,0.314 +Llama 4 Maverick,Solo Models,AdvisorMax,Meta,Runtime,20.154,0.547 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Runtime,69.655,2.862 +Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,Meta + Anthropic,Runtime,61.298,1.132 +Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Runtime,97.769,4.388 +Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,31.525,0.513 +Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,120.446,4.567 +Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,76.238,2.17 +Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,29.483,0.511 +Llama 4 Maverick + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Anthropic,Runtime,85.018,3.499 +Llama 4 Maverick + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + Anthropic,Runtime,81.778,2.83 +Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,101.502,1.323 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Runtime,176.624,3.47 +Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + OpenAI,Runtime,172.562,3.678 +Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Runtime,62.733,0.768 +Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runtime,33.377,0.595 +Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,49.94,0.665 +Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runtime,39,2.151 +Llama 4 Maverick + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Runtime,120.878,2.527 +Llama 4 Maverick + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Meta + Meta + Google,Runtime,119.109,3.049 +Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,53.995,0.908 +Llama 4 Scout,Solo Models,Advisor,Meta,Runtime,17.633,0.131 +Llama 4 Scout,Solo Models,AdvisorAvoid,Meta,Runtime,17.741,0.208 +Llama 4 Scout,Solo Models,AdvisorMax,Meta,Runtime,17.526,0.16 +Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Runtime,76.893,3.349 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Runtime,96.518,3.97 +Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Steward,Meta + DeepSeek,Runtime,90.653,4.006 +Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,35.173,0.543 +Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,122.553,4.887 +Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,81.408,2.147 +Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,33.294,0.497 +Llama 4 Scout + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Anthropic,Runtime,90.476,3.792 +Llama 4 Scout + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + Anthropic,Runtime,84.193,2.379 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,108.216,1.012 +Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Steward,Meta + Google,Runtime,101.444,0.993 +Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Runtime,146.408,1.491 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Runtime,177.734,1.731 +Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + OpenAI,Runtime,181.132,1.89 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Runtime,175.689,1.455 +Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + Google,Runtime,184.133,1.888 +Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Runtime,164.296,1.36 +Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Runtime,126.382,1.44 +Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runtime,35.641,0.403 +Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Runtime,107.386,2.41 +Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Runtime,115.532,1.345 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,52.436,0.667 +Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,53.474,0.718 +Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runtime,39.14,0.708 +Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Runtime,92.694,2.018 +Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Runtime,112.43,2.46 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Runtime,126.394,1.644 +Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Meta + Meta + Google,Runtime,122.73,1.845 +Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,58.148,1.092 +Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,58.255,1.1 +Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,59.535,0.943 +MedGemma 27B,Solo Models,Advisor,Google,Runtime,122.092,1.053 +MedGemma 27B,Solo Models,AdvisorAvoid,Google,Runtime,120.249,1.402 +MedGemma 27B,Solo Models,AdvisorMax,Google,Runtime,124.03,1.562 +Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Runtime,41.207,0.417 +Mistral Large 2.1,Solo Models,AdvisorAvoid,Mistral AI,Runtime,39.51,0.551 +Mistral Large 2.1,Solo Models,AdvisorMax,Mistral AI,Runtime,42.055,0.554 +Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Runtime,49.17,0.826 +Mistral Medium 3.1,Solo Models,AdvisorAvoid,Mistral AI,Runtime,41.982,1.367 +Mistral Medium 3.1,Solo Models,AdvisorMax,Mistral AI,Runtime,52.764,0.959 +No Intervention,Solo Models,Control,Control,Runtime,0,0 +Qwen3 235B,Solo Models,Advisor,Alibaba,Runtime,168.028,7.335 +Qwen3 235B,Solo Models,AdvisorMax,Alibaba,Runtime,168.028,7.335 +Qwen3 32B,Solo Models,Advisor,Alibaba,Runtime,42.555,1.491 +Qwen3 32B,Solo Models,AdvisorAvoid,Alibaba,Runtime,43.15,3.175 +Qwen3 32B,Solo Models,AdvisorMax,Alibaba,Runtime,42.376,1.689 +Random Intervention,Solo Models,Control,Control,Runtime,0,0 +o1,Solo Models,Advisor,OpenAI,Runtime,79.093,1.654 +o1,Solo Models,AdvisorAvoid,OpenAI,Runtime,79.093,1.654 +o1 mini,Solo Models,Advisor,OpenAI,Runtime,23.134,0.41 +o1 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,23.134,0.41 +o3 mini,Solo Models,Advisor,OpenAI,Runtime,45.99,0.457 +o3 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,43.856,0.561 +o3 mini,Solo Models,AdvisorFree,OpenAI,Runtime,45.808,0.849 +o3 mini,Solo Models,AdvisorMax,OpenAI,Runtime,48.124,0.696 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,108.617,3.605 +o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,OpenAI + Anthropic,Runtime,100.489,2.646 +o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,133.718,4.955 +o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,61.817,1.056 +o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,148.376,4.564 +o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,107.025,2.692 +o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,57.297,0.948 +o3 mini + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + Anthropic,Runtime,112.898,3.829 +o3 mini + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + Anthropic,Runtime,108.726,3.014 +o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,125.805,1.685 +o3 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,206.177,3.628 +o3 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,200.297,4.03 +o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,61.281,0.995 +o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,69.133,1.497 +o3 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,144.017,2.621 +o3 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,141.748,2.594 +o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,73.491,1.171 +o4 mini,Solo Models,Advisor,OpenAI,Runtime,37.114,0.654 +o4 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,37.114,0.654 diff --git a/frontend/public/data/combination-index.json b/frontend/public/data/combination-index.json index e388ccd..3a550e8 100644 --- a/frontend/public/data/combination-index.json +++ b/frontend/public/data/combination-index.json @@ -1,6 +1,6 @@ [ { - "combinationId": "AMBOSS LiSA 1.0::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "AMBOSS LiSA 1.0::Solo Models::Advisor::::::::", "displayLabel": "AMBOSS LiSA 1.0", "model": "AMBOSS LiSA 1.0", "team": "Solo Models", @@ -8,7 +8,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet::Solo Models::Advisor::::::::", "displayLabel": "Claude 3.7 Sonnet", "model": "Claude 3.7 Sonnet", "team": "Solo Models", @@ -16,7 +16,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + Claude 3.7 Sonnet", "model": "Claude 3.7 Sonnet + Claude 3.7 Sonnet", "team": "2-Agent Teams", @@ -24,7 +24,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + DeepSeek R1", "model": "Claude 3.7 Sonnet + DeepSeek R1", "team": "2-Agent Teams", @@ -32,7 +32,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + Gemini 2.0 Flash", "model": "Claude 3.7 Sonnet + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -40,7 +40,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + Gemini 2.5 Pro", "model": "Claude 3.7 Sonnet + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -48,7 +48,15 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Claude 3.7 Sonnet + Gemini 2.5 Pro + GPT-5", + "model": "Claude 3.7 Sonnet + Gemini 2.5 Pro + GPT-5", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Claude 3.7 Sonnet + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + GPT-4.1", "model": "Claude 3.7 Sonnet + GPT-4.1", "team": "2-Agent Teams", @@ -56,7 +64,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + GPT-5", "model": "Claude 3.7 Sonnet + GPT-5", "team": "2-Agent Teams", @@ -64,7 +72,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + GPT-5 mini", "model": "Claude 3.7 Sonnet + GPT-5 mini", "team": "2-Agent Teams", @@ -72,7 +80,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + Llama 4 Maverick", "model": "Claude 3.7 Sonnet + Llama 4 Maverick", "team": "2-Agent Teams", @@ -80,7 +88,7 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + Llama 4 Scout", "model": "Claude 3.7 Sonnet + Llama 4 Scout", "team": "2-Agent Teams", @@ -88,7 +96,15 @@ "harm": "" }, { - "combinationId": "Claude 3.7 Sonnet + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude 3.7 Sonnet + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Claude 3.7 Sonnet + Llama 4 Scout + Gemini 2.5 Pro", + "model": "Claude 3.7 Sonnet + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Claude 3.7 Sonnet + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude 3.7 Sonnet + o3 mini", "model": "Claude 3.7 Sonnet + o3 mini", "team": "2-Agent Teams", @@ -96,7 +112,7 @@ "harm": "" }, { - "combinationId": "Claude Haiku 4.5::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude Haiku 4.5::Solo Models::Advisor::::::::", "displayLabel": "Claude Haiku 4.5", "model": "Claude Haiku 4.5", "team": "Solo Models", @@ -104,7 +120,7 @@ "harm": "" }, { - "combinationId": "Claude Sonnet 4.5::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude Sonnet 4.5::Solo Models::Advisor::::::::", "displayLabel": "Claude Sonnet 4.5", "model": "Claude Sonnet 4.5", "team": "Solo Models", @@ -112,7 +128,7 @@ "harm": "" }, { - "combinationId": "Claude Sonnet 4.5 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude Sonnet 4.5 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude Sonnet 4.5 + Gemini 2.5 Pro", "model": "Claude Sonnet 4.5 + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -120,7 +136,7 @@ "harm": "" }, { - "combinationId": "Claude Sonnet 4.5 + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Claude Sonnet 4.5 + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Claude Sonnet 4.5 + LiSA 1.0", "model": "Claude Sonnet 4.5 + LiSA 1.0", "team": "2-Agent Teams", @@ -128,7 +144,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1::Solo Models::Advisor::::::::", "displayLabel": "DeepSeek R1", "model": "DeepSeek R1", "team": "Solo Models", @@ -136,7 +152,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + Claude 3.7 Sonnet", "model": "DeepSeek R1 + Claude 3.7 Sonnet", "team": "2-Agent Teams", @@ -144,7 +160,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + DeepSeek R1", "model": "DeepSeek R1 + DeepSeek R1", "team": "2-Agent Teams", @@ -152,7 +168,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + DeepSeek R1 + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + DeepSeek R1 + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + DeepSeek R1 + DeepSeek R1", "model": "DeepSeek R1 + DeepSeek R1 + DeepSeek R1", "team": "3-Agent Teams", @@ -160,7 +176,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + DeepSeek R1 + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + DeepSeek R1 + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + DeepSeek R1 + GPT-5", "model": "DeepSeek R1 + DeepSeek R1 + GPT-5", "team": "3-Agent Teams", @@ -168,7 +184,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.0 Flash", "model": "DeepSeek R1 + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -176,7 +192,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet", "model": "DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet", "team": "3-Agent Teams", @@ -184,7 +200,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro", "model": "DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -192,7 +208,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0", "model": "DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0", "team": "3-Agent Teams", @@ -200,7 +216,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.5 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.5 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.5 Flash", "model": "DeepSeek R1 + Gemini 2.5 Flash", "team": "2-Agent Teams", @@ -208,7 +224,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro", "model": "DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -216,7 +232,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.5 Flash + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.5 Flash + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.5 Flash + GPT-5", "model": "DeepSeek R1 + Gemini 2.5 Flash + GPT-5", "team": "3-Agent Teams", @@ -224,7 +240,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.5 Pro", "model": "DeepSeek R1 + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -232,7 +248,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "DeepSeek R1 + Gemini 2.5 Pro + GPT-5", "model": "DeepSeek R1 + Gemini 2.5 Pro + GPT-5", "team": "3-Agent Teams", @@ -240,7 +256,23 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "DeepSeek R1 + GPT-4.1", + "model": "DeepSeek R1 + GPT-4.1", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "DeepSeek R1 + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "DeepSeek R1 + GPT-5", + "model": "DeepSeek R1 + GPT-5", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "DeepSeek R1 + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + GPT-5 mini", "model": "DeepSeek R1 + GPT-5 mini", "team": "2-Agent Teams", @@ -248,7 +280,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + LiSA 1.0", "model": "DeepSeek R1 + LiSA 1.0", "team": "2-Agent Teams", @@ -256,7 +288,7 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + Llama 4 Maverick", "model": "DeepSeek R1 + Llama 4 Maverick", "team": "2-Agent Teams", @@ -264,7 +296,23 @@ "harm": "" }, { - "combinationId": "DeepSeek R1 + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek R1 + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "DeepSeek R1 + Llama 4 Scout", + "model": "DeepSeek R1 + Llama 4 Scout", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro", + "model": "DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "DeepSeek R1 + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "DeepSeek R1 + o3 mini", "model": "DeepSeek R1 + o3 mini", "team": "2-Agent Teams", @@ -272,7 +320,7 @@ "harm": "" }, { - "combinationId": "DeepSeek V3.1::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "DeepSeek V3.1::Solo Models::Advisor::::::::", "displayLabel": "DeepSeek V3.1", "model": "DeepSeek V3.1", "team": "Solo Models", @@ -280,7 +328,15 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Expert AI::Solo Models::Advisor::::::::", + "displayLabel": "Expert AI", + "model": "Expert AI", + "team": "Solo Models", + "condition": "Advisor", + "harm": "" + }, + { + "combinationId": "Gemini 2.0 Flash::Solo Models::Advisor::::::::", "displayLabel": "Gemini 2.0 Flash", "model": "Gemini 2.0 Flash", "team": "Solo Models", @@ -288,7 +344,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Claude 3.7 Sonnet", "model": "Gemini 2.0 Flash + Claude 3.7 Sonnet", "team": "2-Agent Teams", @@ -296,7 +352,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro", "model": "Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -304,7 +360,15 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Claude 3.7 Sonnet + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Gemini 2.0 Flash + Claude 3.7 Sonnet + GPT-5", + "model": "Gemini 2.0 Flash + Claude 3.7 Sonnet + GPT-5", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0", "model": "Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0", "team": "3-Agent Teams", @@ -312,7 +376,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + DeepSeek R1", "model": "Gemini 2.0 Flash + DeepSeek R1", "team": "2-Agent Teams", @@ -320,7 +384,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Gemini 2.0 Flash", "model": "Gemini 2.0 Flash + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -328,7 +392,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Gemini 2.5 Pro", "model": "Gemini 2.0 Flash + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -336,7 +400,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5", "model": "Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5", "team": "3-Agent Teams", @@ -344,7 +408,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + GPT-4.1", "model": "Gemini 2.0 Flash + GPT-4.1", "team": "2-Agent Teams", @@ -352,7 +416,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + GPT-5", "model": "Gemini 2.0 Flash + GPT-5", "team": "2-Agent Teams", @@ -360,7 +424,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + GPT-5 mini", "model": "Gemini 2.0 Flash + GPT-5 mini", "team": "2-Agent Teams", @@ -368,7 +432,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Llama 4 Maverick", "model": "Gemini 2.0 Flash + Llama 4 Maverick", "team": "2-Agent Teams", @@ -376,7 +440,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + Llama 4 Scout", "model": "Gemini 2.0 Flash + Llama 4 Scout", "team": "2-Agent Teams", @@ -384,7 +448,15 @@ "harm": "" }, { - "combinationId": "Gemini 2.0 Flash + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.0 Flash + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Gemini 2.0 Flash + Llama 4 Scout + Gemini 2.5 Pro", + "model": "Gemini 2.0 Flash + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.0 Flash + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.0 Flash + o3 mini", "model": "Gemini 2.0 Flash + o3 mini", "team": "2-Agent Teams", @@ -392,7 +464,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash::Solo Models::Advisor::::::::", "displayLabel": "Gemini 2.5 Flash", "model": "Gemini 2.5 Flash", "team": "Solo Models", @@ -400,7 +472,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + DeepSeek R1", "model": "Gemini 2.5 Flash + DeepSeek R1", "team": "2-Agent Teams", @@ -408,7 +480,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Gemini 2.5 Flash", "model": "Gemini 2.5 Flash + Gemini 2.5 Flash", "team": "2-Agent Teams", @@ -416,7 +488,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash", "model": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash", "team": "3-Agent Teams", @@ -424,7 +496,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro", "model": "Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -432,7 +504,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Gemini 2.5 Pro", "model": "Gemini 2.5 Flash + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -440,7 +512,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1", "model": "Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1", "team": "3-Agent Teams", @@ -448,7 +520,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro", "model": "Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -456,7 +528,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5", "model": "Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5", "team": "3-Agent Teams", @@ -464,7 +536,15 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Glass Health 4.0::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "Gemini 2.5 Flash + Glass Health 4.0", + "model": "Gemini 2.5 Flash + Glass Health 4.0", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.5 Flash + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + GPT-5 mini", "model": "Gemini 2.5 Flash + GPT-5 mini", "team": "2-Agent Teams", @@ -472,7 +552,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + LiSA 1.0", "model": "Gemini 2.5 Flash + LiSA 1.0", "team": "2-Agent Teams", @@ -480,7 +560,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Llama 4 Maverick", "model": "Gemini 2.5 Flash + Llama 4 Maverick", "team": "2-Agent Teams", @@ -488,7 +568,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1", "model": "Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1", "team": "3-Agent Teams", @@ -496,7 +576,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro", "model": "Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -504,7 +584,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Flash + Llama 4 Maverick + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Flash + Llama 4 Maverick + GPT-5", "model": "Gemini 2.5 Flash + Llama 4 Maverick + GPT-5", "team": "3-Agent Teams", @@ -512,7 +592,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro::Solo Models::Advisor::::::::", "displayLabel": "Gemini 2.5 Pro", "model": "Gemini 2.5 Pro", "team": "Solo Models", @@ -520,7 +600,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + DeepSeek R1", "model": "Gemini 2.5 Pro + DeepSeek R1", "team": "2-Agent Teams", @@ -528,7 +608,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + Gemini 2.0 Flash", "model": "Gemini 2.5 Pro + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -536,7 +616,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + Gemini 2.5 Pro", "model": "Gemini 2.5 Pro + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -544,7 +624,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro", "model": "Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -552,7 +632,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5", "model": "Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5", "team": "3-Agent Teams", @@ -560,7 +640,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + GPT-4.1", "model": "Gemini 2.5 Pro + GPT-4.1", "team": "2-Agent Teams", @@ -568,7 +648,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + GPT-5", "model": "Gemini 2.5 Pro + GPT-5", "team": "2-Agent Teams", @@ -576,7 +656,31 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + GPT-5 + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Gemini 2.5 Pro + GPT-5 + Claude 3.7 Sonnet", + "model": "Gemini 2.5 Pro + GPT-5 + Claude 3.7 Sonnet", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.5 Pro + GPT-5 + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Gemini 2.5 Pro + GPT-5 + Gemini 2.5 Pro", + "model": "Gemini 2.5 Pro + GPT-5 + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.5 Pro + GPT-5 + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Gemini 2.5 Pro + GPT-5 + GPT-5", + "model": "Gemini 2.5 Pro + GPT-5 + GPT-5", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.5 Pro + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + GPT-5 mini", "model": "Gemini 2.5 Pro + GPT-5 mini", "team": "2-Agent Teams", @@ -584,7 +688,15 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + GPT-5 mini + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + GPT-5 mini + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Gemini 2.5 Pro + GPT-5 mini + Gemini 2.5 Pro", + "model": "Gemini 2.5 Pro + GPT-5 mini + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.5 Pro + GPT-5 mini + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + GPT-5 mini + GPT-5", "model": "Gemini 2.5 Pro + GPT-5 mini + GPT-5", "team": "3-Agent Teams", @@ -592,7 +704,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0", "model": "Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0", "team": "3-Agent Teams", @@ -600,7 +712,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + LiSA 1.0", "model": "Gemini 2.5 Pro + LiSA 1.0", "team": "2-Agent Teams", @@ -608,7 +720,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + Llama 4 Maverick", "model": "Gemini 2.5 Pro + Llama 4 Maverick", "team": "2-Agent Teams", @@ -616,7 +728,7 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + Llama 4 Scout", "model": "Gemini 2.5 Pro + Llama 4 Scout", "team": "2-Agent Teams", @@ -624,7 +736,15 @@ "harm": "" }, { - "combinationId": "Gemini 2.5 Pro + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 2.5 Pro + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Gemini 2.5 Pro + Llama 4 Scout + Gemini 2.5 Pro", + "model": "Gemini 2.5 Pro + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Gemini 2.5 Pro + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Gemini 2.5 Pro + o3 mini", "model": "Gemini 2.5 Pro + o3 mini", "team": "2-Agent Teams", @@ -632,7 +752,7 @@ "harm": "" }, { - "combinationId": "Gemini 3 Pro::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Gemini 3 Pro::Solo Models::Advisor::::::::", "displayLabel": "Gemini 3 Pro", "model": "Gemini 3 Pro", "team": "Solo Models", @@ -640,7 +760,7 @@ "harm": "" }, { - "combinationId": "Glass Health 4.0::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Glass Health 4.0::Solo Models::Advisor::::::::", "displayLabel": "Glass Health 4.0", "model": "Glass Health 4.0", "team": "Solo Models", @@ -648,7 +768,7 @@ "harm": "" }, { - "combinationId": "GPT-4.1::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1::Solo Models::Advisor::::::::", "displayLabel": "GPT-4.1", "model": "GPT-4.1", "team": "Solo Models", @@ -656,7 +776,7 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + Claude 3.7 Sonnet", "model": "GPT-4.1 + Claude 3.7 Sonnet", "team": "2-Agent Teams", @@ -664,7 +784,15 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "GPT-4.1 + DeepSeek R1", + "model": "GPT-4.1 + DeepSeek R1", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-4.1 + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + Gemini 2.0 Flash", "model": "GPT-4.1 + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -672,7 +800,15 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + Gemini 2.0 Flash + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-4.1 + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "model": "GPT-4.1 + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-4.1 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + Gemini 2.5 Pro", "model": "GPT-4.1 + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -680,7 +816,15 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-4.1 + Gemini 2.5 Pro + GPT-5", + "model": "GPT-4.1 + Gemini 2.5 Pro + GPT-5", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-4.1 + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + GPT-4.1", "model": "GPT-4.1 + GPT-4.1", "team": "2-Agent Teams", @@ -688,7 +832,7 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + GPT-5", "model": "GPT-4.1 + GPT-5", "team": "2-Agent Teams", @@ -696,7 +840,7 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + GPT-5 mini", "model": "GPT-4.1 + GPT-5 mini", "team": "2-Agent Teams", @@ -704,7 +848,7 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + Llama 4 Maverick", "model": "GPT-4.1 + Llama 4 Maverick", "team": "2-Agent Teams", @@ -712,7 +856,7 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + Llama 4 Scout", "model": "GPT-4.1 + Llama 4 Scout", "team": "2-Agent Teams", @@ -720,7 +864,15 @@ "harm": "" }, { - "combinationId": "GPT-4.1 + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-4.1 + Llama 4 Scout + Gemini 2.5 Pro", + "model": "GPT-4.1 + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-4.1 + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-4.1 + o3 mini", "model": "GPT-4.1 + o3 mini", "team": "2-Agent Teams", @@ -728,7 +880,7 @@ "harm": "" }, { - "combinationId": "GPT-4.1 mini::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4.1 mini::Solo Models::Advisor::::::::", "displayLabel": "GPT-4.1 mini", "model": "GPT-4.1 mini", "team": "Solo Models", @@ -736,7 +888,7 @@ "harm": "" }, { - "combinationId": "GPT-4o::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4o::Solo Models::Advisor::::::::", "displayLabel": "GPT-4o", "model": "GPT-4o", "team": "Solo Models", @@ -744,7 +896,7 @@ "harm": "" }, { - "combinationId": "GPT-4o mini::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-4o mini::Solo Models::Advisor::::::::", "displayLabel": "GPT-4o mini", "model": "GPT-4o mini", "team": "Solo Models", @@ -752,7 +904,7 @@ "harm": "" }, { - "combinationId": "GPT-5::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5::Solo Models::Advisor::::::::", "displayLabel": "GPT-5", "model": "GPT-5", "team": "Solo Models", @@ -760,7 +912,15 @@ "harm": "" }, { - "combinationId": "GPT-5 + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "GPT-5 + Claude 3.7 Sonnet", + "model": "GPT-5 + Claude 3.7 Sonnet", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + DeepSeek R1", "model": "GPT-5 + DeepSeek R1", "team": "2-Agent Teams", @@ -768,7 +928,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + Gemini 2.0 Flash", "model": "GPT-5 + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -776,7 +936,15 @@ "harm": "" }, { - "combinationId": "GPT-5 + Gemini 2.5 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Gemini 2.0 Flash + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-5 + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "model": "GPT-5 + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 + Gemini 2.5 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + Gemini 2.5 Flash", "model": "GPT-5 + Gemini 2.5 Flash", "team": "2-Agent Teams", @@ -784,7 +952,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + Gemini 2.5 Pro", "model": "GPT-5 + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -792,7 +960,15 @@ "harm": "" }, { - "combinationId": "GPT-5 + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-5 + Gemini 2.5 Pro + GPT-5", + "model": "GPT-5 + Gemini 2.5 Pro + GPT-5", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + GPT-4.1", "model": "GPT-5 + GPT-4.1", "team": "2-Agent Teams", @@ -800,7 +976,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + GPT-5", "model": "GPT-5 + GPT-5", "team": "2-Agent Teams", @@ -808,7 +984,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + GPT-5 + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + GPT-5 + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "GPT-5 + GPT-5 + GPT-5", "model": "GPT-5 + GPT-5 + GPT-5", "team": "3-Agent Teams", @@ -816,7 +992,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + GPT-5 mini", "model": "GPT-5 + GPT-5 mini", "team": "2-Agent Teams", @@ -824,7 +1000,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + LiSA 1.0", "model": "GPT-5 + LiSA 1.0", "team": "2-Agent Teams", @@ -832,7 +1008,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + Llama 4 Maverick", "model": "GPT-5 + Llama 4 Maverick", "team": "2-Agent Teams", @@ -840,7 +1016,7 @@ "harm": "" }, { - "combinationId": "GPT-5 + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + Llama 4 Scout", "model": "GPT-5 + Llama 4 Scout", "team": "2-Agent Teams", @@ -848,7 +1024,15 @@ "harm": "" }, { - "combinationId": "GPT-5 + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-5 + Llama 4 Scout + Gemini 2.5 Pro", + "model": "GPT-5 + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 + o3 mini", "model": "GPT-5 + o3 mini", "team": "2-Agent Teams", @@ -856,7 +1040,7 @@ "harm": "" }, { - "combinationId": "GPT-5 mini::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini::Solo Models::Advisor::::::::", "displayLabel": "GPT-5 mini", "model": "GPT-5 mini", "team": "Solo Models", @@ -864,7 +1048,23 @@ "harm": "" }, { - "combinationId": "GPT-5 mini + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "GPT-5 mini + Claude 3.7 Sonnet", + "model": "GPT-5 mini + Claude 3.7 Sonnet", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 mini + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "GPT-5 mini + DeepSeek R1", + "model": "GPT-5 mini + DeepSeek R1", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 mini + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 mini + Gemini 2.0 Flash", "model": "GPT-5 mini + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -872,7 +1072,7 @@ "harm": "" }, { - "combinationId": "GPT-5 mini + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 mini + Gemini 2.5 Pro", "model": "GPT-5 mini + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -880,7 +1080,15 @@ "harm": "" }, { - "combinationId": "GPT-5 mini + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-5 mini + Gemini 2.5 Pro + GPT-5", + "model": "GPT-5 mini + Gemini 2.5 Pro + GPT-5", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 mini + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 mini + GPT-4.1", "model": "GPT-5 mini + GPT-4.1", "team": "2-Agent Teams", @@ -888,7 +1096,7 @@ "harm": "" }, { - "combinationId": "GPT-5 mini + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 mini + GPT-5", "model": "GPT-5 mini + GPT-5", "team": "2-Agent Teams", @@ -896,7 +1104,7 @@ "harm": "" }, { - "combinationId": "GPT-5 mini + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 mini + GPT-5 mini", "model": "GPT-5 mini + GPT-5 mini", "team": "2-Agent Teams", @@ -904,7 +1112,15 @@ "harm": "" }, { - "combinationId": "GPT-5 mini + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", + "displayLabel": "GPT-5 mini + Llama 4 Maverick", + "model": "GPT-5 mini + Llama 4 Maverick", + "team": "2-Agent Teams", + "condition": "Advisor + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 mini + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 mini + Llama 4 Scout", "model": "GPT-5 mini + Llama 4 Scout", "team": "2-Agent Teams", @@ -912,7 +1128,15 @@ "harm": "" }, { - "combinationId": "GPT-5 mini + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 mini + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "GPT-5 mini + Llama 4 Scout + Gemini 2.5 Pro", + "model": "GPT-5 mini + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "GPT-5 mini + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "GPT-5 mini + o3 mini", "model": "GPT-5 mini + o3 mini", "team": "2-Agent Teams", @@ -920,7 +1144,7 @@ "harm": "" }, { - "combinationId": "GPT-5 nano::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "GPT-5 nano::Solo Models::Advisor::::::::", "displayLabel": "GPT-5 nano", "model": "GPT-5 nano", "team": "Solo Models", @@ -928,7 +1152,7 @@ "harm": "" }, { - "combinationId": "Grok 4::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Grok 4::Solo Models::Advisor::::::::", "displayLabel": "Grok 4", "model": "Grok 4", "team": "Solo Models", @@ -936,7 +1160,7 @@ "harm": "" }, { - "combinationId": "Grok 4 Fast::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Grok 4 Fast::Solo Models::Advisor::::::::", "displayLabel": "Grok 4 Fast", "model": "Grok 4 Fast", "team": "Solo Models", @@ -944,7 +1168,15 @@ "harm": "" }, { - "combinationId": "Human Generalist Physicians::Solo Models::Human::::AllHarm::AllCases::Unanimous", + "combinationId": "Human::Solo Models::Human::::::::", + "displayLabel": "Human", + "model": "Human", + "team": "Solo Models", + "condition": "Human", + "harm": "" + }, + { + "combinationId": "Human Generalist Physicians::Solo Models::Human::::::::", "displayLabel": "Human Generalist Physicians", "model": "Human Generalist Physicians", "team": "Solo Models", @@ -952,7 +1184,7 @@ "harm": "" }, { - "combinationId": "Kimi K2::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Kimi K2::Solo Models::Advisor::::::::", "displayLabel": "Kimi K2", "model": "Kimi K2", "team": "Solo Models", @@ -960,7 +1192,15 @@ "harm": "" }, { - "combinationId": "Llama 3.3 70b::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "LiSA 1.0::Solo Models::Advisor::::::::", + "displayLabel": "LiSA 1.0", + "model": "LiSA 1.0", + "team": "Solo Models", + "condition": "Advisor", + "harm": "" + }, + { + "combinationId": "Llama 3.3 70b::Solo Models::Advisor::::::::", "displayLabel": "Llama 3.3 70b", "model": "Llama 3.3 70b", "team": "Solo Models", @@ -968,7 +1208,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick::Solo Models::Advisor::::::::", "displayLabel": "Llama 4 Maverick", "model": "Llama 4 Maverick", "team": "Solo Models", @@ -976,7 +1216,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + Claude 3.7 Sonnet", "model": "Llama 4 Maverick + Claude 3.7 Sonnet", "team": "2-Agent Teams", @@ -984,7 +1224,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + DeepSeek R1", "model": "Llama 4 Maverick + DeepSeek R1", "team": "2-Agent Teams", @@ -992,7 +1232,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + Gemini 2.0 Flash", "model": "Llama 4 Maverick + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -1000,7 +1240,15 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Gemini 2.0 Flash + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Llama 4 Maverick + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "model": "Llama 4 Maverick + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Llama 4 Maverick + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + Gemini 2.5 Pro", "model": "Llama 4 Maverick + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -1008,7 +1256,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Maverick + Gemini 2.5 Pro + GPT-5", "model": "Llama 4 Maverick + Gemini 2.5 Pro + GPT-5", "team": "3-Agent Teams", @@ -1016,7 +1264,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + GPT-4.1", "model": "Llama 4 Maverick + GPT-4.1", "team": "2-Agent Teams", @@ -1024,7 +1272,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + GPT-5", "model": "Llama 4 Maverick + GPT-5", "team": "2-Agent Teams", @@ -1032,7 +1280,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + GPT-5 mini", "model": "Llama 4 Maverick + GPT-5 mini", "team": "2-Agent Teams", @@ -1040,7 +1288,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + LiSA 1.0::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + LiSA 1.0", "model": "Llama 4 Maverick + LiSA 1.0", "team": "2-Agent Teams", @@ -1048,7 +1296,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + Llama 4 Maverick", "model": "Llama 4 Maverick + Llama 4 Maverick", "team": "2-Agent Teams", @@ -1056,7 +1304,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick", "model": "Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick", "team": "3-Agent Teams", @@ -1064,7 +1312,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + Llama 4 Scout", "model": "Llama 4 Maverick + Llama 4 Scout", "team": "2-Agent Teams", @@ -1072,7 +1320,15 @@ "harm": "" }, { - "combinationId": "Llama 4 Maverick + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Maverick + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Llama 4 Maverick + Llama 4 Scout + Gemini 2.5 Pro", + "model": "Llama 4 Maverick + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Llama 4 Maverick + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Maverick + o3 mini", "model": "Llama 4 Maverick + o3 mini", "team": "2-Agent Teams", @@ -1080,7 +1336,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout::Solo Models::Advisor::::::::", "displayLabel": "Llama 4 Scout", "model": "Llama 4 Scout", "team": "Solo Models", @@ -1088,7 +1344,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + Claude 3.7 Sonnet", "model": "Llama 4 Scout + Claude 3.7 Sonnet", "team": "2-Agent Teams", @@ -1096,7 +1352,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + DeepSeek R1", "model": "Llama 4 Scout + DeepSeek R1", "team": "2-Agent Teams", @@ -1104,7 +1360,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + Gemini 2.0 Flash", "model": "Llama 4 Scout + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -1112,7 +1368,15 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Gemini 2.0 Flash + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "Llama 4 Scout + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "model": "Llama 4 Scout + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "Llama 4 Scout + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + Gemini 2.5 Pro", "model": "Llama 4 Scout + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -1120,7 +1384,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1", "model": "Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1", "team": "3-Agent Teams", @@ -1128,7 +1392,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro", "model": "Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -1136,7 +1400,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Gemini 2.5 Pro + GPT-5", "model": "Llama 4 Scout + Gemini 2.5 Pro + GPT-5", "team": "3-Agent Teams", @@ -1144,7 +1408,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0", "model": "Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0", "team": "3-Agent Teams", @@ -1152,7 +1416,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick", "model": "Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick", "team": "3-Agent Teams", @@ -1160,7 +1424,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + GPT-4.1", "model": "Llama 4 Scout + GPT-4.1", "team": "2-Agent Teams", @@ -1168,7 +1432,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + GPT-5", "model": "Llama 4 Scout + GPT-5", "team": "2-Agent Teams", @@ -1176,7 +1440,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + GPT-5 mini", "model": "Llama 4 Scout + GPT-5 mini", "team": "2-Agent Teams", @@ -1184,7 +1448,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Maverick", "model": "Llama 4 Scout + Llama 4 Maverick", "team": "2-Agent Teams", @@ -1192,7 +1456,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro", "model": "Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -1200,7 +1464,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Maverick + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Maverick + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Maverick + GPT-5", "model": "Llama 4 Scout + Llama 4 Maverick + GPT-5", "team": "3-Agent Teams", @@ -1208,7 +1472,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick", "model": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick", "team": "3-Agent Teams", @@ -1216,7 +1480,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout", "model": "Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout", "team": "3-Agent Teams", @@ -1224,7 +1488,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Scout", "model": "Llama 4 Scout + Llama 4 Scout", "team": "2-Agent Teams", @@ -1232,7 +1496,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Scout + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Scout + DeepSeek R1::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Scout + DeepSeek R1", "model": "Llama 4 Scout + Llama 4 Scout + DeepSeek R1", "team": "3-Agent Teams", @@ -1240,7 +1504,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro", "model": "Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro", "team": "3-Agent Teams", @@ -1248,7 +1512,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Scout + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Scout + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Scout + GPT-5", "model": "Llama 4 Scout + Llama 4 Scout + GPT-5", "team": "3-Agent Teams", @@ -1256,7 +1520,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick", "model": "Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick", "team": "3-Agent Teams", @@ -1264,7 +1528,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + Llama 4 Scout + Llama 4 Scout::3-Agent Teams::Advisor + Guardian + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + Llama 4 Scout + Llama 4 Scout::3-Agent Teams::Advisor + Guardian + Guardian::::::::", "displayLabel": "Llama 4 Scout + Llama 4 Scout + Llama 4 Scout", "model": "Llama 4 Scout + Llama 4 Scout + Llama 4 Scout", "team": "3-Agent Teams", @@ -1272,7 +1536,7 @@ "harm": "" }, { - "combinationId": "Llama 4 Scout + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "Llama 4 Scout + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "Llama 4 Scout + o3 mini", "model": "Llama 4 Scout + o3 mini", "team": "2-Agent Teams", @@ -1280,7 +1544,15 @@ "harm": "" }, { - "combinationId": "Mistral Large 2.1::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "MedGemma 27B::Solo Models::Advisor::::::::", + "displayLabel": "MedGemma 27B", + "model": "MedGemma 27B", + "team": "Solo Models", + "condition": "Advisor", + "harm": "" + }, + { + "combinationId": "Mistral Large 2.1::Solo Models::Advisor::::::::", "displayLabel": "Mistral Large 2.1", "model": "Mistral Large 2.1", "team": "Solo Models", @@ -1288,7 +1560,7 @@ "harm": "" }, { - "combinationId": "Mistral Medium 3.1::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Mistral Medium 3.1::Solo Models::Advisor::::::::", "displayLabel": "Mistral Medium 3.1", "model": "Mistral Medium 3.1", "team": "Solo Models", @@ -1296,7 +1568,7 @@ "harm": "" }, { - "combinationId": "o1::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "o1::Solo Models::Advisor::::::::", "displayLabel": "o1", "model": "o1", "team": "Solo Models", @@ -1304,7 +1576,7 @@ "harm": "" }, { - "combinationId": "o1 mini::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "o1 mini::Solo Models::Advisor::::::::", "displayLabel": "o1 mini", "model": "o1 mini", "team": "Solo Models", @@ -1312,7 +1584,7 @@ "harm": "" }, { - "combinationId": "o3 mini::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini::Solo Models::Advisor::::::::", "displayLabel": "o3 mini", "model": "o3 mini", "team": "Solo Models", @@ -1320,7 +1592,7 @@ "harm": "" }, { - "combinationId": "o3 mini + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + Claude 3.7 Sonnet::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + Claude 3.7 Sonnet", "model": "o3 mini + Claude 3.7 Sonnet", "team": "2-Agent Teams", @@ -1328,7 +1600,7 @@ "harm": "" }, { - "combinationId": "o3 mini + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + DeepSeek R1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + DeepSeek R1", "model": "o3 mini + DeepSeek R1", "team": "2-Agent Teams", @@ -1336,7 +1608,7 @@ "harm": "" }, { - "combinationId": "o3 mini + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + Gemini 2.0 Flash::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + Gemini 2.0 Flash", "model": "o3 mini + Gemini 2.0 Flash", "team": "2-Agent Teams", @@ -1344,7 +1616,15 @@ "harm": "" }, { - "combinationId": "o3 mini + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + Gemini 2.0 Flash + Claude 3.7 Sonnet::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "o3 mini + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "model": "o3 mini + Gemini 2.0 Flash + Claude 3.7 Sonnet", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "o3 mini + Gemini 2.5 Pro::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + Gemini 2.5 Pro", "model": "o3 mini + Gemini 2.5 Pro", "team": "2-Agent Teams", @@ -1352,7 +1632,15 @@ "harm": "" }, { - "combinationId": "o3 mini + GPT-4.1::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + Gemini 2.5 Pro + GPT-5::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "o3 mini + Gemini 2.5 Pro + GPT-5", + "model": "o3 mini + Gemini 2.5 Pro + GPT-5", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "o3 mini + GPT-4.1::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + GPT-4.1", "model": "o3 mini + GPT-4.1", "team": "2-Agent Teams", @@ -1360,7 +1648,7 @@ "harm": "" }, { - "combinationId": "o3 mini + GPT-5::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + GPT-5::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + GPT-5", "model": "o3 mini + GPT-5", "team": "2-Agent Teams", @@ -1368,7 +1656,7 @@ "harm": "" }, { - "combinationId": "o3 mini + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + GPT-5 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + GPT-5 mini", "model": "o3 mini + GPT-5 mini", "team": "2-Agent Teams", @@ -1376,7 +1664,7 @@ "harm": "" }, { - "combinationId": "o3 mini + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + Llama 4 Maverick::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + Llama 4 Maverick", "model": "o3 mini + Llama 4 Maverick", "team": "2-Agent Teams", @@ -1384,7 +1672,7 @@ "harm": "" }, { - "combinationId": "o3 mini + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + Llama 4 Scout::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + Llama 4 Scout", "model": "o3 mini + Llama 4 Scout", "team": "2-Agent Teams", @@ -1392,7 +1680,15 @@ "harm": "" }, { - "combinationId": "o3 mini + o3 mini::2-Agent Teams::Advisor + Guardian::::AllHarm::AllCases::Unanimous", + "combinationId": "o3 mini + Llama 4 Scout + Gemini 2.5 Pro::3-Agent Teams::Advisor + Guardian + Guardian::::::::", + "displayLabel": "o3 mini + Llama 4 Scout + Gemini 2.5 Pro", + "model": "o3 mini + Llama 4 Scout + Gemini 2.5 Pro", + "team": "3-Agent Teams", + "condition": "Advisor + Guardian + Guardian", + "harm": "" + }, + { + "combinationId": "o3 mini + o3 mini::2-Agent Teams::Advisor + Guardian::::::::", "displayLabel": "o3 mini + o3 mini", "model": "o3 mini + o3 mini", "team": "2-Agent Teams", @@ -1400,7 +1696,7 @@ "harm": "" }, { - "combinationId": "o4 mini::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "o4 mini::Solo Models::Advisor::::::::", "displayLabel": "o4 mini", "model": "o4 mini", "team": "Solo Models", @@ -1408,7 +1704,7 @@ "harm": "" }, { - "combinationId": "Qwen3 235B::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Qwen3 235B::Solo Models::Advisor::::::::", "displayLabel": "Qwen3 235B", "model": "Qwen3 235B", "team": "Solo Models", @@ -1416,7 +1712,7 @@ "harm": "" }, { - "combinationId": "Qwen3 32B::Solo Models::Advisor::::AllHarm::AllCases::Unanimous", + "combinationId": "Qwen3 32B::Solo Models::Advisor::::::::", "displayLabel": "Qwen3 32B", "model": "Qwen3 32B", "team": "Solo Models", diff --git a/frontend/src/components/FiltersPanel.tsx b/frontend/src/components/FiltersPanel.tsx index bd228ae..9139666 100644 --- a/frontend/src/components/FiltersPanel.tsx +++ b/frontend/src/components/FiltersPanel.tsx @@ -172,7 +172,7 @@ export function TeamFiltersBar({

- TEAM CONFIGURATION + MULTI-AGENT CONFIGURATION

{showTeamInfo ? (
- View performance of multi-agent teams, where one model reviews and edits the output of other models in a Guardian or Stewardship role + View performance of multi-agent teams, where models can review and update the output of other models, as if providing a 2nd opinion
) : null}
From 6fa549d74b30488ef0b61316b9735beb44f086de Mon Sep 17 00:00:00 2001 From: symbiologist Date: Sun, 14 Dec 2025 16:25:36 -0500 Subject: [PATCH 2/3] Fix runtime error --- data/metrics.csv | 2096 ---------------------------------------------- 1 file changed, 2096 deletions(-) diff --git a/data/metrics.csv b/data/metrics.csv index 869ed6b..082b0b7 100644 --- a/data/metrics.csv +++ b/data/metrics.csv @@ -1,3983 +1,1992 @@ Model,Team,Condition,Provider,Metric,mean,ci Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Completeness,0.569,0.016 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.569,0.016 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Completeness,0.492,0.048 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.492,0.048 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.618,0.008 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.618,0.008 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.488,0.024 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.488,0.024 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.553,0.044 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.553,0.044 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Completeness,0.589,0.035 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.589,0.035 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Completeness,0.52,0.016 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.016 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Completeness,0.549,0.024 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.024 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Completeness,0.549,0.024 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.024 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Completeness,0.516,0.021 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.021 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Completeness,0.517,0.016 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.517,0.016 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Completeness,0.555,0.018 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.555,0.018 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Completeness,0.595,0.03 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.595,0.03 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Completeness,0.509,0.025 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.509,0.025 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Completeness,0.598,0.05 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.598,0.05 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Completeness,0.604,0.019 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.604,0.019 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Completeness,0.606,0.024 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.606,0.024 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Completeness,0.559,0.016 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.559,0.016 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Completeness,0.646,0.023 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.646,0.023 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Completeness,0.565,0.02 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.565,0.02 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Completeness,0.546,0.024 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.546,0.024 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Completeness,0.593,0.048 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.593,0.048 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.699,0.042 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.699,0.042 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.512,0.05 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.512,0.05 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.585,0.014 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.585,0.014 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.638,0.016 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.638,0.016 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.589,0.035 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.589,0.035 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.622,0 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.622,0.014 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0.014 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.602,0.008 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.602,0.008 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Completeness,0.419,0.016 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.419,0.016 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.622,0.037 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0.037 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.468,0.024 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.468,0.024 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.5,0.014 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.5,0.014 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.52,0.029 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.029 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.516,0.016 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.016 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.516,0.016 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.016 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Completeness,0.51,0.018 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.51,0.018 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.467,0.042 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.467,0.042 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.496,0.032 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.496,0.032 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.455,0.042 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.455,0.042 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.659,0.024 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.659,0.024 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.492,0.008 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.492,0.008 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.516,0.035 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.516,0.035 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.537,0.05 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.05 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.557,0.021 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.557,0.021 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.472,0.029 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.472,0.029 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.459,0.029 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.459,0.029 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Completeness,0.634,0.017 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.634,0.017 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Completeness,0.553,0.048 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.553,0.048 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.703,0.044 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.703,0.044 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.545,0.016 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.545,0.016 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.602,0.029 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.602,0.029 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.659,0.024 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.659,0.024 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.533,0.029 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.533,0.029 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.663,0.01 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.663,0.01 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.663,0.016 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.663,0.016 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.627,0.026 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.627,0.026 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Completeness,0.537,0.041 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.041 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.634,0.024 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.634,0.024 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.615,0.012 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.615,0.012 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.553,0.021 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.553,0.021 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Completeness,0.673,0.021 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.673,0.021 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.622,0.02 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.622,0.02 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Completeness,0.496,0.008 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.496,0.008 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.729,0.04 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.729,0.04 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.549,0.028 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.028 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.591,0.024 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.591,0.024 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.589,0.035 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.589,0.035 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Completeness,0.541,0.027 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.541,0.027 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Completeness,0.618,0.028 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.618,0.028 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.537,0.014 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.014 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Completeness,0.537,0.014 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.014 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Completeness,0.52,0.016 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.016 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Completeness,0.593,0.022 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.593,0.022 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Completeness,0.463,0.055 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.463,0.055 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.646,0.028 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.646,0.028 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.492,0.029 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.492,0.029 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.581,0.021 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.581,0.021 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.549,0.028 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.549,0.028 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.606,0.029 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.606,0.029 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Completeness,0.643,0.017 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.643,0.017 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.497,0.016 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.497,0.016 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.504,0.021 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.504,0.021 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.476,0.014 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.476,0.014 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Completeness,0.61,0.014 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.61,0.014 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Completeness,0.537,0.021 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.021 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.654,0.032 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.654,0.032 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.512,0.014 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.512,0.014 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.581,0.021 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.581,0.021 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.659,0.028 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.659,0.028 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Completeness,0.618,0.023 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.618,0.023 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.587,0.012 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.587,0.012 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Completeness,0.591,0.011 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.591,0.011 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Completeness,0.577,0.029 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.577,0.029 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Completeness,0.534,0.025 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.534,0.025 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Completeness,0.358,0.044 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.358,0.044 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.537,0.05 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.537,0.05 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.435,0.032 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.435,0.032 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.447,0.016 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.447,0.016 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.411,0.029 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.411,0.029 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Completeness,0.52,0.021 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.52,0.021 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.264,0.021 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.264,0.021 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Completeness,0.28,0.014 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.28,0.014 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Completeness,0.268,0.028 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Completeness,0.268,0.028 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Completeness,0.5,0.034 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.5,0.034 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Completeness,0.498,0.024 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.498,0.024 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Completeness,0.614,0.019 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.614,0.019 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Completeness,0.583,0.013 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.583,0.013 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Completeness,0.633,0.026 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.633,0.026 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Completeness,0.558,0.033 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.558,0.033 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Completeness,0.587,0.016 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.587,0.016 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Completeness,0.544,0.014 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.544,0.014 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Completeness,0.451,0.023 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.451,0.023 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Completeness,0.573,0.01 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.573,0.01 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Completeness,0.636,0.013 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.636,0.013 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Completeness,0.561,0.037 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.561,0.037 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.598,0.036 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.598,0.036 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.543,0.026 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.543,0.026 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Completeness,0.539,0.024 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.539,0.024 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Completeness,0.568,0.022 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.568,0.022 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.535,0.02 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.535,0.02 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Completeness,0.573,0.024 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.573,0.024 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Completeness,0.59,0.022 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.59,0.022 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Completeness,0.565,0.02 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.565,0.02 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Completeness,0.566,0.029 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.566,0.029 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Completeness,0.6,0.026 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.6,0.026 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Completeness,0.553,0.02 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.553,0.02 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Completeness,0.522,0.028 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.522,0.028 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Completeness,0.593,0.016 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.593,0.016 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.495,0.01 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.495,0.01 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Completeness,0.618,0.022 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.618,0.022 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Completeness,0.599,0.026 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.599,0.026 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Completeness,0.618,0.023 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.618,0.023 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Completeness,0.66,0.023 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.66,0.023 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Completeness,0.628,0.05 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.628,0.05 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Completeness,0.534,0.027 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.534,0.027 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Completeness,0.615,0.031 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.615,0.031 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.593,0.014 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.593,0.014 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.598,0.017 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.598,0.017 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Completeness,0.51,0.04 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.51,0.04 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Completeness,0.537,0.031 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.537,0.031 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Completeness,0.626,0.065 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.626,0.065 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.593,0.012 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.593,0.012 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Completeness,0.595,0.02 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Completeness,0.595,0.02 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Completeness,0.543,0.011 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,Completeness,0.543,0.011 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Completeness,0.512,0.019 -Claude Haiku 4.5,Solo Models,Advisor,NA,Completeness,0.512,0.019 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Completeness,0.51,0.016 -Claude Sonnet 4.5,Solo Models,Advisor,NA,Completeness,0.51,0.016 DeepSeek R1,Solo Models,Advisor,DeepSeek,Completeness,0.573,0.015 -DeepSeek R1,Solo Models,Advisor,NA,Completeness,0.573,0.015 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Completeness,0.628,0.023 -DeepSeek V3.1,Solo Models,Advisor,NA,Completeness,0.628,0.023 -Expert AI,Solo Models,Advisor,NA,Completeness,0.656,0.009 GPT-4.1,Solo Models,Advisor,OpenAI,Completeness,0.618,0.01 -GPT-4.1,Solo Models,Advisor,NA,Completeness,0.618,0.01 GPT-4.1 mini,Solo Models,Advisor,OpenAI,Completeness,0.458,0.013 -GPT-4.1 mini,Solo Models,Advisor,NA,Completeness,0.458,0.013 GPT-4o,Solo Models,Advisor,OpenAI,Completeness,0.565,0.041 -GPT-4o,Solo Models,Advisor,NA,Completeness,0.565,0.041 GPT-4o mini,Solo Models,Advisor,OpenAI,Completeness,0.332,0.016 -GPT-4o mini,Solo Models,Advisor,NA,Completeness,0.332,0.016 GPT-5,Solo Models,Advisor,OpenAI,Completeness,0.502,0.017 -GPT-5,Solo Models,Advisor,NA,Completeness,0.502,0.017 GPT-5 mini,Solo Models,Advisor,OpenAI,Completeness,0.505,0.016 -GPT-5 mini,Solo Models,Advisor,NA,Completeness,0.505,0.016 GPT-5 nano,Solo Models,Advisor,OpenAI,Completeness,0.438,0.019 -GPT-5 nano,Solo Models,Advisor,NA,Completeness,0.438,0.019 Gemini 2.0 Flash,Solo Models,Advisor,Google,Completeness,0.654,0.01 -Gemini 2.0 Flash,Solo Models,Advisor,NA,Completeness,0.654,0.01 Gemini 2.5 Flash,Solo Models,Advisor,Google,Completeness,0.632,0.012 -Gemini 2.5 Flash,Solo Models,Advisor,NA,Completeness,0.632,0.012 Gemini 2.5 Pro,Solo Models,Advisor,Google,Completeness,0.583,0.027 -Gemini 2.5 Pro,Solo Models,Advisor,NA,Completeness,0.583,0.027 Gemini 3 Pro,Solo Models,Advisor,Google,Completeness,0.435,0.026 -Gemini 3 Pro,Solo Models,Advisor,NA,Completeness,0.435,0.026 Glass Health 4.0,Solo Models,Advisor,Glass Health,Completeness,0.52,0.027 -Glass Health 4.0,Solo Models,Advisor,NA,Completeness,0.52,0.027 Grok 4,Solo Models,Advisor,xAI,Completeness,0.573,0.032 -Grok 4,Solo Models,Advisor,NA,Completeness,0.573,0.032 Grok 4 Fast,Solo Models,Advisor,xAI,Completeness,0.554,0.033 -Grok 4 Fast,Solo Models,Advisor,NA,Completeness,0.554,0.033 Kimi K2,Solo Models,Advisor,Moonshot AI,Completeness,0.637,0.025 -Kimi K2,Solo Models,Advisor,NA,Completeness,0.637,0.025 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Completeness,0.671,0.012 -LiSA 1.0,Solo Models,Advisor,NA,Completeness,0.671,0.012 Llama 3.3 70b,Solo Models,Advisor,Meta,Completeness,0.56,0.02 -Llama 3.3 70b,Solo Models,Advisor,NA,Completeness,0.56,0.02 Llama 4 Maverick,Solo Models,Advisor,Meta,Completeness,0.516,0.011 -Llama 4 Maverick,Solo Models,Advisor,NA,Completeness,0.516,0.011 Llama 4 Scout,Solo Models,Advisor,Meta,Completeness,0.579,0.007 -Llama 4 Scout,Solo Models,Advisor,NA,Completeness,0.579,0.007 MedGemma 27B,Solo Models,Advisor,Google,Completeness,0.515,0.021 -MedGemma 27B,Solo Models,Advisor,NA,Completeness,0.515,0.021 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Completeness,0.498,0.036 -Mistral Large 2.1,Solo Models,Advisor,NA,Completeness,0.498,0.036 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Completeness,0.474,0.029 -Mistral Medium 3.1,Solo Models,Advisor,NA,Completeness,0.474,0.029 Qwen3 235B,Solo Models,Advisor,Alibaba,Completeness,0.534,0.041 -Qwen3 235B,Solo Models,Advisor,NA,Completeness,0.534,0.041 Qwen3 32B,Solo Models,Advisor,Alibaba,Completeness,0.483,0.018 -Qwen3 32B,Solo Models,Advisor,NA,Completeness,0.483,0.018 o1,Solo Models,Advisor,OpenAI,Completeness,0.441,0.023 -o1,Solo Models,Advisor,NA,Completeness,0.441,0.023 o1 mini,Solo Models,Advisor,OpenAI,Completeness,0.487,0.028 -o1 mini,Solo Models,Advisor,NA,Completeness,0.487,0.028 o3 mini,Solo Models,Advisor,OpenAI,Completeness,0.282,0.013 -o3 mini,Solo Models,Advisor,NA,Completeness,0.282,0.013 o4 mini,Solo Models,Advisor,OpenAI,Completeness,0.355,0.018 -o4 mini,Solo Models,Advisor,NA,Completeness,0.355,0.018 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Escalation,0.766,0.01 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.766,0.01 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Escalation,0.723,0.04 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.723,0.04 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.756,0.01 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.756,0.01 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.692,0.043 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.692,0.043 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.627,0.017 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.627,0.017 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Escalation,0.662,0.026 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.662,0.026 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Escalation,0.754,0.037 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.754,0.037 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Escalation,0.647,0.01 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.647,0.01 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Escalation,0.652,0.02 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.652,0.02 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Escalation,0.647,0.01 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.647,0.01 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Escalation,0.688,0.031 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.688,0.031 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Escalation,0.684,0.018 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.684,0.018 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Escalation,0.782,0.022 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.782,0.022 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Escalation,0.743,0.017 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.743,0.017 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Escalation,0.657,0.017 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.657,0.017 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Escalation,0.752,0.018 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.752,0.018 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Escalation,0.783,0.012 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.783,0.012 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Escalation,0.774,0.018 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.774,0.018 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Escalation,0.782,0.018 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.782,0.018 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Escalation,0.75,0.015 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.75,0.015 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Escalation,0.734,0.012 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.734,0.012 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Escalation,0.746,0.017 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.746,0.017 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.821,0.029 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.821,0.029 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.667,0.035 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.667,0.035 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.652,0.01 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.652,0.01 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.736,0.02 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.736,0.02 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.776,0.045 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.045 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.716,0.017 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.716,0.017 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.716,0.017 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.716,0.017 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.701,0.017 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.701,0.017 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Escalation,0.657,0.034 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.657,0.034 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.771,0.054 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.771,0.054 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.655,0.019 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.655,0.019 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.597,0.034 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.597,0.034 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.637,0.039 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.637,0.039 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.692,0.01 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.692,0.01 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.667,0.054 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.667,0.054 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Escalation,0.669,0.017 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.669,0.017 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.632,0.02 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.632,0.02 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.647,0.035 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.647,0.035 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.622,0.026 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.622,0.026 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.749,0.019 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.749,0.019 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.607,0.01 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.607,0.01 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.542,0.026 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.542,0.026 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.597,0.074 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.597,0.074 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.658,0.02 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.658,0.02 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.527,0.026 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.527,0.026 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.512,0.026 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.512,0.026 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Escalation,0.718,0.018 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.718,0.018 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Escalation,0.713,0.033 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.713,0.033 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.781,0.059 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.781,0.059 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.682,0.035 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.682,0.035 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.657,0.051 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.657,0.051 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.677,0.035 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.677,0.035 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.739,0.055 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.739,0.055 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.701,0.028 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.701,0.028 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.692,0.043 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.692,0.043 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.654,0.036 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.654,0.036 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Escalation,0.821,0.045 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.821,0.045 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.801,0.01 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.801,0.01 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.839,0.02 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.839,0.02 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.775,0.03 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.775,0.03 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Escalation,0.851,0.014 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.851,0.014 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.829,0.019 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.829,0.019 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Escalation,0.724,0.007 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.724,0.007 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.866,0.033 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.866,0.033 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.751,0.01 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.751,0.01 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.712,0.024 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.712,0.024 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.782,0.025 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.782,0.025 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Escalation,0.729,0.023 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.729,0.023 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Escalation,0.776,0.022 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.022 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.738,0.025 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.738,0.025 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Escalation,0.738,0.025 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.738,0.025 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Escalation,0.708,0.024 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.708,0.024 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Escalation,0.768,0.008 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.768,0.008 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Escalation,0.74,0.025 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.74,0.025 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.811,0.039 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.811,0.039 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.697,0.039 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.697,0.039 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.662,0.054 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.662,0.054 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.766,0.026 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.766,0.026 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.764,0.067 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.764,0.067 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Escalation,0.787,0.014 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.787,0.014 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.769,0.017 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.769,0.017 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.776,0.017 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.017 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.743,0.033 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.743,0.033 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Escalation,0.796,0.026 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.796,0.026 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Escalation,0.732,0.025 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.732,0.025 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.851,0.045 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.851,0.045 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.711,0.043 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.711,0.043 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.607,0.01 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.607,0.01 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.776,0.017 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.776,0.017 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Escalation,0.821,0.011 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.821,0.011 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.778,0.015 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.778,0.015 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Escalation,0.754,0.02 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.754,0.02 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Escalation,0.721,0.026 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.721,0.026 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Escalation,0.649,0.019 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.649,0.019 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Escalation,0.554,0.067 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.554,0.067 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.584,0.106 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.584,0.106 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.552,0.017 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.552,0.017 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.495,0.038 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.495,0.038 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.55,0.034 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.55,0.034 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Escalation,0.676,0.033 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.676,0.033 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.34,0.02 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.34,0.02 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Escalation,0.374,0.025 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.374,0.025 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Escalation,0.34,0.027 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Escalation,0.34,0.027 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Escalation,0.727,0.014 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.727,0.014 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Escalation,0.764,0.027 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.764,0.027 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Escalation,0.749,0.014 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.749,0.014 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Escalation,0.754,0.019 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.754,0.019 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Escalation,0.777,0.017 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.777,0.017 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Escalation,0.754,0.019 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.754,0.019 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Escalation,0.765,0.038 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.765,0.038 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Escalation,0.759,0.038 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.759,0.038 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Escalation,0.669,0.021 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.669,0.021 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Escalation,0.722,0.038 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.722,0.038 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Escalation,0.73,0.019 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.73,0.019 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Escalation,0.708,0.038 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.708,0.038 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.839,0.026 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.839,0.026 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.8,0.044 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.8,0.044 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Escalation,0.774,0.029 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.774,0.029 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Escalation,0.771,0.033 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.771,0.033 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.767,0.033 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.767,0.033 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Escalation,0.811,0.035 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.811,0.035 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Escalation,0.785,0.007 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.785,0.007 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Escalation,0.783,0.024 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.783,0.024 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Escalation,0.713,0.02 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.713,0.02 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Escalation,0.727,0.026 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.727,0.026 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Escalation,0.727,0.033 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.727,0.033 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Escalation,0.725,0.036 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.725,0.036 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Escalation,0.721,0.043 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.721,0.043 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.77,0.022 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.77,0.022 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Escalation,0.824,0.008 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.824,0.008 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Escalation,0.753,0.019 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.753,0.019 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Escalation,0.823,0.014 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.823,0.014 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Escalation,0.83,0.013 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.83,0.013 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Escalation,0.833,0.015 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.833,0.015 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Escalation,0.725,0.033 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.725,0.033 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Escalation,0.82,0.024 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.82,0.024 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.779,0.023 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.779,0.023 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.773,0.025 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.773,0.025 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Escalation,0.794,0.025 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.794,0.025 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Escalation,0.749,0.03 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.749,0.03 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Escalation,0.838,0.044 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.838,0.044 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.758,0.027 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.758,0.027 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Escalation,0.765,0.03 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Escalation,0.765,0.03 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Escalation,0.642,0.012 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,Escalation,0.642,0.012 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Escalation,0.687,0.014 -Claude Haiku 4.5,Solo Models,Advisor,NA,Escalation,0.687,0.014 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Escalation,0.64,0.012 -Claude Sonnet 4.5,Solo Models,Advisor,NA,Escalation,0.64,0.012 DeepSeek R1,Solo Models,Advisor,DeepSeek,Escalation,0.756,0.011 -DeepSeek R1,Solo Models,Advisor,NA,Escalation,0.756,0.011 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Escalation,0.765,0.02 -DeepSeek V3.1,Solo Models,Advisor,NA,Escalation,0.765,0.02 -Expert AI,Solo Models,Advisor,NA,Escalation,0.748,0.009 GPT-4.1,Solo Models,Advisor,OpenAI,Escalation,0.724,0.013 -GPT-4.1,Solo Models,Advisor,NA,Escalation,0.724,0.013 GPT-4.1 mini,Solo Models,Advisor,OpenAI,Escalation,0.527,0.011 -GPT-4.1 mini,Solo Models,Advisor,NA,Escalation,0.527,0.011 GPT-4o,Solo Models,Advisor,OpenAI,Escalation,0.81,0.018 -GPT-4o,Solo Models,Advisor,NA,Escalation,0.81,0.018 GPT-4o mini,Solo Models,Advisor,OpenAI,Escalation,0.722,0.015 -GPT-4o mini,Solo Models,Advisor,NA,Escalation,0.722,0.015 GPT-5,Solo Models,Advisor,OpenAI,Escalation,0.678,0.016 -GPT-5,Solo Models,Advisor,NA,Escalation,0.678,0.016 GPT-5 mini,Solo Models,Advisor,OpenAI,Escalation,0.54,0.022 -GPT-5 mini,Solo Models,Advisor,NA,Escalation,0.54,0.022 GPT-5 nano,Solo Models,Advisor,OpenAI,Escalation,0.568,0.023 -GPT-5 nano,Solo Models,Advisor,NA,Escalation,0.568,0.023 Gemini 2.0 Flash,Solo Models,Advisor,Google,Escalation,0.693,0.02 -Gemini 2.0 Flash,Solo Models,Advisor,NA,Escalation,0.693,0.02 Gemini 2.5 Flash,Solo Models,Advisor,Google,Escalation,0.823,0.013 -Gemini 2.5 Flash,Solo Models,Advisor,NA,Escalation,0.823,0.013 Gemini 2.5 Pro,Solo Models,Advisor,Google,Escalation,0.733,0.013 -Gemini 2.5 Pro,Solo Models,Advisor,NA,Escalation,0.733,0.013 Gemini 3 Pro,Solo Models,Advisor,Google,Escalation,0.6,0.024 -Gemini 3 Pro,Solo Models,Advisor,NA,Escalation,0.6,0.024 Glass Health 4.0,Solo Models,Advisor,Glass Health,Escalation,0.711,0.025 -Glass Health 4.0,Solo Models,Advisor,NA,Escalation,0.711,0.025 Grok 4,Solo Models,Advisor,xAI,Escalation,0.755,0.014 -Grok 4,Solo Models,Advisor,NA,Escalation,0.755,0.014 Grok 4 Fast,Solo Models,Advisor,xAI,Escalation,0.751,0.019 -Grok 4 Fast,Solo Models,Advisor,NA,Escalation,0.751,0.019 Kimi K2,Solo Models,Advisor,Moonshot AI,Escalation,0.82,0.019 -Kimi K2,Solo Models,Advisor,NA,Escalation,0.82,0.019 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Escalation,0.731,0.016 -LiSA 1.0,Solo Models,Advisor,NA,Escalation,0.731,0.016 Llama 3.3 70b,Solo Models,Advisor,Meta,Escalation,0.76,0.028 -Llama 3.3 70b,Solo Models,Advisor,NA,Escalation,0.76,0.028 Llama 4 Maverick,Solo Models,Advisor,Meta,Escalation,0.758,0.013 -Llama 4 Maverick,Solo Models,Advisor,NA,Escalation,0.758,0.013 Llama 4 Scout,Solo Models,Advisor,Meta,Escalation,0.76,0.008 -Llama 4 Scout,Solo Models,Advisor,NA,Escalation,0.76,0.008 MedGemma 27B,Solo Models,Advisor,Google,Escalation,0.713,0.017 -MedGemma 27B,Solo Models,Advisor,NA,Escalation,0.713,0.017 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Escalation,0.689,0.034 -Mistral Large 2.1,Solo Models,Advisor,NA,Escalation,0.689,0.034 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Escalation,0.617,0.029 -Mistral Medium 3.1,Solo Models,Advisor,NA,Escalation,0.617,0.029 Qwen3 235B,Solo Models,Advisor,Alibaba,Escalation,0.772,0.044 -Qwen3 235B,Solo Models,Advisor,NA,Escalation,0.772,0.044 Qwen3 32B,Solo Models,Advisor,Alibaba,Escalation,0.714,0.023 -Qwen3 32B,Solo Models,Advisor,NA,Escalation,0.714,0.023 o1,Solo Models,Advisor,OpenAI,Escalation,0.6,0.014 -o1,Solo Models,Advisor,NA,Escalation,0.6,0.014 o1 mini,Solo Models,Advisor,OpenAI,Escalation,0.76,0.013 -o1 mini,Solo Models,Advisor,NA,Escalation,0.76,0.013 o3 mini,Solo Models,Advisor,OpenAI,Escalation,0.384,0.018 -o3 mini,Solo Models,Advisor,NA,Escalation,0.384,0.018 o4 mini,Solo Models,Advisor,OpenAI,Escalation,0.35,0.022 -o4 mini,Solo Models,Advisor,NA,Escalation,0.35,0.022 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,F1,0.619,0.002 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.619,0.002 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,F1,0.633,0.005 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.633,0.005 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.456,0.016 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.456,0.016 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.664,0.007 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.664,0.007 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.645,0.009 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.645,0.009 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,F1,0.608,0.006 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.608,0.006 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,F1,0.636,0.004 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.636,0.004 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,F1,0.611,0.008 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.611,0.008 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,F1,0.604,0.005 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.604,0.005 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,F1,0.622,0.009 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.622,0.009 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,F1,0.639,0.02 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.639,0.02 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,F1,0.656,0.004 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.656,0.004 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,F1,0.626,0.004 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.626,0.004 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,F1,0.617,0.005 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.617,0.005 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,F1,0.628,0.004 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.628,0.004 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,F1,0.592,0.005 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.592,0.005 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,F1,0.612,0.004 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.612,0.004 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,F1,0.612,0.008 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.612,0.008 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,F1,0.637,0.003 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.637,0.003 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,F1,0.599,0.006 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.599,0.006 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,F1,0.608,0.004 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.608,0.004 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,F1,0.604,0.007 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.604,0.007 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.407,0.018 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.407,0.018 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.654,0.006 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.654,0.006 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.625,0.004 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.625,0.004 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.565,0.005 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.565,0.005 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.607,0.011 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.607,0.011 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.56,0.009 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.56,0.009 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.555,0.009 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.555,0.009 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.577,0.01 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.577,0.01 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,F1,0.673,0.004 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.673,0.004 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.429,0.012 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.429,0.012 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.671,0.007 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.671,0.007 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.669,0.008 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.669,0.008 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.669,0.003 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.669,0.003 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.675,0.01 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.675,0.01 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.653,0.018 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.653,0.018 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,F1,0.675,0.004 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.675,0.004 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.676,0.006 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.676,0.006 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.638,0.01 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.638,0.01 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.674,0.007 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.674,0.007 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.415,0.034 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.415,0.034 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.669,0.01 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.669,0.01 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.652,0.003 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.652,0.003 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.652,0.006 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.652,0.006 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.641,0.002 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.641,0.002 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.639,0.007 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.639,0.007 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.648,0.012 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.648,0.012 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,F1,0.598,0.002 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.598,0.002 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,F1,0.603,0.007 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.603,0.007 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.471,0.007 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.471,0.007 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.662,0.009 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.662,0.009 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.617,0.007 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.617,0.007 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.533,0.001 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.533,0.001 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.596,0.01 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.596,0.01 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.536,0.003 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.536,0.003 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.525,0.007 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.525,0.007 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.568,0.006 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.568,0.006 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,F1,0.61,0.013 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.61,0.013 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.61,0.006 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.61,0.006 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.589,0.002 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.589,0.002 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.581,0.013 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.581,0.013 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,F1,0.625,0.003 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.625,0.003 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.582,0.003 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.582,0.003 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,F1,0.643,0.005 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.643,0.005 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.402,0.02 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.402,0.02 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.674,0.005 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.674,0.005 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.65,0.004 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.65,0.004 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.639,0.006 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.639,0.006 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,F1,0.603,0.015 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.603,0.015 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,F1,0.659,0.004 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.659,0.004 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.637,0.007 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.637,0.007 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,F1,0.634,0.011 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.634,0.011 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,F1,0.635,0.01 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.635,0.01 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,F1,0.61,0.006 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.61,0.006 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,F1,0.611,0.008 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.611,0.008 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.49,0.026 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.49,0.026 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.668,0.011 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.668,0.011 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.634,0.005 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.634,0.005 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.543,0.002 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.543,0.002 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.597,0.008 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.597,0.008 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,F1,0.62,0.002 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,F1,0.62,0.002 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.544,0.003 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.544,0.003 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.536,0.007 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.536,0.007 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.582,0.009 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.582,0.009 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,F1,0.6,0.016 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.6,0.016 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,F1,0.604,0.004 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.604,0.004 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.492,0.006 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.492,0.006 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.672,0.009 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.672,0.009 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.62,0.004 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.62,0.004 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.516,0.001 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.516,0.001 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,F1,0.601,0.011 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.601,0.011 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.491,0.003 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.491,0.003 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,F1,0.482,0.004 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.482,0.004 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,F1,0.56,0.007 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.56,0.007 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,F1,0.635,0.01 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,F1,0.635,0.01 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,F1,0.632,0.017 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,F1,0.632,0.017 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.444,0.03 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,F1,0.444,0.03 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.67,0.004 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,F1,0.67,0.004 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.651,0.006 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.651,0.006 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.614,0.014 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,F1,0.614,0.014 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,F1,0.645,0.016 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,F1,0.645,0.016 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.573,0.019 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,F1,0.573,0.019 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,F1,0.558,0.013 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,F1,0.558,0.013 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,F1,0.558,0.026 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,F1,0.558,0.026 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,F1,0.615,0.005 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.615,0.005 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,F1,0.653,0.006 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.653,0.006 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,F1,0.602,0.006 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.602,0.006 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,F1,0.628,0.005 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.628,0.005 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,F1,0.629,0.004 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.629,0.004 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,F1,0.662,0.005 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.662,0.005 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,F1,0.621,0.004 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.621,0.004 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,F1,0.661,0.006 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.661,0.006 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,F1,0.675,0.007 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.675,0.007 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,F1,0.606,0.013 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.606,0.013 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,F1,0.622,0.003 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.622,0.003 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,F1,0.654,0.006 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.654,0.006 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.588,0.003 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.588,0.003 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.607,0.007 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.607,0.007 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,F1,0.612,0.006 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.006 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,F1,0.652,0.006 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.652,0.006 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.613,0.009 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.613,0.009 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,F1,0.612,0.005 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.005 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,F1,0.662,0.003 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.662,0.003 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,F1,0.612,0.009 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.009 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,F1,0.661,0.006 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.661,0.006 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,F1,0.656,0.004 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.656,0.004 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,F1,0.662,0.009 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.662,0.009 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,F1,0.624,0.012 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.624,0.012 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,F1,0.666,0.009 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.666,0.009 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.546,0.004 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.546,0.004 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,F1,0.612,0.009 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.009 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,F1,0.664,0.005 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.664,0.005 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,F1,0.616,0.007 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.616,0.007 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,F1,0.64,0.006 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.64,0.006 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,F1,0.612,0.012 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.612,0.012 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,F1,0.663,0.007 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.663,0.007 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,F1,0.623,0.005 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.623,0.005 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.49,0.005 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.49,0.005 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.49,0.005 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.49,0.005 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,F1,0.584,0.006 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.584,0.006 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,F1,0.665,0.008 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.665,0.008 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,F1,0.614,0.009 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.614,0.009 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.484,0.005 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.484,0.005 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,F1,0.482,0.008 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,F1,0.482,0.008 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,F1,0.614,0.004 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,F1,0.614,0.004 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,F1,0.562,0.004 -Claude Haiku 4.5,Solo Models,Advisor,NA,F1,0.562,0.004 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,F1,0.642,0.003 -Claude Sonnet 4.5,Solo Models,Advisor,NA,F1,0.642,0.003 DeepSeek R1,Solo Models,Advisor,DeepSeek,F1,0.601,0.005 -DeepSeek R1,Solo Models,Advisor,NA,F1,0.601,0.005 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,F1,0.564,0.004 -DeepSeek V3.1,Solo Models,Advisor,NA,F1,0.564,0.004 -Expert AI,Solo Models,Advisor,NA,F1,0.571,0.004 GPT-4.1,Solo Models,Advisor,OpenAI,F1,0.564,0.003 -GPT-4.1,Solo Models,Advisor,NA,F1,0.564,0.003 GPT-4.1 mini,Solo Models,Advisor,OpenAI,F1,0.54,0.003 -GPT-4.1 mini,Solo Models,Advisor,NA,F1,0.54,0.003 GPT-4o,Solo Models,Advisor,OpenAI,F1,0.544,0.004 -GPT-4o,Solo Models,Advisor,NA,F1,0.544,0.004 GPT-4o mini,Solo Models,Advisor,OpenAI,F1,0.506,0.007 -GPT-4o mini,Solo Models,Advisor,NA,F1,0.506,0.007 GPT-5,Solo Models,Advisor,OpenAI,F1,0.669,0.004 -GPT-5,Solo Models,Advisor,NA,F1,0.669,0.004 GPT-5 mini,Solo Models,Advisor,OpenAI,F1,0.645,0.004 -GPT-5 mini,Solo Models,Advisor,NA,F1,0.645,0.004 GPT-5 nano,Solo Models,Advisor,OpenAI,F1,0.56,0.007 -GPT-5 nano,Solo Models,Advisor,NA,F1,0.56,0.007 Gemini 2.0 Flash,Solo Models,Advisor,Google,F1,0.531,0.003 -Gemini 2.0 Flash,Solo Models,Advisor,NA,F1,0.531,0.003 Gemini 2.5 Flash,Solo Models,Advisor,Google,F1,0.566,0.008 -Gemini 2.5 Flash,Solo Models,Advisor,NA,F1,0.566,0.008 Gemini 2.5 Pro,Solo Models,Advisor,Google,F1,0.627,0.005 -Gemini 2.5 Pro,Solo Models,Advisor,NA,F1,0.627,0.005 Gemini 3 Pro,Solo Models,Advisor,Google,F1,0.649,0.003 -Gemini 3 Pro,Solo Models,Advisor,NA,F1,0.649,0.003 Glass Health 4.0,Solo Models,Advisor,Glass Health,F1,0.662,0.004 -Glass Health 4.0,Solo Models,Advisor,NA,F1,0.662,0.004 Grok 4,Solo Models,Advisor,xAI,F1,0.612,0.005 -Grok 4,Solo Models,Advisor,NA,F1,0.612,0.005 Grok 4 Fast,Solo Models,Advisor,xAI,F1,0.596,0.005 -Grok 4 Fast,Solo Models,Advisor,NA,F1,0.596,0.005 Kimi K2,Solo Models,Advisor,Moonshot AI,F1,0.545,0.006 -Kimi K2,Solo Models,Advisor,NA,F1,0.545,0.006 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,F1,0.623,0.004 -LiSA 1.0,Solo Models,Advisor,NA,F1,0.623,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,F1,0.5,0.005 -Llama 3.3 70b,Solo Models,Advisor,NA,F1,0.5,0.005 Llama 4 Maverick,Solo Models,Advisor,Meta,F1,0.544,0.003 -Llama 4 Maverick,Solo Models,Advisor,NA,F1,0.544,0.003 Llama 4 Scout,Solo Models,Advisor,Meta,F1,0.479,0.002 -Llama 4 Scout,Solo Models,Advisor,NA,F1,0.479,0.002 MedGemma 27B,Solo Models,Advisor,Google,F1,0.541,0.005 -MedGemma 27B,Solo Models,Advisor,NA,F1,0.541,0.005 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,F1,0.578,0.005 -Mistral Large 2.1,Solo Models,Advisor,NA,F1,0.578,0.005 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,F1,0.551,0.003 -Mistral Medium 3.1,Solo Models,Advisor,NA,F1,0.551,0.003 Qwen3 235B,Solo Models,Advisor,Alibaba,F1,0.53,0.006 -Qwen3 235B,Solo Models,Advisor,NA,F1,0.53,0.006 Qwen3 32B,Solo Models,Advisor,Alibaba,F1,0.503,0.006 -Qwen3 32B,Solo Models,Advisor,NA,F1,0.503,0.006 o1,Solo Models,Advisor,OpenAI,F1,0.625,0.004 -o1,Solo Models,Advisor,NA,F1,0.625,0.004 o1 mini,Solo Models,Advisor,OpenAI,F1,0.485,0.006 -o1 mini,Solo Models,Advisor,NA,F1,0.485,0.006 o3 mini,Solo Models,Advisor,OpenAI,F1,0.571,0.006 -o3 mini,Solo Models,Advisor,NA,F1,0.571,0.006 o4 mini,Solo Models,Advisor,OpenAI,F1,0.584,0.005 -o4 mini,Solo Models,Advisor,NA,F1,0.584,0.005 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,OverallScore,0.59,0.004 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.004 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,OverallScore,0.571,0.025 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.571,0.025 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.458,0.016 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.458,0.016 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.579,0.011 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.579,0.011 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.591,0.018 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.591,0.018 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,OverallScore,0.588,0.021 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.588,0.021 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,OverallScore,0.582,0.004 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.582,0.004 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,OverallScore,0.576,0.015 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.576,0.015 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,OverallScore,0.573,0.013 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.573,0.013 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,OverallScore,0.568,0.016 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.568,0.016 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,OverallScore,0.586,0.007 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.586,0.007 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,OverallScore,0.61,0.009 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.61,0.009 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,OverallScore,0.603,0.014 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.603,0.014 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,OverallScore,0.567,0.013 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.567,0.013 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,OverallScore,0.602,0.014 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.602,0.014 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,OverallScore,0.591,0.01 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.591,0.01 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,OverallScore,0.597,0.008 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.008 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,OverallScore,0.589,0.008 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.589,0.008 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,OverallScore,0.628,0.009 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.628,0.009 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,OverallScore,0.579,0.01 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.579,0.01 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,OverallScore,0.577,0.011 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.577,0.011 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,OverallScore,0.586,0.021 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.586,0.021 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.43,0.012 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.43,0.012 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.584,0.022 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.584,0.022 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.594,0.004 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.594,0.004 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.576,0.005 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.576,0.005 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.599,0.013 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.599,0.013 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.567,0.007 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.567,0.007 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.565,0.002 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.565,0.002 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.572,0.006 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.572,0.006 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,OverallScore,0.551,0.013 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.551,0.013 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.445,0.012 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.445,0.012 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.57,0.012 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.57,0.012 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.581,0.003 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.581,0.003 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.593,0.012 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.012 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.596,0.012 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.596,0.012 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.592,0.007 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.592,0.007 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,OverallScore,0.589,0.008 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.589,0.008 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.574,0.028 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.574,0.028 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.575,0.024 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.024 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.563,0.026 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.563,0.026 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.431,0.044 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.431,0.044 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.574,0.008 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.574,0.008 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.578,0.025 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.578,0.025 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.583,0.026 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.583,0.026 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.604,0.015 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.604,0.015 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.545,0.018 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.545,0.018 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.543,0.021 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.543,0.021 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,OverallScore,0.602,0.007 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.602,0.007 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,OverallScore,0.575,0.014 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.014 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.496,0.018 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.496,0.018 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.603,0.012 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.603,0.012 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.597,0.019 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.019 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.56,0.009 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.56,0.009 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.569,0.016 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.569,0.016 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.562,0.004 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.562,0.004 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.556,0.006 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.556,0.006 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.576,0.012 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.576,0.012 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,OverallScore,0.575,0.024 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.024 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.606,0.016 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.606,0.016 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.592,0.005 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.592,0.005 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.57,0.009 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.57,0.009 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,OverallScore,0.63,0.009 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.63,0.009 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.592,0.008 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.592,0.008 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,OverallScore,0.575,0.003 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.003 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.439,0.024 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.439,0.024 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.614,0.015 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.614,0.015 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.616,0.012 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.616,0.012 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.614,0.014 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.614,0.014 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,OverallScore,0.579,0.014 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.579,0.014 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,OverallScore,0.634,0.012 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.634,0.012 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.591,0.006 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.591,0.006 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,OverallScore,0.59,0.003 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.003 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,OverallScore,0.583,0.008 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.583,0.008 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,OverallScore,0.597,0.01 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.01 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,OverallScore,0.547,0.032 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.547,0.032 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.497,0.018 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.497,0.018 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.575,0.019 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.575,0.019 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.597,0.015 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.015 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.54,0.013 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.54,0.013 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.597,0.011 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.597,0.011 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,OverallScore,0.621,0.008 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.621,0.008 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.529,0.006 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.529,0.006 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.522,0.017 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.522,0.017 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.538,0.014 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.538,0.014 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,OverallScore,0.593,0.002 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.002 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,OverallScore,0.574,0.011 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.574,0.011 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.507,0.013 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.507,0.013 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.593,0.007 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.007 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.59,0.003 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.003 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.555,0.009 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.555,0.009 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,OverallScore,0.608,0.012 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.608,0.012 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.513,0.004 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.513,0.004 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,OverallScore,0.503,0.004 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.503,0.004 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,OverallScore,0.568,0.014 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.568,0.014 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,OverallScore,0.593,0.015 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.593,0.015 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,OverallScore,0.504,0.038 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.504,0.038 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.447,0.033 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.447,0.033 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.555,0.024 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.555,0.024 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.543,0.008 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.543,0.008 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.518,0.023 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.518,0.023 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,OverallScore,0.59,0.014 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.59,0.014 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.411,0.016 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.411,0.016 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,OverallScore,0.421,0.011 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.421,0.011 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,OverallScore,0.411,0.019 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,OverallScore,0.411,0.019 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,OverallScore,0.562,0.016 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.562,0.016 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,OverallScore,0.578,0.011 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.578,0.011 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,OverallScore,0.599,0.011 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.599,0.011 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,OverallScore,0.603,0.009 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.603,0.009 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,OverallScore,0.619,0.012 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.619,0.012 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,OverallScore,0.608,0.016 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.608,0.016 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,OverallScore,0.598,0.011 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.598,0.011 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,OverallScore,0.608,0.004 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.608,0.004 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,OverallScore,0.564,0.013 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.564,0.013 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,OverallScore,0.586,0.009 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.586,0.009 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,OverallScore,0.616,0.006 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.616,0.006 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,OverallScore,0.611,0.024 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.611,0.024 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.586,0.013 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.586,0.013 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.575,0.015 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.575,0.015 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,OverallScore,0.581,0.013 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.581,0.013 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,OverallScore,0.609,0.009 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.609,0.009 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.579,0.013 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.579,0.013 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,OverallScore,0.591,0.018 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.591,0.018 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,OverallScore,0.622,0.012 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.622,0.012 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,OverallScore,0.591,0.008 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.591,0.008 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,OverallScore,0.615,0.016 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.016 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,OverallScore,0.623,0.013 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.623,0.013 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,OverallScore,0.611,0.012 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.611,0.012 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,OverallScore,0.581,0.015 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.581,0.015 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,OverallScore,0.627,0.021 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.627,0.021 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.529,0.006 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.529,0.006 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,OverallScore,0.612,0.01 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.612,0.01 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,OverallScore,0.632,0.011 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.632,0.011 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,OverallScore,0.615,0.011 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.011 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,OverallScore,0.643,0.011 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.643,0.011 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,OverallScore,0.613,0.022 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.613,0.022 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,OverallScore,0.594,0.013 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.594,0.013 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,OverallScore,0.615,0.011 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.011 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.515,0.007 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.515,0.007 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.517,0.006 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.517,0.006 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,OverallScore,0.554,0.02 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.554,0.02 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,OverallScore,0.597,0.019 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.597,0.019 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,OverallScore,0.615,0.03 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.615,0.03 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.507,0.005 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.507,0.005 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,OverallScore,0.506,0.008 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,OverallScore,0.506,0.008 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,OverallScore,0.576,0.007 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,OverallScore,0.576,0.007 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,OverallScore,0.537,0.007 -Claude Haiku 4.5,Solo Models,Advisor,NA,OverallScore,0.537,0.007 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,OverallScore,0.582,0.008 -Claude Sonnet 4.5,Solo Models,Advisor,NA,OverallScore,0.582,0.008 DeepSeek R1,Solo Models,Advisor,DeepSeek,OverallScore,0.581,0.006 -DeepSeek R1,Solo Models,Advisor,NA,OverallScore,0.581,0.006 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,OverallScore,0.577,0.009 -DeepSeek V3.1,Solo Models,Advisor,NA,OverallScore,0.577,0.009 -Expert AI,Solo Models,Advisor,NA,OverallScore,0.591,0.006 GPT-4.1,Solo Models,Advisor,OpenAI,OverallScore,0.564,0.004 -GPT-4.1,Solo Models,Advisor,NA,OverallScore,0.564,0.004 GPT-4.1 mini,Solo Models,Advisor,OpenAI,OverallScore,0.497,0.006 -GPT-4.1 mini,Solo Models,Advisor,NA,OverallScore,0.497,0.006 GPT-4o,Solo Models,Advisor,OpenAI,OverallScore,0.536,0.015 -GPT-4o,Solo Models,Advisor,NA,OverallScore,0.536,0.015 GPT-4o mini,Solo Models,Advisor,OpenAI,OverallScore,0.437,0.012 -GPT-4o mini,Solo Models,Advisor,NA,OverallScore,0.437,0.012 GPT-5,Solo Models,Advisor,OpenAI,OverallScore,0.583,0.009 -GPT-5,Solo Models,Advisor,NA,OverallScore,0.583,0.009 GPT-5 mini,Solo Models,Advisor,OpenAI,OverallScore,0.57,0.01 -GPT-5 mini,Solo Models,Advisor,NA,OverallScore,0.57,0.01 GPT-5 nano,Solo Models,Advisor,OpenAI,OverallScore,0.511,0.012 -GPT-5 nano,Solo Models,Advisor,NA,OverallScore,0.511,0.012 Gemini 2.0 Flash,Solo Models,Advisor,Google,OverallScore,0.556,0.005 -Gemini 2.0 Flash,Solo Models,Advisor,NA,OverallScore,0.556,0.005 Gemini 2.5 Flash,Solo Models,Advisor,Google,OverallScore,0.582,0.007 -Gemini 2.5 Flash,Solo Models,Advisor,NA,OverallScore,0.582,0.007 Gemini 2.5 Pro,Solo Models,Advisor,Google,OverallScore,0.599,0.007 -Gemini 2.5 Pro,Solo Models,Advisor,NA,OverallScore,0.599,0.007 Gemini 3 Pro,Solo Models,Advisor,Google,OverallScore,0.548,0.013 -Gemini 3 Pro,Solo Models,Advisor,NA,OverallScore,0.548,0.013 Glass Health 4.0,Solo Models,Advisor,Glass Health,OverallScore,0.59,0.012 -Glass Health 4.0,Solo Models,Advisor,NA,OverallScore,0.59,0.012 Grok 4,Solo Models,Advisor,xAI,OverallScore,0.58,0.012 -Grok 4,Solo Models,Advisor,NA,OverallScore,0.58,0.012 Grok 4 Fast,Solo Models,Advisor,xAI,OverallScore,0.572,0.013 -Grok 4 Fast,Solo Models,Advisor,NA,OverallScore,0.572,0.013 Kimi K2,Solo Models,Advisor,Moonshot AI,OverallScore,0.561,0.009 -Kimi K2,Solo Models,Advisor,NA,OverallScore,0.561,0.009 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,OverallScore,0.623,0.005 -LiSA 1.0,Solo Models,Advisor,NA,OverallScore,0.623,0.005 Llama 3.3 70b,Solo Models,Advisor,Meta,OverallScore,0.511,0.007 -Llama 3.3 70b,Solo Models,Advisor,NA,OverallScore,0.511,0.007 Llama 4 Maverick,Solo Models,Advisor,Meta,OverallScore,0.535,0.004 -Llama 4 Maverick,Solo Models,Advisor,NA,OverallScore,0.535,0.004 Llama 4 Scout,Solo Models,Advisor,Meta,OverallScore,0.496,0.003 -Llama 4 Scout,Solo Models,Advisor,NA,OverallScore,0.496,0.003 MedGemma 27B,Solo Models,Advisor,Google,OverallScore,0.523,0.011 -MedGemma 27B,Solo Models,Advisor,NA,OverallScore,0.523,0.011 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,OverallScore,0.537,0.013 -Mistral Large 2.1,Solo Models,Advisor,NA,OverallScore,0.537,0.013 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,OverallScore,0.502,0.012 -Mistral Medium 3.1,Solo Models,Advisor,NA,OverallScore,0.502,0.012 Qwen3 235B,Solo Models,Advisor,Alibaba,OverallScore,0.527,0.018 -Qwen3 235B,Solo Models,Advisor,NA,OverallScore,0.527,0.018 Qwen3 32B,Solo Models,Advisor,Alibaba,OverallScore,0.488,0.01 -Qwen3 32B,Solo Models,Advisor,NA,OverallScore,0.488,0.01 o1,Solo Models,Advisor,OpenAI,OverallScore,0.532,0.013 -o1,Solo Models,Advisor,NA,OverallScore,0.532,0.013 o1 mini,Solo Models,Advisor,OpenAI,OverallScore,0.475,0.014 -o1 mini,Solo Models,Advisor,NA,OverallScore,0.475,0.014 o3 mini,Solo Models,Advisor,OpenAI,OverallScore,0.427,0.01 -o3 mini,Solo Models,Advisor,NA,OverallScore,0.427,0.01 o4 mini,Solo Models,Advisor,OpenAI,OverallScore,0.479,0.013 -o4 mini,Solo Models,Advisor,NA,OverallScore,0.479,0.013 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Precision,0.505,0.002 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.505,0.002 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Precision,0.542,0.006 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.542,0.006 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.313,0.016 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.313,0.016 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.59,0.004 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.59,0.004 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.54,0.009 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.54,0.009 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Precision,0.486,0.007 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.486,0.007 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Precision,0.523,0.01 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.523,0.01 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Precision,0.495,0.01 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.495,0.01 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Precision,0.486,0.006 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.486,0.006 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Precision,0.511,0.01 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.511,0.01 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Precision,0.54,0.026 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.54,0.026 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Precision,0.561,0.004 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.561,0.004 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Precision,0.514,0.004 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.514,0.004 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Precision,0.519,0.007 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.519,0.007 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Precision,0.517,0.009 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.517,0.009 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Precision,0.462,0.004 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.462,0.004 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Precision,0.488,0.005 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.488,0.005 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Precision,0.5,0.011 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.5,0.011 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Precision,0.518,0.004 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.518,0.004 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Precision,0.478,0.006 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.478,0.006 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Precision,0.495,0.004 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.495,0.004 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Precision,0.486,0.006 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.486,0.006 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.27,0.015 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.27,0.015 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.576,0.009 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.576,0.009 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.509,0.003 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.509,0.003 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.43,0.004 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.43,0.004 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.484,0.01 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.484,0.01 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.43,0.009 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.43,0.009 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.423,0.009 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.423,0.009 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.455,0.01 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.455,0.01 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Precision,0.634,0.002 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.634,0.002 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.291,0.013 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.291,0.013 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.618,0.011 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.618,0.011 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.601,0.01 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.601,0.01 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.604,0.005 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.604,0.005 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.612,0.016 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.612,0.016 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.567,0.031 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.567,0.031 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Precision,0.615,0.006 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.615,0.006 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.635,0.002 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.635,0.002 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.567,0.011 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.567,0.011 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.636,0.002 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.636,0.002 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.276,0.03 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.276,0.03 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.606,0.013 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.606,0.013 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.57,0.002 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.57,0.002 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.567,0.013 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.567,0.013 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.545,0.012 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.545,0.012 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.564,0.014 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.564,0.014 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.581,0.019 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.581,0.019 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Precision,0.468,0.003 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.468,0.003 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Precision,0.489,0.005 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.489,0.005 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.322,0.004 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.322,0.004 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.575,0.012 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.575,0.012 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.493,0.009 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.493,0.009 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.387,0.002 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.387,0.002 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.472,0.015 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.472,0.015 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.389,0.003 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.389,0.003 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.378,0.005 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.378,0.005 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.431,0.005 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.431,0.005 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Precision,0.501,0.008 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.501,0.008 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.484,0.006 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.484,0.006 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.457,0.002 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.457,0.002 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.456,0.015 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.456,0.015 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Precision,0.497,0.003 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.497,0.003 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.45,0.003 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.45,0.003 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Precision,0.561,0.008 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.561,0.008 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.262,0.019 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.262,0.019 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.596,0.005 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.596,0.005 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.543,0.004 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.543,0.004 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.528,0.005 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.528,0.005 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Precision,0.5,0.02 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.5,0.02 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Precision,0.556,0.003 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.556,0.003 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.537,0.009 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.537,0.009 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Precision,0.533,0.016 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.533,0.016 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Precision,0.536,0.011 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.536,0.011 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Precision,0.491,0.007 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.491,0.007 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Precision,0.523,0.007 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.523,0.007 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.345,0.025 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.345,0.025 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.6,0.009 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.6,0.009 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.526,0.007 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.526,0.007 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.405,0.003 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.405,0.003 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.466,0.004 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.466,0.004 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Precision,0.495,0.003 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Precision,0.495,0.003 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.417,0.002 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.417,0.002 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.405,0.005 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.405,0.005 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.471,0.01 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.471,0.01 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Precision,0.472,0.017 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.472,0.017 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Precision,0.503,0.004 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.503,0.004 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.346,0.007 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.346,0.007 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.601,0.008 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.601,0.008 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.507,0.007 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.507,0.007 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.371,0.002 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.371,0.002 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Precision,0.471,0.011 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.471,0.011 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.355,0.003 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.355,0.003 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Precision,0.347,0.004 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.347,0.004 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Precision,0.439,0.008 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.439,0.008 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Precision,0.562,0.008 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Precision,0.562,0.008 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Precision,0.624,0.025 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.624,0.025 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.319,0.028 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Precision,0.319,0.028 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.646,0.008 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Precision,0.646,0.008 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.597,0.009 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.597,0.009 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.577,0.014 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Precision,0.577,0.014 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Precision,0.554,0.023 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Precision,0.554,0.023 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.644,0.016 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Precision,0.644,0.016 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Precision,0.603,0.03 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Precision,0.603,0.03 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Precision,0.615,0.042 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Precision,0.615,0.042 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Precision,0.518,0.005 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.518,0.005 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Precision,0.573,0.006 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.573,0.006 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Precision,0.474,0.007 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.474,0.007 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Precision,0.518,0.006 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.518,0.006 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Precision,0.509,0.005 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.509,0.005 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Precision,0.567,0.004 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.567,0.004 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Precision,0.504,0.006 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.504,0.006 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Precision,0.571,0.007 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.571,0.007 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Precision,0.626,0.007 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.626,0.007 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Precision,0.484,0.018 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.484,0.018 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Precision,0.495,0.004 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.495,0.004 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Precision,0.554,0.012 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.554,0.012 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.455,0.004 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.455,0.004 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.493,0.005 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.493,0.005 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Precision,0.498,0.007 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.498,0.007 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Precision,0.55,0.006 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.55,0.006 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.502,0.01 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.502,0.01 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Precision,0.496,0.005 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.496,0.005 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Precision,0.56,0.003 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.56,0.003 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Precision,0.496,0.011 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.496,0.011 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Precision,0.572,0.005 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.572,0.005 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Precision,0.553,0.003 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.553,0.003 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Precision,0.581,0.005 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.581,0.005 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Precision,0.531,0.015 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.531,0.015 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Precision,0.567,0.015 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.567,0.015 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.418,0.004 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.418,0.004 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Precision,0.484,0.01 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.484,0.01 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Precision,0.56,0.006 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.56,0.006 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Precision,0.489,0.006 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.489,0.006 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Precision,0.515,0.006 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.515,0.006 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Precision,0.479,0.012 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.479,0.012 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Precision,0.582,0.008 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.582,0.008 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Precision,0.504,0.009 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.504,0.009 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.354,0.004 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.354,0.004 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.354,0.004 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.354,0.004 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Precision,0.479,0.006 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.479,0.006 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Precision,0.584,0.006 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.584,0.006 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Precision,0.49,0.007 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.49,0.007 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.348,0.004 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.348,0.004 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Precision,0.346,0.007 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Precision,0.346,0.007 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Precision,0.499,0.004 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,Precision,0.499,0.004 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Precision,0.444,0.007 -Claude Haiku 4.5,Solo Models,Advisor,NA,Precision,0.444,0.007 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Precision,0.555,0.005 -Claude Sonnet 4.5,Solo Models,Advisor,NA,Precision,0.555,0.005 DeepSeek R1,Solo Models,Advisor,DeepSeek,Precision,0.481,0.006 -DeepSeek R1,Solo Models,Advisor,NA,Precision,0.481,0.006 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Precision,0.425,0.004 -DeepSeek V3.1,Solo Models,Advisor,NA,Precision,0.425,0.004 -Expert AI,Solo Models,Advisor,NA,Precision,0.437,0.007 GPT-4.1,Solo Models,Advisor,OpenAI,Precision,0.434,0.003 -GPT-4.1,Solo Models,Advisor,NA,Precision,0.434,0.003 GPT-4.1 mini,Solo Models,Advisor,OpenAI,Precision,0.423,0.003 -GPT-4.1 mini,Solo Models,Advisor,NA,Precision,0.423,0.003 GPT-4o,Solo Models,Advisor,OpenAI,Precision,0.416,0.006 -GPT-4o,Solo Models,Advisor,NA,Precision,0.416,0.006 GPT-4o mini,Solo Models,Advisor,OpenAI,Precision,0.455,0.006 -GPT-4o mini,Solo Models,Advisor,NA,Precision,0.455,0.006 GPT-5,Solo Models,Advisor,OpenAI,Precision,0.604,0.012 -GPT-5,Solo Models,Advisor,NA,Precision,0.604,0.012 GPT-5 mini,Solo Models,Advisor,OpenAI,Precision,0.567,0.007 -GPT-5 mini,Solo Models,Advisor,NA,Precision,0.567,0.007 GPT-5 nano,Solo Models,Advisor,OpenAI,Precision,0.475,0.006 -GPT-5 nano,Solo Models,Advisor,NA,Precision,0.475,0.006 Gemini 2.0 Flash,Solo Models,Advisor,Google,Precision,0.385,0.003 -Gemini 2.0 Flash,Solo Models,Advisor,NA,Precision,0.385,0.003 Gemini 2.5 Flash,Solo Models,Advisor,Google,Precision,0.429,0.01 -Gemini 2.5 Flash,Solo Models,Advisor,NA,Precision,0.429,0.01 Gemini 2.5 Pro,Solo Models,Advisor,Google,Precision,0.515,0.012 -Gemini 2.5 Pro,Solo Models,Advisor,NA,Precision,0.515,0.012 Gemini 3 Pro,Solo Models,Advisor,Google,Precision,0.615,0.019 -Gemini 3 Pro,Solo Models,Advisor,NA,Precision,0.615,0.019 Glass Health 4.0,Solo Models,Advisor,Glass Health,Precision,0.589,0.016 -Glass Health 4.0,Solo Models,Advisor,NA,Precision,0.589,0.016 Grok 4,Solo Models,Advisor,xAI,Precision,0.5,0.013 -Grok 4,Solo Models,Advisor,NA,Precision,0.5,0.013 Grok 4 Fast,Solo Models,Advisor,xAI,Precision,0.487,0.013 -Grok 4 Fast,Solo Models,Advisor,NA,Precision,0.487,0.013 Kimi K2,Solo Models,Advisor,Moonshot AI,Precision,0.406,0.007 -Kimi K2,Solo Models,Advisor,NA,Precision,0.406,0.007 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Precision,0.493,0.004 -LiSA 1.0,Solo Models,Advisor,NA,Precision,0.493,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,Precision,0.361,0.005 -Llama 3.3 70b,Solo Models,Advisor,NA,Precision,0.361,0.005 Llama 4 Maverick,Solo Models,Advisor,Meta,Precision,0.415,0.003 -Llama 4 Maverick,Solo Models,Advisor,NA,Precision,0.415,0.003 Llama 4 Scout,Solo Models,Advisor,Meta,Precision,0.343,0.002 -Llama 4 Scout,Solo Models,Advisor,NA,Precision,0.343,0.002 MedGemma 27B,Solo Models,Advisor,Google,Precision,0.427,0.006 -MedGemma 27B,Solo Models,Advisor,NA,Precision,0.427,0.006 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Precision,0.474,0.015 -Mistral Large 2.1,Solo Models,Advisor,NA,Precision,0.474,0.015 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Precision,0.434,0.007 -Mistral Medium 3.1,Solo Models,Advisor,NA,Precision,0.434,0.007 Qwen3 235B,Solo Models,Advisor,Alibaba,Precision,0.407,0.004 -Qwen3 235B,Solo Models,Advisor,NA,Precision,0.407,0.004 Qwen3 32B,Solo Models,Advisor,Alibaba,Precision,0.379,0.006 -Qwen3 32B,Solo Models,Advisor,NA,Precision,0.379,0.006 o1,Solo Models,Advisor,OpenAI,Precision,0.551,0.005 -o1,Solo Models,Advisor,NA,Precision,0.551,0.005 o1 mini,Solo Models,Advisor,OpenAI,Precision,0.37,0.006 -o1 mini,Solo Models,Advisor,NA,Precision,0.37,0.006 o3 mini,Solo Models,Advisor,OpenAI,Precision,0.635,0.008 -o3 mini,Solo Models,Advisor,NA,Precision,0.635,0.008 o4 mini,Solo Models,Advisor,OpenAI,Precision,0.588,0.005 -o4 mini,Solo Models,Advisor,NA,Precision,0.588,0.005 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Recall,0.799,0.011 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.799,0.011 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Recall,0.761,0.008 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.761,0.008 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.839,0.008 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.839,0.008 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.759,0.014 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.759,0.014 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.801,0.007 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.801,0.007 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Recall,0.812,0.003 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.812,0.003 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Recall,0.812,0.012 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.812,0.012 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Recall,0.797,0.003 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.797,0.003 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Recall,0.798,0.002 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.798,0.002 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Recall,0.794,0.006 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.794,0.006 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Recall,0.782,0.005 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.782,0.005 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Recall,0.789,0.007 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.789,0.007 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Recall,0.8,0.007 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.8,0.007 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Recall,0.762,0.006 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.762,0.006 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Recall,0.802,0.011 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.802,0.011 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Recall,0.824,0.008 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.008 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Recall,0.823,0.003 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.823,0.003 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Recall,0.79,0.005 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.79,0.005 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Recall,0.826,0.003 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.826,0.003 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Recall,0.8,0.007 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.8,0.007 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Recall,0.79,0.007 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.79,0.007 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Recall,0.8,0.015 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.8,0.015 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.834,0.003 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.834,0.003 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.756,0.002 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.756,0.002 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.808,0.008 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.808,0.008 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.82,0.008 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.82,0.008 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.816,0.011 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.816,0.011 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.804,0.005 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.804,0.005 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.807,0.007 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.807,0.007 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.789,0.006 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.789,0.006 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Recall,0.717,0.011 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.717,0.011 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.81,0.016 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.81,0.016 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.735,0.008 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.735,0.008 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.754,0.008 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.754,0.008 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.75,0.007 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.75,0.007 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.754,0.01 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.754,0.01 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.771,0.01 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.771,0.01 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Recall,0.748,0.009 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.748,0.009 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.723,0.012 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.723,0.012 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.729,0.009 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.729,0.009 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.717,0.013 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.717,0.013 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.843,0.007 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.843,0.007 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.747,0.009 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.747,0.009 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.762,0.011 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.762,0.011 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.768,0.023 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.768,0.023 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.78,0.023 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.78,0.023 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.736,0.015 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.736,0.015 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.733,0.015 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.733,0.015 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Recall,0.83,0.007 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.83,0.007 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Recall,0.786,0.011 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.786,0.011 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.873,0.022 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.873,0.022 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.778,0.008 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.778,0.008 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.825,0.007 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.825,0.007 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.858,0.012 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.858,0.012 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.807,0.01 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.807,0.01 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.861,0.008 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.861,0.008 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.858,0.012 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.858,0.012 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.833,0.008 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.833,0.008 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Recall,0.78,0.022 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.78,0.022 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.824,0.006 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.006 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.831,0.004 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.831,0.004 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.799,0.009 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.799,0.009 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Recall,0.843,0.006 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.843,0.006 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.825,0.006 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.825,0.006 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Recall,0.752,0.003 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.752,0.003 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.866,0.029 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.866,0.029 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.777,0.01 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.777,0.01 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.81,0.008 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.81,0.008 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.808,0.009 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.808,0.009 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Recall,0.763,0.014 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.763,0.014 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Recall,0.808,0.009 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.808,0.009 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.783,0.009 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.783,0.009 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Recall,0.783,0.008 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.783,0.008 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Recall,0.779,0.006 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.779,0.006 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Recall,0.807,0.011 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.807,0.011 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Recall,0.735,0.008 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.735,0.008 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.85,0.008 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.85,0.008 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.753,0.014 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.753,0.014 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.798,0.007 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.798,0.007 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.824,0.006 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.006 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.83,0.017 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.83,0.017 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Recall,0.831,0.004 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Recall,0.831,0.004 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.783,0.006 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.783,0.006 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.792,0.009 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.792,0.009 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.763,0.007 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.763,0.007 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Recall,0.824,0.012 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.824,0.012 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Recall,0.755,0.006 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.755,0.006 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.849,0.012 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.849,0.012 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.763,0.011 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.763,0.011 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.799,0.003 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.799,0.003 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.852,0.009 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.852,0.009 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Recall,0.831,0.012 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.831,0.012 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.795,0.006 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.795,0.006 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Recall,0.791,0.005 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.791,0.005 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Recall,0.773,0.002 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.773,0.002 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Recall,0.73,0.015 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Recall,0.73,0.015 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Recall,0.64,0.013 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.64,0.013 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.732,0.02 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Recall,0.732,0.02 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.696,0.007 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Recall,0.696,0.007 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.717,0.011 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.717,0.011 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.656,0.014 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Recall,0.656,0.014 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Recall,0.772,0.003 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Recall,0.772,0.003 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.516,0.02 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Recall,0.516,0.02 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Recall,0.519,0.016 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Recall,0.519,0.016 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Recall,0.512,0.019 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Recall,0.512,0.019 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Recall,0.757,0.007 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.757,0.007 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Recall,0.759,0.008 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.759,0.008 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Recall,0.826,0.006 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.826,0.006 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Recall,0.797,0.005 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.005 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Recall,0.824,0.002 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.824,0.002 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Recall,0.795,0.008 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.795,0.008 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Recall,0.809,0.009 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.809,0.009 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Recall,0.784,0.009 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.784,0.009 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Recall,0.731,0.008 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.731,0.008 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Recall,0.811,0.009 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.811,0.009 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Recall,0.84,0.006 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.84,0.006 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Recall,0.796,0.013 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.796,0.013 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.829,0.007 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.829,0.007 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.789,0.013 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.789,0.013 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Recall,0.794,0.007 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.794,0.007 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Recall,0.799,0.008 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.799,0.008 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.788,0.01 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.788,0.01 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Recall,0.797,0.005 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.005 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Recall,0.809,0.009 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.809,0.009 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Recall,0.8,0.006 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.8,0.006 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Recall,0.784,0.01 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.784,0.01 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Recall,0.806,0.009 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.806,0.009 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Recall,0.768,0.015 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.768,0.015 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Recall,0.757,0.012 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.757,0.012 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Recall,0.808,0.005 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.808,0.005 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.785,0.007 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.785,0.007 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Recall,0.831,0.011 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.831,0.011 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Recall,0.817,0.009 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.817,0.009 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Recall,0.833,0.012 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.833,0.012 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Recall,0.844,0.01 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.844,0.01 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Recall,0.845,0.014 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.845,0.014 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Recall,0.771,0.008 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.771,0.008 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Recall,0.816,0.012 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.816,0.012 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.797,0.009 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.009 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.797,0.009 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.797,0.009 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Recall,0.749,0.01 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.749,0.01 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Recall,0.772,0.014 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.772,0.014 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Recall,0.819,0.019 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.819,0.019 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.792,0.008 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.792,0.008 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Recall,0.795,0.008 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Recall,0.795,0.008 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Recall,0.796,0.005 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,Recall,0.796,0.005 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Recall,0.768,0.01 -Claude Haiku 4.5,Solo Models,Advisor,NA,Recall,0.768,0.01 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Recall,0.762,0.005 -Claude Sonnet 4.5,Solo Models,Advisor,NA,Recall,0.762,0.005 DeepSeek R1,Solo Models,Advisor,DeepSeek,Recall,0.802,0.005 -DeepSeek R1,Solo Models,Advisor,NA,Recall,0.802,0.005 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Recall,0.835,0.006 -DeepSeek V3.1,Solo Models,Advisor,NA,Recall,0.835,0.006 -Expert AI,Solo Models,Advisor,NA,Recall,0.825,0.009 GPT-4.1,Solo Models,Advisor,OpenAI,Recall,0.806,0.004 -GPT-4.1,Solo Models,Advisor,NA,Recall,0.806,0.004 GPT-4.1 mini,Solo Models,Advisor,OpenAI,Recall,0.747,0.004 -GPT-4.1 mini,Solo Models,Advisor,NA,Recall,0.747,0.004 GPT-4o,Solo Models,Advisor,OpenAI,Recall,0.787,0.02 -GPT-4o,Solo Models,Advisor,NA,Recall,0.787,0.02 GPT-4o mini,Solo Models,Advisor,OpenAI,Recall,0.571,0.01 -GPT-4o mini,Solo Models,Advisor,NA,Recall,0.571,0.01 GPT-5,Solo Models,Advisor,OpenAI,Recall,0.752,0.01 -GPT-5,Solo Models,Advisor,NA,Recall,0.752,0.01 GPT-5 mini,Solo Models,Advisor,OpenAI,Recall,0.749,0.011 -GPT-5 mini,Solo Models,Advisor,NA,Recall,0.749,0.011 GPT-5 nano,Solo Models,Advisor,OpenAI,Recall,0.682,0.013 -GPT-5 nano,Solo Models,Advisor,NA,Recall,0.682,0.013 Gemini 2.0 Flash,Solo Models,Advisor,Google,Recall,0.857,0.005 -Gemini 2.0 Flash,Solo Models,Advisor,NA,Recall,0.857,0.005 Gemini 2.5 Flash,Solo Models,Advisor,Google,Recall,0.833,0.006 -Gemini 2.5 Flash,Solo Models,Advisor,NA,Recall,0.833,0.006 Gemini 2.5 Pro,Solo Models,Advisor,Google,Recall,0.805,0.016 -Gemini 2.5 Pro,Solo Models,Advisor,NA,Recall,0.805,0.016 Gemini 3 Pro,Solo Models,Advisor,Google,Recall,0.694,0.025 -Gemini 3 Pro,Solo Models,Advisor,NA,Recall,0.694,0.025 Glass Health 4.0,Solo Models,Advisor,Glass Health,Recall,0.761,0.022 -Glass Health 4.0,Solo Models,Advisor,NA,Recall,0.761,0.022 Grok 4,Solo Models,Advisor,xAI,Recall,0.793,0.02 -Grok 4,Solo Models,Advisor,NA,Recall,0.793,0.02 Grok 4 Fast,Solo Models,Advisor,xAI,Recall,0.77,0.021 -Grok 4 Fast,Solo Models,Advisor,NA,Recall,0.77,0.021 Kimi K2,Solo Models,Advisor,Moonshot AI,Recall,0.831,0.01 -Kimi K2,Solo Models,Advisor,NA,Recall,0.831,0.01 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Recall,0.846,0.004 -LiSA 1.0,Solo Models,Advisor,NA,Recall,0.846,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,Recall,0.814,0.009 -Llama 3.3 70b,Solo Models,Advisor,NA,Recall,0.814,0.009 Llama 4 Maverick,Solo Models,Advisor,Meta,Recall,0.79,0.005 -Llama 4 Maverick,Solo Models,Advisor,NA,Recall,0.79,0.005 Llama 4 Scout,Solo Models,Advisor,Meta,Recall,0.793,0.003 -Llama 4 Scout,Solo Models,Advisor,NA,Recall,0.793,0.003 MedGemma 27B,Solo Models,Advisor,Google,Recall,0.738,0.007 -MedGemma 27B,Solo Models,Advisor,NA,Recall,0.738,0.007 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Recall,0.747,0.024 -Mistral Large 2.1,Solo Models,Advisor,NA,Recall,0.747,0.024 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Recall,0.757,0.013 -Mistral Medium 3.1,Solo Models,Advisor,NA,Recall,0.757,0.013 Qwen3 235B,Solo Models,Advisor,Alibaba,Recall,0.758,0.014 -Qwen3 235B,Solo Models,Advisor,NA,Recall,0.758,0.014 Qwen3 32B,Solo Models,Advisor,Alibaba,Recall,0.751,0.01 -Qwen3 32B,Solo Models,Advisor,NA,Recall,0.751,0.01 o1,Solo Models,Advisor,OpenAI,Recall,0.721,0.005 -o1,Solo Models,Advisor,NA,Recall,0.721,0.005 o1 mini,Solo Models,Advisor,OpenAI,Recall,0.704,0.007 -o1 mini,Solo Models,Advisor,NA,Recall,0.704,0.007 o3 mini,Solo Models,Advisor,OpenAI,Recall,0.519,0.009 -o3 mini,Solo Models,Advisor,NA,Recall,0.519,0.009 o4 mini,Solo Models,Advisor,OpenAI,Recall,0.58,0.009 -o4 mini,Solo Models,Advisor,NA,Recall,0.58,0.009 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Restraint,0.542,0.002 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.542,0.002 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Restraint,0.573,0.007 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.573,0.007 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.444,0.015 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.444,0.015 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.614,0.002 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.614,0.002 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.574,0.005 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.574,0.005 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Restraint,0.533,0.005 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.533,0.005 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Restraint,0.556,0.006 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.556,0.006 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Restraint,0.54,0.009 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.54,0.009 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Restraint,0.533,0.007 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.533,0.007 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Restraint,0.551,0.011 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.551,0.011 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Restraint,0.583,0.009 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.583,0.009 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Restraint,0.599,0.003 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.599,0.003 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Restraint,0.544,0.005 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.544,0.005 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Restraint,0.551,0.008 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.551,0.008 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Restraint,0.55,0.008 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.55,0.008 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Restraint,0.507,0.005 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.507,0.005 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Restraint,0.526,0.005 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.526,0.005 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Restraint,0.543,0.005 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.543,0.005 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Restraint,0.556,0.004 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.556,0.004 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Restraint,0.52,0.006 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.52,0.006 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Restraint,0.532,0.005 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.532,0.005 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Restraint,0.523,0.008 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.523,0.008 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.402,0.01 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.402,0.01 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.602,0.007 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.602,0.007 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.547,0.001 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.547,0.001 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.485,0.002 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.485,0.002 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.535,0.006 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.535,0.006 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.486,0.003 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.486,0.003 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.481,0.003 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.481,0.003 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.503,0.009 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.503,0.009 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Restraint,0.66,0 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.66,0 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.447,0.011 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.447,0.011 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.644,0.008 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.644,0.008 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.631,0.009 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.631,0.009 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.637,0.005 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.637,0.005 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.639,0.007 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.639,0.007 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.615,0.021 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.615,0.021 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Restraint,0.646,0.005 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.646,0.005 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.662,0.002 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.662,0.002 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.632,0.005 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.632,0.005 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.664,0.001 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.664,0.001 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.426,0.024 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.426,0.024 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.633,0.011 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.633,0.011 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.606,0.003 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.606,0.003 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.609,0.012 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.609,0.012 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.6,0.003 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.6,0.003 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.606,0.011 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.606,0.011 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.617,0.012 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.617,0.012 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Restraint,0.508,0.003 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.508,0.003 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Restraint,0.528,0.005 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.528,0.005 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.424,0.008 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.424,0.008 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.604,0.013 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.604,0.013 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.535,0.011 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.535,0.011 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.452,0.002 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.452,0.002 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.52,0.008 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.52,0.008 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.455,0.003 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.455,0.003 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.448,0.004 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.448,0.004 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.487,0.004 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.487,0.004 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Restraint,0.539,0.007 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.539,0.007 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.525,0.006 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.525,0.006 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.503,0.002 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.503,0.002 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.517,0.009 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.517,0.009 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Restraint,0.538,0.003 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.538,0.003 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.499,0.003 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.499,0.003 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Restraint,0.586,0.006 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.586,0.006 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.403,0.013 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.403,0.013 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.615,0.001 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.615,0.001 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.57,0.004 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.57,0.004 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.563,0.002 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.563,0.002 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Restraint,0.551,0.008 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.551,0.008 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Restraint,0.586,0.003 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.586,0.003 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.569,0.005 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.569,0.005 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Restraint,0.567,0.007 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.567,0.007 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Restraint,0.569,0.004 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.569,0.004 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Restraint,0.529,0.008 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.529,0.008 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Restraint,0.56,0.01 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.56,0.01 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.45,0.021 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.45,0.021 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.626,0.006 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.626,0.006 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.566,0.012 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.566,0.012 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.476,0.004 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.476,0.004 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.523,0.007 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.523,0.007 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Restraint,0.543,0.003 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.543,0.003 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.491,0.002 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.491,0.002 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.484,0.005 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.484,0.005 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.529,0.005 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.529,0.005 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Restraint,0.518,0.009 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.518,0.009 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Restraint,0.547,0.005 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.547,0.005 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.441,0.002 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.441,0.002 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.629,0.009 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.629,0.009 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.554,0.006 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.554,0.006 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.455,0.003 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.455,0.003 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Restraint,0.52,0.007 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.52,0.007 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.451,0.003 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.451,0.003 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Restraint,0.448,0.004 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.448,0.004 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Restraint,0.513,0.004 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.513,0.004 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Restraint,0.61,0.007 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.61,0.007 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Restraint,0.665,0.022 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.665,0.022 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.472,0.023 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.472,0.023 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.672,0.002 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.672,0.002 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.634,0.007 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.634,0.007 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.629,0.015 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.629,0.015 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Restraint,0.597,0.012 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.597,0.012 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.694,0.02 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.694,0.02 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Restraint,0.668,0.02 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.668,0.02 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Restraint,0.685,0.027 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Restraint,0.685,0.027 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Restraint,0.551,0.006 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.551,0.006 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Restraint,0.595,0.005 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.595,0.005 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Restraint,0.515,0.006 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.515,0.006 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Restraint,0.548,0.003 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.548,0.003 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Restraint,0.547,0.005 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.547,0.005 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Restraint,0.59,0.004 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.59,0.004 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Restraint,0.538,0.005 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.538,0.005 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Restraint,0.591,0.008 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.591,0.008 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Restraint,0.651,0.008 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.651,0.008 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Restraint,0.523,0.009 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.523,0.009 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Restraint,0.536,0.003 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.536,0.003 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Restraint,0.579,0.013 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.579,0.013 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.502,0.003 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.502,0.003 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.53,0.003 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.53,0.003 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Restraint,0.539,0.008 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.539,0.008 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Restraint,0.573,0.006 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.573,0.006 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.539,0.007 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.539,0.007 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Restraint,0.533,0.006 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.533,0.006 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Restraint,0.584,0.005 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.584,0.005 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Restraint,0.534,0.006 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.534,0.006 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Restraint,0.593,0.005 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.593,0.005 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Restraint,0.58,0.004 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.58,0.004 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Restraint,0.602,0.003 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.602,0.003 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Restraint,0.567,0.007 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.567,0.007 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Restraint,0.588,0.015 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.588,0.015 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.492,0.004 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.492,0.004 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Restraint,0.524,0.007 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.524,0.007 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Restraint,0.583,0.007 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.583,0.007 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Restraint,0.526,0.005 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.526,0.005 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Restraint,0.548,0.006 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.548,0.006 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Restraint,0.52,0.006 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.52,0.006 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Restraint,0.608,0.008 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.608,0.008 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Restraint,0.54,0.008 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.54,0.008 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.45,0.004 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.45,0.004 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.45,0.003 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.45,0.003 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Restraint,0.53,0.005 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.53,0.005 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Restraint,0.609,0.006 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.609,0.006 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Restraint,0.53,0.002 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.53,0.002 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.448,0.005 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.448,0.005 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Restraint,0.447,0.007 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Restraint,0.447,0.007 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Restraint,0.543,0.004 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,Restraint,0.543,0.004 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Restraint,0.494,0.007 -Claude Haiku 4.5,Solo Models,Advisor,NA,Restraint,0.494,0.007 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Restraint,0.591,0.005 -Claude Sonnet 4.5,Solo Models,Advisor,NA,Restraint,0.591,0.005 DeepSeek R1,Solo Models,Advisor,DeepSeek,Restraint,0.521,0.005 -DeepSeek R1,Solo Models,Advisor,NA,Restraint,0.521,0.005 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Restraint,0.478,0.004 -DeepSeek V3.1,Solo Models,Advisor,NA,Restraint,0.478,0.004 -Expert AI,Solo Models,Advisor,NA,Restraint,0.5,0.005 GPT-4.1,Solo Models,Advisor,OpenAI,Restraint,0.487,0.003 -GPT-4.1,Solo Models,Advisor,NA,Restraint,0.487,0.003 GPT-4.1 mini,Solo Models,Advisor,OpenAI,Restraint,0.498,0.003 -GPT-4.1 mini,Solo Models,Advisor,NA,Restraint,0.498,0.003 GPT-4o,Solo Models,Advisor,OpenAI,Restraint,0.487,0.007 -GPT-4o,Solo Models,Advisor,NA,Restraint,0.487,0.007 GPT-4o mini,Solo Models,Advisor,OpenAI,Restraint,0.554,0.007 -GPT-4o mini,Solo Models,Advisor,NA,Restraint,0.554,0.007 GPT-5,Solo Models,Advisor,OpenAI,Restraint,0.631,0.012 -GPT-5,Solo Models,Advisor,NA,Restraint,0.631,0.012 GPT-5 mini,Solo Models,Advisor,OpenAI,Restraint,0.601,0.008 -GPT-5 mini,Solo Models,Advisor,NA,Restraint,0.601,0.008 GPT-5 nano,Solo Models,Advisor,OpenAI,Restraint,0.534,0.006 -GPT-5 nano,Solo Models,Advisor,NA,Restraint,0.534,0.006 Gemini 2.0 Flash,Solo Models,Advisor,Google,Restraint,0.452,0.003 -Gemini 2.0 Flash,Solo Models,Advisor,NA,Restraint,0.452,0.003 Gemini 2.5 Flash,Solo Models,Advisor,Google,Restraint,0.485,0.007 -Gemini 2.5 Flash,Solo Models,Advisor,NA,Restraint,0.485,0.007 Gemini 2.5 Pro,Solo Models,Advisor,Google,Restraint,0.545,0.011 -Gemini 2.5 Pro,Solo Models,Advisor,NA,Restraint,0.545,0.011 Gemini 3 Pro,Solo Models,Advisor,Google,Restraint,0.648,0.02 -Gemini 3 Pro,Solo Models,Advisor,NA,Restraint,0.648,0.02 Glass Health 4.0,Solo Models,Advisor,Glass Health,Restraint,0.613,0.016 -Glass Health 4.0,Solo Models,Advisor,NA,Restraint,0.613,0.016 Grok 4,Solo Models,Advisor,xAI,Restraint,0.537,0.012 -Grok 4,Solo Models,Advisor,NA,Restraint,0.537,0.012 Grok 4 Fast,Solo Models,Advisor,xAI,Restraint,0.532,0.013 -Grok 4 Fast,Solo Models,Advisor,NA,Restraint,0.532,0.013 Kimi K2,Solo Models,Advisor,Moonshot AI,Restraint,0.469,0.007 -Kimi K2,Solo Models,Advisor,NA,Restraint,0.469,0.007 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Restraint,0.541,0.004 -LiSA 1.0,Solo Models,Advisor,NA,Restraint,0.541,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,Restraint,0.451,0.005 -Llama 3.3 70b,Solo Models,Advisor,NA,Restraint,0.451,0.005 Llama 4 Maverick,Solo Models,Advisor,Meta,Restraint,0.491,0.003 -Llama 4 Maverick,Solo Models,Advisor,NA,Restraint,0.491,0.003 Llama 4 Scout,Solo Models,Advisor,Meta,Restraint,0.446,0.001 -Llama 4 Scout,Solo Models,Advisor,NA,Restraint,0.446,0.001 MedGemma 27B,Solo Models,Advisor,Google,Restraint,0.505,0.005 -MedGemma 27B,Solo Models,Advisor,NA,Restraint,0.505,0.005 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Restraint,0.532,0.015 -Mistral Large 2.1,Solo Models,Advisor,NA,Restraint,0.532,0.015 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Restraint,0.505,0.008 -Mistral Medium 3.1,Solo Models,Advisor,NA,Restraint,0.505,0.008 Qwen3 235B,Solo Models,Advisor,Alibaba,Restraint,0.49,0.006 -Qwen3 235B,Solo Models,Advisor,NA,Restraint,0.49,0.006 Qwen3 32B,Solo Models,Advisor,Alibaba,Restraint,0.46,0.005 -Qwen3 32B,Solo Models,Advisor,NA,Restraint,0.46,0.005 o1,Solo Models,Advisor,OpenAI,Restraint,0.59,0.005 -o1,Solo Models,Advisor,NA,Restraint,0.59,0.005 o1 mini,Solo Models,Advisor,OpenAI,Restraint,0.479,0.004 -o1 mini,Solo Models,Advisor,NA,Restraint,0.479,0.004 o3 mini,Solo Models,Advisor,OpenAI,Restraint,0.693,0.008 -o3 mini,Solo Models,Advisor,NA,Restraint,0.693,0.008 o4 mini,Solo Models,Advisor,OpenAI,Restraint,0.651,0.004 -o4 mini,Solo Models,Advisor,NA,Restraint,0.651,0.004 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Safety,0.673,0.011 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.673,0.011 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Safety,0.681,0.008 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.681,0.008 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.374,0.022 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.374,0.022 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.665,0.017 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.665,0.017 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.657,0.035 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.657,0.035 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Safety,0.656,0.029 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.656,0.029 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Safety,0.697,0.011 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.697,0.011 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Safety,0.654,0.026 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.654,0.026 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Safety,0.65,0.02 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.02 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Safety,0.652,0.03 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.652,0.03 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Safety,0.68,0.015 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.68,0.015 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Safety,0.691,0.008 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.691,0.008 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Safety,0.687,0.025 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.687,0.025 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Safety,0.663,0.015 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.663,0.015 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Safety,0.671,0.015 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.671,0.015 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Safety,0.689,0.014 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.689,0.014 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Safety,0.681,0.011 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.681,0.011 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Safety,0.686,0.01 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.686,0.01 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Safety,0.701,0.012 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.701,0.012 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Safety,0.675,0.012 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.675,0.012 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Safety,0.672,0.014 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.672,0.014 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Safety,0.657,0.013 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.657,0.013 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.328,0.02 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.328,0.02 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.659,0.006 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.659,0.006 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.66,0.005 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.66,0.005 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.633,0.022 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.633,0.022 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.693,0.011 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.693,0.011 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.615,0.02 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.615,0.02 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.617,0.013 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.617,0.013 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.627,0.018 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.627,0.018 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Safety,0.65,0.021 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.021 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.347,0.018 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.347,0.018 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.64,0.013 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.64,0.013 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.634,0.009 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.634,0.009 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.64,0.014 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.64,0.014 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.654,0.018 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.654,0.018 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.667,0.022 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.667,0.022 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Safety,0.633,0.013 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.633,0.013 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.635,0.029 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.635,0.029 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.62,0.037 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.62,0.037 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.616,0.023 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.616,0.023 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.327,0.059 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.327,0.059 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.621,0.017 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.621,0.017 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.626,0.037 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.626,0.037 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.613,0.03 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.613,0.03 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.664,0.024 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.664,0.024 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.577,0.029 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.577,0.029 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.58,0.025 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.58,0.025 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Safety,0.694,0.015 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.694,0.015 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Safety,0.661,0.02 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.661,0.02 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.443,0.045 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.443,0.045 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.674,0.021 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.674,0.021 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.67,0.023 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.67,0.023 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.614,0.007 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.614,0.007 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.679,0.02 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.679,0.02 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.61,0.011 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.61,0.011 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.605,0.002 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.605,0.002 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.643,0.014 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.643,0.014 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Safety,0.666,0.022 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.666,0.022 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.682,0.023 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.682,0.023 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.69,0.009 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.69,0.009 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.66,0.015 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.66,0.015 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Safety,0.707,0.011 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.707,0.011 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.688,0.01 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.688,0.01 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Safety,0.67,0.01 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.67,0.01 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.339,0.042 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.339,0.042 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.698,0.022 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.698,0.022 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.705,0.016 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.705,0.016 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.706,0.011 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.706,0.011 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Safety,0.661,0.014 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.661,0.014 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Safety,0.715,0.015 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.715,0.015 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.687,0.004 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.687,0.004 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Safety,0.686,0.003 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.686,0.003 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Safety,0.682,0.009 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.682,0.009 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Safety,0.69,0.008 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.69,0.008 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Safety,0.652,0.024 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.652,0.024 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.442,0.017 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.442,0.017 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.631,0.014 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.631,0.014 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.65,0.025 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.025 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.614,0.023 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.614,0.023 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.684,0.017 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.684,0.017 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Safety,0.7,0.009 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,Safety,0.7,0.009 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.616,0.009 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.616,0.009 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.59,0.036 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.59,0.036 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.631,0.031 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.631,0.031 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Safety,0.674,0.007 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.674,0.007 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Safety,0.65,0.016 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.65,0.016 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.471,0.022 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.471,0.022 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.661,0.006 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.661,0.006 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.643,0.015 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.643,0.015 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.592,0.009 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.592,0.009 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Safety,0.717,0.018 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.717,0.018 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.52,0.008 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.52,0.008 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Safety,0.491,0.008 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.491,0.008 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Safety,0.625,0.025 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.625,0.025 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Safety,0.647,0.014 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,Safety,0.647,0.014 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Safety,0.607,0.022 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.607,0.022 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.367,0.037 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,Safety,0.367,0.037 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.621,0.027 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,Safety,0.621,0.027 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.584,0.01 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.584,0.01 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.567,0.023 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,Safety,0.567,0.023 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Safety,0.673,0.017 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,Safety,0.673,0.017 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.483,0.006 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,Safety,0.483,0.006 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Safety,0.486,0.006 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,Safety,0.486,0.006 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Safety,0.477,0.008 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,Safety,0.477,0.008 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Safety,0.658,0.005 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.658,0.005 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Safety,0.669,0.008 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.669,0.008 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Safety,0.695,0.016 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.695,0.016 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Safety,0.698,0.016 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.698,0.016 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Safety,0.698,0.014 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.698,0.014 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Safety,0.694,0.022 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.694,0.022 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Safety,0.691,0.027 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.691,0.027 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Safety,0.712,0.01 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.712,0.01 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,Safety,0.641,0.018 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.641,0.018 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Safety,0.686,0.032 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.686,0.032 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Safety,0.698,0.011 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.698,0.011 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Safety,0.714,0.02 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.714,0.02 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.69,0.017 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.69,0.017 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.675,0.021 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.675,0.021 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,Safety,0.688,0.014 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.688,0.014 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Safety,0.707,0.02 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.707,0.02 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.69,0.014 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.69,0.014 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,Safety,0.687,0.037 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.687,0.037 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Safety,0.708,0.016 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.708,0.016 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Safety,0.7,0.007 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.7,0.007 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Safety,0.705,0.017 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.705,0.017 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Safety,0.706,0.019 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.706,0.019 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Safety,0.697,0.02 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.697,0.02 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Safety,0.675,0.02 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.675,0.02 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Safety,0.715,0.044 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.715,0.044 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.619,0.014 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.619,0.014 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Safety,0.73,0.016 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.73,0.016 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Safety,0.736,0.016 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.736,0.016 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Safety,0.736,0.012 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.736,0.012 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Safety,0.756,0.015 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.756,0.015 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Safety,0.729,0.029 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.729,0.029 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Safety,0.655,0.013 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.655,0.013 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Safety,0.715,0.021 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.715,0.021 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.522,0.011 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.522,0.011 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.524,0.011 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.524,0.011 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Safety,0.64,0.017 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.64,0.017 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Safety,0.661,0.025 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.661,0.025 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Safety,0.718,0.033 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.718,0.033 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.5,0.007 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.5,0.007 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Safety,0.499,0.008 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,Safety,0.499,0.008 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Safety,0.654,0.009 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,Safety,0.654,0.009 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Safety,0.628,0.009 -Claude Haiku 4.5,Solo Models,Advisor,NA,Safety,0.628,0.009 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Safety,0.668,0.009 -Claude Sonnet 4.5,Solo Models,Advisor,NA,Safety,0.668,0.009 DeepSeek R1,Solo Models,Advisor,DeepSeek,Safety,0.672,0.01 -DeepSeek R1,Solo Models,Advisor,NA,Safety,0.672,0.01 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Safety,0.662,0.011 -DeepSeek V3.1,Solo Models,Advisor,NA,Safety,0.662,0.011 -Expert AI,Solo Models,Advisor,NA,Safety,0.644,0.011 GPT-4.1,Solo Models,Advisor,OpenAI,Safety,0.609,0.008 -GPT-4.1,Solo Models,Advisor,NA,Safety,0.609,0.008 GPT-4.1 mini,Solo Models,Advisor,OpenAI,Safety,0.543,0.005 -GPT-4.1 mini,Solo Models,Advisor,NA,Safety,0.543,0.005 GPT-4o,Solo Models,Advisor,OpenAI,Safety,0.575,0.016 -GPT-4o,Solo Models,Advisor,NA,Safety,0.575,0.016 GPT-4o mini,Solo Models,Advisor,OpenAI,Safety,0.49,0.012 -GPT-4o mini,Solo Models,Advisor,NA,Safety,0.49,0.012 GPT-5,Solo Models,Advisor,OpenAI,Safety,0.642,0.01 -GPT-5,Solo Models,Advisor,NA,Safety,0.642,0.01 GPT-5 mini,Solo Models,Advisor,OpenAI,Safety,0.621,0.014 -GPT-5 mini,Solo Models,Advisor,NA,Safety,0.621,0.014 GPT-5 nano,Solo Models,Advisor,OpenAI,Safety,0.586,0.014 -GPT-5 nano,Solo Models,Advisor,NA,Safety,0.586,0.014 Gemini 2.0 Flash,Solo Models,Advisor,Google,Safety,0.605,0.013 -Gemini 2.0 Flash,Solo Models,Advisor,NA,Safety,0.605,0.013 Gemini 2.5 Flash,Solo Models,Advisor,Google,Safety,0.664,0.013 -Gemini 2.5 Flash,Solo Models,Advisor,NA,Safety,0.664,0.013 Gemini 2.5 Pro,Solo Models,Advisor,Google,Safety,0.695,0.01 -Gemini 2.5 Pro,Solo Models,Advisor,NA,Safety,0.695,0.01 Gemini 3 Pro,Solo Models,Advisor,Google,Safety,0.628,0.013 -Gemini 3 Pro,Solo Models,Advisor,NA,Safety,0.628,0.013 Glass Health 4.0,Solo Models,Advisor,Glass Health,Safety,0.663,0.013 -Glass Health 4.0,Solo Models,Advisor,NA,Safety,0.663,0.013 Grok 4,Solo Models,Advisor,xAI,Safety,0.646,0.014 -Grok 4,Solo Models,Advisor,NA,Safety,0.646,0.014 Grok 4 Fast,Solo Models,Advisor,xAI,Safety,0.649,0.021 -Grok 4 Fast,Solo Models,Advisor,NA,Safety,0.649,0.021 Kimi K2,Solo Models,Advisor,Moonshot AI,Safety,0.613,0.012 -Kimi K2,Solo Models,Advisor,NA,Safety,0.613,0.012 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Safety,0.679,0.008 -LiSA 1.0,Solo Models,Advisor,NA,Safety,0.679,0.008 Llama 3.3 70b,Solo Models,Advisor,Meta,Safety,0.538,0.009 -Llama 3.3 70b,Solo Models,Advisor,NA,Safety,0.538,0.009 Llama 4 Maverick,Solo Models,Advisor,Meta,Safety,0.612,0.006 -Llama 4 Maverick,Solo Models,Advisor,NA,Safety,0.612,0.006 Llama 4 Scout,Solo Models,Advisor,Meta,Safety,0.482,0.006 -Llama 4 Scout,Solo Models,Advisor,NA,Safety,0.482,0.006 MedGemma 27B,Solo Models,Advisor,Google,Safety,0.554,0.015 -MedGemma 27B,Solo Models,Advisor,NA,Safety,0.554,0.015 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Safety,0.604,0.011 -Mistral Large 2.1,Solo Models,Advisor,NA,Safety,0.604,0.011 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Safety,0.537,0.014 -Mistral Medium 3.1,Solo Models,Advisor,NA,Safety,0.537,0.014 Qwen3 235B,Solo Models,Advisor,Alibaba,Safety,0.565,0.015 -Qwen3 235B,Solo Models,Advisor,NA,Safety,0.565,0.015 Qwen3 32B,Solo Models,Advisor,Alibaba,Safety,0.527,0.014 -Qwen3 32B,Solo Models,Advisor,NA,Safety,0.527,0.014 o1,Solo Models,Advisor,OpenAI,Safety,0.599,0.009 -o1,Solo Models,Advisor,NA,Safety,0.599,0.009 o1 mini,Solo Models,Advisor,OpenAI,Safety,0.461,0.015 -o1 mini,Solo Models,Advisor,NA,Safety,0.461,0.015 o3 mini,Solo Models,Advisor,OpenAI,Safety,0.494,0.006 -o3 mini,Solo Models,Advisor,NA,Safety,0.494,0.006 o4 mini,Solo Models,Advisor,OpenAI,Safety,0.525,0.014 -o4 mini,Solo Models,Advisor,NA,Safety,0.525,0.014 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,nnh_cumulative,9.141,0.944 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,nnh_cumulative,9.697,0.594 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.697,0.594 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,2.784,0.175 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.784,0.175 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,8.889,1.089 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.889,1.089 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,8.862,2.304 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.862,2.304 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,nnh_cumulative,8.586,0.495 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,0.495 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,nnh_cumulative,11.126,3.139 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.126,3.139 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,nnh_cumulative,8.159,0.914 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.159,0.914 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,nnh_cumulative,7.906,0.419 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.906,0.419 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,nnh_cumulative,8.372,0.792 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,nnh_cumulative,9.485,0.661 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.485,0.661 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,nnh_cumulative,13.492,0.974 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,13.492,0.974 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,nnh_cumulative,10.152,1.175 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.152,1.175 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,nnh_cumulative,9.36,1.549 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.36,1.549 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,nnh_cumulative,11.574,0.907 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.574,0.907 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,nnh_cumulative,12.208,1.79 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,12.208,1.79 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,nnh_cumulative,11.636,1.5 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.636,1.5 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,nnh_cumulative,10.373,1.084 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.373,1.084 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,nnh_cumulative,14.599,1.47 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,14.599,1.47 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,nnh_cumulative,10.842,1.451 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.842,1.451 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,nnh_cumulative,10.891,1.669 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.891,1.669 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,nnh_cumulative,10.067,1.145 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.067,1.145 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.633,0.078 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.633,0.078 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,9.394,0.594 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.394,0.594 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.333,0 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.333,0 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.586,0.495 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,0.495 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,10.067,1.145 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.067,1.145 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,8.12,0.419 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.12,0.419 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,7.906,0.419 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.906,0.419 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.333,0 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.333,0 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,nnh_cumulative,8.462,1.508 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.462,1.508 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.615,0.172 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.615,0.172 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.098,0.692 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.098,0.692 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,8.372,0.792 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.095,1.867 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.095,1.867 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,10.104,4.114 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.104,4.114 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.442,1.273 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.442,1.273 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,nnh_cumulative,8.155,0.573 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.155,0.573 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,7.937,2.04 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.937,2.04 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,6.843,2.211 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.843,2.211 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,7.222,1.089 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.222,1.089 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.518,0.284 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.518,0.284 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,7.425,1.207 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.425,1.207 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,7.639,2.326 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.639,2.326 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,7.555,1.82 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.555,1.82 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.928,1.315 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.928,1.315 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,5.828,0.838 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,5.828,0.838 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,6.035,0.646 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.035,0.646 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,nnh_cumulative,11.79,1.328 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.79,1.328 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,nnh_cumulative,9.697,0.594 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.697,0.594 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,3.394,0.385 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,3.394,0.385 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,9.394,0.594 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.394,0.594 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,8.889,1.089 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.889,1.089 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,9.141,0.944 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,9.975,2.512 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.975,2.512 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,8.97,0.604 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.97,0.604 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,8.625,0.914 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.625,0.914 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,9.677,0.783 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.677,0.783 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,nnh_cumulative,10.067,1.145 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.067,1.145 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,11.667,1.633 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.667,1.633 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,11.136,1.519 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.136,1.519 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,8.799,1.203 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.799,1.203 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,nnh_cumulative,15.718,2.738 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,15.718,2.738 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,11.636,1.932 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.636,1.932 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,nnh_cumulative,7.723,0.674 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.723,0.674 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,2.476,0.253 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.476,0.253 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,10.53,1.998 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.53,1.998 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,11.781,1.622 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.781,1.622 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,11.204,1.417 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,11.204,1.417 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,nnh_cumulative,7.996,0.847 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.996,0.847 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,nnh_cumulative,15.147,2.581 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,15.147,2.581 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,9.141,0.944 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,nnh_cumulative,9.141,0.944 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,nnh_cumulative,8.586,0.495 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,0.495 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,nnh_cumulative,10.5,0.98 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.5,0.98 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,nnh_cumulative,8.625,0.914 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.625,0.914 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,3.622,0.231 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,3.622,0.231 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,7.009,0.67 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.009,0.67 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,8.12,0.419 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.12,0.419 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,10.278,2.373 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.278,2.373 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,8.372,0.792 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,nnh_cumulative,13.024,1.467 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,13.024,1.467 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,9.59,0.594 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.59,0.594 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,8.586,1.95 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.586,1.95 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,9.141,0.944 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.141,0.944 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,nnh_cumulative,8.838,0.495 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.838,0.495 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,nnh_cumulative,9.333,0.8 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.333,0.8 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,4.013,0.328 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.013,0.328 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,8.12,0.419 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.12,0.419 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,7.692,0 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.692,0 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,9.091,0 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.091,0 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,nnh_cumulative,13.968,2.776 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,13.968,2.776 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,7.438,0.356 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,7.438,0.356 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,nnh_cumulative,6.182,0.307 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.182,0.307 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,nnh_cumulative,9.764,1.32 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,9.764,1.32 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,nnh_cumulative,10.051,0.81 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,10.051,0.81 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,nnh_cumulative,6.564,0.72 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.564,0.72 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,2.81,0.181 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,2.81,0.181 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,6.825,0.311 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.825,0.311 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,5.773,0.214 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,5.773,0.214 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,6.144,0.512 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,6.144,0.512 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,nnh_cumulative,8.372,0.792 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,8.372,0.792 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,4.353,0.214 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.353,0.214 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,nnh_cumulative,4.486,0.271 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.486,0.271 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,nnh_cumulative,4.227,0.118 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,nnh_cumulative,4.227,0.118 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,nnh_cumulative,8.757,0.646 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.757,0.646 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,nnh_cumulative,9.531,1.071 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.531,1.071 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,nnh_cumulative,11.955,1.937 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.955,1.937 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,nnh_cumulative,11.323,2.173 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.323,2.173 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,nnh_cumulative,14.335,1.515 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,14.335,1.515 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,nnh_cumulative,10.933,2.46 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,10.933,2.46 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,nnh_cumulative,11.667,3.443 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.667,3.443 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,nnh_cumulative,12.024,1.227 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.024,1.227 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,nnh_cumulative,8.028,0.935 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.028,0.935 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,nnh_cumulative,11.12,2.825 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.12,2.825 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,nnh_cumulative,13.051,1.152 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,13.051,1.152 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,nnh_cumulative,12.037,0.907 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.037,0.907 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,12.231,2.771 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.231,2.771 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,9.061,1.569 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.061,1.569 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,nnh_cumulative,11.246,2.002 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.246,2.002 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,nnh_cumulative,13.222,3.446 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,13.222,3.446 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,10.543,1.229 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,10.543,1.229 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,nnh_cumulative,12.753,4.294 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.753,4.294 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,nnh_cumulative,11.857,1.618 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.857,1.618 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,nnh_cumulative,11.25,2.376 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.25,2.376 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,nnh_cumulative,10.789,1.612 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,10.789,1.612 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,nnh_cumulative,12.794,2.181 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.794,2.181 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,nnh_cumulative,9.893,1.226 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.893,1.226 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,nnh_cumulative,8.524,0.712 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.524,0.712 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,nnh_cumulative,11.364,2.227 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,11.364,2.227 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,9.889,0.871 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.889,0.871 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,nnh_cumulative,15.246,2.272 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,15.246,2.272 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,nnh_cumulative,16.27,3.076 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,16.27,3.076 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,nnh_cumulative,15.429,2.519 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,15.429,2.519 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,nnh_cumulative,23.25,4.276 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,23.25,4.276 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,nnh_cumulative,16.349,4.322 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,16.349,4.322 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,nnh_cumulative,8.603,1.017 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.603,1.017 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,nnh_cumulative,13,3.601 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,13,3.601 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,7.505,0.553 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,7.505,0.553 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,7.505,0.553 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,7.505,0.553 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,nnh_cumulative,9.707,0.92 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,9.707,0.92 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,nnh_cumulative,8.538,1.28 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,8.538,1.28 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,nnh_cumulative,12.262,2.436 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,12.262,2.436 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,6.522,0.418 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,6.522,0.418 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,nnh_cumulative,6.485,0.532 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,nnh_cumulative,6.485,0.532 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,nnh_cumulative,8.368,0.354 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,nnh_cumulative,8.368,0.354 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,nnh_cumulative,7.442,0.381 -Claude Haiku 4.5,Solo Models,Advisor,NA,nnh_cumulative,7.442,0.381 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,nnh_cumulative,9.958,0.888 -Claude Sonnet 4.5,Solo Models,Advisor,NA,nnh_cumulative,9.958,0.888 DeepSeek R1,Solo Models,Advisor,DeepSeek,nnh_cumulative,10.566,0.895 -DeepSeek R1,Solo Models,Advisor,NA,nnh_cumulative,10.566,0.895 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,nnh_cumulative,9.644,0.82 -DeepSeek V3.1,Solo Models,Advisor,NA,nnh_cumulative,9.644,0.82 -Expert AI,Solo Models,Advisor,NA,nnh_cumulative,9.218,0.721 GPT-4.1,Solo Models,Advisor,OpenAI,nnh_cumulative,7.805,0.359 -GPT-4.1,Solo Models,Advisor,NA,nnh_cumulative,7.805,0.359 GPT-4.1 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,5.65,0.194 -GPT-4.1 mini,Solo Models,Advisor,NA,nnh_cumulative,5.65,0.194 GPT-4o,Solo Models,Advisor,OpenAI,nnh_cumulative,6.698,0.474 -GPT-4o,Solo Models,Advisor,NA,nnh_cumulative,6.698,0.474 GPT-4o mini,Solo Models,Advisor,OpenAI,nnh_cumulative,5.034,0.352 -GPT-4o mini,Solo Models,Advisor,NA,nnh_cumulative,5.034,0.352 GPT-5,Solo Models,Advisor,OpenAI,nnh_cumulative,7.982,0.48 -GPT-5,Solo Models,Advisor,NA,nnh_cumulative,7.982,0.48 GPT-5 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,7.23,0.454 -GPT-5 mini,Solo Models,Advisor,NA,nnh_cumulative,7.23,0.454 GPT-5 nano,Solo Models,Advisor,OpenAI,nnh_cumulative,6.318,0.565 -GPT-5 nano,Solo Models,Advisor,NA,nnh_cumulative,6.318,0.565 Gemini 2.0 Flash,Solo Models,Advisor,Google,nnh_cumulative,9.128,0.675 -Gemini 2.0 Flash,Solo Models,Advisor,NA,nnh_cumulative,9.128,0.675 Gemini 2.5 Flash,Solo Models,Advisor,Google,nnh_cumulative,11.461,1.265 -Gemini 2.5 Flash,Solo Models,Advisor,NA,nnh_cumulative,11.461,1.265 Gemini 2.5 Pro,Solo Models,Advisor,Google,nnh_cumulative,10.425,1.273 -Gemini 2.5 Pro,Solo Models,Advisor,NA,nnh_cumulative,10.425,1.273 Gemini 3 Pro,Solo Models,Advisor,Google,nnh_cumulative,7.621,0.403 -Gemini 3 Pro,Solo Models,Advisor,NA,nnh_cumulative,7.621,0.403 Glass Health 4.0,Solo Models,Advisor,Glass Health,nnh_cumulative,8.411,0.628 -Glass Health 4.0,Solo Models,Advisor,NA,nnh_cumulative,8.411,0.628 Grok 4,Solo Models,Advisor,xAI,nnh_cumulative,7.28,0.607 -Grok 4,Solo Models,Advisor,NA,nnh_cumulative,7.28,0.607 Grok 4 Fast,Solo Models,Advisor,xAI,nnh_cumulative,9.236,0.969 -Grok 4 Fast,Solo Models,Advisor,NA,nnh_cumulative,9.236,0.969 Kimi K2,Solo Models,Advisor,Moonshot AI,nnh_cumulative,8.633,0.634 -Kimi K2,Solo Models,Advisor,NA,nnh_cumulative,8.633,0.634 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,nnh_cumulative,10.726,0.822 -LiSA 1.0,Solo Models,Advisor,NA,nnh_cumulative,10.726,0.822 Llama 3.3 70b,Solo Models,Advisor,Meta,nnh_cumulative,8.27,0.502 -Llama 3.3 70b,Solo Models,Advisor,NA,nnh_cumulative,8.27,0.502 Llama 4 Maverick,Solo Models,Advisor,Meta,nnh_cumulative,10.337,0.649 -Llama 4 Maverick,Solo Models,Advisor,NA,nnh_cumulative,10.337,0.649 Llama 4 Scout,Solo Models,Advisor,Meta,nnh_cumulative,6.372,0.175 -Llama 4 Scout,Solo Models,Advisor,NA,nnh_cumulative,6.372,0.175 MedGemma 27B,Solo Models,Advisor,Google,nnh_cumulative,7.225,0.538 -MedGemma 27B,Solo Models,Advisor,NA,nnh_cumulative,7.225,0.538 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,nnh_cumulative,7.162,0.631 -Mistral Large 2.1,Solo Models,Advisor,NA,nnh_cumulative,7.162,0.631 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,nnh_cumulative,6.202,0.473 -Mistral Medium 3.1,Solo Models,Advisor,NA,nnh_cumulative,6.202,0.473 Qwen3 235B,Solo Models,Advisor,Alibaba,nnh_cumulative,6.485,0.532 -Qwen3 235B,Solo Models,Advisor,NA,nnh_cumulative,6.485,0.532 Qwen3 32B,Solo Models,Advisor,Alibaba,nnh_cumulative,5.78,0.374 -Qwen3 32B,Solo Models,Advisor,NA,nnh_cumulative,5.78,0.374 o1,Solo Models,Advisor,OpenAI,nnh_cumulative,5.608,0.358 -o1,Solo Models,Advisor,NA,nnh_cumulative,5.608,0.358 o1 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,6.131,0.541 -o1 mini,Solo Models,Advisor,NA,nnh_cumulative,6.131,0.541 o3 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,4.512,0.128 -o3 mini,Solo Models,Advisor,NA,nnh_cumulative,4.512,0.128 o4 mini,Solo Models,Advisor,OpenAI,nnh_cumulative,4.901,0.504 -o4 mini,Solo Models,Advisor,NA,nnh_cumulative,4.901,0.504 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,normalized,15,1.132 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,15,1.132 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,normalized,13.333,0.653 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,13.333,0.653 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,100.333,5.807 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,100.333,5.807 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,15,2.263 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,15,2.263 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,15.667,3.974 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,15.667,3.974 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,normalized,17,1.132 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,17,1.132 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,normalized,12.333,5.227 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,12.333,5.227 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,normalized,17.667,1.307 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,17.667,1.307 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,normalized,18,1.132 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,18,1.132 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,normalized,16.333,1.729 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16.333,1.729 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,normalized,13.2,1.999 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,13.2,1.999 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,normalized,9.8,0.704 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,9.8,0.704 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,normalized,13.8,0.96 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,13.8,0.96 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,normalized,15.125,1.633 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,15.125,1.633 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,normalized,12.667,1.729 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,12.667,1.729 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,normalized,12.875,1.306 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,12.875,1.306 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,normalized,10.8,1.395 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,10.8,1.395 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,normalized,19.4,4.713 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,19.4,4.713 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,normalized,10.7,0.879 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,10.7,0.879 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,normalized,14.3,1.547 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,14.3,1.547 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,normalized,13.9,1.926 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,13.9,1.926 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,normalized,18.333,2.356 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,18.333,2.356 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,118.333,16.797 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,118.333,16.797 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,15.333,0.653 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,0.653 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,16.667,0.653 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16.667,0.653 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,23.667,0.653 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,23.667,0.653 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,15.333,1.729 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,1.729 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,24.667,0.653 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,24.667,0.653 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,26.333,0.653 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,26.333,0.653 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,24,0 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,24,0 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,normalized,15.333,1.729 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,1.729 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,108.667,10.453 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,108.667,10.453 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,17.25,1.373 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,17.25,1.373 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,16.333,0.653 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16.333,0.653 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,18,4.08 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,18,4.08 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,14.667,2.613 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,14.667,2.613 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,20.333,2.356 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,20.333,2.356 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,normalized,17.2,1.267 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,17.2,1.267 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,18.667,3.974 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,18.667,3.974 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,21,4.933 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,21,4.933 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,19.667,2.848 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,19.667,2.848 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,127,23.82 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,127,23.82 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,18.333,2.613 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,18.333,2.613 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,19,4.08 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,19,4.08 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,18.667,4.573 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,18.667,4.573 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,16.667,2.848 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,16.667,2.848 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,22.667,3.457 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,22.667,3.457 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,22,2.994 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,22,2.994 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,normalized,14,2.445 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,14,2.445 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,normalized,14,2.994 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,14,2.994 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,65.667,9.756 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,65.667,9.756 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,13.333,0.653 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,13.333,0.653 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,14,1.96 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,14,1.96 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,13.667,1.729 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,13.667,1.729 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,15,3.92 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,15,3.92 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,14.6,0.999 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,14.6,0.999 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,15.333,1.307 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,15.333,1.307 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,14.6,2.111 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,14.6,2.111 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,normalized,12,3.92 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,12,3.92 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,10.333,1.729 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,10.333,1.729 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,11,1.205 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,11,1.205 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,20.625,5.43 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,20.625,5.43 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,normalized,8.7,1.241 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,8.7,1.241 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,10.8,1.395 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,10.8,1.395 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,normalized,19,1.132 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,19,1.132 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,118.4,21.242 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,118.4,21.242 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,16,2.263 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,16,2.263 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,12.5,2.132 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,12.5,2.132 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,13,2.994 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,13,2.994 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,normalized,26.125,5.411 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,26.125,5.411 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,normalized,11.4,1.829 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,11.4,1.829 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,15,2.994 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,15,2.994 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,normalized,15,2.994 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,15,2.994 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,normalized,16,2.994 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,16,2.994 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,normalized,13.8,1.44 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,13.8,1.44 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,normalized,16.667,0.653 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,16.667,0.653 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,62,15.963 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,62,15.963 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,19.333,2.848 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,19.333,2.848 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,17.667,1.729 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,17.667,1.729 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,17,2.994 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,17,2.994 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,27.667,11.107 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,27.667,11.107 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,normalized,12,1.339 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,normalized,12,1.339 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,18.875,1.138 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,18.875,1.138 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,20.333,4.573 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,20.333,4.573 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,17.667,4.573 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,17.667,4.573 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,normalized,15.667,4.573 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,15.667,4.573 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,normalized,16.8,1.143 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,16.8,1.143 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,58,2.263 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,58,2.263 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,17,1.132 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,17,1.132 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,19.333,0.653 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,19.333,0.653 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,20.667,0.653 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,20.667,0.653 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,normalized,12.7,4.967 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,12.7,4.967 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,28.25,1.47 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,28.25,1.47 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,normalized,33.125,1.905 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,33.125,1.905 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,normalized,22.667,3.457 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,22.667,3.457 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,normalized,16,0.8 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,normalized,16,0.8 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,normalized,21.333,3.974 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,normalized,21.333,3.974 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,102.667,14.55 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,normalized,102.667,14.55 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,19.667,0.653 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,normalized,19.667,0.653 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,22.333,3.638 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,22.333,3.638 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,26.667,5.582 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,normalized,26.667,5.582 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,normalized,17.333,3.974 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,normalized,17.333,3.974 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,36.667,2.848 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,normalized,36.667,2.848 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,normalized,36,2.994 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,normalized,36,2.994 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,normalized,38.667,3.974 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,normalized,38.667,3.974 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,normalized,15.5,1.214 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.5,1.214 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,normalized,13.667,0.826 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,13.667,0.826 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,normalized,13,1.283 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,13,1.283 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,normalized,11.833,3.012 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.833,3.012 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,normalized,11.125,0.78 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.125,0.78 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,normalized,11,2.4 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11,2.4 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,normalized,11.833,2.65 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.833,2.65 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,normalized,11.4,1.176 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.4,1.176 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,normalized,17.667,1.729 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,17.667,1.729 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,normalized,14.75,3.699 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,14.75,3.699 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,normalized,11,0.653 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11,0.653 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,normalized,10.667,0.653 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.667,0.653 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,10.4,2.018 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.4,2.018 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,13.333,2.513 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,13.333,2.513 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,normalized,11.4,1.709 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.4,1.709 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,normalized,10.8,1.143 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.8,1.143 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,11.5,1.497 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.5,1.497 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,normalized,10.333,2.848 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.333,2.848 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,normalized,9.8,0.96 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,9.8,0.96 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,normalized,11.833,1.553 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.833,1.553 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,normalized,15.3,2.679 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.3,2.679 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,normalized,12.3,2.48 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,12.3,2.48 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,normalized,16.667,2.301 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,16.667,2.301 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,normalized,21,1.431 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,21,1.431 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,normalized,12.333,4.284 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,12.333,4.284 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,18.2,1.44 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,18.2,1.44 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,normalized,9,3.05 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,9,3.05 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,normalized,9,2.087 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,9,2.087 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,normalized,10.9,4.615 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.9,4.615 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,normalized,6.5,1.854 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,6.5,1.854 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,normalized,7,1.789 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,7,1.789 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,normalized,15.6,1.709 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.6,1.709 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,normalized,10.6,2.673 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,10.6,2.673 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,27.4,2.018 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,27.4,2.018 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,27.4,2.018 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,27.4,2.018 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,normalized,16.8,1.686 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,16.8,1.686 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,normalized,15.6,2.6 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,15.6,2.6 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,normalized,11.333,5.582 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,11.333,5.582 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,31.4,2.525 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,31.4,2.525 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,normalized,31,3.099 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,normalized,31,3.099 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,normalized,16.8,0.867 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,normalized,16.8,0.867 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,normalized,20.2,0.95 -Claude Haiku 4.5,Solo Models,Advisor,NA,normalized,20.2,0.95 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,normalized,13.1,1.172 -Claude Sonnet 4.5,Solo Models,Advisor,NA,normalized,13.1,1.172 DeepSeek R1,Solo Models,Advisor,DeepSeek,normalized,14.3,1.036 -DeepSeek R1,Solo Models,Advisor,NA,normalized,14.3,1.036 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,normalized,17.2,1.472 -DeepSeek V3.1,Solo Models,Advisor,NA,normalized,17.2,1.472 -Expert AI,Solo Models,Advisor,NA,normalized,22.4,1.377 GPT-4.1,Solo Models,Advisor,OpenAI,normalized,24.35,0.82 -GPT-4.1,Solo Models,Advisor,NA,normalized,24.35,0.82 GPT-4.1 mini,Solo Models,Advisor,OpenAI,normalized,34.95,1.404 -GPT-4.1 mini,Solo Models,Advisor,NA,normalized,34.95,1.404 GPT-4o,Solo Models,Advisor,OpenAI,normalized,25.3,2.051 -GPT-4o,Solo Models,Advisor,NA,normalized,25.3,2.051 GPT-4o mini,Solo Models,Advisor,OpenAI,normalized,40.1,2.928 -GPT-4o mini,Solo Models,Advisor,NA,normalized,40.1,2.928 GPT-5,Solo Models,Advisor,OpenAI,normalized,17.35,1.298 -GPT-5,Solo Models,Advisor,NA,normalized,17.35,1.298 GPT-5 mini,Solo Models,Advisor,OpenAI,normalized,20.05,1.307 -GPT-5 mini,Solo Models,Advisor,NA,normalized,20.05,1.307 GPT-5 nano,Solo Models,Advisor,OpenAI,normalized,24.5,2.449 -GPT-5 nano,Solo Models,Advisor,NA,normalized,24.5,2.449 Gemini 2.0 Flash,Solo Models,Advisor,Google,normalized,14.6,1.021 -Gemini 2.0 Flash,Solo Models,Advisor,NA,normalized,14.6,1.021 Gemini 2.5 Flash,Solo Models,Advisor,Google,normalized,11.75,1.239 -Gemini 2.5 Flash,Solo Models,Advisor,NA,normalized,11.75,1.239 Gemini 2.5 Pro,Solo Models,Advisor,Google,normalized,13.75,1.944 -Gemini 2.5 Pro,Solo Models,Advisor,NA,normalized,13.75,1.944 Gemini 3 Pro,Solo Models,Advisor,Google,normalized,19.75,1.723 -Gemini 3 Pro,Solo Models,Advisor,NA,normalized,19.75,1.723 Glass Health 4.0,Solo Models,Advisor,Glass Health,normalized,15.846,1.171 -Glass Health 4.0,Solo Models,Advisor,NA,normalized,15.846,1.171 Grok 4,Solo Models,Advisor,xAI,normalized,19.267,1.594 -Grok 4,Solo Models,Advisor,NA,normalized,19.267,1.594 Grok 4 Fast,Solo Models,Advisor,xAI,normalized,20,2.247 -Grok 4 Fast,Solo Models,Advisor,NA,normalized,20,2.247 Kimi K2,Solo Models,Advisor,Moonshot AI,normalized,17.533,1.541 -Kimi K2,Solo Models,Advisor,NA,normalized,17.533,1.541 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,normalized,12.9,1.119 -LiSA 1.0,Solo Models,Advisor,NA,normalized,12.9,1.119 Llama 3.3 70b,Solo Models,Advisor,Meta,normalized,21,1.729 -Llama 3.3 70b,Solo Models,Advisor,NA,normalized,21,1.729 Llama 4 Maverick,Solo Models,Advisor,Meta,normalized,18.5,0.785 -Llama 4 Maverick,Solo Models,Advisor,NA,normalized,18.5,0.785 Llama 4 Scout,Solo Models,Advisor,Meta,normalized,32.35,0.967 -Llama 4 Scout,Solo Models,Advisor,NA,normalized,32.35,0.967 MedGemma 27B,Solo Models,Advisor,Google,normalized,28,1.987 -MedGemma 27B,Solo Models,Advisor,NA,normalized,28,1.987 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,normalized,22.2,2.336 -Mistral Large 2.1,Solo Models,Advisor,NA,normalized,22.2,2.336 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,normalized,29.133,1.468 -Mistral Medium 3.1,Solo Models,Advisor,NA,normalized,29.133,1.468 Qwen3 235B,Solo Models,Advisor,Alibaba,normalized,31.25,3.869 -Qwen3 235B,Solo Models,Advisor,NA,normalized,31.25,3.869 Qwen3 32B,Solo Models,Advisor,Alibaba,normalized,30.077,1.668 -Qwen3 32B,Solo Models,Advisor,NA,normalized,30.077,1.668 o1,Solo Models,Advisor,OpenAI,normalized,23.6,1.345 -o1,Solo Models,Advisor,NA,normalized,23.6,1.345 o1 mini,Solo Models,Advisor,OpenAI,normalized,28.6,2.968 -o1 mini,Solo Models,Advisor,NA,normalized,28.6,2.968 o3 mini,Solo Models,Advisor,OpenAI,normalized,35.3,1.408 -o3 mini,Solo Models,Advisor,NA,normalized,35.3,1.408 o4 mini,Solo Models,Advisor,OpenAI,normalized,39.9,2.47 -o4 mini,Solo Models,Advisor,NA,normalized,39.9,2.47 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,pct_cumulative,0.11,0.011 -Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,pct_cumulative,0.103,0.007 -Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.007 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.36,0.023 -Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.36,0.023 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.113,0.013 -Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.013 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.117,0.028 -Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.028 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,pct_cumulative,0.117,0.007 -Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.007 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,pct_cumulative,0.093,0.024 -Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.093,0.024 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,pct_cumulative,0.123,0.013 -Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.013 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,pct_cumulative,0.127,0.007 -Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.127,0.007 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,pct_cumulative,0.12,0.011 -Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,pct_cumulative,0.106,0.008 -Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.106,0.008 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,pct_cumulative,0.075,0.005 -Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.075,0.005 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,pct_cumulative,0.1,0.012 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.012 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,pct_cumulative,0.111,0.015 -DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.111,0.015 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,pct_cumulative,0.087,0.007 -DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.087,0.007 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,pct_cumulative,0.085,0.012 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.085,0.012 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,pct_cumulative,0.089,0.01 -DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.089,0.01 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,pct_cumulative,0.099,0.011 -DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.099,0.011 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,pct_cumulative,0.07,0.007 -DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.07,0.007 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,pct_cumulative,0.096,0.012 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.096,0.012 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,pct_cumulative,0.096,0.012 -DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.096,0.012 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,pct_cumulative,0.1,0.011 -GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.011 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.38,0.011 -GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.38,0.011 GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.107,0.007 -GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.107,0.007 GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.12,0 -GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.117,0.007 -GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.007 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.1,0.011 -GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.011 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.123,0.007 -GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.007 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.127,0.007 -GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.127,0.007 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.12,0 -GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,pct_cumulative,0.12,0.02 -GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.02 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.383,0.026 -GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.383,0.026 GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.125,0.01 -GPT-5 + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.125,0.01 GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.12,0.011 -GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.127,0.026 -GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.127,0.026 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.107,0.036 -GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.107,0.036 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.12,0.02 -GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.02 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,pct_cumulative,0.124,0.008 -GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.124,0.008 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.13,0.03 -GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.13,0.03 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.153,0.043 -GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.153,0.043 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.14,0.02 -GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.14,0.02 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.402,0.042 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.402,0.042 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.137,0.024 -GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.137,0.024 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.137,0.036 -GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.137,0.036 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.137,0.035 -GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.137,0.035 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.113,0.017 -GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.017 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.173,0.024 -GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.173,0.024 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.167,0.017 -GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.167,0.017 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,pct_cumulative,0.087,0.008 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.087,0.008 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,pct_cumulative,0.103,0.007 -Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.007 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.297,0.035 -Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.297,0.035 Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.107,0.007 -Gemini 2.0 Flash + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.107,0.007 Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.113,0.013 -Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.013 Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.11,0.011 -Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.103,0.024 -Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.024 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.112,0.007 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.112,0.007 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.117,0.013 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.013 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.104,0.008 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.104,0.008 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,pct_cumulative,0.1,0.011 -Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.011 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.087,0.013 -Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.087,0.013 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.093,0.01 -Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.093,0.01 Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.117,0.016 -Gemini 2.5 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.016 Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,pct_cumulative,0.068,0.011 -Gemini 2.5 Flash + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.068,0.011 Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.091,0.013 -Gemini 2.5 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.091,0.013 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,pct_cumulative,0.13,0.011 -Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.13,0.011 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.408,0.039 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.408,0.039 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.097,0.017 -Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.097,0.017 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.089,0.013 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.089,0.013 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.09,0.011 -Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.09,0.011 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,pct_cumulative,0.128,0.013 -Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.128,0.013 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,pct_cumulative,0.07,0.011 -Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.07,0.011 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.11,0.011 -Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,pct_cumulative,0.11,0.011 -Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,pct_cumulative,0.117,0.007 -Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.007 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,pct_cumulative,0.096,0.008 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.096,0.008 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,pct_cumulative,0.117,0.013 -Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.117,0.013 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.277,0.017 -Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.277,0.017 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.143,0.013 -Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.143,0.013 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.123,0.007 -Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.007 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.1,0.023 -Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.023 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.12,0.011 -Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,pct_cumulative,0.079,0.008 -Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.079,0.008 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.105,0.006 -Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.105,0.006 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.12,0.03 -Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.03 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.11,0.011 -Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0.011 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,pct_cumulative,0.113,0.007 -Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.113,0.007 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,pct_cumulative,0.108,0.01 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.108,0.01 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.25,0.02 -Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.25,0.02 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.123,0.007 -Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.123,0.007 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.13,0 -Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.13,0 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.11,0 -Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.11,0 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,pct_cumulative,0.078,0.014 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.078,0.014 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.135,0.006 -Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.135,0.006 Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,pct_cumulative,0.162,0.008 -Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.162,0.008 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,pct_cumulative,0.103,0.013 -Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.103,0.013 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,pct_cumulative,0.1,0.008 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.1,0.008 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,pct_cumulative,0.153,0.017 -o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.153,0.017 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.357,0.024 -o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.357,0.024 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.147,0.007 -o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.147,0.007 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.173,0.007 -o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.173,0.007 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.163,0.013 -o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.163,0.013 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,pct_cumulative,0.12,0.011 -o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.12,0.011 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.23,0.011 -o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.23,0.011 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,pct_cumulative,0.223,0.013 -o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.223,0.013 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,pct_cumulative,0.237,0.007 -o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,NA,pct_cumulative,0.237,0.007 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,pct_cumulative,0.115,0.008 -DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.115,0.008 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,pct_cumulative,0.107,0.012 -DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.107,0.012 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,pct_cumulative,0.088,0.013 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.088,0.013 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,pct_cumulative,0.093,0.021 -DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.093,0.021 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,pct_cumulative,0.071,0.008 -DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.071,0.008 DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,pct_cumulative,0.095,0.02 -DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.095,0.02 DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,pct_cumulative,0.093,0.021 -DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.093,0.021 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,pct_cumulative,0.084,0.008 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.084,0.008 GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenAI + OpenAI,pct_cumulative,0.127,0.014 -GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.127,0.014 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,pct_cumulative,0.095,0.026 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.095,0.026 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,pct_cumulative,0.078,0.006 -Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.078,0.006 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,pct_cumulative,0.083,0.007 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.007 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.086,0.018 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Flash,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.086,0.018 Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.115,0.021 -Gemini 2.5 Flash + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.115,0.021 Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + DeepSeek,pct_cumulative,0.092,0.017 -Gemini 2.5 Flash + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.092,0.017 Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,pct_cumulative,0.08,0.016 -Gemini 2.5 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.08,0.016 Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.097,0.012 -Gemini 2.5 Flash + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.097,0.012 Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + DeepSeek,pct_cumulative,0.083,0.028 -Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.028 Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,pct_cumulative,0.086,0.012 -Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.086,0.012 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,pct_cumulative,0.093,0.017 -Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.093,0.017 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,pct_cumulative,0.097,0.013 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.097,0.013 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,pct_cumulative,0.083,0.012 -Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.012 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,pct_cumulative,0.103,0.014 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.103,0.014 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,pct_cumulative,0.118,0.009 -Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.118,0.009 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,pct_cumulative,0.09,0.02 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.09,0.02 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.102,0.01 -Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.102,0.01 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,pct_cumulative,0.069,0.01 -Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.069,0.01 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,pct_cumulative,0.066,0.011 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.066,0.011 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,pct_cumulative,0.069,0.011 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.069,0.011 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,pct_cumulative,0.047,0.01 -Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.047,0.01 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,pct_cumulative,0.065,0.019 -Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.065,0.019 Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,pct_cumulative,0.118,0.014 -Llama 4 Scout + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.118,0.014 Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,pct_cumulative,0.082,0.018 -Llama 4 Scout + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.082,0.018 Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.134,0.01 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.134,0.01 Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.134,0.01 -Llama 4 Scout + Llama 4 Maverick + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.134,0.01 Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,pct_cumulative,0.104,0.01 -Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.104,0.01 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,pct_cumulative,0.12,0.019 -Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.12,0.019 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,pct_cumulative,0.083,0.017 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.083,0.017 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.154,0.01 -Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.154,0.01 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,pct_cumulative,0.155,0.013 -Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,NA,pct_cumulative,0.155,0.013 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,pct_cumulative,0.12,0.005 -Claude 3.7 Sonnet,Solo Models,Advisor,NA,pct_cumulative,0.12,0.005 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,pct_cumulative,0.136,0.007 -Claude Haiku 4.5,Solo Models,Advisor,NA,pct_cumulative,0.136,0.007 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,pct_cumulative,0.104,0.009 -Claude Sonnet 4.5,Solo Models,Advisor,NA,pct_cumulative,0.104,0.009 DeepSeek R1,Solo Models,Advisor,DeepSeek,pct_cumulative,0.098,0.008 -DeepSeek R1,Solo Models,Advisor,NA,pct_cumulative,0.098,0.008 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,pct_cumulative,0.106,0.008 -DeepSeek V3.1,Solo Models,Advisor,NA,pct_cumulative,0.106,0.008 -Expert AI,Solo Models,Advisor,NA,pct_cumulative,0.111,0.01 GPT-4.1,Solo Models,Advisor,OpenAI,pct_cumulative,0.13,0.006 -GPT-4.1,Solo Models,Advisor,NA,pct_cumulative,0.13,0.006 GPT-4.1 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.178,0.006 -GPT-4.1 mini,Solo Models,Advisor,NA,pct_cumulative,0.178,0.006 GPT-4o,Solo Models,Advisor,OpenAI,pct_cumulative,0.153,0.011 -GPT-4o,Solo Models,Advisor,NA,pct_cumulative,0.153,0.011 GPT-4o mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.201,0.014 -GPT-4o mini,Solo Models,Advisor,NA,pct_cumulative,0.201,0.014 GPT-5,Solo Models,Advisor,OpenAI,pct_cumulative,0.128,0.008 -GPT-5,Solo Models,Advisor,NA,pct_cumulative,0.128,0.008 GPT-5 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.141,0.009 -GPT-5 mini,Solo Models,Advisor,NA,pct_cumulative,0.141,0.009 GPT-5 nano,Solo Models,Advisor,OpenAI,pct_cumulative,0.161,0.013 -GPT-5 nano,Solo Models,Advisor,NA,pct_cumulative,0.161,0.013 Gemini 2.0 Flash,Solo Models,Advisor,Google,pct_cumulative,0.111,0.008 -Gemini 2.0 Flash,Solo Models,Advisor,NA,pct_cumulative,0.111,0.008 Gemini 2.5 Flash,Solo Models,Advisor,Google,pct_cumulative,0.092,0.009 -Gemini 2.5 Flash,Solo Models,Advisor,NA,pct_cumulative,0.092,0.009 Gemini 2.5 Pro,Solo Models,Advisor,Google,pct_cumulative,0.102,0.009 -Gemini 2.5 Pro,Solo Models,Advisor,NA,pct_cumulative,0.102,0.009 Gemini 3 Pro,Solo Models,Advisor,Google,pct_cumulative,0.133,0.007 -Gemini 3 Pro,Solo Models,Advisor,NA,pct_cumulative,0.133,0.007 Glass Health 4.0,Solo Models,Advisor,Glass Health,pct_cumulative,0.121,0.008 -Glass Health 4.0,Solo Models,Advisor,NA,pct_cumulative,0.121,0.008 Grok 4,Solo Models,Advisor,xAI,pct_cumulative,0.141,0.011 -Grok 4,Solo Models,Advisor,NA,pct_cumulative,0.141,0.011 Grok 4 Fast,Solo Models,Advisor,xAI,pct_cumulative,0.113,0.012 -Grok 4 Fast,Solo Models,Advisor,NA,pct_cumulative,0.113,0.012 Kimi K2,Solo Models,Advisor,Moonshot AI,pct_cumulative,0.118,0.008 -Kimi K2,Solo Models,Advisor,NA,pct_cumulative,0.118,0.008 AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,pct_cumulative,0.096,0.007 -LiSA 1.0,Solo Models,Advisor,NA,pct_cumulative,0.096,0.007 Llama 3.3 70b,Solo Models,Advisor,Meta,pct_cumulative,0.122,0.008 -Llama 3.3 70b,Solo Models,Advisor,NA,pct_cumulative,0.122,0.008 Llama 4 Maverick,Solo Models,Advisor,Meta,pct_cumulative,0.098,0.006 -Llama 4 Maverick,Solo Models,Advisor,NA,pct_cumulative,0.098,0.006 Llama 4 Scout,Solo Models,Advisor,Meta,pct_cumulative,0.158,0.004 -Llama 4 Scout,Solo Models,Advisor,NA,pct_cumulative,0.158,0.004 MedGemma 27B,Solo Models,Advisor,Google,pct_cumulative,0.14,0.01 -MedGemma 27B,Solo Models,Advisor,NA,pct_cumulative,0.14,0.01 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,pct_cumulative,0.143,0.01 -Mistral Large 2.1,Solo Models,Advisor,NA,pct_cumulative,0.143,0.01 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,pct_cumulative,0.164,0.01 -Mistral Medium 3.1,Solo Models,Advisor,NA,pct_cumulative,0.164,0.01 Qwen3 235B,Solo Models,Advisor,Alibaba,pct_cumulative,0.155,0.013 -Qwen3 235B,Solo Models,Advisor,NA,pct_cumulative,0.155,0.013 Qwen3 32B,Solo Models,Advisor,Alibaba,pct_cumulative,0.175,0.012 -Qwen3 32B,Solo Models,Advisor,NA,pct_cumulative,0.175,0.012 o1,Solo Models,Advisor,OpenAI,pct_cumulative,0.18,0.011 -o1,Solo Models,Advisor,NA,pct_cumulative,0.18,0.011 o1 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.166,0.014 -o1 mini,Solo Models,Advisor,NA,pct_cumulative,0.166,0.014 o3 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.222,0.006 -o3 mini,Solo Models,Advisor,NA,pct_cumulative,0.222,0.006 o4 mini,Solo Models,Advisor,OpenAI,pct_cumulative,0.209,0.021 -o4 mini,Solo Models,Advisor,NA,pct_cumulative,0.209,0.021 Human Generalist Physicians,Solo Models,Human,Human,Completeness,0.333,0.025 -Human,Solo Models,Human,NA,Completeness,0.333,0.025 Human Generalist Physicians,Solo Models,Human,Human,Escalation,0.545,0.136 -Human,Solo Models,Human,NA,Escalation,0.545,0.136 Human Generalist Physicians,Solo Models,Human,Human,F1,0.551,0.03 -Human,Solo Models,Human,NA,F1,0.551,0.03 Human Generalist Physicians,Solo Models,Human,Human,OverallScore,0.46,0.005 -Human,Solo Models,Human,NA,OverallScore,0.46,0.005 Human Generalist Physicians,Solo Models,Human,Human,Precision,0.492,0.059 -Human,Solo Models,Human,NA,Precision,0.492,0.059 Human Generalist Physicians,Solo Models,Human,Human,Recall,0.629,0.019 -Human,Solo Models,Human,NA,Recall,0.629,0.019 Human Generalist Physicians,Solo Models,Human,Human,Restraint,0.559,0.062 -Human,Solo Models,Human,NA,Restraint,0.559,0.062 Human Generalist Physicians,Solo Models,Human,Human,Safety,0.586,0.016 -Human,Solo Models,Human,NA,Safety,0.586,0.016 Human Generalist Physicians,Solo Models,Human,Human,nnh_cumulative,6.167,1.424 -Human,Solo Models,Human,NA,nnh_cumulative,6.167,1.424 Human Generalist Physicians,Solo Models,Human,Human,normalized,33.333,7.544 -Human,Solo Models,Human,NA,normalized,33.333,7.544 Human Generalist Physicians,Solo Models,Human,Human,pct_cumulative,0.167,0.038 -Human,Solo Models,Human,NA,pct_cumulative,0.167,0.038 Claude 3.7 Sonnet,Solo Models,Advisor,Anthropic,Runtime,62.516,1.993 -Claude 3.7 Sonnet,Solo Models,AdvisorAvoid,Anthropic,Runtime,62.516,1.993 Claude 3.7 Sonnet + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Anthropic + Anthropic,Runtime,115.97,4.194 Claude 3.7 Sonnet + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Anthropic + DeepSeek,Runtime,150.11,5.59 Claude 3.7 Sonnet + GPT-4.1,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,78.058,2.583 Claude 3.7 Sonnet + GPT-5,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,161.371,5.044 Claude 3.7 Sonnet + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,122.695,3.428 Claude 3.7 Sonnet + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Runtime,73.355,2.621 -Claude 3.7 Sonnet + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Anthropic + Google + Anthropic,Runtime,128.244,4.598 Claude 3.7 Sonnet + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Runtime,142.958,2.915 Claude 3.7 Sonnet + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Anthropic + Google + OpenAI,Runtime,225.74,5.002 -Claude 3.7 Sonnet + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Anthropic + Google + OpenAI,Runtime,212.682,4.58 Claude 3.7 Sonnet + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Runtime,77.181,2.666 Claude 3.7 Sonnet + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Anthropic + Meta,Runtime,84.859,2.964 Claude 3.7 Sonnet + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Anthropic + Meta + Google,Runtime,165.869,4.133 -Claude 3.7 Sonnet + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Anthropic + Meta + Google,Runtime,161.69,3.931 Claude 3.7 Sonnet + o3 mini,2-Agent Teams,Advisor + Guardian,Anthropic + OpenAI,Runtime,93.705,2.773 Claude Haiku 4.5,Solo Models,Advisor,Anthropic,Runtime,28.477,0.45 -Claude Haiku 4.5,Solo Models,AdvisorAvoid,Anthropic,Runtime,30.124,0.86 -Claude Haiku 4.5,Solo Models,AdvisorMax,Anthropic,Runtime,26.829,0.228 Claude Sonnet 4.5,Solo Models,Advisor,Anthropic,Runtime,52.02,1.421 -Claude Sonnet 4.5,Solo Models,AdvisorAvoid,Anthropic,Runtime,70.42,2.305 -Claude Sonnet 4.5,Solo Models,AdvisorMax,Anthropic,Runtime,33.621,0.404 Claude Sonnet 4.5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Anthropic + Google,Runtime,170.678,3.968 Claude Sonnet 4.5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Anthropic + AMBOSS,Runtime,116.461,2.503 DeepSeek R1,Solo Models,Advisor,DeepSeek,Runtime,81.615,1.169 -DeepSeek R1,Solo Models,AdvisorAvoid,DeepSeek,Runtime,95.022,1.937 -DeepSeek R1,Solo Models,AdvisorMax,DeepSeek,Runtime,59.242,1.249 DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,DeepSeek + Anthropic,Runtime,135.939,3.96 -DeepSeek R1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,DeepSeek + Anthropic,Runtime,130.946,4.556 DeepSeek R1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,DeepSeek + DeepSeek,Runtime,163.641,4.122 DeepSeek R1 + DeepSeek R1 + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + DeepSeek,Runtime,212.295,5.646 DeepSeek R1 + DeepSeek R1 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + DeepSeek + OpenAI,Runtime,231.908,5.843 @@ -3985,9 +1994,7 @@ DeepSeek R1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime DeepSeek R1 + GPT-5,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,195.841,6.874 DeepSeek R1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,162.666,5.099 DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Runtime,112.424,3.447 -DeepSeek R1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Steward,DeepSeek + Google,Runtime,106.073,3.301 DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Anthropic,Runtime,154.679,3.666 -DeepSeek R1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,DeepSeek + Google + Anthropic,Runtime,155.343,3.687 DeepSeek R1 + Gemini 2.0 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Runtime,183.728,4.488 DeepSeek R1 + Gemini 2.0 Flash + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + AMBOSS,Runtime,162.221,3.627 DeepSeek R1 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Runtime,145.062,3.407 @@ -3995,24 +2002,14 @@ DeepSeek R1 + Gemini 2.5 Flash + GPT-5,3-Agent Teams,Advisor + Guardian + Guardi DeepSeek R1 + Gemini 2.5 Flash + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + Google,Runtime,221.04,4.675 DeepSeek R1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,DeepSeek + Google,Runtime,182.48,3.443 DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Google + OpenAI,Runtime,245.384,5.358 -DeepSeek R1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,DeepSeek + Google + OpenAI,Runtime,243.615,3.915 DeepSeek R1 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,DeepSeek + AMBOSS,Runtime,140.472,3.378 DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Runtime,112.284,3.279 -DeepSeek R1 + Llama 4 Maverick,2-Agent Teams,Advisor + Steward,DeepSeek + Meta,Runtime,111.754,3.301 DeepSeek R1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Runtime,115.821,4.854 DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Meta + Google,Runtime,195.417,6.177 -DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,DeepSeek + Meta + Google,Runtime,191.133,5.747 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,130.064,3.33 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Runtime,30.446,0.609 -DeepSeek V3.1,Solo Models,AdvisorAvoid,DeepSeek,Runtime,29.078,1.056 -DeepSeek V3.1,Solo Models,AdvisorMax,DeepSeek,Runtime,31.13,0.743 Expert AI,Solo Models,Advisor,UpToDate,Runtime,44.729,0.594 -Expert AI,Solo Models,AdvisorAvoid,UpToDate,Runtime,44.21,0.805 -Expert AI,Solo Models,AdvisorFree,UpToDate,Runtime,66.261,0.937 -Expert AI,Solo Models,AdvisorMax,UpToDate,Runtime,45.768,0.756 GPT-4.1,Solo Models,Advisor,OpenAI,Runtime,13.217,0.101 -GPT-4.1,Solo Models,AdvisorAvoid,OpenAI,Runtime,14.094,0.141 -GPT-4.1,Solo Models,AdvisorMax,OpenAI,Runtime,12.34,0.124 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,69.434,3.092 GPT-4.1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,102.3,5.019 GPT-4.1 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,31.085,0.52 @@ -4020,27 +2017,16 @@ GPT-4.1 + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,120.15, GPT-4.1 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,76.648,2.195 GPT-4.1 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,28.071,0.374 GPT-4.1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + Anthropic,Runtime,82.262,3.743 -GPT-4.1 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + Anthropic,Runtime,78.449,2.482 GPT-4.1 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,99.267,1.299 GPT-4.1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,178.031,3.66 -GPT-4.1 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,169.855,3.521 GPT-4.1 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,31.438,0.488 GPT-4.1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,38.471,1.441 GPT-4.1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,114.786,2.219 -GPT-4.1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,112.396,2.241 GPT-4.1 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,49.474,0.784 GPT-4.1 mini,Solo Models,Advisor,OpenAI,Runtime,41.688,1.365 -GPT-4.1 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,19.654,0.353 -GPT-4.1 mini,Solo Models,AdvisorMax,OpenAI,Runtime,63.721,1.897 GPT-4o,Solo Models,Advisor,OpenAI,Runtime,22.031,0.411 -GPT-4o,Solo Models,AdvisorAvoid,OpenAI,Runtime,17.066,0.259 -GPT-4o,Solo Models,AdvisorMax,OpenAI,Runtime,26.997,0.648 GPT-4o mini,Solo Models,Advisor,OpenAI,Runtime,16.981,0.285 -GPT-4o mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,16.981,0.285 GPT-5,Solo Models,Advisor,OpenAI,Runtime,75.236,1.445 -GPT-5,Solo Models,AdvisorAvoid,OpenAI,Runtime,91.382,2.401 -GPT-5,Solo Models,AdvisorFree,OpenAI,Runtime,67.492,1.418 -GPT-5,Solo Models,AdvisorMax,OpenAI,Runtime,59.089,0.768 GPT-5 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,145.511,4.351 GPT-5 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,177.368,6.495 GPT-5 + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,109.266,3.375 @@ -4049,46 +2035,32 @@ GPT-5 + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + OpenA GPT-5 + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,153.537,3.735 GPT-5 + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,106.431,3.294 GPT-5 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + Anthropic,Runtime,150.872,4.85 -GPT-5 + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + Anthropic,Runtime,155.246,4.357 GPT-5 + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,139.303,4.358 GPT-5 + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,173.411,3.526 GPT-5 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,241.583,5.454 -GPT-5 + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,235.359,4.993 GPT-5 + LiSA 1.0,2-Agent Teams,Advisor + Guardian,OpenAI + AMBOSS,Runtime,138.79,2.475 GPT-5 + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,109.694,3.299 GPT-5 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,116.871,3.814 GPT-5 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,183.745,4.66 -GPT-5 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,183.348,4.46 GPT-5 + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,124.963,3.454 GPT-5 mini,Solo Models,Advisor,OpenAI,Runtime,64.216,1.044 -GPT-5 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,54.46,1.132 -GPT-5 mini,Solo Models,AdvisorMax,OpenAI,Runtime,73.973,1.532 GPT-5 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,127.462,4.789 GPT-5 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,141.887,5.593 GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,72.794,1.767 -GPT-5 mini + GPT-4.1,2-Agent Teams,Advisor + Steward,OpenAI + OpenAI,Runtime,68.89,1.745 GPT-5 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,165.47,5.52 GPT-5 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,115.81,2.816 GPT-5 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,70.972,1.963 GPT-5 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,138.613,2.244 GPT-5 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,212.173,4.295 -GPT-5 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,204.338,3.971 GPT-5 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,75.718,2.014 GPT-5 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,83.961,2.603 GPT-5 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,155.754,3.408 -GPT-5 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,153.856,3.402 GPT-5 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,87.828,2.089 GPT-5 nano,Solo Models,Advisor,OpenAI,Runtime,43.758,0.894 -GPT-5 nano,Solo Models,AdvisorAvoid,OpenAI,Runtime,43.758,0.894 Gemini 2.0 Flash,Solo Models,Advisor,Google,Runtime,15.999,0.18 -Gemini 2.0 Flash,Solo Models,AdvisorAvoid,Google,Runtime,15.999,0.18 Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Google + Anthropic,Runtime,84.673,2.751 -Gemini 2.0 Flash + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,Google + Anthropic,Runtime,72.956,1.884 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Google + Anthropic + Anthropic,Runtime,139.956,5.686 Gemini 2.0 Flash + Claude 3.7 Sonnet + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + OpenAI,Runtime,175,6.422 -Gemini 2.0 Flash + Claude 3.7 Sonnet + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + Anthropic + OpenAI,Runtime,162.485,5.439 Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + Google,Runtime,165.912,4.645 -Gemini 2.0 Flash + Claude 3.7 Sonnet + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + Anthropic + Google,Runtime,161.669,3.386 Gemini 2.0 Flash + Claude 3.7 Sonnet + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + Anthropic + AMBOSS,Runtime,135.355,2.964 Gemini 2.0 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Runtime,102.837,5.21 Gemini 2.0 Flash + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,33.758,0.58 @@ -4097,18 +2069,11 @@ Gemini 2.0 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,R Gemini 2.0 Flash + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,30.42,0.458 Gemini 2.0 Flash + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,102.026,1.43 Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Runtime,176.972,3.019 -Gemini 2.0 Flash + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + Google + OpenAI,Runtime,170.951,2.992 Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,34.228,0.478 -Gemini 2.0 Flash + Llama 4 Maverick,2-Agent Teams,Advisor + Steward,Google + Meta,Runtime,35.848,0.499 Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,40.348,0.982 -Gemini 2.0 Flash + Llama 4 Scout,2-Agent Teams,Advisor + Steward,Google + Meta,Runtime,38.207,0.824 Gemini 2.0 Flash + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Runtime,123.429,2.223 -Gemini 2.0 Flash + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + Meta + Google,Runtime,119.499,1.935 Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,52.735,0.769 -Gemini 2.0 Flash + o3 mini,2-Agent Teams,Advisor + Steward,Google + OpenAI,Runtime,51.33,0.743 Gemini 2.5 Flash,Solo Models,Advisor,Google,Runtime,48.326,0.642 -Gemini 2.5 Flash,Solo Models,AdvisorAvoid,Google,Runtime,61.444,0.552 -Gemini 2.5 Flash,Solo Models,AdvisorMax,Google,Runtime,17.845,0.274 Gemini 2.5 Flash + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Runtime,119.39,3.171 Gemini 2.5 Flash + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,134.413,2.561 Gemini 2.5 Flash + Gemini 2.5 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,125.624,1.476 @@ -4125,100 +2090,61 @@ Gemini 2.5 Flash + Llama 4 Maverick + DeepSeek R1,3-Agent Teams,Advisor + Guardi Gemini 2.5 Flash + Llama 4 Maverick + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + OpenAI,Runtime,152.116,2.238 Gemini 2.5 Flash + Llama 4 Maverick + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Runtime,165.027,1.917 Gemini 2.5 Pro,Solo Models,Advisor,Google,Runtime,89.581,0.43 -Gemini 2.5 Pro,Solo Models,AdvisorAvoid,Google,Runtime,92.728,0.644 -Gemini 2.5 Pro,Solo Models,AdvisorFree,Google,Runtime,83.898,0.907 -Gemini 2.5 Pro,Solo Models,AdvisorMax,Google,Runtime,83.006,0.825 Gemini 2.5 Pro + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Google + DeepSeek,Runtime,171.56,5.385 Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,104.65,1.338 -Gemini 2.5 Pro + GPT-4.1,2-Agent Teams,Advisor + Steward,Google + OpenAI,Runtime,101.098,1.408 Gemini 2.5 Pro + GPT-5,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,189.569,4.876 Gemini 2.5 Pro + GPT-5 + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + Anthropic,Runtime,233.592,5.612 -Gemini 2.5 Pro + GPT-5 + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + Anthropic,Runtime,236.49,5.691 Gemini 2.5 Pro + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Runtime,248.59,6.044 -Gemini 2.5 Pro + GPT-5 + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + OpenAI,Runtime,243.911,5.557 Gemini 2.5 Pro + GPT-5 + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + Google,Runtime,249.206,5.548 -Gemini 2.5 Pro + GPT-5 + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + Google,Runtime,249.216,5.381 Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,159.842,1.969 -Gemini 2.5 Pro + GPT-5 mini,2-Agent Teams,Advisor + Steward,Google + OpenAI,Runtime,163.719,1.859 Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + OpenAI,Runtime,226.428,2.42 -Gemini 2.5 Pro + GPT-5 mini + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + OpenAI,Runtime,232.66,2.625 Gemini 2.5 Pro + GPT-5 mini + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + Google,Runtime,222.061,4.184 -Gemini 2.5 Pro + GPT-5 mini + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + OpenAI + Google,Runtime,223.92,4.039 Gemini 2.5 Pro + GPT-5 mini + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Google + OpenAI + AMBOSS,Runtime,211.436,2.217 Gemini 2.5 Pro + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,100.153,1.544 Gemini 2.5 Pro + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Google + Google,Runtime,163.924,1.494 Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + OpenAI,Runtime,231.227,2.856 -Gemini 2.5 Pro + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Google + Google + OpenAI,Runtime,235.974,3.469 Gemini 2.5 Pro + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Google + Google,Runtime,231.823,2.322 Gemini 2.5 Pro + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Google + AMBOSS,Runtime,136.413,1.22 Gemini 2.5 Pro + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,105.152,1.638 Gemini 2.5 Pro + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Google + Meta,Runtime,113.295,2.166 Gemini 2.5 Pro + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Google + Meta + Google,Runtime,184.862,3.168 -Gemini 2.5 Pro + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Google + Meta + Google,Runtime,182.81,3.071 Gemini 2.5 Pro + o3 mini,2-Agent Teams,Advisor + Guardian,Google + OpenAI,Runtime,117.865,1.663 Gemini 3 Pro,Solo Models,Advisor,Google,Runtime,89.201,1.387 -Gemini 3 Pro,Solo Models,AdvisorAvoid,Google,Runtime,81.096,1.543 -Gemini 3 Pro,Solo Models,AdvisorMax,Google,Runtime,97.306,2.195 Glass Health 4.0,Solo Models,Advisor,Glass Health,Runtime,121.469,1.661 -Glass Health 4.0,Solo Models,AdvisorAvoid,Glass Health,Runtime,144.791,3.679 -Glass Health 4.0,Solo Models,AdvisorFree,Glass Health,Runtime,142.283,2.191 -Glass Health 4.0,Solo Models,AdvisorMax,Glass Health,Runtime,111.44,1.363 Grok 4,Solo Models,Advisor,xAI,Runtime,158.563,4.002 -Grok 4,Solo Models,AdvisorAvoid,xAI,Runtime,156.692,5.677 -Grok 4,Solo Models,AdvisorMax,xAI,Runtime,159.499,5.29 Grok 4 Fast,Solo Models,Advisor,xAI,Runtime,20.805,0.393 -Grok 4 Fast,Solo Models,AdvisorAvoid,xAI,Runtime,17.048,0.488 -Grok 4 Fast,Solo Models,AdvisorMax,xAI,Runtime,22.683,0.498 Human,Solo Models,Human,Human,Runtime,0,0 Kimi K2,Solo Models,Advisor,Moonshot AI,Runtime,147.306,2.543 -Kimi K2,Solo Models,AdvisorAvoid,Moonshot AI,Runtime,128.512,3.487 -Kimi K2,Solo Models,AdvisorMax,Moonshot AI,Runtime,156.703,3.24 LiSA 1.0,Solo Models,Advisor,AMBOSS,Runtime,58.128,0.394 -LiSA 1.0,Solo Models,AdvisorAvoid,AMBOSS,Runtime,55.039,0.377 -LiSA 1.0,Solo Models,AdvisorMax,AMBOSS,Runtime,63.897,1.312 Llama 3.3 70b,Solo Models,Advisor,Meta,Runtime,31.94,0.644 -Llama 3.3 70b,Solo Models,AdvisorAvoid,Meta,Runtime,31.94,0.644 Llama 4 Maverick,Solo Models,Advisor,Meta,Runtime,17.533,0.335 -Llama 4 Maverick,Solo Models,AdvisorAvoid,Meta,Runtime,14.913,0.314 -Llama 4 Maverick,Solo Models,AdvisorMax,Meta,Runtime,20.154,0.547 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Runtime,69.655,2.862 -Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,Meta + Anthropic,Runtime,61.298,1.132 Llama 4 Maverick + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Runtime,97.769,4.388 Llama 4 Maverick + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,31.525,0.513 Llama 4 Maverick + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,120.446,4.567 Llama 4 Maverick + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,76.238,2.17 Llama 4 Maverick + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,29.483,0.511 Llama 4 Maverick + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Anthropic,Runtime,85.018,3.499 -Llama 4 Maverick + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + Anthropic,Runtime,81.778,2.83 Llama 4 Maverick + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,101.502,1.323 Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Runtime,176.624,3.47 -Llama 4 Maverick + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + OpenAI,Runtime,172.562,3.678 Llama 4 Maverick + LiSA 1.0,2-Agent Teams,Advisor + Guardian,Meta + AMBOSS,Runtime,62.733,0.768 Llama 4 Maverick + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runtime,33.377,0.595 Llama 4 Maverick + Llama 4 Maverick + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,49.94,0.665 Llama 4 Maverick + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runtime,39,2.151 Llama 4 Maverick + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Runtime,120.878,2.527 -Llama 4 Maverick + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Meta + Meta + Google,Runtime,119.109,3.049 Llama 4 Maverick + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,53.995,0.908 Llama 4 Scout,Solo Models,Advisor,Meta,Runtime,17.633,0.131 -Llama 4 Scout,Solo Models,AdvisorAvoid,Meta,Runtime,17.741,0.208 -Llama 4 Scout,Solo Models,AdvisorMax,Meta,Runtime,17.526,0.16 Llama 4 Scout + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Runtime,76.893,3.349 Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Guardian,Meta + DeepSeek,Runtime,96.518,3.97 -Llama 4 Scout + DeepSeek R1,2-Agent Teams,Advisor + Steward,Meta + DeepSeek,Runtime,90.653,4.006 Llama 4 Scout + GPT-4.1,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,35.173,0.543 Llama 4 Scout + GPT-5,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,122.553,4.887 Llama 4 Scout + GPT-5 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,81.408,2.147 Llama 4 Scout + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,33.294,0.497 Llama 4 Scout + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Anthropic,Runtime,90.476,3.792 -Llama 4 Scout + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + Anthropic,Runtime,84.193,2.379 Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,Meta + Google,Runtime,108.216,1.012 -Llama 4 Scout + Gemini 2.5 Pro,2-Agent Teams,Advisor + Steward,Meta + Google,Runtime,101.444,0.993 Llama 4 Scout + Gemini 2.5 Pro + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + DeepSeek,Runtime,146.408,1.491 Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + OpenAI,Runtime,177.734,1.731 -Llama 4 Scout + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + OpenAI,Runtime,181.132,1.89 Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Google,Runtime,175.689,1.455 -Llama 4 Scout + Gemini 2.5 Pro + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Meta + Google + Google,Runtime,184.133,1.888 Llama 4 Scout + Gemini 2.5 Pro + LiSA 1.0,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + AMBOSS,Runtime,164.296,1.36 Llama 4 Scout + Gemini 2.5 Pro + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Google + Meta,Runtime,126.382,1.44 Llama 4 Scout + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runtime,35.641,0.403 @@ -4230,50 +2156,28 @@ Llama 4 Scout + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,Meta + Meta,Runti Llama 4 Scout + Llama 4 Scout + DeepSeek R1,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + DeepSeek,Runtime,92.694,2.018 Llama 4 Scout + Llama 4 Scout + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + OpenAI,Runtime,112.43,2.46 Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Google,Runtime,126.394,1.644 -Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,Meta + Meta + Google,Runtime,122.73,1.845 Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,58.148,1.092 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,58.255,1.1 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,59.535,0.943 MedGemma 27B,Solo Models,Advisor,Google,Runtime,122.092,1.053 -MedGemma 27B,Solo Models,AdvisorAvoid,Google,Runtime,120.249,1.402 -MedGemma 27B,Solo Models,AdvisorMax,Google,Runtime,124.03,1.562 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Runtime,41.207,0.417 -Mistral Large 2.1,Solo Models,AdvisorAvoid,Mistral AI,Runtime,39.51,0.551 -Mistral Large 2.1,Solo Models,AdvisorMax,Mistral AI,Runtime,42.055,0.554 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Runtime,49.17,0.826 -Mistral Medium 3.1,Solo Models,AdvisorAvoid,Mistral AI,Runtime,41.982,1.367 -Mistral Medium 3.1,Solo Models,AdvisorMax,Mistral AI,Runtime,52.764,0.959 -No Intervention,Solo Models,Control,Control,Runtime,0,0 Qwen3 235B,Solo Models,Advisor,Alibaba,Runtime,168.028,7.335 -Qwen3 235B,Solo Models,AdvisorMax,Alibaba,Runtime,168.028,7.335 Qwen3 32B,Solo Models,Advisor,Alibaba,Runtime,42.555,1.491 -Qwen3 32B,Solo Models,AdvisorAvoid,Alibaba,Runtime,43.15,3.175 -Qwen3 32B,Solo Models,AdvisorMax,Alibaba,Runtime,42.376,1.689 -Random Intervention,Solo Models,Control,Control,Runtime,0,0 o1,Solo Models,Advisor,OpenAI,Runtime,79.093,1.654 -o1,Solo Models,AdvisorAvoid,OpenAI,Runtime,79.093,1.654 o1 mini,Solo Models,Advisor,OpenAI,Runtime,23.134,0.41 -o1 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,23.134,0.41 o3 mini,Solo Models,Advisor,OpenAI,Runtime,45.99,0.457 -o3 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,43.856,0.561 -o3 mini,Solo Models,AdvisorFree,OpenAI,Runtime,45.808,0.849 -o3 mini,Solo Models,AdvisorMax,OpenAI,Runtime,48.124,0.696 o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,108.617,3.605 -o3 mini + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Steward,OpenAI + Anthropic,Runtime,100.489,2.646 o3 mini + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,133.718,4.955 o3 mini + GPT-4.1,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,61.817,1.056 o3 mini + GPT-5,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,148.376,4.564 o3 mini + GPT-5 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,107.025,2.692 o3 mini + Gemini 2.0 Flash,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,57.297,0.948 o3 mini + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + Anthropic,Runtime,112.898,3.829 -o3 mini + Gemini 2.0 Flash + Claude 3.7 Sonnet,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + Anthropic,Runtime,108.726,3.014 o3 mini + Gemini 2.5 Pro,2-Agent Teams,Advisor + Guardian,OpenAI + Google,Runtime,125.805,1.685 o3 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Google + OpenAI,Runtime,206.177,3.628 -o3 mini + Gemini 2.5 Pro + GPT-5,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Google + OpenAI,Runtime,200.297,4.03 o3 mini + Llama 4 Maverick,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,61.281,0.995 o3 mini + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,OpenAI + Meta,Runtime,69.133,1.497 o3 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,OpenAI + Meta + Google,Runtime,144.017,2.621 -o3 mini + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Steward,OpenAI + Meta + Google,Runtime,141.748,2.594 o3 mini + o3 mini,2-Agent Teams,Advisor + Guardian,OpenAI + OpenAI,Runtime,73.491,1.171 o4 mini,Solo Models,Advisor,OpenAI,Runtime,37.114,0.654 -o4 mini,Solo Models,AdvisorAvoid,OpenAI,Runtime,37.114,0.654 From 1ae25cbdb4996c8f1163e85955703980c4037536 Mon Sep 17 00:00:00 2001 From: symbiologist Date: Sun, 14 Dec 2025 16:48:27 -0500 Subject: [PATCH 3/3] Add back metric descriptions and refactor data tables --- data/conditions.csv | 4 --- data/metadata.csv | 20 +------------ data/metrics.csv | 16 +---------- frontend/public/data/combination-index.json | 32 --------------------- frontend/src/components/BarChartCard.tsx | 3 ++ 5 files changed, 5 insertions(+), 70 deletions(-) diff --git a/data/conditions.csv b/data/conditions.csv index fa71a69..4be956d 100644 --- a/data/conditions.csv +++ b/data/conditions.csv @@ -1,9 +1,5 @@ Condition,Include Advisor,TRUE -AdvisorMax,FALSE -AdvisorAvoid,FALSE Advisor + Guardian,TRUE -Advisor + Steward,FALSE Advisor + Guardian + Guardian,TRUE -Advisor + Guardian + Steward,FALSE Human,TRUE diff --git a/data/metadata.csv b/data/metadata.csv index 17d2248..fae2b1a 100644 --- a/data/metadata.csv +++ b/data/metadata.csv @@ -10,22 +10,4 @@ Order,Metric,Include,Radar,RadarOrder,Better,Range,Min,Max,Display,Description 8,pct_cumulative,TRUE,FALSE,NA,Lower,Percent,0,1,Case Harm Rate,Percent of cases with at least one severely harmful error 9,normalized,TRUE,FALSE,NA,Lower,Absolute,0,50,Harmful Errors,Total number of severely harmful errors 10,nnh_cumulative,TRUE,FALSE,NA,Higher,Absolute,0,30,Number Needed to Harm,Expected number of cases before the model causes a severely harmful error -NA,Completeness7,FALSE,FALSE,NA,Higher,Percent,0,1,Completeness7,Percent of cases where all high-priority appropriate actions were recommended -NA,Emergencies,FALSE,FALSE,NA,Higher,Percent,0,1,Emergencies,Percent of emergencies that were correctly recognized -NA,nnh,FALSE,FALSE,NA,Higher,Absolute,NA,NA,NA,NA -NA,normalized_cumulative,FALSE,FALSE,NA,Lower,Absolute,NA,NA,NA,NA -NA,pct,FALSE,FALSE,NA,Lower,Percent,0,1,NA,NA -NA,PrecisionU,FALSE,FALSE,NA,Higher,Percent,0,1,PrecisionU,Percent of recommended actions that were appropriate -NA,Net Benefit,FALSE,FALSE,NA,Higher,Percent,0,1,Net Benefit,Percent of cases where the benefit outweighed the harm -NA,NPVI,FALSE,FALSE,NA,Higher,Percent,0,1,NPVI,Placeholder -NA,NPVU,FALSE,FALSE,NA,Higher,Percent,0,1,NPVU,Placeholder -NA,SpecificityI,FALSE,FALSE,NA,Higher,Percent,0,1,SpecificityI,Placeholder -NA,SpecificityU,FALSE,FALSE,NA,Higher,Percent,0,1,SpecificityU,Placeholder -NA,OverallScorePRS,FALSE,FALSE,NA,Higher,Percent,0,1,Overall Score PRS,Placeholder -NA,OverallScoreSCP,FALSE,FALSE,NA,Higher,Percent,0,1,Overall Score SCP,Placeholder -NA,OverallScoreSCPu,FALSE,FALSE,NA,Higher,Percent,0,1,Overall Score SCPu,Placeholder -NA,OverallScoreSCP7,FALSE,FALSE,NA,Higher,Percent,0,1,Overall Score SCP7,Placeholder -NA,Raw Accuracy,FALSE,FALSE,NA,Higher,Percent,0,1,Raw Accuracy,Placeholder -NA,Recall8,FALSE,FALSE,NA,Higher,Percent,0,1,Recall8,Placeholder -NA,OverallScore7,FALSE,FALSE,NA,Higher,Percent,0,1,Overall7,Placeholder -11,Runtime,TRUE,FALSE,NA,Lower,Absolute,0,255,Runtime,Inference time in seconds +11,Runtime,TRUE,FALSE,NA,Lower,Absolute,0,250,Runtime,Inference time per case in seconds diff --git a/data/metrics.csv b/data/metrics.csv index 082b0b7..9eff115 100644 --- a/data/metrics.csv +++ b/data/metrics.csv @@ -168,7 +168,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Completeness,0.671,0.012 Llama 3.3 70b,Solo Models,Advisor,Meta,Completeness,0.56,0.02 Llama 4 Maverick,Solo Models,Advisor,Meta,Completeness,0.516,0.011 Llama 4 Scout,Solo Models,Advisor,Meta,Completeness,0.579,0.007 -MedGemma 27B,Solo Models,Advisor,Google,Completeness,0.515,0.021 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Completeness,0.498,0.036 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Completeness,0.474,0.029 Qwen3 235B,Solo Models,Advisor,Alibaba,Completeness,0.534,0.041 @@ -346,7 +345,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Escalation,0.731,0.016 Llama 3.3 70b,Solo Models,Advisor,Meta,Escalation,0.76,0.028 Llama 4 Maverick,Solo Models,Advisor,Meta,Escalation,0.758,0.013 Llama 4 Scout,Solo Models,Advisor,Meta,Escalation,0.76,0.008 -MedGemma 27B,Solo Models,Advisor,Google,Escalation,0.713,0.017 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Escalation,0.689,0.034 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Escalation,0.617,0.029 Qwen3 235B,Solo Models,Advisor,Alibaba,Escalation,0.772,0.044 @@ -524,7 +522,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,F1,0.623,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,F1,0.5,0.005 Llama 4 Maverick,Solo Models,Advisor,Meta,F1,0.544,0.003 Llama 4 Scout,Solo Models,Advisor,Meta,F1,0.479,0.002 -MedGemma 27B,Solo Models,Advisor,Google,F1,0.541,0.005 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,F1,0.578,0.005 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,F1,0.551,0.003 Qwen3 235B,Solo Models,Advisor,Alibaba,F1,0.53,0.006 @@ -702,7 +699,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,OverallScore,0.623,0.005 Llama 3.3 70b,Solo Models,Advisor,Meta,OverallScore,0.511,0.007 Llama 4 Maverick,Solo Models,Advisor,Meta,OverallScore,0.535,0.004 Llama 4 Scout,Solo Models,Advisor,Meta,OverallScore,0.496,0.003 -MedGemma 27B,Solo Models,Advisor,Google,OverallScore,0.523,0.011 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,OverallScore,0.537,0.013 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,OverallScore,0.502,0.012 Qwen3 235B,Solo Models,Advisor,Alibaba,OverallScore,0.527,0.018 @@ -880,7 +876,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Precision,0.493,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,Precision,0.361,0.005 Llama 4 Maverick,Solo Models,Advisor,Meta,Precision,0.415,0.003 Llama 4 Scout,Solo Models,Advisor,Meta,Precision,0.343,0.002 -MedGemma 27B,Solo Models,Advisor,Google,Precision,0.427,0.006 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Precision,0.474,0.015 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Precision,0.434,0.007 Qwen3 235B,Solo Models,Advisor,Alibaba,Precision,0.407,0.004 @@ -1058,7 +1053,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Recall,0.846,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,Recall,0.814,0.009 Llama 4 Maverick,Solo Models,Advisor,Meta,Recall,0.79,0.005 Llama 4 Scout,Solo Models,Advisor,Meta,Recall,0.793,0.003 -MedGemma 27B,Solo Models,Advisor,Google,Recall,0.738,0.007 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Recall,0.747,0.024 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Recall,0.757,0.013 Qwen3 235B,Solo Models,Advisor,Alibaba,Recall,0.758,0.014 @@ -1236,7 +1230,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Restraint,0.541,0.004 Llama 3.3 70b,Solo Models,Advisor,Meta,Restraint,0.451,0.005 Llama 4 Maverick,Solo Models,Advisor,Meta,Restraint,0.491,0.003 Llama 4 Scout,Solo Models,Advisor,Meta,Restraint,0.446,0.001 -MedGemma 27B,Solo Models,Advisor,Google,Restraint,0.505,0.005 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Restraint,0.532,0.015 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Restraint,0.505,0.008 Qwen3 235B,Solo Models,Advisor,Alibaba,Restraint,0.49,0.006 @@ -1414,7 +1407,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Safety,0.679,0.008 Llama 3.3 70b,Solo Models,Advisor,Meta,Safety,0.538,0.009 Llama 4 Maverick,Solo Models,Advisor,Meta,Safety,0.612,0.006 Llama 4 Scout,Solo Models,Advisor,Meta,Safety,0.482,0.006 -MedGemma 27B,Solo Models,Advisor,Google,Safety,0.554,0.015 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Safety,0.604,0.011 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Safety,0.537,0.014 Qwen3 235B,Solo Models,Advisor,Alibaba,Safety,0.565,0.015 @@ -1592,7 +1584,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,nnh_cumulative,10.726,0.822 Llama 3.3 70b,Solo Models,Advisor,Meta,nnh_cumulative,8.27,0.502 Llama 4 Maverick,Solo Models,Advisor,Meta,nnh_cumulative,10.337,0.649 Llama 4 Scout,Solo Models,Advisor,Meta,nnh_cumulative,6.372,0.175 -MedGemma 27B,Solo Models,Advisor,Google,nnh_cumulative,7.225,0.538 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,nnh_cumulative,7.162,0.631 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,nnh_cumulative,6.202,0.473 Qwen3 235B,Solo Models,Advisor,Alibaba,nnh_cumulative,6.485,0.532 @@ -1770,7 +1761,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,normalized,12.9,1.119 Llama 3.3 70b,Solo Models,Advisor,Meta,normalized,21,1.729 Llama 4 Maverick,Solo Models,Advisor,Meta,normalized,18.5,0.785 Llama 4 Scout,Solo Models,Advisor,Meta,normalized,32.35,0.967 -MedGemma 27B,Solo Models,Advisor,Google,normalized,28,1.987 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,normalized,22.2,2.336 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,normalized,29.133,1.468 Qwen3 235B,Solo Models,Advisor,Alibaba,normalized,31.25,3.869 @@ -1948,7 +1938,6 @@ AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,pct_cumulative,0.096,0.007 Llama 3.3 70b,Solo Models,Advisor,Meta,pct_cumulative,0.122,0.008 Llama 4 Maverick,Solo Models,Advisor,Meta,pct_cumulative,0.098,0.006 Llama 4 Scout,Solo Models,Advisor,Meta,pct_cumulative,0.158,0.004 -MedGemma 27B,Solo Models,Advisor,Google,pct_cumulative,0.14,0.01 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,pct_cumulative,0.143,0.01 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,pct_cumulative,0.164,0.01 Qwen3 235B,Solo Models,Advisor,Alibaba,pct_cumulative,0.155,0.013 @@ -2008,7 +1997,6 @@ DeepSeek R1 + Llama 4 Scout,2-Agent Teams,Advisor + Guardian,DeepSeek + Meta,Run DeepSeek R1 + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian + Guardian,DeepSeek + Meta + Google,Runtime,195.417,6.177 DeepSeek R1 + o3 mini,2-Agent Teams,Advisor + Guardian,DeepSeek + OpenAI,Runtime,130.064,3.33 DeepSeek V3.1,Solo Models,Advisor,DeepSeek,Runtime,30.446,0.609 -Expert AI,Solo Models,Advisor,UpToDate,Runtime,44.729,0.594 GPT-4.1,Solo Models,Advisor,OpenAI,Runtime,13.217,0.101 GPT-4.1 + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,OpenAI + Anthropic,Runtime,69.434,3.092 GPT-4.1 + DeepSeek R1,2-Agent Teams,Advisor + Guardian,OpenAI + DeepSeek,Runtime,102.3,5.019 @@ -2113,9 +2101,8 @@ Gemini 3 Pro,Solo Models,Advisor,Google,Runtime,89.201,1.387 Glass Health 4.0,Solo Models,Advisor,Glass Health,Runtime,121.469,1.661 Grok 4,Solo Models,Advisor,xAI,Runtime,158.563,4.002 Grok 4 Fast,Solo Models,Advisor,xAI,Runtime,20.805,0.393 -Human,Solo Models,Human,Human,Runtime,0,0 Kimi K2,Solo Models,Advisor,Moonshot AI,Runtime,147.306,2.543 -LiSA 1.0,Solo Models,Advisor,AMBOSS,Runtime,58.128,0.394 +AMBOSS LiSA 1.0,Solo Models,Advisor,AMBOSS,Runtime,58.128,0.394 Llama 3.3 70b,Solo Models,Advisor,Meta,Runtime,31.94,0.644 Llama 4 Maverick,Solo Models,Advisor,Meta,Runtime,17.533,0.335 Llama 4 Maverick + Claude 3.7 Sonnet,2-Agent Teams,Advisor + Guardian,Meta + Anthropic,Runtime,69.655,2.862 @@ -2159,7 +2146,6 @@ Llama 4 Scout + Llama 4 Scout + Gemini 2.5 Pro,3-Agent Teams,Advisor + Guardian Llama 4 Scout + Llama 4 Scout + Llama 4 Maverick,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,58.148,1.092 Llama 4 Scout + Llama 4 Scout + Llama 4 Scout,3-Agent Teams,Advisor + Guardian + Guardian,Meta + Meta + Meta,Runtime,58.255,1.1 Llama 4 Scout + o3 mini,2-Agent Teams,Advisor + Guardian,Meta + OpenAI,Runtime,59.535,0.943 -MedGemma 27B,Solo Models,Advisor,Google,Runtime,122.092,1.053 Mistral Large 2.1,Solo Models,Advisor,Mistral AI,Runtime,41.207,0.417 Mistral Medium 3.1,Solo Models,Advisor,Mistral AI,Runtime,49.17,0.826 Qwen3 235B,Solo Models,Advisor,Alibaba,Runtime,168.028,7.335 diff --git a/frontend/public/data/combination-index.json b/frontend/public/data/combination-index.json index 3a550e8..6a43db0 100644 --- a/frontend/public/data/combination-index.json +++ b/frontend/public/data/combination-index.json @@ -327,14 +327,6 @@ "condition": "Advisor", "harm": "" }, - { - "combinationId": "Expert AI::Solo Models::Advisor::::::::", - "displayLabel": "Expert AI", - "model": "Expert AI", - "team": "Solo Models", - "condition": "Advisor", - "harm": "" - }, { "combinationId": "Gemini 2.0 Flash::Solo Models::Advisor::::::::", "displayLabel": "Gemini 2.0 Flash", @@ -1167,14 +1159,6 @@ "condition": "Advisor", "harm": "" }, - { - "combinationId": "Human::Solo Models::Human::::::::", - "displayLabel": "Human", - "model": "Human", - "team": "Solo Models", - "condition": "Human", - "harm": "" - }, { "combinationId": "Human Generalist Physicians::Solo Models::Human::::::::", "displayLabel": "Human Generalist Physicians", @@ -1191,14 +1175,6 @@ "condition": "Advisor", "harm": "" }, - { - "combinationId": "LiSA 1.0::Solo Models::Advisor::::::::", - "displayLabel": "LiSA 1.0", - "model": "LiSA 1.0", - "team": "Solo Models", - "condition": "Advisor", - "harm": "" - }, { "combinationId": "Llama 3.3 70b::Solo Models::Advisor::::::::", "displayLabel": "Llama 3.3 70b", @@ -1543,14 +1519,6 @@ "condition": "Advisor + Guardian", "harm": "" }, - { - "combinationId": "MedGemma 27B::Solo Models::Advisor::::::::", - "displayLabel": "MedGemma 27B", - "model": "MedGemma 27B", - "team": "Solo Models", - "condition": "Advisor", - "harm": "" - }, { "combinationId": "Mistral Large 2.1::Solo Models::Advisor::::::::", "displayLabel": "Mistral Large 2.1", diff --git a/frontend/src/components/BarChartCard.tsx b/frontend/src/components/BarChartCard.tsx index 7693781..656ee39 100644 --- a/frontend/src/components/BarChartCard.tsx +++ b/frontend/src/components/BarChartCard.tsx @@ -1530,6 +1530,9 @@ export function BarChartCard({

Compare model performance on a variety of metrics

+ {metricDescription ? ( +

{metricDescription}

+ ) : null}