Merge pull request #4202 from Kilo-Org/mark/opus-auto-approval

markijbema · web-flow · commit ce76e5cbc0c0 · 2025-12-05T10:31:52.000+01:00
feat: add Opus auto-approval for autocomplete test runner
diff --git a/src/test-llm-autocompletion/README.md b/src/test-llm-autocompletion/README.md
@@ -58,6 +58,11 @@ pnpm run test:verbose
 # Run without interactive approval (fail if not already approved)
 pnpm run test --skip-approval
 
+# Run with Opus auto-approval (uses Claude Opus to judge completions)
+pnpm run test --opus-approval
+# Or short form
+pnpm run test -oa
+
 # Run a single test
 pnpm run test closing-brace
 
@@ -66,6 +71,7 @@ pnpm run clean
 
 # Combine flags
 pnpm run test --verbose --skip-approval
+pnpm run test --verbose --opus-approval
 ```
 
 ### Completion Strategy
@@ -124,6 +130,40 @@ This is useful for:
 - Regression testing to ensure outputs haven't changed
 - Validating that all test outputs have been reviewed
 
+### Opus Auto-Approval Mode
+
+Use `--opus-approval` (or `-oa`) to automatically judge completions using Claude Opus:
+
+```bash
+pnpm run test --opus-approval
+pnpm run test -oa
+```
+
+When a new output is detected that hasn't been previously approved/rejected:
+
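+1. Opus receives the test input, the new output, and any previously approved or rejected outputs for that test as reference examples, and evaluates whether the completion is useful (meaningful code) or not useful (trivial, such as a lone semicolon)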
+2. Opus responds with APPROVED or REJECTED based on its judgment
+3. The result is saved to the approvals directory for later manual review
+
+Opus considers a suggestion **useful** if it:
+
+- Provides meaningful code that helps the developer
+- Completes a logical code pattern
+- Adds substantial functionality (not just trivial characters)
+- Is syntactically correct and contextually appropriate
+
+Opus considers a suggestion **not useful** if it:
+
+- Only adds trivial characters like semicolons, closing brackets, or single characters
+- Is empty or nearly empty
+- Is syntactically incorrect or doesn't make sense in context
+- Repeats what's already there
+
+This is useful for:
+
+- Quickly processing large batches of new test outputs
+- Getting consistent, objective judgments on completion quality
+- Reducing manual review burden while still saving decisions for later audit
+
 ## User Interaction
 
 When new output is detected, you'll see:
diff --git a/src/test-llm-autocompletion/approvals.ts b/src/test-llm-autocompletion/approvals.ts
@@ -1,6 +1,7 @@
 import fs from "fs"
 import path from "path"
 import readline from "readline"
+import { askOpusApproval } from "./opus-approval.js"
 
 const APPROVALS_DIR = "approvals"
 
@@ -9,6 +10,20 @@ export interface ApprovalResult {
 	newOutput: boolean
 }
 
+/**
+ * Reads the contents of every stored output of the given type for one test.
+ *
+ * Files are matched by name as `<testName>.<type>.<number>.txt` directly inside
+ * `categoryDir`.
+ *
+ * @param categoryDir Directory to scan; if it does not exist the result is empty.
+ * @param testName Test name used as the file-name prefix (matched literally).
+ * @param type Which kind of stored output to collect.
+ * @returns UTF-8 contents of each matching file, in the order `fs.readdirSync` lists them.
+ */
+function getExistingOutputs(categoryDir: string, testName: string, type: "approved" | "rejected"): string[] {
+	if (!fs.existsSync(categoryDir)) {
+		return []
+	}
+
+	// Escape regex metacharacters so a test name such as "a.b" or "c++" only matches itself
+	// instead of being interpreted as a pattern (or making the RegExp constructor throw).
+	const escapedTestName = testName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
+	const pattern = new RegExp(`^${escapedTestName}\\.${type}\\.\\d+\\.txt$`)
+
+	return fs
+		.readdirSync(categoryDir)
+		.filter((fileName) => pattern.test(fileName))
+		.map((fileName) => fs.readFileSync(path.join(categoryDir, fileName), "utf-8"))
+}
+
 /** Builds the relative path of a category's directory inside the approvals folder. */
 function getCategoryPath(category: string): string {
 	const categoryDir = path.join(APPROVALS_DIR, category)
 	return categoryDir
 }
@@ -88,6 +103,7 @@ export async function checkApproval(
 	input: string,
 	output: string,
 	skipApproval: boolean = false,
+	useOpusApproval: boolean = false,
 ): Promise<ApprovalResult> {
 	const categoryDir = getCategoryPath(category)
 
@@ -106,7 +122,15 @@ export async function checkApproval(
 		return { isApproved: false, newOutput: true }
 	}
 
-	const isApproved = await askUserApproval(category, testName, input, output)
+	// Use Opus for auto-approval if enabled, otherwise ask user
+	let isApproved: boolean
+	if (useOpusApproval) {
+		const previouslyApproved = getExistingOutputs(categoryDir, testName, "approved")
+		const previouslyRejected = getExistingOutputs(categoryDir, testName, "rejected")
+		isApproved = await askOpusApproval(input, output, previouslyApproved, previouslyRejected)
+	} else {
+		isApproved = await askUserApproval(category, testName, input, output)
+	}
 
 	const type: "approved" | "rejected" = isApproved ? "approved" : "rejected"
 
diff --git a/src/test-llm-autocompletion/llm-client.ts b/src/test-llm-autocompletion/llm-client.ts
@@ -19,7 +19,7 @@ export interface FimResponse {
 	tokensUsed?: number
 }
 
-function getKiloBaseUriFromToken(kilocodeToken?: string): string {
+export function getKiloBaseUriFromToken(kilocodeToken?: string): string {
 	if (kilocodeToken) {
 		try {
 			const payload_string = kilocodeToken.split(".")[1]
diff --git a/src/test-llm-autocompletion/opus-approval.ts b/src/test-llm-autocompletion/opus-approval.ts
@@ -0,0 +1,94 @@
+import OpenAI from "openai"
+import { DEFAULT_HEADERS } from "../api/providers/constants.js"
+import { getKiloBaseUriFromToken } from "./llm-client.js"
+
+const OPUS_MODEL = "anthropic/claude-opus-4.5"
+
+/**
+ * Asks Claude Opus (through the Kilo Code OpenRouter endpoint) whether an
+ * autocomplete result should be approved.
+ *
+ * @param input Code before the completion; embedded verbatim in the prompt.
+ * @param output Code after the completion; embedded verbatim in the prompt.
+ * @param previouslyApproved Earlier approved outputs, appended to the prompt as reference examples.
+ * @param previouslyRejected Earlier rejected outputs, appended to the prompt as reference examples.
+ * @returns `true` when the model answers APPROVED, `false` when it answers REJECTED.
+ * @throws Error if `KILOCODE_API_KEY` is unset, if the request fails, or if the reply
+ *   is not a recognizable verdict. Failures inside the request are logged and rethrown.
+ */
+export async function askOpusApproval(
+	input: string,
+	output: string,
+	previouslyApproved: string[],
+	previouslyRejected: string[],
+): Promise<boolean> {
+	const apiKey = process.env.KILOCODE_API_KEY
+	if (!apiKey) {
+		throw new Error("KILOCODE_API_KEY is required for Opus auto-approval")
+	}
+
+	const baseUrl = getKiloBaseUriFromToken(apiKey)
+	const openai = new OpenAI({
+		baseURL: `${baseUrl}/api/openrouter/`,
+		apiKey,
+		defaultHeaders: {
+			...DEFAULT_HEADERS,
+			"X-KILOCODE-TESTER": "SUPPRESS",
+		},
+	})
+
+	const systemPrompt = `You are an expert code reviewer evaluating autocomplete suggestions.
+Your task is to determine if an autocomplete suggestion is USEFUL or NOT USEFUL.
+
+A suggestion is USEFUL if it:
+- Provides meaningful code that helps the developer
+- Completes a logical code pattern
+- Adds substantial functionality (not just trivial characters)
+- Is syntactically correct and contextually appropriate
+
+A suggestion is NOT USEFUL if it:
+- Only adds trivial characters like semicolons, closing brackets, or single characters
+- Is empty or nearly empty
+- Is syntactically incorrect
+- Doesn't make sense in the context
+- Repeats what's already there
+
+Respond with ONLY "APPROVED" or "REJECTED" - nothing else.`
+
+	let userPrompt = `Here is the code context (with cursor position marked by where the completion would be inserted):
+
+INPUT (code before completion):
+\`\`\`
+${input}
+\`\`\`
+
+OUTPUT (code after completion):
+\`\`\`
+${output}
+\`\`\`
+`
+
+	// Add previously approved outputs as examples
+	if (previouslyApproved.length > 0) {
+		userPrompt += `\n--- PREVIOUSLY APPROVED OUTPUTS (for reference) ---\n`
+		previouslyApproved.forEach((example, i) => {
+			userPrompt += `\nApproved example ${i + 1}:\n\`\`\`\n${example}\n\`\`\`\n`
+		})
+	}
+
+	// Add previously rejected outputs as examples
+	if (previouslyRejected.length > 0) {
+		userPrompt += `\n--- PREVIOUSLY REJECTED OUTPUTS (for reference) ---\n`
+		previouslyRejected.forEach((example, i) => {
+			userPrompt += `\nRejected example ${i + 1}:\n\`\`\`\n${example}\n\`\`\`\n`
+		})
+	}
+
+	userPrompt += `\nIs this autocomplete suggestion useful? Respond with ONLY "APPROVED" or "REJECTED".`
+
+	try {
+		const response = await openai.chat.completions.create({
+			model: OPUS_MODEL,
+			messages: [
+				{ role: "system", content: systemPrompt },
+				{ role: "user", content: userPrompt },
+			],
+			max_tokens: 10,
+			temperature: 0,
+		})
+
+		// `choices` may be empty and `content` may be null; treat both as an empty reply.
+		const reply = response.choices[0]?.message.content ?? ""
+		// Tolerate surrounding quotes, markdown or punctuation (e.g. `"APPROVED."`).
+		const verdict = reply
+			.trim()
+			.toUpperCase()
+			.replace(/^[^A-Z]+|[^A-Z]+$/g, "")
+
+		if (verdict === "APPROVED") {
+			return true
+		}
+		if (verdict === "REJECTED") {
+			return false
+		}
+
+		// Anything else is not a verdict; fail loudly instead of reporting it as a rejection.
+		throw new Error(`Unexpected Opus approval response: ${JSON.stringify(reply)}`)
+	} catch (error) {
+		console.error("Opus approval error:", error)
+		throw error
+	}
+}
diff --git a/src/test-llm-autocompletion/runner.ts b/src/test-llm-autocompletion/runner.ts
@@ -22,10 +22,12 @@ export class TestRunner {
 	private verbose: boolean
 	private results: TestResult[] = []
 	private skipApproval: boolean
+	private useOpusApproval: boolean
 
-	constructor(verbose: boolean = false, skipApproval: boolean = false) {
+	constructor(verbose: boolean = false, skipApproval: boolean = false, useOpusApproval: boolean = false) {
 		this.verbose = verbose
 		this.skipApproval = skipApproval
+		this.useOpusApproval = useOpusApproval
 		this.tester = new GhostProviderTester()
 	}
 
@@ -57,6 +59,7 @@ export class TestRunner {
 				testCase.input,
 				actualValue,
 				this.skipApproval,
+				this.useOpusApproval,
 			)
 
 			return {
@@ -91,6 +94,9 @@ export class TestRunner {
 		if (this.skipApproval) {
 			console.log("Skip Approval: enabled (tests will fail if not already approved)")
 		}
+		if (this.useOpusApproval) {
+			console.log("Opus Auto-Approval: enabled (using Claude Opus to judge completions)")
+		}
 		console.log("Total tests:", testCases.length)
 		console.log("Categories:", getCategories().join(", "))
 		console.log("\n" + "─".repeat(80) + "\n")
@@ -403,10 +409,11 @@ async function main() {
 	const args = process.argv.slice(2)
 	const verbose = args.includes("--verbose") || args.includes("-v")
 	const skipApproval = args.includes("--skip-approval") || args.includes("-sa")
+	const useOpusApproval = args.includes("--opus-approval") || args.includes("-oa")
 
 	const command = args.find((arg) => !arg.startsWith("-"))
 
-	const runner = new TestRunner(verbose, skipApproval)
+	const runner = new TestRunner(verbose, skipApproval, useOpusApproval)
 
 	try {
 		if (command === "clean") {

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ export interface FimResponse {`
`19`	`19`	`tokensUsed?: number`
`20`	`20`	`}`
`21`	`21`
`22`		`-function getKiloBaseUriFromToken(kilocodeToken?: string): string {`
	`22`	`+export function getKiloBaseUriFromToken(kilocodeToken?: string): string {`
`23`	`23`	`if (kilocodeToken) {`
`24`	`24`	`try {`
`25`	`25`	`const payload_string = kilocodeToken.split(".")[1]`