Skip to content

Commit ce76e5c

Browse files
authored
Merge pull request #4202 from Kilo-Org/mark/opus-auto-approval
feat: add Opus auto-approval for autocomplete test runner
2 parents 56ee5d6 + 9f9fcb5 commit ce76e5c

File tree

5 files changed

+169
-4
lines changed

5 files changed

+169
-4
lines changed

src/test-llm-autocompletion/README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ pnpm run test:verbose
5858
# Run without interactive approval (fail if not already approved)
5959
pnpm run test --skip-approval
6060
61+
# Run with Opus auto-approval (uses Claude Opus to judge completions)
62+
pnpm run test --opus-approval
63+
# Or short form
64+
pnpm run test -oa
65+
6166
# Run a single test
6267
pnpm run test closing-brace
6368
@@ -66,6 +71,7 @@ pnpm run clean
6671
6772
# Combine flags
6873
pnpm run test --verbose --skip-approval
74+
pnpm run test --verbose --opus-approval
6975
```
7076

7177
### Completion Strategy
@@ -124,6 +130,40 @@ This is useful for:
124130
- Regression testing to ensure outputs haven't changed
125131
- Validating that all test outputs have been reviewed
126132

133+
### Opus Auto-Approval Mode
134+
135+
Use `--opus-approval` (or `-oa`) to automatically judge completions using Claude Opus:
136+
137+
```bash
138+
pnpm run test --opus-approval
139+
pnpm run test -oa
140+
```
141+
142+
When a new output is detected that hasn't been previously approved/rejected:
143+
144+
1. Opus evaluates whether the completion is useful (meaningful code) or not useful (trivial additions such as a lone semicolon)
145+
2. Opus responds with APPROVED or REJECTED based on its judgment
146+
3. The result is saved to the approvals directory for later manual review
147+
148+
Opus considers a suggestion **useful** if it:
149+
150+
- Provides meaningful code that helps the developer
151+
- Completes a logical code pattern
152+
- Adds substantial functionality (not just trivial characters)
153+
- Is syntactically correct and contextually appropriate
154+
155+
Opus considers a suggestion **not useful** if it:
156+
157+
- Only adds trivial characters like semicolons, closing brackets, or single characters
158+
- Is empty or nearly empty
159+
- Is syntactically incorrect, doesn't make sense in context, or repeats what's already there
160+
161+
This is useful for:
162+
163+
- Quickly processing large batches of new test outputs
164+
- Getting consistent, objective judgments on completion quality
165+
- Reducing manual review burden while still saving decisions for later audit
166+
127167
## User Interaction
128168

129169
When new output is detected, you'll see:

src/test-llm-autocompletion/approvals.ts

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import fs from "fs"
22
import path from "path"
33
import readline from "readline"
4+
import { askOpusApproval } from "./opus-approval.js"
45

56
const APPROVALS_DIR = "approvals"
67

@@ -9,6 +10,20 @@ export interface ApprovalResult {
910
newOutput: boolean
1011
}
1112

13+
/**
 * Reads the contents of every previously recorded output of one kind for a test.
 *
 * Recorded outputs are files in `categoryDir` named `<testName>.<type>.<number>.txt`.
 *
 * @param categoryDir Directory holding the approval files of one category.
 * @param testName Test name; matched literally, even if it contains regex metacharacters.
 * @param type Which kind of recorded output to read.
 * @returns File contents ordered by the numeric index in the file name, or an
 * empty array when the directory does not exist or has no matching file.
 */
function getExistingOutputs(categoryDir: string, testName: string, type: "approved" | "rejected"): string[] {
	if (!fs.existsSync(categoryDir)) {
		return []
	}

	// Escape regex metacharacters so a name such as "a.b" or "c++" is matched literally
	// instead of matching other tests' files or making the RegExp constructor throw.
	const escapedName = testName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
	const pattern = new RegExp(`^${escapedName}\\.${type}\\.(\\d+)\\.txt$`)

	return (
		fs
			.readdirSync(categoryDir)
			.flatMap((file) => {
				const match = pattern.exec(file)
				return match ? [{ file, index: Number(match[1]) }] : []
			})
			// Directory listing order is not numeric ("10" sorts before "2"), so order explicitly.
			.sort((a, b) => a.index - b.index)
			.map(({ file }) => fs.readFileSync(path.join(categoryDir, file), "utf-8"))
	)
}
26+
1227
/**
 * Builds the path of a category's folder inside the approvals directory.
 *
 * @param category Name of the test category.
 * @returns `APPROVALS_DIR` joined with `category`.
 */
function getCategoryPath(category: string): string {
	const categoryPath = path.join(APPROVALS_DIR, category)
	return categoryPath
}
@@ -88,6 +103,7 @@ export async function checkApproval(
88103
input: string,
89104
output: string,
90105
skipApproval: boolean = false,
106+
useOpusApproval: boolean = false,
91107
): Promise<ApprovalResult> {
92108
const categoryDir = getCategoryPath(category)
93109

@@ -106,7 +122,15 @@ export async function checkApproval(
106122
return { isApproved: false, newOutput: true }
107123
}
108124

109-
const isApproved = await askUserApproval(category, testName, input, output)
125+
// Use Opus for auto-approval if enabled, otherwise ask user
126+
let isApproved: boolean
127+
if (useOpusApproval) {
128+
const previouslyApproved = getExistingOutputs(categoryDir, testName, "approved")
129+
const previouslyRejected = getExistingOutputs(categoryDir, testName, "rejected")
130+
isApproved = await askOpusApproval(input, output, previouslyApproved, previouslyRejected)
131+
} else {
132+
isApproved = await askUserApproval(category, testName, input, output)
133+
}
110134

111135
const type: "approved" | "rejected" = isApproved ? "approved" : "rejected"
112136

src/test-llm-autocompletion/llm-client.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ export interface FimResponse {
1919
tokensUsed?: number
2020
}
2121

22-
function getKiloBaseUriFromToken(kilocodeToken?: string): string {
22+
export function getKiloBaseUriFromToken(kilocodeToken?: string): string {
2323
if (kilocodeToken) {
2424
try {
2525
const payload_string = kilocodeToken.split(".")[1]
src/test-llm-autocompletion/opus-approval.ts

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import OpenAI from "openai"
2+
import { DEFAULT_HEADERS } from "../api/providers/constants.js"
3+
import { getKiloBaseUriFromToken } from "./llm-client.js"
4+
5+
const OPUS_MODEL = "anthropic/claude-opus-4.5"
6+
7+
/**
 * Asks the Opus model whether an autocomplete suggestion is useful.
 *
 * Sends one chat completion request (model `OPUS_MODEL`, `temperature: 0`,
 * `max_tokens: 10`) to the OpenRouter endpoint under the base URL derived from
 * `KILOCODE_API_KEY`. Earlier approved and rejected outputs for the same test
 * are appended to the prompt as reference examples.
 *
 * @param input Code before the completion.
 * @param output Code after the completion.
 * @param previouslyApproved Contents of outputs already approved for this test.
 * @param previouslyRejected Contents of outputs already rejected for this test.
 * @returns `true` when the model answers APPROVED, `false` when it answers REJECTED.
 * @throws Error when `KILOCODE_API_KEY` is not set, when the request fails, or
 * when the reply is empty or is neither APPROVED nor REJECTED.
 */
export async function askOpusApproval(
	input: string,
	output: string,
	previouslyApproved: string[],
	previouslyRejected: string[],
): Promise<boolean> {
	const apiKey = process.env.KILOCODE_API_KEY
	if (!apiKey) {
		throw new Error("KILOCODE_API_KEY is required for Opus auto-approval")
	}

	const baseUrl = getKiloBaseUriFromToken(apiKey)
	const openai = new OpenAI({
		baseURL: `${baseUrl}/api/openrouter/`,
		apiKey,
		defaultHeaders: {
			...DEFAULT_HEADERS,
			"X-KILOCODE-TESTER": "SUPPRESS",
		},
	})

	const systemPrompt = `You are an expert code reviewer evaluating autocomplete suggestions.
Your task is to determine if an autocomplete suggestion is USEFUL or NOT USEFUL.

A suggestion is USEFUL if it:
- Provides meaningful code that helps the developer
- Completes a logical code pattern
- Adds substantial functionality (not just trivial characters)
- Is syntactically correct and contextually appropriate

A suggestion is NOT USEFUL if it:
- Only adds trivial characters like semicolons, closing brackets, or single characters
- Is empty or nearly empty
- Is syntactically incorrect
- Doesn't make sense in the context
- Repeats what's already there

Respond with ONLY "APPROVED" or "REJECTED" - nothing else.`

	let userPrompt = `Here is the code context (with cursor position marked by where the completion would be inserted):

INPUT (code before completion):
\`\`\`
${input}
\`\`\`

OUTPUT (code after completion):
\`\`\`
${output}
\`\`\`
`

	// Add previously approved outputs as examples
	if (previouslyApproved.length > 0) {
		userPrompt += `\n--- PREVIOUSLY APPROVED OUTPUTS (for reference) ---\n`
		previouslyApproved.forEach((example, i) => {
			userPrompt += `\nApproved example ${i + 1}:\n\`\`\`\n${example}\n\`\`\`\n`
		})
	}

	// Add previously rejected outputs as examples
	if (previouslyRejected.length > 0) {
		userPrompt += `\n--- PREVIOUSLY REJECTED OUTPUTS (for reference) ---\n`
		previouslyRejected.forEach((example, i) => {
			userPrompt += `\nRejected example ${i + 1}:\n\`\`\`\n${example}\n\`\`\`\n`
		})
	}

	userPrompt += `\nIs this autocomplete suggestion useful? Respond with ONLY "APPROVED" or "REJECTED".`

	try {
		const response = await openai.chat.completions.create({
			model: OPUS_MODEL,
			messages: [
				{ role: "system", content: systemPrompt },
				{ role: "user", content: userPrompt },
			],
			max_tokens: 10,
			temperature: 0,
		})

		// `choices` may be empty and `content` may be null; guard both rather than
		// failing with an opaque TypeError.
		const rawContent = response.choices[0]?.message?.content ?? ""

		// Accept the verdict even when wrapped in quotes or followed by punctuation,
		// e.g. `"APPROVED"` or `REJECTED.`.
		const verdict = /^[^A-Z]*(APPROVED|REJECTED)\b/.exec(rawContent.toUpperCase())?.[1]
		if (verdict === undefined) {
			// The caller persists the verdict, so an unusable reply must not be
			// recorded as a rejection.
			throw new Error(`Unexpected Opus approval response: ${JSON.stringify(rawContent)}`)
		}

		return verdict === "APPROVED"
	} catch (error) {
		console.error("Opus approval error:", error)
		throw error
	}
}

src/test-llm-autocompletion/runner.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@ export class TestRunner {
2222
private verbose: boolean
2323
private results: TestResult[] = []
2424
private skipApproval: boolean
25+
private useOpusApproval: boolean
2526

26-
constructor(verbose: boolean = false, skipApproval: boolean = false) {
27+
constructor(verbose: boolean = false, skipApproval: boolean = false, useOpusApproval: boolean = false) {
2728
this.verbose = verbose
2829
this.skipApproval = skipApproval
30+
this.useOpusApproval = useOpusApproval
2931
this.tester = new GhostProviderTester()
3032
}
3133

@@ -57,6 +59,7 @@ export class TestRunner {
5759
testCase.input,
5860
actualValue,
5961
this.skipApproval,
62+
this.useOpusApproval,
6063
)
6164

6265
return {
@@ -91,6 +94,9 @@ export class TestRunner {
9194
if (this.skipApproval) {
9295
console.log("Skip Approval: enabled (tests will fail if not already approved)")
9396
}
97+
if (this.useOpusApproval) {
98+
console.log("Opus Auto-Approval: enabled (using Claude Opus to judge completions)")
99+
}
94100
console.log("Total tests:", testCases.length)
95101
console.log("Categories:", getCategories().join(", "))
96102
console.log("\n" + "─".repeat(80) + "\n")
@@ -403,10 +409,11 @@ async function main() {
403409
const args = process.argv.slice(2)
404410
const verbose = args.includes("--verbose") || args.includes("-v")
405411
const skipApproval = args.includes("--skip-approval") || args.includes("-sa")
412+
const useOpusApproval = args.includes("--opus-approval") || args.includes("-oa")
406413

407414
const command = args.find((arg) => !arg.startsWith("-"))
408415

409-
const runner = new TestRunner(verbose, skipApproval)
416+
const runner = new TestRunner(verbose, skipApproval, useOpusApproval)
410417

411418
try {
412419
if (command === "clean") {

0 commit comments

Comments
 (0)