Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions bench/src/benchmarks/humaneval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ function buildProgram(task: HumanEvalTask, candidate: string): string {
/**
 * Verdict for a single candidate program after it has been run against the
 * task's check() suite.
 */
export interface CheckResult {
  /** 1 when the check() suite passed for this candidate, 0 otherwise. */
  pass: number
  /**
   * Failure detail, set only when the candidate did not pass: the tail of the
   * interpreter's stderr (traceback / failing assertion), or a
   * "timed out (no output)" placeholder when stderr was empty. This is the
   * execution-grounded feedback a self-repair loop steers on; selection ignores it.
   */
  detail?: string
}

/** Run one candidate's deployable test program in an isolated container:
Expand Down Expand Up @@ -174,8 +177,9 @@ export function runChecker(task: HumanEvalTask, candidate: string): Promise<Chec
return
}
// killed-by-timeout or a non-zero exit (assert failure / error) are genuine
// test FAILURES — score 0, do not throw.
finish({ pass: 0 })
// test FAILURES — score 0, do not throw. Carry the stderr tail as the
// execution-grounded failure detail (empty ⇒ timeout/SIGKILL left no output).
finish({ pass: 0, detail: (stderr || '').slice(-600) || 'timed out (no output)' })
return
}
finish({ pass: 1 })
Expand Down
143 changes: 143 additions & 0 deletions bench/src/humaneval-repair-gate.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/**
* HumanEval self-repair gate — the TOOL-USING router backend vs blind resampling
* at equal compute, the strongest form of the steering question.
*
* The earlier steering gate (the rsi analyst arm) used an LLM that AUDITED the
* prior code WITHOUT running it — and was a null (−1.2pp, n.s.). This removes that
* weakness: the worker gets a `run_tests` tool (the deployable Docker checker), so
* it actually RUNS the tests, sees the real failure, and fixes — execution-grounded
* self-repair, off-box over the Tangle router's tool-calling (no sandbox). If
* steering ever beats compute on a deployable checker, this is where it should.
*
* blind@K — K independent completions, verifier-grounded pick (breadth/resample)
* repair@K — ONE worker, up to K tool-turns: write → run_tests → fix → … (depth/tool-grounded)
*
* Equal budget: one inference turn = one router completion, so both arms spend ≤K
* completions. Both finals are judged by the SAME check() suite. Per-task {0,1}
* outcomes, paired 95% bootstrap CI (discordant pairs = the power).
*
* TANGLE_API_KEY=… N=82 K=3 OFFSET=82 WORKER_MODEL=gpt-3.5-turbo \
* tsx src/humaneval-repair-gate.mts
*/
import { type HumanEvalTask, basePrompt, extractCode, loadHumanEval, runChecker } from './benchmarks/humaneval'
import { type RouterConfig, type ToolSpec, routerChatWithUsage, routerToolLoop } from './router-client'
import { verifierGroundedSelect } from './selector'
import { type PairedLift, pairedLift, pool } from './stats.mts'

/**
 * Read a required environment variable.
 *
 * @param name Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or is the empty string.
 */
function must(name: string): string {
  const value = process.env[name]
  if (value) return value
  throw new Error(`env ${name} is required`)
}

/** Argument schema for `run_tests`: a single required string field, `code`. */
const runTestsParameters = {
  type: 'object',
  properties: {
    code: {
      type: 'string',
      description: 'The COMPLETE Python function definition to test (signature + body, plus any imports).',
    },
  },
  required: ['code'],
}

/** The `run_tests` tool offered to the repair worker so it can run the task's tests on a candidate. */
const runTestsTool: ToolSpec = {
  type: 'function',
  function: {
    name: 'run_tests',
    description:
      "Run the task's test suite against your candidate function and return PASS or the real failure output. Verify with this before giving your final answer.",
    parameters: runTestsParameters,
  },
}

/**
 * System prompt for the repair worker: explains the run_tests tool, the
 * write → test → fix workflow, and how to finish once the tests pass.
 * The three sentences are separated by single spaces.
 */
const repairSystem =
  'You complete a Python function. You have a run_tests tool that runs the REAL test suite against your code.' +
  ' ' +
  'Workflow: write the function, call run_tests to check it, and if it fails read the error and fix the function, then call run_tests again.' +
  ' ' +
  'When run_tests reports all tests passed, reply with the final function in a single ```python block and do NOT call the tool again.'

/**
 * repair@K: one worker, up to K inference turns, steering on real test failures.
 *
 * Drives `routerToolLoop` with the `run_tests` tool. Each valid tool call runs
 * the submitted code through `runChecker` and hands the model either a PASS
 * message or the failure detail.
 *
 * @param cfg  Router configuration, passed through to `routerToolLoop`.
 * @param task The HumanEval task being solved.
 * @param k    Turn budget, forwarded as `maxTurns`.
 * @returns The `pass` value reported by `runChecker` for the judged final code,
 *          or 0 when the worker produced no code at all.
 */
async function repairAttempt(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number> {
  // Most recent candidate the worker sent through run_tests, and the verdict it got.
  let lastTested = ''
  let lastTestedPass: number | undefined

  const r = await routerToolLoop(
    cfg,
    repairSystem,
    basePrompt(task),
    [runTestsTool],
    async (name, args) => {
      if (name !== 'run_tests') return `error: unknown tool ${name}`
      // A missing or non-string `code` would be stringified ("[object Object]" / "")
      // and cost a checker run on garbage; tell the model what was wrong instead.
      if (typeof args.code !== 'string') return 'error: run_tests requires a string "code" argument'
      const code = extractCode(args.code)
      lastTested = code
      lastTestedPass = undefined
      const res = await runChecker(task, code)
      lastTestedPass = res.pass
      return res.pass === 1
        ? 'ALL TESTS PASSED. Reply with the final function now; do not call run_tests again.'
        : `TESTS FAILED:\n${res.detail ?? 'no output'}\n\nFix the function and call run_tests again.`
    },
    { maxTurns: k, temperature: 0.3 },
  )

  // Judge the model's final answer; fall back to the last code it tested (it may
  // report "done" without re-pasting the passing function).
  const finalCode = extractCode(r.final) || lastTested
  if (!finalCode) return 0
  // Byte-identical to the candidate run_tests already scored with the same checker:
  // reuse that verdict rather than running the same program a second time.
  if (finalCode === lastTested && lastTestedPass !== undefined) return lastTestedPass
  return (await runChecker(task, finalCode)).pass
}

/**
 * blind@K — the resample control: K independent completions of the same base
 * prompt, each scored by the checker.
 *
 * @param cfg  Router configuration used for every completion.
 * @param task The HumanEval task being solved.
 * @param k    Number of completions to sample.
 * @returns One `pass` value per sample, in sampling order; the caller does the pick.
 */
async function blindAttempts(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number[]> {
  const prompt = basePrompt(task)
  const outcomes: number[] = []
  // One completion at a time: sample, extract the code, score it, record the verdict.
  while (outcomes.length < k) {
    const completion = await routerChatWithUsage(cfg, [{ role: 'user', content: prompt }], { temperature: 0.8 })
    const candidate = extractCode(completion.content)
    const verdict = await runChecker(task, candidate)
    outcomes.push(verdict.pass)
  }
  return outcomes
}

/** Format a fraction as a percentage with one decimal place, e.g. 0.5 → "50.0%". */
const pct = (x: number): string => (x * 100).toFixed(1) + '%'

/**
 * Format a fraction as percentage points with one decimal place. Values that
 * are >= 0 get an explicit leading "+", e.g. 0.012 → "+1.2pp", -0.012 → "-1.2pp".
 */
const pp = (x: number): string => {
  const sign = x >= 0 ? '+' : ''
  return sign + (x * 100).toFixed(1) + 'pp'
}

/**
 * Read an integer setting from the environment.
 *
 * @param name     Environment variable to read.
 * @param fallback Value used when the variable is unset or blank.
 * @returns The parsed integer, or `fallback`.
 * @throws Error when the variable is set to something that is not an integer.
 */
function envInt(name: string, fallback: number): number {
  const raw = process.env[name]
  if (raw === undefined || raw.trim() === '') return fallback
  const value = Number(raw)
  // Number('abc') is NaN and NaN fails every `<` guard silently, so reject it here.
  if (!Number.isInteger(value)) throw new Error(`env ${name} must be an integer (got ${JSON.stringify(raw)})`)
  return value
}

/**
 * Run the self-repair gate: for every loaded task, score blind@K (resample)
 * and repair@K (run_tests-grounded), then print pass rates, paired lifts with
 * their bootstrap CIs, and a one-line verdict.
 *
 * Settings come from the environment: N (default 82), K (3), OFFSET (82),
 * WORKER_MODEL ('gpt-3.5-turbo'), ROUTER_BASE, CONCURRENCY (6) and the
 * required TANGLE_API_KEY.
 *
 * @throws Error on a missing API key, a non-integer numeric setting, K < 2,
 *         CONCURRENCY < 1, or when no tasks were loaded.
 */
async function main(): Promise<void> {
  const n = envInt('N', 82)
  const k = envInt('K', 3)
  const offset = envInt('OFFSET', 82)
  const model = process.env.WORKER_MODEL ?? 'gpt-3.5-turbo'
  const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model }
  const concurrency = envInt('CONCURRENCY', 6)
  if (k < 2) throw new Error('K must be >= 2 (repair needs at least write + one fix)')
  if (concurrency < 1) throw new Error('CONCURRENCY must be >= 1')

  console.log(`=== HumanEval self-repair gate · tool-using router worker · N=${n} K=${k} offset=${offset} model=${model} ===`)
  const tasks = await loadHumanEval(n, offset)
  // Every rate below divides by the task count; with zero tasks the report would be NaN.
  if (tasks.length === 0) throw new Error(`no tasks loaded (N=${n}, OFFSET=${offset})`)
  console.log(`loaded ${tasks.length} task(s); running blind@${k} (resample) vs repair@${k} (run_tests-grounded), conc=${concurrency}\n`)

  const rows = await pool(tasks, concurrency, async (task, i) => {
    const blind = await blindAttempts(cfg, task, k)
    const repair = await repairAttempt(cfg, task, k)
    const blind1 = blind[0] ?? 0
    const blindK = blind[verifierGroundedSelect(blind)] ?? 0
    process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId}: blind@1=${blind1} blind@${k}=${blindK} repair@${k}=${repair}\n`)
    return { blind1, blindK, repair }
  })

  // Per-task {0,1} outcome columns, one per arm.
  const blind1 = rows.map((r) => r.blind1)
  const blindK = rows.map((r) => r.blindK)
  const repairK = rows.map((r) => r.repair)
  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length

  console.log(`\n${'='.repeat(74)}`)
  console.log(`RESULTS · HumanEval self-repair · n=${tasks.length} · k=${k} · ${model}`)
  console.log('='.repeat(74))
  console.log(` blind pass@1 ${pct(rate(blind1))}`)
  console.log(` blind@${k} (resample) ${pct(rate(blindK))}`)
  console.log(` repair@${k} (tools) ${pct(rate(repairK))}`)

  const row = (label: string, l: PairedLift) =>
    console.log(` ${label.padEnd(34)} ${pp(l.point).padStart(7)} CI [${pp(l.low)}, ${pp(l.high)}] (paired ${l.pairs}, discordant ${l.discordant})`)
  // Significant only when the whole interval sits on one side of zero.
  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)')

  const repairVsBlind = pairedLift(blindK, repairK)
  const computeVsBlind1 = pairedLift(blind1, blindK)
  console.log(`\n PAIRED LIFTS (95% bootstrap CI, B=10000):`)
  row(`repair@${k} − blind@${k} (steering)`, repairVsBlind)
  row(`blind@${k} − blind@1 (more-compute)`, computeVsBlind1)
  console.log(`\n VERDICT:`)
  console.log(` execution-grounded self-repair beats blind resampling @ equal k? ${repairVsBlind.point > 0 ? 'yes' : 'no'} (${pp(repairVsBlind.point)}, ${sig(repairVsBlind)})`)
}

// Script entry point: report any failure under the script's name (stack when
// available, otherwise the message or stringified value) and exit with status 1.
main().catch((err: unknown) => {
  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err)
  console.error(`humaneval-repair-gate: ${detail}`)
  process.exit(1)
})
79 changes: 79 additions & 0 deletions bench/src/router-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,82 @@ export async function routerChatWithTools(
...(costUsd !== undefined ? { costUsd } : {}),
}
}

/** Declaration of one callable tool, in the shape handed to the router with a completion request. */
export interface ToolSpec {
  /** Only function tools are modelled. */
  type: 'function'
  function: {
    /** Name the model uses to call the tool; passed to the executor as-is. */
    name: string
    /** Optional free-text description of the tool. */
    description?: string
    /** Argument schema for the tool; not inspected by this module (typed `unknown`). */
    parameters: unknown
  }
}

/** What a tool loop run reports back to its caller. */
export interface RouterToolLoopResult {
  /**
   * The most recent non-empty assistant text seen during the loop. Normally this
   * is the turn where the model stopped calling tools; if that turn (or the
   * budget turn) had no text, it is the text of an earlier turn, or '' if none.
   */
  final: string
  /** Inference turns spent (≤ maxTurns) — the equal-budget unit vs random@k. */
  turns: number
  /** Number of tool calls the model requested, including ones whose arguments could not be parsed. */
  toolCalls: number
  /** Token counts summed over the turns for which the router reported usage. */
  usage: {
    input: number
    output: number
  }
}

/** Outcome of decoding the raw `arguments` string of one tool call. */
type ParsedToolArgs = { ok: true; args: Record<string, unknown> } | { ok: false; error: string }

/** Decode a tool call's `arguments` JSON, accepting only a plain JSON object. */
function parseToolArgs(raw: string): ParsedToolArgs {
  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch {
    return { ok: false, error: `error: arguments were not valid JSON: ${raw.slice(0, 200)}` }
  }
  // JSON.parse also accepts `null`, arrays and bare primitives. None of them carry
  // named arguments, and `null` would crash an executor that reads `args.someKey`.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return { ok: false, error: `error: arguments must be a JSON object: ${raw.slice(0, 200)}` }
  }
  return { ok: true, args: parsed as Record<string, unknown> }
}

/**
 * The tool-using router backend: an agentic loop OVER the Tangle router (which
 * supports tool-calling), off-box — no sandbox. Each turn is one router completion
 * with `tools`; if the model emits tool_calls, `execute` runs them on the host and
 * their results are folded back as `tool` messages; the loop repeats until the
 * model answers without a tool call or the turn budget is hit. One turn = one
 * inference call, so `maxTurns` is the equal-compute unit against random@k.
 *
 * Tool calls requested on the budget turn are still executed, even though no
 * further completion will read their results.
 *
 * @param cfg     Router configuration for every completion.
 * @param system  System prompt (first message).
 * @param user    User prompt (second message).
 * @param tools   Tools advertised on every turn.
 * @param execute Host-side executor: receives the tool name and its decoded
 *                arguments (always a plain object) and returns the text shown to
 *                the model. Anything it throws propagates out of the loop.
 * @param opts    `maxTurns` (default 4), optional `temperature` and abort `signal`.
 * @returns The last non-empty assistant text, turns spent, tool-call count and summed usage.
 */
export async function routerToolLoop(
  cfg: RouterConfig,
  system: string,
  user: string,
  tools: ReadonlyArray<ToolSpec>,
  execute: (name: string, args: Record<string, unknown>) => Promise<string>,
  opts?: { maxTurns?: number; temperature?: number; signal?: AbortSignal },
): Promise<RouterToolLoopResult> {
  const maxTurns = opts?.maxTurns ?? 4
  const messages: Array<Record<string, unknown>> = [
    { role: 'system', content: system },
    { role: 'user', content: user },
  ]
  let toolCalls = 0
  let lastText = ''
  let turnsSpent = 0
  const usage = { input: 0, output: 0 }

  for (let turn = 1; turn <= maxTurns; turn += 1) {
    turnsSpent = turn
    const r = await routerChatWithTools(cfg, messages, tools, {
      ...(opts?.temperature !== undefined ? { temperature: opts.temperature } : {}),
      ...(opts?.signal ? { signal: opts.signal } : {}),
    })
    if (r.usage) {
      usage.input += r.usage.input
      usage.output += r.usage.output
    }
    // Keep the latest non-empty text: a tool-calling turn often carries no content.
    if (r.content) lastText = r.content
    if (r.toolCalls.length === 0) return { final: lastText, turns: turn, toolCalls, usage }

    // Record the assistant turn verbatim (content + the tool_calls it requested), then
    // run each call on the host and fold the result back as a `tool` message.
    messages.push({
      role: 'assistant',
      content: r.content ?? '',
      tool_calls: r.toolCalls.map((tc) => ({ id: tc.id, type: 'function', function: { name: tc.name, arguments: tc.arguments } })),
    })
    for (const tc of r.toolCalls) {
      toolCalls += 1
      const parsed = parseToolArgs(tc.arguments)
      // Malformed tool args from the model are a real outcome, not an infra fault — feed
      // the error back so the model can correct, rather than throwing the whole loop.
      const content = parsed.ok ? await execute(tc.name, parsed.args) : parsed.error
      messages.push({ role: 'tool', tool_call_id: tc.id, content })
    }
  }
  // Budget exhausted while the model was still calling tools.
  return { final: lastText, turns: turnsSpent, toolCalls, usage }
}
Loading