From 5729396cb846285d818a8b56f084530b2b2a0818 Mon Sep 17 00:00:00 2001
From: Drew Stone
Date: Mon, 8 Jun 2026 10:30:25 -0600
Subject: [PATCH] feat(bench): tool-using router backend (routerToolLoop) + self-repair gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

routerToolLoop (router-client.ts): a real agentic loop OVER the Tangle
router's tool-calling, OFF-BOX — each turn is a router completion with
`tools`; tool_calls execute on the host and fold back as `tool` messages;
repeat until done or budget. The depth substrate the chat-only
routerChatWithUsage couldn't express (routes around the sandbox→router
egress block, #984). One turn = one completion, so maxTurns is the
equal-compute unit vs random@k.

humaneval-repair-gate.mts: the strongest steering test — the worker gets a
run_tests tool (the deployable Docker checker, now returning failure
detail), so it RUNS the tests, sees the real failure, and fixes, vs blind
resampling at equal k. Removes the weakness of the earlier LLM-audit null
(which never ran the code).

Gate result (gpt-3.5-turbo, HumanEval hard half, n=82, equal k=3, paired bootstrap):

  blind@1                            54.9%
  blind@3 (resample)                 75.6%
  repair@3 (tools)                   58.5%
  repair@3 − blind@3 (steering)      -17.1pp  CI [-26.8, -7.3]   SIGNIFICANT NEGATIVE
  blind@3 − blind@1 (more-compute)   +20.7pp  CI [+12.2, +30.5]  SIGNIFICANT POSITIVE

Execution-grounded self-repair is SIGNIFICANTLY WORSE than blind resampling
at equal budget: refining one anchored attempt loses to exploring k fresh
ones. The tool backend works (recovered tasks blind missed); the depth
strategy loses on single-shot codegen. Sharpens the steering-vs-compute
boundary: breadth wins where fresh samples are cheap+independent; depth's
one win remains EOPS (stateful/agentic).
--- bench/src/benchmarks/humaneval.ts | 8 +- bench/src/humaneval-repair-gate.mts | 143 ++++++++++++++++++++++++++++ bench/src/router-client.ts | 79 +++++++++++++++ 3 files changed, 228 insertions(+), 2 deletions(-) create mode 100644 bench/src/humaneval-repair-gate.mts diff --git a/bench/src/benchmarks/humaneval.ts b/bench/src/benchmarks/humaneval.ts index 60fefdf..8a966bc 100644 --- a/bench/src/benchmarks/humaneval.ts +++ b/bench/src/benchmarks/humaneval.ts @@ -98,6 +98,9 @@ function buildProgram(task: HumanEvalTask, candidate: string): string { export interface CheckResult { /** {0,1} pass-count for this candidate (1 = the check() suite passed). */ pass: number + /** On failure: the interpreter stderr tail (traceback / failing assertion). The + * execution-grounded feedback a self-repair loop steers on; ignored by selection. */ + detail?: string } /** Run one candidate's deployable test program in an isolated container: @@ -174,8 +177,9 @@ export function runChecker(task: HumanEvalTask, candidate: string): Promise { + let lastTested = '' + const r = await routerToolLoop( + cfg, + repairSystem, + basePrompt(task), + [runTestsTool], + async (name, args) => { + if (name !== 'run_tests') return `error: unknown tool ${name}` + const code = extractCode(String(args.code ?? '')) + lastTested = code + const res = await runChecker(task, code) + return res.pass === 1 + ? 'ALL TESTS PASSED. Reply with the final function now; do not call run_tests again.' + : `TESTS FAILED:\n${res.detail ?? 'no output'}\n\nFix the function and call run_tests again.` + }, + { maxTurns: k, temperature: 0.3 }, + ) + // Judge the model's final answer; fall back to the last code it tested (it may + // report "done" without re-pasting the passing function). + const finalCode = extractCode(r.final) || lastTested + if (!finalCode) return 0 + return (await runChecker(task, finalCode)).pass +} + +/** blind@K: K independent completions, verifier-grounded pick (the resample control). 
*/
+async function blindAttempts(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number[]> {
+  const base = basePrompt(task)
+  const passes: number[] = []
+  for (let i = 0; i < k; i += 1) {
+    const res = await routerChatWithUsage(cfg, [{ role: 'user', content: base }], { temperature: 0.8 })
+    passes.push((await runChecker(task, extractCode(res.content))).pass)
+  }
+  return passes
+}
+
+const pct = (x: number) => `${(x * 100).toFixed(1)}%` // fraction → "54.9%"
+const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` // signed fraction → "+20.7pp"
+
+/** Entry point: per task run blind@K and repair@K, then print pass rates and the paired-lift verdict. */
+async function main(): Promise<void> {
+  const n = Number(process.env.N ?? 82)
+  const k = Number(process.env.K ?? 3)
+  const offset = Number(process.env.OFFSET ?? 82)
+  const model = process.env.WORKER_MODEL ?? 'gpt-3.5-turbo'
+  const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model }
+  const concurrency = Number(process.env.CONCURRENCY ?? 6)
+  if (!Number.isInteger(k) || k < 2) throw new Error('K must be an integer >= 2 (repair needs at least write + one fix)') // a bare `k < 2` lets NaN through
+
+  console.log(`=== HumanEval self-repair gate · tool-using router worker · N=${n} K=${k} offset=${offset} model=${model} ===`)
+  const tasks = await loadHumanEval(n, offset)
+  console.log(`loaded ${tasks.length} task(s); running blind@${k} (resample) vs repair@${k} (run_tests-grounded), conc=${concurrency}\n`)
+
+  const rows = await pool(tasks, concurrency, async (task, i) => {
+    const blind = await blindAttempts(cfg, task, k)
+    const repair = await repairAttempt(cfg, task, k)
+    const blind1 = blind[0] ?? 0
+    const blindK = blind[verifierGroundedSelect(blind)] ?? 0
+    process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId}: blind@1=${blind1} blind@${k}=${blindK} repair@${k}=${repair}\n`)
+    return { blind1, blindK, repair }
+  })
+
+  const blind1 = rows.map((r) => r.blind1)
+  const blindK = rows.map((r) => r.blindK)
+  const repairK = rows.map((r) => r.repair)
+  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length
+
+  console.log(`\n${'='.repeat(74)}`)
+  console.log(`RESULTS · HumanEval self-repair · n=${tasks.length} · k=${k} · ${model}`)
+  console.log('='.repeat(74))
+  console.log(` blind pass@1 ${pct(rate(blind1))}`)
+  console.log(` blind@${k} (resample) ${pct(rate(blindK))}`)
+  console.log(` repair@${k} (tools) ${pct(rate(repairK))}`)
+
+  const row = (label: string, l: PairedLift) => console.log(` ${label.padEnd(34)} ${pp(l.point).padStart(7)} CI [${pp(l.low)}, ${pp(l.high)}] (paired ${l.pairs}, discordant ${l.discordant})`)
+  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)')
+
+  const repairVsBlind = pairedLift(blindK, repairK)
+  const computeVsBlind1 = pairedLift(blind1, blindK)
+  console.log(`\n PAIRED LIFTS (95% bootstrap CI, B=10000):`)
+  row(`repair@${k} − blind@${k} (steering)`, repairVsBlind)
+  row(`blind@${k} − blind@1 (more-compute)`, computeVsBlind1)
+  console.log(`\n VERDICT:`)
+  console.log(` execution-grounded self-repair beats blind resampling @ equal k? ${repairVsBlind.point > 0 ? 'yes' : 'no'} (${pp(repairVsBlind.point)}, ${sig(repairVsBlind)})`)
+}
+
+main().catch((err) => {
+  console.error(`humaneval-repair-gate: ${err instanceof Error ? (err.stack ?? err.message) : String(err)}`)
+  process.exit(1)
+})
diff --git a/bench/src/router-client.ts b/bench/src/router-client.ts
index 0014d22..d1b20e8 100644
--- a/bench/src/router-client.ts
+++ b/bench/src/router-client.ts
@@ -138,3 +138,82 @@ export async function routerChatWithTools(
     ...(costUsd !== undefined ? 
{ costUsd } : {}),
   }
 }
+
+export interface ToolSpec {
+  type: 'function'
+  function: { name: string; description?: string; parameters: unknown }
+}
+
+export interface RouterToolLoopResult {
+  /** The model's final assistant text (the turn where it stopped calling tools, or the budget turn). */
+  final: string
+  /** Inference turns spent (≤ maxTurns) — the equal-budget unit vs random@k. */
+  turns: number
+  toolCalls: number
+  usage: { input: number; output: number }
+}
+
+/**
+ * The tool-using router backend: a real agentic loop OVER the Tangle router (which
+ * supports tool-calling), off-box — no sandbox. Each turn is one router completion
+ * with `tools`; if the model emits tool_calls, `execute` runs them on the host and
+ * their results are folded back as `tool` messages; the loop repeats until the
+ * model answers without a tool call or the turn budget is hit. One turn = one
+ * inference call, so `maxTurns` is the equal-compute unit against random@k.
+ *
+ * This is the depth substrate for agentic gates (the worker ACTS, observes the real
+ * result, and continues) that the chat-only `routerChatWithUsage` cannot express.
+ */
+export async function routerToolLoop(
+  cfg: RouterConfig,
+  system: string,
+  user: string,
+  tools: ReadonlyArray<ToolSpec>,
+  execute: (name: string, args: Record<string, unknown>) => Promise<string>,
+  opts?: { maxTurns?: number; temperature?: number; signal?: AbortSignal },
+): Promise<RouterToolLoopResult> {
+  const maxTurns = opts?.maxTurns ?? 4
+  const messages: Array<Record<string, unknown>> = [
+    { role: 'system', content: system },
+    { role: 'user', content: user },
+  ]
+  let toolCalls = 0
+  let lastText = ''
+  const usage = { input: 0, output: 0 }
+
+  for (let turn = 1; turn <= maxTurns; turn += 1) {
+    const r = await routerChatWithTools(cfg, messages, tools, {
+      ...(opts?.temperature !== undefined ? { temperature: opts.temperature } : {}),
+      ...(opts?.signal ? { signal: opts.signal } : {}),
+    })
+    usage.input += r.usage?.input ?? 0 // a turn without usage data contributes nothing
+    usage.output += r.usage?.output ?? 0
+    if (r.content) lastText = r.content
+    if (r.toolCalls.length === 0) return { final: lastText, turns: turn, toolCalls, usage }
+
+    // Record the assistant turn verbatim (content + the tool_calls it requested), then
+    // run each call on the host and fold the result back as a `tool` message.
+    messages.push({
+      role: 'assistant',
+      content: r.content ?? '',
+      tool_calls: r.toolCalls.map((tc) => ({ id: tc.id, type: 'function', function: { name: tc.name, arguments: tc.arguments } })),
+    })
+    for (const tc of r.toolCalls) {
+      toolCalls += 1
+      let args: Record<string, unknown> = {}
+      try {
+        const parsed: unknown = JSON.parse(tc.arguments)
+        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) throw new TypeError('tool arguments must be a JSON object')
+        args = parsed as Record<string, unknown>
+      } catch {
+        // Malformed tool args (invalid JSON, or valid JSON that is not an object, e.g. `null`) are a real model
+        // outcome, not an infra fault — feed the error back so the model can correct, rather than throwing the whole loop.
+        messages.push({ role: 'tool', tool_call_id: tc.id, content: `error: arguments were not a valid JSON object: ${tc.arguments.slice(0, 200)}` })
+        continue
+      }
+      const out = await execute(tc.name, args)
+      messages.push({ role: 'tool', tool_call_id: tc.id, content: out })
+    }
+  }
+  return { final: lastText, turns: maxTurns, toolCalls, usage }
+}