diff --git a/bench/src/benchmarks/humaneval.ts b/bench/src/benchmarks/humaneval.ts index 60fefdf..8a966bc 100644 --- a/bench/src/benchmarks/humaneval.ts +++ b/bench/src/benchmarks/humaneval.ts @@ -98,6 +98,9 @@ function buildProgram(task: HumanEvalTask, candidate: string): string { export interface CheckResult { /** {0,1} pass-count for this candidate (1 = the check() suite passed). */ pass: number + /** On failure: the interpreter stderr tail (traceback / failing assertion). The + * execution-grounded feedback a self-repair loop steers on; ignored by selection. */ + detail?: string } /** Run one candidate's deployable test program in an isolated container: @@ -174,8 +177,9 @@ export function runChecker(task: HumanEvalTask, candidate: string): Promise { + let lastTested = '' + const r = await routerToolLoop( + cfg, + repairSystem, + basePrompt(task), + [runTestsTool], + async (name, args) => { + if (name !== 'run_tests') return `error: unknown tool ${name}` + const code = extractCode(String(args.code ?? '')) + lastTested = code + const res = await runChecker(task, code) + return res.pass === 1 + ? 'ALL TESTS PASSED. Reply with the final function now; do not call run_tests again.' + : `TESTS FAILED:\n${res.detail ?? 'no output'}\n\nFix the function and call run_tests again.` + }, + { maxTurns: k, temperature: 0.3 }, + ) + // Judge the model's final answer; fall back to the last code it tested (it may + // report "done" without re-pasting the passing function). + const finalCode = extractCode(r.final) || lastTested + if (!finalCode) return 0 + return (await runChecker(task, finalCode)).pass +} + +/** blind@K: K independent completions, verifier-grounded pick (the resample control). 
/**
 * blind@K — the resample control arm.
 *
 * Draws `k` completions of the task's base prompt, one after another, each at
 * temperature 0.8 and each from a fresh single-message conversation, and runs every
 * extracted candidate through `runChecker`.
 *
 * @param cfg  Router connection settings, passed through to `routerChatWithUsage`.
 * @param task The HumanEval task to sample for.
 * @param k    Number of completions to draw.
 * @returns The `pass` value of each sample, in sampling order (length `k`).
 */
async function blindAttempts(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number[]> {
  const base = basePrompt(task)
  const passes: number[] = []
  for (let i = 0; i < k; i += 1) {
    const res = await routerChatWithUsage(cfg, [{ role: 'user', content: base }], { temperature: 0.8 })
    passes.push((await runChecker(task, extractCode(res.content))).pass)
  }
  return passes
}

/** Formats a fraction as a percentage with one decimal, e.g. `0.5` -> `50.0%`. */
const pct = (x: number) => `${(x * 100).toFixed(1)}%`
/** Formats a fraction as signed percentage points, e.g. `0.05` -> `+5.0pp`. */
const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`

/**
 * Reads an integer knob from the environment.
 *
 * An unset or blank variable yields `fallback`. Anything that is not an integer
 * (including values `Number()` turns into `NaN`) is rejected, because a `NaN` budget
 * silently passes `<` comparisons and turns every sampling loop into a no-op.
 *
 * @throws Error when the variable is set but is not an integer.
 */
function readIntEnv(name: string, fallback: number): number {
  const raw = process.env[name]
  if (raw === undefined || raw.trim() === '') return fallback
  const value = Number(raw)
  if (!Number.isInteger(value)) {
    throw new Error(`${name} must be an integer (got ${JSON.stringify(raw)})`)
  }
  return value
}

/**
 * Entry point of the HumanEval self-repair gate.
 *
 * Configuration comes from the environment: `N` (task count, default 82), `K`
 * (per-task budget, default 3), `OFFSET` (default 82), `WORKER_MODEL`, `ROUTER_BASE`,
 * `TANGLE_API_KEY` (required via `must`) and `CONCURRENCY` (default 6).
 *
 * For every loaded task it runs the blind arm (`blindAttempts`) and the repair arm
 * (`repairAttempt`) with the same `k`, then prints the per-arm pass rates, the paired
 * lifts returned by `pairedLift`, and a verdict line.
 *
 * @throws Error when a knob is not an integer or is out of range, or when no task was
 *         loaded (the rates would otherwise be 0/0 and print as `NaN%`).
 */
async function main(): Promise<void> {
  const n = readIntEnv('N', 82)
  const k = readIntEnv('K', 3)
  const offset = readIntEnv('OFFSET', 82)
  const model = process.env.WORKER_MODEL ?? 'gpt-3.5-turbo'
  const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model }
  const concurrency = readIntEnv('CONCURRENCY', 6)
  if (k < 2) throw new Error('K must be >= 2 (repair needs at least write + one fix)')
  if (n < 1) throw new Error(`N must be >= 1 (got ${n})`)
  if (offset < 0) throw new Error(`OFFSET must be >= 0 (got ${offset})`)
  if (concurrency < 1) throw new Error(`CONCURRENCY must be >= 1 (got ${concurrency})`)

  console.log(`=== HumanEval self-repair gate · tool-using router worker · N=${n} K=${k} offset=${offset} model=${model} ===`)
  const tasks = await loadHumanEval(n, offset)
  console.log(`loaded ${tasks.length} task(s); running blind@${k} (resample) vs repair@${k} (run_tests-grounded), conc=${concurrency}\n`)
  // A verdict over zero tasks is meaningless; stop before the averages divide by zero.
  if (tasks.length === 0) throw new Error(`no HumanEval tasks loaded (N=${n}, OFFSET=${offset})`)

  // NOTE(review): `pool` presumably maps over `tasks` with at most `concurrency` tasks
  // in flight and returns results in task order — confirm against its definition.
  const rows = await pool(tasks, concurrency, async (task, i) => {
    const blind = await blindAttempts(cfg, task, k)
    const repair = await repairAttempt(cfg, task, k)
    const blind1 = blind[0] ?? 0
    // `verifierGroundedSelect` yields an index into the blind samples.
    const blindK = blind[verifierGroundedSelect(blind)] ?? 0
    process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId}: blind@1=${blind1} blind@${k}=${blindK} repair@${k}=${repair}\n`)
    return { blind1, blindK, repair }
  })

  const blind1 = rows.map((r) => r.blind1)
  const blindK = rows.map((r) => r.blindK)
  const repairK = rows.map((r) => r.repair)
  // Mean of the per-task pass values; `xs` is non-empty thanks to the guard above.
  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length

  console.log(`\n${'='.repeat(74)}`)
  console.log(`RESULTS · HumanEval self-repair · n=${tasks.length} · k=${k} · ${model}`)
  console.log('='.repeat(74))
  console.log(` blind pass@1 ${pct(rate(blind1))}`)
  console.log(` blind@${k} (resample) ${pct(rate(blindK))}`)
  console.log(` repair@${k} (tools) ${pct(rate(repairK))}`)

  const row = (label: string, l: PairedLift) =>
    console.log(` ${label.padEnd(34)} ${pp(l.point).padStart(7)} CI [${pp(l.low)}, ${pp(l.high)}] (paired ${l.pairs}, discordant ${l.discordant})`)
  // Significant only when the whole interval sits on one side of zero.
  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)')

  // NOTE(review): the labels below read "second − first", so `pairedLift(a, b)` is
  // presumably the lift of `b` over `a` — confirm against its definition.
  const repairVsBlind = pairedLift(blindK, repairK)
  const computeVsBlind1 = pairedLift(blind1, blindK)
  console.log(`\n PAIRED LIFTS (95% bootstrap CI, B=10000):`)
  row(`repair@${k} − blind@${k} (steering)`, repairVsBlind)
  row(`blind@${k} − blind@1 (more-compute)`, computeVsBlind1)
  console.log(`\n VERDICT:`)
  console.log(` execution-grounded self-repair beats blind resampling @ equal k? ${repairVsBlind.point > 0 ? 'yes' : 'no'} (${pp(repairVsBlind.point)}, ${sig(repairVsBlind)})`)
}

// Any failure (bad configuration, router error, checker error) is reported with its
// stack when available and turned into a non-zero exit code.
main().catch((err) => {
  console.error(`humaneval-repair-gate: ${err instanceof Error ? (err.stack ?? err.message) : String(err)}`)
  process.exit(1)
})
/** A tool advertised to the model: a named function plus its parameter schema. */
export interface ToolSpec {
  type: 'function'
  function: { name: string; description?: string; parameters: unknown }
}

export interface RouterToolLoopResult {
  /** The model's final assistant text (the turn where it stopped calling tools, or the budget turn). */
  final: string
  /** Inference turns spent (≤ maxTurns) — the equal-budget unit vs random@k. */
  turns: number
  /** Total tool calls the model requested, including ones rejected for bad arguments. */
  toolCalls: number
  /** Token usage summed over every turn that reported usage. */
  usage: { input: number; output: number }
}

/**
 * The tool-using router backend: an agentic loop over `routerChatWithTools`.
 *
 * Each turn is one completion with `tools`. If the model emits tool calls, `execute`
 * runs them one at a time and each result is appended as a `tool` message; the loop
 * repeats until the model answers without a tool call or the turn budget is spent.
 * One turn = one inference call, so `maxTurns` is the equal-compute unit against
 * random@k. Tool calls requested on the last budgeted turn are still executed, but
 * the model gets no further turn to react to them.
 *
 * Tool arguments that are not valid JSON, or that are valid JSON but not an object
 * (`null`, an array, a primitive), are never passed to `execute`: an error `tool`
 * message is fed back instead so the model can correct itself.
 *
 * @param cfg     Router connection settings, passed through to `routerChatWithTools`.
 * @param system  System prompt.
 * @param user    Initial user message.
 * @param tools   Tools advertised on every turn.
 * @param execute Host-side tool runner; receives the tool name and its parsed
 *                object arguments and returns the text shown to the model. Errors
 *                it throws propagate out of the loop.
 * @param opts    `maxTurns` (default 4), plus optional `temperature` and `signal`
 *                forwarded to every completion.
 * @returns The last non-empty assistant text, the number of turns actually run, the
 *          tool-call count and the accumulated usage.
 */
export async function routerToolLoop(
  cfg: RouterConfig,
  system: string,
  user: string,
  tools: ReadonlyArray<ToolSpec>,
  execute: (name: string, args: Record<string, unknown>) => Promise<string>,
  opts?: { maxTurns?: number; temperature?: number; signal?: AbortSignal },
): Promise<RouterToolLoopResult> {
  const maxTurns = opts?.maxTurns ?? 4
  const messages: Array<Record<string, unknown>> = [
    { role: 'system', content: system },
    { role: 'user', content: user },
  ]
  let toolCalls = 0
  let turns = 0
  let lastText = ''
  const usage = { input: 0, output: 0 }

  for (let turn = 1; turn <= maxTurns; turn += 1) {
    turns = turn
    const r = await routerChatWithTools(cfg, messages, tools, {
      ...(opts?.temperature !== undefined ? { temperature: opts.temperature } : {}),
      ...(opts?.signal ? { signal: opts.signal } : {}),
    })
    if (r.usage) {
      usage.input += r.usage.input
      usage.output += r.usage.output
    }
    // Keep the most recent non-empty text: a tool-only turn must not erase it.
    if (r.content) lastText = r.content
    if (r.toolCalls.length === 0) return { final: lastText, turns, toolCalls, usage }

    // Record the assistant turn verbatim (content + the tool_calls it requested), then
    // run each call on the host and fold the result back as a `tool` message.
    messages.push({
      role: 'assistant',
      content: r.content ?? '',
      tool_calls: r.toolCalls.map((tc) => ({ id: tc.id, type: 'function', function: { name: tc.name, arguments: tc.arguments } })),
    })
    for (const tc of r.toolCalls) {
      toolCalls += 1
      let parsed: unknown
      try {
        parsed = JSON.parse(tc.arguments)
      } catch {
        // Malformed tool args from the model are a real outcome, not an infra fault — feed
        // the error back so the model can correct, rather than throwing the whole loop.
        messages.push({ role: 'tool', tool_call_id: tc.id, content: `error: arguments were not valid JSON: ${tc.arguments.slice(0, 200)}` })
        continue
      }
      // `JSON.parse` also accepts `null`, arrays and primitives. Handing those to
      // `execute` as if they were an argument object makes property reads throw (for
      // `null`) and aborts the loop, so treat them like any other malformed call.
      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        messages.push({ role: 'tool', tool_call_id: tc.id, content: `error: arguments must be a JSON object: ${tc.arguments.slice(0, 200)}` })
        continue
      }
      const out = await execute(tc.name, parsed as Record<string, unknown>)
      messages.push({ role: 'tool', tool_call_id: tc.id, content: out })
    }
  }
  // Budget exhausted while the model was still calling tools.
  return { final: lastText, turns, toolCalls, usage }
}