From 5729396cb846285d818a8b56f084530b2b2a0818 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Mon, 8 Jun 2026 10:30:25 -0600
Subject: [PATCH] feat(bench): tool-using router backend (routerToolLoop) +
 self-repair gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

routerToolLoop (router-client.ts): a real agentic loop OVER the Tangle router's
tool-calling, OFF-BOX — each turn is a router completion with `tools`; tool_calls
execute on the host and fold back as `tool` messages; repeat until done or budget.
The depth substrate the chat-only routerChatWithUsage couldn't express (routes
around the sandbox→router egress block, #984). One turn = one completion, so
maxTurns is the equal-compute unit vs random@k.

humaneval-repair-gate.mts: the strongest steering test — the worker gets a
run_tests tool (the deployable Docker checker, now returning failure detail), so
it RUNS the tests, sees the real failure, and fixes, vs blind resampling at equal
k. Removes the weakness of the earlier LLM-audit null (which never ran the code).

Gate result (gpt-3.5-turbo, HumanEval hard half, n=82, equal k=3, paired bootstrap):
  blind@1 54.9%  blind@3 (resample) 75.6%  repair@3 (tools) 58.5%
  repair@3 − blind@3 (steering)     -17.1pp  CI [-26.8, -7.3]   SIGNIFICANT NEGATIVE
  blind@3 − blind@1 (more-compute)  +20.7pp  CI [+12.2, +30.5]  SIGNIFICANT POSITIVE

Execution-grounded self-repair is SIGNIFICANTLY WORSE than blind resampling at
equal budget: refining one anchored attempt loses to exploring k fresh ones. The
tool backend works (recovered tasks blind missed); the depth strategy loses on
single-shot codegen. Sharpens the steering-vs-compute boundary: breadth wins where
fresh samples are cheap+independent; depth's one win remains EOPS (stateful/agentic).
---
 bench/src/benchmarks/humaneval.ts   |   8 +-
 bench/src/humaneval-repair-gate.mts | 143 ++++++++++++++++++++++++++++
 bench/src/router-client.ts          |  79 +++++++++++++++
 3 files changed, 228 insertions(+), 2 deletions(-)
 create mode 100644 bench/src/humaneval-repair-gate.mts

diff --git a/bench/src/benchmarks/humaneval.ts b/bench/src/benchmarks/humaneval.ts
index 60fefdf..8a966bc 100644
--- a/bench/src/benchmarks/humaneval.ts
+++ b/bench/src/benchmarks/humaneval.ts
@@ -98,6 +98,9 @@ function buildProgram(task: HumanEvalTask, candidate: string): string {
 export interface CheckResult {
   /** {0,1} pass-count for this candidate (1 = the check() suite passed). */
   pass: number
+  /** On failure: the last 600 characters of the interpreter stderr (traceback / failing assertion), or
+   *  'timed out (no output)' when stderr was empty. Feedback a self-repair loop steers on; ignored by selection. */
+  detail?: string
 }
 
 /** Run one candidate's deployable test program in an isolated container:
@@ -174,8 +177,9 @@ export function runChecker(task: HumanEvalTask, candidate: string): Promise<Chec
             return
           }
           // killed-by-timeout or a non-zero exit (assert failure / error) are genuine
-          // test FAILURES — score 0, do not throw.
-          finish({ pass: 0 })
+          // test FAILURES — score 0, do not throw. Carry the stderr tail as the
+          // execution-grounded failure detail (empty ⇒ timeout/SIGKILL left no output).
+          finish({ pass: 0, detail: (stderr || '').slice(-600) || 'timed out (no output)' })
           return
         }
         finish({ pass: 1 })
diff --git a/bench/src/humaneval-repair-gate.mts b/bench/src/humaneval-repair-gate.mts
new file mode 100644
index 0000000..bc92ea5
--- /dev/null
+++ b/bench/src/humaneval-repair-gate.mts
@@ -0,0 +1,143 @@
+/**
+ * HumanEval self-repair gate — the TOOL-USING router backend vs blind resampling
+ * at equal compute, the strongest form of the steering question.
+ *
+ * The earlier steering gate (the rsi analyst arm) used an LLM that AUDITED the
+ * prior code WITHOUT running it — and was a null (−1.2pp, n.s.). This removes that
+ * weakness: the worker gets a `run_tests` tool (the deployable Docker checker), so
+ * it actually RUNS the tests, sees the real failure, and fixes — execution-grounded
+ * self-repair, off-box over the Tangle router's tool-calling (no sandbox). If
+ * steering ever beats compute on a deployable checker, this is where it should.
+ *
+ *   blind@K   — K independent completions, verifier-grounded pick           (breadth/resample)
+ *   repair@K  — ONE worker, up to K tool-turns: write → run_tests → fix → …  (depth/tool-grounded)
+ *
+ * Equal budget: one inference turn = one router completion, so both arms spend ≤K
+ * completions. Both finals are judged by the SAME check() suite. Per-task {0,1}
+ * outcomes, paired 95% bootstrap CI (discordant pairs = the power).
+ *
+ *   TANGLE_API_KEY=… N=82 K=3 OFFSET=82 WORKER_MODEL=gpt-3.5-turbo \
+ *     tsx src/humaneval-repair-gate.mts
+ */
+import { type HumanEvalTask, basePrompt, extractCode, loadHumanEval, runChecker } from './benchmarks/humaneval'
+import { type RouterConfig, type ToolSpec, routerChatWithUsage, routerToolLoop } from './router-client'
+import { verifierGroundedSelect } from './selector'
+import { type PairedLift, pairedLift, pool } from './stats.mts'
+
+function must(name: string): string { // required env var: returns its non-empty value, otherwise throws
+  const value = process.env[name]
+  if (value) return value
+  throw new Error(`env ${name} is required`)
+}
+
+const runTestsTool: ToolSpec = { // the single tool offered to the repair worker; repairAttempt's execute callback serves it
+  type: 'function',
+  function: {
+    name: 'run_tests',
+    description:
+      "Run the task's test suite against your candidate function and return PASS or the real failure output. Verify with this before giving your final answer.",
+    parameters: {
+      type: 'object',
+      properties: { code: { type: 'string', description: 'The COMPLETE Python function definition to test (signature + body, plus any imports).' } },
+      required: ['code'],
+    },
+  },
+}
+
+const repairSystem = [ // system prompt for the repair arm; the sentences below are joined with single spaces
+  'You complete a Python function. You have a run_tests tool that runs the REAL test suite against your code.',
+  'Workflow: write the function, call run_tests to check it, and if it fails read the error and fix the function, then call run_tests again.',
+  'When run_tests reports all tests passed, reply with the final function in a single ```python block and do NOT call the tool again.',
+].join(' ')
+
+/** repair@K: one worker, up to K inference turns, steering on real test failures. A candidate
+ *  that passed run_tests is the one judged — the same verifier-grounded pick blind@K gets. */
+async function repairAttempt(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number> {
+  let lastTested = ''
+  let passing = ''
+  const r = await routerToolLoop(
+    cfg,
+    repairSystem,
+    basePrompt(task),
+    [runTestsTool],
+    async (name, args) => {
+      if (name !== 'run_tests') return `error: unknown tool ${name}`
+      const code = extractCode(String(args.code ?? ''))
+      lastTested = code
+      const res = await runChecker(task, code)
+      if (res.pass !== 1) return `TESTS FAILED:\n${res.detail ?? 'no output'}\n\nFix the function and call run_tests again.`
+      if (!passing) passing = code
+      return 'ALL TESTS PASSED. Reply with the final function now; do not call run_tests again.'
+    },
+    { maxTurns: k, temperature: 0.3 },
+  )
+  // r.final is the loop's last non-empty text and may predate the last run_tests call, so it ranks below a verified pass.
+  const finalCode = passing || extractCode(r.final) || lastTested
+  return finalCode ? (await runChecker(task, finalCode)).pass : 0
+}
+
+/** blind@K: the resample control — K independent completions, each scored by the checker; the caller picks among them. */
+async function blindAttempts(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number[]> {
+  const prompt = basePrompt(task)
+  const outcomes: number[] = []
+  while (outcomes.length < k) {
+    const completion = await routerChatWithUsage(cfg, [{ role: 'user', content: prompt }], { temperature: 0.8 })
+    outcomes.push((await runChecker(task, extractCode(completion.content))).pass)
+  }
+  return outcomes
+}
+
+const pct = (x: number): string => (x * 100).toFixed(1) + '%' // fraction → percentage with one decimal
+const pp = (x: number): string => (x >= 0 ? '+' : '') + (x * 100).toFixed(1) + 'pp' // fraction → signed percentage points
+
+/** Entry point: reads N, K, OFFSET, WORKER_MODEL, ROUTER_BASE, CONCURRENCY and TANGLE_API_KEY from the environment, runs blind@K and repair@K on every task, then prints pass rates and paired lifts. Throws on a non-integer or out-of-range setting, or when no tasks load. */
+async function main(): Promise<void> {
+  const n = Number(process.env.N ?? 82)
+  const k = Number(process.env.K ?? 3)
+  const offset = Number(process.env.OFFSET ?? 82)
+  const model = process.env.WORKER_MODEL ?? 'gpt-3.5-turbo'
+  const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model }
+  const concurrency = Number(process.env.CONCURRENCY ?? 6)
+  for (const [name, value, min] of [['N', n, 1], ['K', k, 2], ['OFFSET', offset, 0], ['CONCURRENCY', concurrency, 1]] as const)
+    if (!Number.isInteger(value) || value < min) throw new Error(`${name} must be an integer >= ${min}`) // NaN from a typo would pass a bare `<` check; K >= 2 because repair needs at least write + one fix
+  console.log(`=== HumanEval self-repair gate · tool-using router worker · N=${n} K=${k} offset=${offset} model=${model} ===`)
+  const tasks = await loadHumanEval(n, offset)
+  if (tasks.length === 0) throw new Error(`no tasks loaded (N=${n}, OFFSET=${offset})`) // empty input would print NaN rates
+  console.log(`loaded ${tasks.length} task(s); running blind@${k} (resample) vs repair@${k} (run_tests-grounded), conc=${concurrency}\n`)
+
+  const rows = await pool(tasks, concurrency, async (task, i) => {
+    const blind = await blindAttempts(cfg, task, k)
+    const repair = await repairAttempt(cfg, task, k)
+    const blind1 = blind[0] ?? 0
+    const blindK = blind[verifierGroundedSelect(blind)] ?? 0
+    process.stderr.write(`  [${i + 1}/${tasks.length}] ${task.taskId}: blind@1=${blind1} blind@${k}=${blindK} repair@${k}=${repair}\n`)
+    return { blind1, blindK, repair }
+  })
+
+  const blind1 = rows.map((r) => r.blind1)
+  const blindK = rows.map((r) => r.blindK)
+  const repairK = rows.map((r) => r.repair)
+  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length
+
+  console.log(`\n${'='.repeat(74)}`)
+  console.log(`RESULTS · HumanEval self-repair · n=${tasks.length} · k=${k} · ${model}`)
+  console.log('='.repeat(74))
+  console.log(`  blind pass@1        ${pct(rate(blind1))}`)
+  console.log(`  blind@${k} (resample) ${pct(rate(blindK))}`)
+  console.log(`  repair@${k} (tools)   ${pct(rate(repairK))}`)
+
+  const row = (label: string, l: PairedLift) => console.log(`  ${label.padEnd(34)} ${pp(l.point).padStart(7)}   CI [${pp(l.low)}, ${pp(l.high)}]   (paired ${l.pairs}, discordant ${l.discordant})`)
+  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)')
+  const repairVsBlind = pairedLift(blindK, repairK)
+  const computeVsBlind1 = pairedLift(blind1, blindK)
+  console.log(`\n  PAIRED LIFTS (95% bootstrap CI, B=10000):`)
+  row(`repair@${k} − blind@${k} (steering)`, repairVsBlind)
+  row(`blind@${k} − blind@1 (more-compute)`, computeVsBlind1)
+  console.log(`\n  VERDICT:`)
+  console.log(`    execution-grounded self-repair beats blind resampling @ equal k?  ${repairVsBlind.point > 0 ? 'yes' : 'no'} (${pp(repairVsBlind.point)}, ${sig(repairVsBlind)})`)
+}
+
+main().catch((err) => { // top-level failure handler: report the stack (or message / stringified value) on stderr
+  console.error(`humaneval-repair-gate: ${err instanceof Error ? (err.stack ?? err.message) : String(err)}`)
+  process.exit(1) // non-zero exit status marks the run as failed
+})
diff --git a/bench/src/router-client.ts b/bench/src/router-client.ts
index 0014d22..d1b20e8 100644
--- a/bench/src/router-client.ts
+++ b/bench/src/router-client.ts
@@ -138,3 +138,82 @@ export async function routerChatWithTools(
     ...(costUsd !== undefined ? { costUsd } : {}),
   }
 }
+
+export interface ToolSpec { // one function-tool definition; routerToolLoop hands an array of these to routerChatWithTools
+  type: 'function'
+  function: { name: string; description?: string; parameters: unknown } // parameters is typed unknown: its schema is not checked by this interface
+}
+
+export interface RouterToolLoopResult {
+  /** The last non-empty assistant text seen — normally the turn where the model stopped calling tools, else the budget turn; '' if it never produced text. */
+  final: string
+  /** Inference turns spent (≤ maxTurns) — the equal-budget unit vs random@k. */
+  turns: number
+  toolCalls: number // tool calls the model requested across all turns, counted before their arguments are parsed
+  usage: { input: number; output: number } // input/output usage summed over the turns that reported it
+}
+
+/**
+ * The tool-using router backend: an agentic loop over the Tangle router's tool-calling,
+ * run off-box (no sandbox). Each turn is one router completion with `tools`; any
+ * tool_calls are run on the host via `execute` and folded back as `tool` messages, and
+ * the loop repeats until the model answers without a tool call or `maxTurns` (default 4)
+ * completions are spent — one turn = one inference call, the equal-compute unit vs random@k.
+ * Tool calls made on the budget turn are still executed; their results are not sent back.
+ * Arguments that are not a JSON object are reported to the model as a tool error instead
+ * of reaching `execute`; errors thrown by `execute` or the router reject the returned promise.
+ * `final` is the last non-empty assistant text seen, which may come from an earlier turn.
+ */
+export async function routerToolLoop(
+  cfg: RouterConfig,
+  system: string,
+  user: string,
+  tools: ReadonlyArray<ToolSpec>,
+  execute: (name: string, args: Record<string, unknown>) => Promise<string>,
+  opts?: { maxTurns?: number; temperature?: number; signal?: AbortSignal },
+): Promise<RouterToolLoopResult> {
+  const maxTurns = opts?.maxTurns ?? 4
+  const messages: Array<Record<string, unknown>> = [
+    { role: 'system', content: system },
+    { role: 'user', content: user },
+  ]
+  let toolCalls = 0
+  let turns = 0
+  let lastText = ''
+  const usage = { input: 0, output: 0 }
+  for (let turn = 1; turn <= maxTurns; turn += 1) {
+    turns = turn // completions actually spent, even when maxTurns is fractional or NaN
+    const r = await routerChatWithTools(cfg, messages, tools, {
+      ...(opts?.temperature !== undefined ? { temperature: opts.temperature } : {}),
+      ...(opts?.signal ? { signal: opts.signal } : {}),
+    })
+    if (r.usage) {
+      usage.input += r.usage.input
+      usage.output += r.usage.output
+    }
+    if (r.content) lastText = r.content
+    if (r.toolCalls.length === 0) break
+
+    // Record the assistant turn verbatim (content + the tool_calls it requested), then
+    // run each call on the host and fold the result back as a `tool` message.
+    messages.push({
+      role: 'assistant',
+      content: r.content ?? '',
+      tool_calls: r.toolCalls.map((tc) => ({ id: tc.id, type: 'function', function: { name: tc.name, arguments: tc.arguments } })),
+    })
+    for (const tc of r.toolCalls) {
+      toolCalls += 1
+      let args: Record<string, unknown> | undefined
+      try {
+        const parsed: unknown = JSON.parse(tc.arguments)
+        if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) args = parsed as Record<string, unknown>
+      } catch {
+        // Malformed JSON: leave args undefined so it is reported like any other bad payload.
+      }
+      // Bad tool args from the model are a real outcome, not an infra fault — feed the error back.
+      const out = args ? await execute(tc.name, args) : `error: arguments were not a valid JSON object: ${tc.arguments.slice(0, 200)}`
+      messages.push({ role: 'tool', tool_call_id: tc.id, content: out })
+    }
+  }
+  return { final: lastText, turns, toolCalls, usage }
+}