tangle-network · drewstone · Jun 8, 2026 · Jun 8, 2026
diff --git a/bench/src/benchmarks/humaneval.ts b/bench/src/benchmarks/humaneval.ts
@@ -98,6 +98,9 @@ function buildProgram(task: HumanEvalTask, candidate: string): string {
 export interface CheckResult {
   /** {0,1} pass-count for this candidate (1 = the check() suite passed). */
   pass: number
+  /** On failure: the last ≤600 chars of interpreter stderr (traceback / failing assertion), or the
+   *  literal 'timed out (no output)' when stderr was empty. The execution-grounded feedback a self-repair loop steers on; ignored by selection. */
+  detail?: string
 }
 
 /** Run one candidate's deployable test program in an isolated container:
@@ -174,8 +177,9 @@ export function runChecker(task: HumanEvalTask, candidate: string): Promise<Chec
             return
           }
           // killed-by-timeout or a non-zero exit (assert failure / error) are genuine
-          // test FAILURES — score 0, do not throw.
-          finish({ pass: 0 })
+          // test FAILURES — score 0, do not throw. Carry the stderr tail as the
+          // execution-grounded failure detail (empty ⇒ timeout/SIGKILL left no output).
+          finish({ pass: 0, detail: (stderr || '').slice(-600) || 'timed out (no output)' })
           return
         }
         finish({ pass: 1 })

diff --git a/bench/src/humaneval-repair-gate.mts b/bench/src/humaneval-repair-gate.mts
@@ -0,0 +1,143 @@
+/**
+ * HumanEval self-repair gate — the TOOL-USING router backend vs blind resampling
+ * at equal compute, the strongest form of the steering question.
+ *
+ * The earlier steering gate (the rsi analyst arm) used an LLM that AUDITED the
+ * prior code WITHOUT running it — and produced a null result (−1.2pp, n.s.). This removes that
+ * weakness: the worker gets a `run_tests` tool (the deployable Docker checker), so
+ * it actually RUNS the tests, sees the real failure, and fixes — execution-grounded
+ * self-repair, off-box over the Tangle router's tool-calling (no sandbox). If
+ * steering ever beats compute on a deployable checker, this is where it should.
+ *
+ *   blind@K   — K independent completions, verifier-grounded pick           (breadth/resample)
+ *   repair@K  — ONE worker, up to K tool-turns: write → run_tests → fix → …  (depth/tool-grounded)
+ *
+ * Equal budget: one inference turn = one router completion, so both arms spend ≤K
+ * completions. Both finals are judged by the SAME check() suite. Per-task {0,1}
+ * outcomes, paired 95% bootstrap CI (discordant pairs = the power).
+ *
+ *   TANGLE_API_KEY=… N=82 K=3 OFFSET=82 WORKER_MODEL=gpt-3.5-turbo \
+ *     tsx src/humaneval-repair-gate.mts
+ */
+import { type HumanEvalTask, basePrompt, extractCode, loadHumanEval, runChecker } from './benchmarks/humaneval'
+import { type RouterConfig, type ToolSpec, routerChatWithUsage, routerToolLoop } from './router-client'
+import { verifierGroundedSelect } from './selector'
+import { type PairedLift, pairedLift, pool } from './stats.mts'
+
+function must(name: string): string { // Returns process.env[name]; throws when the variable is unset or empty.
+  const v = process.env[name]
+  if (!v) throw new Error(`env ${name} is required`) // `!v` rejects both undefined and the empty string
+  return v
+}
+
+const runTestsTool: ToolSpec = { // The single tool offered to the repair worker; repairAttempt serves its calls with runChecker.
+  type: 'function',
+  function: {
+    name: 'run_tests',
+    description:
+      "Run the task's test suite against your candidate function and return PASS or the real failure output. Verify with this before giving your final answer.",
+    parameters: { // argument shape: an object with one required string field, `code`
+      type: 'object',
+      properties: { code: { type: 'string', description: 'The COMPLETE Python function definition to test (signature + body, plus any imports).' } },
+      required: ['code'],
+    },
+  },
+}
+
+const repairSystem = [ // System prompt for the repair worker; the sentences below are joined with single spaces.
+  'You complete a Python function. You have a run_tests tool that runs the REAL test suite against your code.',
+  'Workflow: write the function, call run_tests to check it, and if it fails read the error and fix the function, then call run_tests again.',
+  'When run_tests reports all tests passed, reply with the final function in a single ```python block and do NOT call the tool again.',
+].join(' ')
+
+/** repair@K: one worker, up to K inference turns, steering on real test failures. Returns the {0,1} check() result of the worker's final code (0 when it produced no code at all). */
+async function repairAttempt(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number> {
+  let lastTested = '' // most recent code submitted to run_tests; used as the fallback final answer
+  const r = await routerToolLoop(
+    cfg,
+    repairSystem,
+    basePrompt(task),
+    [runTestsTool],
+    async (name, args) => {
+      if (name !== 'run_tests') return `error: unknown tool ${name}`
+      const code = extractCode(String(args.code ?? ''))
+      lastTested = code
+      const res = await runChecker(task, code)
+      return res.pass === 1
+        ? 'ALL TESTS PASSED. Reply with the final function now; do not call run_tests again.'
+        : `TESTS FAILED:\n${res.detail ?? 'no output'}\n\nFix the function and call run_tests again.`
+    },
+    { maxTurns: k, temperature: 0.3 }, // k caps inference turns, not checker runs
+  )
+  // Judge the model's final answer; fall back to the last code it tested (it may
+  // report "done" without re-pasting the passing function).
+  const finalCode = extractCode(r.final) || lastTested
+  if (!finalCode) return 0
+  return (await runChecker(task, finalCode)).pass // scoring run of the checker; no inference turn is spent here
+}
+
+/** blind@K: K independent completions, verifier-grounded pick (the resample control). Returns the K per-sample {0,1} check() results in sampling order; the pick itself is made by the caller. */
+async function blindAttempts(cfg: RouterConfig, task: HumanEvalTask, k: number): Promise<number[]> {
+  const base = basePrompt(task)
+  const passes: number[] = []
+  for (let i = 0; i < k; i += 1) { // samples are drawn one after another, each from the bare task prompt
+    const res = await routerChatWithUsage(cfg, [{ role: 'user', content: base }], { temperature: 0.8 })
+    passes.push((await runChecker(task, extractCode(res.content))).pass)
+  }
+  return passes
+}
+
+const pct = (x: number) => `${(x * 100).toFixed(1)}%` // fraction → percent with one decimal, e.g. 0.5 → "50.0%"
+const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` // fraction → signed percentage points, e.g. 0.012 → "+1.2pp"
+
+/** CLI entry point: reads N, K, OFFSET, WORKER_MODEL, ROUTER_BASE, CONCURRENCY and TANGLE_API_KEY from the environment, runs blind@K and repair@K on every loaded task, then prints pass rates, paired lifts and a verdict. Throws on invalid settings or when no task is loaded. */
+async function main(): Promise<void> {
+  const n = Number(process.env.N ?? 82)
+  const k = Number(process.env.K ?? 3)
+  const offset = Number(process.env.OFFSET ?? 82)
+  const model = process.env.WORKER_MODEL ?? 'gpt-3.5-turbo'
+  const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model }
+  const concurrency = Number(process.env.CONCURRENCY ?? 6)
+  if (!Number.isInteger(k) || k < 2) throw new Error('K must be an integer >= 2 (repair needs at least write + one fix)') // NaN slips past a bare `k < 2`; a fractional K would give the two arms different budgets
+  if (!Number.isInteger(n) || n < 1 || !Number.isInteger(offset) || offset < 0 || !Number.isInteger(concurrency) || concurrency < 1) throw new Error('N and CONCURRENCY must be integers >= 1 and OFFSET an integer >= 0')
+
+  console.log(`=== HumanEval self-repair gate · tool-using router worker · N=${n} K=${k} offset=${offset} model=${model} ===`)
+  const tasks = await loadHumanEval(n, offset)
+  if (tasks.length === 0) throw new Error(`no tasks loaded for N=${n} OFFSET=${offset}`) // otherwise every rate below is 0/0 = NaN
+  console.log(`loaded ${tasks.length} task(s); running blind@${k} (resample) vs repair@${k} (run_tests-grounded), conc=${concurrency}\n`)
+
+  const rows = await pool(tasks, concurrency, async (task, i) => {
+    const blind = await blindAttempts(cfg, task, k)
+    const repair = await repairAttempt(cfg, task, k)
+    const blind1 = blind[0] ?? 0
+    const blindK = blind[verifierGroundedSelect(blind)] ?? 0
+    process.stderr.write(`  [${i + 1}/${tasks.length}] ${task.taskId}: blind@1=${blind1} blind@${k}=${blindK} repair@${k}=${repair}\n`)
+    return { blind1, blindK, repair }
+  })
+
+  const blind1 = rows.map((r) => r.blind1)
+  const blindK = rows.map((r) => r.blindK)
+  const repairK = rows.map((r) => r.repair)
+  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length
+
+  console.log(`\n${'='.repeat(74)}\nRESULTS · HumanEval self-repair · n=${tasks.length} · k=${k} · ${model}\n${'='.repeat(74)}`)
+  console.log(`  blind pass@1        ${pct(rate(blind1))}`)
+  console.log(`  blind@${k} (resample) ${pct(rate(blindK))}`)
+  console.log(`  repair@${k} (tools)   ${pct(rate(repairK))}`)
+
+  const row = (label: string, l: PairedLift) => console.log(`  ${label.padEnd(34)} ${pp(l.point).padStart(7)}   CI [${pp(l.low)}, ${pp(l.high)}]   (paired ${l.pairs}, discordant ${l.discordant})`)
+  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)')
+
+  const repairVsBlind = pairedLift(blindK, repairK)
+  const computeVsBlind1 = pairedLift(blind1, blindK)
+  console.log(`\n  PAIRED LIFTS (95% bootstrap CI, B=10000):`)
+  row(`repair@${k} − blind@${k} (steering)`, repairVsBlind)
+  row(`blind@${k} − blind@1 (more-compute)`, computeVsBlind1)
+  console.log(`\n  VERDICT:`)
+  console.log(`    execution-grounded self-repair beats blind resampling @ equal k?  ${repairVsBlind.point > 0 ? 'yes' : 'no'} (${pp(repairVsBlind.point)}, ${sig(repairVsBlind)})`)
+}
+
+main().catch((err) => { // any rejection from main is fatal: report it and exit non-zero
+  console.error(`humaneval-repair-gate: ${err instanceof Error ? (err.stack ?? err.message) : String(err)}`) // prefer the stack, then the message, then String(err)
+  process.exit(1)
+})
diff --git a/bench/src/router-client.ts b/bench/src/router-client.ts
@@ -138,3 +138,82 @@ export async function routerChatWithTools(
     ...(costUsd !== undefined ? { costUsd } : {}),
   }
 }
+
+export interface ToolSpec { // One function-tool definition; routerToolLoop forwards these to routerChatWithTools.
+  type: 'function'
+  function: { name: string; description?: string; parameters: unknown } // `parameters` is opaque to this module (presumably a JSON Schema object — confirm)
+}
+
+export interface RouterToolLoopResult { // What routerToolLoop returns once the model stops calling tools or the turn budget runs out.
+  /** The model's final assistant text (the turn where it stopped calling tools, or the budget turn). */
+  final: string
+  /** Inference turns spent (≤ maxTurns) — the equal-budget unit vs random@k. */
+  turns: number
+  toolCalls: number // tool calls the model requested across all turns (counted even when their arguments could not be parsed)
+  usage: { input: number; output: number } // token usage summed over the turns that reported it
+}
+
+/**
+ * The tool-using router backend: an agentic loop OVER the Tangle router, off-box — no
+ * sandbox. Each turn is ONE router completion with `tools`, so `maxTurns` (default 4) is
+ * the equal-compute unit against random@k. When the model emits tool_calls, `execute`
+ * runs each on the host and its string result is folded back as a `tool` message; the
+ * loop ends when the model answers without a tool call or the turn budget is spent.
+ * Tool arguments that are not a JSON object never reach `execute`: the model gets a tool
+ * error to correct instead. Errors thrown by `execute` or the router call propagate.
+ * @returns `final`: text of the terminating turn ('' if it had none); `turns`: turns used;
+ *   `toolCalls`: calls requested; `usage`: tokens summed over turns that reported usage.
+ */
+export async function routerToolLoop(
+  cfg: RouterConfig,
+  system: string,
+  user: string,
+  tools: ReadonlyArray<ToolSpec>,
+  execute: (name: string, args: Record<string, unknown>) => Promise<string>,
+  opts?: { maxTurns?: number; temperature?: number; signal?: AbortSignal },
+): Promise<RouterToolLoopResult> {
+  const maxTurns = opts?.maxTurns ?? 4
+  const messages: Array<Record<string, unknown>> = [
+    { role: 'system', content: system },
+    { role: 'user', content: user },
+  ]
+  let toolCalls = 0
+  let lastText = ''
+  const usage = { input: 0, output: 0 }
+
+  for (let turn = 1; turn <= maxTurns; turn += 1) {
+    const r = await routerChatWithTools(cfg, messages, tools, {
+      ...(opts?.temperature !== undefined ? { temperature: opts.temperature } : {}),
+      ...(opts?.signal ? { signal: opts.signal } : {}),
+    })
+    usage.input += r.usage?.input ?? 0
+    usage.output += r.usage?.output ?? 0
+    // Keep THIS turn's text only: carrying an earlier draft forward would report stale code as `final`.
+    lastText = r.content ?? ''
+    if (r.toolCalls.length === 0) return { final: lastText, turns: turn, toolCalls, usage }
+
+    // Record the assistant turn verbatim (content + requested tool_calls), then run each call on the host.
+    messages.push({
+      role: 'assistant',
+      content: r.content ?? '',
+      tool_calls: r.toolCalls.map((tc) => ({ id: tc.id, type: 'function', function: { name: tc.name, arguments: tc.arguments } })),
+    })
+    for (const tc of r.toolCalls) {
+      toolCalls += 1
+      let args: unknown = null
+      try {
+        args = JSON.parse(tc.arguments)
+      } catch {
+        // Unparseable JSON: `args` stays null and takes the rejection path below.
+      }
+      if (typeof args !== 'object' || args === null || Array.isArray(args)) {
+        // Bad tool args are a model outcome, not an infra fault — feed the error back so it can correct.
+        messages.push({ role: 'tool', tool_call_id: tc.id, content: `error: arguments were not a valid JSON object: ${tc.arguments.slice(0, 200)}` })
+        continue
+      }
+      const out = await execute(tc.name, args as Record<string, unknown>)
+      messages.push({ role: 'tool', tool_call_id: tc.id, content: out })
+    }
+  }
+  return { final: lastText, turns: maxTurns, toolCalls, usage }
+}