tangle-network · drewstone · Jun 8, 2026 · Jun 7, 2026 · Jun 7, 2026 · Jun 8, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,11 @@
+# agent-runtime Agent Bootloader
+
+Read `CLAUDE.md` first. This repo keeps provider-specific entry files short and points to where each kind of guidance lives:
+
+- `CLAUDE.md`: repo orientation, code map, layering, commands, and local deltas.
+- `docs/BUILDING.md`: stable building discipline.
+- `docs/ANTI_PATTERNS.md`: named failure modes and stop signs.
+- `.evolve/current.json` and `memory/`: live state and evidence ledger.
+
+Do not duplicate long-lived process rules here. Add durable rules to the docs
+above and keep this file as the provider-neutral pointer.
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -56,12 +56,7 @@ This repo is the empirical home of the RSI/learning-flywheel thesis, but **mecha
 
 **The live science state — every number, what's proven/disproven, the current goal — lives in `.evolve/current.json` + the `memory/` evidence ledger. Read them; do not mirror them here.** `docs/eval-substrate.md` holds the north star (the RSI runtime + its eval substrate) and the measurement non-negotiables.
 
-**Process discipline (the anti-patterns that have bitten this repo):**
-- **Don't build mechanism ahead of the gate.** Per-branch adaptive sub-agents, learned planners, the outer flywheel — all wait for a *positive* gate result. Expressiveness was the closed gap; the open one is evidentiary.
-- **Don't re-run a settled measurement.** The instrument already returned 0 coding-headroom (3 runs) and steering-loses on FinSearchComp. Read the dated controlled-result memory note before proposing to "test if steering helps" again.
-- **Estimate cost before launch.** cells × per-cell-time / concurrency. A cell is a multi-min rollout; GEPA multiplies it (POP×GENS×cells). FinSearchComp-over-sandbox ≈ 3hr/run with ~14% stream-drop loss — budget it or use the offline corpus / local gate (conc≤2).
-- **Confounds before causal claims.** Never claim a win where treatment got more compute than control. Isolate via refine@k vs random@k at EQUAL k; exclude infra-errored cells; report the discordant count; apply BH across arms; prefer deterministic-judge domains. Run the cheapest decisive check first.
-- **No overclaim.** "Validates the concept" ≠ "validates the product." Route through the real kernel (`runLoop` + `createDriver` + judge-as-`Validator`) to claim the product. Underpowered splits (n≈20) are not wins. A confounded "steering proven" (treatment got more compute than control) is a cautionary precedent — see the memory ledger.
+**Process discipline:** stable build rules live in `docs/BUILDING.md`; named failure modes live in `docs/ANTI_PATTERNS.md`. `CLAUDE.md` is the bootloader, not the whole policy manual.
 
 ## Memory discipline
 

diff --git a/bench/src/fleet.mts b/bench/src/fleet.mts
@@ -0,0 +1,122 @@
+/**
+ * The whole vision, end to end, runnable from a laptop: a thin local driver
+ * fans out N workers to CLOUD sandboxes, observes each worker's trace, reports
+ * what to fix, and writes durable learnings to a corpus the NEXT run reads back.
+ *
+ * Local process = the driver (orchestrate + observe). All agent work runs in the
+ * cloud (Tangle sandbox SDK). Scale is the API key, not the laptop.
+ *
+ *   dotenvx run -f ~/company/devops/secrets/.env.keys -f ~/company/devops/secrets/agent-state.env -- \
+ *     env BACKEND=opencode MODEL=gpt-4.1 N=2 CORPUS=/tmp/fleet-corpus.jsonl \
+ *     pnpm exec tsx src/fleet.mts
+ *
+ * Run it twice with the same CORPUS: the second run injects the first run's learnings into the workers.
+ */
+import { createChatClient } from '@tangle-network/agent-eval'
+import { FileCorpus, observe, openSandboxRun, renderReport } from '@tangle-network/agent-runtime/loops'
+import { Sandbox } from '@tangle-network/sandbox'
+import { answerOutput, sandboxAgentRun, type WorkerBackendType } from './experiment'
+
+function env(name: string, fallback?: string): string { // reads process.env[name], else `fallback`
+  const v = process.env[name] ?? fallback // `??` keeps an empty-string value; only an unset variable takes the fallback
+  if (v === undefined) throw new Error(`missing env ${name}`) // unset and no fallback given: fail fast
+  return v
+}
+
+// The fleet's work: each worker gets one subtask, so this list also caps the worker count. Swap for any real decomposition.
+const subtasks = [
+  'Write a Python function `is_prime(n)` and three asserts proving it. Run them and report PASS/FAIL.',
+  'Write a Python function `fib(n)` (iterative) and three asserts proving it. Run them and report PASS/FAIL.',
+  'Write a Python function `rev_words(s)` that reverses word order, with asserts. Run them and report PASS/FAIL.',
+]
+
+interface WorkerResult { // what runWorker reports back for one worker, on success or failure
+  id: string // worker label, e.g. `worker-1`
+  task: string // the subtask text as given, without any prior-learnings prefix
+  output: string // trimmed answer; '' when the run threw
+  events: unknown[] // events of the worker's turn; [] when the run threw
+  wallMs: number // milliseconds from worker start until it returned
+  error?: string // present only when the run threw; carries the error message
+}
+
+async function runWorker( // runs one subtask in a sandbox; failures are returned in `error`, not thrown
+  client: Sandbox,
+  cfg: { backendType: WorkerBackendType; model: string; routerBaseUrl: string; routerKey: string },
+  id: string, // worker label; also passed to sandboxAgentRun as `name`
+  task: string,
+  priorLearnings: string, // text prepended to the task; '' sends the task alone
+): Promise<WorkerResult> {
+  const startedAt = Date.now()
+  const prompt = priorLearnings ? `${priorLearnings}\n\n---\n\n${task}` : task
+  const controller = new AbortController() // TIMEOUT_MS must be a positive number; unset, empty, zero or non-numeric falls back to 240 s
+  const timer = setTimeout(() => controller.abort(), Number(process.env.TIMEOUT_MS) > 0 ? Number(process.env.TIMEOUT_MS) : 240_000)
+  try {
+    const agentRun = sandboxAgentRun({ ...cfg, name: id })
+    const run = await openSandboxRun<string>(
+      client,
+      { agentRun, signal: controller.signal },
+      { kind: 'events', fromEvents: (events) => answerOutput.parse(events as never) },
+    )
+    try {
+      const turn = await run.start(prompt)
+      return { id, task, output: (turn.out ?? '').trim(), events: turn.events, wallMs: Date.now() - startedAt }
+    } finally {
+      await run.close().catch(() => {}) // best-effort cleanup: a close failure must not mask the turn's result or error
+    }
+  } catch (err) {
+    return { id, task, output: '', events: [], wallMs: Date.now() - startedAt, error: err instanceof Error ? err.message : String(err) }
+  } finally {
+    clearTimeout(timer) // always disarm the abort timer, on success and on failure
+  }
+}
+
+async function main(): Promise<void> { // driver: load prior learnings, fan out workers, observe each, report
+  const routerKey = env('TANGLE_API_KEY') // required; the same key is used for the router and the sandbox client
+  const cfg = {
+    backendType: env('BACKEND', 'opencode') as WorkerBackendType,
+    model: env('MODEL', 'gpt-4.1'),
+    routerBaseUrl: env('ROUTER_BASE_URL', 'https://router.tangle.tools/v1'),
+    routerKey,
+  }
+  const n = Math.max(0, Math.min(Math.trunc(Number(env('N', '2'))) || 0, subtasks.length)) // whole number in [0, subtasks.length]; non-numeric or negative N runs 0 workers
+  const corpus = new FileCorpus(env('CORPUS', '/tmp/fleet-corpus.jsonl'))
+  const observerModel = env('OBSERVER_MODEL', 'gpt-4.1')
+  const chat = createChatClient({ transport: 'router', apiKey: routerKey, baseUrl: cfg.routerBaseUrl, defaultModel: observerModel })
+  const client = new Sandbox({ baseUrl: env('SANDBOX_BASE_URL', 'https://sandbox.tangle.tools'), apiKey: routerKey })
+
+  // ── continuous: read what prior runs LEARNED, inject it into this run's workers
+  const prior = await corpus.query({ tags: ['audience:agent'], limit: 8 })
+  const priorLearnings = prior.length
+    ? `PRIOR LEARNINGS (from earlier runs — apply them):\n${prior.map((r) => `- ${r.claim}`).join('\n')}`
+    : ''
+  console.error(`\n=== FLEET · ${n} workers · ${cfg.backendType}/${cfg.model} · cloud ===`)
+  console.error(prior.length ? `carrying ${prior.length} prior learning(s) into the workers\n` : 'first run — no prior learnings yet\n')
+
+  // ── fan out N workers to cloud sandboxes, in parallel
+  const tasks = subtasks.slice(0, n)
+  const workers = await Promise.all( // runWorker reports failures in its result, so one bad worker does not reject the batch
+    tasks.map((task, i) => runWorker(client, cfg, `worker-${i + 1}`, task, priorLearnings)),
+  )
+
+  // ── observe each worker's trace → findings → operator report + durable learnings
+  let totalLearned = 0
+  for (const w of workers) {
+    console.error(`\n── ${w.id} (${Math.round(w.wallMs / 1000)}s)${w.error ? ` — ERROR: ${w.error}` : ''}`)
+    if (w.error) continue // errored workers are reported above but not observed
+    const ob = await observe( // NOTE(review): any non-empty output is labelled 'passed', even one that reports FAIL — confirm
+      { task: w.task, output: w.output, trace: w.events, outcome: w.output ? 'passed' : 'unknown', runId: w.id },
+      { chat, model: observerModel, corpus, tags: [cfg.backendType, 'fleet'] },
+    )
+    totalLearned += ob.learned.length
+    console.error(`  answer: ${w.output.slice(0, 120).replace(/\n/g, ' ')}`)
+    console.error(renderReport(ob.findings).split('\n').map((l) => `  ${l}`).join('\n'))
+    console.error(`  → ${ob.learned.length} new learning(s) saved to the corpus`)
+  }
+
+  console.error(`\n=== fleet done: ${workers.filter((w) => !w.error).length}/${n} workers ok · ${totalLearned} learnings banked → run again to apply them ===`)
+}
+
+main().catch((e) => { // script entry point: anything main() rejects with ends the process
+  console.error(e)
+  process.exit(1) // non-zero exit status signals the failure to the caller
+})