tangle-network · drewstone · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026
diff --git a/bench/src/commit0-gate.mts b/bench/src/commit0-gate.mts
@@ -37,7 +37,11 @@ import { Sandbox } from '@tangle-network/sandbox'
 import { createCommit0Adapter } from './benchmarks/commit0'
 import type { BenchTask } from './benchmarks/types'
 import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
-import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'
+import {
+  type BenchRuntimeDecisionPoint,
+  type BenchRuntimeHookEvent,
+  createRuntimeHookRecorder,
+} from './runtime-hook-recorder'
 import { pool } from './stats.mts'
 
 function must(name: string): string {
@@ -67,6 +71,7 @@ interface Shot {
   /** measured count of stream events from the rollout (0 if it errored before streaming) */
   events: number
   runtimeEvents?: BenchRuntimeHookEvent[]
+  runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
 }
 
 /** Build the rollout prompt: clone the stub, implement the source, write the diff to
@@ -178,6 +183,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
       ok,
       events: turn.events.length,
       runtimeEvents: runtime.events,
+      runtimeDecisionPoints: runtime.decisionPoints,
       wallMs: Date.now() - startedAt,
       ...(ok ? {} : { detail: `empty patch${turn.readError ? ` (read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}` }),
     }
@@ -190,6 +196,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
       ok: false,
       events: 0,
       runtimeEvents: runtime.events,
+      runtimeDecisionPoints: runtime.decisionPoints,
       wallMs: Date.now() - startedAt,
       detail: `rollout error: ${msg.slice(0, 200)}`,
     }
@@ -330,6 +337,9 @@ async function main(): Promise<void> {
     const runtimeEvents = shots
       .filter((x) => x.task.id === task.id)
       .flatMap((x) => x.runtimeEvents ?? [])
+    const runtimeDecisionPoints = shots
+      .filter((x) => x.task.id === task.id)
+      .flatMap((x) => x.runtimeDecisionPoints ?? [])
     for (let i = 0; i < k; i += 1) {
       const s = shots.find((x) => x.task.id === task.id && x.attempt === i)
       let sc: { score: number; resolved: boolean } | undefined
@@ -365,6 +375,7 @@ async function main(): Promise<void> {
       model,
       infraError: false,
       runtimeEvents,
+      runtimeDecisionPoints,
     })
     await appendRunRecord(corpusPath, record) // incremental: partial progress survives a crash
   }

diff --git a/bench/src/corpus.test.mts b/bench/src/corpus.test.mts
@@ -1,6 +1,13 @@
 import assert from 'node:assert/strict'
 import { isRunRecord } from '@tangle-network/agent-eval'
-import { type AttemptRecord, benchRecordToCorpusRecords, buildRunRecordFromAttempts, type RunRecord } from './corpus'
+import {
+  type AttemptRecord,
+  benchRecordToCorpusRecords,
+  buildRunRecord,
+  buildRunRecordFromAttempts,
+  type RunRecord,
+} from './corpus'
+import { createRuntimeHookRecorder } from './runtime-hook-recorder'
 
 const measuredAttempt = (round: number, output: string, valid: boolean): AttemptRecord => ({
   round,
@@ -31,6 +38,51 @@ const baseRec = (attempts: AttemptRecord[], over: Partial<RunRecord> = {}): RunR
   ...over,
 })
 
+// --- runtime recorder snapshots decision points (bounded + redacted) before persistent corpus storage ---
+{
+  const recorder = createRuntimeHookRecorder()
+  const largeContext = `Bearer abc.def.ghi ${'ctx'.repeat(10_000)}` // 30_019 chars, bearer token first
+  const largeDetail = `token=supersecret ${'detail'.repeat(1_000)}` // 6_018 chars, token=… secret first
+  recorder.hooks.onDecisionPoint?.(
+    {
+      id: 'run-1:agent.turn:0:failure-recovery',
+      runId: 'run-1',
+      scenarioId: 'task-1',
+      stepIndex: 0,
+      kind: 'retry',
+      candidateActions: Array.from({ length: 75 }, (_, index) => `candidate-${index}`), // 75 in; 50 expected out
+      context: largeContext,
+      evidence: [
+        {
+          source: 'tool_result',
+          id: 'tool-1:result',
+          detail: largeDetail,
+          metadata: { authorization: 'Bearer should-not-survive', nested: { apiKey: 'also-redacted' } }, // sensitive keys at two depths
+        },
+      ],
+      metadata: { token: 'should-not-survive', safe: 'kept' }, // one sensitive key, one benign key
+    },
+    {}, // second hook argument left empty; NOTE(review): its contract lives in runtime-hook-recorder, not shown here
+  )
+
+  const [point] = recorder.decisionPoints // only the first recorded point is inspected
+  assert.ok(point, 'decision point recorded')
+  assert.notEqual(point, undefined) // NOTE(review): redundant — assert.ok above already rejects undefined
+  assert.equal(point.candidateActions.length, 50, 'candidate actions are bounded')
+  assert.equal(point.context?.length, 20_000, 'context is bounded')
+  assert.equal(point.evidence[0]?.detail?.length, 2_000, 'evidence detail is bounded')
+  assert.equal(point.context?.includes('abc.def.ghi'), false, 'context secrets are redacted')
+  assert.equal(point.evidence[0]?.detail?.includes('supersecret'), false, 'evidence detail secrets are redacted')
+  assert.equal(point.metadata?.token, '[REDACTED]', 'top-level sensitive metadata is redacted')
+  assert.equal(point.metadata?.safe, 'kept', 'non-sensitive metadata is preserved')
+  assert.equal(point.evidence[0]?.metadata?.authorization, '[REDACTED]', 'evidence metadata is redacted')
+  assert.equal(
+    (point.evidence[0]?.metadata?.nested as { apiKey?: unknown } | undefined)?.apiKey, // cast only to read the nested key
+    '[REDACTED]',
+    'nested sensitive metadata is redacted',
+  )
+}
+
 // --- happy path: a measured run projects to one canonical CorpusRecord per attempt ---
 {
   const rec = baseRec([
@@ -103,17 +155,93 @@ const baseRec = (attempts: AttemptRecord[], over: Partial<RunRecord> = {}): RunR
   assert.equal(records[0]?.outcome.searchScore, undefined, 'no searchScore on a holdout record')
 }
 
+// --- bench writer preserves runtime trajectory evidence and semantic decision points ---
+{
+  const record = buildRunRecord({
+    benchmark: 'commit0',
+    instanceId: 'task-1',
+    condition: 'random@2',
+    model: 'gpt-5',
+    resolved: true,
+    infraError: false,
+    now: () => new Date('2026-06-03T00:00:00.000Z'), // fixed clock; the resulting ts is not asserted in this block
+    iterations: [
+      {
+        index: 0,
+        task: 'prompt',
+        agentRunName: 'worker',
+        output: 'completion',
+        verdict: { valid: true, score: 1 },
+        events: [],
+        startedAt: 10,
+        endedAt: 20,
+        costUsd: 0.01,
+        tokenUsage: { input: 10, output: 5 },
+      },
+    ],
+    runtimeEvents: [ // one lifecycle event in; one expected on the record
+      {
+        id: 'run-1:agent.run:before',
+        runId: 'run-1',
+        scenarioId: 'task-1',
+        target: 'agent.run',
+        phase: 'before',
+        timestamp: 1,
+      },
+    ],
+    runtimeDecisionPoints: [ // one decision point in; one expected on the record
+      {
+        id: 'run-1:agent.turn:0:failure-recovery',
+        runId: 'run-1',
+        scenarioId: 'task-1',
+        stepIndex: 0,
+        kind: 'retry',
+        candidateActions: ['retry', 'verify', 'stop'],
+        evidence: [{ source: 'tool_result', id: 'tool-1:result' }],
+        metadata: { target: 'failure-recovery' }, // read back below to check the payload is carried, not just counted
+      },
+    ],
+  })
+  assert.equal(record.runtimeEvents?.length, 1, 'runtime lifecycle events survive the writer')
+  assert.equal(record.runtimeDecisionPoints?.length, 1, 'runtime decision points survive the writer')
+  assert.equal(record.runtimeDecisionPoints?.[0]?.metadata?.target, 'failure-recovery') // nested metadata value survives the writer
+}
+
 // --- buildRunRecordFromAttempts: default derivations from the attempts ---
 {
-  const rec = buildRunRecordFromAttempts(
-    [measuredAttempt(0, 'a', false), measuredAttempt(1, 'b', true)],
-    { benchmark: 'aec-bench', instanceId: 'i9', condition: 'random@2', model: 'gpt-5', now: () => new Date('2026-06-06T00:00:00.000Z') },
-  )
+  const rec = buildRunRecordFromAttempts([measuredAttempt(0, 'a', false), measuredAttempt(1, 'b', true)], { // first attempt invalid, second valid
+    benchmark: 'aec-bench',
+    instanceId: 'i9',
+    condition: 'random@2',
+    model: 'gpt-5',
+    now: () => new Date('2026-06-06T00:00:00.000Z'), // fixed clock; rec.ts is asserted against this value below
+    runtimeEvents: [ // one lifecycle event in; one expected on the record
+      {
+        id: 'run-2:agent.run:before',
+        runId: 'run-2', // no scenarioId on this event — presumably optional on BenchRuntimeHookEvent; confirm against the type
+        target: 'agent.run',
+        phase: 'before',
+        timestamp: 1,
+      },
+    ],
+    runtimeDecisionPoints: [ // one decision point in; one expected on the record
+      {
+        id: 'run-2:agent.turn:0:failure-recovery',
+        runId: 'run-2',
+        stepIndex: 0,
+        kind: 'retry',
+        candidateActions: ['retry', 'verify', 'stop'],
+        evidence: [{ source: 'tool_result', id: 'tool-2:result' }],
+      },
+    ],
+  })
   assert.equal(rec.ts, '2026-06-06T00:00:00.000Z', 'now() seam stamps ts')
   assert.equal(rec.blindResolved, false, 'blindResolved = attempts[0].valid === true')
   assert.equal(rec.resolved, true, 'resolved = any attempt valid')
   assert.equal(rec.infraError, false, 'scored+valid attempts ⇒ not infra')
   assert.equal(rec.attempts.length, 2)
+  assert.equal(rec.runtimeEvents?.length, 1, 'attempt writer preserves lifecycle events') // passthrough from meta.runtimeEvents
+  assert.equal(rec.runtimeDecisionPoints?.length, 1, 'attempt writer preserves decision points') // passthrough from meta.runtimeDecisionPoints
 }
 
 // --- no scored + no valid attempt ⇒ derived infraError ---

diff --git a/bench/src/corpus.ts b/bench/src/corpus.ts
@@ -17,7 +17,7 @@ import { dirname } from 'node:path'
 import { hashContent, type RunSplitTag, validateRunRecord } from '@tangle-network/agent-eval'
 import type { CorpusRecord } from '@tangle-network/agent-eval/rl'
 import type { Iteration } from '@tangle-network/agent-runtime/loops'
-import type { BenchRuntimeHookEvent } from './runtime-hook-recorder'
+import type { BenchRuntimeDecisionPoint, BenchRuntimeHookEvent } from './runtime-hook-recorder'
 
 /** One attempt within a condition-run: the prompt/steer sent, the output, the
  *  verdict, the measured economics, and a bounded trace summary.
@@ -70,6 +70,8 @@ export interface RunRecord {
   commitSha?: string
   /** Passive runtime hook evidence captured during the run. Optional and bounded by producers. */
   runtimeEvents?: BenchRuntimeHookEvent[]
+  /** Semantic runtime decision points captured during the run. Optional; stored as given by the producer. */
+  runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
 }
 
 const TRACE_TAIL_MAX = 600
@@ -120,6 +122,7 @@ export function buildRunRecord<Task, Output>(args: {
   splitTag?: RunSplitTag
   commitSha?: string
   runtimeEvents?: BenchRuntimeHookEvent[]
+  runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
 }): RunRecord {
   const attempts = args.iterations.map(summarizeAttempt)
   return {
@@ -138,6 +141,9 @@ export function buildRunRecord<Task, Output>(args: {
     ...(args.runtimeEvents !== undefined && args.runtimeEvents.length > 0
       ? { runtimeEvents: args.runtimeEvents }
       : {}),
+    ...(args.runtimeDecisionPoints !== undefined && args.runtimeDecisionPoints.length > 0
+      ? { runtimeDecisionPoints: args.runtimeDecisionPoints }
+      : {}),
   }
 }
 
@@ -169,6 +175,7 @@ export function buildRunRecordFromAttempts(
     splitTag?: RunSplitTag
     commitSha?: string
     runtimeEvents?: BenchRuntimeHookEvent[]
+    runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
   },
 ): RunRecord {
   const anyScored = attempts.some((a) => a.score !== undefined)
@@ -189,6 +196,9 @@ export function buildRunRecordFromAttempts(
     ...(meta.runtimeEvents !== undefined && meta.runtimeEvents.length > 0
       ? { runtimeEvents: meta.runtimeEvents }
       : {}),
+    ...(meta.runtimeDecisionPoints !== undefined && meta.runtimeDecisionPoints.length > 0
+      ? { runtimeDecisionPoints: meta.runtimeDecisionPoints }
+      : {}),
   }
 }
 

diff --git a/bench/src/experiment.ts b/bench/src/experiment.ts
@@ -340,6 +340,7 @@ export async function runExperiment(cfg: ExperimentConfig): Promise<ExperimentRe
           infraError,
           ...(cfg.now ? { now: cfg.now } : {}),
           runtimeEvents: runtime.events,
+          runtimeDecisionPoints: runtime.decisionPoints,
         }),
       ).catch((err) =>
         console.error(