Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion bench/src/commit0-gate.mts
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,11 @@ import { Sandbox } from '@tangle-network/sandbox'
import { createCommit0Adapter } from './benchmarks/commit0'
import type { BenchTask } from './benchmarks/types'
import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'
import {
type BenchRuntimeDecisionPoint,
type BenchRuntimeHookEvent,
createRuntimeHookRecorder,
} from './runtime-hook-recorder'
import { pool } from './stats.mts'

function must(name: string): string {
Expand Down Expand Up @@ -67,6 +71,7 @@ interface Shot {
/** measured count of stream events from the rollout (0 if it errored before streaming) */
events: number
runtimeEvents?: BenchRuntimeHookEvent[]
runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
}

/** Build the rollout prompt: clone the stub, implement the source, write the diff to
Expand Down Expand Up @@ -178,6 +183,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
ok,
events: turn.events.length,
runtimeEvents: runtime.events,
runtimeDecisionPoints: runtime.decisionPoints,
wallMs: Date.now() - startedAt,
...(ok ? {} : { detail: `empty patch${turn.readError ? ` (read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}` }),
}
Expand All @@ -190,6 +196,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
ok: false,
events: 0,
runtimeEvents: runtime.events,
runtimeDecisionPoints: runtime.decisionPoints,
wallMs: Date.now() - startedAt,
detail: `rollout error: ${msg.slice(0, 200)}`,
}
Expand Down Expand Up @@ -330,6 +337,9 @@ async function main(): Promise<void> {
const runtimeEvents = shots
.filter((x) => x.task.id === task.id)
.flatMap((x) => x.runtimeEvents ?? [])
const runtimeDecisionPoints = shots
.filter((x) => x.task.id === task.id)
.flatMap((x) => x.runtimeDecisionPoints ?? [])
for (let i = 0; i < k; i += 1) {
const s = shots.find((x) => x.task.id === task.id && x.attempt === i)
let sc: { score: number; resolved: boolean } | undefined
Expand Down Expand Up @@ -365,6 +375,7 @@ async function main(): Promise<void> {
model,
infraError: false,
runtimeEvents,
runtimeDecisionPoints,
})
await appendRunRecord(corpusPath, record) // incremental: partial progress survives a crash
}
Expand Down
138 changes: 133 additions & 5 deletions bench/src/corpus.test.mts
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import assert from 'node:assert/strict'
import { isRunRecord } from '@tangle-network/agent-eval'
import { type AttemptRecord, benchRecordToCorpusRecords, buildRunRecordFromAttempts, type RunRecord } from './corpus'
import {
type AttemptRecord,
benchRecordToCorpusRecords,
buildRunRecord,
buildRunRecordFromAttempts,
type RunRecord,
} from './corpus'
import { createRuntimeHookRecorder } from './runtime-hook-recorder'

const measuredAttempt = (round: number, output: string, valid: boolean): AttemptRecord => ({
round,
Expand Down Expand Up @@ -31,6 +38,51 @@ const baseRec = (attempts: AttemptRecord[], over: Partial<RunRecord> = {}): RunR
...over,
})

// --- runtime recorder snapshots decision points before persistent corpus storage ---
// Pushes one oversized, secret-laden decision point through the recorder's
// `onDecisionPoint` hook, then checks that the stored snapshot is size-bounded
// and that sensitive values were redacted.
{
  const recorder = createRuntimeHookRecorder()
  // Both strings are longer than the limits asserted below and embed credential-like text.
  const largeContext = `Bearer abc.def.ghi ${'ctx'.repeat(10_000)}`
  const largeDetail = `token=supersecret ${'detail'.repeat(1_000)}`
  recorder.hooks.onDecisionPoint?.(
    {
      id: 'run-1:agent.turn:0:failure-recovery',
      runId: 'run-1',
      scenarioId: 'task-1',
      stepIndex: 0,
      kind: 'retry',
      // 75 candidates: more than the 50 the snapshot is expected to keep.
      candidateActions: Array.from({ length: 75 }, (_, index) => `candidate-${index}`),
      context: largeContext,
      evidence: [
        {
          source: 'tool_result',
          id: 'tool-1:result',
          detail: largeDetail,
          metadata: { authorization: 'Bearer should-not-survive', nested: { apiKey: 'also-redacted' } },
        },
      ],
      metadata: { token: 'should-not-survive', safe: 'kept' },
    },
    {},
  )

  const [point] = recorder.decisionPoints
  // A single guard is enough: `assert.ok` fails on a missing point and (as an
  // assertion function) narrows `point` for the direct property reads below.
  assert.ok(point, 'decision point recorded')

  // Size bounds.
  assert.equal(point.candidateActions.length, 50, 'candidate actions are bounded')
  assert.equal(point.context?.length, 20_000, 'context is bounded')
  assert.equal(point.evidence[0]?.detail?.length, 2_000, 'evidence detail is bounded')

  // Free-text redaction.
  assert.equal(point.context?.includes('abc.def.ghi'), false, 'context secrets are redacted')
  assert.equal(point.evidence[0]?.detail?.includes('supersecret'), false, 'evidence detail secrets are redacted')

  // Metadata redaction: sensitive keys are replaced, other keys pass through.
  assert.equal(point.metadata?.token, '[REDACTED]', 'top-level sensitive metadata is redacted')
  assert.equal(point.metadata?.safe, 'kept', 'non-sensitive metadata is preserved')
  assert.equal(point.evidence[0]?.metadata?.authorization, '[REDACTED]', 'evidence metadata is redacted')
  assert.equal(
    (point.evidence[0]?.metadata?.nested as { apiKey?: unknown } | undefined)?.apiKey,
    '[REDACTED]',
    'nested sensitive metadata is redacted',
  )
}

// --- happy path: a measured run projects to one canonical CorpusRecord per attempt ---
{
const rec = baseRec([
Expand Down Expand Up @@ -103,17 +155,93 @@ const baseRec = (attempts: AttemptRecord[], over: Partial<RunRecord> = {}): RunR
assert.equal(records[0]?.outcome.searchScore, undefined, 'no searchScore on a holdout record')
}

// --- bench writer preserves runtime trajectory evidence and semantic decision points ---
// Builds a run record from one iteration plus one lifecycle event and one
// decision point, then checks both runtime arrays are present on the result.
{
  // Shared fixtures: the scenario id doubles as the instance id, and the clock is pinned.
  const scenarioId = 'task-1'
  const fixedNow = () => new Date('2026-06-03T00:00:00.000Z')

  const written = buildRunRecord({
    benchmark: 'commit0',
    instanceId: scenarioId,
    condition: 'random@2',
    model: 'gpt-5',
    resolved: true,
    infraError: false,
    now: fixedNow,
    iterations: [
      {
        index: 0,
        task: 'prompt',
        agentRunName: 'worker',
        output: 'completion',
        verdict: { valid: true, score: 1 },
        events: [],
        startedAt: 10,
        endedAt: 20,
        costUsd: 0.01,
        tokenUsage: { input: 10, output: 5 },
      },
    ],
    // One passive lifecycle event.
    runtimeEvents: [
      {
        id: 'run-1:agent.run:before',
        runId: 'run-1',
        scenarioId,
        target: 'agent.run',
        phase: 'before',
        timestamp: 1,
      },
    ],
    // One semantic decision point carrying metadata that must round-trip.
    runtimeDecisionPoints: [
      {
        id: 'run-1:agent.turn:0:failure-recovery',
        runId: 'run-1',
        scenarioId,
        stepIndex: 0,
        kind: 'retry',
        candidateActions: ['retry', 'verify', 'stop'],
        evidence: [{ source: 'tool_result', id: 'tool-1:result' }],
        metadata: { target: 'failure-recovery' },
      },
    ],
  })

  assert.equal(written.runtimeEvents?.length, 1, 'runtime lifecycle events survive the writer')
  assert.equal(written.runtimeDecisionPoints?.length, 1, 'runtime decision points survive the writer')
  assert.equal(written.runtimeDecisionPoints?.[0]?.metadata?.target, 'failure-recovery')
}

// --- buildRunRecordFromAttempts: default derivations from the attempts ---
// Two attempts (first invalid, second valid) plus one lifecycle event and one
// decision point. Checks the derived flags and that both runtime arrays are
// carried onto the record.
{
  // Single declaration of `rec`: the meta object must include the runtime
  // arrays, because the final two assertions require them on the result.
  const rec = buildRunRecordFromAttempts([measuredAttempt(0, 'a', false), measuredAttempt(1, 'b', true)], {
    benchmark: 'aec-bench',
    instanceId: 'i9',
    condition: 'random@2',
    model: 'gpt-5',
    now: () => new Date('2026-06-06T00:00:00.000Z'),
    runtimeEvents: [
      {
        id: 'run-2:agent.run:before',
        runId: 'run-2',
        target: 'agent.run',
        phase: 'before',
        timestamp: 1,
      },
    ],
    runtimeDecisionPoints: [
      {
        id: 'run-2:agent.turn:0:failure-recovery',
        runId: 'run-2',
        stepIndex: 0,
        kind: 'retry',
        candidateActions: ['retry', 'verify', 'stop'],
        evidence: [{ source: 'tool_result', id: 'tool-2:result' }],
      },
    ],
  })

  // Derived fields.
  assert.equal(rec.ts, '2026-06-06T00:00:00.000Z', 'now() seam stamps ts')
  assert.equal(rec.blindResolved, false, 'blindResolved = attempts[0].valid === true')
  assert.equal(rec.resolved, true, 'resolved = any attempt valid')
  assert.equal(rec.infraError, false, 'scored+valid attempts ⇒ not infra')
  assert.equal(rec.attempts.length, 2)

  // Runtime evidence passes through unchanged in count.
  assert.equal(rec.runtimeEvents?.length, 1, 'attempt writer preserves lifecycle events')
  assert.equal(rec.runtimeDecisionPoints?.length, 1, 'attempt writer preserves decision points')
}

// --- no scored + no valid attempt ⇒ derived infraError ---
Expand Down
12 changes: 11 additions & 1 deletion bench/src/corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import { dirname } from 'node:path'
import { hashContent, type RunSplitTag, validateRunRecord } from '@tangle-network/agent-eval'
import type { CorpusRecord } from '@tangle-network/agent-eval/rl'
import type { Iteration } from '@tangle-network/agent-runtime/loops'
import type { BenchRuntimeHookEvent } from './runtime-hook-recorder'
import type { BenchRuntimeDecisionPoint, BenchRuntimeHookEvent } from './runtime-hook-recorder'

/** One attempt within a condition-run: the prompt/steer sent, the output, the
* verdict, the measured economics, and a bounded trace summary.
Expand Down Expand Up @@ -70,6 +70,8 @@ export interface RunRecord {
commitSha?: string
/** Passive runtime hook evidence captured during the run. Optional and bounded by producers. */
runtimeEvents?: BenchRuntimeHookEvent[]
/** Semantic runtime decision points captured during the run. Optional and producer-defined. */
runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
}

const TRACE_TAIL_MAX = 600
Expand Down Expand Up @@ -120,6 +122,7 @@ export function buildRunRecord<Task, Output>(args: {
splitTag?: RunSplitTag
commitSha?: string
runtimeEvents?: BenchRuntimeHookEvent[]
runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
}): RunRecord {
const attempts = args.iterations.map(summarizeAttempt)
return {
Expand All @@ -138,6 +141,9 @@ export function buildRunRecord<Task, Output>(args: {
...(args.runtimeEvents !== undefined && args.runtimeEvents.length > 0
? { runtimeEvents: args.runtimeEvents }
: {}),
...(args.runtimeDecisionPoints !== undefined && args.runtimeDecisionPoints.length > 0
? { runtimeDecisionPoints: args.runtimeDecisionPoints }
: {}),
}
}

Expand Down Expand Up @@ -169,6 +175,7 @@ export function buildRunRecordFromAttempts(
splitTag?: RunSplitTag
commitSha?: string
runtimeEvents?: BenchRuntimeHookEvent[]
runtimeDecisionPoints?: BenchRuntimeDecisionPoint[]
},
): RunRecord {
const anyScored = attempts.some((a) => a.score !== undefined)
Expand All @@ -189,6 +196,9 @@ export function buildRunRecordFromAttempts(
...(meta.runtimeEvents !== undefined && meta.runtimeEvents.length > 0
? { runtimeEvents: meta.runtimeEvents }
: {}),
...(meta.runtimeDecisionPoints !== undefined && meta.runtimeDecisionPoints.length > 0
? { runtimeDecisionPoints: meta.runtimeDecisionPoints }
: {}),
}
}

Expand Down
1 change: 1 addition & 0 deletions bench/src/experiment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ export async function runExperiment(cfg: ExperimentConfig): Promise<ExperimentRe
infraError,
...(cfg.now ? { now: cfg.now } : {}),
runtimeEvents: runtime.events,
runtimeDecisionPoints: runtime.decisionPoints,
}),
).catch((err) =>
console.error(
Expand Down
Loading
Loading