diff --git a/bench/src/commit0-gate.mts b/bench/src/commit0-gate.mts index 24ac823..4b09d4b 100644 --- a/bench/src/commit0-gate.mts +++ b/bench/src/commit0-gate.mts @@ -37,7 +37,11 @@ import { Sandbox } from '@tangle-network/sandbox' import { createCommit0Adapter } from './benchmarks/commit0' import type { BenchTask } from './benchmarks/types' import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus' -import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder' +import { + type BenchRuntimeDecisionPoint, + type BenchRuntimeHookEvent, + createRuntimeHookRecorder, +} from './runtime-hook-recorder' import { pool } from './stats.mts' function must(name: string): string { @@ -67,6 +71,7 @@ interface Shot { /** measured count of stream events from the rollout (0 if it errored before streaming) */ events: number runtimeEvents?: BenchRuntimeHookEvent[] + runtimeDecisionPoints?: BenchRuntimeDecisionPoint[] } /** Build the rollout prompt: clone the stub, implement the source, write the diff to @@ -178,6 +183,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise< ok, events: turn.events.length, runtimeEvents: runtime.events, + runtimeDecisionPoints: runtime.decisionPoints, wallMs: Date.now() - startedAt, ...(ok ? {} : { detail: `empty patch${turn.readError ? ` (read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}` }), } @@ -190,6 +196,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise< ok: false, events: 0, runtimeEvents: runtime.events, + runtimeDecisionPoints: runtime.decisionPoints, wallMs: Date.now() - startedAt, detail: `rollout error: ${msg.slice(0, 200)}`, } @@ -330,6 +337,9 @@ async function main(): Promise { const runtimeEvents = shots .filter((x) => x.task.id === task.id) .flatMap((x) => x.runtimeEvents ?? 
[]) + const runtimeDecisionPoints = shots + .filter((x) => x.task.id === task.id) + .flatMap((x) => x.runtimeDecisionPoints ?? []) for (let i = 0; i < k; i += 1) { const s = shots.find((x) => x.task.id === task.id && x.attempt === i) let sc: { score: number; resolved: boolean } | undefined @@ -365,6 +375,7 @@ async function main(): Promise { model, infraError: false, runtimeEvents, + runtimeDecisionPoints, }) await appendRunRecord(corpusPath, record) // incremental: partial progress survives a crash } diff --git a/bench/src/corpus.test.mts b/bench/src/corpus.test.mts index 90ea049..9eba8f6 100644 --- a/bench/src/corpus.test.mts +++ b/bench/src/corpus.test.mts @@ -1,6 +1,13 @@ import assert from 'node:assert/strict' import { isRunRecord } from '@tangle-network/agent-eval' -import { type AttemptRecord, benchRecordToCorpusRecords, buildRunRecordFromAttempts, type RunRecord } from './corpus' +import { + type AttemptRecord, + benchRecordToCorpusRecords, + buildRunRecord, + buildRunRecordFromAttempts, + type RunRecord, +} from './corpus' +import { createRuntimeHookRecorder } from './runtime-hook-recorder' const measuredAttempt = (round: number, output: string, valid: boolean): AttemptRecord => ({ round, @@ -31,6 +38,51 @@ const baseRec = (attempts: AttemptRecord[], over: Partial = {}): RunR ...over, }) +// --- runtime recorder snapshots decision points before persistent corpus storage --- +{ + const recorder = createRuntimeHookRecorder() + const largeContext = `Bearer abc.def.ghi ${'ctx'.repeat(10_000)}` + const largeDetail = `token=supersecret ${'detail'.repeat(1_000)}` + recorder.hooks.onDecisionPoint?.( + { + id: 'run-1:agent.turn:0:failure-recovery', + runId: 'run-1', + scenarioId: 'task-1', + stepIndex: 0, + kind: 'retry', + candidateActions: Array.from({ length: 75 }, (_, index) => `candidate-${index}`), + context: largeContext, + evidence: [ + { + source: 'tool_result', + id: 'tool-1:result', + detail: largeDetail, + metadata: { authorization: 'Bearer 
should-not-survive', nested: { apiKey: 'also-redacted' } }, + }, + ], + metadata: { token: 'should-not-survive', safe: 'kept' }, + }, + {}, + ) + + const [point] = recorder.decisionPoints + assert.ok(point, 'decision point recorded') + assert.notEqual(point, undefined) + assert.equal(point.candidateActions.length, 50, 'candidate actions are bounded') + assert.equal(point.context?.length, 20_000, 'context is bounded') + assert.equal(point.evidence[0]?.detail?.length, 2_000, 'evidence detail is bounded') + assert.equal(point.context?.includes('abc.def.ghi'), false, 'context secrets are redacted') + assert.equal(point.evidence[0]?.detail?.includes('supersecret'), false, 'evidence detail secrets are redacted') + assert.equal(point.metadata?.token, '[REDACTED]', 'top-level sensitive metadata is redacted') + assert.equal(point.metadata?.safe, 'kept', 'non-sensitive metadata is preserved') + assert.equal(point.evidence[0]?.metadata?.authorization, '[REDACTED]', 'evidence metadata is redacted') + assert.equal( + (point.evidence[0]?.metadata?.nested as { apiKey?: unknown } | undefined)?.apiKey, + '[REDACTED]', + 'nested sensitive metadata is redacted', + ) +} + // --- happy path: a measured run projects to one canonical CorpusRecord per attempt --- { const rec = baseRec([ @@ -103,17 +155,93 @@ const baseRec = (attempts: AttemptRecord[], over: Partial = {}): RunR assert.equal(records[0]?.outcome.searchScore, undefined, 'no searchScore on a holdout record') } +// --- bench writer preserves runtime trajectory evidence and semantic decision points --- +{ + const record = buildRunRecord({ + benchmark: 'commit0', + instanceId: 'task-1', + condition: 'random@2', + model: 'gpt-5', + resolved: true, + infraError: false, + now: () => new Date('2026-06-03T00:00:00.000Z'), + iterations: [ + { + index: 0, + task: 'prompt', + agentRunName: 'worker', + output: 'completion', + verdict: { valid: true, score: 1 }, + events: [], + startedAt: 10, + endedAt: 20, + costUsd: 0.01, + tokenUsage: { 
input: 10, output: 5 }, + }, + ], + runtimeEvents: [ + { + id: 'run-1:agent.run:before', + runId: 'run-1', + scenarioId: 'task-1', + target: 'agent.run', + phase: 'before', + timestamp: 1, + }, + ], + runtimeDecisionPoints: [ + { + id: 'run-1:agent.turn:0:failure-recovery', + runId: 'run-1', + scenarioId: 'task-1', + stepIndex: 0, + kind: 'retry', + candidateActions: ['retry', 'verify', 'stop'], + evidence: [{ source: 'tool_result', id: 'tool-1:result' }], + metadata: { target: 'failure-recovery' }, + }, + ], + }) + assert.equal(record.runtimeEvents?.length, 1, 'runtime lifecycle events survive the writer') + assert.equal(record.runtimeDecisionPoints?.length, 1, 'runtime decision points survive the writer') + assert.equal(record.runtimeDecisionPoints?.[0]?.metadata?.target, 'failure-recovery') +} + // --- buildRunRecordFromAttempts: default derivations from the attempts --- { - const rec = buildRunRecordFromAttempts( - [measuredAttempt(0, 'a', false), measuredAttempt(1, 'b', true)], - { benchmark: 'aec-bench', instanceId: 'i9', condition: 'random@2', model: 'gpt-5', now: () => new Date('2026-06-06T00:00:00.000Z') }, - ) + const rec = buildRunRecordFromAttempts([measuredAttempt(0, 'a', false), measuredAttempt(1, 'b', true)], { + benchmark: 'aec-bench', + instanceId: 'i9', + condition: 'random@2', + model: 'gpt-5', + now: () => new Date('2026-06-06T00:00:00.000Z'), + runtimeEvents: [ + { + id: 'run-2:agent.run:before', + runId: 'run-2', + target: 'agent.run', + phase: 'before', + timestamp: 1, + }, + ], + runtimeDecisionPoints: [ + { + id: 'run-2:agent.turn:0:failure-recovery', + runId: 'run-2', + stepIndex: 0, + kind: 'retry', + candidateActions: ['retry', 'verify', 'stop'], + evidence: [{ source: 'tool_result', id: 'tool-2:result' }], + }, + ], + }) assert.equal(rec.ts, '2026-06-06T00:00:00.000Z', 'now() seam stamps ts') assert.equal(rec.blindResolved, false, 'blindResolved = attempts[0].valid === true') assert.equal(rec.resolved, true, 'resolved = any attempt 
valid') assert.equal(rec.infraError, false, 'scored+valid attempts ⇒ not infra') assert.equal(rec.attempts.length, 2) + assert.equal(rec.runtimeEvents?.length, 1, 'attempt writer preserves lifecycle events') + assert.equal(rec.runtimeDecisionPoints?.length, 1, 'attempt writer preserves decision points') } // --- no scored + no valid attempt ⇒ derived infraError --- diff --git a/bench/src/corpus.ts b/bench/src/corpus.ts index 1b64dd8..33997c8 100644 --- a/bench/src/corpus.ts +++ b/bench/src/corpus.ts @@ -17,7 +17,7 @@ import { dirname } from 'node:path' import { hashContent, type RunSplitTag, validateRunRecord } from '@tangle-network/agent-eval' import type { CorpusRecord } from '@tangle-network/agent-eval/rl' import type { Iteration } from '@tangle-network/agent-runtime/loops' -import type { BenchRuntimeHookEvent } from './runtime-hook-recorder' +import type { BenchRuntimeDecisionPoint, BenchRuntimeHookEvent } from './runtime-hook-recorder' /** One attempt within a condition-run: the prompt/steer sent, the output, the * verdict, the measured economics, and a bounded trace summary. @@ -70,6 +70,8 @@ export interface RunRecord { commitSha?: string /** Passive runtime hook evidence captured during the run. Optional and bounded by producers. */ runtimeEvents?: BenchRuntimeHookEvent[] + /** Semantic runtime decision points captured during the run. Optional and producer-defined. */ + runtimeDecisionPoints?: BenchRuntimeDecisionPoint[] } const TRACE_TAIL_MAX = 600 @@ -120,6 +122,7 @@ export function buildRunRecord(args: { splitTag?: RunSplitTag commitSha?: string runtimeEvents?: BenchRuntimeHookEvent[] + runtimeDecisionPoints?: BenchRuntimeDecisionPoint[] }): RunRecord { const attempts = args.iterations.map(summarizeAttempt) return { @@ -138,6 +141,9 @@ export function buildRunRecord(args: { ...(args.runtimeEvents !== undefined && args.runtimeEvents.length > 0 ? 
{ runtimeEvents: args.runtimeEvents } : {}), + ...(args.runtimeDecisionPoints !== undefined && args.runtimeDecisionPoints.length > 0 + ? { runtimeDecisionPoints: args.runtimeDecisionPoints } + : {}), } } @@ -169,6 +175,7 @@ export function buildRunRecordFromAttempts( splitTag?: RunSplitTag commitSha?: string runtimeEvents?: BenchRuntimeHookEvent[] + runtimeDecisionPoints?: BenchRuntimeDecisionPoint[] }, ): RunRecord { const anyScored = attempts.some((a) => a.score !== undefined) @@ -189,6 +196,9 @@ export function buildRunRecordFromAttempts( ...(meta.runtimeEvents !== undefined && meta.runtimeEvents.length > 0 ? { runtimeEvents: meta.runtimeEvents } : {}), + ...(meta.runtimeDecisionPoints !== undefined && meta.runtimeDecisionPoints.length > 0 + ? { runtimeDecisionPoints: meta.runtimeDecisionPoints } + : {}), } } diff --git a/bench/src/experiment.ts b/bench/src/experiment.ts index 1e96397..3253dbb 100644 --- a/bench/src/experiment.ts +++ b/bench/src/experiment.ts @@ -340,6 +340,7 @@ export async function runExperiment(cfg: ExperimentConfig): Promise console.error( diff --git a/bench/src/runtime-hook-recorder.ts b/bench/src/runtime-hook-recorder.ts index 0b48782..237e6c5 100644 --- a/bench/src/runtime-hook-recorder.ts +++ b/bench/src/runtime-hook-recorder.ts @@ -13,26 +13,125 @@ export interface BenchRuntimeHookEvent { metadata?: Record } +export interface BenchRuntimeDecisionEvidenceRef { + source: string + id: string + detail?: string + metadata?: Record +} + +export interface BenchRuntimeDecisionPoint { + id: string + runId: string + scenarioId?: string + stepIndex: number + kind: string + candidateActions: string[] + context?: string + evidence: BenchRuntimeDecisionEvidenceRef[] + metadata?: Record +} + export interface BenchRuntimeHooks { onEvent?: ( event: BenchRuntimeHookEvent, context: { signal?: AbortSignal }, ) => void | Promise + onDecisionPoint?: ( + point: BenchRuntimeDecisionPoint, + context: { signal?: AbortSignal }, + ) => void | Promise } export 
interface RuntimeHookRecorder { readonly events: BenchRuntimeHookEvent[] + readonly decisionPoints: BenchRuntimeDecisionPoint[] readonly hooks: BenchRuntimeHooks } +const MAX_STRING_LENGTH = 12_000 +const MAX_CONTEXT_LENGTH = 20_000 +const MAX_EVIDENCE_DETAIL_LENGTH = 2_000 +const MAX_CANDIDATE_ACTIONS = 50 +const MAX_EVIDENCE_REFS = 50 +const MAX_METADATA_DEPTH = 4 +const MAX_METADATA_KEYS = 100 +const SENSITIVE_KEY_RE = /(?:authorization|api[_-]?key|token|secret|password|cookie|credential|bearer)/i +const SENSITIVE_VALUE_RES = [ + /\bBearer\s+[A-Za-z0-9._~+/=-]+/gi, + /\b(?:sk|gh[pousr])_[A-Za-z0-9_]{20,}\b/g, + /\b(?:sk|ghp|gho|ghu|ghs|ghr)-[A-Za-z0-9_-]{20,}\b/g, +] +const SENSITIVE_ASSIGNMENT_RE = + /\b(api[_-]?key|token|secret|password|cookie)\s*[:=]\s*["']?[^"'\s,;}]+/gi + +function sanitizeString(value: string, maxLength: number): string { + let sanitized = value + for (const pattern of SENSITIVE_VALUE_RES) { + sanitized = sanitized.replace(pattern, '[REDACTED]') + } + sanitized = sanitized.replace(SENSITIVE_ASSIGNMENT_RE, (_match, key: string) => `${key}=[REDACTED]`) + if (sanitized.length <= maxLength) return sanitized + return sanitized.slice(0, maxLength) +} + +function sanitizeMetadata(value: unknown, depth = 0): unknown { + if (value == null) return value + if (typeof value === 'string') return sanitizeString(value, MAX_STRING_LENGTH) + if (typeof value === 'number' || typeof value === 'boolean') return value + if (Array.isArray(value)) { + if (depth >= MAX_METADATA_DEPTH) return '[MaxDepth]' + return value.slice(0, MAX_METADATA_KEYS).map((item) => sanitizeMetadata(item, depth + 1)) + } + if (typeof value !== 'object') return undefined + if (depth >= MAX_METADATA_DEPTH) return '[MaxDepth]' + + const sanitized: Record<string, unknown> = {} + for (const [key, nested] of Object.entries(value).slice(0, MAX_METADATA_KEYS)) { + sanitized[key] = SENSITIVE_KEY_RE.test(key) ?
'[REDACTED]' : sanitizeMetadata(nested, depth + 1) + } + return sanitized +} + +function sanitizeMetadataRecord(metadata: Record<string, unknown> | undefined): Record<string, unknown> | undefined { + if (!metadata) return undefined + const sanitized = sanitizeMetadata(metadata) + if (!sanitized || typeof sanitized !== 'object' || Array.isArray(sanitized)) return undefined + return sanitized as Record<string, unknown> +} + +function snapshotDecisionPoint(point: BenchRuntimeDecisionPoint): BenchRuntimeDecisionPoint { + return { + id: point.id, + runId: point.runId, + scenarioId: point.scenarioId, + stepIndex: point.stepIndex, + kind: point.kind, + candidateActions: point.candidateActions.slice(0, MAX_CANDIDATE_ACTIONS).map((action) => sanitizeString(action, MAX_STRING_LENGTH)), + context: typeof point.context === 'string' ? sanitizeString(point.context, MAX_CONTEXT_LENGTH) : undefined, + evidence: point.evidence.slice(0, MAX_EVIDENCE_REFS).map((ref) => ({ + source: sanitizeString(ref.source, MAX_STRING_LENGTH), + id: sanitizeString(ref.id, MAX_STRING_LENGTH), + detail: typeof ref.detail === 'string' ?
sanitizeString(ref.detail, MAX_EVIDENCE_DETAIL_LENGTH) : undefined, + metadata: sanitizeMetadataRecord(ref.metadata), + })), + metadata: sanitizeMetadataRecord(point.metadata), + } +} + export function createRuntimeHookRecorder(): RuntimeHookRecorder { const events: BenchRuntimeHookEvent[] = [] + const decisionPoints: BenchRuntimeDecisionPoint[] = [] return { events, + decisionPoints, hooks: { onEvent: (event) => { events.push(event) }, + onDecisionPoint: (point) => { + decisionPoints.push(snapshotDecisionPoint(point)) + }, }, } } diff --git a/bench/src/worker.ts b/bench/src/worker.ts index 02c2710..8738c56 100644 --- a/bench/src/worker.ts +++ b/bench/src/worker.ts @@ -13,7 +13,11 @@ import { } from '@tangle-network/agent-runtime/loops' import { Sandbox } from '@tangle-network/sandbox' import type { BenchTask } from './benchmarks/types' -import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder' +import { + type BenchRuntimeDecisionPoint, + type BenchRuntimeHookEvent, + createRuntimeHookRecorder, +} from './runtime-hook-recorder' export interface WorkerConfig { sandboxBaseUrl: string @@ -30,6 +34,7 @@ export interface ShotResult { ok: boolean detail?: string runtimeEvents?: BenchRuntimeHookEvent[] + runtimeDecisionPoints?: BenchRuntimeDecisionPoint[] } const PATCH_PATH = '/tmp/solution.patch' @@ -121,6 +126,7 @@ export async function solveShot( ? `empty patch${turn.readError ? ` (patch read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}` : undefined, runtimeEvents: runtime.events, + runtimeDecisionPoints: runtime.decisionPoints, } } finally { if (timer) clearTimeout(timer)