diff --git a/src/belief-state/runtime-benchmark-corpus.ts b/src/belief-state/runtime-benchmark-corpus.ts
index 788603f..df59bb2 100644
--- a/src/belief-state/runtime-benchmark-corpus.ts
+++ b/src/belief-state/runtime-benchmark-corpus.ts
@@ -17,8 +17,31 @@ type RuntimeBenchmarkTrajectoryRecord = RuntimeTrajectoryRecord & {
   benchmark?: unknown
   condition?: unknown
   instanceId?: unknown
+  runtimeDecisionPoints?: unknown
 }
 
+// Size bounds applied to decision points embedded in corpus records.
+const MAX_STRING_LENGTH = 12_000
+const MAX_CONTEXT_LENGTH = 20_000
+const MAX_EVIDENCE_DETAIL_LENGTH = 2_000
+const MAX_CANDIDATE_ACTIONS = 50
+const MAX_EVIDENCE_REFS = 50
+const MAX_METADATA_DEPTH = 4
+const MAX_METADATA_KEYS = 100
+// Metadata keys whose values are replaced wholesale with '[REDACTED]'.
+const SENSITIVE_KEY_RE =
+  /(?:authorization|api[_-]?key|token|secret|password|cookie|credential|bearer)/i
+// Token-shaped values redacted wherever they occur: Bearer credentials and sk/gh*-prefixed keys.
+const SENSITIVE_VALUE_RES = [
+  /\bBearer\s+[A-Za-z0-9._~+/=-]+/gi,
+  /\b(?:sk|gh[pousr])_[A-Za-z0-9_]{20,}\b/g,
+  /\b(?:sk|ghp|gho|ghu|ghs|ghr)-[A-Za-z0-9_-]{20,}\b/g,
+]
+// `key=value` / `key: value` secrets inside free text. No leading \b, so prefixed names
+// (access_token, client_secret) match, and a quote may sit between key and separator (JSON).
+const SENSITIVE_ASSIGNMENT_RE =
+  /(api[_-]?key|token|secret|password|cookie)["']?\s*[:=]\s*["']?[^"'\s,;}]+/gi
+
 export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
   extends Omit<
     BuildRuntimeBeliefPhase0MeasurementOptions,
@@ -56,11 +79,12 @@ export function buildRuntimeBenchmarkBeliefPhase0Measurement(
     recordIdOf: runtimeBenchmarkRecordId,
     scenarioIdOf: runtimeBenchmarkScenarioId,
   })
-  const decisions = options.decisions ?? []
+  const decisions =
+    options.decisions ?? runtimeBenchmarkDecisionPoints(options.records, diagnostics)
   const labels = options.labels ?? []
   if (decisions.length === 0) {
     diagnostics.push(
-      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+      'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
     )
   }
   if (labels.length === 0 && decisions.length > 0) {
@@ -105,6 +129,194 @@ function runtimeBenchmarkScenarioId(record: RuntimeBenchmarkTrajectoryRecord): s
   return nonEmptyString(record.instanceId)
 }
 
+/**
+ * Collects decision points embedded on records under `runtimeDecisionPoints`.
+ * Non-array values and entries that fail validation are reported through
+ * `diagnostics` and skipped; records without the field are ignored silently.
+ */
+function runtimeBenchmarkDecisionPoints(
+  records: RuntimeBenchmarkTrajectoryRecord[],
+  diagnostics: string[],
+): RuntimeBeliefDecisionPoint[] {
+  const decisions: RuntimeBeliefDecisionPoint[] = []
+  for (let recordIndex = 0; recordIndex < records.length; recordIndex += 1) {
+    const record = records[recordIndex]!
+    const raw = record.runtimeDecisionPoints
+    if (raw === undefined) continue
+    const recordId = runtimeBenchmarkRecordId(record) ?? `record[${recordIndex}]`
+    if (!Array.isArray(raw)) {
+      diagnostics.push(`${recordId}: runtimeDecisionPoints is not an array`)
+      continue
+    }
+    for (let pointIndex = 0; pointIndex < raw.length; pointIndex += 1) {
+      const point = runtimeBenchmarkDecisionPoint(raw[pointIndex], {
+        diagnostics,
+        path: `${recordId}: runtimeDecisionPoints[${pointIndex}]`,
+      })
+      if (!point) {
+        diagnostics.push(
+          `${recordId}: runtimeDecisionPoints[${pointIndex}] is not a RuntimeDecisionPoint`,
+        )
+        continue
+      }
+      decisions.push(point)
+    }
+  }
+  return decisions
+}
+
+/**
+ * Validates one embedded decision point and returns a sanitized, bounded copy.
+ * Returns null unless `id`, `runId` and `kind` are non-empty strings and
+ * `stepIndex` is a non-negative integer.
+ */
+function runtimeBenchmarkDecisionPoint(
+  input: unknown,
+  context: { diagnostics: string[]; path: string },
+): RuntimeBeliefDecisionPoint | null {
+  if (!isRecord(input)) return null
+  if (typeof input.id !== 'string' || input.id.length === 0) return null
+  if (typeof input.runId !== 'string' || input.runId.length === 0) return null
+  if (
+    typeof input.stepIndex !== 'number' ||
+    !Number.isInteger(input.stepIndex) ||
+    input.stepIndex < 0
+  ) {
+    return null
+  }
+  if (typeof input.kind !== 'string' || input.kind.length === 0) return null
+  return {
+    id: sanitizeString(input.id, MAX_STRING_LENGTH),
+    runId: sanitizeString(input.runId, MAX_STRING_LENGTH),
+    scenarioId: sanitizeOptionalString(input.scenarioId, MAX_STRING_LENGTH),
+    stepIndex: input.stepIndex,
+    kind: sanitizeString(input.kind, MAX_STRING_LENGTH),
+    candidateActions: stringArray(input.candidateActions, {
+      ...context,
+      maxItems: MAX_CANDIDATE_ACTIONS,
+      label: 'candidateActions',
+    }),
+    context: sanitizeOptionalString(input.context, MAX_CONTEXT_LENGTH),
+    evidence: runtimeBenchmarkEvidence(input.evidence, context),
+    metadata: sanitizeMetadataRecord(input.metadata),
+  }
+}
+
+/**
+ * Keeps at most MAX_EVIDENCE_REFS entries (recording a diagnostic on overflow)
+ * and drops entries that lack a non-empty string `source` or `id`.
+ */
+function runtimeBenchmarkEvidence(
+  input: unknown,
+  context: { diagnostics: string[]; path: string },
+): RuntimeBeliefDecisionPoint['evidence'] {
+  if (!Array.isArray(input)) return []
+  if (input.length > MAX_EVIDENCE_REFS) {
+    context.diagnostics.push(`${context.path}: evidence truncated to ${MAX_EVIDENCE_REFS} refs`)
+  }
+  return input.slice(0, MAX_EVIDENCE_REFS).flatMap((item) => {
+    if (!isRecord(item)) return []
+    const source = sanitizeOptionalString(item.source, MAX_STRING_LENGTH)
+    const id = sanitizeOptionalString(item.id, MAX_STRING_LENGTH)
+    if (!source || !id) return []
+    return [
+      {
+        source,
+        id,
+        detail: sanitizeOptionalString(item.detail, MAX_EVIDENCE_DETAIL_LENGTH),
+        metadata: sanitizeMetadataRecord(item.metadata),
+      },
+    ]
+  })
+}
+
+/**
+ * Returns the sanitized non-empty strings among the first `maxItems` entries,
+ * or undefined when `input` is not an array or no strings remain.
+ */
+function stringArray(
+  input: unknown,
+  context: { diagnostics: string[]; path: string; maxItems: number; label: string },
+): string[] | undefined {
+  if (!Array.isArray(input)) return undefined
+  if (input.length > context.maxItems) {
+    context.diagnostics.push(`${context.path}: ${context.label} truncated to ${context.maxItems}`)
+  }
+  const values = input
+    .slice(0, context.maxItems)
+    .filter((value): value is string => typeof value === 'string' && value.length > 0)
+    .map((value) => sanitizeString(value, MAX_STRING_LENGTH))
+  return values.length > 0 ? values : undefined
+}
+
+/** Sanitizes `metadata` when it is a non-array object; any other input yields undefined. */
+function sanitizeMetadataRecord(metadata: unknown): Record<string, unknown> | undefined {
+  if (!isRecord(metadata)) return undefined
+  const sanitized = sanitizeMetadata(metadata)
+  if (!sanitized || typeof sanitized !== 'object' || Array.isArray(sanitized)) return undefined
+  return sanitized as Record<string, unknown>
+}
+
+/**
+ * Recursively copies JSON-like metadata: strings are sanitized, values under
+ * sensitive keys become '[REDACTED]', arrays and objects are capped at
+ * MAX_METADATA_KEYS entries, and nesting at MAX_METADATA_DEPTH or deeper is
+ * replaced by '[MaxDepth]'. Unsupported value types map to undefined.
+ */
+function sanitizeMetadata(value: unknown, depth = 0): unknown {
+  if (value == null) return value
+  if (typeof value === 'string') return sanitizeString(value, MAX_STRING_LENGTH)
+  if (typeof value === 'number' || typeof value === 'boolean') return value
+  if (Array.isArray(value)) {
+    if (depth >= MAX_METADATA_DEPTH) return '[MaxDepth]'
+    return value.slice(0, MAX_METADATA_KEYS).map((item) => sanitizeMetadata(item, depth + 1))
+  }
+  if (!isRecord(value)) return undefined
+  if (depth >= MAX_METADATA_DEPTH) return '[MaxDepth]'
+
+  const sanitized: Record<string, unknown> = {}
+  for (const [key, nested] of Object.entries(value).slice(0, MAX_METADATA_KEYS)) {
+    // defineProperty rather than `sanitized[key] = …`: an own "__proto__" key (as
+    // produced by JSON.parse) must stay data instead of replacing the prototype.
+    Object.defineProperty(sanitized, key, {
+      value: SENSITIVE_KEY_RE.test(key) ? '[REDACTED]' : sanitizeMetadata(nested, depth + 1),
+      enumerable: true,
+      writable: true,
+      configurable: true,
+    })
+  }
+  return sanitized
+}
+
+/** Sanitizes `value` when it is a non-empty string; otherwise returns undefined. */
+function sanitizeOptionalString(value: unknown, maxLength: number): string | undefined {
+  return typeof value === 'string' && value.length > 0
+    ? sanitizeString(value, maxLength)
+    : undefined
+}
+
+/**
+ * Redacts token-shaped values and `key=value` style secrets, then truncates the
+ * result to `maxLength`. Redaction runs before truncation.
+ */
+function sanitizeString(value: string, maxLength: number): string {
+  let sanitized = value
+  for (const pattern of SENSITIVE_VALUE_RES) {
+    sanitized = sanitized.replace(pattern, '[REDACTED]')
+  }
+  sanitized = sanitized.replace(
+    SENSITIVE_ASSIGNMENT_RE,
+    (_match, key: string) => `${key}=[REDACTED]`,
+  )
+  if (sanitized.length <= maxLength) return sanitized
+  return sanitized.slice(0, maxLength)
+}
+
+/** Type guard for non-null, non-array objects. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value)
+}
+
 function nonEmptyString(value: unknown): string | undefined {
   return typeof value === 'string' && value.length > 0 ? value : undefined
 }
diff --git a/tests/belief-state/runtime-benchmark-corpus.test.ts b/tests/belief-state/runtime-benchmark-corpus.test.ts
index caee096..30fef43 100644
--- a/tests/belief-state/runtime-benchmark-corpus.test.ts
+++ b/tests/belief-state/runtime-benchmark-corpus.test.ts
@@ -45,11 +45,11 @@ describe('runtime benchmark corpus belief-state projection', () => {
     expect(report.measurement.points).toEqual([])
     expect(report.measurement.summary.packetStatus).toBe('blocked')
     expect(report.diagnostics).toContain(
-      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+      'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
     )
   })
 
-  it('feeds explicit runtime decisions and labels into the existing Phase 0 measurement', () => {
+  it('feeds record runtime decisions and labels into the existing Phase 0 measurement', () => {
     const decisions = Array.from({ length: 12 }, (_, index) => decision(index))
     const labels = decisions.map((item, index) => ({
       decisionId: item.id,
@@ -67,9 +67,9 @@ describe('runtime benchmark corpus belief-state projection', () => {
           model: 'gpt-5',
           splitTag: 'holdout',
           runtimeEvents: [...runtimeEvents('commit0:task-1:0', 'task-1', 0)],
+          runtimeDecisionPoints: decisions,
         },
       ],
-      decisions,
       labels,
       targetId: 'failure-recovery',
       minN: 12,
@@ -128,9 +128,104 @@ describe('runtime benchmark corpus belief-state projection', () => {
     expect(report.diagnostics).toEqual([
       'swe-bench:case-empty:blind@1: no runtimeEvents; no runtime run join can be extracted',
       'swe-bench:case-bad:blind@1: runtimeEvents[0] is not a RuntimeHookEvent',
-      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+      'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
     ])
   })
+
+  it('sanitizes and bounds embedded runtime decisions from corpus records', () => {
+    const report = buildRuntimeBenchmarkBeliefPhase0Measurement({
+      records: [
+        {
+          benchmark: 'commit0',
+          instanceId: 'task-1',
+          condition: 'random@2',
+          model: 'gpt-5',
+          runtimeEvents: runtimeEvents('commit0:task-1:0', 'task-1', 0),
+          runtimeDecisionPoints: [
+            {
+              id: 'commit0:task-1:0:agent.turn:0:failure-recovery',
+              runId: 'commit0:task-1:0',
+              scenarioId: 'task-1',
+              stepIndex: 0,
+              kind: 'retry',
+              candidateActions: Array.from({ length: 75 }, (_, index) => `candidate-${index}`),
+              context: `Bearer abc.def.ghi ${'ctx'.repeat(10_000)}`,
+              evidence: Array.from({ length: 75 }, (_, index) => ({
+                source: 'runtime_event',
+                id: `event-${index}`,
+                detail: `token=supersecret ${'detail'.repeat(1_000)}`,
+                metadata: { authorization: 'Bearer should-not-survive' },
+              })),
+              metadata: { token: 'should-not-survive', safe: 'kept' },
+            },
+            {
+              id: 'commit0:task-1:0:agent.turn:-1:failure-recovery',
+              runId: 'commit0:task-1:0',
+              stepIndex: -1,
+              kind: 'retry',
+              candidateActions: ['retry'],
+            },
+          ],
+        },
+      ],
+      targetId: 'failure-recovery',
+      minN: 1,
+      claimScope: 'selective',
+    })
+
+    expect(report.decisions).toHaveLength(1)
+    expect(report.decisions[0]?.candidateActions).toHaveLength(50)
+    expect(report.decisions[0]?.context).toHaveLength(20_000)
+    expect(report.decisions[0]?.context?.includes('abc.def.ghi')).toBe(false)
+    expect(report.decisions[0]?.evidence).toHaveLength(50)
+    expect(report.decisions[0]?.evidence?.[0]?.detail).toHaveLength(2_000)
+    expect(report.decisions[0]?.evidence?.[0]?.detail?.includes('supersecret')).toBe(false)
+    expect(report.decisions[0]?.metadata).toMatchObject({
+      token: '[REDACTED]',
+      safe: 'kept',
+    })
+    expect(report.decisions[0]?.evidence?.[0]?.metadata?.authorization).toBe('[REDACTED]')
+    expect(report.diagnostics).toEqual(
+      expect.arrayContaining([
+        'commit0:task-1:random@2: runtimeDecisionPoints[0]: candidateActions truncated to 50',
+        'commit0:task-1:random@2: runtimeDecisionPoints[0]: evidence truncated to 50 refs',
+        'commit0:task-1:random@2: runtimeDecisionPoints[1] is not a RuntimeDecisionPoint',
+        'no decision labels supplied; observed action/outcome joins will be incomplete',
+      ]),
+    )
+  })
+
+  it('redacts prefixed or quoted secrets and keeps __proto__ metadata inert', () => {
+    const report = buildRuntimeBenchmarkBeliefPhase0Measurement({
+      records: [
+        {
+          benchmark: 'commit0',
+          instanceId: 'task-1',
+          condition: 'random@2',
+          model: 'gpt-5',
+          runtimeEvents: runtimeEvents('commit0:task-1:0', 'task-1', 0),
+          runtimeDecisionPoints: [
+            {
+              id: 'commit0:task-1:0:agent.turn:0:failure-recovery',
+              runId: 'commit0:task-1:0',
+              stepIndex: 0,
+              kind: 'retry',
+              context: 'access_token=abc123 {"password": "hunter2"}',
+              metadata: JSON.parse('{"__proto__": {"polluted": true}, "safe": "kept"}'),
+            },
+          ],
+        },
+      ],
+      targetId: 'failure-recovery',
+      minN: 1,
+      claimScope: 'selective',
+    })
+
+    const metadata = report.decisions[0]?.metadata
+    expect(report.decisions[0]?.context).toBe('access_token=[REDACTED] {"password=[REDACTED]"}')
+    expect(Object.getPrototypeOf(metadata)).toBe(Object.prototype)
+    expect(Object.keys(metadata ?? {})).toEqual(['__proto__', 'safe'])
+  })
 })
 
 function runtimeEvents(runId: string, scenarioId: string, stepIndex: number) {