Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 171 additions & 2 deletions src/belief-state/runtime-benchmark-corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,26 @@ type RuntimeBenchmarkTrajectoryRecord = RuntimeTrajectoryRecord & {
benchmark?: unknown
condition?: unknown
instanceId?: unknown
runtimeDecisionPoints?: unknown
}

// Bounds and redaction patterns used when normalising decision points that
// are embedded on corpus records.

// Maximum length of most sanitized strings (ids, kinds, candidate actions,
// evidence source/id, string values inside metadata).
const MAX_STRING_LENGTH = 12_000
// Maximum length of a decision point's `context` string.
const MAX_CONTEXT_LENGTH = 20_000
// Maximum length of an evidence ref's `detail` string.
const MAX_EVIDENCE_DETAIL_LENGTH = 2_000
// Only the first N `candidateActions` entries of a decision point are considered.
const MAX_CANDIDATE_ACTIONS = 50
// Only the first N `evidence` entries of a decision point are considered.
const MAX_EVIDENCE_REFS = 50
// Metadata objects/arrays at this nesting depth are replaced by '[MaxDepth]'.
const MAX_METADATA_DEPTH = 4
// Maximum number of object keys / array items kept per metadata level.
const MAX_METADATA_KEYS = 100
// Metadata keys containing any of these words (case-insensitive) get their
// value replaced by '[REDACTED]'.
const SENSITIVE_KEY_RE =
/(?:authorization|api[_-]?key|token|secret|password|cookie|credential|bearer)/i
// Substrings replaced by '[REDACTED]' inside any sanitized string:
// "Bearer <value>", and `sk`/`gh?`-prefixed strings followed by `_` or `-` and
// at least 20 further characters (presumably API-key / GitHub-token shapes).
const SENSITIVE_VALUE_RES = [
/\bBearer\s+[A-Za-z0-9._~+/=-]+/gi,
/\b(?:sk|gh[pousr])_[A-Za-z0-9_]{20,}\b/g,
/\b(?:sk|ghp|gho|ghu|ghs|ghr)-[A-Za-z0-9_-]{20,}\b/g,
]
// Inline `key=value` / `key: value` assignments whose key looks sensitive.
// The key is captured (group 1) so the replacement can keep it.
const SENSITIVE_ASSIGNMENT_RE =
/\b(api[_-]?key|token|secret|password|cookie)\s*[:=]\s*["']?[^"'\s,;}]+/gi

export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
extends Omit<
BuildRuntimeBeliefPhase0MeasurementOptions,
Expand Down Expand Up @@ -56,11 +74,12 @@ export function buildRuntimeBenchmarkBeliefPhase0Measurement(
recordIdOf: runtimeBenchmarkRecordId,
scenarioIdOf: runtimeBenchmarkScenarioId,
})
const decisions = options.decisions ?? []
const decisions =
options.decisions ?? runtimeBenchmarkDecisionPoints(options.records, diagnostics)
const labels = options.labels ?? []
if (decisions.length === 0) {
diagnostics.push(
'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
)
}
if (labels.length === 0 && decisions.length > 0) {
Expand Down Expand Up @@ -105,6 +124,156 @@ function runtimeBenchmarkScenarioId(record: RuntimeBenchmarkTrajectoryRecord): s
return nonEmptyString(record.instanceId)
}

/**
 * Gathers the decision points embedded on corpus records under
 * `runtimeDecisionPoints`.
 *
 * Records without the field are skipped silently. A non-array field, or an
 * entry that fails validation, is reported through `diagnostics` and skipped.
 *
 * @param records Corpus records to scan, in order.
 * @param diagnostics Mutable list that receives one message per problem found.
 * @returns The validated and sanitized decision points, in encounter order.
 */
function runtimeBenchmarkDecisionPoints(
  records: RuntimeBenchmarkTrajectoryRecord[],
  diagnostics: string[],
): RuntimeBeliefDecisionPoint[] {
  const collected: RuntimeBeliefDecisionPoint[] = []
  for (const [recordIndex, record] of records.entries()) {
    const embedded = record.runtimeDecisionPoints
    if (embedded === undefined) continue

    // Fall back to a positional label when the record has no usable id.
    const owner = runtimeBenchmarkRecordId(record) ?? `record[${recordIndex}]`
    if (!Array.isArray(embedded)) {
      diagnostics.push(`${owner}: runtimeDecisionPoints is not an array`)
      continue
    }

    for (const [pointIndex, candidate] of embedded.entries()) {
      const path = `${owner}: runtimeDecisionPoints[${pointIndex}]`
      const parsed = runtimeBenchmarkDecisionPoint(candidate, { diagnostics, path })
      if (parsed) {
        collected.push(parsed)
      } else {
        diagnostics.push(`${path} is not a RuntimeDecisionPoint`)
      }
    }
  }
  return collected
}

/**
 * Validates one untyped value as a decision point and returns a sanitized,
 * size-bounded copy.
 *
 * @param input Candidate value taken from a corpus record.
 * @param context Diagnostics sink and the path used to prefix messages.
 * @returns The sanitized decision point, or `null` when `input` is not a plain
 *   object, when `id`, `runId` or `kind` is not a non-empty string, or when
 *   `stepIndex` is not a non-negative integer.
 */
function runtimeBenchmarkDecisionPoint(
  input: unknown,
  context: { diagnostics: string[]; path: string },
): RuntimeBeliefDecisionPoint | null {
  if (!isRecord(input)) return null

  const { id, runId, stepIndex, kind } = input
  if (typeof id !== 'string' || id === '') return null
  if (typeof runId !== 'string' || runId === '') return null
  if (typeof stepIndex !== 'number') return null
  if (!Number.isInteger(stepIndex) || stepIndex < 0) return null
  if (typeof kind !== 'string' || kind === '') return null

  // candidateActions is processed before evidence so that truncation
  // diagnostics are emitted in that order.
  const candidateActions = stringArray(input.candidateActions, {
    ...context,
    maxItems: MAX_CANDIDATE_ACTIONS,
    label: 'candidateActions',
  })
  const evidence = runtimeBenchmarkEvidence(input.evidence, context)

  return {
    id: sanitizeString(id, MAX_STRING_LENGTH),
    runId: sanitizeString(runId, MAX_STRING_LENGTH),
    scenarioId: sanitizeOptionalString(input.scenarioId, MAX_STRING_LENGTH),
    stepIndex,
    kind: sanitizeString(kind, MAX_STRING_LENGTH),
    candidateActions,
    context: sanitizeOptionalString(input.context, MAX_CONTEXT_LENGTH),
    evidence,
    metadata: sanitizeMetadataRecord(input.metadata),
  }
}

/**
 * Converts an untyped `evidence` value into a bounded list of sanitized
 * evidence refs.
 *
 * Only the first `MAX_EVIDENCE_REFS` entries are inspected; a diagnostic is
 * recorded when the input is longer. Entries that are not plain objects, or
 * that lack a non-empty string `source` or `id`, are dropped.
 *
 * @param input Raw `evidence` value from a decision point.
 * @param context Diagnostics sink and the path used to prefix messages.
 * @returns The sanitized refs; an empty array when `input` is not an array.
 */
function runtimeBenchmarkEvidence(
  input: unknown,
  context: { diagnostics: string[]; path: string },
): RuntimeBeliefDecisionPoint['evidence'] {
  if (!Array.isArray(input)) return []

  const overLimit = input.length > MAX_EVIDENCE_REFS
  if (overLimit) {
    context.diagnostics.push(`${context.path}: evidence truncated to ${MAX_EVIDENCE_REFS} refs`)
  }

  // Builds a single ref, or undefined when the entry is unusable.
  const toRef = (entry: unknown) => {
    if (!isRecord(entry)) return undefined
    const source = sanitizeOptionalString(entry.source, MAX_STRING_LENGTH)
    const id = sanitizeOptionalString(entry.id, MAX_STRING_LENGTH)
    if (source && id) {
      return {
        source,
        id,
        detail: sanitizeOptionalString(entry.detail, MAX_EVIDENCE_DETAIL_LENGTH),
        metadata: sanitizeMetadataRecord(entry.metadata),
      }
    }
    return undefined
  }

  const bounded = input.slice(0, MAX_EVIDENCE_REFS)
  return bounded.flatMap((entry) => {
    const ref = toRef(entry)
    return ref === undefined ? [] : [ref]
  })
}

/**
 * Extracts a bounded list of sanitized, non-empty strings from an untyped
 * value.
 *
 * Only the first `context.maxItems` entries are inspected; a diagnostic is
 * recorded when the input is longer. Non-string and empty-string entries are
 * dropped.
 *
 * @param input Raw value expected to be an array of strings.
 * @param context Diagnostics sink, message path, item limit and field label.
 * @returns The sanitized strings, or `undefined` when `input` is not an array
 *   or no usable string remains.
 */
function stringArray(
  input: unknown,
  context: { diagnostics: string[]; path: string; maxItems: number; label: string },
): string[] | undefined {
  if (!Array.isArray(input)) return undefined

  const { diagnostics, path, maxItems, label } = context
  if (input.length > maxItems) {
    diagnostics.push(`${path}: ${label} truncated to ${maxItems}`)
  }

  const kept: string[] = []
  for (const entry of input.slice(0, maxItems)) {
    if (typeof entry !== 'string' || entry.length === 0) continue
    kept.push(sanitizeString(entry, MAX_STRING_LENGTH))
  }
  return kept.length === 0 ? undefined : kept
}

/**
 * Sanitizes a metadata value that is expected to be a plain object.
 *
 * @param metadata Raw metadata value.
 * @returns The sanitized object, or `undefined` when the input (or the
 *   sanitized result) is not a plain, non-array object.
 */
function sanitizeMetadataRecord(metadata: unknown): Record<string, unknown> | undefined {
  if (!isRecord(metadata)) return undefined
  const cleaned = sanitizeMetadata(metadata)
  return isRecord(cleaned) ? cleaned : undefined
}

/**
 * Recursively sanitizes an arbitrary metadata value.
 *
 * - `null` / `undefined`, numbers and booleans pass through unchanged.
 * - Strings are redacted and truncated via `sanitizeString`.
 * - Arrays and plain objects are copied, keeping at most `MAX_METADATA_KEYS`
 *   items/keys per level; once `depth` reaches `MAX_METADATA_DEPTH` they are
 *   replaced by the string `'[MaxDepth]'`.
 * - Values under keys matching `SENSITIVE_KEY_RE` become `'[REDACTED]'`.
 * - Anything else (functions, symbols, bigints, ...) becomes `undefined`.
 *
 * @param value Value to sanitize.
 * @param depth Current nesting depth; callers normally omit it.
 * @returns The sanitized copy.
 */
function sanitizeMetadata(value: unknown, depth = 0): unknown {
  if (value == null) return value
  if (typeof value === 'string') return sanitizeString(value, MAX_STRING_LENGTH)
  if (typeof value === 'number' || typeof value === 'boolean') return value
  if (Array.isArray(value)) {
    if (depth >= MAX_METADATA_DEPTH) return '[MaxDepth]'
    return value.slice(0, MAX_METADATA_KEYS).map((item) => sanitizeMetadata(item, depth + 1))
  }
  if (!isRecord(value)) return undefined
  if (depth >= MAX_METADATA_DEPTH) return '[MaxDepth]'

  // Build the copy with Object.fromEntries rather than `copy[key] = ...`:
  // plain assignment to an own "__proto__" key (which JSON.parse can produce)
  // would invoke the Object.prototype.__proto__ setter and swap the copy's
  // prototype for input-controlled data. fromEntries defines every key,
  // including "__proto__", as an ordinary own data property.
  const entries = Object.entries(value)
    .slice(0, MAX_METADATA_KEYS)
    .map(([key, nested]): [string, unknown] => [
      key,
      SENSITIVE_KEY_RE.test(key) ? '[REDACTED]' : sanitizeMetadata(nested, depth + 1),
    ])
  return Object.fromEntries(entries)
}

/**
 * Sanitizes a value only when it is a non-empty string.
 *
 * @param value Raw value of unknown type.
 * @param maxLength Maximum length passed on to `sanitizeString`.
 * @returns The sanitized string, or `undefined` for non-strings and `''`.
 */
function sanitizeOptionalString(value: unknown, maxLength: number): string | undefined {
  if (typeof value !== 'string' || value.length === 0) return undefined
  return sanitizeString(value, maxLength)
}

/**
 * Redacts secret-looking substrings and bounds the length of a string.
 *
 * Redaction runs on the full input before truncation so that a secret lying
 * across the cut-off point is still matched as a whole.
 *
 * @param value String to sanitize.
 * @param maxLength Maximum number of UTF-16 code units to keep.
 * @returns The redacted string, at most `maxLength` code units long. When the
 *   cut would fall in the middle of a surrogate pair the result is one code
 *   unit shorter, so it never ends in a lone half of that pair.
 */
function sanitizeString(value: string, maxLength: number): string {
  let sanitized = value
  for (const pattern of SENSITIVE_VALUE_RES) {
    sanitized = sanitized.replace(pattern, '[REDACTED]')
  }
  // Keep the key so the reader can still see which setting was present.
  sanitized = sanitized.replace(
    SENSITIVE_ASSIGNMENT_RE,
    (_match, key: string) => `${key}=[REDACTED]`,
  )
  if (sanitized.length <= maxLength) return sanitized

  // Avoid splitting a surrogate pair: if the last kept unit is a high
  // surrogate and the first dropped unit is its low surrogate, drop both.
  let end = maxLength
  if (end > 0) {
    const lastKept = sanitized.charCodeAt(end - 1)
    const firstDropped = sanitized.charCodeAt(end)
    const splitsPair =
      lastKept >= 0xd800 &&
      lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 &&
      firstDropped <= 0xdfff
    if (splitsPair) end -= 1
  }
  return sanitized.slice(0, end)
}

/**
 * Type guard for non-null, non-array objects.
 *
 * @param value Value to test.
 * @returns `true` when `value` has type `object`, is not `null` and is not an
 *   array.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) return false
  return typeof value === 'object'
}

/**
 * Returns the value when it is a string with at least one character.
 *
 * @param value Value of unknown type.
 * @returns `value` unchanged if it is a non-empty string, else `undefined`.
 */
function nonEmptyString(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined
  return value.length === 0 ? undefined : value
}
71 changes: 67 additions & 4 deletions tests/belief-state/runtime-benchmark-corpus.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ describe('runtime benchmark corpus belief-state projection', () => {
expect(report.measurement.points).toEqual([])
expect(report.measurement.summary.packetStatus).toBe('blocked')
expect(report.diagnostics).toContain(
'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
)
})

it('feeds explicit runtime decisions and labels into the existing Phase 0 measurement', () => {
it('feeds record runtime decisions and labels into the existing Phase 0 measurement', () => {
const decisions = Array.from({ length: 12 }, (_, index) => decision(index))
const labels = decisions.map((item, index) => ({
decisionId: item.id,
Expand All @@ -67,9 +67,9 @@ describe('runtime benchmark corpus belief-state projection', () => {
model: 'gpt-5',
splitTag: 'holdout',
runtimeEvents: [...runtimeEvents('commit0:task-1:0', 'task-1', 0)],
runtimeDecisionPoints: decisions,
},
],
decisions,
labels,
targetId: 'failure-recovery',
minN: 12,
Expand Down Expand Up @@ -128,9 +128,72 @@ describe('runtime benchmark corpus belief-state projection', () => {
expect(report.diagnostics).toEqual([
'swe-bench:case-empty:blind@1: no runtimeEvents; no runtime run join can be extracted',
'swe-bench:case-bad:blind@1: runtimeEvents[0] is not a RuntimeHookEvent',
'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
])
})

// Feeds one record carrying two embedded decision points: an oversized one
// containing secret-looking text, and an invalid one (negative stepIndex).
// Checks that only the first survives, bounded and redacted, and that the
// expected diagnostics are reported.
it('sanitizes and bounds embedded runtime decisions from corpus records', () => {
const report = buildRuntimeBenchmarkBeliefPhase0Measurement({
records: [
{
benchmark: 'commit0',
instanceId: 'task-1',
condition: 'random@2',
model: 'gpt-5',
runtimeEvents: runtimeEvents('commit0:task-1:0', 'task-1', 0),
runtimeDecisionPoints: [
{
id: 'commit0:task-1:0:agent.turn:0:failure-recovery',
runId: 'commit0:task-1:0',
scenarioId: 'task-1',
stepIndex: 0,
kind: 'retry',
// 75 entries: more than the 50 asserted below.
candidateActions: Array.from({ length: 75 }, (_, index) => `candidate-${index}`),
// A bearer token followed by 30,000 characters of filler.
context: `Bearer abc.def.ghi ${'ctx'.repeat(10_000)}`,
// 75 refs, each with a secret assignment, 6,000 filler characters and a
// sensitive metadata key.
evidence: Array.from({ length: 75 }, (_, index) => ({
source: 'runtime_event',
id: `event-${index}`,
detail: `token=supersecret ${'detail'.repeat(1_000)}`,
metadata: { authorization: 'Bearer should-not-survive' },
})),
metadata: { token: 'should-not-survive', safe: 'kept' },
},
// Invalid: stepIndex is negative, so this point must be rejected.
{
id: 'commit0:task-1:0:agent.turn:-1:failure-recovery',
runId: 'commit0:task-1:0',
stepIndex: -1,
kind: 'retry',
candidateActions: ['retry'],
},
],
},
],
targetId: 'failure-recovery',
minN: 1,
claimScope: 'selective',
})

// Only the valid point is kept, with list sizes and string lengths capped.
expect(report.decisions).toHaveLength(1)
expect(report.decisions[0]?.candidateActions).toHaveLength(50)
expect(report.decisions[0]?.context).toHaveLength(20_000)
expect(report.decisions[0]?.context?.includes('abc.def.ghi')).toBe(false)
expect(report.decisions[0]?.evidence).toHaveLength(50)
expect(report.decisions[0]?.evidence?.[0]?.detail).toHaveLength(2_000)
expect(report.decisions[0]?.evidence?.[0]?.detail?.includes('supersecret')).toBe(false)
// Sensitive metadata keys are redacted; other keys are preserved.
expect(report.decisions[0]?.metadata).toMatchObject({
token: '[REDACTED]',
safe: 'kept',
})
expect(report.decisions[0]?.evidence?.[0]?.metadata?.authorization).toBe('[REDACTED]')
// Truncation and rejection are each surfaced as a diagnostic.
expect(report.diagnostics).toEqual(
expect.arrayContaining([
'commit0:task-1:random@2: runtimeDecisionPoints[0]: candidateActions truncated to 50',
'commit0:task-1:random@2: runtimeDecisionPoints[0]: evidence truncated to 50 refs',
'commit0:task-1:random@2: runtimeDecisionPoints[1] is not a RuntimeDecisionPoint',
'no decision labels supplied; observed action/outcome joins will be incomplete',
]),
)
})
})

function runtimeEvents(runId: string, scenarioId: string, stepIndex: number) {
Expand Down
Loading