tangle-network · drewstone · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026
diff --git a/src/belief-state/runtime-benchmark-corpus.ts b/src/belief-state/runtime-benchmark-corpus.ts
@@ -17,8 +17,26 @@ type RuntimeBenchmarkTrajectoryRecord = RuntimeTrajectoryRecord & {
   benchmark?: unknown
   condition?: unknown
   instanceId?: unknown
+  runtimeDecisionPoints?: unknown
 }
 
+const MAX_STRING_LENGTH = 12_000 // cap used for ids, kinds, candidate actions and metadata strings
+const MAX_CONTEXT_LENGTH = 20_000 // cap for a decision point's `context` string
+const MAX_EVIDENCE_DETAIL_LENGTH = 2_000 // cap for an evidence ref's `detail` string
+const MAX_CANDIDATE_ACTIONS = 50 // candidateActions entries inspected per decision point
+const MAX_EVIDENCE_REFS = 50 // evidence refs inspected per decision point
+const MAX_METADATA_DEPTH = 4 // nesting depth at which sanitizeMetadata emits '[MaxDepth]'
+const MAX_METADATA_KEYS = 100 // object entries (and array items) kept per metadata level
+const SENSITIVE_KEY_RE = // metadata keys matching this get '[REDACTED]' as their value
+  /(?:authorization|api[_-]?key|token|secret|password|cookie|credential|bearer)/i
+const SENSITIVE_VALUE_RES = [ // substrings that sanitizeString replaces with '[REDACTED]'
+  /\bBearer\s+[A-Za-z0-9._~+/=-]+/gi, // "Bearer <credential>"
+  /\b(?:sk|gh[pousr])_[A-Za-z0-9_]{20,}\b/g, // sk_ / ghp_ / gho_ / ghu_ / ghs_ / ghr_ + 20 or more chars
+  /\b(?:sk|ghp|gho|ghu|ghs|ghr)-[A-Za-z0-9_-]{20,}\b/g, // same prefixes with a '-' separator
+]
+const SENSITIVE_ASSIGNMENT_RE = // "key=value" / "key: value" pairs, rewritten to key=[REDACTED]
+  /\b(api[_-]?key|token|secret|password|cookie)\s*[:=]\s*["']?[^"'\s,;}]+/gi
+
 export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
   extends Omit<
     BuildRuntimeBeliefPhase0MeasurementOptions,
@@ -56,11 +74,12 @@ export function buildRuntimeBenchmarkBeliefPhase0Measurement(
     recordIdOf: runtimeBenchmarkRecordId,
     scenarioIdOf: runtimeBenchmarkScenarioId,
   })
-  const decisions = options.decisions ?? []
+  const decisions =
+    options.decisions ?? runtimeBenchmarkDecisionPoints(options.records, diagnostics)
   const labels = options.labels ?? []
   if (decisions.length === 0) {
     diagnostics.push(
-      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+      'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
     )
   }
   if (labels.length === 0 && decisions.length > 0) {
@@ -105,6 +124,156 @@ function runtimeBenchmarkScenarioId(record: RuntimeBenchmarkTrajectoryRecord): s
   return nonEmptyString(record.instanceId)
 }
 
+/**
+ * Collects the decision points embedded on corpus records under `runtimeDecisionPoints`.
+ * Malformed fields and entries are skipped and described in `diagnostics`.
+ */
+function runtimeBenchmarkDecisionPoints(
+  records: RuntimeBenchmarkTrajectoryRecord[],
+  diagnostics: string[],
+): RuntimeBeliefDecisionPoint[] {
+  const collected: RuntimeBeliefDecisionPoint[] = []
+  for (const [recordIndex, record] of records.entries()) {
+    const embedded = record.runtimeDecisionPoints
+    // A record without the field is skipped silently; only a present-but-invalid field is reported.
+    if (embedded === undefined) continue
+    const recordId = runtimeBenchmarkRecordId(record) ?? `record[${recordIndex}]`
+    if (!Array.isArray(embedded)) {
+      diagnostics.push(`${recordId}: runtimeDecisionPoints is not an array`)
+      continue
+    }
+    for (const [pointIndex, candidate] of embedded.entries()) {
+      const path = `${recordId}: runtimeDecisionPoints[${pointIndex}]`
+      const parsed = runtimeBenchmarkDecisionPoint(candidate, { diagnostics, path })
+      if (parsed) {
+        collected.push(parsed)
+      } else {
+        diagnostics.push(`${path} is not a RuntimeDecisionPoint`)
+      }
+    }
+  }
+  return collected
+}
+
+/**
+ * Validates one embedded decision point and returns a sanitized copy, or null when
+ * `id`, `runId`, `kind` (non-empty strings) or `stepIndex` (integer >= 0) is invalid.
+ */
+function runtimeBenchmarkDecisionPoint(
+  input: unknown,
+  context: { diagnostics: string[]; path: string },
+): RuntimeBeliefDecisionPoint | null {
+  if (!isRecord(input)) return null
+  const { id, runId, kind, stepIndex } = input
+  if (typeof id !== 'string' || typeof runId !== 'string' || typeof kind !== 'string') return null
+  if (id === '' || runId === '' || kind === '') return null
+  if (typeof stepIndex !== 'number' || !Number.isInteger(stepIndex) || stepIndex < 0) return null
+  // Required fields passed; optional fields degrade to undefined / [] instead of rejecting.
+  const candidateActions = stringArray(input.candidateActions, {
+    ...context,
+    maxItems: MAX_CANDIDATE_ACTIONS,
+    label: 'candidateActions',
+  })
+  return {
+    id: sanitizeString(id, MAX_STRING_LENGTH),
+    runId: sanitizeString(runId, MAX_STRING_LENGTH),
+    scenarioId: sanitizeOptionalString(input.scenarioId, MAX_STRING_LENGTH),
+    stepIndex,
+    kind: sanitizeString(kind, MAX_STRING_LENGTH),
+    candidateActions,
+    context: sanitizeOptionalString(input.context, MAX_CONTEXT_LENGTH),
+    evidence: runtimeBenchmarkEvidence(input.evidence, context),
+    metadata: sanitizeMetadataRecord(input.metadata),
+  }
+}
+
+/** Sanitizes evidence refs (capped); entries without a usable `source` and `id` are dropped. */
+function runtimeBenchmarkEvidence(
+  input: unknown,
+  context: { diagnostics: string[]; path: string },
+): RuntimeBeliefDecisionPoint['evidence'] {
+  if (!Array.isArray(input)) return []
+  if (input.length > MAX_EVIDENCE_REFS) {
+    context.diagnostics.push(`${context.path}: evidence truncated to ${MAX_EVIDENCE_REFS} refs`)
+  }
+  return input.slice(0, MAX_EVIDENCE_REFS).flatMap((entry) => {
+    if (!isRecord(entry)) return []
+    const source = sanitizeOptionalString(entry.source, MAX_STRING_LENGTH)
+    const id = sanitizeOptionalString(entry.id, MAX_STRING_LENGTH)
+    if (!(source && id)) return []
+    const ref = {
+      source,
+      id,
+      detail: sanitizeOptionalString(entry.detail, MAX_EVIDENCE_DETAIL_LENGTH),
+      metadata: sanitizeMetadataRecord(entry.metadata),
+    }
+    return [ref]
+  })
+}
+
+/** Returns the non-empty strings among the first `maxItems` entries, sanitized, or undefined. */
+function stringArray(
+  input: unknown,
+  context: { diagnostics: string[]; path: string; maxItems: number; label: string },
+): string[] | undefined {
+  if (!Array.isArray(input)) return undefined
+  const { diagnostics, path, label, maxItems } = context
+  if (input.length > maxItems) diagnostics.push(`${path}: ${label} truncated to ${maxItems}`)
+  const out: string[] = []
+  for (const item of input.slice(0, maxItems)) {
+    if (typeof item === 'string' && item !== '') out.push(sanitizeString(item, MAX_STRING_LENGTH))
+  }
+  return out.length > 0 ? out : undefined
+}
+
+/** Sanitizes metadata; returns undefined unless input and result are both non-array objects. */
+function sanitizeMetadataRecord(metadata: unknown): Record<string, unknown> | undefined {
+  const sanitized = isRecord(metadata) ? sanitizeMetadata(metadata) : undefined
+  // sanitizeMetadata is typed `unknown`, so the result is re-checked before it is returned.
+  return isRecord(sanitized) ? sanitized : undefined
+}
+
+/** Recursively copies untrusted metadata, redacting secrets and capping depth and width. */
+function sanitizeMetadata(value: unknown, depth = 0): unknown {
+  if (value == null) return value
+  if (typeof value === 'string') return sanitizeString(value, MAX_STRING_LENGTH)
+  if (typeof value === 'number' || typeof value === 'boolean') return value
+  if (!Array.isArray(value) && !isRecord(value)) return undefined // bigint, symbol, function
+  if (depth >= MAX_METADATA_DEPTH) return '[MaxDepth]'
+  if (Array.isArray(value)) {
+    return value.slice(0, MAX_METADATA_KEYS).map((item) => sanitizeMetadata(item, depth + 1))
+  }
+  // Build the copy with Object.fromEntries: it defines own data properties, so an untrusted
+  // "__proto__" key stays plain data, whereas an `obj[key] = ...` write would replace the
+  // copy's prototype (or silently drop the key when the value is a primitive).
+  const redact = ([key, nested]: [string, unknown]): [string, unknown] =>
+    [key, SENSITIVE_KEY_RE.test(key) ? '[REDACTED]' : sanitizeMetadata(nested, depth + 1)]
+  return Object.fromEntries(Object.entries(value).slice(0, MAX_METADATA_KEYS).map(redact))
+}
+
+/** Sanitizes `value` when it is a non-empty string; any other input yields undefined. */
+function sanitizeOptionalString(value: unknown, maxLength: number): string | undefined {
+  if (typeof value !== 'string' || value === '') return undefined
+  return sanitizeString(value, maxLength)
+}
+
+/** Redacts secret-looking substrings first, then truncates the result to `maxLength`. */
+function sanitizeString(value: string, maxLength: number): string {
+  const withoutTokens = SENSITIVE_VALUE_RES.reduce(
+    (text, pattern) => text.replace(pattern, '[REDACTED]'),
+    value,
+  )
+  const redacted = withoutTokens.replace(
+    SENSITIVE_ASSIGNMENT_RE,
+    (_match, key: string) => `${key}=[REDACTED]`,
+  )
+  return redacted.slice(0, maxLength)
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === 'object' && !Array.isArray(value) // non-null, non-array
+}
+
 function nonEmptyString(value: unknown): string | undefined {
   return typeof value === 'string' && value.length > 0 ? value : undefined
 }
diff --git a/tests/belief-state/runtime-benchmark-corpus.test.ts b/tests/belief-state/runtime-benchmark-corpus.test.ts
@@ -45,11 +45,11 @@ describe('runtime benchmark corpus belief-state projection', () => {
     expect(report.measurement.points).toEqual([])
     expect(report.measurement.summary.packetStatus).toBe('blocked')
     expect(report.diagnostics).toContain(
-      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+      'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
     )
   })
 
-  it('feeds explicit runtime decisions and labels into the existing Phase 0 measurement', () => {
+  it('feeds record runtime decisions and labels into the existing Phase 0 measurement', () => {
     const decisions = Array.from({ length: 12 }, (_, index) => decision(index))
     const labels = decisions.map((item, index) => ({
       decisionId: item.id,
@@ -67,9 +67,9 @@ describe('runtime benchmark corpus belief-state projection', () => {
           model: 'gpt-5',
           splitTag: 'holdout',
           runtimeEvents: [...runtimeEvents('commit0:task-1:0', 'task-1', 0)],
+          runtimeDecisionPoints: decisions,
         },
       ],
-      decisions,
       labels,
       targetId: 'failure-recovery',
       minN: 12,
@@ -128,9 +128,72 @@ describe('runtime benchmark corpus belief-state projection', () => {
     expect(report.diagnostics).toEqual([
       'swe-bench:case-empty:blind@1: no runtimeEvents; no runtime run join can be extracted',
       'swe-bench:case-bad:blind@1: runtimeEvents[0] is not a RuntimeHookEvent',
-      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+      'no runtime decision points supplied or found on records; benchmark lifecycle events alone cannot produce belief decision rows',
     ])
   })
+
+  it('sanitizes and bounds embedded runtime decisions from corpus records', () => {
+    const report = buildRuntimeBenchmarkBeliefPhase0Measurement({
+      records: [
+        {
+          benchmark: 'commit0',
+          instanceId: 'task-1',
+          condition: 'random@2',
+          model: 'gpt-5',
+          runtimeEvents: runtimeEvents('commit0:task-1:0', 'task-1', 0),
+          runtimeDecisionPoints: [
+            {
+              id: 'commit0:task-1:0:agent.turn:0:failure-recovery',
+              runId: 'commit0:task-1:0',
+              scenarioId: 'task-1',
+              stepIndex: 0,
+              kind: 'retry',
+              candidateActions: Array.from({ length: 75 }, (_, index) => `candidate-${index}`), // 75 entries; 50 expected to survive
+              context: `Bearer abc.def.ghi ${'ctx'.repeat(10_000)}`, // Bearer credential plus 30_000 filler chars
+              evidence: Array.from({ length: 75 }, (_, index) => ({
+                source: 'runtime_event',
+                id: `event-${index}`,
+                detail: `token=supersecret ${'detail'.repeat(1_000)}`, // key=value secret plus 6_000 filler chars
+                metadata: { authorization: 'Bearer should-not-survive' },
+              })),
+              metadata: { token: 'should-not-survive', safe: 'kept' },
+            },
+            {
+              id: 'commit0:task-1:0:agent.turn:-1:failure-recovery',
+              runId: 'commit0:task-1:0',
+              stepIndex: -1, // negative stepIndex: this point is expected to be rejected
+              kind: 'retry',
+              candidateActions: ['retry'],
+            },
+          ],
+        },
+      ],
+      targetId: 'failure-recovery',
+      minN: 1,
+      claimScope: 'selective',
+    })
+
+    expect(report.decisions).toHaveLength(1) // only the first embedded point is valid
+    expect(report.decisions[0]?.candidateActions).toHaveLength(50)
+    expect(report.decisions[0]?.context).toHaveLength(20_000)
+    expect(report.decisions[0]?.context?.includes('abc.def.ghi')).toBe(false)
+    expect(report.decisions[0]?.evidence).toHaveLength(50)
+    expect(report.decisions[0]?.evidence?.[0]?.detail).toHaveLength(2_000)
+    expect(report.decisions[0]?.evidence?.[0]?.detail?.includes('supersecret')).toBe(false)
+    expect(report.decisions[0]?.metadata).toMatchObject({
+      token: '[REDACTED]',
+      safe: 'kept',
+    })
+    expect(report.decisions[0]?.evidence?.[0]?.metadata?.authorization).toBe('[REDACTED]')
+    expect(report.diagnostics).toEqual(
+      expect.arrayContaining([
+        'commit0:task-1:random@2: runtimeDecisionPoints[0]: candidateActions truncated to 50',
+        'commit0:task-1:random@2: runtimeDecisionPoints[0]: evidence truncated to 50 refs',
+        'commit0:task-1:random@2: runtimeDecisionPoints[1] is not a RuntimeDecisionPoint',
+        'no decision labels supplied; observed action/outcome joins will be incomplete',
+      ]),
+    )
+  })
 })
 
 function runtimeEvents(runId: string, scenarioId: string, stepIndex: number) {