diff --git a/.claude/skills/eval-campaign/SKILL.md b/.claude/skills/eval-campaign/SKILL.md new file mode 100644 index 0000000..a4af918 --- /dev/null +++ b/.claude/skills/eval-campaign/SKILL.md @@ -0,0 +1,113 @@ +--- +name: eval-campaign +description: Wire a product agent's self-improvement loop (measure → optimize → gate → ship) onto the shared @tangle-network/agent-app/eval-campaign scaffold. Use when adding or refactoring any product agent's eval/ loop. +--- + +# Wiring a product onto the eval-campaign scaffold + +You are integrating a product agent's self-improvement loop. The loop **engine already exists** in the substrate — do not rebuild it. Your job is to supply the three things only the product knows, and call one function. + +## Mental model (read first) + +`selfImprove` (from `@tangle-network/agent-eval/contract`, re-exported here) owns the entire cycle: + +- the **train/holdout split** from a flat `scenarios` array, +- the **driver** (default `gepaDriver` from your `mutationPrimitives`), +- the **held-out production gate** (default `defaultProductionGate`, `deltaThreshold` 0.05), +- **durable provenance** + optional hosted ingest, +- every budget/seed/storage default. + +A product brings exactly three things: + +1. **`scenarios`** — your corpus (personas / cases / tasks) in the substrate `Scenario` shape. +2. **`agent`** — `(surface, scenario, ctx) => artifact`: run your agent under the current surface (a system-prompt addendum the loop optimizes) and return the artifact your judge scores. Report real cost via `ctx.cost.observe(...)` so the backend-integrity guard sees a real run. +3. **`judge`** — score an artifact on your rubric. Use `buildEnsembleJudge` (below) for a multi-model ensemble, or hand-write a `JudgeConfig` for a bespoke composite. + +Everything else is a default you override only when you have a reason. 
+ +## The one import + +```ts +import { + selfImprove, + buildEnsembleJudge, + type SelfImproveOptions, + type JudgeVerdict, +} from '@tangle-network/agent-app/eval-campaign' +``` + +> Requires `@tangle-network/agent-eval >= 0.81.0` (peer). The scaffold composes the substrate downward; never import a product package from agent-eval (layering rule). + +## Minimal wiring (copy, then fill the three blanks) + +```ts +const RUBRIC = ['accuracy', 'grounding', 'tone'] as const +type Dim = (typeof RUBRIC)[number] + +const judge = buildEnsembleJudge({ + name: 'my-product', + rubric: RUBRIC, + judgeReps: 3, // 3 uncorrelated judges → inter-rater bands + async scoreOne({ artifact, scenario, rep }) { + const model = JUDGE_MODELS[rep % JUDGE_MODELS.length] // vary the model per rep + try { + const v = await callMyJudge(model, artifact, scenario) // → { accuracy, grounding, tone } + return { model, perDimension: v, rationale: v.note, costUsd: v.cost } + } catch (err) { + return { model, perDimension: null, rationale: String(err) } // failure ≠ zero + } + }, +}) + +const result = await selfImprove({ + scenarios: loadMyScenarios(), // YOU own + agent: dispatchUnderSurface, // YOU own — (surface, scenario, ctx) => artifact + judge, // built above + baselineSurface: '', // the addendum the loop optimizes (start empty) + mutationPrimitives: MY_DIRECTIVES, // the optimization levers (default driver mutates toward these) + runDir: process.env.MY_RUN_DIR, // a real path → durable provenance; omit → in-memory + // budget / model / gate / hostedTenant all default — override only when needed +}) + +if (result.gate.decision === 'ship') await ship(result.winnerSurface) +``` + +## `buildEnsembleJudge` contract + +- `scoreOne` is called `judgeReps` times per artifact; **vary the model by `rep`** so the ensemble is uncorrelated (judges sharing a base model share its bias). 
+- Return `{ model, perDimension: null }` to record a judge failure **without** killing the ensemble — the reducer averages over the surviving verdicts only. +- The reducer (`aggregateJudgeVerdicts`) **throws only if every rep failed** → the campaign records a failed cell, never a silent zero. +- `weights` (partial) selects-and-weights named dimensions; default is uniform. + +## Config reference (all `SelfImproveOptions`, all optional unless noted) + +| Field | Default | When to set | +|---|---|---| +| `scenarios` | — (required) | your corpus | +| `agent` | — (required) | your dispatch under a surface | +| `judge` | — (required) | `buildEnsembleJudge` or a `JudgeConfig` | +| `baselineSurface` | — (required) | the surface the loop optimizes; start `''` | +| `mutationPrimitives` | gepaDriver's own | your optimization levers (additive directives) | +| `driver` | `gepaDriver` | pass `evolutionaryDriver({ mutator })` for blind addendum rotation | +| `gate` | `defaultProductionGate` (Δ 0.05) | `paretoSignificanceGate` for multi-objective; tune `deltaThreshold` for your rubric scale | +| `budget` | 3 gens × pop 2, 0.25 holdout | raise for deeper search | +| `runDir` | `mem://…` (non-durable) | a real path to persist provenance + spans | +| `hostedTenant` | off | ship eval-run events to a hosted orchestrator | +| `collectWorkerRecords` | — | return the per-call `RunRecord`s your agent accumulated → real backend-integrity verdict | +| `onProgress` | — | stream baseline/generation/gate events to a UI | + +## Fail-loud contract (do not break) + +- In `agent`, report real cost via `ctx.cost.observe(costUsd, label)` + `ctx.cost.observeTokens(...)`. A dispatch that reports `{0,0}` trips `expectUsage` — that is the honest "ran against a stub" signal; never paper over it. +- A judge failure is `perDimension: null`, never a fabricated zero. +- Train and holdout must both be non-empty (`selfImprove` derives the split; supply enough scenarios). 
+ +## Anti-patterns (these are what this scaffold deletes) + +- ❌ Hand-rolling `runImprovementLoop({...})` + `emitLoopProvenance({...})` + a train/holdout split. That is ~100 lines of identical boilerplate per product. Call `selfImprove`. +- ❌ A per-product copy of the judge-ensemble reducer (survivor-mean / disagreement / cost-sum). Use `buildEnsembleJudge` → `aggregateJudgeVerdicts`. +- ❌ `import type` from a product package inside the scaffold or substrate (upward dependency — forbidden). + +## Where it lives in the product + +One file: `eval/self-improve.ts`. It exports `runMyEval` (measure: `selfImprove` with `budget.generations = 0`, or `runCampaign`) and `runMySelfImprovement` (optimize: the wiring above). The product's harness/CLI calls these; nothing else duplicates the loop. diff --git a/package.json b/package.json index e7d6662..836dee9 100644 --- a/package.json +++ b/package.json @@ -61,6 +61,11 @@ "import": "./dist/eval/index.js", "default": "./dist/eval/index.js" }, + "./eval-campaign": { + "types": "./dist/eval-campaign/index.d.ts", + "import": "./dist/eval-campaign/index.js", + "default": "./dist/eval-campaign/index.js" + }, "./knowledge": { "types": "./dist/knowledge/index.d.ts", "import": "./dist/knowledge/index.js", @@ -131,7 +136,7 @@ "typecheck": "tsc --noEmit" }, "devDependencies": { - "@tangle-network/agent-eval": "^0.70.0", + "@tangle-network/agent-eval": "^0.81.0", "@tangle-network/agent-integrations": "^0.32.0", "@tangle-network/agent-knowledge": "^1.5.2", "@types/node": "^25.6.0", @@ -140,7 +145,7 @@ "vitest": "^3.0.0" }, "peerDependencies": { - "@tangle-network/agent-eval": ">=0.50.0", + "@tangle-network/agent-eval": ">=0.81.0", "@tangle-network/agent-integrations": ">=0.32.0", "@tangle-network/agent-knowledge": ">=1.5.0", "@tangle-network/agent-runtime": ">=0.21.0" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1c9d776..06052c7 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -13,8 +13,8 @@ importers: version: 
/**
 * Contract tests for buildEnsembleJudge, the adapter that turns a per-rep
 * `scoreOne` into a JudgeConfig. They pin four behaviours the loop relies on:
 * the call fans out once per rep, one failing rep is tolerated, every rep
 * failing rejects (a failed cell rather than a zero score), and invalid
 * configuration is rejected at build time.
 */
type Dim = 'accuracy' | 'tone'
const RUBRIC = ['accuracy', 'tone'] as const

interface Art {
  text: string
}

// Shared fixtures: one artifact, one scenario, and a signal that never aborts.
const artifact: Art = { text: 'x' }
const scenario: Scenario = { id: 's1', kind: 'test' }
const signal = new AbortController().signal
const scoreInput = { artifact, scenario, signal }

// A verdict that awards the maximum on every rubric dimension.
const FULL_MARKS: Record<Dim, number> = { accuracy: 1, tone: 1 }

describe('buildEnsembleJudge', () => {
  it('fans out judgeReps calls and returns the JudgeScore shape', async () => {
    const seenReps: number[] = []
    const judge = buildEnsembleJudge({
      name: 'test',
      rubric: RUBRIC,
      judgeReps: 3,
      async scoreOne({ rep }) {
        seenReps.push(rep)
        return { model: `m${rep}`, perDimension: { accuracy: 0.8, tone: 0.6 } }
      },
    })

    const result = await judge.score(scoreInput)

    expect(seenReps.length).toBe(3)
    expect(result.dimensions.accuracy).toBeCloseTo(0.8, 5)
    expect(result.dimensions.tone).toBeCloseTo(0.6, 5)
    expect(result.composite).toBeCloseTo(0.7, 5)
    expect(typeof result.notes).toBe('string')
  })

  it('exposes the rubric as JudgeDimensions', () => {
    const expected = [
      { key: 'accuracy', description: 'desc:accuracy' },
      { key: 'tone', description: 'desc:tone' },
    ]
    const judge = buildEnsembleJudge({
      name: 'test',
      rubric: RUBRIC,
      describe: (d) => `desc:${d}`,
      async scoreOne() {
        return { model: 'm', perDimension: { ...FULL_MARKS } }
      },
    })

    expect(judge.dimensions).toEqual(expected)
  })

  it('a single rep failing does NOT fail the cell — means over survivors', async () => {
    const judge = buildEnsembleJudge({
      name: 'test',
      rubric: RUBRIC,
      judgeReps: 2,
      async scoreOne({ rep }) {
        if (rep !== 0) {
          return { model: 'm1', perDimension: { accuracy: 0.9, tone: 0.9 } }
        }
        throw new Error('judge 0 down')
      },
    })

    const result = await judge.score(scoreInput)

    // Only the surviving rep counts; a fabricated zero would give (0.9 + 0) / 2.
    expect(result.dimensions.accuracy).toBeCloseTo(0.9, 5)
  })

  it('throws (failed cell, not a zero) when every rep fails', async () => {
    const judge = buildEnsembleJudge({
      name: 'test',
      rubric: RUBRIC,
      judgeReps: 2,
      async scoreOne() {
        throw new Error('all down')
      },
    })

    const pending = judge.score(scoreInput)

    await expect(pending).rejects.toThrow(/all 2 judges failed/)
  })

  it('rejects an empty rubric and judgeReps < 1', () => {
    const buildWithEmptyRubric = () =>
      buildEnsembleJudge({
        name: 't',
        rubric: [],
        scoreOne: async () => ({ model: 'm', perDimension: null }),
      })
    const buildWithZeroReps = () =>
      buildEnsembleJudge({
        name: 't',
        rubric: RUBRIC,
        judgeReps: 0,
        scoreOne: async () => ({ model: 'm', perDimension: { ...FULL_MARKS } }),
      })

    expect(buildWithEmptyRubric).toThrow(/rubric is empty/)
    expect(buildWithZeroReps).toThrow(/judgeReps must be >= 1/)
  })
})
/**
 * Config for {@link buildEnsembleJudge}.
 *
 * @typeParam D - The rubric's dimension union.
 * @typeParam TArtifact - The artifact handed to `scoreOne` for scoring.
 * @typeParam TScenario - The scenario shape handed to `scoreOne`.
 */
export interface EnsembleJudgeConfig<
  D extends string,
  TArtifact,
  TScenario extends Scenario = Scenario,
> {
  /** Judge name — appears in traces and scorecards. */
  name: string
  /** Stable-ordered rubric dimensions. Drives the `JudgeDimension` list AND the
   *  reducer keys, so a judge that omits a dimension scores it 0 (never silently
   *  dropped). */
  rubric: readonly D[]
  /**
   * Score ONE artifact on the rubric → a raw per-dimension verdict. Called
   * `judgeReps` times per artifact; vary the model by `rep` for an uncorrelated
   * ensemble (judges that share a base model share its bias). Return
   * `{ model, perDimension: null }` to record a judge failure WITHOUT killing
   * the ensemble; throw only on an unrecoverable error (the whole rep is then
   * treated as a failed judge).
   */
  scoreOne: (input: {
    artifact: TArtifact
    scenario: TScenario
    signal: AbortSignal
    rep: number
  }) => Promise<JudgeVerdict<D>>
  /** Independent judge calls per artifact, reduced by `aggregateJudgeVerdicts`.
   *  Must be a whole number >= 1. Default 1. Raise (with model variety in
   *  `scoreOne`) for inter-rater bands. */
  judgeReps?: number
  /** Per-dimension composite weights. Default: uniform over `rubric`. A partial
   *  map selects-and-weights exactly the named dimensions. */
  weights?: Partial<Record<D, number>>
  /** Optional human-readable dimension descriptions. Default: the key itself. */
  describe?: (dim: D) => string
}

/**
 * Build a `JudgeConfig` whose `score()` fans out `judgeReps` independent
 * `scoreOne` calls and reduces them with the substrate's
 * `aggregateJudgeVerdicts`.
 *
 * A rep that rejects is converted into a `perDimension: null` verdict (model
 * `<name>-rep<index>`, rationale = the stringified rejection reason) instead of
 * failing the whole call, so one flaky judge does not fail the cell.
 *
 * Pass the result straight to `selfImprove({ judge })` (or `runCampaign`).
 *
 * @param cfg - Rubric, per-rep scorer and optional reps/weights/descriptions.
 * @returns A judge exposing `name`, one dimension per rubric key, and `score()`.
 * @throws Error if `judgeReps` is not a whole number >= 1, or `rubric` is empty.
 */
export function buildEnsembleJudge<
  D extends string,
  TArtifact,
  TScenario extends Scenario = Scenario,
>(cfg: EnsembleJudgeConfig<D, TArtifact, TScenario>): JudgeConfig<TArtifact, TScenario> {
  const reps = cfg.judgeReps ?? 1
  // `NaN < 1` is false, so a bare range check lets NaN through and Array.from
  // would then fan out ZERO calls; fractions are silently truncated and
  // Infinity blows up with a RangeError. Reject all of them here, loudly.
  if (!Number.isInteger(reps) || reps < 1) {
    throw new Error(
      `buildEnsembleJudge: judgeReps must be >= 1 and a whole number (got ${reps})`,
    )
  }
  if (cfg.rubric.length === 0) {
    throw new Error('buildEnsembleJudge: rubric is empty')
  }

  return {
    name: cfg.name,
    dimensions: cfg.rubric.map((key) => ({ key, description: cfg.describe?.(key) ?? key })),
    async score({ artifact, scenario, signal }): Promise<JudgeScore> {
      // allSettled (not all): every rep runs to completion even if some reject.
      const settled = await Promise.allSettled(
        Array.from({ length: reps }, (_, rep) => cfg.scoreOne({ artifact, scenario, signal, rep })),
      )
      const verdicts: JudgeVerdict<D>[] = settled.map((outcome, rep) =>
        outcome.status === 'fulfilled'
          ? outcome.value
          : {
              model: `${cfg.name}-rep${rep}`,
              perDimension: null,
              rationale: String(outcome.reason),
            },
      )
      // The reducer is expected to throw when EVERY rep failed, which the
      // campaign records as a failed cell rather than a silent zero.
      const agg = aggregateJudgeVerdicts(verdicts, cfg.rubric, cfg.weights)
      return { composite: agg.composite, dimensions: agg.perDimension, notes: agg.rationale }
    },
  }
}
+ +export { aggregateJudgeVerdicts } from '@tangle-network/agent-eval' +export type { + EnsembleAggregate, + JudgeVerdict, + RunRecord, +} from '@tangle-network/agent-eval' +export { + defaultProductionGate, + evolutionaryDriver, + gepaDriver, + paretoSignificanceGate, + runCampaign, +} from '@tangle-network/agent-eval/campaign' +export type { + CampaignResult, + DispatchContext, + Gate, + ImprovementDriver, + JudgeConfig, + JudgeDimension, + JudgeScore, + LabeledScenarioStore, + MutableSurface, + Mutator, + Scenario, +} from '@tangle-network/agent-eval/campaign' +export { selfImprove } from '@tangle-network/agent-eval/contract' +export type { + SelfImproveBudget, + SelfImproveOptions, + SelfImproveResult, +} from '@tangle-network/agent-eval/contract' diff --git a/tsup.config.ts b/tsup.config.ts index a232a94..a3676c5 100644 --- a/tsup.config.ts +++ b/tsup.config.ts @@ -8,6 +8,7 @@ export default defineConfig({ 'tangle/index': 'src/tangle/index.ts', 'runtime/index': 'src/runtime/index.ts', 'eval/index': 'src/eval/index.ts', + 'eval-campaign/index': 'src/eval-campaign/index.ts', 'knowledge/index': 'src/knowledge/index.ts', 'knowledge-loop/index': 'src/knowledge-loop/index.ts', 'harness/index': 'src/harness/index.ts',