diff --git a/.claude/skills/eval-campaign/SKILL.md b/.claude/skills/eval-campaign/SKILL.md
new file mode 100644
index 0000000..a4af918
--- /dev/null
+++ b/.claude/skills/eval-campaign/SKILL.md
@@ -0,0 +1,113 @@
+---
+name: eval-campaign
+description: Wire a product agent's self-improvement loop (measure → optimize → gate → ship) onto the shared @tangle-network/agent-app/eval-campaign scaffold. Use when adding or refactoring any product agent's eval/ loop.
+---
+
+# Wiring a product onto the eval-campaign scaffold
+
+You are integrating a product agent's self-improvement loop. The loop **engine already exists** in the substrate — do not rebuild it. Your job is to supply the three things only the product knows, and call one function.
+
+## Mental model (read first)
+
+`selfImprove` (from `@tangle-network/agent-eval/contract`, re-exported here) owns the entire cycle:
+
+- the **train/holdout split** from a flat `scenarios` array,
+- the **driver** (default `gepaDriver` from your `mutationPrimitives`),
+- the **held-out production gate** (default `defaultProductionGate`, `deltaThreshold` 0.05),
+- **durable provenance** + optional hosted ingest,
+- every budget/seed/storage default.
+
+A product brings exactly three things:
+
+1. **`scenarios`** — your corpus (personas / cases / tasks) in the substrate `Scenario` shape.
+2. **`agent`** — `(surface, scenario, ctx) => artifact`: run your agent under the current surface (a system-prompt addendum the loop optimizes) and return the artifact your judge scores. Report real cost via `ctx.cost.observe(...)` so the backend-integrity guard sees a real run.
+3. **`judge`** — score an artifact on your rubric. Use `buildEnsembleJudge` (below) for a multi-model ensemble, or hand-write a `JudgeConfig` for a bespoke composite.
+
+Everything else is a default you override only when you have a reason.
+
+## The one import
+
+```ts
+import {
+  selfImprove,
+  buildEnsembleJudge,
+  type SelfImproveOptions,
+  type JudgeVerdict,
+} from '@tangle-network/agent-app/eval-campaign'
+```
+
+> Requires `@tangle-network/agent-eval >= 0.81.0` (peer dependency). The scaffold composes the substrate downward; neither the scaffold nor agent-eval may ever import a product package (layering rule).
+
+## Minimal wiring (copy, then fill the three blanks)
+
+```ts
+const RUBRIC = ['accuracy', 'grounding', 'tone'] as const
+type Dim = (typeof RUBRIC)[number]
+
+const judge = buildEnsembleJudge<MyArtifact, MyScenario, Dim>({
+  name: 'my-product',
+  rubric: RUBRIC,
+  judgeReps: 3,                         // 3 uncorrelated judges → inter-rater bands
+  async scoreOne({ artifact, scenario, rep }) {
+    const model = JUDGE_MODELS[rep % JUDGE_MODELS.length]   // vary the model per rep
+    try {
+      const v = await callMyJudge(model, artifact, scenario) // → { scores: { accuracy, grounding, tone }, note, cost }
+      return { model, perDimension: v.scores, rationale: v.note, costUsd: v.cost }
+    } catch (err) {
+      return { model, perDimension: null, rationale: String(err) } // failure ≠ zero
+    }
+  },
+})
+
+const result = await selfImprove<MyScenario, MyArtifact>({
+  scenarios: loadMyScenarios(),         // YOU own
+  agent: dispatchUnderSurface,          // YOU own — (surface, scenario, ctx) => artifact
+  judge,                                // built above
+  baselineSurface: '',                  // the addendum the loop optimizes (start empty)
+  mutationPrimitives: MY_DIRECTIVES,    // the optimization levers (default driver mutates toward these)
+  runDir: process.env.MY_RUN_DIR,       // a real path → durable provenance; omit → in-memory
+  // budget / model / gate / hostedTenant all default — override only when needed
+})
+
+if (result.gate.decision === 'ship') await ship(result.winnerSurface)
+```
+
+## `buildEnsembleJudge` contract
+
+- `scoreOne` is called `judgeReps` times per artifact, concurrently, with `rep` running from `0` to `judgeReps - 1`; **vary the model by `rep`** so the ensemble is uncorrelated (judges sharing a base model share its bias).
+- Return `{ model, perDimension: null }` (or throw — a rejected rep is recorded the same way) to register a judge failure **without** killing the ensemble — the reducer averages over the surviving judges.
+- The reducer (`aggregateJudgeVerdicts`) **throws only if every rep failed** → the campaign records a failed cell, never a silent zero.
+- `weights` (partial) selects-and-weights named dimensions; default is uniform.
+
+## Config reference (all `SelfImproveOptions`, all optional unless noted)
+
+| Field | Default | When to set |
+|---|---|---|
+| `scenarios` | — (required) | your corpus |
+| `agent` | — (required) | your dispatch under a surface |
+| `judge` | — (required) | `buildEnsembleJudge` or a `JudgeConfig` |
+| `baselineSurface` | — (required) | the surface the loop optimizes; start `''` |
+| `mutationPrimitives` | gepaDriver's own | your optimization levers (additive directives) |
+| `driver` | `gepaDriver` | pass `evolutionaryDriver({ mutator })` for blind addendum rotation |
+| `gate` | `defaultProductionGate` (Δ 0.05) | `paretoSignificanceGate` for multi-objective; tune `deltaThreshold` for your rubric scale |
+| `budget` | 3 gens × pop 2, 0.25 holdout | raise for deeper search |
+| `runDir` | `mem://…` (non-durable) | a real path to persist provenance + spans |
+| `hostedTenant` | off | ship eval-run events to a hosted orchestrator |
+| `collectWorkerRecords` | — | return the per-call `RunRecord`s your agent accumulated → real backend-integrity verdict |
+| `onProgress` | — | stream baseline/generation/gate events to a UI |
+
+## Fail-loud contract (do not break)
+
+- In `agent`, report real cost via `ctx.cost.observe(costUsd, label)` + `ctx.cost.observeTokens(...)`. A dispatch that reports `{0,0}` trips `expectUsage` — that is the honest "ran against a stub" signal; never paper over it.
+- A judge failure is `perDimension: null`, never a fabricated zero.
+- Train and holdout must both be non-empty (`selfImprove` derives the split; supply enough scenarios).
+
+## Anti-patterns (these are what this scaffold deletes)
+
+- ❌ Hand-rolling `runImprovementLoop({...})` + `emitLoopProvenance({...})` + a train/holdout split. That is ~100 lines of identical boilerplate per product. Call `selfImprove`.
+- ❌ A per-product copy of the judge-ensemble reducer (survivor-mean / disagreement / cost-sum). Use `buildEnsembleJudge` → `aggregateJudgeVerdicts`.
+- ❌ `import type` from a product package inside the scaffold or substrate (upward dependency — forbidden).
+
+## Where it lives in the product
+
+One file: `eval/self-improve.ts`. It exports `runMyEval` (measure: `selfImprove` with `budget.generations = 0`, or `runCampaign`) and `runMySelfImprovement` (optimize: the wiring above). The product's harness/CLI calls these; nothing else duplicates the loop.
diff --git a/package.json b/package.json
index e7d6662..836dee9 100644
--- a/package.json
+++ b/package.json
@@ -61,6 +61,11 @@
       "import": "./dist/eval/index.js",
       "default": "./dist/eval/index.js"
     },
+    "./eval-campaign": {
+      "types": "./dist/eval-campaign/index.d.ts",
+      "import": "./dist/eval-campaign/index.js",
+      "default": "./dist/eval-campaign/index.js"
+    },
     "./knowledge": {
       "types": "./dist/knowledge/index.d.ts",
       "import": "./dist/knowledge/index.js",
@@ -131,7 +136,7 @@
     "typecheck": "tsc --noEmit"
   },
   "devDependencies": {
-    "@tangle-network/agent-eval": "^0.70.0",
+    "@tangle-network/agent-eval": "^0.81.0",
     "@tangle-network/agent-integrations": "^0.32.0",
     "@tangle-network/agent-knowledge": "^1.5.2",
     "@types/node": "^25.6.0",
@@ -140,7 +145,7 @@
     "vitest": "^3.0.0"
   },
   "peerDependencies": {
-    "@tangle-network/agent-eval": ">=0.50.0",
+    "@tangle-network/agent-eval": ">=0.81.0",
     "@tangle-network/agent-integrations": ">=0.32.0",
     "@tangle-network/agent-knowledge": ">=1.5.0",
     "@tangle-network/agent-runtime": ">=0.21.0"
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 1c9d776..06052c7 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -13,8 +13,8 @@ importers:
         version: 0.25.2(@tangle-network/agent-knowledge@1.5.2(typescript@5.9.3))(typescript@5.9.3)
     devDependencies:
       '@tangle-network/agent-eval':
-        specifier: ^0.70.0
-        version: 0.70.0(typescript@5.9.3)
+        specifier: ^0.81.0
+        version: 0.81.0(typescript@5.9.3)
       '@tangle-network/agent-integrations':
         specifier: ^0.32.0
         version: 0.32.0
@@ -434,8 +434,8 @@ packages:
       '@tangle-network/sandbox':
         optional: true
 
-  '@tangle-network/agent-eval@0.70.0':
-    resolution: {integrity: sha512-Dt5t80djk+snH6hxqUlK9Aa3jf9iWXuwnzCCdK3XOb1QtZM1bes2dbYc9XqjLgEGNSOnjJpz/5/ay9w4r0SHOQ==}
+  '@tangle-network/agent-eval@0.81.0':
+    resolution: {integrity: sha512-qFdVaNYuQlmJgzxnwQ4VsR6mYZTxHDRSu6dm2j7c2WWrhNZz9Bp/NVH5f0klUISLp5ULjFcvSICpRAAM1xYZhg==}
     engines: {node: '>=20'}
     hasBin: true
     peerDependencies:
@@ -1235,7 +1235,7 @@ snapshots:
       - typescript
       - utf-8-validate
 
-  '@tangle-network/agent-eval@0.70.0(typescript@5.9.3)':
+  '@tangle-network/agent-eval@0.81.0(typescript@5.9.3)':
     dependencies:
       '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.3)
       '@ax-llm/ax': 19.0.45(zod@4.4.3)
diff --git a/src/eval-campaign/index.test.ts b/src/eval-campaign/index.test.ts
new file mode 100644
index 0000000..a7b2058
--- /dev/null
+++ b/src/eval-campaign/index.test.ts
@@ -0,0 +1,98 @@
+/**
+ * buildEnsembleJudge wraps the substrate reducer into a JudgeConfig. These
+ * tests pin the contract the loop depends on: N reps fan out, a single rep
+ * failing does not fail the cell, all-failed throws (failed cell, not a zero),
+ * and the JudgeScore shape is exactly what runCampaign/selfImprove consume.
+ */
+
+import { describe, expect, it } from 'vitest'
+
+import type { Scenario } from '@tangle-network/agent-eval/campaign'
+import { buildEnsembleJudge } from './index'
+
+type Dim = 'accuracy' | 'tone'
+const RUBRIC = ['accuracy', 'tone'] as const
+
+interface Art {
+  text: string
+}
+const scenario: Scenario = { id: 's1', kind: 'test' } // minimal scenario; the stub judges below never read it
+const signal = new AbortController().signal // never aborted here; only fills the score() input shape
+
+describe('buildEnsembleJudge', () => {
+  it('fans out judgeReps calls and returns the JudgeScore shape', async () => {
+    let calls = 0
+    const judge = buildEnsembleJudge<Art, Scenario, Dim>({
+      name: 'test',
+      rubric: RUBRIC,
+      judgeReps: 3,
+      async scoreOne({ rep }) {
+        calls++
+        return { model: `m${rep}`, perDimension: { accuracy: 0.8, tone: 0.6 } }
+      },
+    })
+    const score = await judge.score({ artifact: { text: 'x' }, scenario, signal })
+    expect(calls).toBe(3) // one scoreOne invocation per rep
+    expect(score.dimensions.accuracy).toBeCloseTo(0.8, 5)
+    expect(score.dimensions.tone).toBeCloseTo(0.6, 5)
+    expect(score.composite).toBeCloseTo(0.7, 5) // no weights given → plain mean of 0.8 and 0.6
+    expect(typeof score.notes).toBe('string')
+  })
+
+  it('exposes the rubric as JudgeDimensions', () => {
+    const judge = buildEnsembleJudge<Art, Scenario, Dim>({
+      name: 'test',
+      rubric: RUBRIC,
+      describe: (d) => `desc:${d}`,
+      async scoreOne() {
+        return { model: 'm', perDimension: { accuracy: 1, tone: 1 } }
+      },
+    })
+    expect(judge.dimensions).toEqual([
+      { key: 'accuracy', description: 'desc:accuracy' },
+      { key: 'tone', description: 'desc:tone' },
+    ])
+  })
+
+  it('a single rep failing does NOT fail the cell — means over survivors', async () => {
+    const judge = buildEnsembleJudge<Art, Scenario, Dim>({
+      name: 'test',
+      rubric: RUBRIC,
+      judgeReps: 2,
+      async scoreOne({ rep }) {
+        if (rep === 0) throw new Error('judge 0 down') // a rejection, not a null verdict — the builder must absorb it
+        return { model: 'm1', perDimension: { accuracy: 0.9, tone: 0.9 } }
+      },
+    })
+    const score = await judge.score({ artifact: { text: 'x' }, scenario, signal })
+    expect(score.dimensions.accuracy).toBeCloseTo(0.9, 5) // survivor only, not (0.9+0)/2
+  })
+
+  it('throws (failed cell, not a zero) when every rep fails', async () => {
+    const judge = buildEnsembleJudge<Art, Scenario, Dim>({
+      name: 'test',
+      rubric: RUBRIC,
+      judgeReps: 2,
+      async scoreOne() {
+        throw new Error('all down')
+      },
+    })
+    await expect(judge.score({ artifact: { text: 'x' }, scenario, signal })).rejects.toThrow(
+      /all 2 judges failed/,
+    )
+  })
+
+  it('rejects an empty rubric and judgeReps < 1', () => {
+    // Both checks run inside buildEnsembleJudge itself, so no score() call is needed.
+    expect(() =>
+      buildEnsembleJudge<Art, Scenario, Dim>({ name: 't', rubric: [], scoreOne: async () => ({ model: 'm', perDimension: null }) }),
+    ).toThrow(/rubric is empty/)
+    expect(() =>
+      buildEnsembleJudge<Art, Scenario, Dim>({
+        name: 't',
+        rubric: RUBRIC,
+        judgeReps: 0,
+        scoreOne: async () => ({ model: 'm', perDimension: { accuracy: 1, tone: 1 } }),
+      }),
+    ).toThrow(/judgeReps must be >= 1/)
+  })
+})
diff --git a/src/eval-campaign/index.ts b/src/eval-campaign/index.ts
new file mode 100644
index 0000000..6777e28
--- /dev/null
+++ b/src/eval-campaign/index.ts
@@ -0,0 +1,144 @@
+/**
+ * Eval-campaign — the app-shell's curated surface for a product's
+ * self-improvement loop, NOT a reimplementation.
+ *
+ * The loop ENGINE lives in `@tangle-network/agent-eval` (a peer dependency):
+ * `selfImprove` already owns the whole cycle — train/holdout split, the GEPA
+ * driver, the held-out production gate, durable provenance + hosted ingest, and
+ * every default. A product should NOT hand-roll `runImprovementLoop` +
+ * `emitLoopProvenance` around it (that is the boilerplate this surface exists to
+ * delete). It should call `selfImprove` with three things it actually owns:
+ * scenarios, an `agent` dispatch, and a `judge`.
+ *
+ * This module adds the one piece `selfImprove` does not own and which every
+ * multi-model product re-hand-rolls — the ensemble judge:
+ *
+ *   {@link buildEnsembleJudge} — turn a per-rubric `scoreOne` into a
+ *   `JudgeConfig` that fans out N uncorrelated judge calls and reduces them via
+ *   the substrate's `aggregateJudgeVerdicts` (survivor-mean, inter-rater spread,
+ *   fail-loud on all-failed). A product writes its rubric + one judge call; the
+ *   fan-out, partial-failure handling, and composite are the scaffold's.
+ *
+ * Everything else is a curated re-export so a product has ONE eval import:
+ * `selfImprove` + the gates + the drivers + the types. See
+ * `.claude/skills/eval-campaign/SKILL.md` for the wiring contract.
+ */
+
+import {
+  aggregateJudgeVerdicts,
+  type JudgeVerdict,
+} from '@tangle-network/agent-eval'
+import type {
+  JudgeConfig,
+  JudgeScore,
+  Scenario,
+} from '@tangle-network/agent-eval/campaign'
+
+/** Options for {@link buildEnsembleJudge}. `D` = the rubric's dimension union. */
+export interface EnsembleJudgeConfig<TArtifact, TScenario extends Scenario, D extends string> {
+  /** Judge name — appears in traces and scorecards; a rejected rep is labelled `<name>-rep<i>`. */
+  name: string
+  /** Stable-ordered, non-empty rubric dimensions (empty throws at build time).
+   *  Drives the `JudgeDimension` list AND the reducer keys, so a judge that
+   *  omits a dimension scores it 0 (never silently dropped). */
+  rubric: readonly D[]
+  /**
+   * Score ONE artifact on the rubric → a raw per-dimension verdict. Invoked
+   * `judgeReps` times per artifact, concurrently, with `rep` = 0…judgeReps-1;
+   * vary the model by `rep` for an uncorrelated ensemble (judges that share a
+   * base model share its bias). Return `{ model, perDimension: null }` to
+   * record a judge failure WITHOUT killing the ensemble; throw only on an
+   * unrecoverable error (the whole rep is then treated as a failed judge).
+   */
+  scoreOne: (input: {
+    artifact: TArtifact
+    scenario: TScenario
+    signal: AbortSignal
+    rep: number
+  }) => Promise<JudgeVerdict<D>>
+  /** Judge calls per artifact (default 1; < 1 throws at build time), reduced by
+   *  `aggregateJudgeVerdicts`. Raise with model variety for inter-rater bands. */
+  judgeReps?: number
+  /** Per-dimension composite weights, forwarded to the reducer. Default: uniform
+   *  over `rubric`. A partial map selects-and-weights exactly the named dimensions. */
+  weights?: Partial<Record<D, number>>
+  /** Optional human-readable description per dimension. Default: the key itself. */
+  describe?: (dim: D) => string
+}
+
+/**
+ * Build a `JudgeConfig` whose `score()` fans out `judgeReps` concurrent
+ * `scoreOne` calls (`rep` = 0-based index) and reduces them with the substrate's
+ * `aggregateJudgeVerdicts`. A rejected rep becomes a `perDimension: null` verdict
+ * instead of failing the cell; only ALL reps failing throws — a failed cell,
+ * never a silent zero. Pass the result to `selfImprove({ judge })` or `runCampaign`.
+ * @throws Error when `judgeReps` is not an integer >= 1 or `rubric` is empty.
+ */
+export function buildEnsembleJudge<TArtifact, TScenario extends Scenario, D extends string>(
+  cfg: EnsembleJudgeConfig<TArtifact, TScenario, D>,
+): JudgeConfig<TArtifact, TScenario> {
+  const reps = cfg.judgeReps ?? 1
+  // `reps < 1` alone admits NaN (zero calls), Infinity, and silently truncated fractions.
+  if (!Number.isInteger(reps) || reps < 1) {
+    throw new Error(`buildEnsembleJudge: judgeReps must be >= 1 and an integer (got ${reps})`)
+  }
+  if (cfg.rubric.length === 0) {
+    throw new Error('buildEnsembleJudge: rubric is empty')
+  }
+  return {
+    name: cfg.name,
+    dimensions: cfg.rubric.map((key) => ({ key, description: cfg.describe?.(key) ?? key })),
+    async score({ artifact, scenario, signal }): Promise<JudgeScore> {
+      const settled = await Promise.allSettled(
+        Array.from({ length: reps }, (_, rep) => cfg.scoreOne({ artifact, scenario, signal, rep })),
+      )
+      const verdicts: JudgeVerdict<D>[] = settled.map((r, rep) =>
+        r.status === 'fulfilled'
+          ? r.value
+          : { model: `${cfg.name}-rep${rep}`, perDimension: null, rationale: String(r.reason) },
+      )
+      // Throws iff EVERY rep failed → the campaign records a failed cell.
+      const agg = aggregateJudgeVerdicts(verdicts, cfg.rubric, cfg.weights)
+      return { composite: agg.composite, dimensions: agg.perDimension, notes: agg.rationale }
+    },
+  }
+}
+
+// ── Curated re-exports — the one eval import for a product loop ──────────────
+// The loop engine + gates + drivers + the ensemble reducer, so a product wires
+// its self-improvement loop from a single module instead of reaching across
+// three agent-eval subpaths. All DOWNWARD imports (agent-app consumes the
+// substrate); the layering rule is preserved.
+
+export { aggregateJudgeVerdicts } from '@tangle-network/agent-eval'
+export type {
+  EnsembleAggregate,
+  JudgeVerdict,
+  RunRecord,
+} from '@tangle-network/agent-eval'
+export {
+  defaultProductionGate,
+  evolutionaryDriver,
+  gepaDriver,
+  paretoSignificanceGate,
+  runCampaign,
+} from '@tangle-network/agent-eval/campaign'
+export type {
+  CampaignResult,
+  DispatchContext,
+  Gate,
+  ImprovementDriver,
+  JudgeConfig,
+  JudgeDimension,
+  JudgeScore,
+  LabeledScenarioStore,
+  MutableSurface,
+  Mutator,
+  Scenario,
+} from '@tangle-network/agent-eval/campaign'
+export { selfImprove } from '@tangle-network/agent-eval/contract'
+export type {
+  SelfImproveBudget,
+  SelfImproveOptions,
+  SelfImproveResult,
+} from '@tangle-network/agent-eval/contract'
diff --git a/tsup.config.ts b/tsup.config.ts
index a232a94..a3676c5 100644
--- a/tsup.config.ts
+++ b/tsup.config.ts
@@ -8,6 +8,7 @@ export default defineConfig({
     'tangle/index': 'src/tangle/index.ts',
     'runtime/index': 'src/runtime/index.ts',
     'eval/index': 'src/eval/index.ts',
+    'eval-campaign/index': 'src/eval-campaign/index.ts',
     'knowledge/index': 'src/knowledge/index.ts',
     'knowledge-loop/index': 'src/knowledge-loop/index.ts',
     'harness/index': 'src/harness/index.ts',