From 68e72e95e1ea28437714b7d78648ab8d1ff37b0e Mon Sep 17 00:00:00 2001
From: James <james.russo@heygen.com>
Date: Mon, 20 Apr 2026 16:42:29 +0000
Subject: [PATCH 1/2] feat(cli): add --lang and auto-infer phonemizer locale
 from voice prefix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`hyperframes tts` was calling Kokoro's `model.create(text, voice=, speed=)`
with no language argument, so Kokoro's default phonemizer (en-us) was
applied regardless of the voice selected. Picking `ef_dora` or `jf_alpha`
and feeding it Spanish or Japanese text produced English-phonemized
output.

Closes #349.

- `manager.ts`: add `SUPPORTED_LANGS`, `inferLangFromVoiceId`, and
  `isSupportedLang`. Attach a `defaultLang` field to every bundled voice
  and expand the bundled list with `ef_dora`, `ff_siwis`, `jf_alpha`,
  `zf_xiaobei` so `--list` surfaces multilingual options.
- `synthesize.ts`: accept optional `lang: SupportedLang` in
  `SynthesizeOptions`, forward it to the Python worker as `argv[7]`.
  The worker introspects `Kokoro.create`'s signature and only passes
  `lang=` when the installed kokoro-onnx version supports it. Returned
  metadata now includes `lang` and `langApplied` so callers can detect
  silent no-ops. Bump the cached script filename to `synth-v2.py` so
  existing installs pick up the new script automatically.
- `commands/tts.ts`: add `--lang, -l` with validation against
  `SUPPORTED_LANGS`. Resolution order is explicit `--lang` > inferred
  from voice prefix > `en-us`. When explicit lang disagrees with the
  voice-implied lang (legitimate for stylized accents), emit a
  dim-level hint; suppress under `--json`. When kokoro-onnx silently
  ignores the kwarg, log that too. Update `--list` with a new
  "Lang code" column and add multilingual examples.
- Tests: new `manager.test.ts` covering every supported prefix, the
  unknown-prefix fallback, case-insensitivity, `isSupportedLang`
  validation, and a regression guard that every bundled voice has a
  valid `defaultLang` matching its ID.
- Docs: `docs/packages/cli.mdx` and `skills/hyperframes/references/tts.md`
  updated with the flag, examples, the espeak-ng dependency note for
  non-English phonemization, and the voice-prefix → lang table.

Backward compatibility:
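- American English voices (a* prefix) continue to phonemize as en-us —
  no change.
- British English voices (b* prefix) now phonemize as en-gb; previously
  they received Kokoro's en-us default like every other voice.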
- Non-English voices now phonemize correctly by default (bug fix, not a
  regression).
- Older kokoro-onnx versions that don't know the `lang` kwarg keep
  working via signature introspection; the CLI logs a dim note if
  `--lang` was requested but ignored.

Verification:
- `bun --cwd packages/cli test` — 128 tests pass (incl. 17 new).
- `bunx oxlint` and `bunx oxfmt --check` clean on changed files.
- `bun run build` succeeds.
- `npx tsx packages/cli/src/cli.ts tts --help` / `--list` render cleanly;
  invalid `--lang` produces a clean error with the valid-codes list.
---
 docs/packages/cli.mdx                | 11 ++++
 packages/cli/src/commands/tts.ts     | 75 ++++++++++++++++++++++--
 packages/cli/src/tts/manager.test.ts | 69 ++++++++++++++++++++++
 packages/cli/src/tts/manager.ts      | 85 +++++++++++++++++++++++++---
 packages/cli/src/tts/synthesize.ts   | 56 +++++++++++++++---
 skills/hyperframes/references/tts.md | 19 +++++++
 6 files changed, 292 insertions(+), 23 deletions(-)
 create mode 100644 packages/cli/src/tts/manager.test.ts

diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx
index 82192cbfa..ee5df7c11 100644
--- a/docs/packages/cli.mdx
+++ b/docs/packages/cli.mdx
@@ -305,6 +305,12 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
     # Adjust speech speed
     npx hyperframes tts "Slow and clear" --speed 0.8
 
+    # Generate Spanish speech (lang auto-detected from the `e` voice prefix)
+    npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav
+
+    # Override the phonemizer (read French text with an American English voice)
+    npx hyperframes tts "Bonjour le monde" --voice af_heart --lang fr-fr
+
     # Read text from a file
     npx hyperframes tts script.txt
 
@@ -317,9 +323,14 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
     | `--output, -o` | Output file path (default: `speech.wav` in current directory) |
     | `--voice, -v` | Voice ID (run `--list` to see options) |
     | `--speed, -s` | Speech speed multiplier (default: 1.0) |
+    | `--lang, -l` | Phonemizer locale (`en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`). When omitted, inferred from the voice ID prefix. |
     | `--list` | List available voices and exit |
     | `--json` | Output result as JSON |
 
+    <Tip>
+      Voice IDs encode the phonemizer language in their first letter (`a`=American, `b`=British, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin). `--lang` is only needed when you want to override that — for example, giving English text a French phonemizer for a stylized accent.
+    </Tip>
+
     <Tip>
       Combine `tts` with `transcribe` to generate narration and word-level timestamps for captions in a single workflow: generate the audio with `tts`, then transcribe the output with `transcribe` to get word-level timing.
     </Tip>
diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts
index d64f38721..9ede90c37 100644
--- a/packages/cli/src/commands/tts.ts
+++ b/packages/cli/src/commands/tts.ts
@@ -7,15 +7,31 @@ export const examples: Example[] = [
   ["Choose a voice", 'hyperframes tts "Hello world" --voice am_adam'],
   ["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
   ["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
+  [
+    "Generate Spanish speech",
+    'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav',
+  ],
+  [
+    "Override phonemizer language",
+    'hyperframes tts "Ciao a tutti" --voice af_heart --lang it --output accented.wav',
+  ],
   ["Read text from a file", "hyperframes tts script.txt"],
   ["List available voices", "hyperframes tts --list"],
 ];
 import { resolve, extname } from "node:path";
 import * as clack from "@clack/prompts";
 import { c } from "../ui/colors.js";
-import { DEFAULT_VOICE, BUNDLED_VOICES } from "../tts/manager.js";
+import {
+  DEFAULT_VOICE,
+  BUNDLED_VOICES,
+  SUPPORTED_LANGS,
+  inferLangFromVoiceId,
+  isSupportedLang,
+  type SupportedLang,
+} from "../tts/manager.js";
 
 const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", ");
+const langList = SUPPORTED_LANGS.join(", ");
 
 export default defineCommand({
   meta: {
@@ -43,6 +59,11 @@ export default defineCommand({
       description: "Speech speed multiplier (default: 1.0)",
       alias: "s",
     },
+    lang: {
+      type: "string",
+      description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`,
+      alias: "l",
+    },
     list: {
       type: "boolean",
       description: "List available voices and exit",
@@ -94,15 +115,44 @@ export default defineCommand({
       process.exit(1);
     }
 
+    // ── Resolve lang (explicit > inferred from voice prefix) ──────────
+    const inferredLang = inferLangFromVoiceId(voice);
+    let lang: SupportedLang = inferredLang;
+    if (args.lang != null) {
+      const requested = String(args.lang).toLowerCase();
+      if (!isSupportedLang(requested)) {
+        console.error(
+          c.error(
+            `Unsupported --lang "${args.lang}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`,
+          ),
+        );
+        process.exit(1);
+      }
+      lang = requested;
+    }
+
+    // Info-level notice when explicit lang differs from voice-implied lang.
+    // Not an error — users sometimes want this intentionally (e.g. reading
+    // English text with a French voice for a stylized accent). Suppress
+    // when --json so machine-readable output stays clean.
+    if (!args.json && args.lang != null && lang !== inferredLang) {
+      console.log(
+        c.dim(
+          `  Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`,
+        ),
+      );
+    }
+
     // ── Synthesize ────────────────────────────────────────────────────
     const { synthesize } = await import("../tts/synthesize.js");
     const spin = args.json ? null : clack.spinner();
-    spin?.start(`Generating speech with ${c.accent(voice)}...`);
+    spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`);
 
     try {
       const result = await synthesize(text, output, {
         voice,
         speed,
+        lang,
         onProgress: spin ? (msg) => spin.message(msg) : undefined,
       });
 
@@ -112,6 +162,8 @@ export default defineCommand({
             ok: true,
             voice,
             speed,
+            lang: result.lang,
+            langApplied: result.langApplied,
             durationSeconds: result.durationSeconds,
             outputPath: result.outputPath,
           }),
@@ -122,6 +174,13 @@ export default defineCommand({
             `Generated ${c.accent(result.durationSeconds.toFixed(1) + "s")} of speech → ${c.accent(result.outputPath)}`,
           ),
         );
+        if (args.lang != null && !result.langApplied) {
+          console.log(
+            c.dim(
+              "  Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.",
+            ),
+          );
+        }
       }
     } catch (err) {
       const message = err instanceof Error ? err.message : String(err);
@@ -147,16 +206,20 @@ function listVoices(json: boolean): void {
 
   console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
   console.log(
-    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}   ${c.dim("Gender")}`,
+    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}   ${c.dim("Lang code")}  ${c.dim("Gender")}`,
   );
-  console.log(`  ${c.dim("─".repeat(60))}`);
+  console.log(`  ${c.dim("─".repeat(72))}`);
   for (const v of BUNDLED_VOICES) {
     const id = v.id.padEnd(18);
     const label = v.label.padEnd(13);
     const lang = v.language.padEnd(10);
-    console.log(`  ${c.accent(id)} ${label} ${lang} ${v.gender}`);
+    const code = v.defaultLang.padEnd(10);
+    console.log(`  ${c.accent(id)} ${label} ${lang} ${code} ${v.gender}`);
   }
   console.log(
-    `\n  ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}\n`,
+    `\n  ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
+  );
+  console.log(
+    `  ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`,
   );
 }
diff --git a/packages/cli/src/tts/manager.test.ts b/packages/cli/src/tts/manager.test.ts
new file mode 100644
index 000000000..4af622863
--- /dev/null
+++ b/packages/cli/src/tts/manager.test.ts
@@ -0,0 +1,69 @@
+import { describe, expect, it } from "vitest";
+import {
+  BUNDLED_VOICES,
+  SUPPORTED_LANGS,
+  inferLangFromVoiceId,
+  isSupportedLang,
+} from "./manager.js";
+
+describe("inferLangFromVoiceId", () => {
+  it.each([
+    ["af_heart", "en-us"],
+    ["am_adam", "en-us"],
+    ["bf_emma", "en-gb"],
+    ["bm_george", "en-gb"],
+    ["ef_dora", "es"],
+    ["ff_siwis", "fr-fr"],
+    ["hf_alpha", "hi"],
+    ["if_sara", "it"],
+    ["jf_alpha", "ja"],
+    ["pf_dora", "pt-br"],
+    ["zf_xiaobei", "zh"],
+  ])("maps voice %s to lang %s", (voiceId, expected) => {
+    expect(inferLangFromVoiceId(voiceId)).toBe(expected);
+  });
+
+  it("falls back to en-us for unknown prefixes", () => {
+    expect(inferLangFromVoiceId("xf_test")).toBe("en-us");
+    expect(inferLangFromVoiceId("")).toBe("en-us");
+  });
+
+  it("is case-insensitive on the prefix letter", () => {
+    expect(inferLangFromVoiceId("EF_dora")).toBe("es");
+    expect(inferLangFromVoiceId("ZF_xiaobei")).toBe("zh");
+  });
+});
+
+describe("isSupportedLang", () => {
+  it("accepts every value in SUPPORTED_LANGS", () => {
+    for (const lang of SUPPORTED_LANGS) {
+      expect(isSupportedLang(lang)).toBe(true);
+    }
+  });
+
+  it("rejects invalid or misspelled lang codes", () => {
+    expect(isSupportedLang("english")).toBe(false);
+    expect(isSupportedLang("EN-US")).toBe(false); // case-sensitive by design
+    expect(isSupportedLang("de")).toBe(false);
+    expect(isSupportedLang("")).toBe(false);
+  });
+});
+
+describe("BUNDLED_VOICES", () => {
+  it("attaches a valid defaultLang to every voice", () => {
+    for (const voice of BUNDLED_VOICES) {
+      expect(isSupportedLang(voice.defaultLang)).toBe(true);
+      expect(voice.defaultLang).toBe(inferLangFromVoiceId(voice.id));
+    }
+  });
+
+  it("exposes at least one voice per non-English language", () => {
+    // Regression guard: --lang is user-facing, so the voice list must give
+    // users a working example in at least the most common non-English locales.
+    const langs = new Set(BUNDLED_VOICES.map((v) => v.defaultLang));
+    expect(langs.has("es")).toBe(true);
+    expect(langs.has("fr-fr")).toBe(true);
+    expect(langs.has("ja")).toBe(true);
+    expect(langs.has("zh")).toBe(true);
+  });
+});
diff --git a/packages/cli/src/tts/manager.ts b/packages/cli/src/tts/manager.ts
index cc945bc21..469a2d874 100644
--- a/packages/cli/src/tts/manager.ts
+++ b/packages/cli/src/tts/manager.ts
@@ -17,6 +17,60 @@ const MODEL_URLS: Record<string, string> = {
 const VOICES_URL =
   "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";
 
+// ---------------------------------------------------------------------------
+// Languages — Kokoro's phonemizer supports these locale codes. The second
+// letter of a voice ID is gender; the first letter is language. This list
+// mirrors what misaki (English) and espeak-ng (everything else) accept.
+// ---------------------------------------------------------------------------
+
+export const SUPPORTED_LANGS = [
+  "en-us",
+  "en-gb",
+  "es",
+  "fr-fr",
+  "hi",
+  "it",
+  "pt-br",
+  "ja",
+  "zh",
+] as const;
+
+export type SupportedLang = (typeof SUPPORTED_LANGS)[number];
+
+const DEFAULT_LANG: SupportedLang = "en-us";
+
+// First letter of a Kokoro voice ID → phonemizer locale.
+// See https://github.com/hexgrad/kokoro for the full voice catalog.
+const VOICE_PREFIX_LANG: Record<string, SupportedLang> = {
+  a: "en-us", // American English
+  b: "en-gb", // British English
+  e: "es", // Spanish
+  f: "fr-fr", // French
+  h: "hi", // Hindi
+  i: "it", // Italian
+  j: "ja", // Japanese
+  p: "pt-br", // Brazilian Portuguese
+  z: "zh", // Mandarin
+};
+
+/**
+ * Infer the phonemizer language from a Kokoro voice ID prefix.
+ *
+ * Kokoro voice IDs are `<lang><gender>_<name>` where `<lang>` is a single
+ * letter: a=American, b=British, e=Spanish, f=French, h=Hindi, i=Italian,
+ * j=Japanese, p=Brazilian Portuguese, z=Mandarin. Unknown prefixes fall
+ * back to `en-us` — the safe default for Kokoro's English-trained text
+ * frontend.
+ */
+export function inferLangFromVoiceId(voiceId: string): SupportedLang {
+  const first = voiceId.charAt(0).toLowerCase();
+  return VOICE_PREFIX_LANG[first] ?? DEFAULT_LANG;
+}
+
+export function isSupportedLang(value: string): value is SupportedLang {
+  return (SUPPORTED_LANGS as readonly string[]).includes(value);
+}
+
 // ---------------------------------------------------------------------------
 // Voices — Kokoro ships 54 voices across 8 languages. We expose a curated
 // default set and allow users to specify any valid Kokoro voice ID.
@@ -27,17 +81,32 @@ export interface VoiceInfo {
   label: string;
   language: string;
   gender: "female" | "male";
+  /** Phonemizer locale for this voice. Derived from the ID prefix. */
+  defaultLang: SupportedLang;
+}
+
+function makeVoice(
+  id: string,
+  label: string,
+  language: string,
+  gender: "female" | "male",
+): VoiceInfo {
+  return { id, label, language, gender, defaultLang: inferLangFromVoiceId(id) };
 }
 
 export const BUNDLED_VOICES: VoiceInfo[] = [
-  { id: "af_heart", label: "Heart", language: "en-US", gender: "female" },
-  { id: "af_nova", label: "Nova", language: "en-US", gender: "female" },
-  { id: "af_sky", label: "Sky", language: "en-US", gender: "female" },
-  { id: "am_adam", label: "Adam", language: "en-US", gender: "male" },
-  { id: "am_michael", label: "Michael", language: "en-US", gender: "male" },
-  { id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" },
-  { id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" },
-  { id: "bm_george", label: "George", language: "en-GB", gender: "male" },
+  makeVoice("af_heart", "Heart", "en-US", "female"),
+  makeVoice("af_nova", "Nova", "en-US", "female"),
+  makeVoice("af_sky", "Sky", "en-US", "female"),
+  makeVoice("am_adam", "Adam", "en-US", "male"),
+  makeVoice("am_michael", "Michael", "en-US", "male"),
+  makeVoice("bf_emma", "Emma", "en-GB", "female"),
+  makeVoice("bf_isabella", "Isabella", "en-GB", "female"),
+  makeVoice("bm_george", "George", "en-GB", "male"),
+  makeVoice("ef_dora", "Dora", "es", "female"),
+  makeVoice("ff_siwis", "Siwis", "fr-FR", "female"),
+  makeVoice("jf_alpha", "Alpha", "ja", "female"),
+  makeVoice("zf_xiaobei", "Xiaobei", "zh", "female"),
 ];
 
 export const DEFAULT_VOICE = "af_heart";
diff --git a/packages/cli/src/tts/synthesize.ts b/packages/cli/src/tts/synthesize.ts
index e2a5984fa..2629989c1 100644
--- a/packages/cli/src/tts/synthesize.ts
+++ b/packages/cli/src/tts/synthesize.ts
@@ -2,7 +2,13 @@ import { execFileSync } from "node:child_process";
 import { existsSync, writeFileSync, mkdirSync } from "node:fs";
 import { join, dirname } from "node:path";
 import { homedir } from "node:os";
-import { ensureModel, ensureVoices, DEFAULT_VOICE } from "./manager.js";
+import {
+  ensureModel,
+  ensureVoices,
+  DEFAULT_VOICE,
+  inferLangFromVoiceId,
+  type SupportedLang,
+} from "./manager.js";
 
 // ---------------------------------------------------------------------------
 // Python runtime detection
@@ -54,8 +60,11 @@ function hasPythonPackage(python: string, pkg: string): boolean {
 // Inline Python script for Kokoro synthesis
 // ---------------------------------------------------------------------------
 
+// Kokoro-onnx added the `lang=` kwarg to `Kokoro.create()` in a later release.
+// We pass it conditionally so older installs that only accept `voice=`/`speed=`
+// continue to work (falling back to Kokoro's default phonemization).
 const SYNTH_SCRIPT = `
-import sys, json
+import sys, json, inspect
 
 model_path = sys.argv[1]
 voices_path = sys.argv[2]
@@ -63,12 +72,19 @@ text = sys.argv[3]
 voice = sys.argv[4]
 speed = float(sys.argv[5])
 output_path = sys.argv[6]
+lang = sys.argv[7] if len(sys.argv) > 7 else ""
 
 import kokoro_onnx
 import soundfile as sf
 
 model = kokoro_onnx.Kokoro(model_path, voices_path)
-samples, sample_rate = model.create(text, voice=voice, speed=speed)
+
+kwargs = {"voice": voice, "speed": speed}
+supports_lang = "lang" in inspect.signature(model.create).parameters
+if lang and supports_lang:
+    kwargs["lang"] = lang
+
+samples, sample_rate = model.create(text, **kwargs)
 sf.write(output_path, samples, sample_rate)
 
 duration = len(samples) / sample_rate
@@ -76,12 +92,16 @@ print(json.dumps({
     "outputPath": output_path,
     "sampleRate": sample_rate,
     "durationSeconds": round(duration, 3),
+    "lang": lang if (lang and supports_lang) else None,
+    "langApplied": bool(lang and supports_lang),
 }))
 `;
 
-// Cache the script to avoid rewriting it on every invocation
+// Cache the script to avoid rewriting it on every invocation.
+// The filename carries a version suffix so older installs automatically
+// upgrade when the script body changes (e.g., adding the `lang` kwarg).
 const SCRIPT_DIR = join(homedir(), ".cache", "hyperframes", "tts");
-const SCRIPT_PATH = join(SCRIPT_DIR, "synth.py");
+const SCRIPT_PATH = join(SCRIPT_DIR, "synth-v2.py");
 
 function ensureSynthScript(): string {
   if (!existsSync(SCRIPT_PATH)) {
@@ -99,6 +119,12 @@ export interface SynthesizeOptions {
   model?: string;
   voice?: string;
   speed?: number;
+  /**
+   * Phonemizer locale. When omitted, inferred from the voice ID prefix
+   * (e.g., `ef_dora` → `es`). Pass explicitly to override — for example,
+   * reading English text with a French voice as a stylization.
+   */
+  lang?: SupportedLang;
   onProgress?: (message: string) => void;
 }
 
@@ -106,6 +132,10 @@ export interface SynthesizeResult {
   outputPath: string;
   sampleRate: number;
   durationSeconds: number;
+  /** Language actually applied during synthesis, or null if kokoro-onnx silently ignored it. */
+  lang: SupportedLang | null;
+  /** False when the installed kokoro-onnx version does not support the `lang` kwarg. */
+  langApplied: boolean;
 }
 
 /**
@@ -118,6 +148,7 @@ export async function synthesize(
 ): Promise<SynthesizeResult> {
   const voice = options?.voice ?? DEFAULT_VOICE;
   const speed = options?.speed ?? 1.0;
+  const lang: SupportedLang = options?.lang ?? inferLangFromVoiceId(voice);
 
   // 1. Ensure Python 3 is available with kokoro-onnx
   options?.onProgress?.("Checking Python runtime...");
@@ -151,11 +182,11 @@ export async function synthesize(
   mkdirSync(dirname(outputPath), { recursive: true });
 
   // 5. Run synthesis
-  options?.onProgress?.(`Generating speech with voice ${voice}...`);
+  options?.onProgress?.(`Generating speech with voice ${voice} (${lang})...`);
   try {
     const stdout = execFileSync(
       python,
-      [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath],
+      [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath, lang],
       {
         encoding: "utf-8",
         timeout: 300_000,
@@ -170,13 +201,20 @@ export async function synthesize(
     // Parse the last line of stdout as JSON (in case Python printed warnings before it)
     const lines = stdout.trim().split("\n");
     const jsonLine = lines[lines.length - 1] ?? "";
-    const result: { outputPath: string; sampleRate: number; durationSeconds: number } =
-      JSON.parse(jsonLine);
+    const result: {
+      outputPath: string;
+      sampleRate: number;
+      durationSeconds: number;
+      lang: SupportedLang | null;
+      langApplied: boolean;
+    } = JSON.parse(jsonLine);
 
     return {
       outputPath: result.outputPath,
       sampleRate: result.sampleRate,
       durationSeconds: result.durationSeconds,
+      lang: result.lang,
+      langApplied: result.langApplied,
     };
   } catch (err: unknown) {
     // If the error is our own JSON parse failure but the file was created,
diff --git a/skills/hyperframes/references/tts.md b/skills/hyperframes/references/tts.md
index ee94993a7..c403564d8 100644
--- a/skills/hyperframes/references/tts.md
+++ b/skills/hyperframes/references/tts.md
@@ -16,6 +16,25 @@ Match voice to content. Default is `af_heart`.
 
 Run `npx hyperframes tts --list` for all 54 voices (8 languages).
 
+## Multilingual Phonemization
+
+Kokoro voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from that prefix — you don't need to pass `--lang` when the voice matches the text.
+
+```bash
+npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav
+npx hyperframes tts "今日はいい天気ですね" --voice jf_alpha --output ja.wav
+```
+
+Use `--lang` only to override auto-detection (e.g. stylized accents):
+
+```bash
+npx hyperframes tts "Hello there" --voice af_heart --lang fr-fr --output accented.wav
+```
+
+Valid `--lang` codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`.
+
+Non-English phonemization requires `espeak-ng` installed system-wide (`brew install espeak-ng` on macOS, `apt-get install espeak-ng` on Debian/Ubuntu).
+
 ## Speed Tuning
 
 - **0.7-0.8** — Tutorial, complex content

From 74cd4738c67133f216b6cb483351f2c4c730bac6 Mon Sep 17 00:00:00 2001
From: James <james.russo@heygen.com>
Date: Mon, 20 Apr 2026 16:55:21 +0000
Subject: [PATCH 2/2] refactor(cli): simplify tts --lang implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Post-review cleanup on #351. Net -21 lines.

- Drop `defaultLang` field + `makeVoice()` helper from VoiceInfo —
  compute via `inferLangFromVoiceId(v.id)` at read time in listVoices.
  The only reader was the --list table; caching the derived value on
  every voice added a self-consistency invariant we had to test.
- Drop redundant `lang` field from SynthesizeResult — caller already
  knows the requested lang since it passed it in; only `langApplied`
  carries information the caller can't derive.
- Use `errorBox` for --lang validation to match the house style in
  render.ts (other validation errors already use errorBox).
- Reuse existing `langList` module constant in the validation error
  instead of re-joining SUPPORTED_LANGS.
- Inline `DEFAULT_LANG` — used once in inferLangFromVoiceId.
- Trim WHAT-restating comments and the duplicate prefix-enumeration
  JSDoc on inferLangFromVoiceId (VOICE_PREFIX_LANG already carries
  per-row comments).
- Clean up orphaned `synth*.py` files in ~/.cache/hyperframes/tts
  when writing the current versioned script, so repeated upgrades
  don't leak files.
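- Drop the `EN-US` case-sensitive-rejection test assertion — the CLI
  lowercases `--lang` before calling `isSupportedLang`, so mixed-case
  input is accepted end-to-end and the helper's case sensitivity is an
  implementation detail, not a contract worth pinning in a test.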

Tests: 16/16 in `manager.test.ts`, 127/127 full CLI suite pass.
Lint + format + typecheck clean.
---
 packages/cli/src/commands/tts.ts     | 32 +++++++--------
 packages/cli/src/tts/manager.test.ts | 14 ++-----
 packages/cli/src/tts/manager.ts      | 60 ++++++++++------------------
 packages/cli/src/tts/synthesize.ts   | 25 ++++++++----
 4 files changed, 55 insertions(+), 76 deletions(-)

diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts
index 9ede90c37..bf0bc03c1 100644
--- a/packages/cli/src/commands/tts.ts
+++ b/packages/cli/src/commands/tts.ts
@@ -21,6 +21,7 @@ export const examples: Example[] = [
 import { resolve, extname } from "node:path";
 import * as clack from "@clack/prompts";
 import { c } from "../ui/colors.js";
+import { errorBox } from "../ui/format.js";
 import {
   DEFAULT_VOICE,
   BUNDLED_VOICES,
@@ -115,26 +116,19 @@ export default defineCommand({
       process.exit(1);
     }
 
-    // ── Resolve lang (explicit > inferred from voice prefix) ──────────
     const inferredLang = inferLangFromVoiceId(voice);
     let lang: SupportedLang = inferredLang;
     if (args.lang != null) {
       const requested = String(args.lang).toLowerCase();
       if (!isSupportedLang(requested)) {
-        console.error(
-          c.error(
-            `Unsupported --lang "${args.lang}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`,
-          ),
-        );
+        errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`);
         process.exit(1);
       }
       lang = requested;
     }
 
-    // Info-level notice when explicit lang differs from voice-implied lang.
-    // Not an error — users sometimes want this intentionally (e.g. reading
-    // English text with a French voice for a stylized accent). Suppress
-    // when --json so machine-readable output stays clean.
+    // Mismatched voice/lang is a valid stylization (English text, French
+    // phonemization for accent), so this is a hint, not an error.
     if (!args.json && args.lang != null && lang !== inferredLang) {
       console.log(
         c.dim(
@@ -162,7 +156,7 @@ export default defineCommand({
             ok: true,
             voice,
             speed,
-            lang: result.lang,
+            lang,
             langApplied: result.langApplied,
             durationSeconds: result.durationSeconds,
             outputPath: result.outputPath,
@@ -199,8 +193,10 @@ export default defineCommand({
 // ---------------------------------------------------------------------------
 
 function listVoices(json: boolean): void {
+  const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) }));
+
   if (json) {
-    console.log(JSON.stringify(BUNDLED_VOICES));
+    console.log(JSON.stringify(rows));
     return;
   }
 
@@ -209,12 +205,12 @@ function listVoices(json: boolean): void {
     `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}   ${c.dim("Lang code")}  ${c.dim("Gender")}`,
   );
   console.log(`  ${c.dim("─".repeat(72))}`);
-  for (const v of BUNDLED_VOICES) {
-    const id = v.id.padEnd(18);
-    const label = v.label.padEnd(13);
-    const lang = v.language.padEnd(10);
-    const code = v.defaultLang.padEnd(10);
-    console.log(`  ${c.accent(id)} ${label} ${lang} ${code} ${v.gender}`);
+  for (const row of rows) {
+    const id = row.id.padEnd(18);
+    const label = row.label.padEnd(13);
+    const lang = row.language.padEnd(10);
+    const code = row.defaultLang.padEnd(10);
+    console.log(`  ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`);
   }
   console.log(
     `\n  ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
diff --git a/packages/cli/src/tts/manager.test.ts b/packages/cli/src/tts/manager.test.ts
index 4af622863..2ea4d9238 100644
--- a/packages/cli/src/tts/manager.test.ts
+++ b/packages/cli/src/tts/manager.test.ts
@@ -43,24 +43,16 @@ describe("isSupportedLang", () => {
 
   it("rejects invalid or misspelled lang codes", () => {
     expect(isSupportedLang("english")).toBe(false);
-    expect(isSupportedLang("EN-US")).toBe(false); // case-sensitive by design
     expect(isSupportedLang("de")).toBe(false);
     expect(isSupportedLang("")).toBe(false);
   });
 });
 
 describe("BUNDLED_VOICES", () => {
-  it("attaches a valid defaultLang to every voice", () => {
-    for (const voice of BUNDLED_VOICES) {
-      expect(isSupportedLang(voice.defaultLang)).toBe(true);
-      expect(voice.defaultLang).toBe(inferLangFromVoiceId(voice.id));
-    }
-  });
-
+  // --lang is user-facing, so the voice list must give users a working
+  // example in at least the most common non-English locales.
   it("exposes at least one voice per non-English language", () => {
-    // Regression guard: --lang is user-facing, so the voice list must give
-    // users a working example in at least the most common non-English locales.
-    const langs = new Set(BUNDLED_VOICES.map((v) => v.defaultLang));
+    const langs = new Set(BUNDLED_VOICES.map((v) => inferLangFromVoiceId(v.id)));
     expect(langs.has("es")).toBe(true);
     expect(langs.has("fr-fr")).toBe(true);
     expect(langs.has("ja")).toBe(true);
diff --git a/packages/cli/src/tts/manager.ts b/packages/cli/src/tts/manager.ts
index 469a2d874..fa7e3760f 100644
--- a/packages/cli/src/tts/manager.ts
+++ b/packages/cli/src/tts/manager.ts
@@ -17,12 +17,9 @@ const MODEL_URLS: Record<string, string> = {
 const VOICES_URL =
   "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";
 
-// ---------------------------------------------------------------------------
-// Languages — Kokoro's phonemizer supports these locale codes. The second
-// letter of a voice ID is gender; the first letter is language. This list
-// mirrors what misaki (English) and espeak-ng (everything else) accept.
-// ---------------------------------------------------------------------------
-
+// Locale codes accepted by Kokoro's phonemizer (misaki for English,
+// espeak-ng for everything else). Kept as a readonly tuple so the union
+// type below stays driven by this single source.
 export const SUPPORTED_LANGS = [
   "en-us",
   "en-gb",
@@ -37,10 +34,8 @@ export const SUPPORTED_LANGS = [
 
 export type SupportedLang = (typeof SUPPORTED_LANGS)[number];
 
-const DEFAULT_LANG: SupportedLang = "en-us";
-
-// First letter of a Kokoro voice ID → phonemizer locale.
-// See https://github.com/hexgrad/kokoro for the full voice catalog.
+// Kokoro voice IDs are `<lang><gender>_<name>` — the first letter is
+// language, the second is gender. See https://github.com/hexgrad/kokoro.
 const VOICE_PREFIX_LANG: Record<string, SupportedLang> = {
   a: "en-us", // American English
   b: "en-gb", // British English
@@ -55,16 +50,12 @@ const VOICE_PREFIX_LANG: Record<string, SupportedLang> = {
 
 /**
  * Infer the phonemizer language from a Kokoro voice ID prefix.
- *
- * Kokoro voice IDs are `<lang><gender>_<name>` where `<lang>` is a single
- * letter: a=American, b=British, e=Spanish, f=French, h=Hindi, i=Italian,
- * j=Japanese, p=Brazilian Portuguese, z=Mandarin. Unknown prefixes fall
- * back to `en-us` — the safe default for Kokoro's English-trained text
- * frontend.
+ * The prefix is matched case-insensitively; unknown or empty prefixes fall
+ * back to `en-us`, the safe default for Kokoro's English-trained text frontend.
  */
 export function inferLangFromVoiceId(voiceId: string): SupportedLang {
   const first = voiceId.charAt(0).toLowerCase();
-  return VOICE_PREFIX_LANG[first] ?? DEFAULT_LANG;
+  return VOICE_PREFIX_LANG[first] ?? "en-us";
 }
 
 export function isSupportedLang(value: string): value is SupportedLang {
@@ -81,32 +72,21 @@ export interface VoiceInfo {
   label: string;
   language: string;
   gender: "female" | "male";
-  /** Phonemizer locale for this voice. Derived from the ID prefix. */
-  defaultLang: SupportedLang;
-}
-
-function makeVoice(
-  id: string,
-  label: string,
-  language: string,
-  gender: "female" | "male",
-): VoiceInfo {
-  return { id, label, language, gender, defaultLang: inferLangFromVoiceId(id) };
 }
 
 export const BUNDLED_VOICES: VoiceInfo[] = [
-  makeVoice("af_heart", "Heart", "en-US", "female"),
-  makeVoice("af_nova", "Nova", "en-US", "female"),
-  makeVoice("af_sky", "Sky", "en-US", "female"),
-  makeVoice("am_adam", "Adam", "en-US", "male"),
-  makeVoice("am_michael", "Michael", "en-US", "male"),
-  makeVoice("bf_emma", "Emma", "en-GB", "female"),
-  makeVoice("bf_isabella", "Isabella", "en-GB", "female"),
-  makeVoice("bm_george", "George", "en-GB", "male"),
-  makeVoice("ef_dora", "Dora", "es", "female"),
-  makeVoice("ff_siwis", "Siwis", "fr-FR", "female"),
-  makeVoice("jf_alpha", "Alpha", "ja", "female"),
-  makeVoice("zf_xiaobei", "Xiaobei", "zh", "female"),
+  { id: "af_heart", label: "Heart", language: "en-US", gender: "female" },
+  { id: "af_nova", label: "Nova", language: "en-US", gender: "female" },
+  { id: "af_sky", label: "Sky", language: "en-US", gender: "female" },
+  { id: "am_adam", label: "Adam", language: "en-US", gender: "male" },
+  { id: "am_michael", label: "Michael", language: "en-US", gender: "male" },
+  { id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" },
+  { id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" },
+  { id: "bm_george", label: "George", language: "en-GB", gender: "male" },
+  { id: "ef_dora", label: "Dora", language: "es", gender: "female" },
+  { id: "ff_siwis", label: "Siwis", language: "fr-FR", gender: "female" },
+  { id: "jf_alpha", label: "Alpha", language: "ja", gender: "female" },
+  { id: "zf_xiaobei", label: "Xiaobei", language: "zh", gender: "female" },
 ];
 
 export const DEFAULT_VOICE = "af_heart";
diff --git a/packages/cli/src/tts/synthesize.ts b/packages/cli/src/tts/synthesize.ts
index 2629989c1..829417914 100644
--- a/packages/cli/src/tts/synthesize.ts
+++ b/packages/cli/src/tts/synthesize.ts
@@ -1,6 +1,6 @@
 import { execFileSync } from "node:child_process";
-import { existsSync, writeFileSync, mkdirSync } from "node:fs";
-import { join, dirname } from "node:path";
+import { existsSync, writeFileSync, mkdirSync, readdirSync, unlinkSync } from "node:fs";
+import { join, dirname, basename } from "node:path";
 import { homedir } from "node:os";
 import {
   ensureModel,
@@ -92,7 +92,6 @@ print(json.dumps({
     "outputPath": output_path,
     "sampleRate": sample_rate,
     "durationSeconds": round(duration, 3),
-    "lang": lang if (lang and supports_lang) else None,
     "langApplied": bool(lang and supports_lang),
 }))
 `;
@@ -107,6 +106,22 @@ function ensureSynthScript(): string {
   if (!existsSync(SCRIPT_PATH)) {
     mkdirSync(SCRIPT_DIR, { recursive: true });
     writeFileSync(SCRIPT_PATH, SYNTH_SCRIPT);
+    // Best-effort: delete other synth scripts (`synth.py`, other `synth-vN.py`)
+    // left by other CLI releases so users don't accumulate stale files in ~/.cache.
+    const currentName = basename(SCRIPT_PATH);
+    try {
+      for (const entry of readdirSync(SCRIPT_DIR)) {
+        if (entry !== currentName && /^synth(-v\d+)?\.py$/.test(entry)) {
+          try {
+            unlinkSync(join(SCRIPT_DIR, entry));
+          } catch {
+            // Ignore — orphan cleanup is best-effort.
+          }
+        }
+      }
+    } catch {
+      // Ignore — directory read is best-effort.
+    }
   }
   return SCRIPT_PATH;
 }
@@ -132,8 +147,6 @@ export interface SynthesizeResult {
   outputPath: string;
   sampleRate: number;
   durationSeconds: number;
-  /** Language actually applied during synthesis, or null if kokoro-onnx silently ignored it. */
-  lang: SupportedLang | null;
   /** False when the installed kokoro-onnx version does not support the `lang` kwarg. */
   langApplied: boolean;
 }
@@ -205,7 +218,6 @@ export async function synthesize(
       outputPath: string;
       sampleRate: number;
       durationSeconds: number;
-      lang: SupportedLang | null;
       langApplied: boolean;
     } = JSON.parse(jsonLine);
 
@@ -213,7 +225,6 @@ export async function synthesize(
       outputPath: result.outputPath,
       sampleRate: result.sampleRate,
       durationSeconds: result.durationSeconds,
-      lang: result.lang,
       langApplied: result.langApplied,
     };
   } catch (err: unknown) {