heygen-com · jrusso1020 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx
@@ -305,6 +305,12 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
     # Adjust speech speed
     npx hyperframes tts "Slow and clear" --speed 0.8
 
+    # Generate Spanish speech (lang auto-detected from the `e` voice prefix)
+    npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav
+
+    # Override the phonemizer (read French text with an American English voice)
+    npx hyperframes tts "Bonjour le monde" --voice af_heart --lang fr-fr
+
     # Read text from a file
     npx hyperframes tts script.txt
 
@@ -317,9 +323,14 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
     | `--output, -o` | Output file path (default: `speech.wav` in current directory) |
     | `--voice, -v` | Voice ID (run `--list` to see options) |
     | `--speed, -s` | Speech speed multiplier (default: 1.0) |
+    | `--lang, -l` | Phonemizer locale (`en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`). When omitted, inferred from the voice ID prefix. |
     | `--list` | List available voices and exit |
     | `--json` | Output result as JSON |
 
+    <Tip>
+      Voice IDs encode the phonemizer language in their first letter (`a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin). `--lang` is only needed when you want to override that — for example, giving English text a French phonemizer for a stylized accent.
+    </Tip>
+
     <Tip>
       Combine `tts` with `transcribe` to generate narration and word-level timestamps for captions in a single workflow: generate the audio with `tts`, then transcribe the output with `transcribe` to get word-level timing.
     </Tip>

diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts
@@ -7,15 +7,32 @@ export const examples: Example[] = [
   ["Choose a voice", 'hyperframes tts "Hello world" --voice am_adam'],
   ["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
   ["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
+  [
+    "Generate Spanish speech",
+    'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav',
+  ],
+  [
+    "Override phonemizer language",
+    'hyperframes tts "Ciao a tutti" --voice af_heart --lang it --output accented.wav',
+  ],
   ["Read text from a file", "hyperframes tts script.txt"],
   ["List available voices", "hyperframes tts --list"],
 ];
 import { resolve, extname } from "node:path";
 import * as clack from "@clack/prompts";
 import { c } from "../ui/colors.js";
-import { DEFAULT_VOICE, BUNDLED_VOICES } from "../tts/manager.js";
+import { errorBox } from "../ui/format.js";
+import {
+  DEFAULT_VOICE,
+  BUNDLED_VOICES,
+  SUPPORTED_LANGS,
+  inferLangFromVoiceId,
+  isSupportedLang,
+  type SupportedLang,
+} from "../tts/manager.js";
 
 const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", ");
+const langList = SUPPORTED_LANGS.join(", ");
 
 export default defineCommand({
   meta: {
@@ -43,6 +60,11 @@ export default defineCommand({
       description: "Speech speed multiplier (default: 1.0)",
       alias: "s",
     },
+    lang: {
+      type: "string",
+      description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`,
+      alias: "l",
+    },
     list: {
       type: "boolean",
       description: "List available voices and exit",
@@ -94,15 +116,37 @@ export default defineCommand({
       process.exit(1);
     }
 
+    const inferredLang = inferLangFromVoiceId(voice);
+    let lang: SupportedLang = inferredLang;
+    if (args.lang != null) {
+      const requested = String(args.lang).toLowerCase();
+      if (!isSupportedLang(requested)) {
+        errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`);
+        process.exit(1);
+      }
+      lang = requested;
+    }
+
+    // Mismatched voice/lang is a valid stylization (English text, French
+    // phonemization for accent), so this is a hint, not an error.
+    if (!args.json && args.lang != null && lang !== inferredLang) {
+      console.log(
+        c.dim(
+          `  Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`,
+        ),
+      );
+    }
+
     // ── Synthesize ────────────────────────────────────────────────────
     const { synthesize } = await import("../tts/synthesize.js");
     const spin = args.json ? null : clack.spinner();
-    spin?.start(`Generating speech with ${c.accent(voice)}...`);
+    spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`);
 
     try {
       const result = await synthesize(text, output, {
         voice,
         speed,
+        lang,
         onProgress: spin ? (msg) => spin.message(msg) : undefined,
       });
 
@@ -112,6 +156,8 @@ export default defineCommand({
             ok: true,
             voice,
             speed,
+            lang,
+            langApplied: result.langApplied,
             durationSeconds: result.durationSeconds,
             outputPath: result.outputPath,
           }),
@@ -122,6 +168,13 @@ export default defineCommand({
             `Generated ${c.accent(result.durationSeconds.toFixed(1) + "s")} of speech → ${c.accent(result.outputPath)}`,
           ),
         );
+        if (args.lang != null && !result.langApplied) {
+          console.log(
+            c.dim(
+              "  Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.",
+            ),
+          );
+        }
       }
     } catch (err) {
       const message = err instanceof Error ? err.message : String(err);
@@ -140,23 +193,29 @@ export default defineCommand({
 // ---------------------------------------------------------------------------
 
 function listVoices(json: boolean): void {
+  const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) }));
+
   if (json) {
-    console.log(JSON.stringify(BUNDLED_VOICES));
+    console.log(JSON.stringify(rows));
     return;
   }
 
   console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
   console.log(
-    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}   ${c.dim("Gender")}`,
+    `  ${c.dim("ID")}                ${c.dim("Name")}         ${c.dim("Language")}   ${c.dim("Lang code")}  ${c.dim("Gender")}`,
   );
-  console.log(`  ${c.dim("─".repeat(60))}`);
-  for (const v of BUNDLED_VOICES) {
-    const id = v.id.padEnd(18);
-    const label = v.label.padEnd(13);
-    const lang = v.language.padEnd(10);
-    console.log(`  ${c.accent(id)} ${label} ${lang} ${v.gender}`);
+  console.log(`  ${c.dim("─".repeat(72))}`);
+  for (const row of rows) {
+    const id = row.id.padEnd(18);
+    const label = row.label.padEnd(13);
+    const lang = row.language.padEnd(10);
+    const code = row.defaultLang.padEnd(10);
+    console.log(`  ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`);
   }
   console.log(
-    `\n  ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}\n`,
+    `\n  ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
+  );
+  console.log(
+    `  ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`,
   );
 }
diff --git a/packages/cli/src/tts/manager.test.ts b/packages/cli/src/tts/manager.test.ts
@@ -0,0 +1,61 @@
+import { describe, expect, it } from "vitest";
+import {
+  BUNDLED_VOICES,
+  SUPPORTED_LANGS,
+  inferLangFromVoiceId,
+  isSupportedLang,
+} from "./manager.js";
+
+describe("inferLangFromVoiceId", () => {
+  it.each([
+    ["af_heart", "en-us"],
+    ["am_adam", "en-us"],
+    ["bf_emma", "en-gb"],
+    ["bm_george", "en-gb"],
+    ["ef_dora", "es"],
+    ["ff_siwis", "fr-fr"],
+    ["hf_alpha", "hi"],
+    ["if_sara", "it"],
+    ["jf_alpha", "ja"],
+    ["pf_dora", "pt-br"],
+    ["zf_xiaobei", "zh"],
+  ])("maps voice %s to lang %s", (voiceId, expected) => {
+    expect(inferLangFromVoiceId(voiceId)).toBe(expected);
+  });
+
+  it("falls back to en-us for unknown prefixes", () => {
+    expect(inferLangFromVoiceId("xf_test")).toBe("en-us");
+    expect(inferLangFromVoiceId("")).toBe("en-us");
+  });
+
+  it("is case-insensitive on the prefix letter", () => {
+    expect(inferLangFromVoiceId("EF_dora")).toBe("es");
+    expect(inferLangFromVoiceId("ZF_xiaobei")).toBe("zh");
+  });
+});
+
+describe("isSupportedLang", () => {
+  it("accepts every value in SUPPORTED_LANGS", () => {
+    for (const lang of SUPPORTED_LANGS) {
+      expect(isSupportedLang(lang)).toBe(true);
+    }
+  });
+
+  it("rejects invalid or misspelled lang codes", () => {
+    expect(isSupportedLang("english")).toBe(false);
+    expect(isSupportedLang("de")).toBe(false);
+    expect(isSupportedLang("")).toBe(false);
+  });
+});
+
+describe("BUNDLED_VOICES", () => {
+  // --lang is user-facing, so the voice list must give users a working
+  // example in at least the most common non-English locales.
+  it("exposes at least one voice per non-English language", () => {
+    const langs = new Set(BUNDLED_VOICES.map((v) => inferLangFromVoiceId(v.id)));
+    expect(langs.has("es")).toBe(true);
+    expect(langs.has("fr-fr")).toBe(true);
+    expect(langs.has("ja")).toBe(true);
+    expect(langs.has("zh")).toBe(true);
+  });
+});
diff --git a/packages/cli/src/tts/manager.ts b/packages/cli/src/tts/manager.ts
@@ -17,6 +17,51 @@ const MODEL_URLS: Record<string, string> = {
 const VOICES_URL =
   "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";
 
+// Locale codes accepted by Kokoro's phonemizer (misaki for English,
+// espeak-ng for everything else). Kept as a readonly tuple so the union
+// type below stays driven by this single source.
+export const SUPPORTED_LANGS = [
+  "en-us",
+  "en-gb",
+  "es",
+  "fr-fr",
+  "hi",
+  "it",
+  "pt-br",
+  "ja",
+  "zh",
+] as const;
+
+export type SupportedLang = (typeof SUPPORTED_LANGS)[number];
+
+// Kokoro voice IDs are `<lang><gender>_<name>` — the first letter is
+// language, the second is gender. See https://github.com/hexgrad/kokoro.
+const VOICE_PREFIX_LANG: Record<string, SupportedLang> = {
+  a: "en-us", // American English
+  b: "en-gb", // British English
+  e: "es", // Spanish
+  f: "fr-fr", // French
+  h: "hi", // Hindi
+  i: "it", // Italian
+  j: "ja", // Japanese
+  p: "pt-br", // Brazilian Portuguese
+  z: "zh", // Mandarin
+};
+
+/**
+ * Infer the phonemizer language from a Kokoro voice ID prefix.
+ * Unknown prefixes fall back to `en-us` — Kokoro's text frontend is
+ * English-trained, so that's the safe default.
+ */
+export function inferLangFromVoiceId(voiceId: string): SupportedLang {
+  const first = voiceId.charAt(0).toLowerCase();
+  return VOICE_PREFIX_LANG[first] ?? "en-us";
+}
+
+export function isSupportedLang(value: string): value is SupportedLang {
+  return (SUPPORTED_LANGS as readonly string[]).includes(value);
+}
+
 // ---------------------------------------------------------------------------
 // Voices — Kokoro ships 54 voices across 8 languages. We expose a curated
 // default set and allow users to specify any valid Kokoro voice ID.
@@ -38,6 +83,10 @@ export const BUNDLED_VOICES: VoiceInfo[] = [
   { id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" },
   { id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" },
   { id: "bm_george", label: "George", language: "en-GB", gender: "male" },
+  { id: "ef_dora", label: "Dora", language: "es", gender: "female" },
+  { id: "ff_siwis", label: "Siwis", language: "fr-FR", gender: "female" },
+  { id: "jf_alpha", label: "Alpha", language: "ja", gender: "female" },
+  { id: "zf_xiaobei", label: "Xiaobei", language: "zh", gender: "female" },
 ];
 
 export const DEFAULT_VOICE = "af_heart";