diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx
index 82192cbfa..ee5df7c11 100644
--- a/docs/packages/cli.mdx
+++ b/docs/packages/cli.mdx
@@ -305,6 +305,12 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
# Adjust speech speed
npx hyperframes tts "Slow and clear" --speed 0.8
+ # Generate Spanish speech (lang auto-detected from the `e` voice prefix)
+ npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav
+
+ # Override the phonemizer (read French text with an American English voice)
+ npx hyperframes tts "Bonjour le monde" --voice af_heart --lang fr-fr
+
# Read text from a file
npx hyperframes tts script.txt
@@ -317,9 +323,14 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
| `--output, -o` | Output file path (default: `speech.wav` in current directory) |
| `--voice, -v` | Voice ID (run `--list` to see options) |
| `--speed, -s` | Speech speed multiplier (default: 1.0) |
+ | `--lang, -l` | Phonemizer locale (`en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`). When omitted, inferred from the voice ID prefix. |
| `--list` | List available voices and exit |
| `--json` | Output result as JSON |
+
+ Voice IDs encode the phonemizer language in their first letter (`a`=American, `b`=British, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin). `--lang` is only needed when you want to override that — for example, giving English text a French phonemizer for a stylized accent.
+
+
Combine `tts` with `transcribe` to generate narration and word-level timestamps for captions in a single workflow: generate the audio with `tts`, then transcribe the output with `transcribe` to get word-level timing.
diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts
index d64f38721..bf0bc03c1 100644
--- a/packages/cli/src/commands/tts.ts
+++ b/packages/cli/src/commands/tts.ts
@@ -7,15 +7,32 @@ export const examples: Example[] = [
["Choose a voice", 'hyperframes tts "Hello world" --voice am_adam'],
["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
+ [
+ "Generate Spanish speech",
+ 'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav',
+ ],
+ [
+ "Override phonemizer language",
+ 'hyperframes tts "Ciao a tutti" --voice af_heart --lang it --output accented.wav',
+ ],
["Read text from a file", "hyperframes tts script.txt"],
["List available voices", "hyperframes tts --list"],
];
import { resolve, extname } from "node:path";
import * as clack from "@clack/prompts";
import { c } from "../ui/colors.js";
-import { DEFAULT_VOICE, BUNDLED_VOICES } from "../tts/manager.js";
+import { errorBox } from "../ui/format.js";
+import {
+ DEFAULT_VOICE,
+ BUNDLED_VOICES,
+ SUPPORTED_LANGS,
+ inferLangFromVoiceId,
+ isSupportedLang,
+ type SupportedLang,
+} from "../tts/manager.js";
const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", ");
+const langList = SUPPORTED_LANGS.join(", ");
export default defineCommand({
meta: {
@@ -43,6 +60,11 @@ export default defineCommand({
description: "Speech speed multiplier (default: 1.0)",
alias: "s",
},
+ lang: {
+ type: "string",
+ description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`,
+ alias: "l",
+ },
list: {
type: "boolean",
description: "List available voices and exit",
@@ -94,15 +116,37 @@ export default defineCommand({
process.exit(1);
}
+ const inferredLang = inferLangFromVoiceId(voice);
+ let lang: SupportedLang = inferredLang;
+ if (args.lang != null) {
+ const requested = String(args.lang).toLowerCase();
+ if (!isSupportedLang(requested)) {
+ errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`);
+ process.exit(1);
+ }
+ lang = requested;
+ }
+
+ // Mismatched voice/lang is a valid stylization (English text, French
+ // phonemization for accent), so this is a hint, not an error.
+ if (!args.json && args.lang != null && lang !== inferredLang) {
+ console.log(
+ c.dim(
+ ` Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`,
+ ),
+ );
+ }
+
// ── Synthesize ────────────────────────────────────────────────────
const { synthesize } = await import("../tts/synthesize.js");
const spin = args.json ? null : clack.spinner();
- spin?.start(`Generating speech with ${c.accent(voice)}...`);
+ spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`);
try {
const result = await synthesize(text, output, {
voice,
speed,
+ lang,
onProgress: spin ? (msg) => spin.message(msg) : undefined,
});
@@ -112,6 +156,8 @@ export default defineCommand({
ok: true,
voice,
speed,
+ lang,
+ langApplied: result.langApplied,
durationSeconds: result.durationSeconds,
outputPath: result.outputPath,
}),
@@ -122,6 +168,13 @@ export default defineCommand({
`Generated ${c.accent(result.durationSeconds.toFixed(1) + "s")} of speech → ${c.accent(result.outputPath)}`,
),
);
+ if (!result.langApplied && (args.lang != null || lang !== "en-us")) { // also warn when an inferred non-en-us locale was dropped
+ console.log(
+ c.dim(
+ " Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.",
+ ),
+ );
+ }
}
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
@@ -140,23 +193,29 @@ export default defineCommand({
// ---------------------------------------------------------------------------
function listVoices(json: boolean): void {
+ const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) }));
+
if (json) {
- console.log(JSON.stringify(BUNDLED_VOICES));
+ console.log(JSON.stringify(rows));
return;
}
console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
console.log(
- ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Gender")}`,
+ ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`,
);
- console.log(` ${c.dim("─".repeat(60))}`);
- for (const v of BUNDLED_VOICES) {
- const id = v.id.padEnd(18);
- const label = v.label.padEnd(13);
- const lang = v.language.padEnd(10);
- console.log(` ${c.accent(id)} ${label} ${lang} ${v.gender}`);
+ console.log(` ${c.dim("─".repeat(72))}`);
+ for (const row of rows) {
+ const id = row.id.padEnd(18);
+ const label = row.label.padEnd(13);
+ const lang = row.language.padEnd(10);
+ const code = row.defaultLang.padEnd(10);
+ console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`);
}
console.log(
- `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}\n`,
+ `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
+ );
+ console.log(
+ ` ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`,
);
}
diff --git a/packages/cli/src/tts/manager.test.ts b/packages/cli/src/tts/manager.test.ts
new file mode 100644
index 000000000..2ea4d9238
--- /dev/null
+++ b/packages/cli/src/tts/manager.test.ts
@@ -0,0 +1,61 @@
+import { describe, expect, it } from "vitest";
+import {
+ BUNDLED_VOICES,
+ SUPPORTED_LANGS,
+ inferLangFromVoiceId,
+ isSupportedLang,
+} from "./manager.js";
+
+describe("inferLangFromVoiceId", () => {
+ it.each([
+ ["af_heart", "en-us"],
+ ["am_adam", "en-us"],
+ ["bf_emma", "en-gb"],
+ ["bm_george", "en-gb"],
+ ["ef_dora", "es"],
+ ["ff_siwis", "fr-fr"],
+ ["hf_alpha", "hi"],
+ ["if_sara", "it"],
+ ["jf_alpha", "ja"],
+ ["pf_dora", "pt-br"],
+ ["zf_xiaobei", "zh"],
+ ])("maps voice %s to lang %s", (voiceId, expected) => {
+ expect(inferLangFromVoiceId(voiceId)).toBe(expected);
+ });
+
+ it("falls back to en-us for unknown prefixes", () => {
+ expect(inferLangFromVoiceId("xf_test")).toBe("en-us");
+ expect(inferLangFromVoiceId("")).toBe("en-us");
+ });
+
+ it("is case-insensitive on the prefix letter", () => {
+ expect(inferLangFromVoiceId("EF_dora")).toBe("es");
+ expect(inferLangFromVoiceId("ZF_xiaobei")).toBe("zh");
+ });
+});
+
+describe("isSupportedLang", () => {
+ it("accepts every value in SUPPORTED_LANGS", () => {
+ for (const lang of SUPPORTED_LANGS) {
+ expect(isSupportedLang(lang)).toBe(true);
+ }
+ });
+
+ it("rejects invalid or misspelled lang codes", () => {
+ expect(isSupportedLang("english")).toBe(false);
+ expect(isSupportedLang("de")).toBe(false);
+ expect(isSupportedLang("")).toBe(false);
+ });
+});
+
+describe("BUNDLED_VOICES", () => {
+ // --lang is user-facing, so the voice list must give users a working
+ // example in at least the most common non-English locales.
+ it("exposes at least one voice for es, fr-fr, ja, and zh", () => {
+ const langs = new Set(BUNDLED_VOICES.map((v) => inferLangFromVoiceId(v.id)));
+ expect(langs.has("es")).toBe(true);
+ expect(langs.has("fr-fr")).toBe(true);
+ expect(langs.has("ja")).toBe(true);
+ expect(langs.has("zh")).toBe(true);
+ });
+});
diff --git a/packages/cli/src/tts/manager.ts b/packages/cli/src/tts/manager.ts
index cc945bc21..fa7e3760f 100644
--- a/packages/cli/src/tts/manager.ts
+++ b/packages/cli/src/tts/manager.ts
@@ -17,6 +17,51 @@ const MODEL_URLS: Record = {
const VOICES_URL =
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";
+// Locale codes accepted by Kokoro's phonemizer (misaki for English,
+// espeak-ng for everything else). Kept as a readonly tuple so the union
+// type below stays driven by this single source.
+export const SUPPORTED_LANGS = [
+ "en-us",
+ "en-gb",
+ "es",
+ "fr-fr",
+ "hi",
+ "it",
+ "pt-br",
+ "ja",
+ "zh",
+] as const;
+
+export type SupportedLang = (typeof SUPPORTED_LANGS)[number];
+
+// Kokoro voice IDs are `<lang><gender>_<name>` — the first letter is
+// language, the second is gender. See https://github.com/hexgrad/kokoro.
+const VOICE_PREFIX_LANG: Record<string, SupportedLang> = {
+ a: "en-us", // American English
+ b: "en-gb", // British English
+ e: "es", // Spanish
+ f: "fr-fr", // French
+ h: "hi", // Hindi
+ i: "it", // Italian
+ j: "ja", // Japanese
+ p: "pt-br", // Brazilian Portuguese
+ z: "zh", // Mandarin
+};
+
+/**
+ * Resolve the phonemizer locale from the first letter of a Kokoro voice ID
+ * (case-insensitive). Unmapped letters and empty IDs yield `en-us`, the
+ * safe default because Kokoro's text frontend is English-trained.
+ */
+export function inferLangFromVoiceId(voiceId: string): SupportedLang {
+ const mapped = VOICE_PREFIX_LANG[voiceId.charAt(0).toLowerCase()];
+ return mapped ?? "en-us";
+}
+
+export function isSupportedLang(value: string): value is SupportedLang {
+ return SUPPORTED_LANGS.some((code) => code === value); // exact, case-sensitive match
+}
+
// ---------------------------------------------------------------------------
// Voices — Kokoro ships 54 voices across 8 languages. We expose a curated
// default set and allow users to specify any valid Kokoro voice ID.
@@ -38,6 +83,10 @@ export const BUNDLED_VOICES: VoiceInfo[] = [
{ id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" },
{ id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" },
{ id: "bm_george", label: "George", language: "en-GB", gender: "male" },
+ { id: "ef_dora", label: "Dora", language: "es", gender: "female" },
+ { id: "ff_siwis", label: "Siwis", language: "fr-FR", gender: "female" },
+ { id: "jf_alpha", label: "Alpha", language: "ja", gender: "female" },
+ { id: "zf_xiaobei", label: "Xiaobei", language: "zh", gender: "female" },
];
export const DEFAULT_VOICE = "af_heart";
diff --git a/packages/cli/src/tts/synthesize.ts b/packages/cli/src/tts/synthesize.ts
index e2a5984fa..829417914 100644
--- a/packages/cli/src/tts/synthesize.ts
+++ b/packages/cli/src/tts/synthesize.ts
@@ -1,8 +1,14 @@
import { execFileSync } from "node:child_process";
-import { existsSync, writeFileSync, mkdirSync } from "node:fs";
-import { join, dirname } from "node:path";
+import { existsSync, writeFileSync, mkdirSync, readdirSync, unlinkSync } from "node:fs";
+import { join, dirname, basename } from "node:path";
import { homedir } from "node:os";
-import { ensureModel, ensureVoices, DEFAULT_VOICE } from "./manager.js";
+import {
+ ensureModel,
+ ensureVoices,
+ DEFAULT_VOICE,
+ inferLangFromVoiceId,
+ type SupportedLang,
+} from "./manager.js";
// ---------------------------------------------------------------------------
// Python runtime detection
@@ -54,8 +60,11 @@ function hasPythonPackage(python: string, pkg: string): boolean {
// Inline Python script for Kokoro synthesis
// ---------------------------------------------------------------------------
+// Kokoro-onnx added the `lang=` kwarg to `Kokoro.create()` in a later release.
+// We pass it conditionally so older installs that only accept `voice=`/`speed=`
+// continue to work (falling back to Kokoro's default phonemization).
const SYNTH_SCRIPT = `
-import sys, json
+import sys, json, inspect
model_path = sys.argv[1]
voices_path = sys.argv[2]
@@ -63,12 +72,19 @@ text = sys.argv[3]
voice = sys.argv[4]
speed = float(sys.argv[5])
output_path = sys.argv[6]
+lang = sys.argv[7] if len(sys.argv) > 7 else ""
import kokoro_onnx
import soundfile as sf
model = kokoro_onnx.Kokoro(model_path, voices_path)
-samples, sample_rate = model.create(text, voice=voice, speed=speed)
+
+kwargs = {"voice": voice, "speed": speed}
+supports_lang = "lang" in inspect.signature(model.create).parameters
+if lang and supports_lang:
+ kwargs["lang"] = lang
+
+samples, sample_rate = model.create(text, **kwargs)
sf.write(output_path, samples, sample_rate)
duration = len(samples) / sample_rate
@@ -76,17 +92,36 @@ print(json.dumps({
"outputPath": output_path,
"sampleRate": sample_rate,
"durationSeconds": round(duration, 3),
+ "langApplied": bool(lang and supports_lang),
}))
`;
-// Cache the script to avoid rewriting it on every invocation
+// Cache the script to avoid rewriting it on every invocation.
+// The filename carries a version suffix so older installs automatically
+// upgrade when the script body changes (e.g., adding the `lang` kwarg).
const SCRIPT_DIR = join(homedir(), ".cache", "hyperframes", "tts");
-const SCRIPT_PATH = join(SCRIPT_DIR, "synth.py");
+const SCRIPT_PATH = join(SCRIPT_DIR, "synth-v2.py");
function ensureSynthScript(): string {
if (!existsSync(SCRIPT_PATH)) {
mkdirSync(SCRIPT_DIR, { recursive: true });
writeFileSync(SCRIPT_PATH, SYNTH_SCRIPT);
+ // Best-effort: delete older versioned scripts left behind by previous
+ // CLI releases so users don't accumulate stale files in ~/.cache.
+ const currentName = basename(SCRIPT_PATH);
+ try {
+ for (const entry of readdirSync(SCRIPT_DIR)) {
+ if (entry !== currentName && /^synth(-v\d+)?\.py$/.test(entry)) {
+ try {
+ unlinkSync(join(SCRIPT_DIR, entry));
+ } catch {
+ // Ignore — orphan cleanup is best-effort.
+ }
+ }
+ }
+ } catch {
+ // Ignore — directory read is best-effort.
+ }
}
return SCRIPT_PATH;
}
@@ -99,6 +134,12 @@ export interface SynthesizeOptions {
model?: string;
voice?: string;
speed?: number;
+ /**
+ * Phonemizer locale. When omitted, inferred from the voice ID prefix
+ * (e.g., `ef_dora` → `es`). Pass explicitly to override — for example,
+ * reading English text with a French voice as a stylization.
+ */
+ lang?: SupportedLang;
onProgress?: (message: string) => void;
}
@@ -106,6 +147,8 @@ export interface SynthesizeResult {
outputPath: string;
sampleRate: number;
durationSeconds: number;
+ /** False when the installed kokoro-onnx version does not support the `lang` kwarg. */
+ langApplied: boolean;
}
/**
@@ -118,6 +161,7 @@ export async function synthesize(
): Promise {
const voice = options?.voice ?? DEFAULT_VOICE;
const speed = options?.speed ?? 1.0;
+ const lang: SupportedLang = options?.lang ?? inferLangFromVoiceId(voice);
// 1. Ensure Python 3 is available with kokoro-onnx
options?.onProgress?.("Checking Python runtime...");
@@ -151,11 +195,11 @@ export async function synthesize(
mkdirSync(dirname(outputPath), { recursive: true });
// 5. Run synthesis
- options?.onProgress?.(`Generating speech with voice ${voice}...`);
+ options?.onProgress?.(`Generating speech with voice ${voice} (${lang})...`);
try {
const stdout = execFileSync(
python,
- [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath],
+ [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath, lang],
{
encoding: "utf-8",
timeout: 300_000,
@@ -170,13 +214,18 @@ export async function synthesize(
// Parse the last line of stdout as JSON (in case Python printed warnings before it)
const lines = stdout.trim().split("\n");
const jsonLine = lines[lines.length - 1] ?? "";
- const result: { outputPath: string; sampleRate: number; durationSeconds: number } =
- JSON.parse(jsonLine);
+ const result: {
+ outputPath: string;
+ sampleRate: number;
+ durationSeconds: number;
+ langApplied: boolean;
+ } = JSON.parse(jsonLine);
return {
outputPath: result.outputPath,
sampleRate: result.sampleRate,
durationSeconds: result.durationSeconds,
+ langApplied: result.langApplied,
};
} catch (err: unknown) {
// If the error is our own JSON parse failure but the file was created,
diff --git a/skills/hyperframes/references/tts.md b/skills/hyperframes/references/tts.md
index ee94993a7..c403564d8 100644
--- a/skills/hyperframes/references/tts.md
+++ b/skills/hyperframes/references/tts.md
@@ -16,6 +16,25 @@ Match voice to content. Default is `af_heart`.
Run `npx hyperframes tts --list` for all 54 voices (8 languages).
+## Multilingual Phonemization
+
+Kokoro voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from that prefix — you don't need to pass `--lang` when the voice matches the text.
+
+```bash
+npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav
+npx hyperframes tts "今日はいい天気ですね" --voice jf_alpha --output ja.wav
+```
+
+Use `--lang` only to override auto-detection (e.g. stylized accents):
+
+```bash
+npx hyperframes tts "Hello there" --voice af_heart --lang fr-fr --output accented.wav
+```
+
+Valid `--lang` codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`.
+
+Non-English phonemization requires `espeak-ng` installed system-wide (`brew install espeak-ng` on macOS, `apt-get install espeak-ng` on Debian/Ubuntu).
+
## Speed Tuning
- **0.7-0.8** — Tutorial, complex content