diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx index 82192cbfa..ee5df7c11 100644 --- a/docs/packages/cli.mdx +++ b/docs/packages/cli.mdx @@ -305,6 +305,12 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_ # Adjust speech speed npx hyperframes tts "Slow and clear" --speed 0.8 + # Generate Spanish speech (lang auto-detected from the `e` voice prefix) + npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav + + # Override the phonemizer (read French text with an American English voice) + npx hyperframes tts "Bonjour le monde" --voice af_heart --lang fr-fr + # Read text from a file npx hyperframes tts script.txt @@ -317,9 +323,14 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_ | `--output, -o` | Output file path (default: `speech.wav` in current directory) | | `--voice, -v` | Voice ID (run `--list` to see options) | | `--speed, -s` | Speech speed multiplier (default: 1.0) | + | `--lang, -l` | Phonemizer locale (`en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`). When omitted, inferred from the voice ID prefix. | | `--list` | List available voices and exit | | `--json` | Output result as JSON | + + Voice IDs encode the phonemizer language in their first letter (`a`=American, `b`=British, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin). `--lang` is only needed when you want to override that — for example, giving English text a French phonemizer for a stylized accent. + + Combine `tts` with `transcribe` to generate narration and word-level timestamps for captions in a single workflow: generate the audio with `tts`, then transcribe the output with `transcribe` to get word-level timing. 
diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts index d64f38721..bf0bc03c1 100644 --- a/packages/cli/src/commands/tts.ts +++ b/packages/cli/src/commands/tts.ts @@ -7,15 +7,32 @@ export const examples: Example[] = [ ["Choose a voice", 'hyperframes tts "Hello world" --voice am_adam'], ["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'], ["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'], + [ + "Generate Spanish speech", + 'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav', + ], + [ + "Override phonemizer language", + 'hyperframes tts "Ciao a tutti" --voice af_heart --lang it --output accented.wav', + ], ["Read text from a file", "hyperframes tts script.txt"], ["List available voices", "hyperframes tts --list"], ]; import { resolve, extname } from "node:path"; import * as clack from "@clack/prompts"; import { c } from "../ui/colors.js"; -import { DEFAULT_VOICE, BUNDLED_VOICES } from "../tts/manager.js"; +import { errorBox } from "../ui/format.js"; +import { + DEFAULT_VOICE, + BUNDLED_VOICES, + SUPPORTED_LANGS, + inferLangFromVoiceId, + isSupportedLang, + type SupportedLang, +} from "../tts/manager.js"; const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", "); +const langList = SUPPORTED_LANGS.join(", "); export default defineCommand({ meta: { @@ -43,6 +60,11 @@ export default defineCommand({ description: "Speech speed multiplier (default: 1.0)", alias: "s", }, + lang: { + type: "string", + description: `Phonemizer language (auto-detected from voice prefix when omitted). 
Options: ${langList}`, + alias: "l", + }, list: { type: "boolean", description: "List available voices and exit", @@ -94,15 +116,37 @@ export default defineCommand({ process.exit(1); } + const inferredLang = inferLangFromVoiceId(voice); + let lang: SupportedLang = inferredLang; + if (args.lang != null) { + const requested = String(args.lang).toLowerCase(); + if (!isSupportedLang(requested)) { + errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`); + process.exit(1); + } + lang = requested; + } + + // Mismatched voice/lang is a valid stylization (English text, French + // phonemization for accent), so this is a hint, not an error. + if (!args.json && args.lang != null && lang !== inferredLang) { + console.log( + c.dim( + ` Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`, + ), + ); + } + // ── Synthesize ──────────────────────────────────────────────────── const { synthesize } = await import("../tts/synthesize.js"); const spin = args.json ? null : clack.spinner(); - spin?.start(`Generating speech with ${c.accent(voice)}...`); + spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`); try { const result = await synthesize(text, output, { voice, speed, + lang, onProgress: spin ? (msg) => spin.message(msg) : undefined, }); @@ -112,6 +156,8 @@ export default defineCommand({ ok: true, voice, speed, + lang, + langApplied: result.langApplied, durationSeconds: result.durationSeconds, outputPath: result.outputPath, }), @@ -122,6 +168,13 @@ export default defineCommand({ `Generated ${c.accent(result.durationSeconds.toFixed(1) + "s")} of speech → ${c.accent(result.outputPath)}`, ), ); + if (args.lang != null && !result.langApplied) { + console.log( + c.dim( + " Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.", + ), + ); + } } } catch (err) { const message = err instanceof Error ? 
err.message : String(err); @@ -140,23 +193,29 @@ export default defineCommand({ // --------------------------------------------------------------------------- function listVoices(json: boolean): void { + const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) })); + if (json) { - console.log(JSON.stringify(BUNDLED_VOICES)); + console.log(JSON.stringify(rows)); return; } console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`); console.log( - ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Gender")}`, + ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`, ); - console.log(` ${c.dim("─".repeat(60))}`); - for (const v of BUNDLED_VOICES) { - const id = v.id.padEnd(18); - const label = v.label.padEnd(13); - const lang = v.language.padEnd(10); - console.log(` ${c.accent(id)} ${label} ${lang} ${v.gender}`); + console.log(` ${c.dim("─".repeat(72))}`); + for (const row of rows) { + const id = row.id.padEnd(18); + const label = row.label.padEnd(13); + const lang = row.language.padEnd(10); + const code = row.defaultLang.padEnd(10); + console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`); } console.log( - `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}\n`, + `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`, + ); + console.log( + ` ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`, ); } diff --git a/packages/cli/src/tts/manager.test.ts b/packages/cli/src/tts/manager.test.ts new file mode 100644 index 000000000..2ea4d9238 --- /dev/null +++ b/packages/cli/src/tts/manager.test.ts @@ -0,0 +1,61 @@ +import { describe, expect, it } from "vitest"; +import { + BUNDLED_VOICES, + SUPPORTED_LANGS, + inferLangFromVoiceId, + isSupportedLang, +} from "./manager.js"; + +describe("inferLangFromVoiceId", () => { + it.each([ + 
["af_heart", "en-us"], + ["am_adam", "en-us"], + ["bf_emma", "en-gb"], + ["bm_george", "en-gb"], + ["ef_dora", "es"], + ["ff_siwis", "fr-fr"], + ["hf_alpha", "hi"], + ["if_sara", "it"], + ["jf_alpha", "ja"], + ["pf_dora", "pt-br"], + ["zf_xiaobei", "zh"], + ])("maps voice %s to lang %s", (voiceId, expected) => { + expect(inferLangFromVoiceId(voiceId)).toBe(expected); + }); + + it("falls back to en-us for unknown prefixes", () => { + expect(inferLangFromVoiceId("xf_test")).toBe("en-us"); + expect(inferLangFromVoiceId("")).toBe("en-us"); + }); + + it("is case-insensitive on the prefix letter", () => { + expect(inferLangFromVoiceId("EF_dora")).toBe("es"); + expect(inferLangFromVoiceId("ZF_xiaobei")).toBe("zh"); + }); +}); + +describe("isSupportedLang", () => { + it("accepts every value in SUPPORTED_LANGS", () => { + for (const lang of SUPPORTED_LANGS) { + expect(isSupportedLang(lang)).toBe(true); + } + }); + + it("rejects invalid or misspelled lang codes", () => { + expect(isSupportedLang("english")).toBe(false); + expect(isSupportedLang("de")).toBe(false); + expect(isSupportedLang("")).toBe(false); + }); +}); + +describe("BUNDLED_VOICES", () => { + // --lang is user-facing, so the voice list must give users a working + // example in at least the most common non-English locales. 
+ it("exposes at least one voice per non-English language", () => { + const langs = new Set(BUNDLED_VOICES.map((v) => inferLangFromVoiceId(v.id))); + expect(langs.has("es")).toBe(true); + expect(langs.has("fr-fr")).toBe(true); + expect(langs.has("ja")).toBe(true); + expect(langs.has("zh")).toBe(true); + }); +}); diff --git a/packages/cli/src/tts/manager.ts b/packages/cli/src/tts/manager.ts index cc945bc21..fa7e3760f 100644 --- a/packages/cli/src/tts/manager.ts +++ b/packages/cli/src/tts/manager.ts @@ -17,6 +17,51 @@ const MODEL_URLS: Record = { const VOICES_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"; +// Locale codes accepted by Kokoro's phonemizer (misaki for English, +// espeak-ng for everything else). Kept as a readonly tuple so the union +// type below stays driven by this single source. +export const SUPPORTED_LANGS = [ + "en-us", + "en-gb", + "es", + "fr-fr", + "hi", + "it", + "pt-br", + "ja", + "zh", +] as const; + +export type SupportedLang = (typeof SUPPORTED_LANGS)[number]; + +// Kokoro voice IDs are `_` — the first letter is +// language, the second is gender. See https://github.com/hexgrad/kokoro. +const VOICE_PREFIX_LANG: Record = { + a: "en-us", // American English + b: "en-gb", // British English + e: "es", // Spanish + f: "fr-fr", // French + h: "hi", // Hindi + i: "it", // Italian + j: "ja", // Japanese + p: "pt-br", // Brazilian Portuguese + z: "zh", // Mandarin +}; + +/** + * Infer the phonemizer language from a Kokoro voice ID prefix. + * Unknown prefixes fall back to `en-us` — Kokoro's text frontend is + * English-trained, so that's the safe default. + */ +export function inferLangFromVoiceId(voiceId: string): SupportedLang { + const first = voiceId.charAt(0).toLowerCase(); + return VOICE_PREFIX_LANG[first] ?? 
"en-us"; +} + +export function isSupportedLang(value: string): value is SupportedLang { + return (SUPPORTED_LANGS as readonly string[]).includes(value); +} + // --------------------------------------------------------------------------- // Voices — Kokoro ships 54 voices across 8 languages. We expose a curated // default set and allow users to specify any valid Kokoro voice ID. @@ -38,6 +83,10 @@ export const BUNDLED_VOICES: VoiceInfo[] = [ { id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" }, { id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" }, { id: "bm_george", label: "George", language: "en-GB", gender: "male" }, + { id: "ef_dora", label: "Dora", language: "es", gender: "female" }, + { id: "ff_siwis", label: "Siwis", language: "fr-FR", gender: "female" }, + { id: "jf_alpha", label: "Alpha", language: "ja", gender: "female" }, + { id: "zf_xiaobei", label: "Xiaobei", language: "zh", gender: "female" }, ]; export const DEFAULT_VOICE = "af_heart"; diff --git a/packages/cli/src/tts/synthesize.ts b/packages/cli/src/tts/synthesize.ts index e2a5984fa..829417914 100644 --- a/packages/cli/src/tts/synthesize.ts +++ b/packages/cli/src/tts/synthesize.ts @@ -1,8 +1,14 @@ import { execFileSync } from "node:child_process"; -import { existsSync, writeFileSync, mkdirSync } from "node:fs"; -import { join, dirname } from "node:path"; +import { existsSync, writeFileSync, mkdirSync, readdirSync, unlinkSync } from "node:fs"; +import { join, dirname, basename } from "node:path"; import { homedir } from "node:os"; -import { ensureModel, ensureVoices, DEFAULT_VOICE } from "./manager.js"; +import { + ensureModel, + ensureVoices, + DEFAULT_VOICE, + inferLangFromVoiceId, + type SupportedLang, +} from "./manager.js"; // --------------------------------------------------------------------------- // Python runtime detection @@ -54,8 +60,11 @@ function hasPythonPackage(python: string, pkg: string): boolean { // Inline Python script for Kokoro 
synthesis // --------------------------------------------------------------------------- +// Kokoro-onnx added the `lang=` kwarg to `Kokoro.create()` in a later release. +// We pass it conditionally so older installs that only accept `voice=`/`speed=` +// continue to work (falling back to Kokoro's default phonemization). const SYNTH_SCRIPT = ` -import sys, json +import sys, json, inspect model_path = sys.argv[1] voices_path = sys.argv[2] @@ -63,12 +72,19 @@ text = sys.argv[3] voice = sys.argv[4] speed = float(sys.argv[5]) output_path = sys.argv[6] +lang = sys.argv[7] if len(sys.argv) > 7 else "" import kokoro_onnx import soundfile as sf model = kokoro_onnx.Kokoro(model_path, voices_path) -samples, sample_rate = model.create(text, voice=voice, speed=speed) + +kwargs = {"voice": voice, "speed": speed} +supports_lang = "lang" in inspect.signature(model.create).parameters +if lang and supports_lang: + kwargs["lang"] = lang + +samples, sample_rate = model.create(text, **kwargs) sf.write(output_path, samples, sample_rate) duration = len(samples) / sample_rate @@ -76,17 +92,36 @@ print(json.dumps({ "outputPath": output_path, "sampleRate": sample_rate, "durationSeconds": round(duration, 3), + "langApplied": bool(lang and supports_lang), })) `; -// Cache the script to avoid rewriting it on every invocation +// Cache the script to avoid rewriting it on every invocation. +// The filename carries a version suffix so older installs automatically +// upgrade when the script body changes (e.g., adding the `lang` kwarg). 
const SCRIPT_DIR = join(homedir(), ".cache", "hyperframes", "tts"); -const SCRIPT_PATH = join(SCRIPT_DIR, "synth.py"); +const SCRIPT_PATH = join(SCRIPT_DIR, "synth-v2.py"); function ensureSynthScript(): string { if (!existsSync(SCRIPT_PATH)) { mkdirSync(SCRIPT_DIR, { recursive: true }); writeFileSync(SCRIPT_PATH, SYNTH_SCRIPT); + // Best-effort: delete older versioned scripts left behind by previous + // CLI releases so users don't accumulate stale files in ~/.cache. + const currentName = basename(SCRIPT_PATH); + try { + for (const entry of readdirSync(SCRIPT_DIR)) { + if (entry !== currentName && /^synth(-v\d+)?\.py$/.test(entry)) { + try { + unlinkSync(join(SCRIPT_DIR, entry)); + } catch { + // Ignore — orphan cleanup is best-effort. + } + } + } + } catch { + // Ignore — directory read is best-effort. + } } return SCRIPT_PATH; } @@ -99,6 +134,12 @@ export interface SynthesizeOptions { model?: string; voice?: string; speed?: number; + /** + * Phonemizer locale. When omitted, inferred from the voice ID prefix + * (e.g., `ef_dora` → `es`). Pass explicitly to override — for example, + * reading English text with a French voice as a stylization. + */ + lang?: SupportedLang; onProgress?: (message: string) => void; } @@ -106,6 +147,8 @@ export interface SynthesizeResult { outputPath: string; sampleRate: number; durationSeconds: number; + /** False when the installed kokoro-onnx version does not support the `lang` kwarg. */ + langApplied: boolean; } /** @@ -118,6 +161,7 @@ export async function synthesize( ): Promise { const voice = options?.voice ?? DEFAULT_VOICE; const speed = options?.speed ?? 1.0; + const lang: SupportedLang = options?.lang ?? inferLangFromVoiceId(voice); // 1. Ensure Python 3 is available with kokoro-onnx options?.onProgress?.("Checking Python runtime..."); @@ -151,11 +195,11 @@ export async function synthesize( mkdirSync(dirname(outputPath), { recursive: true }); // 5. 
Run synthesis - options?.onProgress?.(`Generating speech with voice ${voice}...`); + options?.onProgress?.(`Generating speech with voice ${voice} (${lang})...`); try { const stdout = execFileSync( python, - [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath], + [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath, lang], { encoding: "utf-8", timeout: 300_000, @@ -170,13 +214,18 @@ export async function synthesize( // Parse the last line of stdout as JSON (in case Python printed warnings before it) const lines = stdout.trim().split("\n"); const jsonLine = lines[lines.length - 1] ?? ""; - const result: { outputPath: string; sampleRate: number; durationSeconds: number } = - JSON.parse(jsonLine); + const result: { + outputPath: string; + sampleRate: number; + durationSeconds: number; + langApplied: boolean; + } = JSON.parse(jsonLine); return { outputPath: result.outputPath, sampleRate: result.sampleRate, durationSeconds: result.durationSeconds, + langApplied: result.langApplied, }; } catch (err: unknown) { // If the error is our own JSON parse failure but the file was created, diff --git a/skills/hyperframes/references/tts.md b/skills/hyperframes/references/tts.md index ee94993a7..c403564d8 100644 --- a/skills/hyperframes/references/tts.md +++ b/skills/hyperframes/references/tts.md @@ -16,6 +16,25 @@ Match voice to content. Default is `af_heart`. Run `npx hyperframes tts --list` for all 54 voices (8 languages). +## Multilingual Phonemization + +Kokoro voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from that prefix — you don't need to pass `--lang` when the voice matches the text. 
+ +```bash +npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav +npx hyperframes tts "今日はいい天気ですね" --voice jf_alpha --output ja.wav +``` + +Use `--lang` only to override auto-detection (e.g. stylized accents): + +```bash +npx hyperframes tts "Hello there" --voice af_heart --lang fr-fr --output accented.wav +``` + +Valid `--lang` codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`. + +Non-English phonemization requires `espeak-ng` installed system-wide (`brew install espeak-ng` on macOS, `apt-get install espeak-ng` on Debian/Ubuntu). + ## Speed Tuning - **0.7-0.8** — Tutorial, complex content