From 68e72e95e1ea28437714b7d78648ab8d1ff37b0e Mon Sep 17 00:00:00 2001 From: James Date: Mon, 20 Apr 2026 16:42:29 +0000 Subject: [PATCH 1/2] feat(cli): add --lang and auto-infer phonemizer locale from voice prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `hyperframes tts` was calling Kokoro's `model.create(text, voice=, speed=)` with no language argument, so Kokoro's default phonemizer (en-us) was applied regardless of the voice selected. Picking `ef_dora` or `jf_alpha` and feeding it Spanish or Japanese text produced English-phonemized output. Closes #349. - `manager.ts`: add `SUPPORTED_LANGS`, `inferLangFromVoiceId`, and `isSupportedLang`. Attach a `defaultLang` field to every bundled voice and expand the bundled list with `ef_dora`, `ff_siwis`, `jf_alpha`, `zf_xiaobei` so `--list` surfaces multilingual options. - `synthesize.ts`: accept optional `lang: SupportedLang` in `SynthesizeOptions`, forward it to the Python worker as `argv[7]`. The worker introspects `Kokoro.create`'s signature and only passes `lang=` when the installed kokoro-onnx version supports it. Returned metadata now includes `lang` and `langApplied` so callers can detect silent no-ops. Bump the cached script filename to `synth-v2.py` so existing installs pick up the new script automatically. - `commands/tts.ts`: add `--lang, -l` with validation against `SUPPORTED_LANGS`. Resolution order is explicit `--lang` > inferred from voice prefix > `en-us`. When explicit lang disagrees with the voice-implied lang (legitimate for stylized accents), emit a dim-level hint; suppress under `--json`. When kokoro-onnx silently ignores the kwarg, log that too. Update `--list` with a new "Lang code" column and add multilingual examples. - Tests: new `manager.test.ts` covering every supported prefix, the unknown-prefix fallback, case-insensitivity, `isSupportedLang` validation, and a regression guard that every bundled voice has a valid `defaultLang` matching its ID. 
- Docs: `docs/packages/cli.mdx` and `skills/hyperframes/references/tts.md` updated with the flag, examples, the espeak-ng dependency note for non-English phonemization, and the voice-prefix → lang table. Backward compatibility: - English voices (a*/b* prefixes) continue to phonemize as en-us / en-gb — no change. - Non-English voices now phonemize correctly by default (bug fix, not a regression). - Older kokoro-onnx versions that don't know the `lang` kwarg keep working via signature introspection; the CLI logs a dim note if `--lang` was requested but ignored. Verification: - `bun --cwd packages/cli test` — 128 tests pass (incl. 17 new). - `bunx oxlint` and `bunx oxfmt --check` clean on changed files. - `bun run build` succeeds. - `npx tsx packages/cli/src/cli.ts tts --help` / `--list` render cleanly; invalid `--lang` produces a clean error with the valid-codes list. --- docs/packages/cli.mdx | 11 ++++ packages/cli/src/commands/tts.ts | 75 ++++++++++++++++++++++-- packages/cli/src/tts/manager.test.ts | 69 ++++++++++++++++++++++ packages/cli/src/tts/manager.ts | 85 +++++++++++++++++++++++++--- packages/cli/src/tts/synthesize.ts | 56 +++++++++++++++--- skills/hyperframes/references/tts.md | 19 +++++++ 6 files changed, 292 insertions(+), 23 deletions(-) create mode 100644 packages/cli/src/tts/manager.test.ts diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx index 82192cbfa..ee5df7c11 100644 --- a/docs/packages/cli.mdx +++ b/docs/packages/cli.mdx @@ -305,6 +305,12 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_ # Adjust speech speed npx hyperframes tts "Slow and clear" --speed 0.8 + # Generate Spanish speech (lang auto-detected from the `e` voice prefix) + npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav + + # Override the phonemizer (French phonemization with an American English voice) + npx hyperframes tts "Bonjour le monde" --voice af_heart --lang fr-fr + # Read text from a file npx hyperframes tts 
script.txt @@ -317,9 +323,14 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_ | `--output, -o` | Output file path (default: `speech.wav` in current directory) | | `--voice, -v` | Voice ID (run `--list` to see options) | | `--speed, -s` | Speech speed multiplier (default: 1.0) | + | `--lang, -l` | Phonemizer locale (`en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`). When omitted, inferred from the voice ID prefix. | | `--list` | List available voices and exit | | `--json` | Output result as JSON | + + Voice IDs encode the phonemizer language in their first letter (`a`=American, `b`=British, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin). `--lang` is only needed when you want to override that — for example, giving English text a French phonemizer for a stylized accent. + + Combine `tts` with `transcribe` to generate narration and word-level timestamps for captions in a single workflow: generate the audio with `tts`, then transcribe the output with `transcribe` to get word-level timing. 
diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts index d64f38721..9ede90c37 100644 --- a/packages/cli/src/commands/tts.ts +++ b/packages/cli/src/commands/tts.ts @@ -7,15 +7,31 @@ export const examples: Example[] = [ ["Choose a voice", 'hyperframes tts "Hello world" --voice am_adam'], ["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'], ["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'], + [ + "Generate Spanish speech", + 'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav', + ], + [ + "Override phonemizer language", + 'hyperframes tts "Ciao a tutti" --voice af_heart --lang it --output accented.wav', + ], ["Read text from a file", "hyperframes tts script.txt"], ["List available voices", "hyperframes tts --list"], ]; import { resolve, extname } from "node:path"; import * as clack from "@clack/prompts"; import { c } from "../ui/colors.js"; -import { DEFAULT_VOICE, BUNDLED_VOICES } from "../tts/manager.js"; +import { + DEFAULT_VOICE, + BUNDLED_VOICES, + SUPPORTED_LANGS, + inferLangFromVoiceId, + isSupportedLang, + type SupportedLang, +} from "../tts/manager.js"; const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", "); +const langList = SUPPORTED_LANGS.join(", "); export default defineCommand({ meta: { @@ -43,6 +59,11 @@ export default defineCommand({ description: "Speech speed multiplier (default: 1.0)", alias: "s", }, + lang: { + type: "string", + description: `Phonemizer language (auto-detected from voice prefix when omitted). 
Options: ${langList}`, + alias: "l", + }, list: { type: "boolean", description: "List available voices and exit", @@ -94,15 +115,44 @@ export default defineCommand({ process.exit(1); } + // ── Resolve lang (explicit > inferred from voice prefix) ────────── + const inferredLang = inferLangFromVoiceId(voice); + let lang: SupportedLang = inferredLang; + if (args.lang != null) { + const requested = String(args.lang).toLowerCase(); + if (!isSupportedLang(requested)) { + console.error( + c.error( + `Unsupported --lang "${args.lang}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`, + ), + ); + process.exit(1); + } + lang = requested; + } + + // Info-level notice when explicit lang differs from voice-implied lang. + // Not an error — users sometimes want this intentionally (e.g. reading + // English text with a French voice for a stylized accent). Suppress + // when --json so machine-readable output stays clean. + if (!args.json && args.lang != null && lang !== inferredLang) { + console.log( + c.dim( + ` Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`, + ), + ); + } + // ── Synthesize ──────────────────────────────────────────────────── const { synthesize } = await import("../tts/synthesize.js"); const spin = args.json ? null : clack.spinner(); - spin?.start(`Generating speech with ${c.accent(voice)}...`); + spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`); try { const result = await synthesize(text, output, { voice, speed, + lang, onProgress: spin ? 
(msg) => spin.message(msg) : undefined, }); @@ -112,6 +162,8 @@ export default defineCommand({ ok: true, voice, speed, + lang: result.lang, + langApplied: result.langApplied, durationSeconds: result.durationSeconds, outputPath: result.outputPath, }), @@ -122,6 +174,13 @@ export default defineCommand({ `Generated ${c.accent(result.durationSeconds.toFixed(1) + "s")} of speech → ${c.accent(result.outputPath)}`, ), ); + if (args.lang != null && !result.langApplied) { + console.log( + c.dim( + " Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.", + ), + ); + } } } catch (err) { const message = err instanceof Error ? err.message : String(err); @@ -147,16 +206,20 @@ function listVoices(json: boolean): void { console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`); console.log( - ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Gender")}`, + ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`, ); - console.log(` ${c.dim("─".repeat(60))}`); + console.log(` ${c.dim("─".repeat(72))}`); for (const v of BUNDLED_VOICES) { const id = v.id.padEnd(18); const label = v.label.padEnd(13); const lang = v.language.padEnd(10); - console.log(` ${c.accent(id)} ${label} ${lang} ${v.gender}`); + const code = v.defaultLang.padEnd(10); + console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${v.gender}`); } console.log( - `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}\n`, + `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`, + ); + console.log( + ` ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`, ); } diff --git a/packages/cli/src/tts/manager.test.ts b/packages/cli/src/tts/manager.test.ts new file mode 100644 index 000000000..4af622863 --- /dev/null +++ b/packages/cli/src/tts/manager.test.ts @@ -0,0 +1,69 @@ +import { 
describe, expect, it } from "vitest"; +import { + BUNDLED_VOICES, + SUPPORTED_LANGS, + inferLangFromVoiceId, + isSupportedLang, +} from "./manager.js"; + +describe("inferLangFromVoiceId", () => { + it.each([ + ["af_heart", "en-us"], + ["am_adam", "en-us"], + ["bf_emma", "en-gb"], + ["bm_george", "en-gb"], + ["ef_dora", "es"], + ["ff_siwis", "fr-fr"], + ["hf_alpha", "hi"], + ["if_sara", "it"], + ["jf_alpha", "ja"], + ["pf_dora", "pt-br"], + ["zf_xiaobei", "zh"], + ])("maps voice %s to lang %s", (voiceId, expected) => { + expect(inferLangFromVoiceId(voiceId)).toBe(expected); + }); + + it("falls back to en-us for unknown prefixes", () => { + expect(inferLangFromVoiceId("xf_test")).toBe("en-us"); + expect(inferLangFromVoiceId("")).toBe("en-us"); + }); + + it("is case-insensitive on the prefix letter", () => { + expect(inferLangFromVoiceId("EF_dora")).toBe("es"); + expect(inferLangFromVoiceId("ZF_xiaobei")).toBe("zh"); + }); +}); + +describe("isSupportedLang", () => { + it("accepts every value in SUPPORTED_LANGS", () => { + for (const lang of SUPPORTED_LANGS) { + expect(isSupportedLang(lang)).toBe(true); + } + }); + + it("rejects invalid or misspelled lang codes", () => { + expect(isSupportedLang("english")).toBe(false); + expect(isSupportedLang("EN-US")).toBe(false); // case-sensitive by design + expect(isSupportedLang("de")).toBe(false); + expect(isSupportedLang("")).toBe(false); + }); +}); + +describe("BUNDLED_VOICES", () => { + it("attaches a valid defaultLang to every voice", () => { + for (const voice of BUNDLED_VOICES) { + expect(isSupportedLang(voice.defaultLang)).toBe(true); + expect(voice.defaultLang).toBe(inferLangFromVoiceId(voice.id)); + } + }); + + it("exposes at least one voice per non-English language", () => { + // Regression guard: --lang is user-facing, so the voice list must give + // users a working example in at least the most common non-English locales. 
+ const langs = new Set(BUNDLED_VOICES.map((v) => v.defaultLang)); + expect(langs.has("es")).toBe(true); + expect(langs.has("fr-fr")).toBe(true); + expect(langs.has("ja")).toBe(true); + expect(langs.has("zh")).toBe(true); + }); +}); diff --git a/packages/cli/src/tts/manager.ts b/packages/cli/src/tts/manager.ts index cc945bc21..469a2d874 100644 --- a/packages/cli/src/tts/manager.ts +++ b/packages/cli/src/tts/manager.ts @@ -17,6 +17,60 @@ const MODEL_URLS: Record = { const VOICES_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"; +// --------------------------------------------------------------------------- +// Languages — Kokoro's phonemizer supports these locale codes. The second +// letter of a voice ID is gender; the first letter is language. This list +// mirrors what misaki (English) and espeak-ng (everything else) accept. +// --------------------------------------------------------------------------- + +export const SUPPORTED_LANGS = [ + "en-us", + "en-gb", + "es", + "fr-fr", + "hi", + "it", + "pt-br", + "ja", + "zh", +] as const; + +export type SupportedLang = (typeof SUPPORTED_LANGS)[number]; + +const DEFAULT_LANG: SupportedLang = "en-us"; + +// First letter of a Kokoro voice ID → phonemizer locale. +// See https://github.com/hexgrad/kokoro for the full voice catalog. +const VOICE_PREFIX_LANG: Record = { + a: "en-us", // American English + b: "en-gb", // British English + e: "es", // Spanish + f: "fr-fr", // French + h: "hi", // Hindi + i: "it", // Italian + j: "ja", // Japanese + p: "pt-br", // Brazilian Portuguese + z: "zh", // Mandarin +}; + +/** + * Infer the phonemizer language from a Kokoro voice ID prefix. + * + * Kokoro voice IDs are `_` where `` is a single + * letter: a=American, b=British, e=Spanish, f=French, h=Hindi, i=Italian, + * j=Japanese, p=Brazilian Portuguese, z=Mandarin. Unknown prefixes fall + * back to `en-us` — the safe default for Kokoro's English-trained text + * frontend. 
+ */ +export function inferLangFromVoiceId(voiceId: string): SupportedLang { + const first = voiceId.charAt(0).toLowerCase(); + return VOICE_PREFIX_LANG[first] ?? DEFAULT_LANG; +} + +export function isSupportedLang(value: string): value is SupportedLang { + return (SUPPORTED_LANGS as readonly string[]).includes(value); +} + // --------------------------------------------------------------------------- // Voices — Kokoro ships 54 voices across 8 languages. We expose a curated // default set and allow users to specify any valid Kokoro voice ID. @@ -27,17 +81,32 @@ export interface VoiceInfo { label: string; language: string; gender: "female" | "male"; + /** Phonemizer locale for this voice. Derived from the ID prefix. */ + defaultLang: SupportedLang; +} + +function makeVoice( + id: string, + label: string, + language: string, + gender: "female" | "male", +): VoiceInfo { + return { id, label, language, gender, defaultLang: inferLangFromVoiceId(id) }; } export const BUNDLED_VOICES: VoiceInfo[] = [ - { id: "af_heart", label: "Heart", language: "en-US", gender: "female" }, - { id: "af_nova", label: "Nova", language: "en-US", gender: "female" }, - { id: "af_sky", label: "Sky", language: "en-US", gender: "female" }, - { id: "am_adam", label: "Adam", language: "en-US", gender: "male" }, - { id: "am_michael", label: "Michael", language: "en-US", gender: "male" }, - { id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" }, - { id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" }, - { id: "bm_george", label: "George", language: "en-GB", gender: "male" }, + makeVoice("af_heart", "Heart", "en-US", "female"), + makeVoice("af_nova", "Nova", "en-US", "female"), + makeVoice("af_sky", "Sky", "en-US", "female"), + makeVoice("am_adam", "Adam", "en-US", "male"), + makeVoice("am_michael", "Michael", "en-US", "male"), + makeVoice("bf_emma", "Emma", "en-GB", "female"), + makeVoice("bf_isabella", "Isabella", "en-GB", "female"), + makeVoice("bm_george", 
"George", "en-GB", "male"), + makeVoice("ef_dora", "Dora", "es", "female"), + makeVoice("ff_siwis", "Siwis", "fr-FR", "female"), + makeVoice("jf_alpha", "Alpha", "ja", "female"), + makeVoice("zf_xiaobei", "Xiaobei", "zh", "female"), ]; export const DEFAULT_VOICE = "af_heart"; diff --git a/packages/cli/src/tts/synthesize.ts b/packages/cli/src/tts/synthesize.ts index e2a5984fa..2629989c1 100644 --- a/packages/cli/src/tts/synthesize.ts +++ b/packages/cli/src/tts/synthesize.ts @@ -2,7 +2,13 @@ import { execFileSync } from "node:child_process"; import { existsSync, writeFileSync, mkdirSync } from "node:fs"; import { join, dirname } from "node:path"; import { homedir } from "node:os"; -import { ensureModel, ensureVoices, DEFAULT_VOICE } from "./manager.js"; +import { + ensureModel, + ensureVoices, + DEFAULT_VOICE, + inferLangFromVoiceId, + type SupportedLang, +} from "./manager.js"; // --------------------------------------------------------------------------- // Python runtime detection @@ -54,8 +60,11 @@ function hasPythonPackage(python: string, pkg: string): boolean { // Inline Python script for Kokoro synthesis // --------------------------------------------------------------------------- +// Kokoro-onnx added the `lang=` kwarg to `Kokoro.create()` in a later release. +// We pass it conditionally so older installs that only accept `voice=`/`speed=` +// continue to work (falling back to Kokoro's default phonemization). 
const SYNTH_SCRIPT = ` -import sys, json +import sys, json, inspect model_path = sys.argv[1] voices_path = sys.argv[2] @@ -63,12 +72,19 @@ text = sys.argv[3] voice = sys.argv[4] speed = float(sys.argv[5]) output_path = sys.argv[6] +lang = sys.argv[7] if len(sys.argv) > 7 else "" import kokoro_onnx import soundfile as sf model = kokoro_onnx.Kokoro(model_path, voices_path) -samples, sample_rate = model.create(text, voice=voice, speed=speed) + +kwargs = {"voice": voice, "speed": speed} +supports_lang = "lang" in inspect.signature(model.create).parameters +if lang and supports_lang: + kwargs["lang"] = lang + +samples, sample_rate = model.create(text, **kwargs) sf.write(output_path, samples, sample_rate) duration = len(samples) / sample_rate @@ -76,12 +92,16 @@ print(json.dumps({ "outputPath": output_path, "sampleRate": sample_rate, "durationSeconds": round(duration, 3), + "lang": lang if (lang and supports_lang) else None, + "langApplied": bool(lang and supports_lang), })) `; -// Cache the script to avoid rewriting it on every invocation +// Cache the script to avoid rewriting it on every invocation. +// The filename carries a version suffix so older installs automatically +// upgrade when the script body changes (e.g., adding the `lang` kwarg). const SCRIPT_DIR = join(homedir(), ".cache", "hyperframes", "tts"); -const SCRIPT_PATH = join(SCRIPT_DIR, "synth.py"); +const SCRIPT_PATH = join(SCRIPT_DIR, "synth-v2.py"); function ensureSynthScript(): string { if (!existsSync(SCRIPT_PATH)) { @@ -99,6 +119,12 @@ export interface SynthesizeOptions { model?: string; voice?: string; speed?: number; + /** + * Phonemizer locale. When omitted, inferred from the voice ID prefix + * (e.g., `ef_dora` → `es`). Pass explicitly to override — for example, + * reading English text with a French voice as a stylization. 
+ */ + lang?: SupportedLang; onProgress?: (message: string) => void; } @@ -106,6 +132,10 @@ export interface SynthesizeResult { outputPath: string; sampleRate: number; durationSeconds: number; + /** Language actually applied during synthesis, or null if kokoro-onnx silently ignored it. */ + lang: SupportedLang | null; + /** False when the installed kokoro-onnx version does not support the `lang` kwarg. */ + langApplied: boolean; } /** @@ -118,6 +148,7 @@ export async function synthesize( ): Promise { const voice = options?.voice ?? DEFAULT_VOICE; const speed = options?.speed ?? 1.0; + const lang: SupportedLang = options?.lang ?? inferLangFromVoiceId(voice); // 1. Ensure Python 3 is available with kokoro-onnx options?.onProgress?.("Checking Python runtime..."); @@ -151,11 +182,11 @@ export async function synthesize( mkdirSync(dirname(outputPath), { recursive: true }); // 5. Run synthesis - options?.onProgress?.(`Generating speech with voice ${voice}...`); + options?.onProgress?.(`Generating speech with voice ${voice} (${lang})...`); try { const stdout = execFileSync( python, - [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath], + [scriptPath, modelPath, voicesPath, text, voice, String(speed), outputPath, lang], { encoding: "utf-8", timeout: 300_000, @@ -170,13 +201,20 @@ export async function synthesize( // Parse the last line of stdout as JSON (in case Python printed warnings before it) const lines = stdout.trim().split("\n"); const jsonLine = lines[lines.length - 1] ?? 
""; - const result: { outputPath: string; sampleRate: number; durationSeconds: number } = - JSON.parse(jsonLine); + const result: { + outputPath: string; + sampleRate: number; + durationSeconds: number; + lang: SupportedLang | null; + langApplied: boolean; + } = JSON.parse(jsonLine); return { outputPath: result.outputPath, sampleRate: result.sampleRate, durationSeconds: result.durationSeconds, + lang: result.lang, + langApplied: result.langApplied, }; } catch (err: unknown) { // If the error is our own JSON parse failure but the file was created, diff --git a/skills/hyperframes/references/tts.md b/skills/hyperframes/references/tts.md index ee94993a7..c403564d8 100644 --- a/skills/hyperframes/references/tts.md +++ b/skills/hyperframes/references/tts.md @@ -16,6 +16,25 @@ Match voice to content. Default is `af_heart`. Run `npx hyperframes tts --list` for all 54 voices (8 languages). +## Multilingual Phonemization + +Kokoro voice IDs encode language in the first letter: `a`=American English, `b`=British English, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin. The CLI auto-detects the phonemizer locale from that prefix — you don't need to pass `--lang` when the voice matches the text. + +```bash +npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav +npx hyperframes tts "今日はいい天気ですね" --voice jf_alpha --output ja.wav +``` + +Use `--lang` only to override auto-detection (e.g. stylized accents): + +```bash +npx hyperframes tts "Hello there" --voice af_heart --lang fr-fr --output accented.wav +``` + +Valid `--lang` codes: `en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`. + +Non-English phonemization requires `espeak-ng` installed system-wide (`brew install espeak-ng` on macOS, `apt-get install espeak-ng` on Debian/Ubuntu). 
+ ## Speed Tuning - **0.7-0.8** — Tutorial, complex content From 74cd4738c67133f216b6cb483351f2c4c730bac6 Mon Sep 17 00:00:00 2001 From: James Date: Mon, 20 Apr 2026 16:55:21 +0000 Subject: [PATCH 2/2] refactor(cli): simplify tts --lang implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-review cleanup on #351. Net -21 lines. - Drop `defaultLang` field + `makeVoice()` helper from VoiceInfo — compute via `inferLangFromVoiceId(v.id)` at read time in listVoices. The only reader was the --list table; caching the derived value on every voice added a self-consistency invariant we had to test. - Drop redundant `lang` field from SynthesizeResult — caller already knows the requested lang since it passed it in; only `langApplied` carries information the caller can't derive. - Use `errorBox` for --lang validation to match the house style in render.ts (other validation errors already use errorBox). - Reuse existing `langList` module constant in the validation error instead of re-joining SUPPORTED_LANGS. - Inline `DEFAULT_LANG` — used once in inferLangFromVoiceId. - Trim WHAT-restating comments and the duplicate prefix-enumeration JSDoc on inferLangFromVoiceId (VOICE_PREFIX_LANG already carries per-row comments). - Clean up orphaned `synth*.py` files in ~/.cache/hyperframes/tts when writing the current versioned script, so repeated upgrades don't leak files. - Drop the `EN-US` case-sensitive-rejection test assertion — the CLI lowercases input before validation, so accepting mixed case is a feature, not a bug. Tests: 16/16 in `manager.test.ts`, 127/127 full CLI suite pass. Lint + format + typecheck clean. 
--- packages/cli/src/commands/tts.ts | 32 +++++++-------- packages/cli/src/tts/manager.test.ts | 14 ++----- packages/cli/src/tts/manager.ts | 60 ++++++++++------------------ packages/cli/src/tts/synthesize.ts | 25 ++++++++---- 4 files changed, 55 insertions(+), 76 deletions(-) diff --git a/packages/cli/src/commands/tts.ts b/packages/cli/src/commands/tts.ts index 9ede90c37..bf0bc03c1 100644 --- a/packages/cli/src/commands/tts.ts +++ b/packages/cli/src/commands/tts.ts @@ -21,6 +21,7 @@ export const examples: Example[] = [ import { resolve, extname } from "node:path"; import * as clack from "@clack/prompts"; import { c } from "../ui/colors.js"; +import { errorBox } from "../ui/format.js"; import { DEFAULT_VOICE, BUNDLED_VOICES, @@ -115,26 +116,19 @@ export default defineCommand({ process.exit(1); } - // ── Resolve lang (explicit > inferred from voice prefix) ────────── const inferredLang = inferLangFromVoiceId(voice); let lang: SupportedLang = inferredLang; if (args.lang != null) { const requested = String(args.lang).toLowerCase(); if (!isSupportedLang(requested)) { - console.error( - c.error( - `Unsupported --lang "${args.lang}". Must be one of: ${SUPPORTED_LANGS.join(", ")}.`, - ), - ); + errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`); process.exit(1); } lang = requested; } - // Info-level notice when explicit lang differs from voice-implied lang. - // Not an error — users sometimes want this intentionally (e.g. reading - // English text with a French voice for a stylized accent). Suppress - // when --json so machine-readable output stays clean. + // Mismatched voice/lang is a valid stylization (English text, French + // phonemization for accent), so this is a hint, not an error. 
if (!args.json && args.lang != null && lang !== inferredLang) { console.log( c.dim( @@ -162,7 +156,7 @@ export default defineCommand({ ok: true, voice, speed, - lang: result.lang, + lang, langApplied: result.langApplied, durationSeconds: result.durationSeconds, outputPath: result.outputPath, @@ -199,8 +193,10 @@ export default defineCommand({ // --------------------------------------------------------------------------- function listVoices(json: boolean): void { + const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) })); + if (json) { - console.log(JSON.stringify(BUNDLED_VOICES)); + console.log(JSON.stringify(rows)); return; } @@ -209,12 +205,12 @@ function listVoices(json: boolean): void { ` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`, ); console.log(` ${c.dim("─".repeat(72))}`); - for (const v of BUNDLED_VOICES) { - const id = v.id.padEnd(18); - const label = v.label.padEnd(13); - const lang = v.language.padEnd(10); - const code = v.defaultLang.padEnd(10); - console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${v.gender}`); + for (const row of rows) { + const id = row.id.padEnd(18); + const label = row.label.padEnd(13); + const lang = row.language.padEnd(10); + const code = row.defaultLang.padEnd(10); + console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`); } console.log( `\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`, diff --git a/packages/cli/src/tts/manager.test.ts b/packages/cli/src/tts/manager.test.ts index 4af622863..2ea4d9238 100644 --- a/packages/cli/src/tts/manager.test.ts +++ b/packages/cli/src/tts/manager.test.ts @@ -43,24 +43,16 @@ describe("isSupportedLang", () => { it("rejects invalid or misspelled lang codes", () => { expect(isSupportedLang("english")).toBe(false); - expect(isSupportedLang("EN-US")).toBe(false); // case-sensitive by design expect(isSupportedLang("de")).toBe(false); 
expect(isSupportedLang("")).toBe(false); }); }); describe("BUNDLED_VOICES", () => { - it("attaches a valid defaultLang to every voice", () => { - for (const voice of BUNDLED_VOICES) { - expect(isSupportedLang(voice.defaultLang)).toBe(true); - expect(voice.defaultLang).toBe(inferLangFromVoiceId(voice.id)); - } - }); - + // --lang is user-facing, so the voice list must give users a working + // example in at least the most common non-English locales. it("exposes at least one voice per non-English language", () => { - // Regression guard: --lang is user-facing, so the voice list must give - // users a working example in at least the most common non-English locales. - const langs = new Set(BUNDLED_VOICES.map((v) => v.defaultLang)); + const langs = new Set(BUNDLED_VOICES.map((v) => inferLangFromVoiceId(v.id))); expect(langs.has("es")).toBe(true); expect(langs.has("fr-fr")).toBe(true); expect(langs.has("ja")).toBe(true); diff --git a/packages/cli/src/tts/manager.ts b/packages/cli/src/tts/manager.ts index 469a2d874..fa7e3760f 100644 --- a/packages/cli/src/tts/manager.ts +++ b/packages/cli/src/tts/manager.ts @@ -17,12 +17,9 @@ const MODEL_URLS: Record = { const VOICES_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"; -// --------------------------------------------------------------------------- -// Languages — Kokoro's phonemizer supports these locale codes. The second -// letter of a voice ID is gender; the first letter is language. This list -// mirrors what misaki (English) and espeak-ng (everything else) accept. -// --------------------------------------------------------------------------- - +// Locale codes accepted by Kokoro's phonemizer (misaki for English, +// espeak-ng for everything else). Kept as a readonly tuple so the union +// type below stays driven by this single source. 
export const SUPPORTED_LANGS = [ "en-us", "en-gb", @@ -37,10 +34,8 @@ export const SUPPORTED_LANGS = [ export type SupportedLang = (typeof SUPPORTED_LANGS)[number]; -const DEFAULT_LANG: SupportedLang = "en-us"; - -// First letter of a Kokoro voice ID → phonemizer locale. -// See https://github.com/hexgrad/kokoro for the full voice catalog. +// Kokoro voice IDs are `_` — the first letter is +// language, the second is gender. See https://github.com/hexgrad/kokoro. const VOICE_PREFIX_LANG: Record = { a: "en-us", // American English b: "en-gb", // British English @@ -55,16 +50,12 @@ const VOICE_PREFIX_LANG: Record = { /** * Infer the phonemizer language from a Kokoro voice ID prefix. - * - * Kokoro voice IDs are `_` where `` is a single - * letter: a=American, b=British, e=Spanish, f=French, h=Hindi, i=Italian, - * j=Japanese, p=Brazilian Portuguese, z=Mandarin. Unknown prefixes fall - * back to `en-us` — the safe default for Kokoro's English-trained text - * frontend. + * Unknown prefixes fall back to `en-us` — Kokoro's text frontend is + * English-trained, so that's the safe default. */ export function inferLangFromVoiceId(voiceId: string): SupportedLang { const first = voiceId.charAt(0).toLowerCase(); - return VOICE_PREFIX_LANG[first] ?? DEFAULT_LANG; + return VOICE_PREFIX_LANG[first] ?? "en-us"; } export function isSupportedLang(value: string): value is SupportedLang { @@ -81,32 +72,21 @@ export interface VoiceInfo { label: string; language: string; gender: "female" | "male"; - /** Phonemizer locale for this voice. Derived from the ID prefix. 
*/ - defaultLang: SupportedLang; -} - -function makeVoice( - id: string, - label: string, - language: string, - gender: "female" | "male", -): VoiceInfo { - return { id, label, language, gender, defaultLang: inferLangFromVoiceId(id) }; } export const BUNDLED_VOICES: VoiceInfo[] = [ - makeVoice("af_heart", "Heart", "en-US", "female"), - makeVoice("af_nova", "Nova", "en-US", "female"), - makeVoice("af_sky", "Sky", "en-US", "female"), - makeVoice("am_adam", "Adam", "en-US", "male"), - makeVoice("am_michael", "Michael", "en-US", "male"), - makeVoice("bf_emma", "Emma", "en-GB", "female"), - makeVoice("bf_isabella", "Isabella", "en-GB", "female"), - makeVoice("bm_george", "George", "en-GB", "male"), - makeVoice("ef_dora", "Dora", "es", "female"), - makeVoice("ff_siwis", "Siwis", "fr-FR", "female"), - makeVoice("jf_alpha", "Alpha", "ja", "female"), - makeVoice("zf_xiaobei", "Xiaobei", "zh", "female"), + { id: "af_heart", label: "Heart", language: "en-US", gender: "female" }, + { id: "af_nova", label: "Nova", language: "en-US", gender: "female" }, + { id: "af_sky", label: "Sky", language: "en-US", gender: "female" }, + { id: "am_adam", label: "Adam", language: "en-US", gender: "male" }, + { id: "am_michael", label: "Michael", language: "en-US", gender: "male" }, + { id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" }, + { id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" }, + { id: "bm_george", label: "George", language: "en-GB", gender: "male" }, + { id: "ef_dora", label: "Dora", language: "es", gender: "female" }, + { id: "ff_siwis", label: "Siwis", language: "fr-FR", gender: "female" }, + { id: "jf_alpha", label: "Alpha", language: "ja", gender: "female" }, + { id: "zf_xiaobei", label: "Xiaobei", language: "zh", gender: "female" }, ]; export const DEFAULT_VOICE = "af_heart"; diff --git a/packages/cli/src/tts/synthesize.ts b/packages/cli/src/tts/synthesize.ts index 2629989c1..829417914 100644 --- 
a/packages/cli/src/tts/synthesize.ts +++ b/packages/cli/src/tts/synthesize.ts @@ -1,6 +1,6 @@ import { execFileSync } from "node:child_process"; -import { existsSync, writeFileSync, mkdirSync } from "node:fs"; -import { join, dirname } from "node:path"; +import { existsSync, writeFileSync, mkdirSync, readdirSync, unlinkSync } from "node:fs"; +import { join, dirname, basename } from "node:path"; import { homedir } from "node:os"; import { ensureModel, @@ -92,7 +92,6 @@ print(json.dumps({ "outputPath": output_path, "sampleRate": sample_rate, "durationSeconds": round(duration, 3), - "lang": lang if (lang and supports_lang) else None, "langApplied": bool(lang and supports_lang), })) `; @@ -107,6 +106,22 @@ function ensureSynthScript(): string { if (!existsSync(SCRIPT_PATH)) { mkdirSync(SCRIPT_DIR, { recursive: true }); writeFileSync(SCRIPT_PATH, SYNTH_SCRIPT); + // Best-effort: delete older versioned scripts left behind by previous + // CLI releases so users don't accumulate stale files in ~/.cache. + const currentName = basename(SCRIPT_PATH); + try { + for (const entry of readdirSync(SCRIPT_DIR)) { + if (entry !== currentName && /^synth(-v\d+)?\.py$/.test(entry)) { + try { + unlinkSync(join(SCRIPT_DIR, entry)); + } catch { + // Ignore — orphan cleanup is best-effort. + } + } + } + } catch { + // Ignore — directory read is best-effort. + } } return SCRIPT_PATH; } @@ -132,8 +147,6 @@ export interface SynthesizeResult { outputPath: string; sampleRate: number; durationSeconds: number; - /** Language actually applied during synthesis, or null if kokoro-onnx silently ignored it. */ - lang: SupportedLang | null; /** False when the installed kokoro-onnx version does not support the `lang` kwarg. 
*/ langApplied: boolean; } @@ -205,7 +218,6 @@ export async function synthesize( outputPath: string; sampleRate: number; durationSeconds: number; - lang: SupportedLang | null; langApplied: boolean; } = JSON.parse(jsonLine); @@ -213,7 +225,6 @@ export async function synthesize( outputPath: result.outputPath, sampleRate: result.sampleRate, durationSeconds: result.durationSeconds, - lang: result.lang, langApplied: result.langApplied, }; } catch (err: unknown) {