Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/packages/cli.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,12 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
# Adjust speech speed
npx hyperframes tts "Slow and clear" --speed 0.8

# Generate Spanish speech (lang auto-detected from the `e` voice prefix)
npx hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav

# Override the phonemizer (read French text with an American English voice, using French phonemization)
npx hyperframes tts "Bonjour le monde" --voice af_heart --lang fr-fr

# Read text from a file
npx hyperframes tts script.txt

Expand All @@ -317,9 +323,14 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
| `--output, -o` | Output file path (default: `speech.wav` in current directory) |
| `--voice, -v` | Voice ID (run `--list` to see options) |
| `--speed, -s` | Speech speed multiplier (default: 1.0) |
| `--lang, -l` | Phonemizer locale (`en-us`, `en-gb`, `es`, `fr-fr`, `hi`, `it`, `pt-br`, `ja`, `zh`). When omitted, inferred from the voice ID prefix. |
| `--list` | List available voices and exit |
| `--json` | Output result as JSON |

<Tip>
Voice IDs encode the phonemizer language in their first letter (`a`=American, `b`=British, `e`=Spanish, `f`=French, `h`=Hindi, `i`=Italian, `j`=Japanese, `p`=Brazilian Portuguese, `z`=Mandarin). `--lang` is only needed when you want to override that — for example, giving English text a French phonemizer for a stylized accent.
</Tip>

<Tip>
Combine `tts` with `transcribe` to generate narration and word-level timestamps for captions in a single workflow: generate the audio with `tts`, then transcribe the output with `transcribe` to get word-level timing.
</Tip>
Expand Down
81 changes: 70 additions & 11 deletions packages/cli/src/commands/tts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,32 @@ export const examples: Example[] = [
["Choose a voice", 'hyperframes tts "Hello world" --voice am_adam'],
["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
[
"Generate Spanish speech",
'hyperframes tts "La reunión empieza a las nueve" --voice ef_dora --output es.wav',
],
[
"Override phonemizer language",
'hyperframes tts "Ciao a tutti" --voice af_heart --lang it --output accented.wav',
],
["Read text from a file", "hyperframes tts script.txt"],
["List available voices", "hyperframes tts --list"],
];
import { resolve, extname } from "node:path";
import * as clack from "@clack/prompts";
import { c } from "../ui/colors.js";
import { DEFAULT_VOICE, BUNDLED_VOICES } from "../tts/manager.js";
import { errorBox } from "../ui/format.js";
import {
DEFAULT_VOICE,
BUNDLED_VOICES,
SUPPORTED_LANGS,
inferLangFromVoiceId,
isSupportedLang,
type SupportedLang,
} from "../tts/manager.js";

// Comma-separated summaries computed once at module load.
// `voiceList` renders each bundled voice as "<id> (<label>)"; `langList` is
// interpolated into the --lang help text and its validation error.
const voiceList = BUNDLED_VOICES.map(({ id, label }) => `${id} (${label})`).join(", ");
const langList = SUPPORTED_LANGS.join(", ");

export default defineCommand({
meta: {
Expand Down Expand Up @@ -43,6 +60,11 @@ export default defineCommand({
description: "Speech speed multiplier (default: 1.0)",
alias: "s",
},
lang: {
type: "string",
description: `Phonemizer language (auto-detected from voice prefix when omitted). Options: ${langList}`,
alias: "l",
},
list: {
type: "boolean",
description: "List available voices and exit",
Expand Down Expand Up @@ -94,15 +116,37 @@ export default defineCommand({
process.exit(1);
}

const inferredLang = inferLangFromVoiceId(voice);
let lang: SupportedLang = inferredLang;
if (args.lang != null) {
const requested = String(args.lang).toLowerCase();
if (!isSupportedLang(requested)) {
errorBox("Invalid --lang", `Got "${args.lang}". Must be one of: ${langList}.`);
process.exit(1);
}
lang = requested;
}

// Mismatched voice/lang is a valid stylization (English text, French
// phonemization for accent), so this is a hint, not an error.
if (!args.json && args.lang != null && lang !== inferredLang) {
console.log(
c.dim(
` Note: voice "${voice}" is ${inferredLang}, rendering with --lang ${lang} instead.`,
),
);
}

// ── Synthesize ────────────────────────────────────────────────────
const { synthesize } = await import("../tts/synthesize.js");
const spin = args.json ? null : clack.spinner();
spin?.start(`Generating speech with ${c.accent(voice)}...`);
spin?.start(`Generating speech with ${c.accent(voice)} (${lang})...`);

try {
const result = await synthesize(text, output, {
voice,
speed,
lang,
onProgress: spin ? (msg) => spin.message(msg) : undefined,
});

Expand All @@ -112,6 +156,8 @@ export default defineCommand({
ok: true,
voice,
speed,
lang,
langApplied: result.langApplied,
durationSeconds: result.durationSeconds,
outputPath: result.outputPath,
}),
Expand All @@ -122,6 +168,13 @@ export default defineCommand({
`Generated ${c.accent(result.durationSeconds.toFixed(1) + "s")} of speech → ${c.accent(result.outputPath)}`,
),
);
if (args.lang != null && !result.langApplied) {
console.log(
c.dim(
" Note: installed kokoro-onnx version does not support the --lang kwarg; phonemization used Kokoro's default.",
),
);
}
}
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
Expand All @@ -140,23 +193,29 @@ export default defineCommand({
// ---------------------------------------------------------------------------

/**
 * Print the bundled voice catalogue to stdout.
 *
 * NOTE(review): this listing appears to interleave pre- and post-change lines
 * from a diff view (for example both `JSON.stringify` calls and both header
 * rows are present) — confirm against the merged file before editing logic.
 *
 * @param json - When true, print the voices as JSON and return without
 *   rendering the table.
 */
function listVoices(json: boolean): void {
// Copy each bundled voice and attach `defaultLang`, the locale inferred from its ID.
const rows = BUNDLED_VOICES.map((v) => ({ ...v, defaultLang: inferLangFromVoiceId(v.id) }));

// Machine-readable mode: emit JSON and stop before any table output.
if (json) {
console.log(JSON.stringify(BUNDLED_VOICES));
console.log(JSON.stringify(rows));
return;
}

// Heading, column titles, and a horizontal rule.
console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
console.log(
` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Gender")}`,
` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Lang code")} ${c.dim("Gender")}`,
);
console.log(` ${c.dim("─".repeat(60))}`);
// One line per voice; padEnd fixes each column's width so the rows line up.
for (const v of BUNDLED_VOICES) {
const id = v.id.padEnd(18);
const label = v.label.padEnd(13);
const lang = v.language.padEnd(10);
console.log(` ${c.accent(id)} ${label} ${lang} ${v.gender}`);
console.log(` ${c.dim("─".repeat(72))}`);
for (const row of rows) {
const id = row.id.padEnd(18);
const label = row.label.padEnd(13);
const lang = row.language.padEnd(10);
const code = row.defaultLang.padEnd(10);
console.log(` ${c.accent(id)} ${label} ${lang} ${code} ${row.gender}`);
}
// Footer hints: where to find the full voice list and how to override the phonemizer.
console.log(
`\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}\n`,
`\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}`,
);
console.log(
` ${c.dim("Override phonemizer with --lang <" + SUPPORTED_LANGS.join("|") + ">")}\n`,
);
}
61 changes: 61 additions & 0 deletions packages/cli/src/tts/manager.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { describe, expect, it } from "vitest";
import {
BUNDLED_VOICES,
SUPPORTED_LANGS,
inferLangFromVoiceId,
isSupportedLang,
} from "./manager.js";

describe("inferLangFromVoiceId", () => {
  // Voice ID → locale the helper is expected to infer from its first letter.
  const prefixCases: Array<[string, string]> = [
    ["af_heart", "en-us"],
    ["am_adam", "en-us"],
    ["bf_emma", "en-gb"],
    ["bm_george", "en-gb"],
    ["ef_dora", "es"],
    ["ff_siwis", "fr-fr"],
    ["hf_alpha", "hi"],
    ["if_sara", "it"],
    ["jf_alpha", "ja"],
    ["pf_dora", "pt-br"],
    ["zf_xiaobei", "zh"],
  ];

  it.each(prefixCases)("maps voice %s to lang %s", (voiceId, expected) => {
    const inferred = inferLangFromVoiceId(voiceId);
    expect(inferred).toBe(expected);
  });

  it("falls back to en-us for unknown prefixes", () => {
    // "x" is not a mapped prefix letter; the empty string has no prefix at all.
    for (const voiceId of ["xf_test", ""]) {
      expect(inferLangFromVoiceId(voiceId)).toBe("en-us");
    }
  });

  it("is case-insensitive on the prefix letter", () => {
    const upperCased: Array<[string, string]> = [
      ["EF_dora", "es"],
      ["ZF_xiaobei", "zh"],
    ];
    for (const [voiceId, expected] of upperCased) {
      expect(inferLangFromVoiceId(voiceId)).toBe(expected);
    }
  });
});

describe("isSupportedLang", () => {
  it("accepts every value in SUPPORTED_LANGS", () => {
    SUPPORTED_LANGS.forEach((lang) => {
      expect(isSupportedLang(lang)).toBe(true);
    });
  });

  it("rejects invalid or misspelled lang codes", () => {
    // A full language name, a locale missing from the list, and the empty string.
    const rejected = ["english", "de", ""];
    for (const candidate of rejected) {
      expect(isSupportedLang(candidate)).toBe(false);
    }
  });
});

describe("BUNDLED_VOICES", () => {
  // --lang is user-facing, so the bundled list must give users a working
  // example voice in the most common non-English locales. Only the locales
  // listed here are required; other supported locales are not asserted.
  const requiredLangs = ["es", "fr-fr", "ja", "zh"];

  // Locales reachable through the bundled voices, inferred from each voice ID.
  const bundledLangs = new Set<string>(BUNDLED_VOICES.map((v) => inferLangFromVoiceId(v.id)));

  // One case per locale so a failure names the locale that lost its voice.
  it.each(requiredLangs)("exposes at least one bundled voice for %s", (lang) => {
    expect(bundledLangs.has(lang)).toBe(true);
  });
});
49 changes: 49 additions & 0 deletions packages/cli/src/tts/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,51 @@ const MODEL_URLS: Record<string, string> = {
const VOICES_URL =
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";

// Phonemizer locale codes that Kokoro accepts (misaki handles English,
// espeak-ng handles the rest). Declared `as const` so that `SupportedLang`
// is derived from this list instead of being maintained separately.
export const SUPPORTED_LANGS = [
  "en-us",
  "en-gb",
  "es",
  "fr-fr",
  "hi",
  "it",
  "pt-br",
  "ja",
  "zh",
] as const;

/** Union of every locale string listed in `SUPPORTED_LANGS`. */
export type SupportedLang = (typeof SUPPORTED_LANGS)[number];

// Kokoro voice IDs have the shape `<lang><gender>_<name>`: the first letter
// selects the language and the second the gender.
// See https://github.com/hexgrad/kokoro.
const VOICE_PREFIX_LANG: Record<string, SupportedLang> = {
  a: "en-us", // American English
  b: "en-gb", // British English
  e: "es", // Spanish
  f: "fr-fr", // French
  h: "hi", // Hindi
  i: "it", // Italian
  j: "ja", // Japanese
  p: "pt-br", // Brazilian Portuguese
  z: "zh", // Mandarin
};

/**
 * Derive the phonemizer locale from the first letter of a Kokoro voice ID.
 *
 * The lookup ignores the letter's case. An empty ID or a letter that is not
 * in the prefix table yields `en-us`.
 *
 * @param voiceId - Kokoro voice identifier, e.g. `ef_dora`.
 * @returns The locale mapped to the ID's first letter, or `en-us`.
 */
export function inferLangFromVoiceId(voiceId: string): SupportedLang {
  const prefix = voiceId.slice(0, 1).toLowerCase();
  const mapped = VOICE_PREFIX_LANG[prefix];
  return mapped !== undefined ? mapped : "en-us";
}

/**
 * Type guard: reports whether `value` is exactly one of `SUPPORTED_LANGS`.
 * The comparison is case-sensitive.
 *
 * @param value - Candidate locale string.
 * @returns `true` when `value` appears in `SUPPORTED_LANGS`.
 */
export function isSupportedLang(value: string): value is SupportedLang {
  return SUPPORTED_LANGS.some((lang) => lang === value);
}

// ---------------------------------------------------------------------------
// Voices — Kokoro ships 54 voices across 8 languages. We expose a curated
// default set and allow users to specify any valid Kokoro voice ID.
Expand All @@ -38,6 +83,10 @@ export const BUNDLED_VOICES: VoiceInfo[] = [
{ id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" },
{ id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" },
{ id: "bm_george", label: "George", language: "en-GB", gender: "male" },
{ id: "ef_dora", label: "Dora", language: "es", gender: "female" },
{ id: "ff_siwis", label: "Siwis", language: "fr-FR", gender: "female" },
{ id: "jf_alpha", label: "Alpha", language: "ja", gender: "female" },
{ id: "zf_xiaobei", label: "Xiaobei", language: "zh", gender: "female" },
];

// Default Kokoro voice ID. By the `<lang><gender>_<name>` convention, "af" is
// an American English ("a") female ("f") voice.
// NOTE(review): presumably the voice used when --voice is omitted — confirm against the command.
export const DEFAULT_VOICE = "af_heart";
Expand Down
Loading
Loading