diff --git a/.changeset/realtime-chat.md b/.changeset/realtime-chat.md new file mode 100644 index 000000000..43d570b19 --- /dev/null +++ b/.changeset/realtime-chat.md @@ -0,0 +1,18 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-client': minor +'@tanstack/ai-openai': minor +'@tanstack/ai-elevenlabs': minor +'@tanstack/ai-react': minor +--- + +feat: add realtime voice chat with OpenAI and ElevenLabs adapters + +Adds realtime voice/text chat capabilities: + +- **@tanstack/ai**: `realtimeToken()` function and shared realtime types (`RealtimeToken`, `RealtimeMessage`, `RealtimeSessionConfig`, `RealtimeStatus`, `RealtimeMode`, `AudioVisualization`, events, and error types) +- **@tanstack/ai-client**: Framework-agnostic `RealtimeClient` class with connection lifecycle, audio I/O, message state management, tool execution, and `RealtimeAdapter`/`RealtimeConnection` interfaces +- **@tanstack/ai-openai**: `openaiRealtime()` client adapter (WebRTC) and `openaiRealtimeToken()` server token adapter with support for semantic VAD, multiple voices, and all realtime models +- **@tanstack/ai-elevenlabs**: `elevenlabsRealtime()` client adapter (WebSocket) and `elevenlabsRealtimeToken()` server token adapter for ElevenLabs conversational AI agents +- **@tanstack/ai-react**: `useRealtimeChat()` hook with reactive state for status, mode, messages, pending transcripts, audio visualization levels, VAD control, text/image input, and interruptions +- **Docs**: Realtime Voice Chat guide and full API reference for all realtime classes, interfaces, functions, and type aliases diff --git a/docs/config.json b/docs/config.json index 23330cb22..490bbf3e7 100644 --- a/docs/config.json +++ b/docs/config.json @@ -78,6 +78,10 @@ "label": "Runtime Adapter Switching", "to": "guides/runtime-adapter-switching" }, + { + "label": "Realtime Voice Chat", + "to": "guides/realtime-chat" + }, { "label": "Text-to-Speech", "to": "guides/text-to-speech" @@ -228,6 +232,10 @@ "label": "ToolCallManager", "to": 
"reference/classes/ToolCallManager" }, + { + "label": "RealtimeClient", + "to": "reference/classes/RealtimeClient" + }, { "label": "WordBoundaryStrategy", "to": "reference/classes/WordBoundaryStrategy" @@ -315,6 +323,10 @@ "label": "uiMessageToModelMessages", "to": "reference/functions/uiMessageToModelMessages" }, + { + "label": "realtimeToken", + "to": "reference/functions/realtimeToken" + }, { "label": "untilFinishReason", "to": "reference/functions/untilFinishReason" @@ -346,6 +358,10 @@ "label": "AudioPart", "to": "reference/interfaces/AudioPart" }, + { + "label": "AudioVisualization", + "to": "reference/interfaces/AudioVisualization" + }, { "label": "BaseStreamChunk", "to": "reference/interfaces/BaseStreamChunk" @@ -426,6 +442,38 @@ "label": "ProcessorState", "to": "reference/interfaces/ProcessorState" }, + { + "label": "RealtimeAdapter", + "to": "reference/interfaces/RealtimeAdapter" + }, + { + "label": "RealtimeClientOptions", + "to": "reference/interfaces/RealtimeClientOptions" + }, + { + "label": "RealtimeConnection", + "to": "reference/interfaces/RealtimeConnection" + }, + { + "label": "RealtimeMessage", + "to": "reference/interfaces/RealtimeMessage" + }, + { + "label": "RealtimeSessionConfig", + "to": "reference/interfaces/RealtimeSessionConfig" + }, + { + "label": "RealtimeToken", + "to": "reference/interfaces/RealtimeToken" + }, + { + "label": "RealtimeTokenAdapter", + "to": "reference/interfaces/RealtimeTokenAdapter" + }, + { + "label": "RealtimeTokenOptions", + "to": "reference/interfaces/RealtimeTokenOptions" + }, { "label": "ResponseFormat", "to": "reference/interfaces/ResponseFormat" @@ -581,6 +629,22 @@ "label": "MessagePart", "to": "reference/type-aliases/MessagePart" }, + { + "label": "RealtimeEvent", + "to": "reference/type-aliases/RealtimeEvent" + }, + { + "label": "RealtimeMessagePart", + "to": "reference/type-aliases/RealtimeMessagePart" + }, + { + "label": "RealtimeMode", + "to": "reference/type-aliases/RealtimeMode" + }, + { + "label": 
"RealtimeStatus", + "to": "reference/type-aliases/RealtimeStatus" + }, { "label": "ModalitiesArrayToUnion", "to": "reference/type-aliases/ModalitiesArrayToUnion" diff --git a/docs/guides/realtime-chat.md b/docs/guides/realtime-chat.md new file mode 100644 index 000000000..afe2943c6 --- /dev/null +++ b/docs/guides/realtime-chat.md @@ -0,0 +1,446 @@ +--- +title: Realtime Voice Chat +id: realtime-chat +order: 14 +--- + +TanStack AI provides a complete realtime voice chat system for building voice-to-voice AI interactions. The realtime API supports multiple providers (OpenAI, ElevenLabs), automatic tool execution, audio visualization, and multimodal input including images. + +## Overview + +Realtime voice chat differs from text-based chat in several key ways: + +- **Bidirectional audio** - Users speak into a microphone, and the AI responds with synthesized voice +- **Voice Activity Detection (VAD)** - Automatically detects when the user starts and stops speaking +- **Interruptions** - Users can interrupt the AI mid-response +- **Low latency** - Uses WebRTC or WebSocket connections for near-instant communication +- **Multimodal** - Supports text input, image input, and tool calling alongside voice + +The realtime system follows the same adapter architecture as the rest of TanStack AI: + +1. **Server** generates ephemeral tokens using `realtimeToken()` with a provider-specific token adapter +2. **Client** connects using `RealtimeClient` (or `useRealtimeChat` in React) with a provider-specific connection adapter +3. **Provider adapters** handle the protocol differences between OpenAI WebRTC, ElevenLabs WebSocket, etc. + +## Quick Start + +### 1. 
Set Up the Server Token Endpoint + +The server generates short-lived tokens so your API keys never reach the client: + +```typescript +import { realtimeToken } from '@tanstack/ai' +import { openaiRealtimeToken } from '@tanstack/ai-openai' +import { createServerFn } from '@tanstack/react-start' + +const getRealtimeToken = createServerFn({ method: 'POST' }) + .handler(async () => { + return realtimeToken({ + adapter: openaiRealtimeToken({ + model: 'gpt-4o-realtime-preview', + }), + }) + }) +``` + +> **Note:** The `realtimeToken()` function works with any server framework. The example above uses TanStack Start, but you can use Express, Hono, Fastify, or any other framework that can handle HTTP requests. + +### 2. Connect from the Client (React) + +```typescript +import { useRealtimeChat } from '@tanstack/ai-react' +import { openaiRealtime } from '@tanstack/ai-openai' + +function VoiceChat() { + const { + status, + mode, + messages, + connect, + disconnect, + pendingUserTranscript, + pendingAssistantTranscript, + inputLevel, + outputLevel, + } = useRealtimeChat({ + getToken: () => fetch('/api/realtime-token', { method: 'POST' }).then(r => r.json()), + adapter: openaiRealtime(), + instructions: 'You are a helpful voice assistant.', + voice: 'alloy', + }) + + return ( +
+    <div>
+      <div>Status: {status}</div>
+      <div>Mode: {mode}</div>
+
+      {pendingUserTranscript && <div>You: {pendingUserTranscript}...</div>}
+      {pendingAssistantTranscript && <div>AI: {pendingAssistantTranscript}...</div>}
+      {messages.map((msg) => (
+        <div key={msg.id}>
+          <strong>{msg.role}:</strong>
+          {msg.parts.map((part, i) => (
+            <span key={i}>
+              {part.type === 'text' ? part.content : null}
+              {part.type === 'audio' ? part.transcript : null}
+            </span>
+          ))}
+        </div>
+      ))}
+ ) +} +``` + +## Providers + +### OpenAI Realtime + +OpenAI's realtime API uses WebRTC for low-latency voice communication. + +**Server (token generation):** + +```typescript +import { realtimeToken } from '@tanstack/ai' +import { openaiRealtimeToken } from '@tanstack/ai-openai' + +const token = await realtimeToken({ + adapter: openaiRealtimeToken({ + model: 'gpt-4o-realtime-preview', + }), +}) +``` + +**Client (connection):** + +```typescript +import { openaiRealtime } from '@tanstack/ai-openai' + +const adapter = openaiRealtime() +``` + +**Environment variables:** `OPENAI_API_KEY` + +**Available models:** + +| Model | Description | +|-------|-------------| +| `gpt-4o-realtime-preview` | Full realtime model | +| `gpt-4o-mini-realtime-preview` | Smaller, faster realtime model | +| `gpt-realtime` | Latest realtime model | +| `gpt-realtime-mini` | Latest mini realtime model | + +**Available voices:** `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, `cedar` + +### ElevenLabs Realtime + +ElevenLabs uses WebSocket connections and requires an agent configured in their dashboard. + +**Server (token generation):** + +```typescript +import { realtimeToken } from '@tanstack/ai' +import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' + +const token = await realtimeToken({ + adapter: elevenlabsRealtimeToken({ + agentId: 'your-agent-id', + }), +}) +``` + +**Client (connection):** + +```typescript +import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' + +const adapter = elevenlabsRealtime() +``` + +**Environment variables:** `ELEVENLABS_API_KEY`, `ELEVENLABS_AGENT_ID` (optional) + +## Voice Activity Detection (VAD) + +VAD controls how the system detects when the user is speaking. 
Three modes are available: + +| Mode | Description | +|------|-------------| +| `server` | Provider handles speech detection server-side (default) | +| `semantic` | Uses semantic understanding to detect turn boundaries (OpenAI only) | +| `manual` | Application controls when to listen via `startListening()`/`stopListening()` | + +```typescript +const chat = useRealtimeChat({ + // ... + vadMode: 'semantic', + semanticEagerness: 'medium', // 'low' | 'medium' | 'high' +}) +``` + +With `manual` VAD mode, use push-to-talk style interactions: + +```typescript +const { startListening, stopListening } = useRealtimeChat({ + vadMode: 'manual', + autoCapture: false, + // ... +}) + +// In your UI + +``` + +## Tools + +Realtime sessions support client-side tools. Define tools using the standard `toolDefinition()` API and pass their client implementations: + +```typescript +import { toolDefinition } from '@tanstack/ai' +import { z } from 'zod' + +const getWeatherDef = toolDefinition({ + name: 'getWeather', + description: 'Get weather for a location', + inputSchema: z.object({ + location: z.string().meta({ description: 'City name' }), + }), + outputSchema: z.object({ + temperature: z.number(), + conditions: z.string(), + }), +}) + +const getWeather = getWeatherDef.client(async ({ location }) => { + const res = await fetch(`/api/weather?location=${location}`) + return res.json() +}) + +// Pass tools to the hook +const chat = useRealtimeChat({ + // ... + tools: [getWeather], +}) +``` + +The realtime client automatically executes tool calls and sends results back to the provider. Tool calls appear as `tool-call` and `tool-result` parts in messages. + +## Text and Image Input + +In addition to voice, you can send text messages and images: + +```typescript +const { sendText, sendImage } = useRealtimeChat({ /* ... 
*/ }) + +// Send a text message +sendText('What is the weather like today?') + +// Send an image (base64 data or URL) +sendImage(base64ImageData, 'image/png') +``` + +## Audio Visualization + +The hook provides real-time audio level data for building visualizations: + +```typescript +const { + inputLevel, // 0-1 normalized microphone volume + outputLevel, // 0-1 normalized speaker volume + getInputFrequencyData, // Uint8Array for frequency spectrum + getOutputFrequencyData, + getInputTimeDomainData, // Uint8Array for waveform + getOutputTimeDomainData, +} = useRealtimeChat({ /* ... */ }) +``` + +The `inputLevel` and `outputLevel` values update on every animation frame while connected, making them suitable for driving CSS animations or canvas visualizations: + +```typescript +function AudioIndicator({ level }: { level: number }) { + return ( +
+ ) +} +``` + +For more detailed visualizations, use the frequency and time-domain data getters inside a `requestAnimationFrame` loop. + +## Session Configuration + +Configure the realtime session through the hook options: + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `getToken` | `() => Promise` | required | Function to fetch a token from the server | +| `adapter` | `RealtimeAdapter` | required | Provider adapter (`openaiRealtime()`, `elevenlabsRealtime()`) | +| `instructions` | `string` | — | System instructions for the assistant | +| `voice` | `string` | — | Voice to use for audio output | +| `tools` | `AnyClientTool[]` | — | Client-side tools with execution logic | +| `vadMode` | `'server' \| 'semantic' \| 'manual'` | `'server'` | Voice activity detection mode | +| `semanticEagerness` | `'low' \| 'medium' \| 'high'` | — | Eagerness for semantic VAD | +| `autoPlayback` | `boolean` | `true` | Auto-play assistant audio | +| `autoCapture` | `boolean` | `true` | Request microphone on connect | +| `outputModalities` | `Array<'audio' \| 'text'>` | — | Response modalities | +| `temperature` | `number` | — | Generation temperature | +| `maxOutputTokens` | `number \| 'inf'` | — | Max tokens in a response | + +## Connection Lifecycle + +The realtime client manages a connection lifecycle with these statuses: + +| Status | Description | +|--------|-------------| +| `idle` | Not connected | +| `connecting` | Establishing connection | +| `connected` | Active session | +| `reconnecting` | Reconnecting after interruption | +| `error` | Connection error occurred | + +And these modes while connected: + +| Mode | Description | +|------|-------------| +| `idle` | Connected but not actively interacting | +| `listening` | Capturing user audio input | +| `thinking` | Processing user input | +| `speaking` | AI is generating a response | + +```typescript +const { status, mode, error, connect, disconnect } = useRealtimeChat({ /* ... 
*/ }) + +// Handle connection +useEffect(() => { + if (status === 'error' && error) { + console.error('Connection error:', error.message) + } +}, [status, error]) +``` + +## Interruptions + +Users can interrupt the AI while it's speaking: + +```typescript +const { interrupt, mode } = useRealtimeChat({ /* ... */ }) + +// Programmatically interrupt +if (mode === 'speaking') { + interrupt() +} +``` + +With server or semantic VAD, interruptions happen automatically when the user starts speaking. Interrupted messages are marked with `interrupted: true` in the messages array. + +## Using RealtimeClient Directly + +For non-React applications or more control, use `RealtimeClient` directly: + +```typescript +import { RealtimeClient } from '@tanstack/ai-client' +import { openaiRealtime } from '@tanstack/ai-openai' + +const client = new RealtimeClient({ + getToken: () => fetch('/api/realtime-token', { method: 'POST' }).then(r => r.json()), + adapter: openaiRealtime(), + instructions: 'You are a helpful assistant.', + voice: 'alloy', + onMessage: (message) => { + console.log(`${message.role}:`, message.parts) + }, + onStatusChange: (status) => { + console.log('Status:', status) + }, + onModeChange: (mode) => { + console.log('Mode:', mode) + }, +}) + +// Connect +await client.connect() + +// Send text +client.sendText('Hello!') + +// Subscribe to state changes +const unsub = client.onStateChange((state) => { + console.log('Messages:', state.messages.length) +}) + +// Disconnect when done +await client.disconnect() + +// Clean up +client.destroy() +``` + +## Message Structure + +Realtime messages use a `parts`-based structure similar to `UIMessage`: + +```typescript +interface RealtimeMessage { + id: string + role: 'user' | 'assistant' + timestamp: number + parts: Array + interrupted?: boolean +} +``` + +Each part can be one of: + +| Part Type | Fields | Description | +|-----------|--------|-------------| +| `text` | `content` | Text content from `sendText()` | +| `audio` | 
`transcript`, `durationMs` | Transcribed voice content | +| `tool-call` | `id`, `name`, `arguments`, `input`, `output` | Tool invocation | +| `tool-result` | `toolCallId`, `content` | Tool execution result | +| `image` | `data`, `mimeType` | Image sent via `sendImage()` | + +## Error Handling + +Handle errors through the `onError` callback or the `error` state: + +```typescript +const { error } = useRealtimeChat({ + // ... + onError: (err) => { + if (err.message.includes('Permission denied')) { + alert('Microphone access is required for voice chat.') + } else { + console.error('Realtime error:', err) + } + }, +}) +``` + +## Best Practices + +1. **Token security** - Always generate tokens server-side. Never expose API keys to the client. +2. **Microphone permissions** - Handle the case where the user denies microphone access gracefully. +3. **Cleanup** - Always disconnect when unmounting components. The `useRealtimeChat` hook handles this automatically. +4. **Instructions** - Keep voice assistant instructions concise. Remind the model it's in a voice interface so responses stay conversational. +5. **Tool design** - Keep tool descriptions clear and tool outputs small, since results are processed in real time. +6. **Error recovery** - Implement retry logic for transient connection failures. 
+ +## Next Steps + +- [Tools](./tools) - Learn about the isomorphic tool system +- [Text-to-Speech](./text-to-speech) - Non-realtime speech generation +- [Multimodal Content](./multimodal-content) - Working with images, audio, and video diff --git a/docs/reference/classes/RealtimeClient.md b/docs/reference/classes/RealtimeClient.md new file mode 100644 index 000000000..6e78100de --- /dev/null +++ b/docs/reference/classes/RealtimeClient.md @@ -0,0 +1,276 @@ +--- +id: RealtimeClient +title: RealtimeClient +--- + +# Class: RealtimeClient + +Defined in: [realtime-client.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai-client/src/realtime-client.ts) + +Client for managing realtime voice conversations. + +Handles connection lifecycle, audio I/O, message state, and tool execution for realtime voice-to-voice AI interactions. This is the framework-agnostic core that powers `useRealtimeChat` in React. + +## Example + +```typescript +import { RealtimeClient } from '@tanstack/ai-client' +import { openaiRealtime } from '@tanstack/ai-openai' + +const client = new RealtimeClient({ + getToken: () => fetch('/api/realtime-token').then(r => r.json()), + adapter: openaiRealtime(), + tools: [myTool.client(handler)], + onMessage: (msg) => console.log('Message:', msg), +}) + +await client.connect() +``` + +## Constructors + +### Constructor + +```ts +new RealtimeClient(options): RealtimeClient; +``` + +#### Parameters + +##### options + +[`RealtimeClientOptions`](../interfaces/RealtimeClientOptions.md) + +Configuration options for the client. + +#### Returns + +`RealtimeClient` + +## Properties + +### status + +```ts +readonly status: RealtimeStatus; +``` + +Current connection status (`'idle'`, `'connecting'`, `'connected'`, `'reconnecting'`, `'error'`). + +*** + +### mode + +```ts +readonly mode: RealtimeMode; +``` + +Current session mode (`'idle'`, `'listening'`, `'thinking'`, `'speaking'`). 
+ +*** + +### messages + +```ts +readonly messages: Array; +``` + +Array of conversation messages. Updated as transcripts are finalized and messages complete. + +*** + +### error + +```ts +readonly error: Error | null; +``` + +Current error, if any. + +*** + +### pendingUserTranscript + +```ts +readonly pendingUserTranscript: string | null; +``` + +Partial transcript of what the user is currently saying (before finalization). + +*** + +### pendingAssistantTranscript + +```ts +readonly pendingAssistantTranscript: string | null; +``` + +Partial transcript of the assistant's current response (while speaking). + +*** + +### audio + +```ts +readonly audio: AudioVisualization | null; +``` + +Audio visualization data for the current connection. Returns `null` when not connected. + +## Methods + +### connect() + +```ts +connect(): Promise; +``` + +Connect to the realtime session. Fetches a token via `getToken()` and establishes the connection through the adapter. + +#### Returns + +`Promise` + +#### Throws + +If token fetch or connection fails. + +*** + +### disconnect() + +```ts +disconnect(): Promise; +``` + +Disconnect from the realtime session. Cleans up audio resources, event subscriptions, and token refresh timers. + +#### Returns + +`Promise` + +*** + +### startListening() + +```ts +startListening(): void; +``` + +Start listening for voice input. Only needed when `vadMode` is `'manual'`. + +#### Returns + +`void` + +*** + +### stopListening() + +```ts +stopListening(): void; +``` + +Stop listening for voice input. Only needed when `vadMode` is `'manual'`. + +#### Returns + +`void` + +*** + +### interrupt() + +```ts +interrupt(): void; +``` + +Interrupt the current assistant response. + +#### Returns + +`void` + +*** + +### sendText() + +```ts +sendText(text): void; +``` + +Send a text message instead of voice. 
+ +#### Parameters + +##### text + +`string` + +#### Returns + +`void` + +*** + +### sendImage() + +```ts +sendImage(imageData, mimeType): void; +``` + +Send an image to the conversation. + +#### Parameters + +##### imageData + +`string` + +Base64-encoded image data or a URL. + +##### mimeType + +`string` + +MIME type of the image (e.g., `'image/png'`, `'image/jpeg'`). + +#### Returns + +`void` + +*** + +### onStateChange() + +```ts +onStateChange(callback): () => void; +``` + +Subscribe to state changes. The callback is invoked whenever the internal state updates (status, mode, messages, transcripts, errors). + +#### Parameters + +##### callback + +[`RealtimeStateChangeCallback`](../type-aliases/RealtimeStateChangeCallback.md) + +#### Returns + +`() => void` + +Unsubscribe function. + +*** + +### destroy() + +```ts +destroy(): void; +``` + +Clean up all resources. Disconnects, clears subscriptions, and releases audio resources. Call this when disposing of the client. + +#### Returns + +`void` diff --git a/docs/reference/functions/realtimeToken.md b/docs/reference/functions/realtimeToken.md new file mode 100644 index 000000000..22515aef9 --- /dev/null +++ b/docs/reference/functions/realtimeToken.md @@ -0,0 +1,75 @@ +--- +id: realtimeToken +title: realtimeToken +--- + +# Function: realtimeToken() + +```ts +function realtimeToken(options): Promise; +``` + +Defined in: [realtime/index.ts:33](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/index.ts#L33) + +Generate a realtime token using the provided adapter. + +This function is used on the **server** to generate ephemeral tokens that clients can use to establish realtime connections. The token contains authentication credentials and session configuration, and is typically short-lived (e.g., 10 minutes for OpenAI, 30 minutes for ElevenLabs). 
+ +## Parameters + +### options + +[`RealtimeTokenOptions`](../interfaces/RealtimeTokenOptions.md) + +Token generation options including the provider-specific adapter. + +## Returns + +`Promise<`[`RealtimeToken`](../interfaces/RealtimeToken.md)`>` + +A token containing the provider credentials, expiration, and session config. + +## Examples + +### OpenAI + +```typescript +import { realtimeToken } from '@tanstack/ai' +import { openaiRealtimeToken } from '@tanstack/ai-openai' + +const token = await realtimeToken({ + adapter: openaiRealtimeToken({ + model: 'gpt-4o-realtime-preview', + }), +}) +``` + +### ElevenLabs + +```typescript +import { realtimeToken } from '@tanstack/ai' +import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' + +const token = await realtimeToken({ + adapter: elevenlabsRealtimeToken({ + agentId: 'your-agent-id', + }), +}) +``` + +### TanStack Start Server Function + +```typescript +import { createServerFn } from '@tanstack/react-start' +import { realtimeToken } from '@tanstack/ai' +import { openaiRealtimeToken } from '@tanstack/ai-openai' + +export const getRealtimeToken = createServerFn({ method: 'POST' }) + .handler(async () => { + return realtimeToken({ + adapter: openaiRealtimeToken({ + model: 'gpt-4o-realtime-preview', + }), + }) + }) +``` diff --git a/docs/reference/interfaces/AudioVisualization.md b/docs/reference/interfaces/AudioVisualization.md new file mode 100644 index 000000000..48f9a8a2f --- /dev/null +++ b/docs/reference/interfaces/AudioVisualization.md @@ -0,0 +1,126 @@ +--- +id: AudioVisualization +title: AudioVisualization +--- + +# Interface: AudioVisualization + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Interface for accessing audio visualization data from a realtime connection. Provides volume levels, frequency data, and time-domain data for both input (microphone) and output (speaker) audio. 
+ +## Properties + +### inputLevel + +```ts +readonly inputLevel: number; +``` + +Input volume level (0-1 normalized). + +*** + +### outputLevel + +```ts +readonly outputLevel: number; +``` + +Output volume level (0-1 normalized). + +*** + +### inputSampleRate + +```ts +readonly inputSampleRate: number; +``` + +Input audio sample rate in Hz. + +*** + +### outputSampleRate + +```ts +readonly outputSampleRate: number; +``` + +Output audio sample rate in Hz. + +## Methods + +### getInputFrequencyData() + +```ts +getInputFrequencyData(): Uint8Array; +``` + +Get frequency data for input audio visualization. + +#### Returns + +`Uint8Array` + +*** + +### getOutputFrequencyData() + +```ts +getOutputFrequencyData(): Uint8Array; +``` + +Get frequency data for output audio visualization. + +#### Returns + +`Uint8Array` + +*** + +### getInputTimeDomainData() + +```ts +getInputTimeDomainData(): Uint8Array; +``` + +Get time domain data for input waveform visualization. + +#### Returns + +`Uint8Array` + +*** + +### getOutputTimeDomainData() + +```ts +getOutputTimeDomainData(): Uint8Array; +``` + +Get time domain data for output waveform visualization. + +#### Returns + +`Uint8Array` + +*** + +### onInputAudio? + +```ts +optional onInputAudio: (callback: (samples: Float32Array, sampleRate: number) => void) => () => void; +``` + +Subscribe to raw input audio samples. Returns an unsubscribe function. + +*** + +### onOutputAudio? + +```ts +optional onOutputAudio: (callback: (samples: Float32Array, sampleRate: number) => void) => () => void; +``` + +Subscribe to raw output audio samples. Returns an unsubscribe function. 
diff --git a/docs/reference/interfaces/RealtimeAdapter.md b/docs/reference/interfaces/RealtimeAdapter.md new file mode 100644 index 000000000..caea97869 --- /dev/null +++ b/docs/reference/interfaces/RealtimeAdapter.md @@ -0,0 +1,48 @@ +--- +id: RealtimeAdapter +title: RealtimeAdapter +--- + +# Interface: RealtimeAdapter + +Defined in: [realtime-types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai-client/src/realtime-types.ts) + +Adapter interface for connecting to realtime providers. Each provider (OpenAI, ElevenLabs, etc.) implements this interface. + +## Properties + +### provider + +```ts +provider: string; +``` + +Provider identifier (e.g., `'openai'`, `'elevenlabs'`). + +## Methods + +### connect() + +```ts +connect(token, clientTools?): Promise; +``` + +Create a connection using the provided token. + +#### Parameters + +##### token + +[`RealtimeToken`](./RealtimeToken.md) + +The ephemeral token from the server. + +##### clientTools? + +`ReadonlyArray` + +Optional client-side tools to register with the provider. + +#### Returns + +`Promise<`[`RealtimeConnection`](./RealtimeConnection.md)`>` diff --git a/docs/reference/interfaces/RealtimeClientOptions.md b/docs/reference/interfaces/RealtimeClientOptions.md new file mode 100644 index 000000000..7a3a62034 --- /dev/null +++ b/docs/reference/interfaces/RealtimeClientOptions.md @@ -0,0 +1,204 @@ +--- +id: RealtimeClientOptions +title: RealtimeClientOptions +--- + +# Interface: RealtimeClientOptions + +Defined in: [realtime-types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai-client/src/realtime-types.ts) + +Options for the `RealtimeClient` and `useRealtimeChat` hook. + +## Properties + +### getToken + +```ts +getToken: () => Promise; +``` + +Function to fetch a realtime token from the server. Called on connect and when the token needs refresh. 
+ +*** + +### adapter + +```ts +adapter: RealtimeAdapter; +``` + +The realtime adapter to use (e.g., `openaiRealtime()`, `elevenlabsRealtime()`). + +*** + +### tools? + +```ts +optional tools: ReadonlyArray; +``` + +Client-side tools with execution logic. + +*** + +### autoPlayback? + +```ts +optional autoPlayback: boolean; +``` + +Auto-play assistant audio responses. Default: `true`. + +*** + +### autoCapture? + +```ts +optional autoCapture: boolean; +``` + +Request microphone access on connect. Default: `true`. + +*** + +### instructions? + +```ts +optional instructions: string; +``` + +System instructions for the assistant. + +*** + +### voice? + +```ts +optional voice: string; +``` + +Voice to use for audio output (provider-specific, e.g., `'alloy'` for OpenAI). + +*** + +### vadMode? + +```ts +optional vadMode: 'server' | 'semantic' | 'manual'; +``` + +Voice activity detection mode. Default: `'server'`. + +- `'server'` — Provider handles speech detection server-side +- `'semantic'` — Semantic turn detection (OpenAI only) +- `'manual'` — Application controls via `startListening()`/`stopListening()` + +*** + +### outputModalities? + +```ts +optional outputModalities: Array<'audio' | 'text'>; +``` + +Output modalities for responses. + +*** + +### temperature? + +```ts +optional temperature: number; +``` + +Temperature for generation (provider-specific range). + +*** + +### maxOutputTokens? + +```ts +optional maxOutputTokens: number | 'inf'; +``` + +Maximum number of tokens in a response. + +*** + +### semanticEagerness? + +```ts +optional semanticEagerness: 'low' | 'medium' | 'high'; +``` + +Eagerness level for semantic VAD. + +*** + +### onStatusChange? + +```ts +optional onStatusChange: (status: RealtimeStatus) => void; +``` + +Called when connection status changes. + +*** + +### onModeChange? + +```ts +optional onModeChange: (mode: RealtimeMode) => void; +``` + +Called when session mode changes. + +*** + +### onMessage? 
+ +```ts +optional onMessage: (message: RealtimeMessage) => void; +``` + +Called when a new message is added to the conversation. + +*** + +### onError? + +```ts +optional onError: (error: Error) => void; +``` + +Called when an error occurs. + +*** + +### onConnect? + +```ts +optional onConnect: () => void; +``` + +Called when connection is established. + +*** + +### onDisconnect? + +```ts +optional onDisconnect: () => void; +``` + +Called when disconnected. + +*** + +### onInterrupted? + +```ts +optional onInterrupted: () => void; +``` + +Called when the assistant's response is interrupted. diff --git a/docs/reference/interfaces/RealtimeConnection.md b/docs/reference/interfaces/RealtimeConnection.md new file mode 100644 index 000000000..00eff5faf --- /dev/null +++ b/docs/reference/interfaces/RealtimeConnection.md @@ -0,0 +1,210 @@ +--- +id: RealtimeConnection +title: RealtimeConnection +--- + +# Interface: RealtimeConnection + +Defined in: [realtime-types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai-client/src/realtime-types.ts) + +Connection interface representing an active realtime session. Handles audio I/O, events, and session management. Returned by `RealtimeAdapter.connect()`. + +## Methods + +### disconnect() + +```ts +disconnect(): Promise; +``` + +Disconnect from the realtime session. + +#### Returns + +`Promise` + +*** + +### startAudioCapture() + +```ts +startAudioCapture(): Promise; +``` + +Start capturing audio from the microphone. + +#### Returns + +`Promise` + +*** + +### stopAudioCapture() + +```ts +stopAudioCapture(): void; +``` + +Stop capturing audio. + +#### Returns + +`void` + +*** + +### sendText() + +```ts +sendText(text): void; +``` + +Send a text message (fallback for when voice isn't available). + +#### Parameters + +##### text + +`string` + +#### Returns + +`void` + +*** + +### sendImage() + +```ts +sendImage(imageData, mimeType): void; +``` + +Send an image to the conversation. 
+ +#### Parameters + +##### imageData + +`string` + +Base64-encoded image data or a URL. + +##### mimeType + +`string` + +MIME type of the image. + +#### Returns + +`void` + +*** + +### sendToolResult() + +```ts +sendToolResult(callId, result): void; +``` + +Send a tool execution result back to the provider. + +#### Parameters + +##### callId + +`string` + +The tool call identifier. + +##### result + +`string` + +JSON-serialized result. + +#### Returns + +`void` + +*** + +### updateSession() + +```ts +updateSession(config): void; +``` + +Update session configuration. + +#### Parameters + +##### config + +`Partial` + +#### Returns + +`void` + +*** + +### interrupt() + +```ts +interrupt(): void; +``` + +Interrupt the current response. + +#### Returns + +`void` + +*** + +### on() + +```ts +on(event, handler): () => void; +``` + +Subscribe to connection events. + +#### Type Parameters + +##### TEvent + +`TEvent` *extends* `RealtimeEvent` + +#### Parameters + +##### event + +`TEvent` + +The event name (`'status_change'`, `'mode_change'`, `'transcript'`, `'audio_chunk'`, `'tool_call'`, `'message_complete'`, `'interrupted'`, `'error'`). + +##### handler + +`RealtimeEventHandler` + +#### Returns + +`() => void` + +Unsubscribe function. + +*** + +### getAudioVisualization() + +```ts +getAudioVisualization(): AudioVisualization; +``` + +Get audio visualization data for rendering level meters or waveforms. + +#### Returns + +[`AudioVisualization`](./AudioVisualization.md) diff --git a/docs/reference/interfaces/RealtimeMessage.md b/docs/reference/interfaces/RealtimeMessage.md new file mode 100644 index 000000000..ea83dfef8 --- /dev/null +++ b/docs/reference/interfaces/RealtimeMessage.md @@ -0,0 +1,80 @@ +--- +id: RealtimeMessage +title: RealtimeMessage +--- + +# Interface: RealtimeMessage + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +A message in a realtime conversation. 
Contains one or more content parts representing text, audio, tool calls, or images. + +## Properties + +### id + +```ts +id: string; +``` + +Unique message identifier. + +*** + +### role + +```ts +role: 'user' | 'assistant'; +``` + +Message role. + +*** + +### timestamp + +```ts +timestamp: number; +``` + +Timestamp when the message was created (milliseconds since epoch). + +*** + +### parts + +```ts +parts: Array; +``` + +Content parts of the message. Can include `RealtimeTextPart`, `RealtimeAudioPart`, `RealtimeToolCallPart`, `RealtimeToolResultPart`, or `RealtimeImagePart`. + +*** + +### interrupted? + +```ts +optional interrupted: boolean; +``` + +Whether this message was interrupted by the user. + +*** + +### audioId? + +```ts +optional audioId: string; +``` + +Reference to audio buffer if stored. + +*** + +### durationMs? + +```ts +optional durationMs: number; +``` + +Duration of the audio in milliseconds. diff --git a/docs/reference/interfaces/RealtimeSessionConfig.md b/docs/reference/interfaces/RealtimeSessionConfig.md new file mode 100644 index 000000000..8986a73fc --- /dev/null +++ b/docs/reference/interfaces/RealtimeSessionConfig.md @@ -0,0 +1,120 @@ +--- +id: RealtimeSessionConfig +title: RealtimeSessionConfig +--- + +# Interface: RealtimeSessionConfig + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Configuration for a realtime session. Passed to the provider to configure model behavior, voice, tools, and VAD settings. + +## Properties + +### model? + +```ts +optional model: string; +``` + +Model to use for the session. + +*** + +### voice? + +```ts +optional voice: string; +``` + +Voice to use for audio output. + +*** + +### instructions? + +```ts +optional instructions: string; +``` + +System instructions for the assistant. + +*** + +### tools? + +```ts +optional tools: Array; +``` + +Tools available in the session. + +*** + +### vadMode? 
+ +```ts +optional vadMode: 'server' | 'semantic' | 'manual'; +``` + +Voice activity detection mode. + +*** + +### vadConfig? + +```ts +optional vadConfig: VADConfig; +``` + +Detailed VAD configuration (threshold, padding, silence duration). + +*** + +### outputModalities? + +```ts +optional outputModalities: Array<'audio' | 'text'>; +``` + +Output modalities for responses (e.g., `['audio', 'text']`). + +*** + +### temperature? + +```ts +optional temperature: number; +``` + +Temperature for generation (provider-specific range, e.g., 0.6-1.2 for OpenAI). + +*** + +### maxOutputTokens? + +```ts +optional maxOutputTokens: number | 'inf'; +``` + +Maximum number of tokens in a response. + +*** + +### semanticEagerness? + +```ts +optional semanticEagerness: 'low' | 'medium' | 'high'; +``` + +Eagerness level for semantic VAD. + +*** + +### providerOptions? + +```ts +optional providerOptions: Record; +``` + +Provider-specific options. diff --git a/docs/reference/interfaces/RealtimeToken.md b/docs/reference/interfaces/RealtimeToken.md new file mode 100644 index 000000000..43479fb1f --- /dev/null +++ b/docs/reference/interfaces/RealtimeToken.md @@ -0,0 +1,50 @@ +--- +id: RealtimeToken +title: RealtimeToken +--- + +# Interface: RealtimeToken + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Token returned by the server for client authentication. Contains the ephemeral credentials, expiration time, and session configuration for a realtime connection. + +## Properties + +### provider + +```ts +provider: string; +``` + +Provider identifier (e.g., `'openai'`, `'elevenlabs'`). + +*** + +### token + +```ts +token: string; +``` + +The ephemeral token value. For OpenAI, this is a client secret. For ElevenLabs, this is a signed URL. + +*** + +### expiresAt + +```ts +expiresAt: number; +``` + +Token expiration timestamp in milliseconds since epoch. 
 + +*** + +### config + +```ts +config: RealtimeSessionConfig; +``` + +Session configuration embedded in the token (model, voice, instructions, etc.). diff --git a/docs/reference/interfaces/RealtimeTokenAdapter.md b/docs/reference/interfaces/RealtimeTokenAdapter.md new file mode 100644 index 000000000..223978103 --- /dev/null +++ b/docs/reference/interfaces/RealtimeTokenAdapter.md @@ -0,0 +1,34 @@ +--- +id: RealtimeTokenAdapter +title: RealtimeTokenAdapter +--- + +# Interface: RealtimeTokenAdapter + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Adapter interface for generating provider-specific tokens. Implemented by `openaiRealtimeToken()` and `elevenlabsRealtimeToken()`. + +## Properties + +### provider + +```ts +provider: string; +``` + +Provider identifier (e.g., `'openai'`, `'elevenlabs'`). + +## Methods + +### generateToken() + +```ts +generateToken(): Promise<RealtimeToken>; +``` + +Generate an ephemeral token for client use. + +#### Returns + +`Promise<`[`RealtimeToken`](./RealtimeToken.md)`>` diff --git a/docs/reference/interfaces/RealtimeTokenOptions.md b/docs/reference/interfaces/RealtimeTokenOptions.md new file mode 100644 index 000000000..d34f43faf --- /dev/null +++ b/docs/reference/interfaces/RealtimeTokenOptions.md @@ -0,0 +1,23 @@ +--- +id: RealtimeTokenOptions +title: RealtimeTokenOptions +--- + +# Interface: RealtimeTokenOptions + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Options for the `realtimeToken()` function. + +## Properties + +### adapter + +```ts +adapter: RealtimeTokenAdapter; +``` + +The token adapter to use. 
Each provider has its own token adapter: + +- `openaiRealtimeToken()` from `@tanstack/ai-openai` +- `elevenlabsRealtimeToken()` from `@tanstack/ai-elevenlabs` diff --git a/docs/reference/type-aliases/RealtimeEvent.md b/docs/reference/type-aliases/RealtimeEvent.md new file mode 100644 index 000000000..0d7011b81 --- /dev/null +++ b/docs/reference/type-aliases/RealtimeEvent.md @@ -0,0 +1,33 @@ +--- +id: RealtimeEvent +title: RealtimeEvent +--- + +# Type Alias: RealtimeEvent + +```ts +type RealtimeEvent = + | 'status_change' + | 'mode_change' + | 'transcript' + | 'audio_chunk' + | 'tool_call' + | 'message_complete' + | 'interrupted' + | 'error'; +``` + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Events emitted by the realtime connection. Used with `RealtimeConnection.on()` to subscribe to specific events. + +| Event | Payload | Description | +|-------|---------|-------------| +| `'status_change'` | `{ status: RealtimeStatus }` | Connection status changed | +| `'mode_change'` | `{ mode: RealtimeMode }` | Session mode changed | +| `'transcript'` | `{ role, transcript, isFinal }` | Speech transcript (partial or final) | +| `'audio_chunk'` | `{ data: ArrayBuffer, sampleRate }` | Raw audio data received | +| `'tool_call'` | `{ toolCallId, toolName, input }` | Tool call requested by the model | +| `'message_complete'` | `{ message: RealtimeMessage }` | Complete message received | +| `'interrupted'` | `{ messageId?: string }` | Response was interrupted | +| `'error'` | `{ error: Error }` | Error occurred | diff --git a/docs/reference/type-aliases/RealtimeMessagePart.md b/docs/reference/type-aliases/RealtimeMessagePart.md new file mode 100644 index 000000000..75dae4b64 --- /dev/null +++ b/docs/reference/type-aliases/RealtimeMessagePart.md @@ -0,0 +1,27 @@ +--- +id: RealtimeMessagePart +title: RealtimeMessagePart +--- + +# Type Alias: RealtimeMessagePart + +```ts +type RealtimeMessagePart = + | 
RealtimeTextPart + | RealtimeAudioPart + | RealtimeToolCallPart + | RealtimeToolResultPart + | RealtimeImagePart; +``` + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Union of all realtime message part types. + +| Part | `type` Field | Key Properties | +|------|-------------|----------------| +| `RealtimeTextPart` | `'text'` | `content: string` | +| `RealtimeAudioPart` | `'audio'` | `transcript: string`, `durationMs?: number` | +| `RealtimeToolCallPart` | `'tool-call'` | `id`, `name`, `arguments`, `input?`, `output?` | +| `RealtimeToolResultPart` | `'tool-result'` | `toolCallId`, `content` | +| `RealtimeImagePart` | `'image'` | `data: string`, `mimeType: string` | diff --git a/docs/reference/type-aliases/RealtimeMode.md b/docs/reference/type-aliases/RealtimeMode.md new file mode 100644 index 000000000..bb0f8beb6 --- /dev/null +++ b/docs/reference/type-aliases/RealtimeMode.md @@ -0,0 +1,21 @@ +--- +id: RealtimeMode +title: RealtimeMode +--- + +# Type Alias: RealtimeMode + +```ts +type RealtimeMode = 'idle' | 'listening' | 'thinking' | 'speaking'; +``` + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Current mode of the realtime session. 
+ +| Value | Description | +|-------|-------------| +| `'idle'` | Connected but not actively interacting | +| `'listening'` | Capturing user audio input | +| `'thinking'` | Processing user input | +| `'speaking'` | AI is generating a response | diff --git a/docs/reference/type-aliases/RealtimeStateChangeCallback.md b/docs/reference/type-aliases/RealtimeStateChangeCallback.md new file mode 100644 index 000000000..9508cf09b --- /dev/null +++ b/docs/reference/type-aliases/RealtimeStateChangeCallback.md @@ -0,0 +1,14 @@ +--- +id: RealtimeStateChangeCallback +title: RealtimeStateChangeCallback +--- + +# Type Alias: RealtimeStateChangeCallback + +```ts +type RealtimeStateChangeCallback = (state: RealtimeClientState) => void; +``` + +Defined in: [ai-client/src/realtime-types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai-client/src/realtime-types.ts) + +Callback function invoked when the realtime client state changes. Receives the full current `RealtimeClientState`. diff --git a/docs/reference/type-aliases/RealtimeStatus.md b/docs/reference/type-aliases/RealtimeStatus.md new file mode 100644 index 000000000..1872253d5 --- /dev/null +++ b/docs/reference/type-aliases/RealtimeStatus.md @@ -0,0 +1,22 @@ +--- +id: RealtimeStatus +title: RealtimeStatus +--- + +# Type Alias: RealtimeStatus + +```ts +type RealtimeStatus = 'idle' | 'connecting' | 'connected' | 'reconnecting' | 'error'; +``` + +Defined in: [realtime/types.ts](https://github.com/TanStack/ai/blob/main/packages/typescript/ai/src/realtime/types.ts) + +Connection status of the realtime client. 
+ +| Value | Description | +|-------|-------------| +| `'idle'` | Not connected | +| `'connecting'` | Establishing connection | +| `'connected'` | Active session | +| `'reconnecting'` | Reconnecting after interruption | +| `'error'` | Connection error occurred | diff --git a/examples/ts-react-chat/.env.example b/examples/ts-react-chat/.env.example index 613cb664b..2bdb43f49 100644 --- a/examples/ts-react-chat/.env.example +++ b/examples/ts-react-chat/.env.example @@ -1,3 +1,11 @@ # OpenAI API Key # Get yours at: https://platform.openai.com/api-keys -OPENAI_API_KEY=sk-... \ No newline at end of file +OPENAI_API_KEY=sk-... + +# ElevenLabs API Key (for realtime voice) +# Get yours at: https://elevenlabs.io/app/settings/api-keys +ELEVENLABS_API_KEY=xi-... + +# ElevenLabs Agent ID (for realtime voice) +# Create an agent at: https://elevenlabs.io/app/conversational-ai +ELEVENLABS_AGENT_ID=... diff --git a/examples/ts-react-chat/package.json b/examples/ts-react-chat/package.json index db2974a56..3e1b9ca3b 100644 --- a/examples/ts-react-chat/package.json +++ b/examples/ts-react-chat/package.json @@ -13,6 +13,7 @@ "@tanstack/ai": "workspace:*", "@tanstack/ai-anthropic": "workspace:*", "@tanstack/ai-client": "workspace:*", + "@tanstack/ai-elevenlabs": "workspace:*", "@tanstack/ai-gemini": "workspace:*", "@tanstack/ai-grok": "workspace:*", "@tanstack/ai-groq": "workspace:*", diff --git a/examples/ts-react-chat/src/components/AudioSparkline.tsx b/examples/ts-react-chat/src/components/AudioSparkline.tsx new file mode 100644 index 000000000..edc51b4a6 --- /dev/null +++ b/examples/ts-react-chat/src/components/AudioSparkline.tsx @@ -0,0 +1,81 @@ +import { useEffect, useRef } from 'react' + +export function AudioSparkline({ + getData, + color, + label, +}: { + getData: () => Uint8Array + color: string + label: string +}) { + const canvasRef = useRef(null) + const animationRef = useRef(null) + + useEffect(() => { + const canvas = canvasRef.current + if (!canvas) return + + const ctx 
= canvas.getContext('2d') + if (!ctx) return + + function draw() { + const data = getData() + const width = canvas!.width + const height = canvas!.height + + ctx!.fillStyle = '#1f2937' + ctx!.fillRect(0, 0, width, height) + + ctx!.strokeStyle = color + ctx!.lineWidth = 1 + ctx!.beginPath() + + const step = Math.max(1, Math.floor(data.length / width)) + + for (let i = 0; i < width; i++) { + const dataIndex = Math.min(i * step, data.length - 1) + const value = data[dataIndex] ?? 128 + const y = height - (value / 255) * height + + if (i === 0) { + ctx!.moveTo(i, y) + } else { + ctx!.lineTo(i, y) + } + } + + ctx!.stroke() + + ctx!.strokeStyle = '#4b5563' + ctx!.setLineDash([2, 2]) + ctx!.beginPath() + ctx!.moveTo(0, height / 2) + ctx!.lineTo(width, height / 2) + ctx!.stroke() + ctx!.setLineDash([]) + + animationRef.current = requestAnimationFrame(draw) + } + + draw() + + return () => { + if (animationRef.current) { + cancelAnimationFrame(animationRef.current) + } + } + }, [getData, color]) + + return ( +
+ {label} + +
+ ) +} diff --git a/examples/ts-react-chat/src/components/Header.tsx b/examples/ts-react-chat/src/components/Header.tsx index 652036c9e..0b28cbc48 100644 --- a/examples/ts-react-chat/src/components/Header.tsx +++ b/examples/ts-react-chat/src/components/Header.tsx @@ -156,6 +156,19 @@ export default function Header() { Guitar Demo + + setIsOpen(false)} + className="flex items-center gap-3 p-3 rounded-lg hover:bg-gray-800 transition-colors mb-2" + activeProps={{ + className: + 'flex items-center gap-3 p-3 rounded-lg bg-cyan-600 hover:bg-cyan-700 transition-colors mb-2', + }} + > + + Voice Chat (Realtime) + diff --git a/examples/ts-react-chat/src/lib/realtime-tools.ts b/examples/ts-react-chat/src/lib/realtime-tools.ts new file mode 100644 index 000000000..e19c52226 --- /dev/null +++ b/examples/ts-react-chat/src/lib/realtime-tools.ts @@ -0,0 +1,167 @@ +import { toolDefinition } from '@tanstack/ai' +import { z } from 'zod' + +// Tool to get current time - useful for voice assistants +export const getCurrentTimeToolDef = toolDefinition({ + name: 'getCurrentTime', + description: + 'Get the current date and time. Use this when the user asks what time it is or the current date.', + inputSchema: z.object({ + timezone: z + .string() + .optional() + .describe('Optional timezone like "America/New_York" or "Europe/London"'), + }), + outputSchema: z.object({ + time: z.string(), + date: z.string(), + timezone: z.string(), + }), +}) + +// Tool to get weather - common voice assistant use case +export const getWeatherToolDef = toolDefinition({ + name: 'getWeather', + description: + 'Get the current weather for a location. Use this when the user asks about the weather.', + inputSchema: z.object({ + location: z + .string() + .describe( + 'The city and state/country, e.g. 
"San Francisco, CA" or "London, UK"', + ), + }), + outputSchema: z.object({ + location: z.string(), + temperature: z.number(), + unit: z.string(), + condition: z.string(), + humidity: z.number(), + }), +}) + +// Tool to set a reminder - demonstrates user interaction +export const setReminderToolDef = toolDefinition({ + name: 'setReminder', + description: + 'Set a reminder for the user. Use this when the user asks to be reminded about something.', + inputSchema: z.object({ + message: z.string().describe('What to remind the user about'), + inMinutes: z.number().describe('How many minutes from now to remind'), + }), + outputSchema: z.object({ + success: z.boolean(), + message: z.string(), + remindAt: z.string(), + }), +}) + +// Tool to search knowledge base - useful for assistants with specific knowledge +export const searchKnowledgeToolDef = toolDefinition({ + name: 'searchKnowledge', + description: + 'Search a knowledge base for information. Use this to find specific facts or documentation.', + inputSchema: z.object({ + query: z.string().describe('The search query'), + }), + outputSchema: z.object({ + results: z.array( + z.object({ + title: z.string(), + snippet: z.string(), + }), + ), + }), +}) + +// Client-side implementation of getCurrentTime +export const getCurrentTimeClient = getCurrentTimeToolDef.client( + ({ timezone }) => { + const now = new Date() + const tz = timezone || Intl.DateTimeFormat().resolvedOptions().timeZone + + return { + time: now.toLocaleTimeString('en-US', { timeZone: tz }), + date: now.toLocaleDateString('en-US', { + weekday: 'long', + year: 'numeric', + month: 'long', + day: 'numeric', + timeZone: tz, + }), + timezone: tz, + } + }, +) + +// Client-side implementation of getWeather (mock data for demo) +export const getWeatherClient = getWeatherToolDef.client(({ location }) => { + // Mock weather data for demo purposes + const conditions = ['Sunny', 'Partly Cloudy', 'Cloudy', 'Rainy', 'Snowy'] + const randomCondition = + 
conditions[Math.floor(Math.random() * conditions.length)]! + const randomTemp = Math.floor(Math.random() * 30) + 50 // 50-80°F + const randomHumidity = Math.floor(Math.random() * 50) + 30 // 30-80% + + return { + location, + temperature: randomTemp, + unit: 'F', + condition: randomCondition, + humidity: randomHumidity, + } +}) + +// Client-side implementation of setReminder +export const setReminderClient = setReminderToolDef.client( + ({ message, inMinutes }) => { + const remindAt = new Date(Date.now() + inMinutes * 60 * 1000) + + // In a real app, you'd schedule a notification here + console.log( + `[Reminder] Will remind about "${message}" at ${remindAt.toLocaleTimeString()}`, + ) + + // For demo purposes, show an alert after the specified time + setTimeout( + () => { + alert(`Reminder: ${message}`) + }, + inMinutes * 60 * 1000, + ) + + return { + success: true, + message: `Reminder set: "${message}"`, + remindAt: remindAt.toLocaleTimeString(), + } + }, +) + +// Client-side implementation of searchKnowledge (mock data for demo) +export const searchKnowledgeClient = searchKnowledgeToolDef.client( + ({ query }) => { + // Mock search results for demo + const mockResults = [ + { + title: `Result for: ${query}`, + snippet: `This is a mock search result for the query "${query}". 
In a real application, this would return actual search results from a knowledge base.`, + }, + { + title: 'Additional Information', + snippet: + 'More relevant information would appear here based on your search query.', + }, + ] + + return { results: mockResults } + }, +) + +// Export all client tools as an array for easy use +export const realtimeClientTools = [ + getCurrentTimeClient, + getWeatherClient, + setReminderClient, + searchKnowledgeClient, +] as const diff --git a/examples/ts-react-chat/src/lib/use-realtime.ts b/examples/ts-react-chat/src/lib/use-realtime.ts new file mode 100644 index 000000000..848c702ca --- /dev/null +++ b/examples/ts-react-chat/src/lib/use-realtime.ts @@ -0,0 +1,91 @@ +import { createServerFn } from '@tanstack/react-start' +import { realtimeToken } from '@tanstack/ai' +import { useRealtimeChat } from '@tanstack/ai-react' +import { openaiRealtime, openaiRealtimeToken } from '@tanstack/ai-openai' +import { + elevenlabsRealtime, + elevenlabsRealtimeToken, +} from '@tanstack/ai-elevenlabs' +import { realtimeClientTools } from '@/lib/realtime-tools' + +type Provider = 'openai' | 'elevenlabs' + +const getRealtimeTokenFn = createServerFn({ method: 'POST' }) + .inputValidator((data: { provider: Provider; agentId?: string }) => { + if (!data.provider) throw new Error('Provider is required') + return data + }) + .handler(async ({ data }) => { + if (data.provider === 'openai') { + return realtimeToken({ + adapter: openaiRealtimeToken({ + model: 'gpt-4o-realtime-preview', + }), + }) + } + + if (data.provider === 'elevenlabs') { + const agentId = data.agentId || process.env.ELEVENLABS_AGENT_ID + if (!agentId) { + throw new Error( + 'ElevenLabs agent ID is required. 
Set ELEVENLABS_AGENT_ID or pass agentId in request body.', + ) + } + return realtimeToken({ + adapter: elevenlabsRealtimeToken({ agentId }), + }) + } + + throw new Error(`Unknown provider: ${data.provider}`) + }) + +export function useRealtime({ + provider, + agentId, + outputModalities, + temperature, + maxOutputTokens, + semanticEagerness, +}: { + provider: Provider + agentId: string + outputModalities?: Array<'audio' | 'text'> + temperature?: number + maxOutputTokens?: number | 'inf' + semanticEagerness?: 'low' | 'medium' | 'high' +}) { + const adapter = + provider === 'openai' ? openaiRealtime() : elevenlabsRealtime() + + return useRealtimeChat({ + getToken: () => + getRealtimeTokenFn({ + data: { + provider, + ...(provider === 'elevenlabs' && agentId ? { agentId } : {}), + }, + }), + adapter, + instructions: `You are a helpful, friendly voice assistant with access to several tools. + +You can: +- Tell the user the current time and date (getCurrentTime) +- Get weather information for any location (getWeather) +- Set reminders for the user (setReminder) +- Search a knowledge base for information (searchKnowledge) + +Keep your responses concise and conversational since this is a voice interface. +When using tools, briefly explain what you're doing and then share the results naturally. +If the user sends an image, describe what you see and answer any questions about it. 
+Be friendly and engaging!`, + voice: 'alloy', + tools: realtimeClientTools, + outputModalities, + temperature, + maxOutputTokens, + semanticEagerness, + onError: (err) => { + console.error('Realtime error:', err) + }, + }) +} diff --git a/examples/ts-react-chat/src/routeTree.gen.ts b/examples/ts-react-chat/src/routeTree.gen.ts index ce3e2520b..ab8ef90f3 100644 --- a/examples/ts-react-chat/src/routeTree.gen.ts +++ b/examples/ts-react-chat/src/routeTree.gen.ts @@ -9,6 +9,7 @@ // Additionally, you should also exclude this file from your linter and/or formatter to prevent it from being checked or modified. import { Route as rootRouteImport } from './routes/__root' +import { Route as RealtimeRouteImport } from './routes/realtime' import { Route as IndexRouteImport } from './routes/index' import { Route as GenerationsVideoRouteImport } from './routes/generations.video' import { Route as GenerationsTranscriptionRouteImport } from './routes/generations.transcription' @@ -24,6 +25,11 @@ import { Route as ApiGenerateVideoRouteImport } from './routes/api.generate.vide import { Route as ApiGenerateSpeechRouteImport } from './routes/api.generate.speech' import { Route as ApiGenerateImageRouteImport } from './routes/api.generate.image' +const RealtimeRoute = RealtimeRouteImport.update({ + id: '/realtime', + path: '/realtime', + getParentRoute: () => rootRouteImport, +} as any) const IndexRoute = IndexRouteImport.update({ id: '/', path: '/', @@ -98,6 +104,7 @@ const ApiGenerateImageRoute = ApiGenerateImageRouteImport.update({ export interface FileRoutesByFullPath { '/': typeof IndexRoute + '/realtime': typeof RealtimeRoute '/api/summarize': typeof ApiSummarizeRoute '/api/tanchat': typeof ApiTanchatRoute '/api/transcribe': typeof ApiTranscribeRoute @@ -114,6 +121,7 @@ export interface FileRoutesByFullPath { } export interface FileRoutesByTo { '/': typeof IndexRoute + '/realtime': typeof RealtimeRoute '/api/summarize': typeof ApiSummarizeRoute '/api/tanchat': typeof 
ApiTanchatRoute '/api/transcribe': typeof ApiTranscribeRoute @@ -131,6 +139,7 @@ export interface FileRoutesByTo { export interface FileRoutesById { __root__: typeof rootRouteImport '/': typeof IndexRoute + '/realtime': typeof RealtimeRoute '/api/summarize': typeof ApiSummarizeRoute '/api/tanchat': typeof ApiTanchatRoute '/api/transcribe': typeof ApiTranscribeRoute @@ -149,6 +158,7 @@ export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath fullPaths: | '/' + | '/realtime' | '/api/summarize' | '/api/tanchat' | '/api/transcribe' @@ -165,6 +175,7 @@ export interface FileRouteTypes { fileRoutesByTo: FileRoutesByTo to: | '/' + | '/realtime' | '/api/summarize' | '/api/tanchat' | '/api/transcribe' @@ -181,6 +192,7 @@ export interface FileRouteTypes { id: | '__root__' | '/' + | '/realtime' | '/api/summarize' | '/api/tanchat' | '/api/transcribe' @@ -198,6 +210,7 @@ export interface FileRouteTypes { } export interface RootRouteChildren { IndexRoute: typeof IndexRoute + RealtimeRoute: typeof RealtimeRoute ApiSummarizeRoute: typeof ApiSummarizeRoute ApiTanchatRoute: typeof ApiTanchatRoute ApiTranscribeRoute: typeof ApiTranscribeRoute @@ -215,6 +228,13 @@ export interface RootRouteChildren { declare module '@tanstack/react-router' { interface FileRoutesByPath { + '/realtime': { + id: '/realtime' + path: '/realtime' + fullPath: '/realtime' + preLoaderRoute: typeof RealtimeRouteImport + parentRoute: typeof rootRouteImport + } '/': { id: '/' path: '/' @@ -318,6 +338,7 @@ declare module '@tanstack/react-router' { const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, + RealtimeRoute: RealtimeRoute, ApiSummarizeRoute: ApiSummarizeRoute, ApiTanchatRoute: ApiTanchatRoute, ApiTranscribeRoute: ApiTranscribeRoute, diff --git a/examples/ts-react-chat/src/routes/realtime.tsx b/examples/ts-react-chat/src/routes/realtime.tsx new file mode 100644 index 000000000..3225249e2 --- /dev/null +++ b/examples/ts-react-chat/src/routes/realtime.tsx @@ -0,0 +1,538 @@ 
+import { useEffect, useRef, useState } from 'react' +import { createFileRoute } from '@tanstack/react-router' +import { + Image, + Mic, + MicOff, + Phone, + PhoneOff, + Send, + Volume2, + Wrench, +} from 'lucide-react' +import { AudioSparkline } from '@/components/AudioSparkline' +import { useRealtime } from '@/lib/use-realtime' + +type Provider = 'openai' | 'elevenlabs' +type OutputMode = 'audio+text' | 'text-only' | 'audio-only' + +const PROVIDER_OPTIONS: Array<{ value: Provider; label: string }> = [ + { value: 'openai', label: 'OpenAI Realtime' }, + { value: 'elevenlabs', label: 'ElevenLabs' }, +] + +const OUTPUT_MODE_OPTIONS: Array<{ value: OutputMode; label: string }> = [ + { value: 'audio+text', label: 'Audio + Text' }, + { value: 'text-only', label: 'Text Only' }, + { value: 'audio-only', label: 'Audio Only' }, +] + +function outputModeToModalities( + mode: OutputMode, +): Array<'audio' | 'text'> | undefined { + switch (mode) { + case 'text-only': + return ['text'] + case 'audio-only': + return ['audio'] + case 'audio+text': + return ['audio', 'text'] + default: + return undefined + } +} + +function RealtimePage() { + const [provider, setProvider] = useState('openai') + const [agentId, setAgentId] = useState('') + const [textInput, setTextInput] = useState('') + const [outputMode, setOutputMode] = useState('audio+text') + const [temperature, setTemperature] = useState(0.8) + const [semanticEagerness, setSemanticEagerness] = useState< + 'low' | 'medium' | 'high' + >('medium') + const messagesEndRef = useRef(null) + const imageInputRef = useRef(null) + + const { + status, + mode, + messages, + pendingUserTranscript, + pendingAssistantTranscript, + error, + connect, + disconnect, + interrupt, + sendText, + sendImage, + inputLevel, + outputLevel, + getInputTimeDomainData, + getOutputTimeDomainData, + } = useRealtime({ + provider, + agentId, + outputModalities: outputModeToModalities(outputMode), + temperature, + semanticEagerness, + }) + + // Handle image file 
selection + const handleImageUpload = (e: React.ChangeEvent) => { + const file = e.target.files?.[0] + if (!file) return + + const reader = new FileReader() + reader.onload = () => { + const result = reader.result as string + // Extract base64 data (remove data:image/xxx;base64, prefix) + const base64 = result.split(',')[1] + if (base64) { + sendImage(base64, file.type) + } + } + reader.readAsDataURL(file) + + // Reset input so the same file can be selected again + e.target.value = '' + } + + // Auto-scroll to bottom when messages change + useEffect(() => { + messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' }) + }, [messages, pendingUserTranscript, pendingAssistantTranscript]) + + // Get status color + const getStatusColor = () => { + switch (status) { + case 'connected': + return 'bg-green-500' + case 'connecting': + case 'reconnecting': + return 'bg-yellow-500' + case 'error': + return 'bg-red-500' + default: + return 'bg-gray-500' + } + } + + // Get mode icon + const getModeIndicator = () => { + switch (mode) { + case 'listening': + return ( +
+ + Listening... +
+ ) + case 'thinking': + return ( +
+
+ Thinking... +
+ ) + case 'speaking': + return ( +
+ + Speaking... +
+ ) + default: + return ( +
+ + Idle +
+ ) + } + } + + return ( +
+
+ {/* Header */} +
+
+
+ {/* Provider selector */} +
+ + +
+ + {/* ElevenLabs Agent ID (conditional) */} + {provider === 'elevenlabs' && ( +
+ + setAgentId(e.target.value)} + placeholder="Your ElevenLabs Agent ID" + disabled={status !== 'idle'} + className="rounded-lg border border-orange-500/20 bg-gray-900 px-3 py-2 text-sm text-white focus:outline-none focus:ring-2 focus:ring-orange-500/50 disabled:opacity-50 w-64" + /> +
+ )} + + {/* Output mode selector (OpenAI only) */} + {provider === 'openai' && ( +
+ + +
+ )} + + {/* Temperature slider */} + {provider === 'openai' && ( +
+ + setTemperature(parseFloat(e.target.value))} + disabled={status !== 'idle'} + className="w-24 accent-orange-500 disabled:opacity-50" + /> +
+ )} + + {/* Semantic eagerness */} + {provider === 'openai' && ( +
+ + +
+ )} +
+ + {/* Status */} +
+
+
+ + {status} + +
+ {getModeIndicator()} +
+
+
+ + {/* Tools indicator */} + {provider === 'openai' && ( +
+
+ + Tools enabled: + getCurrentTime + + getWeather + + setReminder + + searchKnowledge +
+
+ )} + + {/* Messages area */} +
+ {messages.length === 0 && status === 'idle' && ( +
+ +

Voice Chat with Tools & Vision

+

+ Click "Start Conversation" to begin talking with the AI +

+

+ Try asking: "What time is it?" or "What's the weather?" — or + send an image! +

+
+ )} + + {messages.map((message) => ( +
+
+ {message.role === 'assistant' ? ( +
+ AI +
+ ) : ( +
+ U +
+ )} +
+ {message.parts.map((part, idx) => { + if (part.type === 'audio') { + return ( +

+ {part.transcript} +

+ ) + } + if (part.type === 'text') { + return ( +

+ {part.content} +

+ ) + } + if (part.type === 'image') { + const src = part.data.startsWith('http') + ? part.data + : `data:${part.mimeType};base64,${part.data}` + return ( + User uploaded + ) + } + return null + })} + {message.interrupted && ( + + (interrupted) + + )} +
+
+
+ ))} + + {/* Pending transcripts */} + {pendingUserTranscript && ( +
+
+
+ U +
+

{pendingUserTranscript}...

+
+
+ )} + + {pendingAssistantTranscript && ( +
+
+
+ AI +
+

+ {pendingAssistantTranscript}... +

+
+
+ )} + +
+
+ + {/* Error display */} + {error && ( +
+ Error: {error.message} +
+ )} + + {/* Text input */} + {status === 'connected' && ( +
+
{ + e.preventDefault() + const text = textInput.trim() + if (!text) return + sendText(text) + setTextInput('') + }} + className="flex items-center gap-2" + > + setTextInput(e.target.value)} + placeholder="Type a message..." + className="flex-1 rounded-lg border border-orange-500/20 bg-gray-800 px-4 py-2 text-sm text-white placeholder-gray-500 focus:outline-none focus:ring-2 focus:ring-orange-500/50" + /> + {/* Image upload button (OpenAI only) */} + {provider === 'openai' && ( + <> + + + + )} + +
+
+ )} + + {/* Audio visualization & controls */} +
+ {/* Volume meters and waveforms */} + {status === 'connected' && ( +
+ {/* Input (Microphone) */} +
+ +
+
+
+ + {Math.round(inputLevel * 100)}% + + +
+ {/* Output (Speaker) */} +
+ +
+
+
+ + {Math.round(outputLevel * 100)}% + + +
+
+ )} + + {/* Controls */} +
+ {status === 'idle' ? ( + + ) : ( + <> + {mode === 'speaking' && ( + + )} + + + )} +
+
+
+
+ ) +} + +export const Route = createFileRoute('/realtime')({ + component: RealtimePage, +}) diff --git a/packages/typescript/ai-client/src/index.ts b/packages/typescript/ai-client/src/index.ts index 6762a00bd..c30a9c3ad 100644 --- a/packages/typescript/ai-client/src/index.ts +++ b/packages/typescript/ai-client/src/index.ts @@ -1,4 +1,5 @@ export { ChatClient } from './chat-client' +export { RealtimeClient } from './realtime-client' export { GenerationClient } from './generation-client' export { VideoGenerationClient } from './video-generation-client' export type { @@ -42,6 +43,13 @@ export type { ExtractToolOutput, } from './tool-types' export type { AnyClientTool } from '@tanstack/ai' +export type { + RealtimeAdapter, + RealtimeConnection, + RealtimeClientOptions, + RealtimeClientState, + RealtimeStateChangeCallback, +} from './realtime-types' export { fetchServerSentEvents, fetchHttpStream, diff --git a/packages/typescript/ai-client/src/realtime-client.ts b/packages/typescript/ai-client/src/realtime-client.ts new file mode 100644 index 000000000..2683294c0 --- /dev/null +++ b/packages/typescript/ai-client/src/realtime-client.ts @@ -0,0 +1,526 @@ +import { convertSchemaToJsonSchema } from '@tanstack/ai' +import type { + AnyClientTool, + AudioVisualization, + RealtimeMessage, + RealtimeMode, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { + RealtimeClientOptions, + RealtimeClientState, + RealtimeConnection, + RealtimeStateChangeCallback, +} from './realtime-types' + +// Token refresh buffer - refresh 1 minute before expiry +const TOKEN_REFRESH_BUFFER_MS = 60_000 + +/** + * Client for managing realtime voice conversations. + * + * Handles connection lifecycle, audio I/O, message state, + * and tool execution for realtime voice-to-voice AI interactions. 
+ * + * @example + * ```typescript + * import { RealtimeClient } from '@tanstack/ai-client' + * import { openaiRealtime } from '@tanstack/ai-openai' + * + * const client = new RealtimeClient({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: openaiRealtime(), + * tools: [myTool.client(handler)], + * onMessage: (msg) => console.log('Message:', msg), + * }) + * + * await client.connect() + * ``` + */ +export class RealtimeClient { + private options: RealtimeClientOptions + private connection: RealtimeConnection | null = null + private token: RealtimeToken | null = null + private tokenRefreshTimeout: ReturnType | null = null + private clientTools: Map + private stateChangeCallbacks: Set = new Set() + private unsubscribers: Array<() => void> = [] + + private state: RealtimeClientState = { + status: 'idle', + mode: 'idle', + messages: [], + pendingUserTranscript: null, + pendingAssistantTranscript: null, + error: null, + } + + constructor(options: RealtimeClientOptions) { + this.options = { + autoPlayback: true, + autoCapture: true, + vadMode: 'server', + ...options, + } + + // Build client tools map + this.clientTools = new Map() + if (options.tools) { + for (const tool of options.tools) { + this.clientTools.set(tool.name, tool) + } + } + } + + // ============================================================================ + // Connection Lifecycle + // ============================================================================ + + /** + * Connect to the realtime session. + * Fetches a token and establishes the connection. 
+ */ + async connect(): Promise { + if (this.state.status === 'connected') { + return + } + + this.updateState({ status: 'connecting', error: null }) + + try { + // Fetch token from server + this.token = await this.options.getToken() + + // Schedule token refresh + this.scheduleTokenRefresh() + + // Connect via adapter (pass tools for providers like ElevenLabs that need them at connect time) + const toolsList = + this.clientTools.size > 0 + ? Array.from(this.clientTools.values()) + : undefined + this.connection = await this.options.adapter.connect( + this.token, + toolsList, + ) + + // Subscribe to connection events + this.subscribeToConnectionEvents() + + // Auto-configure session with client-provided settings + this.applySessionConfig() + + // Start audio capture if configured + if (this.options.autoCapture) { + await this.connection.startAudioCapture() + } + + this.updateState({ status: 'connected', mode: 'listening' }) + this.options.onConnect?.() + } catch (error) { + const err = error instanceof Error ? error : new Error(String(error)) + this.updateState({ status: 'error', error: err }) + this.options.onError?.(err) + throw err + } + } + + /** + * Disconnect from the realtime session. + */ + async disconnect(): Promise { + if (this.tokenRefreshTimeout) { + clearTimeout(this.tokenRefreshTimeout) + this.tokenRefreshTimeout = null + } + + // Unsubscribe from all events + for (const unsub of this.unsubscribers) { + unsub() + } + this.unsubscribers = [] + + if (this.connection) { + await this.connection.disconnect() + this.connection = null + } + + this.token = null + this.updateState({ + status: 'idle', + mode: 'idle', + pendingUserTranscript: null, + pendingAssistantTranscript: null, + }) + this.options.onDisconnect?.() + } + + // ============================================================================ + // Voice Control + // ============================================================================ + + /** + * Start listening for voice input. 
+ * Only needed when vadMode is 'manual'. + */ + startListening(): void { + if (!this.connection || this.state.status !== 'connected') { + return + } + this.connection.startAudioCapture() + this.updateState({ mode: 'listening' }) + } + + /** + * Stop listening for voice input. + * Only needed when vadMode is 'manual'. + */ + stopListening(): void { + if (!this.connection) { + return + } + this.connection.stopAudioCapture() + this.updateState({ mode: 'idle' }) + } + + /** + * Interrupt the current assistant response. + */ + interrupt(): void { + if (!this.connection) { + return + } + this.connection.interrupt() + } + + // ============================================================================ + // Text Input + // ============================================================================ + + /** + * Send a text message instead of voice. + */ + sendText(text: string): void { + if (!this.connection || this.state.status !== 'connected') { + return + } + + // Add user message + const userMessage: RealtimeMessage = { + id: this.generateId(), + role: 'user', + timestamp: Date.now(), + parts: [{ type: 'text', content: text }], + } + this.addMessage(userMessage) + + // Send to provider + this.connection.sendText(text) + } + + /** + * Send an image to the conversation. 
+ * @param imageData - Base64-encoded image data or a URL + * @param mimeType - MIME type of the image (e.g., 'image/png', 'image/jpeg') + */ + sendImage(imageData: string, mimeType: string): void { + if (!this.connection || this.state.status !== 'connected') { + return + } + + // Add user message with image part + const userMessage: RealtimeMessage = { + id: this.generateId(), + role: 'user', + timestamp: Date.now(), + parts: [{ type: 'image', data: imageData, mimeType }], + } + this.addMessage(userMessage) + + // Send to provider + this.connection.sendImage(imageData, mimeType) + } + + // ============================================================================ + // State Access + // ============================================================================ + + /** Get current connection status */ + get status(): RealtimeStatus { + return this.state.status + } + + /** Get current mode */ + get mode(): RealtimeMode { + return this.state.mode + } + + /** Get conversation messages */ + get messages(): Array { + return this.state.messages + } + + /** Get current error, if any */ + get error(): Error | null { + return this.state.error + } + + /** Get pending user transcript (while user is speaking) */ + get pendingUserTranscript(): string | null { + return this.state.pendingUserTranscript + } + + /** Get pending assistant transcript (while assistant is speaking) */ + get pendingAssistantTranscript(): string | null { + return this.state.pendingAssistantTranscript + } + + /** Get audio visualization data */ + get audio(): AudioVisualization | null { + return this.connection?.getAudioVisualization() ?? null + } + + // ============================================================================ + // State Subscription + // ============================================================================ + + /** + * Subscribe to state changes. 
+ * @returns Unsubscribe function + */ + onStateChange(callback: RealtimeStateChangeCallback): () => void { + this.stateChangeCallbacks.add(callback) + return () => { + this.stateChangeCallbacks.delete(callback) + } + } + + // ============================================================================ + // Cleanup + // ============================================================================ + + /** + * Clean up resources. + * Call this when disposing of the client. + */ + destroy(): void { + this.disconnect() + this.stateChangeCallbacks.clear() + } + + // ============================================================================ + // Private Methods + // ============================================================================ + + private updateState(updates: Partial): void { + this.state = { ...this.state, ...updates } + + // Notify callbacks + for (const callback of this.stateChangeCallbacks) { + callback(this.state) + } + + // Notify specific callbacks + if ('status' in updates && updates.status !== undefined) { + this.options.onStatusChange?.(updates.status) + } + if ('mode' in updates && updates.mode !== undefined) { + this.options.onModeChange?.(updates.mode) + } + } + + private addMessage(message: RealtimeMessage): void { + this.updateState({ + messages: [...this.state.messages, message], + }) + this.options.onMessage?.(message) + } + + private scheduleTokenRefresh(): void { + if (!this.token) return + + const timeUntilExpiry = this.token.expiresAt - Date.now() + const refreshIn = Math.max(0, timeUntilExpiry - TOKEN_REFRESH_BUFFER_MS) + + this.tokenRefreshTimeout = setTimeout(() => { + this.refreshToken() + }, refreshIn) + } + + private async refreshToken(): Promise { + try { + this.token = await this.options.getToken() + this.scheduleTokenRefresh() + // Note: Some providers may require reconnection with new token + // This is handled by the adapter implementation + } catch (error) { + const err = error instanceof Error ? 
error : new Error(String(error)) + this.updateState({ error: err }) + this.options.onError?.(err) + } + } + + private subscribeToConnectionEvents(): void { + if (!this.connection) return + + // Status changes + this.unsubscribers.push( + this.connection.on('status_change', ({ status }) => { + this.updateState({ status }) + }), + ) + + // Mode changes + this.unsubscribers.push( + this.connection.on('mode_change', ({ mode }) => { + this.updateState({ mode }) + }), + ) + + // Transcripts (streaming) + // User transcripts are added as messages when final (no separate message_complete for user input) + // Assistant transcripts are streamed, final message comes via message_complete + this.unsubscribers.push( + this.connection.on('transcript', ({ role, transcript, isFinal }) => { + if (role === 'user') { + this.updateState({ + pendingUserTranscript: isFinal ? null : transcript, + }) + // Add user message when transcript is finalized + if (isFinal && transcript) { + this.addMessage({ + id: this.generateId(), + role: 'user', + timestamp: Date.now(), + parts: [{ type: 'audio', transcript, durationMs: 0 }], + }) + } + } else { + // Assistant transcripts - just update pending, message_complete handles final + this.updateState({ + pendingAssistantTranscript: isFinal ? null : transcript, + }) + } + }), + ) + + // Tool calls + this.unsubscribers.push( + this.connection.on( + 'tool_call', + async ({ toolCallId, toolName, input }) => { + const tool = this.clientTools.get(toolName) + if (tool?.execute) { + try { + const output = await tool.execute(input) + this.connection?.sendToolResult( + toolCallId, + typeof output === 'string' ? output : JSON.stringify(output), + ) + } catch (error) { + const errMsg = + error instanceof Error ? 
error.message : String(error) + this.connection?.sendToolResult( + toolCallId, + JSON.stringify({ error: errMsg }), + ) + } + } + }, + ), + ) + + // Message complete + this.unsubscribers.push( + this.connection.on('message_complete', ({ message }) => { + // Replace pending message with final version if needed + const existingIndex = this.state.messages.findIndex( + (m) => m.id === message.id, + ) + if (existingIndex >= 0) { + const newMessages = [...this.state.messages] + newMessages[existingIndex] = message + this.updateState({ messages: newMessages }) + } else { + this.addMessage(message) + } + }), + ) + + // Interruption + this.unsubscribers.push( + this.connection.on('interrupted', ({ messageId }) => { + if (messageId) { + const newMessages = this.state.messages.map((m) => + m.id === messageId ? { ...m, interrupted: true } : m, + ) + this.updateState({ messages: newMessages }) + } + this.updateState({ + mode: 'listening', + pendingAssistantTranscript: null, + }) + this.options.onInterrupted?.() + }), + ) + + // Errors + this.unsubscribers.push( + this.connection.on('error', ({ error }) => { + this.updateState({ error }) + this.options.onError?.(error) + }), + ) + } + + private applySessionConfig(): void { + if (!this.connection) return + + const { + instructions, + voice, + vadMode, + tools, + outputModalities, + temperature, + maxOutputTokens, + semanticEagerness, + } = this.options + const hasConfig = + instructions || + voice || + vadMode || + (tools && tools.length > 0) || + outputModalities || + temperature !== undefined || + maxOutputTokens !== undefined || + semanticEagerness + if (!hasConfig) return + + const toolsConfig = tools + ? Array.from(this.clientTools.values()).map((t) => ({ + name: t.name, + description: t.description, + inputSchema: t.inputSchema + ? 
convertSchemaToJsonSchema(t.inputSchema) + : undefined, + })) + : undefined + + this.connection.updateSession({ + instructions, + voice, + vadMode, + tools: toolsConfig, + outputModalities, + temperature, + maxOutputTokens, + semanticEagerness, + }) + } + + private generateId(): string { + return `msg-${Date.now()}-${Math.random().toString(36).substring(7)}` + } +} diff --git a/packages/typescript/ai-client/src/realtime-types.ts b/packages/typescript/ai-client/src/realtime-types.ts new file mode 100644 index 000000000..bffd6df34 --- /dev/null +++ b/packages/typescript/ai-client/src/realtime-types.ts @@ -0,0 +1,180 @@ +import type { + AnyClientTool, + AudioVisualization, + RealtimeEvent, + RealtimeEventHandler, + RealtimeMessage, + RealtimeMode, + RealtimeSessionConfig, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' + +// ============================================================================ +// Adapter Interface +// ============================================================================ + +/** + * Adapter interface for connecting to realtime providers. + * Each provider (OpenAI, ElevenLabs, etc.) implements this interface. + */ +export interface RealtimeAdapter { + /** Provider identifier */ + provider: string + + /** + * Create a connection using the provided token + * @param token - The ephemeral token from the server + * @param clientTools - Optional client-side tools to register with the provider + * @returns A connection instance + */ + connect: ( + token: RealtimeToken, + clientTools?: ReadonlyArray, + ) => Promise +} + +/** + * Connection interface representing an active realtime session. + * Handles audio I/O, events, and session management. 
+ */ +export interface RealtimeConnection { + // Lifecycle + /** Disconnect from the realtime session */ + disconnect: () => Promise + + // Audio I/O + /** Start capturing audio from the microphone */ + startAudioCapture: () => Promise + /** Stop capturing audio */ + stopAudioCapture: () => void + + // Text input + /** Send a text message (fallback for when voice isn't available) */ + sendText: (text: string) => void + + // Image input + /** Send an image to the conversation */ + sendImage: (imageData: string, mimeType: string) => void + + // Tool results + /** Send a tool execution result back to the provider */ + sendToolResult: (callId: string, result: string) => void + + // Session management + /** Update session configuration */ + updateSession: (config: Partial) => void + /** Interrupt the current response */ + interrupt: () => void + + // Events + /** Subscribe to connection events */ + on: ( + event: TEvent, + handler: RealtimeEventHandler, + ) => () => void + + // Audio visualization + /** Get audio visualization data */ + getAudioVisualization: () => AudioVisualization +} + +// ============================================================================ +// Client Options +// ============================================================================ + +/** + * Options for the RealtimeClient + */ +export interface RealtimeClientOptions { + /** + * Function to fetch a realtime token from the server. + * Called on connect and when token needs refresh. 
+ */ + getToken: () => Promise + + /** + * The realtime adapter to use (e.g., openaiRealtime()) + */ + adapter: RealtimeAdapter + + /** + * Client-side tools with execution logic + */ + tools?: ReadonlyArray + + /** + * Auto-play assistant audio (default: true) + */ + autoPlayback?: boolean + + /** + * Request microphone access on connect (default: true) + */ + autoCapture?: boolean + + /** + * System instructions for the assistant + */ + instructions?: string + + /** + * Voice to use for audio output + */ + voice?: string + + /** + * Voice activity detection mode (default: 'server') + */ + vadMode?: 'server' | 'semantic' | 'manual' + + /** + * Output modalities for responses (e.g., ['audio', 'text']) + */ + outputModalities?: Array<'audio' | 'text'> + + /** + * Temperature for generation (provider-specific range) + */ + temperature?: number + + /** + * Maximum number of tokens in a response + */ + maxOutputTokens?: number | 'inf' + + /** + * Eagerness level for semantic VAD ('low', 'medium', 'high') + */ + semanticEagerness?: 'low' | 'medium' | 'high' + + // Callbacks + onStatusChange?: (status: RealtimeStatus) => void + onModeChange?: (mode: RealtimeMode) => void + onMessage?: (message: RealtimeMessage) => void + onError?: (error: Error) => void + onConnect?: () => void + onDisconnect?: () => void + onInterrupted?: () => void +} + +// ============================================================================ +// Client State +// ============================================================================ + +/** + * Internal state of the RealtimeClient + */ +export interface RealtimeClientState { + status: RealtimeStatus + mode: RealtimeMode + messages: Array + pendingUserTranscript: string | null + pendingAssistantTranscript: string | null + error: Error | null +} + +/** + * Callback type for state changes + */ +export type RealtimeStateChangeCallback = (state: RealtimeClientState) => void diff --git a/packages/typescript/ai-elevenlabs/README.md 
b/packages/typescript/ai-elevenlabs/README.md new file mode 100644 index 000000000..71b0d979b --- /dev/null +++ b/packages/typescript/ai-elevenlabs/README.md @@ -0,0 +1,76 @@ +# @tanstack/ai-elevenlabs + +ElevenLabs adapter for TanStack AI realtime voice conversations. + +## Installation + +```bash +npm install @tanstack/ai-elevenlabs @tanstack/ai @tanstack/ai-client +``` + +## Usage + +### Server-Side Token Generation + +```typescript +import { realtimeToken } from '@tanstack/ai' +import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' + +// Generate a signed URL for client use +const token = await realtimeToken({ + adapter: elevenlabsRealtimeToken({ + agentId: 'your-agent-id', + }), +}) +``` + +### Client-Side Usage + +```typescript +import { RealtimeClient } from '@tanstack/ai-client' +import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' + +const client = new RealtimeClient({ + getToken: () => fetch('/api/realtime-token').then((r) => r.json()), + adapter: elevenlabsRealtime(), +}) + +await client.connect() +``` + +### With React + +```typescript +import { useRealtimeChat } from '@tanstack/ai-react' +import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' + +function VoiceChat() { + const { status, mode, messages, connect, disconnect } = useRealtimeChat({ + getToken: () => fetch('/api/realtime-token').then(r => r.json()), + adapter: elevenlabsRealtime(), + }) + + return ( +
+

Status: {status}

+

Mode: {mode}

+ +
+ ) +} +``` + +## Environment Variables + +Set `ELEVENLABS_API_KEY` in your environment for server-side token generation. + +## Requirements + +- ElevenLabs account with Conversational AI agent configured +- Agent ID from ElevenLabs dashboard + +## License + +MIT diff --git a/packages/typescript/ai-elevenlabs/package.json b/packages/typescript/ai-elevenlabs/package.json new file mode 100644 index 000000000..4ff1754e8 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/package.json @@ -0,0 +1,55 @@ +{ + "name": "@tanstack/ai-elevenlabs", + "version": "0.0.1", + "description": "ElevenLabs adapter for TanStack AI realtime voice", + "author": "", + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/TanStack/ai.git", + "directory": "packages/typescript/ai-elevenlabs" + }, + "keywords": [ + "ai", + "elevenlabs", + "voice", + "realtime", + "tanstack", + "adapter" + ], + "type": "module", + "module": "./dist/esm/index.js", + "types": "./dist/esm/index.d.ts", + "exports": { + ".": { + "types": "./dist/esm/index.d.ts", + "import": "./dist/esm/index.js" + } + }, + "files": [ + "dist", + "src" + ], + "scripts": { + "build": "vite build", + "clean": "premove ./build ./dist", + "lint:fix": "eslint ./src --fix", + "test:build": "publint --strict", + "test:eslint": "eslint ./src", + "test:lib": "vitest --passWithNoTests", + "test:lib:dev": "pnpm test:lib --watch", + "test:types": "tsc" + }, + "dependencies": { + "@11labs/client": "^0.2.0" + }, + "peerDependencies": { + "@tanstack/ai": "workspace:^", + "@tanstack/ai-client": "workspace:^" + }, + "devDependencies": { + "@tanstack/ai": "workspace:*", + "@tanstack/ai-client": "workspace:*", + "@vitest/coverage-v8": "4.0.14" + } +} diff --git a/packages/typescript/ai-elevenlabs/src/index.ts b/packages/typescript/ai-elevenlabs/src/index.ts new file mode 100644 index 000000000..8f3789e84 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/index.ts @@ -0,0 +1,13 @@ +// 
============================================================================ +// ElevenLabs Realtime (Voice) Adapters +// ============================================================================ + +export { elevenlabsRealtimeToken, elevenlabsRealtime } from './realtime/index' + +export type { + ElevenLabsRealtimeTokenOptions, + ElevenLabsRealtimeOptions, + ElevenLabsConversationMode, + ElevenLabsVADConfig, + ElevenLabsClientTool, +} from './realtime/index' diff --git a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts new file mode 100644 index 000000000..33bc5344e --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts @@ -0,0 +1,300 @@ +import { Conversation } from '@11labs/client' +import type { + AnyClientTool, + AudioVisualization, + RealtimeEvent, + RealtimeEventHandler, + RealtimeMessage, + RealtimeMode, + RealtimeSessionConfig, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { RealtimeAdapter, RealtimeConnection } from '@tanstack/ai-client' +import type { ElevenLabsRealtimeOptions } from './types' + +/** + * Creates an ElevenLabs realtime adapter for client-side use. + * + * Wraps the @11labs/client SDK for voice conversations. 
+ * + * @param options - Optional configuration + * @returns A RealtimeAdapter for use with RealtimeClient + * + * @example + * ```typescript + * import { RealtimeClient } from '@tanstack/ai-client' + * import { elevenlabsRealtime } from '@tanstack/ai-elevenlabs' + * + * const client = new RealtimeClient({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: elevenlabsRealtime(), + * }) + * ``` + */ +export function elevenlabsRealtime( + options: ElevenLabsRealtimeOptions = {}, +): RealtimeAdapter { + return { + provider: 'elevenlabs', + + async connect( + token: RealtimeToken, + clientToolDefs?: ReadonlyArray, + ): Promise { + return createElevenLabsConnection(token, options, clientToolDefs) + }, + } +} + +/** + * Creates a connection to ElevenLabs conversational AI + */ +async function createElevenLabsConnection( + token: RealtimeToken, + _options: ElevenLabsRealtimeOptions, + clientToolDefs?: ReadonlyArray, +): Promise { + const eventHandlers = new Map>>() + let conversation: Awaited< + ReturnType + > | null = null + let messageIdCounter = 0 + + // Empty arrays for when visualization isn't available + const emptyFrequencyData = new Uint8Array(128) + const emptyTimeDomainData = new Uint8Array(128).fill(128) + + // Helper to emit events + function emit( + event: TEvent, + payload: Parameters>[0], + ) { + const handlers = eventHandlers.get(event) + if (handlers) { + for (const handler of handlers) { + handler(payload) + } + } + } + + function generateMessageId(): string { + return `el-msg-${Date.now()}-${++messageIdCounter}` + } + + // Convert TanStack tool definitions to ElevenLabs clientTools format + const elevenLabsClientTools: Record< + string, + { + handler: (params: unknown) => Promise + description: string + parameters: Record + } + > = {} + + if (clientToolDefs) { + for (const tool of clientToolDefs) { + elevenLabsClientTools[tool.name] = { + handler: async (params: unknown) => { + if (tool.execute) { + const result = await 
tool.execute(params) + return typeof result === 'string' ? result : JSON.stringify(result) + } + return JSON.stringify({ + error: `No execute function for tool ${tool.name}`, + }) + }, + description: tool.description, + parameters: tool.inputSchema + ? (tool.inputSchema as Record) + : { type: 'object', properties: {} }, + } + } + } + + // Build session options + const sessionOptions: Record = { + signedUrl: token.token, + + onConnect: () => { + emit('status_change', { status: 'connected' as RealtimeStatus }) + emit('mode_change', { mode: 'listening' }) + }, + + onDisconnect: () => { + emit('status_change', { status: 'idle' as RealtimeStatus }) + emit('mode_change', { mode: 'idle' }) + }, + + onModeChange: ({ mode }: { mode: string }) => { + const mappedMode: RealtimeMode = + mode === 'speaking' ? 'speaking' : 'listening' + emit('mode_change', { mode: mappedMode }) + }, + + onMessage: ({ message, source }: { message: string; source: string }) => { + const role = source === 'user' ? 'user' : 'assistant' + + // Emit transcript update + emit('transcript', { + role, + transcript: message, + isFinal: true, + }) + + // Create and emit message + const realtimeMessage: RealtimeMessage = { + id: generateMessageId(), + role, + timestamp: Date.now(), + parts: [{ type: 'audio', transcript: message }], + } + emit('message_complete', { message: realtimeMessage }) + }, + + onError: (error: string | Error) => { + emit('error', { + error: new Error( + typeof error === 'string' ? 
error : error.message || 'Unknown error', + ), + }) + }, + } + + // Only add clientTools if we have any + if (Object.keys(elevenLabsClientTools).length > 0) { + sessionOptions.clientTools = elevenLabsClientTools + } + + // Start the conversation session + conversation = await Conversation.startSession( + sessionOptions as Parameters[0], + ) + + // Connection implementation + const connection: RealtimeConnection = { + async disconnect() { + if (conversation) { + await conversation.endSession() + conversation = null + } + emit('status_change', { status: 'idle' as RealtimeStatus }) + }, + + async startAudioCapture() { + // ElevenLabs SDK handles audio capture automatically + // This is called when the session starts + emit('mode_change', { mode: 'listening' }) + }, + + stopAudioCapture() { + // ElevenLabs SDK handles this + emit('mode_change', { mode: 'idle' }) + }, + + sendText(text: string) { + if (!conversation) return + conversation.sendUserMessage(text) + }, + + sendImage(_imageData: string, _mimeType: string) { + // ElevenLabs does not support direct image input in the conversation API + console.warn( + 'ElevenLabs realtime does not support sending images directly.', + ) + }, + + sendToolResult(_callId: string, _result: string) { + // ElevenLabs client tools are handled via the clientTools handlers + // registered at session start — results are returned automatically + }, + + updateSession(_config: Partial) { + // ElevenLabs session config is set at creation time + console.warn( + 'ElevenLabs does not support runtime session updates. 
Configure at connection time.', + ) + }, + + interrupt() { + // ElevenLabs handles interruption automatically via barge-in + // No explicit API to call + emit('mode_change', { mode: 'listening' }) + emit('interrupted', {}) + }, + + on( + event: TEvent, + handler: RealtimeEventHandler, + ): () => void { + if (!eventHandlers.has(event)) { + eventHandlers.set(event, new Set()) + } + eventHandlers.get(event)!.add(handler) + + return () => { + eventHandlers.get(event)?.delete(handler) + } + }, + + getAudioVisualization(): AudioVisualization { + return { + get inputLevel() { + if (!conversation) return 0 + try { + return conversation.getInputVolume() + } catch { + return 0 + } + }, + + get outputLevel() { + if (!conversation) return 0 + try { + return conversation.getOutputVolume() + } catch { + return 0 + } + }, + + getInputFrequencyData() { + if (!conversation) return emptyFrequencyData + try { + return conversation.getInputByteFrequencyData() + } catch { + return emptyFrequencyData + } + }, + + getOutputFrequencyData() { + if (!conversation) return emptyFrequencyData + try { + return conversation.getOutputByteFrequencyData() + } catch { + return emptyFrequencyData + } + }, + + getInputTimeDomainData() { + // ElevenLabs SDK doesn't expose time domain data + return emptyTimeDomainData + }, + + getOutputTimeDomainData() { + // ElevenLabs SDK doesn't expose time domain data + return emptyTimeDomainData + }, + + get inputSampleRate() { + return 16000 + }, + + get outputSampleRate() { + return 16000 + }, + } + }, + } + + return connection +} diff --git a/packages/typescript/ai-elevenlabs/src/realtime/index.ts b/packages/typescript/ai-elevenlabs/src/realtime/index.ts new file mode 100644 index 000000000..db176897e --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/index.ts @@ -0,0 +1,14 @@ +// Token adapter for server-side use +export { elevenlabsRealtimeToken } from './token' + +// Client adapter for browser use +export { elevenlabsRealtime } from 
'./adapter' + +// Types +export type { + ElevenLabsRealtimeTokenOptions, + ElevenLabsRealtimeOptions, + ElevenLabsConversationMode, + ElevenLabsVADConfig, + ElevenLabsClientTool, +} from './types' diff --git a/packages/typescript/ai-elevenlabs/src/realtime/token.ts b/packages/typescript/ai-elevenlabs/src/realtime/token.ts new file mode 100644 index 000000000..030d0c9a9 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/token.ts @@ -0,0 +1,103 @@ +import type { RealtimeToken, RealtimeTokenAdapter } from '@tanstack/ai' +import type { ElevenLabsRealtimeTokenOptions } from './types' + +const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1' + +/** + * Get ElevenLabs API key from environment + */ +function getElevenLabsApiKey(): string { + // Check process.env (Node.js) + if (typeof process !== 'undefined' && process.env.ELEVENLABS_API_KEY) { + return process.env.ELEVENLABS_API_KEY + } + + // Check window.env (Browser with injected env) + if ( + typeof window !== 'undefined' && + (window as unknown as { env?: { ELEVENLABS_API_KEY?: string } }).env + ?.ELEVENLABS_API_KEY + ) { + return (window as unknown as { env: { ELEVENLABS_API_KEY: string } }).env + .ELEVENLABS_API_KEY + } + + throw new Error( + 'ELEVENLABS_API_KEY not found in environment variables. ' + + 'Please set ELEVENLABS_API_KEY in your environment.', + ) +} + +/** + * Creates an ElevenLabs realtime token adapter. + * + * This adapter generates signed URLs for client-side connections. + * The signed URL is valid for 30 minutes. 
+ * + * @param options - Configuration options including agentId + * @returns A RealtimeTokenAdapter for use with realtimeToken() + * + * @example + * ```typescript + * import { realtimeToken } from '@tanstack/ai' + * import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' + * + * const token = await realtimeToken({ + * adapter: elevenlabsRealtimeToken({ + * agentId: 'your-agent-id', + * }), + * }) + * ``` + */ +export function elevenlabsRealtimeToken( + options: ElevenLabsRealtimeTokenOptions, +): RealtimeTokenAdapter { + const apiKey = getElevenLabsApiKey() + + return { + provider: 'elevenlabs', + + async generateToken(): Promise { + const { agentId, overrides } = options + + // Get signed URL from ElevenLabs + const response = await fetch( + `${ELEVENLABS_API_URL}/convai/conversation/get_signed_url?agent_id=${agentId}`, + { + method: 'GET', + headers: { + 'xi-api-key': apiKey, + }, + }, + ) + + if (!response.ok) { + const errorText = await response.text() + throw new Error( + `ElevenLabs signed URL request failed: ${response.status} ${errorText}`, + ) + } + + const data = await response.json() + const signedUrl = data.signed_url as string + + // Signed URLs are valid for 30 minutes + const expiresAt = Date.now() + 30 * 60 * 1000 + + return { + provider: 'elevenlabs', + token: signedUrl, + expiresAt, + config: { + voice: overrides?.voiceId, + instructions: overrides?.systemPrompt, + providerOptions: { + agentId, + firstMessage: overrides?.firstMessage, + language: overrides?.language, + }, + }, + } + }, + } +} diff --git a/packages/typescript/ai-elevenlabs/src/realtime/types.ts b/packages/typescript/ai-elevenlabs/src/realtime/types.ts new file mode 100644 index 000000000..c3f5227f7 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/realtime/types.ts @@ -0,0 +1,55 @@ +/** + * Options for the ElevenLabs realtime token adapter + */ +export interface ElevenLabsRealtimeTokenOptions { + /** Agent ID configured in ElevenLabs dashboard */ + agentId: 
string + /** Optional override values for the agent */ + overrides?: { + /** Custom voice ID to use */ + voiceId?: string + /** Custom system prompt */ + systemPrompt?: string + /** First message the agent should speak */ + firstMessage?: string + /** Language code (e.g., 'en') */ + language?: string + } +} + +/** + * Options for the ElevenLabs realtime client adapter + */ +export interface ElevenLabsRealtimeOptions { + /** Connection mode (default: auto-detect) */ + connectionMode?: 'websocket' | 'webrtc' + /** Enable debug logging */ + debug?: boolean +} + +/** + * ElevenLabs conversation mode + */ +export type ElevenLabsConversationMode = 'speaking' | 'listening' + +/** + * ElevenLabs voice activity detection configuration + */ +export interface ElevenLabsVADConfig { + /** VAD threshold (0.1-0.9) */ + vadThreshold?: number + /** Silence threshold in seconds (0.3-3.0) */ + vadSilenceThresholdSecs?: number + /** Minimum speech duration in ms */ + minSpeechDurationMs?: number + /** Minimum silence duration in ms */ + minSilenceDurationMs?: number +} + +/** + * Client tool definition for ElevenLabs + */ +export interface ElevenLabsClientTool { + /** Tool handler function */ + handler: (params: TParams) => Promise | TResult +} diff --git a/packages/typescript/ai-elevenlabs/tsconfig.json b/packages/typescript/ai-elevenlabs/tsconfig.json new file mode 100644 index 000000000..e5e872741 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/tsconfig.json @@ -0,0 +1,8 @@ +{ + "extends": "../../../tsconfig.json", + "compilerOptions": { + "outDir": "dist" + }, + "include": ["vite.config.ts", "./src"], + "exclude": ["node_modules", "dist", "**/*.config.ts"] +} diff --git a/packages/typescript/ai-elevenlabs/vite.config.ts b/packages/typescript/ai-elevenlabs/vite.config.ts new file mode 100644 index 000000000..11f5b20b7 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/vite.config.ts @@ -0,0 +1,37 @@ +import { defineConfig, mergeConfig } from 'vitest/config' +import { 
tanstackViteConfig } from '@tanstack/vite-config' +import packageJson from './package.json' + +const config = defineConfig({ + test: { + name: packageJson.name, + dir: './', + watch: false, + + globals: true, + environment: 'node', + include: ['tests/**/*.test.ts'], + coverage: { + provider: 'v8', + reporter: ['text', 'json', 'html', 'lcov'], + exclude: [ + 'node_modules/', + 'dist/', + 'tests/', + '**/*.test.ts', + '**/*.config.ts', + '**/types.ts', + ], + include: ['src/**/*.ts'], + }, + }, +}) + +export default mergeConfig( + config, + tanstackViteConfig({ + entry: ['./src/index.ts'], + srcDir: './src', + cjs: false, + }), +) diff --git a/packages/typescript/ai-openai/package.json b/packages/typescript/ai-openai/package.json index 917eb13aa..a137eed03 100644 --- a/packages/typescript/ai-openai/package.json +++ b/packages/typescript/ai-openai/package.json @@ -44,10 +44,12 @@ }, "peerDependencies": { "@tanstack/ai": "workspace:^", + "@tanstack/ai-client": "workspace:^", "zod": "^4.0.0" }, "devDependencies": { "@tanstack/ai": "workspace:*", + "@tanstack/ai-client": "workspace:*", "@vitest/coverage-v8": "4.0.14", "vite": "^7.2.7", "zod": "^4.2.0" diff --git a/packages/typescript/ai-openai/src/index.ts b/packages/typescript/ai-openai/src/index.ts index ffba8da87..afadc4529 100644 --- a/packages/typescript/ai-openai/src/index.ts +++ b/packages/typescript/ai-openai/src/index.ts @@ -100,3 +100,19 @@ export type { OpenAIMessageMetadataByModality, } from './message-types' export type { OpenAIClientConfig } from './utils/client' + +// ============================================================================ +// Realtime (Voice) Adapters +// ============================================================================ + +export { openaiRealtimeToken, openaiRealtime } from './realtime/index' + +export type { + OpenAIRealtimeVoice, + OpenAIRealtimeModel, + OpenAIRealtimeTokenOptions, + OpenAIRealtimeOptions, + OpenAITurnDetection, + OpenAISemanticVADConfig, + 
OpenAIServerVADConfig, +} from './realtime/index' diff --git a/packages/typescript/ai-openai/src/realtime/adapter.ts b/packages/typescript/ai-openai/src/realtime/adapter.ts new file mode 100644 index 000000000..35187a5d2 --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/adapter.ts @@ -0,0 +1,683 @@ +import type { + AnyClientTool, + AudioVisualization, + RealtimeEvent, + RealtimeEventHandler, + RealtimeMessage, + RealtimeMode, + RealtimeSessionConfig, + RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { RealtimeAdapter, RealtimeConnection } from '@tanstack/ai-client' +import type { OpenAIRealtimeOptions } from './types' + +const OPENAI_REALTIME_URL = 'https://api.openai.com/v1/realtime' + +/** + * Creates an OpenAI realtime adapter for client-side use. + * + * Uses WebRTC for browser connections (default) or WebSocket for Node.js. + * + * @param options - Optional configuration + * @returns A RealtimeAdapter for use with RealtimeClient + * + * @example + * ```typescript + * import { RealtimeClient } from '@tanstack/ai-client' + * import { openaiRealtime } from '@tanstack/ai-openai' + * + * const client = new RealtimeClient({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: openaiRealtime(), + * }) + * ``` + */ +export function openaiRealtime( + options: OpenAIRealtimeOptions = {}, +): RealtimeAdapter { + const connectionMode = options.connectionMode ?? 'webrtc' + + return { + provider: 'openai', + + async connect( + token: RealtimeToken, + _clientTools?: ReadonlyArray, + ): Promise { + if (connectionMode === 'webrtc') { + return createWebRTCConnection(token) + } + throw new Error('WebSocket connection mode not yet implemented') + }, + } +} + +/** + * Creates a WebRTC connection to OpenAI's realtime API + */ +async function createWebRTCConnection( + token: RealtimeToken, +): Promise { + const model = token.config.model ?? 
'gpt-4o-realtime-preview' + const eventHandlers = new Map>>() + + // WebRTC peer connection + const pc = new RTCPeerConnection() + + // Audio context for visualization + let audioContext: AudioContext | null = null + let inputAnalyser: AnalyserNode | null = null + let outputAnalyser: AnalyserNode | null = null + let inputSource: MediaStreamAudioSourceNode | null = null + let outputSource: MediaStreamAudioSourceNode | null = null + let localStream: MediaStream | null = null + + // Audio element for playback (more reliable than AudioContext.destination) + let audioElement: HTMLAudioElement | null = null + + // Data channel for events + let dataChannel: RTCDataChannel | null = null + + // Current state + let currentMode: RealtimeMode = 'idle' + let currentMessageId: string | null = null + + // Empty arrays for when visualization isn't available + // frequencyBinCount = fftSize / 2 = 1024 + const emptyFrequencyData = new Uint8Array(1024) + const emptyTimeDomainData = new Uint8Array(2048).fill(128) // 128 is silence + + // Helper to emit events (defined early so it can be used during setup) + function emit( + event: TEvent, + payload: Parameters>[0], + ) { + const handlers = eventHandlers.get(event) + if (handlers) { + for (const handler of handlers) { + handler(payload) + } + } + } + + // Set up data channel for bidirectional communication + dataChannel = pc.createDataChannel('oai-events') + + // Promise that resolves when the data channel is open and ready + const dataChannelReady = new Promise((resolve) => { + dataChannel!.onopen = () => { + flushPendingEvents() + emit('status_change', { status: 'connected' as RealtimeStatus }) + resolve() + } + }) + + dataChannel.onmessage = (event) => { + try { + const message = JSON.parse(event.data) + handleServerEvent(message) + } catch (e) { + console.error('Failed to parse realtime event:', e) + } + } + + dataChannel.onerror = (error) => { + emit('error', { error: new Error(`Data channel error: ${error}`) }) + } + + // Handle 
incoming audio track + pc.ontrack = (event) => { + if (event.track.kind === 'audio' && event.streams[0]) { + setupOutputAudioAnalysis(event.streams[0]) + } + } + + // IMPORTANT: Request microphone access and add audio track BEFORE creating offer + // OpenAI's Realtime API requires an audio track in the SDP offer + try { + localStream = await navigator.mediaDevices.getUserMedia({ + audio: { + echoCancellation: true, + noiseSuppression: true, + sampleRate: 24000, + }, + }) + + // Add audio track to peer connection + for (const track of localStream.getAudioTracks()) { + pc.addTrack(track, localStream) + } + } catch (error) { + throw new Error( + `Microphone access required for realtime voice: ${error instanceof Error ? error.message : error}`, + ) + } + + // Create and set local description (now includes audio track) + const offer = await pc.createOffer() + await pc.setLocalDescription(offer) + + // Send SDP to OpenAI and get answer + const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}?model=${model}`, { + method: 'POST', + headers: { + Authorization: `Bearer ${token.token}`, + 'Content-Type': 'application/sdp', + }, + body: offer.sdp, + }) + + if (!sdpResponse.ok) { + const errorText = await sdpResponse.text() + throw new Error( + `Failed to establish WebRTC connection: ${sdpResponse.status} - ${errorText}`, + ) + } + + const answerSdp = await sdpResponse.text() + await pc.setRemoteDescription({ type: 'answer', sdp: answerSdp }) + + // Set up input audio analysis now that we have the stream + setupInputAudioAnalysis(localStream) + + // Handle server events + function handleServerEvent(event: Record) { + const type = event.type as string + + switch (type) { + case 'session.created': + case 'session.updated': + // Session ready + break + + case 'input_audio_buffer.speech_started': + currentMode = 'listening' + emit('mode_change', { mode: 'listening' }) + break + + case 'input_audio_buffer.speech_stopped': + currentMode = 'thinking' + emit('mode_change', { mode: 
'thinking' }) + break + + case 'input_audio_buffer.committed': + // Audio buffer committed for processing + break + + case 'conversation.item.input_audio_transcription.completed': { + const transcript = event.transcript as string + emit('transcript', { role: 'user', transcript, isFinal: true }) + break + } + + case 'response.created': + currentMode = 'thinking' + emit('mode_change', { mode: 'thinking' }) + break + + case 'response.output_item.added': { + const item = event.item as Record + if (item.type === 'message') { + currentMessageId = item.id as string + } + break + } + + case 'response.audio_transcript.delta': { + const delta = event.delta as string + emit('transcript', { + role: 'assistant', + transcript: delta, + isFinal: false, + }) + break + } + + case 'response.audio_transcript.done': { + const transcript = event.transcript as string + emit('transcript', { role: 'assistant', transcript, isFinal: true }) + break + } + + case 'response.output_text.delta': { + const delta = event.delta as string + emit('transcript', { + role: 'assistant', + transcript: delta, + isFinal: false, + }) + break + } + + case 'response.output_text.done': { + const text = event.text as string + emit('transcript', { + role: 'assistant', + transcript: text, + isFinal: true, + }) + break + } + + case 'response.audio.delta': + if (currentMode !== 'speaking') { + currentMode = 'speaking' + emit('mode_change', { mode: 'speaking' }) + } + break + + case 'response.audio.done': + break + + case 'response.function_call_arguments.done': { + const callId = event.call_id as string + const name = event.name as string + const args = event.arguments as string + try { + const input = JSON.parse(args) + emit('tool_call', { toolCallId: callId, toolName: name, input }) + } catch { + emit('tool_call', { toolCallId: callId, toolName: name, input: args }) + } + break + } + + case 'response.done': { + const response = event.response as Record + const output = response.output as + | Array> + | undefined + 
+ currentMode = 'listening' + emit('mode_change', { mode: 'listening' }) + + // Emit message complete if we have a current message + if (currentMessageId) { + const message: RealtimeMessage = { + id: currentMessageId, + role: 'assistant', + timestamp: Date.now(), + parts: [], + } + + // Extract content from output items + for (const item of output || []) { + if (item.type === 'message' && item.content) { + const content = item.content as Array> + for (const part of content) { + if (part.type === 'audio' && part.transcript) { + message.parts.push({ + type: 'audio', + transcript: part.transcript as string, + }) + } else if (part.type === 'text' && part.text) { + message.parts.push({ + type: 'text', + content: part.text as string, + }) + } + } + } + } + + emit('message_complete', { message }) + currentMessageId = null + } + break + } + + case 'conversation.item.truncated': + emit('interrupted', { messageId: currentMessageId ?? undefined }) + break + + case 'error': { + const error = event.error as Record + emit('error', { + error: new Error((error.message as string) || 'Unknown error'), + }) + break + } + } + } + + // Set up audio analysis for output + function setupOutputAudioAnalysis(stream: MediaStream) { + // Create audio element for playback - this is the standard way to play WebRTC audio + audioElement = new Audio() + audioElement.srcObject = stream + audioElement.autoplay = true + // Some browsers require this for autoplay + audioElement.play().catch((e) => { + console.warn('Audio autoplay failed:', e) + }) + + // Set up AudioContext for visualization only (not playback) + if (!audioContext) { + audioContext = new AudioContext() + } + + // Resume AudioContext if suspended (browsers require user interaction) + if (audioContext.state === 'suspended') { + audioContext.resume().catch(() => { + // Ignore - visualization just won't work + }) + } + + outputAnalyser = audioContext.createAnalyser() + outputAnalyser.fftSize = 2048 // Larger size for more accurate level 
detection + outputAnalyser.smoothingTimeConstant = 0.3 + + outputSource = audioContext.createMediaStreamSource(stream) + outputSource.connect(outputAnalyser) + // Don't connect to destination - the Audio element handles playback + } + + // Set up audio analysis for input + function setupInputAudioAnalysis(stream: MediaStream) { + if (!audioContext) { + audioContext = new AudioContext() + } + + // Resume AudioContext if suspended (browsers require user interaction) + if (audioContext.state === 'suspended') { + audioContext.resume().catch(() => { + // Ignore - visualization just won't work + }) + } + + inputAnalyser = audioContext.createAnalyser() + inputAnalyser.fftSize = 2048 // Larger size for more accurate level detection + inputAnalyser.smoothingTimeConstant = 0.3 + + inputSource = audioContext.createMediaStreamSource(stream) + inputSource.connect(inputAnalyser) + } + + // Queue for events sent before the data channel is open + const pendingEvents: Array> = [] + + // Send event to server (queues if data channel not yet open) + function sendEvent(event: Record) { + if (dataChannel?.readyState === 'open') { + dataChannel.send(JSON.stringify(event)) + } else { + pendingEvents.push(event) + } + } + + // Flush any queued events (called when data channel opens) + function flushPendingEvents() { + for (const event of pendingEvents) { + dataChannel!.send(JSON.stringify(event)) + } + pendingEvents.length = 0 + } + + // Connection implementation + const connection: RealtimeConnection = { + async disconnect() { + if (localStream) { + for (const track of localStream.getTracks()) { + track.stop() + } + localStream = null + } + + if (audioElement) { + audioElement.pause() + audioElement.srcObject = null + audioElement = null + } + + if (dataChannel) { + dataChannel.close() + dataChannel = null + } + + pc.close() + + if (audioContext) { + await audioContext.close() + audioContext = null + } + + emit('status_change', { status: 'idle' as RealtimeStatus }) + }, + + async 
startAudioCapture() { + // Audio capture is established during connection setup + // This method enables the tracks and signals listening mode + if (localStream) { + for (const track of localStream.getAudioTracks()) { + track.enabled = true + } + } + currentMode = 'listening' + emit('mode_change', { mode: 'listening' }) + }, + + stopAudioCapture() { + // Disable tracks rather than stopping them to allow re-enabling + if (localStream) { + for (const track of localStream.getAudioTracks()) { + track.enabled = false + } + } + currentMode = 'idle' + emit('mode_change', { mode: 'idle' }) + }, + + sendText(text: string) { + sendEvent({ + type: 'conversation.item.create', + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text }], + }, + }) + sendEvent({ + type: 'response.create', + }) + }, + + sendImage(imageData: string, mimeType: string) { + // Determine if imageData is a URL or base64 data + const isUrl = + imageData.startsWith('http://') || imageData.startsWith('https://') + const imageContent = isUrl + ? { type: 'input_image', image_url: imageData } + : { + type: 'input_image', + image_url: `data:${mimeType};base64,${imageData}`, + } + + sendEvent({ + type: 'conversation.item.create', + item: { + type: 'message', + role: 'user', + content: [imageContent], + }, + }) + sendEvent({ + type: 'response.create', + }) + }, + + sendToolResult(callId: string, result: string) { + sendEvent({ + type: 'conversation.item.create', + item: { + type: 'function_call_output', + call_id: callId, + output: result, + }, + }) + sendEvent({ type: 'response.create' }) + }, + + updateSession(config: Partial) { + const sessionUpdate: Record = {} + + if (config.instructions) { + sessionUpdate.instructions = config.instructions + } + + if (config.voice) { + sessionUpdate.voice = config.voice + } + + if (config.vadMode) { + if (config.vadMode === 'semantic') { + sessionUpdate.turn_detection = { + type: 'semantic_vad', + eagerness: config.semanticEagerness ?? 
'medium', + } + } else if (config.vadMode === 'server') { + sessionUpdate.turn_detection = { + type: 'server_vad', + threshold: config.vadConfig?.threshold ?? 0.5, + prefix_padding_ms: config.vadConfig?.prefixPaddingMs ?? 300, + silence_duration_ms: config.vadConfig?.silenceDurationMs ?? 500, + } + } else { + sessionUpdate.turn_detection = null + } + } + + if (config.tools !== undefined) { + sessionUpdate.tools = config.tools.map((t) => ({ + type: 'function', + name: t.name, + description: t.description, + parameters: t.inputSchema ?? { type: 'object', properties: {} }, + })) + sessionUpdate.tool_choice = 'auto' + } + + if (config.outputModalities) { + sessionUpdate.modalities = config.outputModalities + } + + if (config.temperature !== undefined) { + sessionUpdate.temperature = config.temperature + } + + if (config.maxOutputTokens !== undefined) { + sessionUpdate.max_response_output_tokens = config.maxOutputTokens + } + + // Always enable input audio transcription so user speech is transcribed + sessionUpdate.input_audio_transcription = { model: 'whisper-1' } + + if (Object.keys(sessionUpdate).length > 0) { + sendEvent({ + type: 'session.update', + session: sessionUpdate, + }) + } + }, + + interrupt() { + sendEvent({ type: 'response.cancel' }) + currentMode = 'listening' + emit('mode_change', { mode: 'listening' }) + emit('interrupted', { messageId: currentMessageId ?? 
undefined }) + }, + + on( + event: TEvent, + handler: RealtimeEventHandler, + ): () => void { + if (!eventHandlers.has(event)) { + eventHandlers.set(event, new Set()) + } + eventHandlers.get(event)!.add(handler) + + return () => { + eventHandlers.get(event)?.delete(handler) + } + }, + + getAudioVisualization(): AudioVisualization { + // Helper to calculate audio level from time domain data + // Uses peak amplitude which is more responsive for voice audio meters + function calculateLevel(analyser: AnalyserNode): number { + const data = new Uint8Array(analyser.fftSize) + analyser.getByteTimeDomainData(data) + + // Find peak deviation from center (128 is silence) + // This is more responsive than RMS for voice level meters + let maxDeviation = 0 + for (const sample of data) { + const deviation = Math.abs(sample - 128) + if (deviation > maxDeviation) { + maxDeviation = deviation + } + } + + // Normalize to 0-1 range (max deviation is 128) + // Scale by 1.5x so that ~66% amplitude reads as full scale + // This provides good visual feedback without pegging too early + const normalized = maxDeviation / 128 + return Math.min(1, normalized * 1.5) + } + + return { + get inputLevel() { + if (!inputAnalyser) return 0 + return calculateLevel(inputAnalyser) + }, + + get outputLevel() { + if (!outputAnalyser) return 0 + return calculateLevel(outputAnalyser) + }, + + getInputFrequencyData() { + if (!inputAnalyser) return emptyFrequencyData + const data = new Uint8Array(inputAnalyser.frequencyBinCount) + inputAnalyser.getByteFrequencyData(data) + return data + }, + + getOutputFrequencyData() { + if (!outputAnalyser) return emptyFrequencyData + const data = new Uint8Array(outputAnalyser.frequencyBinCount) + outputAnalyser.getByteFrequencyData(data) + return data + }, + + getInputTimeDomainData() { + if (!inputAnalyser) return emptyTimeDomainData + const data = new Uint8Array(inputAnalyser.fftSize) + inputAnalyser.getByteTimeDomainData(data) + return data + }, + + 
getOutputTimeDomainData() { + if (!outputAnalyser) return emptyTimeDomainData + const data = new Uint8Array(outputAnalyser.fftSize) + outputAnalyser.getByteTimeDomainData(data) + return data + }, + + get inputSampleRate() { + return 24000 + }, + + get outputSampleRate() { + return 24000 + }, + } + }, + } + + // Wait for the data channel to be open before returning the connection. + // This ensures session.update (tools, instructions, etc.) can be sent immediately. + await dataChannelReady + + return connection +} diff --git a/packages/typescript/ai-openai/src/realtime/index.ts b/packages/typescript/ai-openai/src/realtime/index.ts new file mode 100644 index 000000000..d5ea156e6 --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/index.ts @@ -0,0 +1,16 @@ +// Token adapter for server-side use +export { openaiRealtimeToken } from './token' + +// Client adapter for browser use +export { openaiRealtime } from './adapter' + +// Types +export type { + OpenAIRealtimeVoice, + OpenAIRealtimeModel, + OpenAIRealtimeTokenOptions, + OpenAIRealtimeOptions, + OpenAITurnDetection, + OpenAISemanticVADConfig, + OpenAIServerVADConfig, +} from './types' diff --git a/packages/typescript/ai-openai/src/realtime/token.ts b/packages/typescript/ai-openai/src/realtime/token.ts new file mode 100644 index 000000000..6bff9c9c2 --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/token.ts @@ -0,0 +1,82 @@ +import { getOpenAIApiKeyFromEnv } from '../utils/client' +import type { RealtimeToken, RealtimeTokenAdapter } from '@tanstack/ai' +import type { + OpenAIRealtimeModel, + OpenAIRealtimeSessionResponse, + OpenAIRealtimeTokenOptions, +} from './types' + +const OPENAI_REALTIME_SESSIONS_URL = + 'https://api.openai.com/v1/realtime/sessions' + +/** + * Creates an OpenAI realtime token adapter. + * + * This adapter generates ephemeral tokens for client-side WebRTC connections. + * The token is valid for 10 minutes. 
+ * + * @param options - Configuration options for the realtime session + * @returns A RealtimeTokenAdapter for use with realtimeToken() + * + * @example + * ```typescript + * import { realtimeToken } from '@tanstack/ai' + * import { openaiRealtimeToken } from '@tanstack/ai-openai' + * + * const token = await realtimeToken({ + * adapter: openaiRealtimeToken({ + * model: 'gpt-4o-realtime-preview', + * voice: 'alloy', + * instructions: 'You are a helpful assistant.', + * turnDetection: { + * type: 'semantic_vad', + * eagerness: 'medium', + * }, + * }), + * }) + * ``` + */ +export function openaiRealtimeToken( + options: OpenAIRealtimeTokenOptions = {}, +): RealtimeTokenAdapter { + const apiKey = getOpenAIApiKeyFromEnv() + + return { + provider: 'openai', + + async generateToken(): Promise { + const model: OpenAIRealtimeModel = + options.model ?? 'gpt-4o-realtime-preview' + + // Call OpenAI API to create session and get ephemeral token. + // Only the model is sent server-side; all other session config + // (instructions, voice, tools, VAD) is applied client-side via session.update. 
+ const response = await fetch(OPENAI_REALTIME_SESSIONS_URL, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ model }), + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error( + `OpenAI realtime session creation failed: ${response.status} ${errorText}`, + ) + } + + const sessionData: OpenAIRealtimeSessionResponse = await response.json() + + return { + provider: 'openai', + token: sessionData.client_secret.value, + expiresAt: sessionData.client_secret.expires_at * 1000, + config: { + model: sessionData.model, + }, + } + }, + } +} diff --git a/packages/typescript/ai-openai/src/realtime/types.ts b/packages/typescript/ai-openai/src/realtime/types.ts new file mode 100644 index 000000000..f4d36d9cc --- /dev/null +++ b/packages/typescript/ai-openai/src/realtime/types.ts @@ -0,0 +1,104 @@ +import type { VADConfig } from '@tanstack/ai' + +/** + * OpenAI realtime voice options + */ +export type OpenAIRealtimeVoice = + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'sage' + | 'shimmer' + | 'verse' + | 'marin' + | 'cedar' + +/** + * OpenAI realtime model options + */ +export type OpenAIRealtimeModel = + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17' + | 'gpt-realtime' + | 'gpt-realtime-mini' + +/** + * OpenAI semantic VAD configuration + */ +export interface OpenAISemanticVADConfig { + type: 'semantic_vad' + /** Eagerness level for turn detection */ + eagerness?: 'low' | 'medium' | 'high' +} + +/** + * OpenAI server VAD configuration + */ +export interface OpenAIServerVADConfig extends VADConfig { + type: 'server_vad' +} + +/** + * OpenAI turn detection configuration + */ +export type OpenAITurnDetection = + | OpenAISemanticVADConfig + | OpenAIServerVADConfig + | null + +/** + * Options for the OpenAI realtime token adapter + */ +export 
interface OpenAIRealtimeTokenOptions { + /** Model to use (default: 'gpt-4o-realtime-preview') */ + model?: OpenAIRealtimeModel +} + +/** + * Options for the OpenAI realtime client adapter + */ +export interface OpenAIRealtimeOptions { + /** Connection mode (default: 'webrtc' in browser) */ + connectionMode?: 'webrtc' | 'websocket' +} + +/** + * OpenAI realtime session response from the API + */ +export interface OpenAIRealtimeSessionResponse { + id: string + object: 'realtime.session' + model: string + modalities: Array + instructions: string + voice: string + input_audio_format: string + output_audio_format: string + input_audio_transcription: { + model: string + } | null + turn_detection: { + type: string + threshold?: number + prefix_padding_ms?: number + silence_duration_ms?: number + eagerness?: string + } | null + tools: Array<{ + type: string + name: string + description: string + parameters: Record + }> + tool_choice: string + temperature: number + max_response_output_tokens: number | string + client_secret: { + value: string + expires_at: number + } +} diff --git a/packages/typescript/ai-react/src/index.ts b/packages/typescript/ai-react/src/index.ts index 24b460cdf..b261e803b 100644 --- a/packages/typescript/ai-react/src/index.ts +++ b/packages/typescript/ai-react/src/index.ts @@ -1,10 +1,15 @@ export { useChat } from './use-chat' +export { useRealtimeChat } from './use-realtime-chat' export type { UseChatOptions, UseChatReturn, UIMessage, ChatRequestBody, } from './types' +export type { + UseRealtimeChatOptions, + UseRealtimeChatReturn, +} from './realtime-types' // Generation hooks export { useGeneration } from './use-generation' diff --git a/packages/typescript/ai-react/src/realtime-types.ts b/packages/typescript/ai-react/src/realtime-types.ts new file mode 100644 index 000000000..bad512d29 --- /dev/null +++ b/packages/typescript/ai-react/src/realtime-types.ts @@ -0,0 +1,143 @@ +import type { + AnyClientTool, + RealtimeMessage, + RealtimeMode, + 
RealtimeStatus, + RealtimeToken, +} from '@tanstack/ai' +import type { RealtimeAdapter } from '@tanstack/ai-client' + +/** + * Options for the useRealtimeChat hook. + */ +export interface UseRealtimeChatOptions { + /** + * Function to fetch a realtime token from the server. + * Called on connect and when token needs refresh. + */ + getToken: () => Promise + + /** + * The realtime adapter to use (e.g., openaiRealtime()) + */ + adapter: RealtimeAdapter + + /** + * Client-side tools with execution logic + */ + tools?: ReadonlyArray + + /** + * Auto-play assistant audio (default: true) + */ + autoPlayback?: boolean + + /** + * Request microphone access on connect (default: true) + */ + autoCapture?: boolean + + /** + * System instructions for the assistant + */ + instructions?: string + + /** + * Voice to use for audio output + */ + voice?: string + + /** + * Voice activity detection mode (default: 'server') + */ + vadMode?: 'server' | 'semantic' | 'manual' + + /** + * Output modalities for responses (e.g., ['audio', 'text']) + */ + outputModalities?: Array<'audio' | 'text'> + + /** + * Temperature for generation (provider-specific range) + */ + temperature?: number + + /** + * Maximum number of tokens in a response + */ + maxOutputTokens?: number | 'inf' + + /** + * Eagerness level for semantic VAD ('low', 'medium', 'high') + */ + semanticEagerness?: 'low' | 'medium' | 'high' + + // Callbacks + onConnect?: () => void + onDisconnect?: () => void + onError?: (error: Error) => void + onMessage?: (message: RealtimeMessage) => void + onModeChange?: (mode: RealtimeMode) => void + onInterrupted?: () => void +} + +/** + * Return type for the useRealtimeChat hook. 
+ */ +export interface UseRealtimeChatReturn { + // Connection state + /** Current connection status */ + status: RealtimeStatus + /** Current error, if any */ + error: Error | null + /** Connect to the realtime session */ + connect: () => Promise + /** Disconnect from the realtime session */ + disconnect: () => Promise + + // Conversation state + /** Current mode (idle, listening, thinking, speaking) */ + mode: RealtimeMode + /** Conversation messages */ + messages: Array + /** User transcript while speaking (before finalized) */ + pendingUserTranscript: string | null + /** Assistant transcript while speaking (before finalized) */ + pendingAssistantTranscript: string | null + + // Voice control + /** Start listening for voice input (manual VAD mode) */ + startListening: () => void + /** Stop listening for voice input (manual VAD mode) */ + stopListening: () => void + /** Interrupt the current assistant response */ + interrupt: () => void + + // Text input + /** Send a text message instead of voice */ + sendText: (text: string) => void + + // Image input + /** Send an image to the conversation */ + sendImage: (imageData: string, mimeType: string) => void + + // Audio visualization (0-1 normalized) + /** Current input (microphone) volume level */ + inputLevel: number + /** Current output (speaker) volume level */ + outputLevel: number + /** Get frequency data for input audio visualization */ + getInputFrequencyData: () => Uint8Array + /** Get frequency data for output audio visualization */ + getOutputFrequencyData: () => Uint8Array + /** Get time domain data for input waveform */ + getInputTimeDomainData: () => Uint8Array + /** Get time domain data for output waveform */ + getOutputTimeDomainData: () => Uint8Array + + // VAD control + /** Current VAD mode */ + vadMode: 'server' | 'semantic' | 'manual' + /** Change VAD mode at runtime */ + setVADMode: (mode: 'server' | 'semantic' | 'manual') => void +} diff --git 
a/packages/typescript/ai-react/src/use-realtime-chat.ts b/packages/typescript/ai-react/src/use-realtime-chat.ts new file mode 100644 index 000000000..63272821f --- /dev/null +++ b/packages/typescript/ai-react/src/use-realtime-chat.ts @@ -0,0 +1,277 @@ +import { useCallback, useEffect, useRef, useState } from 'react' +import { RealtimeClient } from '@tanstack/ai-client' +import type { + RealtimeMessage, + RealtimeMode, + RealtimeStatus, +} from '@tanstack/ai' +import type { + UseRealtimeChatOptions, + UseRealtimeChatReturn, +} from './realtime-types' + +// Empty frequency data for when client is not connected +const emptyFrequencyData = new Uint8Array(128) +const emptyTimeDomainData = new Uint8Array(128).fill(128) + +/** + * React hook for realtime voice conversations. + * + * Provides a simple interface for voice-to-voice AI interactions + * with support for multiple providers (OpenAI, ElevenLabs, etc.). + * + * @param options - Configuration options including adapter and callbacks + * @returns Hook return value with state and control methods + * + * @example + * ```typescript + * import { useRealtimeChat } from '@tanstack/ai-react' + * import { openaiRealtime } from '@tanstack/ai-openai' + * + * function VoiceChat() { + * const { + * status, + * mode, + * messages, + * connect, + * disconnect, + * inputLevel, + * outputLevel, + * } = useRealtimeChat({ + * getToken: () => fetch('/api/realtime-token').then(r => r.json()), + * adapter: openaiRealtime(), + * }) + * + * return ( + *
+ *     <div>
+ *       <div>Status: {status}</div>
+ *       <div>Mode: {mode}</div>
+ *       <button onClick={status === 'connected' ? disconnect : connect}>
+ *         {status === 'connected' ? 'Disconnect' : 'Connect'}
+ *       </button>
+ *     </div>
+ * ) + * } + * ``` + */ +export function useRealtimeChat( + options: UseRealtimeChatOptions, +): UseRealtimeChatReturn { + // State + const [status, setStatus] = useState('idle') + const [mode, setMode] = useState('idle') + const [messages, setMessages] = useState>([]) + const [pendingUserTranscript, setPendingUserTranscript] = useState< + string | null + >(null) + const [pendingAssistantTranscript, setPendingAssistantTranscript] = useState< + string | null + >(null) + const [error, setError] = useState(null) + const [inputLevel, setInputLevel] = useState(0) + const [outputLevel, setOutputLevel] = useState(0) + const [vadMode, setVADModeState] = useState<'server' | 'semantic' | 'manual'>( + options.vadMode ?? 'server', + ) + + // Refs + const clientRef = useRef(null) + const optionsRef = useRef(options) + optionsRef.current = options + const animationFrameRef = useRef(null) + + // Create client instance - use ref to ensure we reuse the same instance + // This handles React StrictMode double-rendering + if (!clientRef.current) { + clientRef.current = new RealtimeClient({ + getToken: optionsRef.current.getToken, + adapter: optionsRef.current.adapter, + tools: optionsRef.current.tools, + instructions: optionsRef.current.instructions, + voice: optionsRef.current.voice, + autoPlayback: optionsRef.current.autoPlayback, + autoCapture: optionsRef.current.autoCapture, + vadMode: optionsRef.current.vadMode, + outputModalities: optionsRef.current.outputModalities, + temperature: optionsRef.current.temperature, + maxOutputTokens: optionsRef.current.maxOutputTokens, + semanticEagerness: optionsRef.current.semanticEagerness, + onStatusChange: (newStatus) => { + setStatus(newStatus) + }, + onModeChange: (newMode) => { + setMode(newMode) + optionsRef.current.onModeChange?.(newMode) + }, + onMessage: (message) => { + setMessages((prev) => [...prev, message]) + optionsRef.current.onMessage?.(message) + }, + onError: (err) => { + setError(err) + optionsRef.current.onError?.(err) + 
}, + onConnect: () => { + setError(null) + optionsRef.current.onConnect?.() + }, + onDisconnect: () => { + optionsRef.current.onDisconnect?.() + }, + onInterrupted: () => { + setPendingAssistantTranscript(null) + optionsRef.current.onInterrupted?.() + }, + }) + + // Subscribe to state changes for transcripts + clientRef.current.onStateChange((state) => { + setPendingUserTranscript(state.pendingUserTranscript) + setPendingAssistantTranscript(state.pendingAssistantTranscript) + }) + } + + const client = clientRef.current + + // Audio level animation loop + useEffect(() => { + function updateLevels() { + if (clientRef.current?.audio) { + setInputLevel(clientRef.current.audio.inputLevel) + setOutputLevel(clientRef.current.audio.outputLevel) + } + animationFrameRef.current = requestAnimationFrame(updateLevels) + } + + if (status === 'connected') { + updateLevels() + } + + return () => { + if (animationFrameRef.current) { + cancelAnimationFrame(animationFrameRef.current) + animationFrameRef.current = null + } + } + }, [status]) + + // Cleanup on unmount + useEffect(() => { + return () => { + clientRef.current?.destroy() + } + }, []) + + // Connection methods + const connect = useCallback(async () => { + setError(null) + setMessages([]) + setPendingUserTranscript(null) + setPendingAssistantTranscript(null) + await client.connect() + }, [client]) + + const disconnect = useCallback(async () => { + await client.disconnect() + }, [client]) + + // Voice control methods + const startListening = useCallback(() => { + client.startListening() + }, [client]) + + const stopListening = useCallback(() => { + client.stopListening() + }, [client]) + + const interrupt = useCallback(() => { + client.interrupt() + }, [client]) + + // Text input + const sendText = useCallback( + (text: string) => { + client.sendText(text) + }, + [client], + ) + + // Image input + const sendImage = useCallback( + (imageData: string, mimeType: string) => { + client.sendImage(imageData, mimeType) + }, + 
[client], + ) + + // Audio visualization + const getInputFrequencyData = useCallback(() => { + return ( + clientRef.current?.audio?.getInputFrequencyData() ?? emptyFrequencyData + ) + }, []) + + const getOutputFrequencyData = useCallback(() => { + return ( + clientRef.current?.audio?.getOutputFrequencyData() ?? emptyFrequencyData + ) + }, []) + + const getInputTimeDomainData = useCallback(() => { + return ( + clientRef.current?.audio?.getInputTimeDomainData() ?? emptyTimeDomainData + ) + }, []) + + const getOutputTimeDomainData = useCallback(() => { + return ( + clientRef.current?.audio?.getOutputTimeDomainData() ?? emptyTimeDomainData + ) + }, []) + + // VAD mode control + const setVADMode = useCallback( + (newMode: 'server' | 'semantic' | 'manual') => { + setVADModeState(newMode) + // TODO: Update session config if connected + }, + [], + ) + + return { + // Connection state + status, + error, + connect, + disconnect, + + // Conversation state + mode, + messages, + pendingUserTranscript, + pendingAssistantTranscript, + + // Voice control + startListening, + stopListening, + interrupt, + + // Text input + sendText, + + // Image input + sendImage, + + // Audio visualization + inputLevel, + outputLevel, + getInputFrequencyData, + getOutputFrequencyData, + getInputTimeDomainData, + getOutputTimeDomainData, + + // VAD control + vadMode, + setVADMode, + } +} diff --git a/packages/typescript/ai/src/index.ts b/packages/typescript/ai/src/index.ts index 7f0d4fece..8bf6d0a83 100644 --- a/packages/typescript/ai/src/index.ts +++ b/packages/typescript/ai/src/index.ts @@ -79,6 +79,31 @@ export { detectImageMimeType } from './utils' // Event client + event types export * from './event-client' +// Realtime +export { realtimeToken } from './realtime/index' +export type { + RealtimeToken, + RealtimeTokenAdapter, + RealtimeTokenOptions, + RealtimeSessionConfig, + VADConfig, + RealtimeMessage, + RealtimeMessagePart, + RealtimeTextPart, + RealtimeAudioPart, + RealtimeToolCallPart, + 
RealtimeToolResultPart, + RealtimeImagePart, + RealtimeStatus, + RealtimeMode, + AudioVisualization, + RealtimeEvent, + RealtimeEventPayloads, + RealtimeEventHandler, + RealtimeErrorCode, + RealtimeError, +} from './realtime/index' + // Message converters export { convertMessagesToModelMessages, diff --git a/packages/typescript/ai/src/realtime/index.ts b/packages/typescript/ai/src/realtime/index.ts new file mode 100644 index 000000000..74c450c1d --- /dev/null +++ b/packages/typescript/ai/src/realtime/index.ts @@ -0,0 +1,38 @@ +import type { RealtimeToken, RealtimeTokenOptions } from './types' + +// Re-export all types +export * from './types' + +/** + * Generate a realtime token using the provided adapter. + * + * This function is used on the server to generate ephemeral tokens + * that clients can use to establish realtime connections. + * + * @param options - Token generation options including the adapter + * @returns Promise resolving to a RealtimeToken + * + * @example + * ```typescript + * import { realtimeToken } from '@tanstack/ai' + * import { openaiRealtimeToken } from '@tanstack/ai-openai' + * + * // Server function (TanStack Start example) + * export const getRealtimeToken = createServerFn() + * .handler(async () => { + * return realtimeToken({ + * adapter: openaiRealtimeToken({ + * model: 'gpt-4o-realtime-preview', + * voice: 'alloy', + * instructions: 'You are a helpful assistant...', + * }), + * }) + * }) + * ``` + */ +export async function realtimeToken( + options: RealtimeTokenOptions, +): Promise { + const { adapter } = options + return adapter.generateToken() +} diff --git a/packages/typescript/ai/src/realtime/types.ts b/packages/typescript/ai/src/realtime/types.ts new file mode 100644 index 000000000..daaf6f57c --- /dev/null +++ b/packages/typescript/ai/src/realtime/types.ts @@ -0,0 +1,294 @@ +// ============================================================================ +// Token Types +// 
============================================================================ + +/** + * Voice activity detection configuration + */ +export interface VADConfig { + /** Sensitivity threshold (0.0-1.0) */ + threshold?: number + /** Audio to include before speech detection (ms) */ + prefixPaddingMs?: number + /** Silence duration to end turn (ms) */ + silenceDurationMs?: number +} + +/** + * Serializable tool descriptor for realtime session configuration. + * Contains only the metadata needed by providers, not Zod schemas or execute functions. + */ +export interface RealtimeToolConfig { + name: string + description: string + inputSchema?: Record +} + +/** + * Configuration for a realtime session + */ +export interface RealtimeSessionConfig { + /** Model to use for the session */ + model?: string + /** Voice to use for audio output */ + voice?: string + /** System instructions for the assistant */ + instructions?: string + /** Tools available in the session */ + tools?: Array + /** VAD mode */ + vadMode?: 'server' | 'semantic' | 'manual' + /** VAD configuration */ + vadConfig?: VADConfig + /** Output modalities for responses (e.g., ['audio', 'text'], ['text']) */ + outputModalities?: Array<'audio' | 'text'> + /** Temperature for generation (provider-specific range, e.g., 0.6-1.2 for OpenAI) */ + temperature?: number + /** Maximum number of tokens in a response */ + maxOutputTokens?: number | 'inf' + /** Eagerness level for semantic VAD ('low', 'medium', 'high') */ + semanticEagerness?: 'low' | 'medium' | 'high' + /** Provider-specific options */ + providerOptions?: Record +} + +/** + * Token returned by the server for client authentication + */ +export interface RealtimeToken { + /** Provider identifier */ + provider: string + /** The ephemeral token value */ + token: string + /** Token expiration timestamp (ms since epoch) */ + expiresAt: number + /** Session configuration embedded in the token */ + config: RealtimeSessionConfig +} + +/** + * Adapter interface for 
generating provider-specific tokens + */ +export interface RealtimeTokenAdapter { + /** Provider identifier */ + provider: string + /** Generate an ephemeral token for client use */ + generateToken: () => Promise +} + +/** + * Options for the realtimeToken function + */ +export interface RealtimeTokenOptions { + /** The token adapter to use */ + adapter: RealtimeTokenAdapter +} + +// ============================================================================ +// Message Types +// ============================================================================ + +/** + * Text content part in a realtime message + */ +export interface RealtimeTextPart { + type: 'text' + content: string +} + +/** + * Audio content part in a realtime message + */ +export interface RealtimeAudioPart { + type: 'audio' + /** Transcription of the audio */ + transcript: string + /** Raw audio data (optional, if stored) */ + audioData?: ArrayBuffer + /** Duration of the audio in milliseconds */ + durationMs?: number +} + +/** + * Tool call part in a realtime message + */ +export interface RealtimeToolCallPart { + type: 'tool-call' + id: string + name: string + arguments: string + input?: unknown + output?: unknown +} + +/** + * Tool result part in a realtime message + */ +export interface RealtimeToolResultPart { + type: 'tool-result' + toolCallId: string + content: string +} + +/** + * Image content part in a realtime message + */ +export interface RealtimeImagePart { + type: 'image' + /** Base64-encoded image data or a URL */ + data: string + /** MIME type of the image (e.g., 'image/png', 'image/jpeg') */ + mimeType: string +} + +/** + * Union of all realtime message parts + */ +export type RealtimeMessagePart = + | RealtimeTextPart + | RealtimeAudioPart + | RealtimeToolCallPart + | RealtimeToolResultPart + | RealtimeImagePart + +/** + * A message in a realtime conversation + */ +export interface RealtimeMessage { + /** Unique message identifier */ + id: string + /** Message role */ + role: 
'user' | 'assistant' + /** Timestamp when the message was created */ + timestamp: number + /** Content parts of the message */ + parts: Array + /** Whether this message was interrupted */ + interrupted?: boolean + /** Reference to audio buffer if stored */ + audioId?: string + /** Duration of the audio in milliseconds */ + durationMs?: number +} + +// ============================================================================ +// Status Types +// ============================================================================ + +/** + * Connection status of the realtime client + */ +export type RealtimeStatus = + | 'idle' + | 'connecting' + | 'connected' + | 'reconnecting' + | 'error' + +/** + * Current mode of the realtime session + */ +export type RealtimeMode = 'idle' | 'listening' | 'thinking' | 'speaking' + +// ============================================================================ +// Audio Visualization Types +// ============================================================================ + +/** + * Interface for accessing audio visualization data + */ +export interface AudioVisualization { + /** Input volume level (0-1 normalized) */ + readonly inputLevel: number + /** Output volume level (0-1 normalized) */ + readonly outputLevel: number + + /** Get frequency data for input audio visualization */ + getInputFrequencyData: () => Uint8Array + /** Get frequency data for output audio visualization */ + getOutputFrequencyData: () => Uint8Array + + /** Get time domain data for input waveform */ + getInputTimeDomainData: () => Uint8Array + /** Get time domain data for output waveform */ + getOutputTimeDomainData: () => Uint8Array + + /** Input sample rate */ + readonly inputSampleRate: number + /** Output sample rate */ + readonly outputSampleRate: number + + /** Subscribe to raw input audio samples */ + onInputAudio?: ( + callback: (samples: Float32Array, sampleRate: number) => void, + ) => () => void + /** Subscribe to raw output audio samples */ + 
onOutputAudio?: ( + callback: (samples: Float32Array, sampleRate: number) => void, + ) => () => void +} + +// ============================================================================ +// Event Types +// ============================================================================ + +/** + * Events emitted by the realtime connection + */ +export type RealtimeEvent = + | 'status_change' + | 'mode_change' + | 'transcript' + | 'audio_chunk' + | 'tool_call' + | 'message_complete' + | 'interrupted' + | 'error' + +/** + * Event payloads for realtime events + */ +export interface RealtimeEventPayloads { + status_change: { status: RealtimeStatus } + mode_change: { mode: RealtimeMode } + transcript: { + role: 'user' | 'assistant' + transcript: string + isFinal: boolean + } + audio_chunk: { data: ArrayBuffer; sampleRate: number } + tool_call: { toolCallId: string; toolName: string; input: unknown } + message_complete: { message: RealtimeMessage } + interrupted: { messageId?: string } + error: { error: Error } +} + +/** + * Handler type for realtime events + */ +export type RealtimeEventHandler = ( + payload: RealtimeEventPayloads[TEvent], +) => void + +// ============================================================================ +// Error Types +// ============================================================================ + +/** + * Error codes for realtime errors + */ +export type RealtimeErrorCode = + | 'TOKEN_EXPIRED' + | 'CONNECTION_FAILED' + | 'PERMISSION_DENIED' + | 'PROVIDER_ERROR' + | 'UNKNOWN' + +/** + * Extended error with realtime-specific information + */ +export interface RealtimeError extends Error { + code: RealtimeErrorCode + provider?: string + details?: unknown +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e6c1ee218..5b9787208 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -201,6 +201,9 @@ importers: '@tanstack/ai-client': specifier: workspace:* version: link:../../packages/typescript/ai-client + '@tanstack/ai-elevenlabs': + specifier: 
workspace:* + version: link:../../packages/typescript/ai-elevenlabs '@tanstack/ai-gemini': specifier: workspace:* version: link:../../packages/typescript/ai-gemini @@ -755,6 +758,22 @@ importers: specifier: ^2.11.10 version: 2.11.10(solid-js@1.9.10)(vite@7.2.7(@types/node@25.0.1)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) + packages/typescript/ai-elevenlabs: + dependencies: + '@11labs/client': + specifier: ^0.2.0 + version: 0.2.0(@types/dom-mediacapture-record@1.0.22) + devDependencies: + '@tanstack/ai': + specifier: workspace:* + version: link:../ai + '@tanstack/ai-client': + specifier: workspace:* + version: link:../ai-client + '@vitest/coverage-v8': + specifier: 4.0.14 + version: 4.0.14(vitest@4.0.18(@types/node@25.0.1)(happy-dom@20.0.11)(jiti@2.6.1)(jsdom@27.3.0(postcss@8.5.6))(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) + packages/typescript/ai-fal: dependencies: '@fal-ai/client': @@ -850,6 +869,9 @@ importers: '@tanstack/ai': specifier: workspace:* version: link:../ai + '@tanstack/ai-client': + specifier: workspace:* + version: link:../ai-client '@vitest/coverage-v8': specifier: 4.0.14 version: 4.0.14(vitest@4.0.18(@types/node@25.0.1)(happy-dom@20.0.11)(jiti@2.6.1)(jsdom@27.3.0(postcss@8.5.6))(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) @@ -1440,6 +1462,10 @@ importers: packages: + '@11labs/client@0.2.0': + resolution: {integrity: sha512-GBplAV4WDbcoThsIzdSDPN3xbcitK0ZZ4iJfJZKfltqvgvS6Uw8GZxHwVgiPwnQoA3uosYyY3L9TuPwmel18xQ==} + deprecated: This package is no longer maintained. 
Please use @elevenlabs/client for the latest version + '@acemir/cssom@0.9.29': resolution: {integrity: sha512-G90x0VW+9nW4dFajtjCoT+NM0scAfH9Mb08IcjgFHYbfiL/lU04dTF9JuVOi3/OH+DJCQdcIseSXkdCB9Ky6JA==} @@ -1632,6 +1658,9 @@ packages: resolution: {integrity: sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==} engines: {node: '>=18'} + '@bufbuild/protobuf@1.10.1': + resolution: {integrity: sha512-wJ8ReQbHxsAfXhrf9ixl0aYbZorRuOWpBNzm8pL8ftmSxQx/wnJD5Eg861NwJU/czy2VXFIebCeZnZrI9rktIQ==} + '@changesets/apply-release-plan@7.0.14': resolution: {integrity: sha512-ddBvf9PHdy2YY0OUiEl3TV78mH9sckndJR14QAt87KLEbIov81XO0q0QAmvooBxXlqRRP8I9B7XOzZwQG7JkWA==} @@ -2505,6 +2534,12 @@ packages: '@jridgewell/trace-mapping@0.3.31': resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==} + '@livekit/mutex@1.1.1': + resolution: {integrity: sha512-EsshAucklmpuUAfkABPxJNhzj9v2sG7JuzFDL4ML1oJQSV14sqrpTYnsaOudMAw9yOaW53NU3QQTlUQoRs4czw==} + + '@livekit/protocol@1.44.0': + resolution: {integrity: sha512-/vfhDUGcUKO8Q43r6i+5FrDhl5oZjm/X3U4x2Iciqvgn5C8qbj+57YPcWSJ1kyIZm5Cm6AV2nAPjMm3ETD/iyg==} + '@manypkg/find-root@1.1.0': resolution: {integrity: sha512-mki5uBvhHzO8kYYix/WRy2WX8S3B5wdVSc9D6KcU5lQNglP2yt58/VfLuAK49glRXChosY8ap2oJ1qgma3GUVA==} @@ -4452,6 +4487,9 @@ packages: '@types/deep-eql@4.0.2': resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==} + '@types/dom-mediacapture-record@1.0.22': + resolution: {integrity: sha512-mUMZLK3NvwRLcAAT9qmcK+9p7tpU2FHdDsntR3YI4+GY88XrgG4XiE7u1Q2LAN2/FZOz/tdMDC3GQCR4T8nFuw==} + '@types/estree-jsx@1.0.5': resolution: {integrity: sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==} @@ -6688,6 +6726,9 @@ packages: jju@1.4.0: resolution: {integrity: sha512-8wb9Yw966OSxApiCt0K3yNJL8pnNeIv+OEq2YMidz4FKP6nonSRoOXc80iXY4JaN2FC11B9qsNmDsm+ZOfMROA==} + 
jose@6.2.0: + resolution: {integrity: sha512-xsfE1TcSCbUdo6U07tR0mvhg0flGxU8tPLbF03mirl2ukGQENhUg4ubGYQnhVH0b5stLlPM+WOqDkEl1R1y5sQ==} + joycon@3.1.1: resolution: {integrity: sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==} engines: {node: '>=10'} @@ -6902,6 +6943,11 @@ packages: resolution: {integrity: sha512-I8oW2+QL5KJo8zXNWX046M134WchxsXC7SawLPvRQpogCbkyQIaFxPE89A2HiwR7vAK2Dm2ERBAmyjTYGYEpBg==} hasBin: true + livekit-client@2.17.2: + resolution: {integrity: sha512-+67y2EtAWZabARlY7kANl/VT1Uu1EJYR5a8qwpT2ub/uBCltsEgEDOxCIMwE9HFR5w+z41HR6GL9hyEvW/y6CQ==} + peerDependencies: + '@types/dom-mediacapture-record': ^1 + load-tsconfig@0.2.5: resolution: {integrity: sha512-IXO6OCs9yg8tMKzfPZ1YmheJbZCiEsnBdcB03l0OcfK9prKnJb96siuHCr5Fl37/yo9DnKU+TLpxzTUspw9shg==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -6944,6 +6990,10 @@ packages: resolution: {integrity: sha512-8XPvpAA8uyhfteu8pIvQxpJZ7SYYdpUivZpGy6sFsBuKRY/7rQGavedeB8aK+Zkyq6upMFVL/9AW6vOYzfRyLg==} engines: {node: '>=10'} + loglevel@1.9.2: + resolution: {integrity: sha512-HgMmCqIJSAKqo68l0rS2AanEWfkxaZ5wNiEFb5ggm08lDs9Xl2KxBlX3PTcaD2chBM1gXAYf491/M2Rv8Jwayg==} + engines: {node: '>= 0.6.0'} + long@5.3.2: resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} @@ -8053,6 +8103,13 @@ packages: scule@1.3.0: resolution: {integrity: sha512-6FtHJEvt+pVMIB9IBY+IcCJ6Z5f1iQnytgyfKMhDKgmzYG+TeH/wx1y3l27rshSbLiSanrR9ffZDrEsmjlQF2g==} + sdp-transform@2.15.0: + resolution: {integrity: sha512-KrOH82c/W+GYQ0LHqtr3caRpM3ITglq3ljGUIb8LTki7ByacJZ9z+piSGiwZDsRyhQbYBOBJgr2k6X4BZXi3Kw==} + hasBin: true + + sdp@3.2.1: + resolution: {integrity: sha512-lwsAIzOPlH8/7IIjjz3K0zYBk7aBVVcvjMwt3M4fLxpjMYyy7i3I97SLHebgn4YBjirkzfp3RvRDWSKsh/+WFw==} + semver@6.3.1: resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} hasBin: true @@ -8546,6 +8603,9 @@ packages: 
peerDependencies: typescript: '>=4.8.4' + ts-debounce@4.0.0: + resolution: {integrity: sha512-+1iDGY6NmOGidq7i7xZGA4cm8DAa6fqdYcvO5Z6yBevH++Bdo9Qt/mN0TzHUgcCcKv1gmh9+W5dHqz8pMWbCbg==} + ts-declaration-location@1.0.7: resolution: {integrity: sha512-EDyGAwH1gO0Ausm9gV6T2nUvBgXT5kGoCMJPllOaooZ+4VvJiKBdZE7wK18N1deEowhcUptS+5GXZK8U/fvpwA==} peerDependencies: @@ -8642,6 +8702,9 @@ packages: resolution: {integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==} engines: {node: '>= 0.6'} + typed-emitter@2.1.0: + resolution: {integrity: sha512-g/KzbYKbH5C2vPkaXGu8DJlHrGKHLsM25Zg9WuC9pMGfuvT+X25tZQWo5fK1BjBm8+UrVE9LDCvaY0CQk+fXDA==} + typedoc-plugin-frontmatter@1.3.0: resolution: {integrity: sha512-xYQFMAecMlsRUjmf9oM/Sq2FVz4zlgcbIeVFNLdO118CHTN06gIKJNSlyExh9+Xl8sK0YhIvoQwViUURxritWA==} peerDependencies: @@ -9284,6 +9347,10 @@ packages: webpack-virtual-modules@0.6.2: resolution: {integrity: sha512-66/V2i5hQanC51vBQKPH4aI8NMAcBW59FVBs+rC7eGHupMyfn34q7rZIE+ETlJ+XTevqfUhVVBgSUNSW2flEUQ==} + webrtc-adapter@9.0.4: + resolution: {integrity: sha512-5ZZY1+lGq8LEKuDlg9M2RPJHlH3R7OVwyHqMcUsLKCgd9Wvf+QrFTCItkXXYPmrJn8H6gRLXbSgxLLdexiqHxw==} + engines: {node: '>=6.0.0', npm: '>=3.10.0'} + whatwg-encoding@3.1.1: resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} engines: {node: '>=18'} @@ -9438,6 +9505,12 @@ packages: snapshots: + '@11labs/client@0.2.0(@types/dom-mediacapture-record@1.0.22)': + dependencies: + livekit-client: 2.17.2(@types/dom-mediacapture-record@1.0.22) + transitivePeerDependencies: + - '@types/dom-mediacapture-record' + '@acemir/cssom@0.9.29': {} '@alcyone-labs/zod-to-json-schema@4.0.10(zod@4.2.1)': @@ -9687,6 +9760,8 @@ snapshots: '@bcoe/v8-coverage@1.0.2': {} + '@bufbuild/protobuf@1.10.1': {} + '@changesets/apply-release-plan@7.0.14': dependencies: '@changesets/config': 3.1.2 @@ -10361,6 +10436,12 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 
'@jridgewell/sourcemap-codec': 1.5.5 + '@livekit/mutex@1.1.1': {} + + '@livekit/protocol@1.44.0': + dependencies: + '@bufbuild/protobuf': 1.10.1 + '@manypkg/find-root@1.1.0': dependencies: '@babel/runtime': 7.28.4 @@ -12903,6 +12984,8 @@ snapshots: '@types/deep-eql@4.0.2': {} + '@types/dom-mediacapture-record@1.0.22': {} + '@types/estree-jsx@1.0.5': dependencies: '@types/estree': 1.0.8 @@ -15599,6 +15682,8 @@ snapshots: jju@1.4.0: {} + jose@6.2.0: {} + joycon@3.1.1: {} js-beautify@1.15.4: @@ -15824,6 +15909,20 @@ snapshots: untun: 0.1.3 uqr: 0.1.2 + livekit-client@2.17.2(@types/dom-mediacapture-record@1.0.22): + dependencies: + '@livekit/mutex': 1.1.1 + '@livekit/protocol': 1.44.0 + '@types/dom-mediacapture-record': 1.0.22 + events: 3.3.0 + jose: 6.2.0 + loglevel: 1.9.2 + sdp-transform: 2.15.0 + ts-debounce: 4.0.0 + tslib: 2.8.1 + typed-emitter: 2.1.0 + webrtc-adapter: 9.0.4 + load-tsconfig@0.2.5: {} local-pkg@0.5.1: @@ -15862,6 +15961,8 @@ snapshots: chalk: 4.1.2 is-unicode-supported: 0.1.0 + loglevel@1.9.2: {} + long@5.3.2: {} longest-streak@3.1.0: {} @@ -17515,6 +17616,10 @@ snapshots: scule@1.3.0: {} + sdp-transform@2.15.0: {} + + sdp@3.2.1: {} + semver@6.3.1: {} semver@7.5.4: @@ -18036,6 +18141,8 @@ snapshots: dependencies: typescript: 5.9.3 + ts-debounce@4.0.0: {} + ts-declaration-location@1.0.7(typescript@5.9.3): dependencies: picomatch: 4.0.3 @@ -18143,6 +18250,10 @@ snapshots: media-typer: 1.1.0 mime-types: 3.0.2 + typed-emitter@2.1.0: + optionalDependencies: + rxjs: 7.8.2 + typedoc-plugin-frontmatter@1.3.0(typedoc-plugin-markdown@4.9.0(typedoc@0.28.14(typescript@5.9.3))): dependencies: typedoc-plugin-markdown: 4.9.0(typedoc@0.28.14(typescript@5.9.3)) @@ -18908,6 +19019,10 @@ snapshots: webpack-virtual-modules@0.6.2: {} + webrtc-adapter@9.0.4: + dependencies: + sdp: 3.2.1 + whatwg-encoding@3.1.1: dependencies: iconv-lite: 0.6.3