Example: Half-cascade with Cartesia

View as Markdown

For the complete documentation index, see llms.txt.

Overview

This half-cascade example uses OpenAI Realtime for speech‑to‑text and reasoning, then sends OpenAI text responses to Cartesia Realtime TTS.

⬇️ Jump to the Full VoxEngine scenario.

Demo video

OpenAI + Cartesia demo:

Video link: OpenAI + Cartesia demo

Prerequisites

  • Store your OpenAI API key in Voximplant Secrets under OPENAI_API_KEY.
  • (Optional) Update the CARTESIA_VOICE_ID constant in the example to your preferred voice.
  • (Optional) Store your Cartesia API key in Voximplant Secrets under CARTESIA_API_KEY if you want to use your own Cartesia account.

How it works

  • OpenAI runs in text mode (output_modalities: ["text"]).
  • Caller audio is sent to OpenAI: call.sendMediaTo(voiceAIClient).
  • Cartesia generates speech from OpenAI text and streams it to the call.

Notes

  • The example uses the sonic-2 model. Adjust the voice or output settings to match your telephony requirements.
  • Do not set audio format parameters in half-cascade connector requests. VoxEngine’s WebSocket gateway handles media format negotiation automatically.
  • If no Cartesia API key is provided, Voximplant’s default account and billing are used.
  • Custom / cloned voices are only available when using your own API key.
  • Cartesia TTS requires non-empty text when the player is initialized — do not pass an empty string ("") or whitespace-only text (" ").
  • Subsequent turns use generationRequest(...) with the same voice, model_id, and language.

More info

Full VoxEngine scenario

voxengine-openai-half-cascade-cartesia.js
1/**
2 * Voximplant + OpenAI Realtime API + Cartesia TTS demo
3 * Scenario: OpenAI handles STT/LLM, Cartesia handles TTS (half-cascade).
4 */
5
6require(Modules.OpenAI);
7require(Modules.Cartesia);
// System prompt for the OpenAI Realtime session: defines the agent persona
// and keeps answers short enough for a phone conversation.
const SYSTEM_PROMPT = `
You are Voxi, a helpful phone assistant.
Keep responses short and telephony-friendly.
Reply in English.
`;

// Cartesia TTS voice configuration.
const CARTESIA_VOICE_ID = "a0e99841-438c-4a64-b679-ae501e7d6091";
const CARTESIA_MODEL_ID = "sonic-2";

// OpenAI Realtime session configuration.
// output_modalities is restricted to text because Cartesia handles speech
// synthesis in this half-cascade setup; server-side VAD with
// interrupt_response enables caller barge-in.
const SESSION_CONFIG = {
  session: {
    type: "realtime",
    instructions: SYSTEM_PROMPT,
    output_modalities: ["text"],
    turn_detection: {
      type: "server_vad",
      interrupt_response: true,
    },
  },
};
25
/**
 * Inbound-call handler (half-cascade):
 * OpenAI Realtime performs STT + reasoning in text-only mode, and each
 * completed text response is spoken to the caller via a Cartesia realtime
 * TTS player. Terminates the scenario when the call ends or on any error.
 */
VoxEngine.addEventListener(AppEvents.CallAlerting, async ({call}) => {
  let voiceAIClient;
  let ttsPlayer;

  // Tear the scenario down as soon as the call ends, however it ends.
  call.addEventListener(CallEvents.Disconnected, () => VoxEngine.terminate());
  call.addEventListener(CallEvents.Failed, () => VoxEngine.terminate());

  // Builds the Cartesia generation parameters for one agent turn.
  // A fresh context_id with continue:false keeps each turn independent;
  // voice, model and language stay constant across turns.
  const buildGenerationParameters = (transcript) => ({
    model_id: CARTESIA_MODEL_ID,
    transcript,
    language: "en",
    voice: {mode: "id", id: CARTESIA_VOICE_ID},
    context_id: `openai-cartesia-${Date.now()}`,
    continue: false,
  });

  try {
    call.answer();
    // call.record({hd_audio: true, stereo: true}); // Optional: record the call

    const openAiKey = VoxEngine.getSecretValue('OPENAI_API_KEY');

    voiceAIClient = await OpenAI.createRealtimeAPIClient({
      apiKey: openAiKey,
      model: "gpt-realtime-1.5",
      onWebSocketClose: (event) => {
        Logger.write("===OpenAI.WebSocket.Close===");
        if (event) Logger.write(JSON.stringify(event));
        VoxEngine.terminate();
      },
    });

    // Apply the text-only session configuration as soon as the session exists.
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.SessionCreated, () => {
      voiceAIClient.sessionUpdate(SESSION_CONFIG);
    });

    // Once the configuration is applied, stream caller audio to OpenAI and
    // ask the model for an opening greeting.
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.SessionUpdated, () => {
      call.sendMediaTo(voiceAIClient);
      voiceAIClient.responseCreate({instructions: "Hello! How can I help today?"});
    });

    // Forward each completed OpenAI text response to Cartesia TTS.
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.ResponseOutputTextDone, (event) => {
      const payload = event?.data?.payload || event?.data || {};
      const text = payload.text || payload.delta;
      if (!text) return;
      Logger.write(`===AGENT_TEXT=== ${text}`);

      if (!ttsPlayer) {
        // Cartesia requires real (non-empty) text on player initialization,
        // so the player is created lazily on the first usable response.
        ttsPlayer = Cartesia.createRealtimeTTSPlayer(text, {
          // apikey: VoxEngine.getSecretValue('CARTESIA_API_KEY'), // optional
          generationRequestParameters: buildGenerationParameters(text),
        });
        ttsPlayer.sendMediaTo(call);
        return;
      }

      // Subsequent turns reuse the existing player with the same voice settings.
      ttsPlayer.generationRequest(buildGenerationParameters(text));
    });

    // Barge-in: clear both OpenAI and Cartesia buffers
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.InputAudioBufferSpeechStarted, () => {
      Logger.write("===BARGE-IN: OpenAI.InputAudioBufferSpeechStarted===");
      voiceAIClient.clearMediaBuffer();
      ttsPlayer?.clearBuffer();
    });

    // ---------------------- Log all other events for debugging -----------------------
    [
      OpenAI.RealtimeAPIEvents.ResponseCreated,
      OpenAI.RealtimeAPIEvents.ResponseDone,
      OpenAI.RealtimeAPIEvents.ResponseOutputTextDelta,
      OpenAI.RealtimeAPIEvents.ConnectorInformation,
      OpenAI.RealtimeAPIEvents.HTTPResponse,
      OpenAI.RealtimeAPIEvents.WebSocketError,
      OpenAI.RealtimeAPIEvents.Unknown,
      OpenAI.Events.WebSocketMediaStarted,
      OpenAI.Events.WebSocketMediaEnded,
    ].forEach((eventName) => {
      voiceAIClient.addEventListener(eventName, (event) => {
        Logger.write(`===${event.name}===`);
        if (event?.data) Logger.write(JSON.stringify(event.data));
      });
    });
  } catch (error) {
    Logger.write("===UNHANDLED_ERROR===");
    // Logger.write takes a string; preserve the stack trace when available.
    Logger.write(error?.stack ?? String(error));
    voiceAIClient?.close();
    VoxEngine.terminate();
  }
});