Example: Half-cascade with Cartesia

View as Markdown

For the complete documentation index, see llms.txt.

Overview

This half-cascade example uses OpenAI Realtime for speech‑to‑text and reasoning, then sends OpenAI text responses to Cartesia Realtime TTS.

⬇️ Jump to the Full VoxEngine scenario.

Demo video

OpenAI + Cartesia demo:

Video link: OpenAI + Cartesia demo

Prerequisites

  • Store your OpenAI API key in Voximplant Secrets under OPENAI_API_KEY.
  • (Optional) Update the CARTESIA_VOICE_ID constant in the example to your preferred voice.
  • (Optional) Store your Cartesia API key in Voximplant Secrets under CARTESIA_API_KEY if you want to use your own Cartesia account.

How it works

  • OpenAI runs in text mode (output_modalities: ["text"]).
  • Caller audio is sent to OpenAI: call.sendMediaTo(voiceAIClient).
  • Cartesia generates speech from OpenAI text and streams it to the call.

Notes

  • The example uses the sonic-2 model. Adjust the voice or output settings to match your telephony requirements.
  • Do not set audio format parameters in half-cascade connector requests. VoxEngine’s WebSocket gateway handles media format negotiation automatically.
  • If no Cartesia API key is provided, Voximplant’s default account and billing are used.
  • Custom / cloned voices are only available when using your own API key.
  • Cartesia TTS requires non-empty text when the player is initialized — do not pass an empty string ("") or whitespace-only text (" ").
  • Subsequent turns use generationRequest(...) with the same voice, model_id, and language.

More info

Full VoxEngine scenario

voxengine-openai-half-cascade-cartesia.js
1/**
2 * Voximplant + OpenAI Realtime API + Cartesia TTS demo
3 * Scenario: OpenAI handles STT/LLM, Cartesia handles TTS (half-cascade).
4 */
5
6require(Modules.OpenAI);
7require(Modules.Cartesia);
// System prompt for the OpenAI Realtime session: defines the agent persona
// and keeps answers short enough for a phone conversation.
const SYSTEM_PROMPT = `
You are Voxi, a helpful phone assistant.
Keep responses short and telephony-friendly.
Reply in English.
`;

// Cartesia TTS voice configuration.
const CARTESIA_VOICE_ID = "a0e99841-438c-4a64-b679-ae501e7d6091";
const CARTESIA_MODEL_ID = "sonic-2";

// OpenAI Realtime session configuration.
// output_modalities is restricted to text because Cartesia handles speech
// synthesis in this half-cascade setup; server-side VAD with
// interrupt_response enables caller barge-in.
const SESSION_CONFIG = {
  session: {
    type: "realtime",
    instructions: SYSTEM_PROMPT,
    output_modalities: ["text"],
    turn_detection: {
      type: "server_vad",
      interrupt_response: true,
    },
  },
};
25
/**
 * Inbound-call handler (half-cascade):
 * OpenAI Realtime performs STT + reasoning in text-only mode, and each
 * completed text response is spoken to the caller via a Cartesia realtime
 * TTS player. Terminates the scenario when the call ends or on any error.
 */
VoxEngine.addEventListener(AppEvents.CallAlerting, async ({call}) => {
  let voiceAIClient;
  let ttsPlayer;

  // Tear the scenario down as soon as the call ends, however it ends.
  call.addEventListener(CallEvents.Disconnected, () => VoxEngine.terminate());
  call.addEventListener(CallEvents.Failed, () => VoxEngine.terminate());

  // Builds the Cartesia generation parameters for one agent turn.
  // A fresh context_id with continue:false keeps each turn independent;
  // voice, model and language stay constant across turns.
  const buildGenerationParameters = (transcript) => ({
    model_id: CARTESIA_MODEL_ID,
    transcript,
    language: "en",
    voice: {mode: "id", id: CARTESIA_VOICE_ID},
    context_id: `openai-cartesia-${Date.now()}`,
    continue: false,
  });

  try {
    call.answer();
    // call.record({hd_audio: true, stereo: true}); // Optional: record the call

    const openAiKey = VoxEngine.getSecretValue('OPENAI_API_KEY');

    voiceAIClient = await OpenAI.createRealtimeAPIClient({
      apiKey: openAiKey,
      model: "gpt-realtime-1.5",
      onWebSocketClose: (event) => {
        Logger.write("===OpenAI.WebSocket.Close===");
        if (event) Logger.write(JSON.stringify(event));
        VoxEngine.terminate();
      },
    });

    // Apply the text-only session configuration as soon as the session exists.
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.SessionCreated, () => {
      voiceAIClient.sessionUpdate(SESSION_CONFIG);
    });

    // Once the configuration is applied, stream caller audio to OpenAI and
    // ask the model for an opening greeting.
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.SessionUpdated, () => {
      call.sendMediaTo(voiceAIClient);
      voiceAIClient.responseCreate({instructions: "Hello! How can I help today?"});
    });

    // Forward each completed OpenAI text response to Cartesia TTS.
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.ResponseOutputTextDone, (event) => {
      const payload = event?.data?.payload || event?.data || {};
      const text = payload.text || payload.delta;
      if (!text) return;
      Logger.write(`===AGENT_TEXT=== ${text}`);

      if (!ttsPlayer) {
        // Cartesia requires real (non-empty) text on player initialization,
        // so the player is created lazily on the first usable response.
        ttsPlayer = Cartesia.createRealtimeTTSPlayer(text, {
          // apikey: VoxEngine.getSecretValue('CARTESIA_API_KEY'), // optional
          generationRequestParameters: buildGenerationParameters(text),
        });
        ttsPlayer.sendMediaTo(call);
        return;
      }

      // Subsequent turns reuse the existing player with the same voice settings.
      ttsPlayer.generationRequest(buildGenerationParameters(text));
    });

    // Barge-in: clear both OpenAI and Cartesia buffers
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.InputAudioBufferSpeechStarted, () => {
      Logger.write("===BARGE-IN: OpenAI.InputAudioBufferSpeechStarted===");
      voiceAIClient.clearMediaBuffer();
      ttsPlayer?.clearBuffer();
    });

    // ---------------------- Log all other events for debugging -----------------------
    [
      OpenAI.RealtimeAPIEvents.ResponseCreated,
      OpenAI.RealtimeAPIEvents.ResponseDone,
      OpenAI.RealtimeAPIEvents.ResponseOutputTextDelta,
      OpenAI.RealtimeAPIEvents.ConnectorInformation,
      OpenAI.RealtimeAPIEvents.HTTPResponse,
      OpenAI.RealtimeAPIEvents.WebSocketError,
      OpenAI.RealtimeAPIEvents.Unknown,
      OpenAI.Events.WebSocketMediaStarted,
      OpenAI.Events.WebSocketMediaEnded,
    ].forEach((eventName) => {
      voiceAIClient.addEventListener(eventName, (event) => {
        Logger.write(`===${event.name}===`);
        if (event?.data) Logger.write(JSON.stringify(event.data));
      });
    });
  } catch (error) {
    Logger.write("===UNHANDLED_ERROR===");
    // Logger.write takes a string; preserve the stack trace when available.
    Logger.write(error?.stack ?? String(error));
    voiceAIClient?.close();
    VoxEngine.terminate();
  }
});