Example: Half-cascade with Inworld

Overview

This half-cascade example uses OpenAI Realtime for speech-to-text and reasoning, then sends OpenAI text responses to Inworld Realtime TTS.

โฌ‡๏ธ Jump to the Full VoxEngine scenario.

Prerequisites

  • Store your OpenAI API key in Voximplant ApplicationStorage under OPENAI_API_KEY.
  • Set a voiceId in the Inworld request (createContextParameters.create.voiceId) to choose the TTS voice used in this scenario.
  • (Optional) Store your Inworld API key in ApplicationStorage as INWORLD_API_KEY if you want to use your own Inworld account (key loading is sketched after this list).
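
Key loading can happen once at call setup. The sketch below is an illustration rather than part of the scenario; it assumes ApplicationStorage.get resolves to null for a missing key, so the optional Inworld key falls back gracefully to Voximplant's default account:

require(Modules.ApplicationStorage);

async function loadKeys() {
  // Required: the scenario cannot run without an OpenAI key
  const openAiKey = (await ApplicationStorage.get("OPENAI_API_KEY")).value;
  // Assumption: get() resolves to null when the key is absent
  const inworldRecord = await ApplicationStorage.get("INWORLD_API_KEY");
  const inworldKey = inworldRecord ? inworldRecord.value : undefined;
  return {openAiKey, inworldKey};
}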

How it works

  • OpenAI runs in text mode (output_modalities: ["text"]).
  • Caller audio is sent to OpenAI: call.sendMediaTo(voiceAIClient).
  • Inworld generates speech from OpenAI text and streams it to the call (a minimal wiring sketch follows this list).
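
Stripped of error handling and logging, the wiring reduces to three calls, abbreviated here from the full scenario below:

// 1. OpenAI produces text only; no audio comes back from it
voiceAIClient.sessionUpdate({
  session: {type: "realtime", output_modalities: ["text"]},
});

// 2. Caller audio flows to OpenAI for transcription and turn detection
call.sendMediaTo(voiceAIClient);

// 3. Inworld audio flows to the caller; OpenAI text is fed to the player each turn
ttsPlayer.sendMediaTo(call);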

Notes

  • The example sets voiceId: "Ashley" and modelId: "inworld-tts-1.5-mini" in createContextParameters.create. Change these to any supported Inworld voice/model.
  • Do not set audio format parameters in half-cascade connector requests. VoxEngine's WebSocket gateway handles media format negotiation automatically.
  • If no Inworld API key is provided, Voximplantโ€™s default account and billing are used.
  • Custom / cloned voices are only available when using your own API key.
  • Generate speech by calling send({ send_text: { text } }).
  • Flush the context after every turn with send({ flush_context: {} }).
  • Clear buffered speech in the barge-in handler with clearBuffer() so interruptions stay natural (see the condensed excerpt after this list).
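
Condensed from the full scenario, each completed text turn and each barge-in map to the following Inworld calls:

// Speak each completed OpenAI text response, then flush so playback starts
voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.ResponseOutputTextDone, (event) => {
  const text = event?.data?.payload?.text;
  if (!text) return;
  ttsPlayer.send({send_text: {text}});
  ttsPlayer.send({flush_context: {}});
});

// On barge-in, drop any speech Inworld has already buffered
voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.InputAudioBufferSpeechStarted, () => {
  ttsPlayer.clearBuffer();
});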

Full VoxEngine scenario

voxengine-openai-half-cascade-inworld.js
/**
 * Voximplant + OpenAI Realtime API + Inworld TTS demo
 * Scenario: OpenAI handles STT/LLM, Inworld handles TTS (half-cascade).
 */

require(Modules.OpenAI);
require(Modules.Inworld);
require(Modules.ApplicationStorage);

const SYSTEM_PROMPT = `
You are Voxi, a helpful phone assistant.
Keep responses short and telephony-friendly.
Always reply in English.
`;

const INWORLD_VOICE_ID = "Ashley"; // set your preference here
const INWORLD_MODEL_ID = "inworld-tts-1.5-mini"; // set your preference here, or leave blank to use the default model for the voice

const SESSION_CONFIG = {
  session: {
    type: "realtime",
    instructions: SYSTEM_PROMPT,
    output_modalities: ["text"],
    turn_detection: {type: "server_vad", interrupt_response: true},
  },
};

VoxEngine.addEventListener(AppEvents.CallAlerting, async ({call}) => {
  let voiceAIClient;
  let ttsPlayer;

  call.addEventListener(CallEvents.Disconnected, () => VoxEngine.terminate());
  call.addEventListener(CallEvents.Failed, () => VoxEngine.terminate());

  try {
    call.answer();
    // call.record({hd_audio: true, stereo: true}); // Optional: record the call

    const openAiKey = (await ApplicationStorage.get("OPENAI_API_KEY")).value;

    voiceAIClient = await OpenAI.createRealtimeAPIClient({
      apiKey: openAiKey,
      model: "gpt-realtime",
      onWebSocketClose: (event) => {
        Logger.write("===OpenAI.WebSocket.Close===");
        if (event) Logger.write(JSON.stringify(event));
        VoxEngine.terminate();
      },
    });

    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.SessionCreated, () => {
      voiceAIClient.sessionUpdate(SESSION_CONFIG);
    });

    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.SessionUpdated, async () => {
      call.sendMediaTo(voiceAIClient); // bridge media between the call and OpenAI

      // create the TTS player and pass the config parameters
      ttsPlayer = Inworld.createRealtimeTTSPlayer({
        // apiKey: (await ApplicationStorage.get("INWORLD_API_KEY")).value, // optional: use your own Inworld account
        createContextParameters: {
          create: {
            voiceId: INWORLD_VOICE_ID,
            modelId: INWORLD_MODEL_ID,
            speakingRate: 1.1,
            temperature: 1.3,
          },
        },
      });
      ttsPlayer.sendMediaTo(call); // bridge media between the TTS player and the call

      voiceAIClient.responseCreate({instructions: "Hello! How can I help today?"});
    });

    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.ResponseOutputTextDone, (event) => {
      const payload = event?.data?.payload || event?.data || {};
      const text = payload.text || payload.delta;
      if (!text || !ttsPlayer) return;
      Logger.write(`===AGENT_TEXT=== ${text}`);
      ttsPlayer.send({send_text: {text}});
      ttsPlayer.send({flush_context: {}});
    });

    // Barge-in: clear both OpenAI and Inworld buffers
    voiceAIClient.addEventListener(OpenAI.RealtimeAPIEvents.InputAudioBufferSpeechStarted, () => {
      Logger.write("===BARGE-IN: OpenAI.InputAudioBufferSpeechStarted===");
      voiceAIClient.clearMediaBuffer();
      ttsPlayer?.clearBuffer();
    });

    // ---------------------- Log all other events for debugging -----------------------
    [
      OpenAI.RealtimeAPIEvents.ResponseCreated,
      OpenAI.RealtimeAPIEvents.ResponseDone,
      OpenAI.RealtimeAPIEvents.ResponseOutputTextDelta,
      OpenAI.RealtimeAPIEvents.ConnectorInformation,
      OpenAI.RealtimeAPIEvents.HTTPResponse,
      OpenAI.RealtimeAPIEvents.WebSocketError,
      OpenAI.RealtimeAPIEvents.Unknown,
      OpenAI.Events.WebSocketMediaStarted,
      OpenAI.Events.WebSocketMediaEnded,
    ].forEach((eventName) => {
      voiceAIClient.addEventListener(eventName, (event) => {
        Logger.write(`===${event.name}===`);
        if (event?.data) Logger.write(JSON.stringify(event.data));
      });
    });
  } catch (error) {
    Logger.write("===UNHANDLED_ERROR===");
    Logger.write(String(error));
    voiceAIClient?.close();
    VoxEngine.terminate();
  }
});