/**
 * Full-cascade Voice AI demo: Deepgram STT + Groq Llama Responses API + Inworld TTS
 * Scenario: answer an incoming call using VoxTurnTaking for turn management.
 *
 * Include `vox-turn-taking` BEFORE this scenario in the routing rule sequence.
 *
 * Groq's Responses API is OpenAI-compatible, but it does not currently support
 * `previous_response_id`. To keep this example simple, each turn is submitted
 * independently instead of rebuilding prior conversation history locally.
 */

require(Modules.ASR);
require(Modules.OpenAI);
require(Modules.Inworld);

// Instructions sent with every LLM request (each turn is submitted independently).
const SYSTEM_PROMPT = `
You are Voxi, a helpful phone assistant for Voximplant. Keep responses short, polite, and telephony-friendly (usually 1-2 sentences).
Reply in English.
`;

/**
 * Incoming-call handler: answers the call, wires caller audio -> STT -> LLM -> TTS -> caller,
 * and ends the session when the call or the LLM connection goes away.
 */
VoxEngine.addEventListener(AppEvents.CallAlerting, async ({call}) => {
  // Model used for every request, kept local to the handler so it is defined in one place.
  const LLM_MODEL = "llama-3.3-70b-versatile";

  let stt;
  let responsesClient;
  let ttsPlayer;
  let turnTaking;
  let terminated = false;

  // Render an error as a string (preferring the stack) before passing it to the logger.
  const describeError = (error) => String(error?.stack ?? error);

  /**
   * Release the per-call resources and end the session.
   *
   * Idempotent: it is wired to several triggers (call Disconnected/Failed, the
   * WebSocket close callback, the catch block below), and closing the responses
   * client may itself fire the close callback, which calls back into here.
   */
  const terminate = () => {
    if (terminated) return;
    terminated = true;
    try {
      stt?.stop();
      responsesClient?.close();
      turnTaking?.close();
    } catch (error) {
      Logger.write("===CLEANUP_ERROR===");
      Logger.write(describeError(error));
    } finally {
      // Always end the session, even if one of the cleanup calls above threw.
      VoxEngine.terminate();
    }
  };

  // Submit one independent LLM request; the reply arrives via the client's events.
  const requestReply = (input) => {
    responsesClient.createResponses({
      model: LLM_MODEL,
      instructions: SYSTEM_PROMPT,
      input,
    });
  };

  call.addEventListener(CallEvents.Disconnected, terminate);
  call.addEventListener(CallEvents.Failed, terminate);

  try {
    call.answer();
    call.record({hd_audio: true, stereo: true}); // optional recording

    stt = VoxEngine.createASR({
      profile: ASRProfileList.Deepgram.en_US,
      interimResults: true,
      request: {
        language: "en-US",
        model: "nova-2-phonecall",
        keywords: ["Voximplant:4", "OpenAI:2"],
      },
    });

    responsesClient = await OpenAI.createResponsesAPIClient({
      apiKey: VoxEngine.getSecretValue("GROQ_API_KEY"),
      baseUrl: "https://api.groq.com/openai/v1",
      storeContext: false,
      onWebSocketClose: (event) => {
        Logger.write("===Groq.WebSocket.Close===");
        if (event) Logger.write(JSON.stringify(event));
        terminate();
      },
    });

    ttsPlayer = Inworld.createRealtimeTTSPlayer({
      createContextParameters: {
        create: {
          voiceId: "Ashley",
          modelId: "inworld-tts-1.5-mini",
          speakingRate: 1.1,
          temperature: 1.3,
        },
      },
    });

    // VoxTurnTaking comes from the `vox-turn-taking` scenario, which must be
    // included before this one in the routing rule.
    turnTaking = await VoxTurnTaking.create({
      call,
      stt,
      vadOptions: {
        threshold: 0.5, // sensitivity for detecting speech vs silence
        minSilenceDurationMs: 350, // silence required before VAD marks speech end
        speechPadMs: 10, // small padding around detected speech
      },
      turnDetectorOptions: {
        threshold: 0.5, // end-of-turn probability needed from Pipecat
      },
      policy: {
        transcriptSettleMs: 500, // grace period for a final STT chunk after end-of-turn
        userSpeechTimeoutMs: 1000, // default fallback submit timeout after speech ends
        shortUtteranceExtensionMs: 1800, // longer hold for fragments that may continue
        fastShortUtteranceTimeoutMs: 700, // faster submit for short complete utterances like "hey"
        shortUtteranceMaxChars: 12, // max chars still treated as a short fragment
        shortUtteranceMaxWords: 2, // max words still treated as a short fragment
        lowConfidenceShortUtteranceThreshold: 0.75, // keep short low-confidence finals replaceable
      },
      enableLogging: true,
      onUserTurn: (input) => { // send the transcript text on end-of-turn
        requestReply(input);
      },
      onInterrupt: () => {
        ttsPlayer?.clearBuffer(); // stop any in-progress TTS audio
      },
    });

    // Stream LLM text deltas to TTS, skipping them while agent audio is not allowed.
    responsesClient.addEventListener(OpenAI.ResponsesAPIEvents.ResponseTextDelta, (event) => {
      const text = event?.data?.payload?.delta;
      if (!text || !turnTaking.canPlayAgentAudio()) return;
      ttsPlayer.send({send_text: {text}});
    });

    responsesClient.addEventListener(OpenAI.ResponsesAPIEvents.ResponseTextDone, (event) => {
      const text = event?.data?.payload?.text;
      Logger.write(`===AGENT=== ${text}`);
      ttsPlayer.send({flush_context: {}}); // Tell TTS to process all buffered text immediately
    });

    // Event logging to illustrate available OpenAI Responses API client events
    [
      OpenAI.ResponsesAPIEvents.ResponseCreated,
      OpenAI.ResponsesAPIEvents.ResponseFailed,
      OpenAI.ResponsesAPIEvents.ResponsesAPIError,
      OpenAI.ResponsesAPIEvents.ResponseInProgress,
      OpenAI.ResponsesAPIEvents.ResponseCompleted,
      OpenAI.ResponsesAPIEvents.ResponseOutputItemAdded,
      OpenAI.ResponsesAPIEvents.ResponseContentPartAdded,
      OpenAI.ResponsesAPIEvents.ConnectorInformation,
      OpenAI.ResponsesAPIEvents.Unknown,
      OpenAI.Events.WebSocketMediaStarted,
      OpenAI.Events.WebSocketMediaEnded,
    ].forEach((eventName) => {
      responsesClient.addEventListener(eventName, (event) => {
        Logger.write(`===${event?.name || eventName}===`);
        if (event?.data) Logger.write(JSON.stringify(event.data));
      });
    });

    // Attach the caller media
    call.sendMediaTo(stt);
    ttsPlayer.sendMediaTo(call);

    // Tell the LLM to talk first and greet the user
    requestReply("Greet the caller briefly.");
  } catch (error) {
    Logger.write("===UNHANDLED_ERROR===");
    Logger.write(describeError(error));
    terminate();
  }
});