| 1 | /** |
| 2 | * Voximplant turn-taking runtime for sequenced scenarios. |
| 3 | * |
| 4 | * Include this scenario BEFORE any scenario that wants to use VoxTurnTaking |
| 5 | * in the same routing rule sequence. |
| 6 | * |
| 7 | * This runtime hides the current Silero + Pipecat + timer-based turn policy |
| 8 | * behind a small API so scenarios stay simple today and can transition more |
| 9 | * easily if Voximplant later exposes a more Pipecat-native Smart Turn model. |
| 10 | */ |
| 11 | |
| 12 | require(Modules.ASR); |
| 13 | require(Modules.Silero); |
| 14 | require(Modules.Pipecat); |
| 15 | |
// eslint-disable-next-line no-unused-vars
const VoxTurnTaking = {
  /**
   * Default option values. `create()` shallow-merges the caller's
   * `vadOptions`, `turnDetectorOptions`, and `policy` over these groups.
   */
  DEFAULTS: {
    // Passed to `Silero.createVAD()`.
    vadOptions: {
      threshold: 0.5,
      minSilenceDurationMs: 300,
      speechPadMs: 10,
    },
    // Passed to `Pipecat.createTurnDetector()`.
    turnDetectorOptions: {
      threshold: 0.5,
    },
    // Local timing/heuristic policy applied by this runtime.
    policy: {
      transcriptSettleMs: 500,
      userSpeechTimeoutMs: 1000,
      shortUtteranceExtensionMs: 1800,
      fastShortUtteranceTimeoutMs: 700,
      shortUtteranceMaxChars: 12,
      shortUtteranceMaxWords: 2,
      lowConfidenceShortUtteranceThreshold: 0.75,
      continuationTokens: ["and", "but", "so", "well", "then", "uh", "um"],
    },
  },

  /**
   * Creates a turn-taking controller around a call, STT engine, Silero VAD,
   * and Pipecat turn detector.
   *
   * A user turn stays open until this runtime calls `onUserTurn()`. Silero,
   * Pipecat, and the timeout policy only provide evidence that the current
   * turn may be ready to submit.
   *
   * Connector-information and error events from the VAD and the turn
   * detector are always written to `logger`, regardless of `enableLogging`;
   * `enableLogging` only gates the turn-decision debug lines.
   *
   * @param {object} options
   * @param {Call} options.call
   *   Active VoxEngine call whose inbound media should be analyzed.
   * @param {ASR} options.stt
   *   Speech-to-text engine already configured by the consuming scenario.
   * @param {(input: string, reason: string) => void} options.onUserTurn
   *   Callback invoked when the accumulated user turn should be submitted to
   *   the LLM. Internal turn state is reset before the callback runs, so an
   *   exception thrown by the callback cannot leak the submitted transcript
   *   into the next turn.
   * @param {() => void} [options.onInterrupt]
   *   Callback invoked on barge-in so the consuming scenario can stop agent
   *   playback and flush TTS state.
   * @param {boolean} [options.enableLogging=false]
   *   When true, emits debug logs for turn-taking decisions.
   * @param {(line: string) => void} [options.logger]
   *   Log sink. Defaults to `Logger.write`.
   * @param {object} [options.vadOptions]
   *   Silero VAD options merged over `VoxTurnTaking.DEFAULTS.vadOptions`.
   * @param {number} [options.vadOptions.threshold]
   *   Voice activity threshold passed to `Silero.createVAD()`.
   * @param {number} [options.vadOptions.minSilenceDurationMs]
   *   Silence required before Silero emits `speechEndAt`.
   * @param {number} [options.vadOptions.speechPadMs]
   *   Padding used around detected speech segments.
   * @param {object} [options.turnDetectorOptions]
   *   Pipecat options merged over
   *   `VoxTurnTaking.DEFAULTS.turnDetectorOptions`.
   * @param {number} [options.turnDetectorOptions.threshold]
   *   End-of-turn probability threshold passed to
   *   `Pipecat.createTurnDetector()`.
   * @param {object} [options.policy]
   *   Local policy merged over `VoxTurnTaking.DEFAULTS.policy`.
   * @param {number} [options.policy.transcriptSettleMs]
   *   Extra ASR grace period after Pipecat signals end-of-turn but a final
   *   transcript chunk has not arrived yet.
   * @param {number} [options.policy.userSpeechTimeoutMs]
   *   Default fallback timeout started after `speechEndAt`.
   * @param {number} [options.policy.shortUtteranceExtensionMs]
   *   One-off hold time applied before submitting a replaceable short
   *   fragment, so resumed speech can still overwrite it.
   * @param {number} [options.policy.fastShortUtteranceTimeoutMs]
   *   Shorter fallback used for brief, high-confidence utterances that are
   *   likely complete, such as a standalone greeting.
   * @param {number} [options.policy.shortUtteranceMaxChars]
   *   Maximum character count considered a short fragment.
   * @param {number} [options.policy.shortUtteranceMaxWords]
   *   Maximum word count considered a short fragment.
   * @param {number} [options.policy.lowConfidenceShortUtteranceThreshold]
   *   Confidence threshold below which a short final transcript stays
   *   replaceable instead of being committed immediately.
   * @param {string[]} [options.policy.continuationTokens]
   *   Short leading words that usually indicate the caller is continuing a
   *   thought. Matching ignores case and punctuation around the first word.
   * @returns {Promise<{
   *   vad: object,
   *   turnDetector: object,
   *   canPlayAgentAudio: () => boolean,
   *   close: () => void
   * }>}
   *   `vad` / `turnDetector` are the instances created by the runtime;
   *   `canPlayAgentAudio()` is false between a barge-in and the next
   *   submitted turn; `close()` clears timers, detaches the STT listeners,
   *   closes the VAD and turn detector, and is safe to call repeatedly.
   * @throws {TypeError}
   *   When `call`, `stt`, or `onUserTurn` is missing or of the wrong shape.
   */
  async create(options) {
    const {
      call,
      stt,
      onUserTurn,
      onInterrupt,
      enableLogging = false,
      logger = (line) => Logger.write(line),
    } = options;

    // Fail fast, before any remote resource is allocated, instead of failing
    // later inside an event handler or timer.
    if (typeof call?.sendMediaTo !== "function") {
      throw new TypeError("VoxTurnTaking.create: options.call must be a call");
    }
    if (typeof stt?.addEventListener !== "function") {
      throw new TypeError("VoxTurnTaking.create: options.stt must be an ASR");
    }
    if (typeof onUserTurn !== "function") {
      throw new TypeError(
        "VoxTurnTaking.create: options.onUserTurn must be a function"
      );
    }

    const vadOptions = Object.assign(
      {},
      this.DEFAULTS.vadOptions,
      options.vadOptions
    );
    const turnDetectorOptions = Object.assign(
      {},
      this.DEFAULTS.turnDetectorOptions,
      options.turnDetectorOptions
    );
    const policy = Object.assign({}, this.DEFAULTS.policy, options.policy);

    // Lower-cases a word and strips punctuation around it so that STT output
    // such as "So," or "Well..." still matches a configured token.
    const normalizeToken = (word) =>
      String(word ?? "")
        .toLowerCase()
        .replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, "");

    const continuationTokenSet = new Set(
      (Array.isArray(policy.continuationTokens)
        ? policy.continuationTokens
        : []
      )
        .map(normalizeToken)
        .filter(Boolean)
    );

    const vad = await Silero.createVAD(vadOptions);
    let turnDetector;
    try {
      turnDetector = await Pipecat.createTurnDetector(turnDetectorOptions);
    } catch (error) {
      // Do not leak the already-created VAD when the detector cannot start.
      vad?.close();
      throw error;
    }

    call.sendMediaTo(vad);
    call.sendMediaTo(turnDetector);

    // Debug logging for turn decisions; silent unless enableLogging is set.
    const log = (line) => {
      if (enableLogging) logger(line);
    };
    // Module info/error events are always logged (not gated by enableLogging).
    const emitModuleEvent = (eventName, event) => {
      logger(`===${eventName}===`);
      if (event) logger(JSON.stringify(event));
    };

    let closed = false;
    let fallbackTimer = null;
    let settleTimer = null;
    // Final STT text accumulated for the current turn.
    let finalTranscript = "";
    // Portion of finalTranscript that precedes the latest (possibly
    // replaceable) final chunk, and the separator that joined them.
    let stablePrefix = "";
    let fragmentSeparator = " ";
    let interimTranscript = "";
    let transcriptSeparator = "";
    let smartTurnComplete = false;
    let acceptingTranscript = false;
    // Bumped on every speech start and every submit; timers capture the value
    // they were armed with and ignore themselves once it has moved on.
    let signalVersion = 0;
    let allowAgentAudio = true;
    let replaceableShortFinal = false;
    let shortExtensionApplied = false;

    const clearTimers = () => {
      if (fallbackTimer) clearTimeout(fallbackTimer);
      if (settleTimer) clearTimeout(settleTimer);
      fallbackTimer = null;
      settleTimer = null;
    };

    // Accepts confidences on a 0..1 or 0..100 scale; returns null when the
    // value is not a usable number.
    const normalizeConfidence = (value) => {
      if (typeof value !== "number" || Number.isNaN(value)) return null;
      return value > 1 ? value / 100 : value;
    };

    const isShortUtterance = (text) => {
      if (!text) return false;
      const trimmed = text.trim();
      const words = trimmed.split(/\s+/).filter(Boolean);
      return (
        trimmed.length <= policy.shortUtteranceMaxChars &&
        words.length <= policy.shortUtteranceMaxWords
      );
    };

    const startsWithContinuationToken = (text) => {
      if (!text) return false;
      const [firstWord = ""] = text.trim().split(/\s+/);
      const token = normalizeToken(firstWord);
      return token !== "" && continuationTokenSet.has(token);
    };

    // Final transcript plus any trailing interim text, trimmed.
    const buildInput = () => {
      let input = finalTranscript;
      if (interimTranscript) {
        if (input) input += transcriptSeparator;
        input += interimTranscript;
      }
      return input.trim();
    };

    // Returns every per-turn field to its idle value and invalidates timers.
    const resetTurnState = () => {
      finalTranscript = "";
      stablePrefix = "";
      fragmentSeparator = " ";
      interimTranscript = "";
      transcriptSeparator = "";
      smartTurnComplete = false;
      acceptingTranscript = false;
      replaceableShortFinal = false;
      shortExtensionApplied = false;
      signalVersion += 1;
      clearTimers();
    };

    // Arms the fallback timer (declared as a function so it is hoisted above
    // submitCurrentTurn). Also cancels a pending settle timer.
    function startHardTimeout(version, delay = policy.userSpeechTimeoutMs) {
      clearTimers();
      if (closed) return;
      fallbackTimer = setTimeout(() => {
        fallbackTimer = null;
        if (closed || version !== signalVersion) return;
        // submitCurrentTurn ignores an empty transcript on its own.
        submitCurrentTurn("FALLBACK_END_OF_TURN");
      }, delay);
    }

    // Submits the current turn. Returns true when onUserTurn was invoked.
    function submitCurrentTurn(reason) {
      if (closed) return false;
      const input = buildInput();
      if (!input) return false;

      // Hold short replaceable fragments open for one extra window so
      // resumed speech can overwrite them. After that extension, submit
      // the turn instead of looping forever.
      if (replaceableShortFinal && !shortExtensionApplied) {
        shortExtensionApplied = true;
        startHardTimeout(signalVersion, policy.shortUtteranceExtensionMs);
        return false;
      }

      log(`===${reason}===`);
      log(`===USER=== ${input}`);
      allowAgentAudio = true;
      // Reset before calling out: if the callback throws, the next turn must
      // not start from this turn's stale transcript.
      resetTurnState();
      onUserTurn(input, reason);
      return true;
    }

    // Connector information and error events are part of the module's core
    // contract, so log them here instead of making every consuming scenario
    // re-register the same listeners.
    [Silero.VADEvents.ConnectorInformation, Silero.VADEvents.Error].forEach(
      (eventName) => {
        vad.addEventListener(eventName, (event) =>
          emitModuleEvent(eventName, event)
        );
      }
    );

    [Pipecat.TurnEvents.ConnectorInformation, Pipecat.TurnEvents.Error].forEach(
      (eventName) => {
        turnDetector.addEventListener(eventName, (event) =>
          emitModuleEvent(eventName, event)
        );
      }
    );

    // The STT engine is owned by the consuming scenario, so its handlers are
    // named to allow close() to detach them again.
    const handleInterimResult = (event) => {
      if (closed || !acceptingTranscript) return;
      const text = event?.text?.trim();
      if (!text) return;

      if (!transcriptSeparator && finalTranscript) transcriptSeparator = " ";
      // Interim text is replaced, never appended.
      interimTranscript = text;
    };

    const handleFinalResult = (event) => {
      if (closed || !acceptingTranscript) return;
      const text = event?.text?.trim();
      if (!text) return;
      // A missing confidence is treated as fully confident.
      const confidence = normalizeConfidence(event?.confidence) ?? 1;
      const hadCommittedPrefix = Boolean(finalTranscript);

      // A short low-confidence fragment like "they" or "so" is often an
      // early clipped piece of a longer utterance, and a short trailing
      // chunk after an existing prefix is often incomplete ("... open"
      // before "AI" lands). Such a fragment stays replaceable: the next
      // final chunk overwrites the fragment only, keeping the prefix that
      // was committed before it.
      if (!replaceableShortFinal) {
        stablePrefix = finalTranscript;
        fragmentSeparator = transcriptSeparator || " ";
      }
      finalTranscript = stablePrefix
        ? `${stablePrefix}${fragmentSeparator}${text}`
        : text;

      interimTranscript = "";
      transcriptSeparator = " ";
      const isShort = isShortUtterance(text);
      replaceableShortFinal =
        isShort &&
        (hadCommittedPrefix ||
          confidence < policy.lowConfidenceShortUtteranceThreshold ||
          startsWithContinuationToken(text));
      shortExtensionApplied = false;

      log(`===STT Final: ${event.text}`);
      if (isShort && !replaceableShortFinal && !smartTurnComplete) {
        // Brief, confident, standalone utterance: shorten the fallback.
        startHardTimeout(
          signalVersion,
          Math.min(
            policy.userSpeechTimeoutMs,
            policy.fastShortUtteranceTimeoutMs
          )
        );
      }
      if (smartTurnComplete) submitCurrentTurn("TURN_DETECT: FINAL_TRANSCRIPT");
    };

    stt.addEventListener(ASREvents.InterimResult, handleInterimResult);
    stt.addEventListener(ASREvents.Result, handleFinalResult);

    vad.addEventListener(Silero.VADEvents.Result, (event) => {
      if (closed) return;

      if (event.speechStartAt) {
        // New speech invalidates pending timers and any earlier end-of-turn
        // signal, and is treated as a barge-in.
        signalVersion += 1;
        clearTimers();
        smartTurnComplete = false;
        acceptingTranscript = true;
        allowAgentAudio = false;
        if (finalTranscript || interimTranscript) transcriptSeparator = " ... ";
        log("===BARGE-IN===");
        if (onInterrupt) onInterrupt();
      }

      if (event.speechEndAt) {
        startHardTimeout(signalVersion);
        turnDetector.predict();
      }
    });

    turnDetector.addEventListener(Pipecat.TurnEvents.Result, (event) => {
      if (closed) return;
      log(
        `===Pipecat.TurnEvents.Result=== ${JSON.stringify(event.probability)}`
      );
      if (!event.endOfTurn) return;

      smartTurnComplete = true;
      if (finalTranscript) {
        submitCurrentTurn("TURN_DETECT: END_OF_TURN");
        return;
      }

      // No final transcript yet: give the ASR a short grace period.
      if (settleTimer) clearTimeout(settleTimer);
      const version = signalVersion;
      settleTimer = setTimeout(() => {
        settleTimer = null;
        if (closed || version !== signalVersion) return;
        submitCurrentTurn("TURN_DETECT: ASR_GRACE");
      }, policy.transcriptSettleMs);
    });

    return {
      vad,
      turnDetector,
      canPlayAgentAudio() {
        return allowAgentAudio;
      },
      close() {
        if (closed) return;
        closed = true;
        clearTimers();
        // Detach from the caller-owned STT engine so a late transcript cannot
        // re-arm timers or trigger onUserTurn after shutdown.
        stt.removeEventListener?.(ASREvents.InterimResult, handleInterimResult);
        stt.removeEventListener?.(ASREvents.Result, handleFinalResult);
        vad?.close();
        turnDetector?.close();
      },
    };
  },
};