Example: Speech-to-speech translation

View as Markdown | Open in Claude

This example answers an inbound English call, dials a Spanish-speaking callee, and uses the Gemini Live API to translate the caller’s speech into Spanish audio in real time.

⬇️ Jump to the Full VoxEngine scenario.

Prerequisites

Demo video

Session setup

The Gemini Live API session is configured via connectConfig, passed into Gemini.createLiveAPIClient(...).

In the full scenario, see GEMINI_CONNECT_CONFIG:

  • responseModalities: ["AUDIO"] asks Gemini to speak back in real time.
  • thinkingConfig: { thinkingBudget: 0 } sets the thinking budget to zero, turning off the model's thinking step to reduce latency.
  • realtimeInputConfig.automaticActivityDetection tunes voice activity detection (prefix padding and end-of-speech silence), which drives turn-taking and barge-in behavior.
  • speechConfig selects a prebuilt voice for the translated audio.
  • systemInstruction enforces the English → Spanish translation behavior.

To log text transcripts, uncomment inputAudioTranscription and outputAudioTranscription.

Translation pipeline (one-way)

This example uses a one-way pipeline:

English caller -> Gemini Live API -> Spanish callee

The code wires the audio like this:

Connect audio
// `voiceAIClient` is the client returned by Gemini.createLiveAPIClient(...)
// in the full scenario.
// Caller (English) -> Gemini -> Callee (Spanish)
call.sendMediaTo(voiceAIClient);
voiceAIClient.sendMediaTo(calleeCall);

Barge-in

Gemini includes an interrupted flag in ServerContent when the caller speaks over the translated audio that is still being generated. The example clears the client's media buffer so that audio already queued for playback stops immediately:

Barge-in handling
// `payload` is the ServerContent event payload; `voiceAIClient` is the client
// returned by Gemini.createLiveAPIClient(...) in the full scenario.
if (payload.interrupted !== undefined) {
  // Drop buffered audio so the callee stops hearing the interrupted output.
  voiceAIClient.clearMediaBuffer();
}

Events

The scenario listens for Gemini.LiveAPIEvents.ServerContent. If transcripts are enabled, the example logs both languages:

Transcripts
// Source-language (English) transcript of what the caller said.
if (payload.inputTranscription?.text) {
  Logger.write(`===EN=== ${payload.inputTranscription.text}`);
}
// Target-language (Spanish) transcript of what Gemini spoke.
if (payload.outputTranscription?.text) {
  Logger.write(`===ES=== ${payload.outputTranscription.text}`);
}

For illustration, it also logs these events:

  • Gemini.LiveAPIEvents: SetupComplete, ServerContent, ConnectorInformation, Unknown
  • Gemini.Events: WebSocketMediaStarted, WebSocketMediaEnded

Notes

  • This example uses the Gemini Developer API (Gemini.Backend.GEMINI_API).
  • Translation is one-way (English → Spanish). For bidirectional translation, run two Gemini sessions with opposite instructions.
  • The example includes short prompts (call.say / calleeCall.say) to make recordings easier to follow. Remove them for production.

See the VoxEngine API Reference for more details.

Full VoxEngine scenario

voxengine-gemini-s2s-translate.js
/**
 * Voximplant + Gemini Live API connector demo
 * Scenario: real-time speech-to-speech translation (English -> Spanish).
 *
 * Flow:
 *   1. Answer the inbound (English) call and start recording it.
 *   2. Load the Gemini API key from ApplicationStorage.
 *   3. Dial the callee; once connected, create a Gemini Live API client.
 *   4. Route audio one way: caller -> Gemini -> callee.
 *   5. Tear everything down as soon as either leg or the WebSocket ends.
 */

require(Modules.Gemini);
require(Modules.ApplicationStorage);

const SYSTEM_INSTRUCTIONS = `
You are a REAL-TIME INTERPRETER.

Task:
- Translate everything you hear from English to Spanish.

Rules:
- Output ONLY the Spanish translation (no English, no explanations, no extra commentary).
- Preserve meaning, tone, names, numbers, and proper nouns.
- Keep latency low: translate phrase-by-phrase as soon as you have enough context.
- Do NOT greet or introduce yourself. Speak ONLY when the caller speaks.
`;

const GEMINI_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025";

// Live API session settings, passed as `connectConfig` to createLiveAPIClient.
// The object is constant, so it is built once rather than on every call.
const GEMINI_CONNECT_CONFIG = {
  // Ask Gemini to answer with audio.
  responseModalities: ["AUDIO"],
  // Zero thinking budget keeps translation latency low.
  thinkingConfig: {thinkingBudget: 0},
  realtimeInputConfig: {
    automaticActivityDetection: {
      disabled: false,
      prefixPaddingMs: 20,
      silenceDurationMs: 200,
    },
  },
  speechConfig: {
    voiceConfig: {
      prebuiltVoiceConfig: {voiceName: "Achird"},
    },
  },
  // Uncomment to receive text transcripts in ServerContent events.
  // inputAudioTranscription: {},
  // outputAudioTranscription: {},
  systemInstruction: {
    parts: [{text: SYSTEM_INSTRUCTIONS}],
  },
};

VoxEngine.addEventListener(AppEvents.CallAlerting, async ({call}) => {
  let voiceAIClient;
  let calleeCall;
  let terminated = false;

  // Idempotent teardown: close the Gemini client, hang up both legs, and end
  // the session. Each step is isolated so one failure does not skip the rest.
  const terminate = () => {
    if (terminated) return;
    terminated = true;
    try {
      voiceAIClient?.close();
    } catch (error) {
      Logger.write(error);
    }
    try {
      calleeCall?.hangup();
    } catch (error) {
      Logger.write(error);
    }
    try {
      call.hangup();
    } catch (error) {
      Logger.write(error);
    }
    VoxEngine.terminate();
  };

  call.answer();
  call.record({hd_audio: true, stereo: true});
  call.addEventListener(CallEvents.Disconnected, terminate);
  call.addEventListener(CallEvents.Failed, terminate);

  // Load the API key defensively: a missing key or a storage error must end
  // the session instead of surfacing as an unhandled rejection that leaves
  // the answered call hanging.
  let geminiApiKey;
  try {
    const storedKey = await ApplicationStorage.get("GEMINI_API_KEY");
    // Guard against the lookup resolving without a value (e.g. key not set).
    geminiApiKey = storedKey?.value;
  } catch (error) {
    Logger.write("===SOMETHING_WENT_WRONG===");
    Logger.write(error);
  }
  if (!geminiApiKey) {
    Logger.write("===GEMINI_API_KEY is not available in ApplicationStorage===");
    terminate();
    return;
  }

  // To dial a PSTN number instead of an application user:
  // const calleeNumber = (await ApplicationStorage.get("CALLEE_NUMBER")).value;
  // const pstnCallerId = (await ApplicationStorage.get("PSTN_CALLER_ID")).value;
  // calleeCall = VoxEngine.callPSTN(calleeNumber, pstnCallerId);

  calleeCall = VoxEngine.callUser("callee");
  calleeCall.addEventListener(CallEvents.Disconnected, terminate);
  calleeCall.addEventListener(CallEvents.Failed, terminate);

  calleeCall.addEventListener(CallEvents.Connected, async () => {
    calleeCall.record({hd_audio: true, stereo: true});

    // Optional prompts to make the demo obvious on recordings.
    call.say("Connected. Speak in English. The other party will hear Spanish.");
    calleeCall.say("Connected. You will hear Spanish translation in real time.");

    try {
      voiceAIClient = await Gemini.createLiveAPIClient({
        apiKey: geminiApiKey,
        model: GEMINI_MODEL,
        backend: Gemini.Backend.GEMINI_API,
        connectConfig: GEMINI_CONNECT_CONFIG,
        onWebSocketClose: (event) => {
          Logger.write("===Gemini.WebSocket.Close===");
          if (event) Logger.write(JSON.stringify(event));
          terminate();
        },
      });

      // Caller (English) -> Gemini -> Callee (Spanish)
      call.sendMediaTo(voiceAIClient);
      voiceAIClient.sendMediaTo(calleeCall);

      voiceAIClient.addEventListener(Gemini.LiveAPIEvents.SetupComplete, (event) => {
        Logger.write("===Gemini.LiveAPIEvents.SetupComplete===");
        if (event?.data) Logger.write(JSON.stringify(event.data));
      });

      voiceAIClient.addEventListener(Gemini.LiveAPIEvents.ServerContent, (event) => {
        const payload = event?.data?.payload || {};
        // Transcripts are only present when transcription is enabled in
        // GEMINI_CONNECT_CONFIG.
        if (payload.inputTranscription?.text) {
          Logger.write(`===EN=== ${payload.inputTranscription.text}`);
        }
        if (payload.outputTranscription?.text) {
          Logger.write(`===ES=== ${payload.outputTranscription.text}`);
        }
        // Barge-in: drop buffered audio so the interrupted output stops.
        if (payload.interrupted !== undefined) {
          Logger.write("===BARGE-IN=== Gemini.LiveAPIEvents.ServerContent");
          voiceAIClient.clearMediaBuffer();
        }
      });

      // For illustration only: dump every event of interest to the log.
      // SetupComplete and ServerContent are intentionally logged here as well.
      const loggedEvents = [
        Gemini.LiveAPIEvents.SetupComplete,
        Gemini.LiveAPIEvents.ServerContent,
        Gemini.LiveAPIEvents.ConnectorInformation,
        Gemini.LiveAPIEvents.Unknown,
        Gemini.Events.WebSocketMediaStarted,
        Gemini.Events.WebSocketMediaEnded,
      ];
      loggedEvents.forEach((eventName) => {
        voiceAIClient.addEventListener(eventName, (event) => {
          // Fall back to the subscribed name if the event object is missing.
          Logger.write(`===${event?.name ?? eventName}===`);
          if (event?.data) Logger.write(JSON.stringify(event.data));
        });
      });
    } catch (error) {
      Logger.write("===SOMETHING_WENT_WRONG===");
      Logger.write(error);
      terminate();
    }
  });
});