Module scenario.voice.adapters.composable
Composable and provider-branded voice agents.
Two classes live here:
ComposableVoiceAgent
Assembles any STTProvider + litellm LLM + TTS voice string into a
VoiceAgentAdapter. The STT→LLM→TTS loop runs locally, giving full
observability into each seam.
ElevenLabsVoiceAgent
Branded preset: wires ElevenLabsSTTProvider and an ElevenLabs TTS voice
by default, accepts per-piece overrides.
Both expose last_user_transcript and last_llm_response for scenario
harness assertions.
Expand source code
"""
Composable and provider-branded voice agents.
Two classes live here:
``ComposableVoiceAgent``
Assembles any STTProvider + litellm LLM + TTS voice string into a
VoiceAgentAdapter. The STT→LLM→TTS loop runs locally, giving full
observability into each seam.
``ElevenLabsVoiceAgent``
Branded preset: wires ElevenLabsSTTProvider and an ElevenLabs TTS voice
by default, accepts per-piece overrides.
Both expose ``last_user_transcript`` and ``last_llm_response`` for scenario
harness assertions.
"""
from __future__ import annotations
from typing import ClassVar, List, Optional
from ...config.voice_models import COMPOSABLE_VOICE_LLM_MODEL
from ..adapter import VoiceAgentAdapter
from ..audio_chunk import AudioChunk
from ..capabilities import AdapterCapabilities
from ..stt import ElevenLabsSTTProvider, STTProvider
class ComposableVoiceAgent(VoiceAgentAdapter):
    """
    Locally-executed STT → LLM → TTS voice agent.

    ``stt`` transcribes incoming user audio, the result is fed to ``llm``
    (a litellm model string) along with conversation history, and the response
    is synthesised via the ``tts`` voice string using the existing
    ``scenario.voice.synthesize`` router.

    Each seam is independently swappable — change any one without touching the
    other two. Intermediate results are surfaced on instance attributes so the
    scenario harness can assert on them.

    Attributes:
        last_user_transcript: Transcript of the most-recent user audio turn.
        last_llm_response: Text produced by the LLM for the most-recent turn.
    """

    capabilities: ClassVar[AdapterCapabilities] = AdapterCapabilities(
        streaming_transcripts=True,
        native_vad=False,
        dtmf=False,
        input_formats=["pcm16/24000"],
        output_formats=["pcm16/24000"],
    )

    DEFAULT_SYSTEM_PROMPT = (
        "You are a helpful voice assistant. Respond naturally and conversationally "
        "as this is an audio conversation — be concise, friendly, and clear."
    )

    def __init__(
        self,
        stt: STTProvider,
        llm: str,
        tts: str,
        *,
        system_prompt: Optional[str] = None,
    ) -> None:
        """
        Args:
            stt: STTProvider implementation for the user's audio.
            llm: litellm-style model identifier, e.g. ``COMPOSABLE_VOICE_LLM_MODEL``.
            tts: TTS voice string in ``"provider/voice"`` format,
                e.g. ``"openai/nova"`` or ``"elevenlabs/rachel"``.
            system_prompt: Optional system prompt seeded at turn zero so the
                LLM has guidance before the first user message. Defaults to a
                generic helpful-assistant prompt. An empty string also falls
                back to the default.
        """
        super().__init__()
        self.stt = stt
        self.llm = llm
        self.tts = tts
        self.last_user_transcript: Optional[str] = None
        self.last_llm_response: Optional[str] = None

        # Seed history with a system prompt so the first recv_audio call (which
        # can happen before any user audio when the agent speaks first) doesn't
        # send an empty messages array to the LLM.
        self._history: List[dict] = [
            {"role": "system", "content": system_prompt or self.DEFAULT_SYSTEM_PROMPT}
        ]

        # Turn-output guard. ``recv_audio`` synthesises ONE chunk per
        # user turn. The default ``call()`` drains by re-calling
        # ``recv_audio`` until tail-silence — on this adapter that would
        # kick a second LLM call, cancelled later by timeout (wasted
        # credits + latency). The guard makes subsequent ``recv_audio``
        # calls in the same turn return an empty chunk, which the drain
        # loop interprets as end-of-stream.
        #
        # Reset boundary: ``send_audio`` (new user audio → new turn).
        # Set boundary:   end of ``recv_audio`` (LLM+TTS completed).
        self._turn_output_emitted: bool = False

    def __repr__(self) -> str:
        return f"ComposableVoiceAgent(llm={self.llm!r}, tts={self.tts!r})"

    # ------------------------------------------------------------------ lifecycle

    async def connect(self) -> None:
        """No-op — no external transport to open."""

    async def disconnect(self) -> None:
        """No-op — nothing to tear down."""

    # ------------------------------------------------------------------ I/O

    async def send_audio(self, chunk: AudioChunk) -> None:
        """Transcribe the chunk via STT and store for the next recv_audio call.

        The transcript is recorded on ``last_user_transcript`` and appended to
        the conversation history as a ``user`` message. If the STT call raises,
        nothing is recorded and the turn guard is left untouched.
        """
        transcript = await self.stt.transcribe(chunk)
        self.last_user_transcript = transcript
        self._history.append({"role": "user", "content": transcript})
        # New user turn → next recv_audio is allowed to synthesise.
        self._turn_output_emitted = False

    async def recv_audio(self, timeout: float) -> AudioChunk:
        """
        Run the LLM on the current history, synthesise the response via TTS,
        and return the resulting AudioChunk.

        ``timeout`` is honoured for the combined LLM+TTS call via
        ``asyncio.wait_for``. Subsequent calls in the same turn (the
        default ``call()`` drains until tail-silence) return an empty
        chunk so the drain loop exits without billing a second LLM
        round-trip — see ``_turn_output_emitted`` for the guard contract.

        The assistant message is committed to the conversation history only
        once both the LLM call and the TTS synthesis have succeeded. If the
        call times out or either stage raises, the history is left exactly as
        it was, so a retry does not send the LLM a history that already ends
        in an assistant turn that was never emitted. ``last_llm_response`` is
        still updated as soon as the LLM answers, so the text stays observable
        even when synthesis fails afterwards.

        Args:
            timeout: Maximum number of seconds allowed for LLM + TTS combined.

        Returns:
            The synthesised audio for this turn, or an empty chunk when this
            turn's output was already emitted.

        Raises:
            asyncio.TimeoutError: If LLM + TTS do not finish within ``timeout``.
        """
        if self._turn_output_emitted:
            return AudioChunk(data=b"")

        import asyncio

        async def _run() -> tuple[str, AudioChunk]:
            import litellm  # type: ignore
            from litellm.types.utils import Choices, ModelResponse
            from typing import cast as _cast

            from ..tts import synthesize

            completion = await litellm.acompletion(
                model=self.llm,
                messages=self._history,
            )
            # Non-streaming acompletion returns ModelResponse with Choices;
            # cast satisfies pyright without runtime isinstance overhead.
            completion = _cast(ModelResponse, completion)
            choice = _cast(Choices, completion.choices[0])
            response_text: str = choice.message.content or ""
            # Surface the LLM seam immediately, before TTS can fail or time out.
            self.last_llm_response = response_text
            audio = await synthesize(response_text, self.tts)
            return response_text, audio

        response_text, chunk = await asyncio.wait_for(_run(), timeout=timeout)

        # Commit point: only reached when LLM and TTS both completed in time.
        # Mutating history here (not inside ``_run``) keeps a timed-out or
        # failed turn from leaving an orphan assistant message behind.
        self._history.append({"role": "assistant", "content": response_text})
        self._turn_output_emitted = True
        return chunk
class ElevenLabsVoiceAgent(ComposableVoiceAgent):
    """
    Composable voice agent with ElevenLabs-opinionated defaults.

    Not to be confused with :class:`ElevenLabsAgentAdapter` (in
    ``scenario.voice.adapters.elevenlabs``) — that one talks to ElevenLabs'
    **hosted** Conversational AI endpoint where EL runs the full
    STT→LLM→TTS loop. This class is local: you compose ``ElevenLabsSTTProvider``
    + any LLM + ElevenLabs TTS yourself, keeping full control over prompts,
    model choice, and tool calls.

    Instantiate with just an ``api_key`` to get an ElevenLabs STT +
    LLM (default ``COMPOSABLE_VOICE_LLM_MODEL``) + ElevenLabs TTS stack. The
    TTS voice is taken from the ``ELEVENLABS_VOICE_ID`` environment variable
    when set, otherwise ``DEFAULT_VOICE`` ("Sarah") is used. Each piece can be
    overridden independently without changing the others.

    Example::

        # Defaults — ElevenLabs STT, COMPOSABLE_VOICE_LLM_MODEL, ElevenLabs TTS
        agent = ElevenLabsVoiceAgent(api_key="sk-...")

        # Override just the LLM
        agent = ElevenLabsVoiceAgent(api_key="sk-...", llm="openai/gpt-4o")

        # Bring your own STT
        agent = ElevenLabsVoiceAgent(api_key="sk-...", stt=MyCustomSTT())
    """

    # Fallback TTS voice: "Sarah" — free-tier premade.
    DEFAULT_VOICE: ClassVar[str] = "elevenlabs/EXAVITQu4vr4xnSDxMaL"

    def __init__(
        self,
        api_key: str,
        *,
        llm: str = COMPOSABLE_VOICE_LLM_MODEL,
        voice: Optional[str] = None,
        stt: Optional[STTProvider] = None,
        system_prompt: Optional[str] = None,
    ) -> None:
        """
        Args:
            api_key: ElevenLabs API key. Redacted in ``__repr__``.
            llm: litellm-style model identifier. Defaults to
                ``COMPOSABLE_VOICE_LLM_MODEL``.
            voice: TTS voice string in ``"elevenlabs/<voice_id>"`` format.
                Defaults to the ``ELEVENLABS_VOICE_ID`` environment variable
                when set, otherwise falls back to "Sarah"
                (``"elevenlabs/EXAVITQu4vr4xnSDxMaL"``) — premade and
                accessible on the ElevenLabs free tier as of 2026-05.
                Other premade voices (e.g. "Rachel"
                ``21m00Tcm4TlvDq8ikWAM``) returned 402 paid_plan_required
                from the EL TTS API; gating differs per voice. Set
                ``ELEVENLABS_VOICE_ID`` to override.
            stt: STTProvider override. Defaults to
                ``ElevenLabsSTTProvider(api_key=api_key)``.
            system_prompt: Optional system prompt. Defaults to
                ``ComposableVoiceAgent.DEFAULT_SYSTEM_PROMPT``.
        """
        import os

        if voice is None:
            # The environment variable carries a bare voice id; the provider
            # prefix is added here. An unset or empty variable means "use the
            # built-in fallback".
            env_voice_id = os.environ.get("ELEVENLABS_VOICE_ID")
            voice = (
                f"elevenlabs/{env_voice_id}"
                if env_voice_id
                else self.DEFAULT_VOICE
            )

        # Only build the ElevenLabs STT provider when the caller did not bring one.
        resolved_stt = stt if stt is not None else ElevenLabsSTTProvider(api_key=api_key)
        super().__init__(stt=resolved_stt, llm=llm, tts=voice, system_prompt=system_prompt)
        self._api_key = api_key
        self.voice = voice

    def __repr__(self) -> str:  # redact credentials
        return (
            f"ElevenLabsVoiceAgent("
            f"api_key='***', llm={self.llm!r}, voice={self.voice!r})"
        )
Classes
class ComposableVoiceAgent (stt: STTProvider, llm: str, tts: str, *, system_prompt: Optional[str] = None)-
Locally-executed STT → LLM → TTS voice agent.
stt transcribes incoming user audio, the result is fed to llm (a litellm model string) along with conversation history, and the response is synthesised via the tts voice string using the existing synthesize() router.
Each seam is independently swappable — change any one without touching the other two. Intermediate results are surfaced on instance attributes so the scenario harness can assert on them.
Attributes
last_user_transcript- Transcript of the most-recent user audio turn.
last_llm_response- Text produced by the LLM for the most-recent turn.
Args
stt- STTProvider implementation for the user's audio.
llm- litellm-style model identifier, e.g. COMPOSABLE_VOICE_LLM_MODEL.
tts- TTS voice string in "provider/voice" format, e.g. "openai/nova" or "elevenlabs/rachel".
system_prompt- Optional system prompt seeded at turn zero so the LLM has guidance before the first user message. Defaults to a generic helpful-assistant prompt.
Expand source code
class ComposableVoiceAgent(VoiceAgentAdapter): """ Locally-executed STT → LLM → TTS voice agent. ``stt`` transcribes incoming user audio, the result is fed to ``llm`` (a litellm model string) along with conversation history, and the response is synthesised via the ``tts`` voice string using the existing ``scenario.voice.synthesize`` router. Each seam is independently swappable — change any one without touching the other two. Intermediate results are surfaced on instance attributes so the scenario harness can assert on them. Attributes: last_user_transcript: Transcript of the most-recent user audio turn. last_llm_response: Text produced by the LLM for the most-recent turn. """ capabilities: ClassVar[AdapterCapabilities] = AdapterCapabilities( streaming_transcripts=True, native_vad=False, dtmf=False, input_formats=["pcm16/24000"], output_formats=["pcm16/24000"], ) DEFAULT_SYSTEM_PROMPT = ( "You are a helpful voice assistant. Respond naturally and conversationally " "as this is an audio conversation — be concise, friendly, and clear." ) def __init__( self, stt: STTProvider, llm: str, tts: str, *, system_prompt: Optional[str] = None, ) -> None: """ Args: stt: STTProvider implementation for the user's audio. llm: litellm-style model identifier, e.g. ``COMPOSABLE_VOICE_LLM_MODEL``. tts: TTS voice string in ``"provider/voice"`` format, e.g. ``"openai/nova"`` or ``"elevenlabs/rachel"``. system_prompt: Optional system prompt seeded at turn zero so the LLM has guidance before the first user message. Defaults to a generic helpful-assistant prompt. """ super().__init__() self.stt = stt self.llm = llm self.tts = tts self.last_user_transcript: Optional[str] = None self.last_llm_response: Optional[str] = None # Seed history with a system prompt so the first recv_audio call (which # can happen before any user audio when the agent speaks first) doesn't # send an empty messages array to the LLM. 
self._history: List[dict] = [ {"role": "system", "content": system_prompt or self.DEFAULT_SYSTEM_PROMPT} ] # Turn-output guard. ``recv_audio`` synthesises ONE chunk per # user turn. The default ``call()`` drains by re-calling # ``recv_audio`` until tail-silence — on this adapter that would # kick a second LLM call, cancelled later by timeout (wasted # credits + latency). The guard makes subsequent ``recv_audio`` # calls in the same turn return an empty chunk, which the drain # loop interprets as end-of-stream. # # Reset boundary: ``send_audio`` (new user audio → new turn). # Set boundary: end of ``recv_audio`` (LLM+TTS completed). self._turn_output_emitted: bool = False def __repr__(self) -> str: return f"ComposableVoiceAgent(llm={self.llm!r}, tts={self.tts!r})" # ------------------------------------------------------------------ lifecycle async def connect(self) -> None: """No-op — no external transport to open.""" async def disconnect(self) -> None: """No-op — nothing to tear down.""" # ------------------------------------------------------------------ I/O async def send_audio(self, chunk: AudioChunk) -> None: """Transcribe the chunk via STT and store for the next recv_audio call.""" transcript = await self.stt.transcribe(chunk) self.last_user_transcript = transcript self._history.append({"role": "user", "content": transcript}) # New user turn → next recv_audio is allowed to synthesise. self._turn_output_emitted = False async def recv_audio(self, timeout: float) -> AudioChunk: """ Run the LLM on the current history, synthesise the response via TTS, and return the resulting AudioChunk. ``timeout`` is honoured for the combined LLM+TTS call via ``asyncio.wait_for``. Subsequent calls in the same turn (the default ``call()`` drains until tail-silence) return an empty chunk so the drain loop exits without billing a second LLM round-trip — see ``_turn_output_emitted`` for the guard contract. 
""" if self._turn_output_emitted: return AudioChunk(data=b"") import asyncio async def _run() -> AudioChunk: import litellm # type: ignore from litellm.types.utils import Choices, ModelResponse from typing import cast as _cast from ..tts import synthesize completion = await litellm.acompletion( model=self.llm, messages=self._history, ) # Non-streaming acompletion returns ModelResponse with Choices; # cast satisfies pyright without runtime isinstance overhead. completion = _cast(ModelResponse, completion) choice = _cast(Choices, completion.choices[0]) response_text: str = choice.message.content or "" self.last_llm_response = response_text self._history.append({"role": "assistant", "content": response_text}) return await synthesize(response_text, self.tts) chunk = await asyncio.wait_for(_run(), timeout=timeout) self._turn_output_emitted = True return chunkAncestors
- VoiceAgentAdapter
- AgentAdapter
- abc.ABC
Subclasses
Class variables
var DEFAULT_SYSTEM_PROMPTvar capabilities : ClassVar[AdapterCapabilities]
Methods
async def connect(self) ‑> None-
No-op — no external transport to open.
Expand source code
async def connect(self) -> None: """No-op — no external transport to open.""" async def disconnect(self) ‑> None-
No-op — nothing to tear down.
Expand source code
async def disconnect(self) -> None: """No-op — nothing to tear down.""" async def recv_audio(self, timeout: float) ‑> AudioChunk-
Run the LLM on the current history, synthesise the response via TTS, and return the resulting AudioChunk.
timeout is honoured for the combined LLM+TTS call via asyncio.wait_for. Subsequent calls in the same turn (the default call() drains until tail-silence) return an empty chunk so the drain loop exits without billing a second LLM round-trip — see _turn_output_emitted for the guard contract.
Expand source code
async def recv_audio(self, timeout: float) -> AudioChunk: """ Run the LLM on the current history, synthesise the response via TTS, and return the resulting AudioChunk. ``timeout`` is honoured for the combined LLM+TTS call via ``asyncio.wait_for``. Subsequent calls in the same turn (the default ``call()`` drains until tail-silence) return an empty chunk so the drain loop exits without billing a second LLM round-trip — see ``_turn_output_emitted`` for the guard contract. """ if self._turn_output_emitted: return AudioChunk(data=b"") import asyncio async def _run() -> AudioChunk: import litellm # type: ignore from litellm.types.utils import Choices, ModelResponse from typing import cast as _cast from ..tts import synthesize completion = await litellm.acompletion( model=self.llm, messages=self._history, ) # Non-streaming acompletion returns ModelResponse with Choices; # cast satisfies pyright without runtime isinstance overhead. completion = _cast(ModelResponse, completion) choice = _cast(Choices, completion.choices[0]) response_text: str = choice.message.content or "" self.last_llm_response = response_text self._history.append({"role": "assistant", "content": response_text}) return await synthesize(response_text, self.tts) chunk = await asyncio.wait_for(_run(), timeout=timeout) self._turn_output_emitted = True return chunk async def send_audio(self, chunk: AudioChunk) ‑> None-
Transcribe the chunk via STT and store for the next recv_audio call.
Expand source code
async def send_audio(self, chunk: AudioChunk) -> None: """Transcribe the chunk via STT and store for the next recv_audio call.""" transcript = await self.stt.transcribe(chunk) self.last_user_transcript = transcript self._history.append({"role": "user", "content": transcript}) # New user turn → next recv_audio is allowed to synthesise. self._turn_output_emitted = False
Inherited members
class ElevenLabsVoiceAgent (api_key: str, *, llm: str = 'openai/gpt-5.4-mini', voice: Optional[str] = None, stt: Optional[STTProvider] = None, system_prompt: Optional[str] = None)-
Composable voice agent with ElevenLabs-opinionated defaults.
Not to be confused with ElevenLabsAgentAdapter (in scenario.voice.adapters.elevenlabs) — that one talks to ElevenLabs' hosted Conversational AI endpoint where EL runs the full STT→LLM→TTS loop. This class is local: you compose ElevenLabsSTTProvider + any LLM + ElevenLabs TTS yourself, keeping full control over prompts, model choice, and tool calls.
Instantiate with just an api_key to get an ElevenLabs STT + LLM (default COMPOSABLE_VOICE_LLM_MODEL) + ElevenLabs TTS stack. The TTS voice comes from the ELEVENLABS_VOICE_ID environment variable when set, otherwise the premade "Sarah" voice (elevenlabs/EXAVITQu4vr4xnSDxMaL) is used. Each piece can be overridden independently without changing the others.
Example:
# Defaults — ElevenLabs STT, COMPOSABLE_VOICE_LLM_MODEL, ElevenLabs TTS
agent = ElevenLabsVoiceAgent(api_key="sk-...")
# Override just the LLM
agent = ElevenLabsVoiceAgent(api_key="sk-...", llm="openai/gpt-4o")
# Bring your own STT
agent = ElevenLabsVoiceAgent(api_key="sk-...", stt=MyCustomSTT())
Args
api_key- ElevenLabs API key. Redacted in __repr__.
llm- litellm-style model identifier. Defaults to COMPOSABLE_VOICE_LLM_MODEL.
voice- TTS voice string in "elevenlabs/<voice_id>" format. Defaults to the ELEVENLABS_VOICE_ID environment variable when set, otherwise falls back to "Sarah" ("elevenlabs/EXAVITQu4vr4xnSDxMaL") — premade and accessible on the ElevenLabs free tier as of 2026-05. Other premade voices (e.g. "Rachel" 21m00Tcm4TlvDq8ikWAM) returned 402 paid_plan_required from the EL TTS API; gating differs per voice. Set ELEVENLABS_VOICE_ID to override.
stt- STTProvider override. Defaults to ElevenLabsSTTProvider(api_key=api_key).
system_prompt- Optional system prompt. Defaults to ComposableVoiceAgent.DEFAULT_SYSTEM_PROMPT.
Expand source code
class ElevenLabsVoiceAgent(ComposableVoiceAgent): """ Composable voice agent with ElevenLabs-opinionated defaults. Not to be confused with :class:`ElevenLabsAgentAdapter` (in ``scenario.voice.adapters.elevenlabs``) — that one talks to ElevenLabs' **hosted** Conversational AI endpoint where EL runs the full STT→LLM→TTS loop. This class is local: you compose ``ElevenLabsSTTProvider`` + any LLM + ElevenLabs TTS yourself, keeping full control over prompts, model choice, and tool calls. Instantiate with just an ``api_key`` to get an ElevenLabs STT + LLM (default ``COMPOSABLE_VOICE_LLM_MODEL``) + ``elevenlabs/rachel`` TTS stack. Each piece can be overridden independently without changing the others. Example:: # Defaults — all ElevenLabs STT, GPT-4o-mini, ElevenLabs TTS agent = ElevenLabsVoiceAgent(api_key="sk-...") # Override just the LLM agent = ElevenLabsVoiceAgent(api_key="sk-...", llm="openai/gpt-4o") # Bring your own STT agent = ElevenLabsVoiceAgent(api_key="sk-...", stt=MyCustomSTT()) """ def __init__( self, api_key: str, *, llm: str = COMPOSABLE_VOICE_LLM_MODEL, voice: Optional[str] = None, stt: Optional[STTProvider] = None, system_prompt: Optional[str] = None, ) -> None: """ Args: api_key: ElevenLabs API key. Redacted in ``__repr__``. llm: litellm-style model identifier. Defaults to ``COMPOSABLE_VOICE_LLM_MODEL``. voice: TTS voice string in ``"elevenlabs/<voice_id>"`` format. Defaults to the ``ELEVENLABS_VOICE_ID`` environment variable when set, otherwise falls back to "Sarah" (``"elevenlabs/EXAVITQu4vr4xnSDxMaL"``) — premade and accessible on the ElevenLabs free tier as of 2026-05. Other premade voices (e.g. "Rachel" ``21m00Tcm4TlvDq8ikWAM``) returned 402 paid_plan_required from the EL TTS API; gating differs per voice. Set ``ELEVENLABS_VOICE_ID`` to override. stt: STTProvider override. Defaults to ``ElevenLabsSTTProvider(api_key=api_key)``. system_prompt: Optional system prompt. Defaults to ``ComposableVoiceAgent.DEFAULT_SYSTEM_PROMPT``. 
""" import os if voice is None: env_voice_id = os.environ.get("ELEVENLABS_VOICE_ID") voice = ( f"elevenlabs/{env_voice_id}" if env_voice_id else "elevenlabs/EXAVITQu4vr4xnSDxMaL" # "Sarah" — free-tier premade ) resolved_stt = stt if stt is not None else ElevenLabsSTTProvider(api_key=api_key) super().__init__(stt=resolved_stt, llm=llm, tts=voice, system_prompt=system_prompt) self._api_key = api_key self.voice = voice def __repr__(self) -> str: # redact credentials return ( f"ElevenLabsVoiceAgent(" f"api_key='***', llm={self.llm!r}, voice={self.voice!r})" )Ancestors
Inherited members