Module scenario.voice.messages

Audio message helpers.

Encodes AudioChunks into OpenAI-compatible multimodal messages (input_audio content parts for user role, assistant messages with a transcript + audio attachment for assistant role) and extracts them back out.

This is the glue between AudioChunk (internal format) and ChatCompletionMessageParam (the SDK's existing message bus).

Expand source code
"""
Audio message helpers.

Encodes AudioChunks into OpenAI-compatible multimodal messages (input_audio
content parts for user role, assistant messages with a transcript + audio
attachment for assistant role) and extracts them back out.

This is the glue between AudioChunk (internal format) and
ChatCompletionMessageParam (the SDK's existing message bus).
"""

from __future__ import annotations

import base64
from typing import Any, Optional, cast

from openai.types.chat import ChatCompletionMessageParam

from .audio_chunk import AudioChunk, PCM16_SAMPLE_RATE


def _pcm16_to_wav_bytes(pcm: bytes) -> bytes:
    """Package raw PCM sample bytes as an in-memory WAV file.

    The WAV header declares one channel, two bytes per sample and a frame
    rate of PCM16_SAMPLE_RATE; ``pcm`` is written as the frame data without
    any conversion.

    Returns the complete WAV file (header plus frames) as bytes.
    """
    import io
    import wave

    sink = io.BytesIO()
    with wave.open(sink, "wb") as writer:
        # The header fields are independent of each other; they only need to
        # be set before the first frame is written.
        writer.setframerate(PCM16_SAMPLE_RATE)
        writer.setsampwidth(2)
        writer.setnchannels(1)
        writer.writeframes(pcm)
    # Closing the writer finalises the header sizes; the buffer itself stays
    # readable because the wave module did not open it.
    wav_bytes = sink.getvalue()
    return wav_bytes


def _wav_bytes_to_pcm16(wav: bytes) -> bytes:
    """Extract raw PCM16 frames from a WAV byte string (24kHz mono expected)."""
    from io import BytesIO
    import wave

    with wave.open(BytesIO(wav), "rb") as w:
        return w.readframes(w.getnframes())


def create_audio_message(
    chunk: AudioChunk,
    role: str = "user",
) -> ChatCompletionMessageParam:
    """
    Build an OpenAI-compatible multimodal message from an AudioChunk.

    The chunk's PCM data is wrapped in a WAV container, base64-encoded and
    placed in an input_audio content part. When the chunk has a non-empty
    transcript, a text part holding it is placed before the audio part, so
    consumers that only read text still see the content.

    The role is passed through unchanged and the same content shape is built
    for every role — there is no forceUserRole workaround.
    """
    encoded = base64.b64encode(_pcm16_to_wav_bytes(chunk.data)).decode()
    audio_part: dict[str, Any] = {
        "type": "input_audio",
        "input_audio": {"data": encoded, "format": "wav"},
    }
    if chunk.transcript:
        # Text goes first so the transcript precedes its audio.
        content: list[dict[str, Any]] = [
            {"type": "text", "text": chunk.transcript},
            audio_part,
        ]
    else:
        content = [audio_part]
    message = {"role": role, "content": content}
    return cast(ChatCompletionMessageParam, message)


def extract_audio(message: ChatCompletionMessageParam) -> Optional[AudioChunk]:
    """
    Pull the first audio chunk out of an OpenAI-format message.

    Content parts are scanned in order. The first 'input_audio' (OpenAI API
    convention) or 'audio' (alternate providers) part carrying a non-empty
    base64 payload is decoded; later audio parts are ignored. A payload that
    starts with the RIFF magic is unwrapped as WAV, anything else is taken
    as raw PCM bytes.

    The transcript is the last non-empty text part that precedes the audio.
    If no text precedes it, the first non-empty text part that follows the
    audio is used instead, so a message ordered [audio, text] does not lose
    its transcript.

    Returns None if the message is not a dict, its content is not a list, or
    no audio part with a payload is present.

    Errors raised by base64 decoding or by the WAV reader on malformed
    payloads are not caught here and propagate to the caller.
    """
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, list):
        return None

    pcm: Optional[bytes] = None
    transcript: Optional[str] = None
    for part in content:
        if not isinstance(part, dict):
            continue
        kind = part.get("type")
        if kind == "text":
            transcript = part.get("text") or transcript
            # Audio already decoded and a trailing transcript just found:
            # nothing further to collect.
            if pcm is not None and transcript:
                break
            continue
        # Only the first audio part with a payload is decoded.
        if pcm is not None or kind not in ("input_audio", "audio"):
            continue
        payload = part.get("input_audio") or part.get("audio") or {}
        b64 = payload.get("data") if isinstance(payload, dict) else None
        if not b64:
            continue
        raw = base64.b64decode(b64)
        # We expect WAV by convention. If it's raw PCM, bytes pass through.
        pcm = _wav_bytes_to_pcm16(raw) if raw[:4] == b"RIFF" else raw
        if transcript:
            # A preceding transcript wins; stop exactly where the scan
            # would have returned before trailing text was considered.
            break

    if pcm is None:
        return None
    return AudioChunk(data=pcm, transcript=transcript)


def message_has_audio(message: ChatCompletionMessageParam) -> bool:
    """Report whether extract_audio can pull an audio chunk from the message."""
    chunk = extract_audio(message)
    return chunk is not None

Functions

def create_audio_message(chunk: AudioChunk, role: str = 'user') ‑> openai.types.chat.chat_completion_developer_message_param.ChatCompletionDeveloperMessageParam | openai.types.chat.chat_completion_system_message_param.ChatCompletionSystemMessageParam | openai.types.chat.chat_completion_user_message_param.ChatCompletionUserMessageParam | openai.types.chat.chat_completion_assistant_message_param.ChatCompletionAssistantMessageParam | openai.types.chat.chat_completion_tool_message_param.ChatCompletionToolMessageParam | openai.types.chat.chat_completion_function_message_param.ChatCompletionFunctionMessageParam

Turn an AudioChunk into an OpenAI-compatible message.

The content is a list with an input_audio part carrying base64-encoded WAV. If the chunk has a transcript, it is added as a text part alongside the audio — this is what lets the judge's text-only path still read the content.

Audio travels cleanly in any role (user or assistant) per the locked design — there is no forceUserRole workaround.

Expand source code
def create_audio_message(
    chunk: AudioChunk,
    role: str = "user",
) -> ChatCompletionMessageParam:
    """
    Build an OpenAI-compatible multimodal message from an AudioChunk.

    The chunk's PCM data is wrapped in a WAV container, base64-encoded and
    placed in an input_audio content part. When the chunk has a non-empty
    transcript, a text part holding it is placed before the audio part, so
    consumers that only read text still see the content.

    The role is passed through unchanged and the same content shape is built
    for every role — there is no forceUserRole workaround.
    """
    encoded = base64.b64encode(_pcm16_to_wav_bytes(chunk.data)).decode()
    audio_part: dict[str, Any] = {
        "type": "input_audio",
        "input_audio": {"data": encoded, "format": "wav"},
    }
    if chunk.transcript:
        # Text goes first so the transcript precedes its audio.
        content: list[dict[str, Any]] = [
            {"type": "text", "text": chunk.transcript},
            audio_part,
        ]
    else:
        content = [audio_part]
    message = {"role": role, "content": content}
    return cast(ChatCompletionMessageParam, message)
def extract_audio(message: ChatCompletionMessageParam) ‑> AudioChunk | None

Pull the first audio chunk out of an OpenAI-format message.

Returns None if the message has no audio content part. Accepts both 'input_audio' (OpenAI API convention) and 'audio' (alternate providers).

Expand source code
def extract_audio(message: ChatCompletionMessageParam) -> Optional[AudioChunk]:
    """
    Pull the first audio chunk out of an OpenAI-format message.

    Content parts are scanned in order. The first 'input_audio' (OpenAI API
    convention) or 'audio' (alternate providers) part carrying a non-empty
    base64 payload is decoded; later audio parts are ignored. A payload that
    starts with the RIFF magic is unwrapped as WAV, anything else is taken
    as raw PCM bytes.

    The transcript is the last non-empty text part that precedes the audio.
    If no text precedes it, the first non-empty text part that follows the
    audio is used instead, so a message ordered [audio, text] does not lose
    its transcript.

    Returns None if the message is not a dict, its content is not a list, or
    no audio part with a payload is present.

    Errors raised by base64 decoding or by the WAV reader on malformed
    payloads are not caught here and propagate to the caller.
    """
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, list):
        return None

    pcm: Optional[bytes] = None
    transcript: Optional[str] = None
    for part in content:
        if not isinstance(part, dict):
            continue
        kind = part.get("type")
        if kind == "text":
            transcript = part.get("text") or transcript
            # Audio already decoded and a trailing transcript just found:
            # nothing further to collect.
            if pcm is not None and transcript:
                break
            continue
        # Only the first audio part with a payload is decoded.
        if pcm is not None or kind not in ("input_audio", "audio"):
            continue
        payload = part.get("input_audio") or part.get("audio") or {}
        b64 = payload.get("data") if isinstance(payload, dict) else None
        if not b64:
            continue
        raw = base64.b64decode(b64)
        # We expect WAV by convention. If it's raw PCM, bytes pass through.
        pcm = _wav_bytes_to_pcm16(raw) if raw[:4] == b"RIFF" else raw
        if transcript:
            # A preceding transcript wins; stop exactly where the scan
            # would have returned before trailing text was considered.
            break

    if pcm is None:
        return None
    return AudioChunk(data=pcm, transcript=transcript)
def message_has_audio(message: ChatCompletionMessageParam) ‑> bool

True if the message contains any audio content part.

Expand source code
def message_has_audio(message: ChatCompletionMessageParam) -> bool:
    """Report whether extract_audio can pull an audio chunk from the message."""
    chunk = extract_audio(message)
    return chunk is not None