Source code for pipecat.processors.frameworks.rtvi.models

#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""RTVI protocol v1 message models.

Contains all RTVI protocol v1 message definitions and data structures.
Import this module under the ``RTVI`` alias to use as a namespace::

    import pipecat.processors.frameworks.rtvi.models as RTVI

    msg = RTVI.BotReady(id="1", data=RTVI.BotReadyData(version=RTVI.PROTOCOL_VERSION))
"""

from collections.abc import Mapping
from typing import (
    Any,
    Literal,
)

from pydantic import BaseModel, ConfigDict

from pipecat.frames.frames import (
    AggregationType,
)

# -- Constants --
# RTVI protocol version string exposed by this module (the module docstring
# shows it being passed as ``BotReadyData.version``).
PROTOCOL_VERSION = "1.4.0"

# Label value used as the default of the ``label`` field on the message
# models in this module.
MESSAGE_LABEL = "rtvi-ai"
# Literal type of the ``label`` field; its single allowed value is the same
# string as ``MESSAGE_LABEL``, so the two must be kept in sync.
MessageLiteral = Literal["rtvi-ai"]

# -- Base Message Structure --


class Message(BaseModel):
    """Generic RTVI message envelope.

    Describes the standard layout of an RTVI protocol message: the protocol
    label, a message type, a message id and an optional free-form data
    payload.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: str
    id: str
    data: dict[str, Any] | None = None
# -- Client -> Pipecat messages.
class RawClientMessageData(BaseModel):
    """Shape of the data the RTVI server expects in a client message.

    Holds a required string ``t`` and an optional arbitrary value ``d``.
    """

    t: str
    d: Any | None = None
class ClientMessage(BaseModel):
    """Cleansed client message, in the form used for handling.

    Carries the message id, the message type and an optional payload.
    """

    msg_id: str
    type: str
    data: Any | None = None
class RawServerResponseData(BaseModel):
    """Payload of a server response to a client message.

    Holds a required string ``t`` and an optional arbitrary value ``d``.
    """

    t: str
    d: Any | None = None
class ServerResponse(BaseModel):
    """RTVI-formatted response sent from the server to the client.

    Used to answer custom messages that the client sent. The ``type`` field
    is pinned to ``"server-response"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["server-response"] = "server-response"
    id: str
    data: RawServerResponseData
class AboutClientData(BaseModel):
    """Description of the RTVI client.

    Identifies the RTVI library the client uses and, when available, the
    library version, the platform it runs on and any extra platform details.
    Only ``library`` is required.
    """

    library: str
    library_version: str | None = None
    platform: str | None = None
    platform_version: str | None = None
    platform_details: Any | None = None
class ClientReadyData(BaseModel):
    """Payload of a client ready message.

    Carries the RTVI protocol version together with information about the
    client.
    """

    version: str
    about: AboutClientData
# -- Pipecat -> Client errors
class ErrorResponseData(BaseModel):
    """Payload of an RTVI error response.

    Holds the error message that is sent back to the client.
    """

    error: str
class ErrorResponse(BaseModel):
    """RTVI error response message.

    RTVI-formatted message used to relay a failed client request. The
    ``type`` field is pinned to ``"error-response"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["error-response"] = "error-response"
    id: str
    data: ErrorResponseData
class ErrorData(BaseModel):
    """Payload of an RTVI error event.

    Holds the error text and a flag telling whether the error is fatal.
    """

    error: str
    # True when the pipeline has stopped because of this error.
    fatal: bool
class Error(BaseModel):
    """RTVI error event message.

    RTVI-formatted message used to relay errors that occur in the pipeline.
    The ``type`` field is pinned to ``"error"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["error"] = "error"
    data: ErrorData
# -- Pipecat -> Client responses and messages.
class BotReadyData(BaseModel):
    """Payload of the bot ready notification.

    Carries the protocol version and an optional ``about`` mapping with
    additional information.
    """

    version: str
    about: Mapping[str, Any] | None = None
class BotReady(BaseModel):
    """Message announcing that the bot is ready for interaction.

    Sent once bot initialization has completed. The ``type`` field is
    pinned to ``"bot-ready"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-ready"] = "bot-ready"
    id: str
    data: BotReadyData
class LLMFunctionCallMessageData(BaseModel):
    """Payload of the LLM function call notification.

    Holds the function name, the tool call id and the call arguments.

    .. deprecated:: 0.0.102
        Use ``LLMFunctionCallInProgressMessageData`` instead.
    """

    function_name: str
    tool_call_id: str
    args: Mapping[str, Any]
class LLMFunctionCallMessage(BaseModel):
    """Message announcing an LLM function call.

    Sent when the LLM makes a function call. The ``type`` field is pinned
    to ``"llm-function-call"``.

    .. deprecated:: 0.0.102
        Use ``LLMFunctionCallInProgressMessage`` with the
        ``llm-function-call-in-progress`` event type instead.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["llm-function-call"] = "llm-function-call"
    data: LLMFunctionCallMessageData
class SendTextOptions(BaseModel):
    """Options that accompany text input sent to the LLM.

    Describes how the pipeline should process the text input. Both flags
    default to ``True``.
    """

    run_immediately: bool = True
    audio_response: bool = True
class SendTextData(BaseModel):
    """Payload used to send text input to the LLM.

    Holds the text content and, optionally, the options telling the
    pipeline how to process it.
    """

    content: str
    options: SendTextOptions | None = None
class LLMFunctionCallStartMessageData(BaseModel):
    """Payload of the LLM function call start notification.

    Holds the name of the function being called. For security, fields may
    be omitted depending on the configured function_call_report_level.
    """

    function_name: str | None = None
class LLMFunctionCallStartMessage(BaseModel):
    """Message announcing that an LLM function call has started.

    Sent when the LLM begins a function call. The ``type`` field is pinned
    to ``"llm-function-call-started"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["llm-function-call-started"] = "llm-function-call-started"
    data: LLMFunctionCallStartMessageData
class LLMFunctionCallResultData(BaseModel):
    """Payload describing the result of an LLM function call.

    Holds the function name, the tool call id, the call arguments and the
    result, which is either a dict or a string.
    """

    function_name: str
    tool_call_id: str
    arguments: dict
    result: dict | str
class LLMFunctionCallInProgressMessageData(BaseModel):
    """Payload of the LLM function call in-progress notification.

    Holds the tool call id and, optionally, the function name and the call
    arguments. For security, fields may be omitted depending on the
    configured function_call_report_level.
    """

    tool_call_id: str
    function_name: str | None = None
    arguments: Mapping[str, Any] | None = None
class LLMFunctionCallInProgressMessage(BaseModel):
    """Message announcing that an LLM function call is in progress.

    Sent when execution of the LLM function call begins. The ``type`` field
    is pinned to ``"llm-function-call-in-progress"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["llm-function-call-in-progress"] = "llm-function-call-in-progress"
    data: LLMFunctionCallInProgressMessageData
class LLMFunctionCallStoppedMessageData(BaseModel):
    """Payload of the LLM function call stopped notification.

    Describes the function call that stopped, including whether it was
    cancelled or finished with a result. For security, fields may be
    omitted depending on the configured function_call_report_level.
    """

    tool_call_id: str
    cancelled: bool
    function_name: str | None = None
    result: Any | None = None
class LLMFunctionCallStoppedMessage(BaseModel):
    """Message announcing that an LLM function call has stopped.

    Sent when a function call either completes (with a result) or is
    cancelled. The ``type`` field is pinned to
    ``"llm-function-call-stopped"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["llm-function-call-stopped"] = "llm-function-call-stopped"
    data: LLMFunctionCallStoppedMessageData
class BotLLMStartedMessage(BaseModel):
    """Signals that bot LLM processing has started.

    Carries no data; the ``type`` field is pinned to ``"bot-llm-started"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-llm-started"] = "bot-llm-started"
class BotLLMStoppedMessage(BaseModel):
    """Signals that bot LLM processing has stopped.

    Carries no data; the ``type`` field is pinned to ``"bot-llm-stopped"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-llm-stopped"] = "bot-llm-stopped"
class BotTTSStartedMessage(BaseModel):
    """Signals that bot TTS processing has started.

    Carries no data; the ``type`` field is pinned to ``"bot-tts-started"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-tts-started"] = "bot-tts-started"
class BotTTSStoppedMessage(BaseModel):
    """Signals that bot TTS processing has stopped.

    Carries no data; the ``type`` field is pinned to ``"bot-tts-stopped"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-tts-stopped"] = "bot-tts-stopped"
class TextMessageData(BaseModel):
    """Payload of text-based RTVI messages.

    Holds a single ``text`` string.
    """

    text: str
class BotOutputMessageData(TextMessageData):
    """Payload of bot output RTVI messages.

    Builds on ``TextMessageData`` by adding metadata about the output.
    """

    # Whether the text has been spoken by TTS.
    spoken: bool = False
    # What form the text is in (e.g., by word, sentence, etc.).
    aggregated_by: AggregationType | str
class BotOutputMessage(BaseModel):
    """Message carrying bot output text.

    An event intended to represent holistically what the bot is outputting,
    together with metadata about the output and whether it has been spoken.
    The ``type`` field is pinned to ``"bot-output"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-output"] = "bot-output"
    data: BotOutputMessageData
class BotTranscriptionMessage(BaseModel):
    """Message carrying bot transcription text.

    Sent when the bot's speech is transcribed. The ``type`` field is pinned
    to ``"bot-transcription"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-transcription"] = "bot-transcription"
    data: TextMessageData
class BotLLMTextMessage(BaseModel):
    """Message carrying bot LLM text output.

    Sent when the bot's LLM generates text. The ``type`` field is pinned to
    ``"bot-llm-text"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-llm-text"] = "bot-llm-text"
    data: TextMessageData
class BotTTSTextMessage(BaseModel):
    """Message carrying bot TTS text output.

    Sent when text is being processed by TTS. The ``type`` field is pinned
    to ``"bot-tts-text"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-tts-text"] = "bot-tts-text"
    data: TextMessageData
class AudioMessageData(BaseModel):
    """Payload of audio-based RTVI messages.

    Holds the audio data as a string, plus its sample rate and number of
    channels.
    """

    audio: str
    sample_rate: int
    num_channels: int
class BotTTSAudioMessage(BaseModel):
    """Message carrying bot TTS audio output.

    Sent when the bot's TTS generates audio. The ``type`` field is pinned
    to ``"bot-tts-audio"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-tts-audio"] = "bot-tts-audio"
    data: AudioMessageData
class UserTranscriptionMessageData(BaseModel):
    """Payload of user transcription messages.

    Holds the transcription text along with its metadata: the user id, a
    timestamp string and a ``final`` flag.
    """

    text: str
    user_id: str
    timestamp: str
    final: bool
class UserTranscriptionMessage(BaseModel):
    """Message carrying a user transcription.

    Sent when user speech is transcribed. The ``type`` field is pinned to
    ``"user-transcription"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["user-transcription"] = "user-transcription"
    data: UserTranscriptionMessageData
class UserLLMTextMessage(BaseModel):
    """Message carrying user text input for the LLM.

    Sent when user text is processed by the LLM. The ``type`` field is
    pinned to ``"user-llm-text"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["user-llm-text"] = "user-llm-text"
    data: TextMessageData
class UserStartedSpeakingMessage(BaseModel):
    """Signals that the user has started speaking.

    Carries no data; the ``type`` field is pinned to
    ``"user-started-speaking"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["user-started-speaking"] = "user-started-speaking"
class UserStoppedSpeakingMessage(BaseModel):
    """Signals that the user has stopped speaking.

    Carries no data; the ``type`` field is pinned to
    ``"user-stopped-speaking"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["user-stopped-speaking"] = "user-stopped-speaking"
class UserMuteStartedMessage(BaseModel):
    """Signals that the user has been muted.

    Carries no data; the ``type`` field is pinned to
    ``"user-mute-started"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["user-mute-started"] = "user-mute-started"
class UserMuteStoppedMessage(BaseModel):
    """Signals that the user has been unmuted.

    Carries no data; the ``type`` field is pinned to
    ``"user-mute-stopped"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["user-mute-stopped"] = "user-mute-stopped"
class BotStartedSpeakingMessage(BaseModel):
    """Signals that the bot has started speaking.

    Carries no data; the ``type`` field is pinned to
    ``"bot-started-speaking"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-started-speaking"] = "bot-started-speaking"
class BotStoppedSpeakingMessage(BaseModel):
    """Signals that the bot has stopped speaking.

    Carries no data; the ``type`` field is pinned to
    ``"bot-stopped-speaking"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-stopped-speaking"] = "bot-stopped-speaking"
class MetricsMessage(BaseModel):
    """Message carrying performance metrics.

    Sent to report performance and usage metrics. The ``type`` field is
    pinned to ``"metrics"`` and ``data`` is a string-keyed mapping.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["metrics"] = "metrics"
    data: Mapping[str, Any]
class ServerMessage(BaseModel):
    """Generic server message.

    Used for custom server-to-client messages. The ``type`` field is pinned
    to ``"server-message"`` and ``data`` accepts any value.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["server-message"] = "server-message"
    data: Any
class AudioLevelMessageData(BaseModel):
    """Payload used to send an audio level.

    Holds a single float ``value``.
    """

    value: float
class UserAudioLevelMessage(BaseModel):
    """Message reporting the user audio level.

    The ``type`` field is pinned to ``"user-audio-level"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["user-audio-level"] = "user-audio-level"
    data: AudioLevelMessageData
class BotAudioLevelMessage(BaseModel):
    """Message reporting the bot audio level.

    The ``type`` field is pinned to ``"bot-audio-level"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["bot-audio-level"] = "bot-audio-level"
    data: AudioLevelMessageData
class SystemLogMessage(BaseModel):
    """Message that carries a system log.

    The ``type`` field is pinned to ``"system-log"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["system-log"] = "system-log"
    data: TextMessageData
# -- UI Worker Protocol ------------------------------------------------------
#
# A structured RTVI message vocabulary that lets server-side workers
# observe and drive a GUI app on the client side. The protocol covers
# five first-class RTVI message types:
#
#   ui-event             client-to-server event message
#   ui-command           server-to-client command message
#   ui-snapshot          client-to-server accessibility snapshot
#   ui-cancel-job-group  client-to-server cancellation request
#   ui-job-group         server-to-client job-group lifecycle envelope
#
# This section is data only (constants and payload models, no
# behavior). ``pipecat.workers.ui.UIWorker`` builds the higher-level
# abstractions on top, and single-LLM Pipecat apps can target the same
# wire format directly via custom tools that emit typed RTVI messages
# with these types. The matching client-side implementation lives in
# ``@pipecat-ai/client-js`` and ``@pipecat-ai/client-react``.

# The wire-format ``type`` strings (``"ui-event"``, ``"ui-command"``,
# ``"ui-snapshot"``, ``"ui-cancel-job-group"``, ``"ui-job-group"``) are pinned
# as ``Literal[...]`` field defaults on the corresponding ``*Message``
# pydantic class below, matching the convention used for every other
# RTVI message type in this module.

# Each ``ui-job-group`` envelope carries a ``kind`` field that the client's
# reducer dispatches on. The four kinds form the lifecycle of a
# user-facing job group:
#
#   group_started → job_update* → job_completed × N → group_completed
#
# where N is the number of workers in the group. The kind strings are
# pinned as ``Literal[...]`` defaults on the matching ``UIJob*Data``
# class below.

# -- UI envelope data classes --
class UIEventData(BaseModel):
    """Inner ``data`` of a ``ui-event`` message.

    Parameters:
        event: Event defined by the app.
        payload: Payload defined by the app; deliberately schemaless.
    """

    event: str
    payload: Any | None = None
class UICommandData(BaseModel):
    """Inner ``data`` of a ``ui-command`` message.

    Parameters:
        command: Command defined by the app.
        payload: Payload defined by the app. It is already a plain dict by
            the time it reaches the wire; the standard command payload
            models in this module yield the right shape through
            ``model_dump()``.
    """

    command: str
    payload: Any | None = None
class A11yNode(BaseModel):
    """A single node of the UI accessibility snapshot tree.

    Matches the client-side ``A11yNode`` wire shape. Extra fields are
    accepted so that clients can attach platform-specific or future
    metadata without breaking older servers.

    Parameters:
        ref: Stable element reference assigned by the client.
        role: ARIA-style role of the node.
        name: Accessible name, if any.
        value: Current value for inputs/progress/etc., if any.
        state: Short state tags, if any (e.g. ``"focused"``,
            ``"disabled"``, ``"offscreen"``).
        level: Heading level, if any.
        colcount: Column count for grid-like containers, if any.
        rowcount: Row count for grid-like containers, if any.
        children: Child nodes, if any.
    """

    model_config = ConfigDict(extra="allow")

    ref: str
    role: str
    name: str | None = None
    value: str | None = None
    state: list[str] | None = None
    level: int | None = None
    colcount: int | None = None
    rowcount: int | None = None
    children: list["A11yNode"] | None = None
class A11ySelection(BaseModel):
    """Current text selection of the user within the UI snapshot.

    Extra fields are accepted to stay forward compatible with additions to
    the client snapshot.

    Parameters:
        ref: Ref of the element carrying the selection.
        text: The selected text.
        start_offset: Selection start offset, if provided.
        end_offset: Selection end offset, if provided.
    """

    model_config = ConfigDict(extra="allow")

    ref: str
    text: str
    start_offset: int | None = None
    end_offset: int | None = None
class A11ySnapshot(BaseModel):
    """Accessibility snapshot the client sends in a ``ui-snapshot`` message.

    Matches the client-side ``A11ySnapshot`` wire shape. Extra fields are
    accepted so that clients can add compatible metadata over time.

    Parameters:
        root: Root node of the accessibility tree.
        captured_at: Client-side capture time in epoch milliseconds.
        selection: Current text selection, if any.
    """

    model_config = ConfigDict(extra="allow")

    root: A11yNode
    captured_at: int
    selection: A11ySelection | None = None
class UISnapshotData(BaseModel):
    """Inner ``data`` of a ``ui-snapshot`` message.

    The accessibility snapshot tree matches the client-side
    ``A11ySnapshot`` wire shape; forward compatibility comes from the
    snapshot models accepting extra fields.

    Parameters:
        tree: The serialized accessibility tree.
    """

    tree: A11ySnapshot
class UICancelJobGroupData(BaseModel):
    """Inner ``data`` of a ``ui-cancel-job-group`` message.

    Parameters:
        job_id: Id of the job group the client wants cancelled.
        reason: Human-readable reason, if given.
    """

    job_id: str
    reason: str | None = None
class UIJobGroupStartedData(BaseModel):
    """``data`` of a ``ui-job-group`` envelope whose kind is ``group_started``.

    Parameters:
        kind: Always ``"group_started"``.
        job_id: Job-group identifier shared across the group.
        workers: Names of the workers the work was dispatched to.
        label: Human-readable label for the group, if any.
        cancellable: Whether the client may request cancellation.
        at: Epoch milliseconds at which the group started.
    """

    kind: Literal["group_started"] = "group_started"
    job_id: str
    workers: list[str] | None = None
    label: str | None = None
    cancellable: bool = True
    at: int = 0
class UIJobUpdateData(BaseModel):
    """``data`` of a ``ui-job-group`` envelope whose kind is ``job_update``.

    Parameters:
        kind: Always ``"job_update"``.
        job_id: The shared job-group identifier.
        worker_name: Name of the worker that produced the update.
        data: Update payload from the worker, forwarded verbatim.
        at: Epoch milliseconds at which the update was emitted.
    """

    kind: Literal["job_update"] = "job_update"
    job_id: str
    worker_name: str
    data: Any | None = None
    at: int = 0
class UIJobCompletedData(BaseModel):
    """``data`` of a ``ui-job-group`` envelope whose kind is ``job_completed``.

    Parameters:
        kind: Always ``"job_completed"``.
        job_id: The shared job-group identifier.
        worker_name: Name of the worker that produced the response.
        status: Completion status string.
        response: Response payload from the worker.
        at: Epoch milliseconds at which the response was received.
    """

    kind: Literal["job_completed"] = "job_completed"
    job_id: str
    worker_name: str
    status: str
    response: Any | None = None
    at: int = 0
class UIJobGroupCompletedData(BaseModel):
    """``data`` of a ``ui-job-group`` envelope whose kind is ``group_completed``.

    Parameters:
        kind: Always ``"group_completed"``.
        job_id: The shared job-group identifier.
        at: Epoch milliseconds at which the group completed.
    """

    kind: Literal["group_completed"] = "group_completed"
    job_id: str
    at: int = 0
#: Union over the four job-group lifecycle data shapes. Each variant pins a
#: distinct ``kind`` literal, which is the field that tells them apart.
#: NOTE(review): this is a plain union; no pydantic discriminator is declared.
UIJobGroupData = (
    UIJobGroupStartedData | UIJobUpdateData | UIJobCompletedData | UIJobGroupCompletedData
)

# -- UI envelope message classes --
class UIEventMessage(BaseModel):
    """RTVI ``ui-event`` message (client → server).

    The ``type`` field is pinned to ``"ui-event"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-event"] = "ui-event"
    id: str
    data: UIEventData
class UICommandMessage(BaseModel):
    """RTVI ``ui-command`` message (server → client).

    The ``type`` field is pinned to ``"ui-command"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-command"] = "ui-command"
    data: UICommandData
class UISnapshotMessage(BaseModel):
    """RTVI ``ui-snapshot`` message (client → server).

    The ``type`` field is pinned to ``"ui-snapshot"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-snapshot"] = "ui-snapshot"
    id: str
    data: UISnapshotData
class UICancelJobGroupMessage(BaseModel):
    """RTVI ``ui-cancel-job-group`` message (client → server).

    The ``type`` field is pinned to ``"ui-cancel-job-group"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-cancel-job-group"] = "ui-cancel-job-group"
    id: str
    data: UICancelJobGroupData
class UIJobGroupMessage(BaseModel):
    """RTVI ``ui-job-group`` message (server → client).

    The ``data`` field holds one of the four job-group lifecycle data
    shapes, which are told apart by their ``kind`` field. The ``type``
    field is pinned to ``"ui-job-group"``.
    """

    label: MessageLiteral = MESSAGE_LABEL
    type: Literal["ui-job-group"] = "ui-job-group"
    data: UIJobGroupData
# -- UI command payloads --
#
# These models describe commands that have matching default React
# handlers in ``@pipecat-ai/client-react``'s ``standardHandlers``.
# Apps can use them as-is, override the client handler to customize
# rendering, or ignore them entirely and define their own command
# names.
#
# Server-side helpers that send commands accept these models directly.
# ``BaseModel.model_dump()`` converts them to the plain-dict shape
# that travels over the wire.
class Toast(BaseModel):
    """Transient notification surface displayed on the client.

    Parameters:
        title: Headline; required.
        subtitle: Second line shown beneath the title, if any.
        description: Body text, if any.
        image_url: Leading image, if any.
        duration_ms: Dismiss timer, if any. The client default applies
            when None.
    """

    title: str
    subtitle: str | None = None
    description: str | None = None
    image_url: str | None = None
    duration_ms: int | None = None
class ScrollTo(BaseModel):
    """Scroll a target element into view.

    The client first tries to resolve the target through ``ref`` (a
    snapshot ref such as ``"e42"`` assigned by the a11y walker) and then
    falls back to ``target_id`` (``document.getElementById``). Provide
    whichever one is available; ``ref`` is the usual choice when acting on
    a node taken from ``<ui_state>``.

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client.
        behavior: Scroll behavior hint, if any. Typical values are
            ``"smooth"`` or ``"instant"``. Clients may ignore it.
    """

    ref: str | None = None
    target_id: str | None = None
    behavior: str | None = None
class Highlight(BaseModel):
    """Briefly emphasize a target element (flash, glow, pulse).

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client.
        duration_ms: Highlight duration, if any. The client default
            applies when None.
    """

    ref: str | None = None
    target_id: str | None = None
    duration_ms: int | None = None
class Focus(BaseModel):
    """Move input focus to a target element.

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client.
    """

    ref: str | None = None
    target_id: str | None = None
class Click(BaseModel):
    """Click an element on the client.

    Closes the form-fill loop for non-text inputs (checkboxes, radios) and
    exposes the remainder of the action vocabulary (submit buttons, links,
    app-specific clickable nodes).

    The standard handler silently no-ops on ``disabled`` targets so the
    worker can't bypass UI affordances the user is meant to control. For
    native ``<select>``, prefer ``SetInputValue`` (clicking options doesn't
    reliably change the selection); for custom comboboxes (ARIA listbox +
    popup), apps wire their own command matching the library's interaction
    model.

    Parameters:
        ref: Snapshot ref from ``<ui_state>``.
        target_id: Element id registered on the client. Serves as a
            fallback when ``ref`` is not set or has gone stale.
    """

    ref: str | None = None
    target_id: str | None = None
class SetInputValue(BaseModel):
    """Write a value into a text input or textarea on the client.

    Intended for form-filling: the worker has decided what belongs in a
    field (clarifying answer, tax form entry, etc.) and asks the client to
    populate it. With ``replace=True`` (the default) the existing value is
    overwritten; with ``replace=False`` the value is appended.

    The standard handler silently no-ops on ``disabled``, ``readonly``, and
    ``<input type="hidden">`` targets so the worker can't write into fields
    the user can't.

    Parameters:
        value: The text to write.
        ref: Snapshot ref from ``<ui_state>``. Typically the ref of an
            ``<input>`` or ``<textarea>``.
        target_id: Element id registered on the client. Serves as a
            fallback when ``ref`` is not set or has gone stale.
        replace: When True (the default), overwrite the current value.
            When False, append to it.
    """

    value: str = ""
    ref: str | None = None
    target_id: str | None = None
    replace: bool = True
class SelectText(BaseModel):
    """Select text on the page so the user can see what the worker means.

    Counterpart of the ``selection`` field surfaced in the snapshot. Use it
    to direct the user's attention to a specific paragraph or range once
    the worker has decided what it is referring to.

    When ``start_offset`` and ``end_offset`` are omitted, the whole text
    content of the target is selected (``Range.selectNodeContents`` for
    document elements; ``el.select()`` for ``<input>`` / ``<textarea>``).

    Parameters:
        ref: Snapshot ref from ``<ui_state>``. Typically the ref of a
            paragraph or input element.
        target_id: Element id registered on the client. Serves as a
            fallback when ``ref`` is not set or has gone stale.
        start_offset: Character offset within the target's text where the
            selection should start. For ``<input>`` and ``<textarea>`` this
            is the value offset; for document elements it is computed
            against the concatenation of descendant text nodes in document
            order.
        end_offset: End character offset, exclusive. Uses the same
            coordinate system as ``start_offset``.
    """

    ref: str | None = None
    target_id: str | None = None
    start_offset: int | None = None
    end_offset: int | None = None