# Source code for pipecat.workers.ui.ui_tools
#
# Copyright (c) 2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
"""Opt-in tool mixin for ``UIWorker``.
Ships ``ReplyToolMixin``: a single bundled ``reply`` tool (a required spoken
answer plus the standard UI actions) covering the common app shapes, for
subclasses that don't need a custom tool schema. See the class for details.
"""
from loguru import logger
from pipecat.services.llm_service import FunctionCallParams
from pipecat.workers.llm.tool_decorator import tool
[docs]
class ReplyToolMixin:
    """Expose a ``reply`` tool covering the full standard action set.

    Single bundled LLM tool with a required spoken ``answer`` plus
    optional visual and state-changing actions. One tool call per
    turn, no chaining; the required ``answer`` argument is enforced
    by the API schema so the model cannot omit the terminator.

    Compose alongside ``UIWorker``::

        class MyUIWorker(ReplyToolMixin, UIWorker):
            ...

    Covers pointing apps (``scroll_to`` + ``highlight``), reading
    apps (``scroll_to`` + ``select_text``), form apps (``fills`` +
    ``click``), and any blend (e.g. a document review with
    selection-based deixis AND voice-driven note-taking). The LLM
    uses whichever fields fit the user's request per turn; unused
    fields stay ``null`` and don't affect behavior.

    Delivers ``answer`` as verbatim TTS
    (``respond_to_job(answer, tts_speak=True)``) -- the worker speaks
    the exact phrase. Apps that want a minimal schema (only the fields
    actually used, or app-specific commands), or that want the
    requester's voice LLM to phrase the reply instead, write their own
    ``@tool reply`` on the ``UIWorker`` subclass directly. Use the
    helper methods on ``UIWorker`` plus ``send_command`` to dispatch the
    underlying UI commands.

    The host class must provide ``scroll_to``, ``highlight``,
    ``select_text``, ``click``, ``set_input_value``, and
    ``respond_to_job`` (``UIWorker`` does) and must be the target of
    ``@tool`` discovery on the LLM pipeline.
    """

    # NOTE(review): the docstring below is presumably what ``@tool`` turns
    # into the LLM-facing tool description -- confirm before rewording it.
    @tool
    async def reply(
        self,
        params: FunctionCallParams,
        answer: str,
        scroll_to: str | None = None,
        highlight: list[str] | None = None,
        select_text: str | None = None,
        fills: list[dict] | None = None,
        click: list[str] | None = None,
    ):
        """Reply to the user. Optionally point at content and act on inputs.

        Always called exactly once per turn. ``answer`` is required;
        the action fields are optional and may be combined.

        Visual / pointing actions (draw the user's attention):

        - ``scroll_to`` brings an element into view (single ref).
        - ``highlight`` flashes elements briefly (list of refs).
          Best for short emphasis like a button or a fact.
        - ``select_text`` puts the page's text selection on an
          element (single ref). Best for "this paragraph" / "the
          section about X" so the user sees exactly what was meant.
          Persists until the user clicks elsewhere.

        State-changing actions (modify form / app state):

        - ``fills`` writes values into inputs (list of
          ``{"ref", "value"}`` objects, multi-fill in one turn).
        - ``click`` clicks elements (list of refs in order). Use for
          checkboxes, radios, submit buttons.

        Order of dispatch within a turn: ``scroll_to``, then
        ``highlight``, then ``select_text``, then ``fills``, then
        ``click``, then speak the answer.

        Args:
            params: Framework-provided tool invocation context.
            answer: The spoken reply in plain language. One short
                sentence. No markdown, no symbols.
            scroll_to: Optional snapshot ref. Scrolls the element
                into view before speaking.
            highlight: Optional list of snapshot refs. Visually
                pulses each element.
            select_text: Optional snapshot ref. Places the page's
                text selection on that element.
            fills: Optional list of ``{"ref": "eN", "value": "..."}``
                objects. Writes each value into the input at ``ref``.
            click: Optional list of snapshot refs to click in order.
        """
        # Log a bounded preview so a long answer does not flood the debug log.
        preview = (answer or "").strip()
        if len(preview) > 80:
            preview = preview[:80] + "…"
        logger.debug(
            f"{self}: reply(answer={preview!r}, scroll_to={scroll_to!r}, "
            f"highlight={highlight!r}, select_text={select_text!r}, "
            f"fills={fills!r}, click={click!r})"
        )

        # Defensive guards on every action argument: an LLM that emits a
        # malformed value would crash the tool body before respond_to_job
        # fires, leaving the single-flight lock held until the requester's
        # timeout cancels us. That covers both a bad *entry* (None, a number)
        # and a bad *container* (a bare ref string instead of a list, a single
        # fill object instead of a list, a number). Skip or coerce
        # non-conforming input instead of raising.
        if scroll_to and isinstance(scroll_to, str):
            await self.scroll_to(scroll_to)  # type: ignore[attr-defined]

        for ref in _reply_refs(highlight):
            await self.highlight(ref)  # type: ignore[attr-defined]

        if select_text and isinstance(select_text, str):
            await self.select_text(select_text)  # type: ignore[attr-defined]

        for ref, value in _reply_fills(fills):
            await self.set_input_value(ref, value)  # type: ignore[attr-defined]

        for ref in _reply_refs(click):
            await self.click(ref)  # type: ignore[attr-defined]

        # Speak the answer last, then complete the tool call with no result.
        await self.respond_to_job(answer, tts_speak=True)  # type: ignore[attr-defined]
        await params.result_callback(None)


def _reply_refs(value: object) -> list[str]:
    """Normalize a ``highlight`` / ``click`` argument to a list of string refs.

    Args:
        value: The raw tool argument. Expected to be a list of refs, but may
            be ``None``, a bare ref string, or some other malformed value.

    Returns:
        The string entries of ``value`` in their original order. A bare
        non-empty string is treated as a single ref (iterating it would yield
        one bogus ref per character); any other non-list value yields ``[]``.
    """
    if isinstance(value, str):
        return [value] if value else []
    if not isinstance(value, (list, tuple)):
        return []
    return [ref for ref in value if isinstance(ref, str)]


def _reply_fills(value: object) -> list[tuple[str, str]]:
    """Normalize a ``fills`` argument to ``(ref, value)`` string pairs.

    Args:
        value: The raw tool argument. Expected to be a list of
            ``{"ref": ..., "value": ...}`` dicts, but may be ``None``, a
            single such dict, or some other malformed value.

    Returns:
        One ``(ref, str(value))`` pair per well-formed entry, in order.
        Entries that are not dicts, whose ``ref`` is not a string, or whose
        ``value`` is ``None`` are skipped. A single dict is treated as a
        one-element list; any other non-list value yields ``[]``.
    """
    if isinstance(value, dict):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    pairs: list[tuple[str, str]] = []
    for entry in value:
        if not isinstance(entry, dict):
            continue
        ref = entry.get("ref")
        fill_value = entry.get("value")
        if not isinstance(ref, str) or fill_value is None:
            continue
        pairs.append((ref, str(fill_value)))
    return pairs