# Source code for pipecat.workers.ui.ui_tools

#
# Copyright (c) 2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Opt-in tool mixin for ``UIWorker``.

Ships ``ReplyToolMixin``: a single bundled ``reply`` tool (a required spoken
answer plus the standard UI actions) covering the common app shapes, for
subclasses that don't need a custom tool schema. See the class for details.
"""

from loguru import logger

from pipecat.services.llm_service import FunctionCallParams
from pipecat.workers.llm.tool_decorator import tool



# [docs]
class ReplyToolMixin:
    """Expose a ``reply`` tool covering the full standard action set.

    Single bundled LLM tool with a required spoken ``answer`` plus
    optional visual and state-changing actions. One tool call per
    turn, no chaining; the required ``answer`` argument is enforced
    by the API schema so the model cannot omit the terminator.

    Compose alongside ``UIWorker``::

        class MyUIWorker(ReplyToolMixin, UIWorker):
            ...

    Covers pointing apps (``scroll_to`` + ``highlight``), reading
    apps (``scroll_to`` + ``select_text``), form apps (``fills`` +
    ``click``), and any blend (e.g. a document review with
    selection-based deixis AND voice-driven note-taking). The LLM
    uses whichever fields fit the user's request per turn; unused
    fields stay ``null`` and don't affect behavior.

    Delivers ``answer`` as verbatim TTS
    (``respond_to_job(answer, tts_speak=True)``) -- the worker speaks
    the exact phrase. Apps that want a minimal schema (only the fields
    actually used, or app-specific commands), or that want the
    requester's voice LLM to phrase the reply instead, write their own
    ``@tool reply`` on the ``UIWorker`` subclass directly. Use the
    helper methods on ``UIWorker`` plus ``send_command`` to dispatch the
    underlying UI commands.

    The host class must provide ``scroll_to``, ``highlight``,
    ``select_text``, ``click``, ``set_input_value``, and
    ``respond_to_job`` (``UIWorker`` does) and must be the target of
    ``@tool`` discovery on the LLM pipeline.

    Malformed action arguments never raise from the argument handling
    in ``reply``: a lone ref or a lone fill object is treated as a
    one-element list, and anything else that is not a usable value
    (wrong container type, non-iterable, non-string ref) is skipped so
    that ``respond_to_job`` is still reached.
    """

    @staticmethod
    def _coerce_reply_items(value: object, scalar_type: type) -> list:
        """Normalize a list-valued tool argument into a plain list.

        Args:
            value: The raw argument as supplied by the tool call. May be
                ``None``, a proper list, a lone item, or garbage.
            scalar_type: The type of a single well-formed item (``str``
                for refs, ``dict`` for fills). A lone instance of it is
                wrapped into a one-element list.

        Returns:
            A list to iterate over. Entries are NOT validated here; the
            caller still checks each one. Returns ``[]`` for falsy
            values, for strings/bytes/dicts that are not a lone
            ``scalar_type`` item, and for non-iterables.
        """
        if not value:
            return []
        if isinstance(value, scalar_type):
            # e.g. highlight="e5" instead of ["e5"]: iterating the string
            # would dispatch one bogus ref per character.
            return [value]
        if isinstance(value, (str, bytes, dict)):
            # Wrong container shape: iterating would yield characters,
            # ints, or dict keys rather than items.
            return []
        try:
            return list(value)  # type: ignore[call-overload]
        except TypeError:
            # Truthy non-iterable (number, bool, ...): nothing to dispatch.
            return []

    # NOTE: the ``reply`` docstring below is kept verbatim -- presumably
    # ``@tool`` exposes it to the LLM as the tool description (TODO confirm),
    # so editing it would change model-facing behavior.
    @tool
    async def reply(
        self,
        params: FunctionCallParams,
        answer: str,
        scroll_to: str | None = None,
        highlight: list[str] | None = None,
        select_text: str | None = None,
        fills: list[dict] | None = None,
        click: list[str] | None = None,
    ):
        """Reply to the user. Optionally point at content and act on inputs.

        Always called exactly once per turn. ``answer`` is required;
        the action fields are optional and may be combined.

        Visual / pointing actions (draw the user's attention):

        - ``scroll_to`` brings an element into view (single ref).
        - ``highlight`` flashes elements briefly (list of refs).
          Best for short emphasis like a button or a fact.
        - ``select_text`` puts the page's text selection on an
          element (single ref). Best for "this paragraph" / "the
          section about X" so the user sees exactly what was meant.
          Persists until the user clicks elsewhere.

        State-changing actions (modify form / app state):

        - ``fills`` writes values into inputs (list of
          ``{"ref", "value"}`` objects, multi-fill in one turn).
        - ``click`` clicks elements (list of refs in order). Use for
          checkboxes, radios, submit buttons.

        Order of dispatch within a turn: ``scroll_to``, then
        ``highlight``, then ``select_text``, then ``fills``, then
        ``click``, then speak the answer.

        Args:
            params: Framework-provided tool invocation context.
            answer: The spoken reply in plain language. One short
                sentence. No markdown, no symbols.
            scroll_to: Optional snapshot ref. Scrolls the element
                into view before speaking.
            highlight: Optional list of snapshot refs. Visually
                pulses each element.
            select_text: Optional snapshot ref. Places the page's
                text selection on that element.
            fills: Optional list of ``{"ref": "eN", "value": "..."}``
                objects. Writes each value into the input at ``ref``.
            click: Optional list of snapshot refs to click in order.
        """
        # Truncated copy of the answer, used only for the debug log line.
        preview = (answer or "").strip()
        if len(preview) > 80:
            preview = preview[:80] + "…"
        logger.debug(
            f"{self}: reply(answer={preview!r}, scroll_to={scroll_to!r}, "
            f"highlight={highlight!r}, select_text={select_text!r}, "
            f"fills={fills!r}, click={click!r})"
        )
        # Defensive guards on every action argument: an LLM that emits a
        # malformed value (a bare string or number where a list is
        # expected, a list where a single ref is expected, a None or
        # non-dict entry, etc.) would otherwise crash the tool body -- or
        # dispatch bogus per-character refs -- before respond_to_job
        # fires, leaving the single-flight lock held until the
        # requester's timeout cancels us. Skip non-conforming values
        # instead.
        if isinstance(scroll_to, str) and scroll_to:
            await self.scroll_to(scroll_to)  # type: ignore[attr-defined]
        for ref in self._coerce_reply_items(highlight, str):
            if isinstance(ref, str):
                await self.highlight(ref)  # type: ignore[attr-defined]
        if isinstance(select_text, str) and select_text:
            await self.select_text(select_text)  # type: ignore[attr-defined]
        for entry in self._coerce_reply_items(fills, dict):
            if not isinstance(entry, dict):
                continue
            ref = entry.get("ref")
            value = entry.get("value")
            # A fill needs a string ref and some value; values are
            # stringified before being written into the input.
            if not isinstance(ref, str) or value is None:
                continue
            await self.set_input_value(ref, str(value))  # type: ignore[attr-defined]
        for ref in self._coerce_reply_items(click, str):
            if isinstance(ref, str):
                await self.click(ref)  # type: ignore[attr-defined]
        # Speak the answer verbatim, then complete the tool call with no
        # result payload.
        await self.respond_to_job(answer, tts_speak=True)  # type: ignore[attr-defined]
        await params.result_callback(None)