Source code for pipecat.services.groq.tts

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Groq text-to-speech service implementation."""

import io
import wave
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field
from typing import Literal, cast

from loguru import logger
from pydantic import BaseModel

from pipecat.frames.frames import (
    ErrorFrame,
    Frame,
    TTSAudioRawFrame,
)
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
from pipecat.services.tts_service import TTSService
from pipecat.transcriptions.language import Language
from pipecat.utils.deprecation import deprecated
from pipecat.utils.tracing.service_decorators import traced_tts

try:
    from groq import AsyncGroq
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error('In order to use Groq, you need to `uv add "pipecat-ai[groq]"`.')
    raise ImportError(f"Missing module: {e}") from e

# Hint set for `output_format`. The values mirror the Literal that
# `groq.resources.audio.speech.AsyncSpeech.create` accepts on its
# `response_format` parameter (also visible as the `response_format` field of
# `groq.types.audio.SpeechCreateParams`). The groq SDK does not export this as
# a named alias, so we redeclare it here.
#
# This alias is used in unions like `GroqAudioFormat | str`, so pyright shows
# these values as completion hints without rejecting other strings. If groq
# adds a new format before this list is updated, callers can still pass it and
# we forward it through (with a cast at the API boundary). Keep in sync on a
# best-effort basis when bumping the groq dep.
#
# NOTE(review): `GroqTTSService.run_tts` decodes the response with the stdlib
# `wave` module, so formats other than "wav" are forwarded to the API but
# presumably fail at decode time and surface as an ErrorFrame -- confirm.
GroqAudioFormat = Literal["flac", "mp3", "mulaw", "ogg", "wav"]



[docs]
@dataclass
class GroqTTSSettings(TTSSettings):
    """Settings for GroqTTSService.

    Adds a single Groq-specific field on top of ``TTSSettings``.

    Parameters:
        speed: Speech speed multiplier. The field itself defaults to the
            ``NOT_GIVEN`` sentinel; ``GroqTTSService.__init__`` seeds its
            default settings with 1.0. ``GroqTTSService.run_tts`` raises a
            ``ValueError`` (reported as an ``ErrorFrame``) when the resolved
            value is ``None``.
    """

    # default_factory hands back the shared NOT_GIVEN sentinel, so an instance
    # built without `speed` carries NOT_GIVEN rather than None or a number.
    speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)




[docs]
class GroqTTSService(TTSService):
    """Groq text-to-speech service implementation.

    Provides text-to-speech synthesis using Groq's TTS API. The service
    operates at a fixed 48kHz sample rate and supports various voices
    and output formats.

    Each ``run_tts`` call sends one request through an ``AsyncGroq`` client
    and emits the decoded audio as ``TTSAudioRawFrame`` objects, or an
    ``ErrorFrame`` when the request or decoding fails.
    """

    # Exposes the settings dataclass under the service's namespace so callers
    # can write ``GroqTTSService.Settings(...)``.
    Settings = GroqTTSSettings
    # Annotation only: narrows the type of `_settings` for type checkers. The
    # attribute is not assigned in this class -- presumably the base class sets
    # it from the `settings` passed to `super().__init__` (TODO confirm).
    _settings: Settings


[docs]
    @deprecated(
        "`GroqTTSService.InputParams` is deprecated since 0.0.105 and will be removed in 2.0.0. "
        "Use `GroqTTSService.Settings` instead."
    )
    class InputParams(BaseModel):
        """Input parameters for Groq TTS configuration.

        .. deprecated:: 0.0.105
            Use ``settings=GroqTTSService.Settings(...)`` instead.
            Will be removed in 2.0.0.

        Still accepted through the ``params`` argument of
        ``GroqTTSService.__init__``, which copies these values into the
        service settings only when no ``settings`` object is supplied.

        Parameters:
            language: Language for speech synthesis. Defaults to English.
                A falsy value makes ``__init__`` fall back to ``"en"``.
            speed: Speech speed multiplier. Defaults to 1.0.
        """

        language: Language | None = Language.EN
        speed: float | None = 1.0


    # Sample rate in Hz used as the default for `__init__`'s `sample_rate`;
    # `__init__` logs a warning when a different value is passed.
    GROQ_SAMPLE_RATE = 48000  # Groq TTS only supports 48kHz sample rate


[docs]
    def __init__(
        self,
        *,
        api_key: str,
        output_format: GroqAudioFormat | str = "wav",
        params: InputParams | None = None,
        model_name: str | None = None,
        voice_id: str | None = None,
        sample_rate: int | None = GROQ_SAMPLE_RATE,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Create a Groq TTS service.

        The effective settings are built in layers: hardcoded defaults first,
        then the deprecated ``model_name`` / ``voice_id`` arguments, then the
        deprecated ``params`` object (skipped when ``settings`` is supplied),
        and finally ``settings`` itself.

        Args:
            api_key: Groq API key for authentication.
            output_format: Audio output format. Defaults to "wav".
            params: Additional input parameters for voice customization.

                .. deprecated:: 0.0.105
                    Use ``settings=GroqTTSService.Settings(...)`` instead.
                    Will be removed in 2.0.0.

            model_name: TTS model to use.

                .. deprecated:: 0.0.105
                    Use ``settings=GroqTTSService.Settings(model=...)`` instead.
                    Will be removed in 2.0.0.

            voice_id: Voice identifier to use.

                .. deprecated:: 0.0.105
                    Use ``settings=GroqTTSService.Settings(voice=...)`` instead.
                    Will be removed in 2.0.0.

            sample_rate: Audio sample rate. Must be 48000 Hz for Groq TTS; any
                other value only triggers a warning and is still forwarded to
                the parent class.
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to parent TTSService class.
        """
        supported_rate = self.GROQ_SAMPLE_RATE
        if sample_rate != supported_rate:
            logger.warning(f"Groq TTS only supports {supported_rate}Hz sample rate. ")

        # Layer 1: hardcoded baseline.
        resolved = self.Settings(
            model="canopylabs/orpheus-v1-english",
            voice="autumn",
            language="en",
            speed=1.0,
        )

        # Layer 2: deprecated direct arguments, each mapped onto its settings field.
        legacy_args = (
            ("model_name", "model", model_name),
            ("voice_id", "voice", voice_id),
        )
        for arg_name, field_name, value in legacy_args:
            if value is not None:
                self._warn_init_param_moved_to_settings(arg_name, field_name)
                setattr(resolved, field_name, value)

        # Layer 3: deprecated `params` object; it always warns, but its values
        # are only copied over when no `settings` object was passed.
        if params is not None:
            self._warn_init_param_moved_to_settings("params")
            if not settings:
                if params.language:
                    resolved.language = str(params.language)
                else:
                    resolved.language = "en"
                resolved.speed = params.speed

        # Layer 4: the canonical `settings` delta is applied last, so it wins.
        if settings is not None:
            resolved.apply_update(settings)

        super().__init__(
            settings=resolved,
            sample_rate=sample_rate,
            push_stop_frames=True,
            push_start_frame=True,
            pause_frame_processing=True,
            **kwargs,
        )

        self._output_format: str = output_format
        self._api_key = api_key
        self._client = AsyncGroq(api_key=api_key)



[docs]
    def can_generate_metrics(self) -> bool:
        """Report whether this service is able to produce processing metrics.

        Returns:
            Always True: the Groq TTS service supports metrics generation.
        """
        return True



[docs]
    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Groq's TTS API.

        Requests synthesis of ``text`` with the model, voice and speed held in
        the service settings, buffers the complete response body, decodes it
        as a WAV file and emits the audio payload as a single frame.

        Args:
            text: The text to synthesize into speech.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: A ``TTSAudioRawFrame`` with the decoded audio, or an
            ``ErrorFrame`` when a setting is missing, the request fails, or
            the response cannot be decoded as WAV. Nothing is yielded when
            the response body is empty.
        """
        measuring_ttfb = True
        try:
            model = assert_given(self._settings.model)
            voice = assert_given(self._settings.voice)
            speed = assert_given(self._settings.speed)
            if model is None:
                raise ValueError("Groq TTS model must be specified")
            if voice is None:
                raise ValueError("Groq TTS voice must be specified")
            if speed is None:
                raise ValueError("Groq TTS speed must be specified")
            response = await self._client.audio.speech.create(
                model=model,
                voice=voice,
                # Cast satisfies groq's stricter Literal typing while letting
                # callers pass any string (e.g. a newer groq format we haven't
                # yet added to GroqAudioFormat). If the value is unsupported,
                # groq's API will surface a runtime error with a clear message.
                response_format=cast(GroqAudioFormat, self._output_format),
                # Note: as of 2026-02-25, only a speed of 1.0 is supported, but
                # here we pass it for completeness and future-proofing
                speed=speed,
                input=text,
            )

            # `iter_bytes()` is not guaranteed to deliver the whole body in one
            # chunk, and only the start of the body carries the RIFF/WAVE
            # header. Buffer everything first and decode once; parsing each
            # chunk as its own WAV file fails as soon as the body is split.
            wav_data = bytearray()
            async for chunk in response.iter_bytes():
                if measuring_ttfb:
                    # First bytes have arrived: stop the time-to-first-byte metric.
                    await self.stop_ttfb_metrics()
                    measuring_ttfb = False
                wav_data += chunk

            if not wav_data:
                # Nothing to decode; emit no frame (an empty body never produced
                # one) but leave a trace in the logs.
                logger.warning("Groq TTS returned no audio data")
                return

            with wave.open(io.BytesIO(wav_data), "rb") as wav_reader:
                channels = wav_reader.getnchannels()
                frame_rate = wav_reader.getframerate()
                audio = wav_reader.readframes(wav_reader.getnframes())

            # Yield after the reader is closed so the generator never suspends
            # while holding it open.
            yield TTSAudioRawFrame(audio, frame_rate, channels, context_id=context_id)
        except Exception as e:
            # Service boundary: report any failure downstream as an ErrorFrame
            # instead of raising out of the generator.
            yield ErrorFrame(error=f"Unknown error occurred: {e}")