Source code for pipecat.utils.text.word_timestamp_utils

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Utilities for normalizing word-timestamp streams from TTS services."""

import re



[docs]
def merge_punct_tokens(
    word_times: list[tuple[str, float]],
) -> list[tuple[str, float]]:
    """Fold punctuation-only and whitespace-only tokens into the word before them.

    Certain TTS services (Inworld, for example) report spaces and punctuation
    as their own word-timestamp entries instead of gluing them onto the
    neighbouring word.  This helper normalizes such a stream so consumers see
    words that already carry their trailing punctuation, the same shape that
    ElevenLabs or Cartesia produce.

    Classification works on the token text with XML/HTML tags removed: if
    nothing alphanumeric remains, the token is "punct/space-only".  The
    *original* text of such a token (tags included) is concatenated onto the
    previous kept word, whose timestamp is retained; the token's own timestamp
    is dropped.  Punct/space-only tokens that appear before any real word are
    thrown away.  Finally, leading and trailing whitespace is removed from
    every resulting word.

    Args:
        word_times: ``(word, timestamp)`` pairs exactly as emitted by the TTS
            service.

    Returns:
        A new list of ``(word, timestamp)`` pairs in which each word has at
        least one alphanumeric character and no surrounding whitespace.

    Example::

        merge_punct_tokens([("questions", 1.0), (", ", 1.2), ("explain", 1.4)])
        # → [("questions,", 1.0), ("explain", 1.4)]
    """
    # Parallel accumulators: texts[i] is the (possibly extended) word and
    # stamps[i] is the timestamp of the real word that started it.
    texts: list[str] = []
    stamps: list[float] = []

    for token, stamp in word_times:
        # Tags must not count as word content, so judge the tag-free text.
        visible = re.sub(r"<[^>]+>", "", token)

        if any(map(str.isalnum, visible)):
            # A real word: starts a new output entry with its own timestamp.
            texts.append(token)
            stamps.append(stamp)
        elif texts:
            # Punct/space-only: glue the raw token onto the latest word.
            texts[-1] += token
        # Otherwise nothing precedes this punct/space token, so it is dropped.

    return [(text.strip(), stamp) for text, stamp in zip(texts, stamps)]