Source code for pipecat.utils.text.word_timestamp_utils

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Utilities for normalizing word-timestamp streams from TTS services."""

import re


def merge_punct_tokens(
    word_times: list[tuple[str, float]],
) -> list[tuple[str, float]]:
    """Fold punctuation-only and whitespace-only tokens into the word before them.

    Certain TTS services (Inworld, for instance) report spaces and punctuation
    as standalone word-timestamp tokens instead of gluing them onto the
    neighbouring word. This helper normalizes such a stream so that consumers
    receive words with their trailing punctuation already attached, matching
    the shape produced by ElevenLabs or Cartesia.

    A token counts as punctuation/space-only when, once XML/HTML tags have
    been removed, none of its characters is alphanumeric. The *original* token
    text (tags included) is concatenated onto the previous kept word, and that
    word's timestamp is retained; the merged token's own timestamp is dropped.
    Punctuation/space-only tokens that appear before any real word are dropped
    entirely.

    After merging, every entry has leading and trailing whitespace removed.
    Whitespace in the interior of a merged entry is left as-is.

    Args:
        word_times: Raw ``(word, timestamp)`` pairs as received from the TTS
            service.

    Returns:
        A new list of ``(word, timestamp)`` pairs in which each entry holds at
        least one alphanumeric character and carries no surrounding
        whitespace.

    Example::

        merge_punct_tokens([("questions", 1.0), (", ", 1.2), ("explain", 1.4)])
        # -> [("questions,", 1.0), ("explain", 1.4)]
    """
    # Compiled once up front; only used to decide whether a token is a real
    # word, never to alter the text that ends up in the output.
    tag_pattern = re.compile(r"<[^>]+>")

    collapsed: list[tuple[str, float]] = []
    for token, timestamp in word_times:
        visible_text = tag_pattern.sub("", token)

        if any(map(str.isalnum, visible_text)):
            # A genuine word: it starts a new output entry with its own time.
            collapsed.append((token, timestamp))
            continue

        if not collapsed:
            # Punctuation/space ahead of the first word has nothing to attach to.
            continue

        # Glue the raw token onto the previous word, keeping that word's time.
        anchor_text, anchor_ts = collapsed[-1]
        collapsed[-1] = (anchor_text + token, anchor_ts)

    # Trim only the outer whitespace of each finished entry.
    result: list[tuple[str, float]] = []
    for text, timestamp in collapsed:
        result.append((text.strip(), timestamp))
    return result