From dad4d884704901c8a9d3cfe97fcf0c9a9a11ecd0 Mon Sep 17 00:00:00 2001 From: Timothy Luong Date: Wed, 4 Sep 2024 17:21:00 -0700 Subject: [PATCH 1/4] Upgrading the message cutoff for Cartesia Synthesizer to use timestamps --- .../synthesizer/cartesia_synthesizer.py | 49 +++++++++++++++---- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py index 959ca5ba5e..bb7cfa8755 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -90,6 +90,8 @@ def __init__( self.client = self.cartesia_tts(api_key=self.api_key) self.ws = None self.ctx = None + self.ctx_message = BaseMessage(text="") + self.ctx_timestamps = [] self.no_more_inputs_task = None self.no_more_inputs_lock = asyncio.Lock() @@ -99,10 +101,14 @@ async def initialize_ws(self): async def initialize_ctx(self, is_first_text_chunk: bool): if self.ctx is None or self.ctx.is_closed(): + self.ctx_message = BaseMessage(text="") + self.ctx_timestamps = [] if self.ws: self.ctx = self.ws.context() else: if is_first_text_chunk: + self.ctx_message = BaseMessage(text="") + self.ctx_timestamps = [] if self.no_more_inputs_task: self.no_more_inputs_task.cancel() await self.ctx.no_more_inputs() @@ -144,6 +150,7 @@ async def create_speech_uncached( voice_id=self.voice_id, continue_=not is_sole_text_chunk, output_format=self.output_format, + add_timestamps=True, _experimental_voice_controls=self._experimental_voice_controls, ) if not is_sole_text_chunk: @@ -159,12 +166,20 @@ async def chunk_generator(context): try: async for event in context.receive(): audio = event.get("audio") - buffer.extend(audio) - while len(buffer) >= chunk_size: - yield SynthesisResult.ChunkResult( - chunk=buffer[:chunk_size], is_last_chunk=False - ) - buffer = buffer[chunk_size:] + word_timestamps = event.get("word_timestamps") + if word_timestamps: + words = word_timestamps['words'] + start_times = word_timestamps['start'] + end_times = word_timestamps['end'] + for word, start, end in zip(words, start_times, end_times): + self.ctx_timestamps.append((word, start, end)) + if audio: + buffer.extend(audio) + while len(buffer) >= chunk_size: + yield SynthesisResult.ChunkResult( + chunk=buffer[:chunk_size], is_last_chunk=False + ) + buffer = buffer[chunk_size:] except Exception as e: logger.info( f"Caught error while receiving audio chunks from CartesiaSynthesizer: {e}" @@ -180,11 +195,27 @@ async def chunk_generator(context): buffer.extend(b"\x00\x00" * padding_size) # 0 is silence in s16le yield SynthesisResult.ChunkResult(chunk=buffer, is_last_chunk=True) + self.ctx_message.text += transcript + + def get_message_cutoff_ctx(message, seconds, words_per_minute=150): + if seconds: + closest_index = 0 + if len(self.ctx_timestamps) > 0: + for index, word_timestamp in enumerate(self.ctx_timestamps): + _word, start, end = word_timestamp + closest_index = index + if end >= seconds: + break + if closest_index: + # Check if they're less than 2 seconds apart, fall back to words per minute otherwise + if self.ctx_timestamps[closest_index][2] - seconds < 2: + return " ".join([word for word, *_ in self.ctx_timestamps[:closest_index + 1]]) + return self.get_message_cutoff_from_voice_speed(message, seconds, words_per_minute) + + return SynthesisResult( chunk_generator=chunk_generator(self.ctx), - get_message_up_to=lambda seconds: self.get_message_cutoff_from_voice_speed( - message, seconds - ), + get_message_up_to=lambda seconds: get_message_cutoff_ctx(self.ctx_message, seconds), ) @classmethod From 19b2070dd613bbc1aaa3ddfd5723c8796a31f22e Mon Sep 17 00:00:00 2001 From: Timothy Luong Date: Thu, 5 Sep 2024 11:11:39 -0700 Subject: [PATCH 2/4] Linting --- vocode/streaming/synthesizer/cartesia_synthesizer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py index bb7cfa8755..d1661eb226 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -168,9 +168,9 @@ async def chunk_generator(context): audio = event.get("audio") word_timestamps = event.get("word_timestamps") if word_timestamps: - words = word_timestamps['words'] - start_times = word_timestamps['start'] - end_times = word_timestamps['end'] + words = word_timestamps["words"] + start_times = word_timestamps["start"] + end_times = word_timestamps["end"] for word, start, end in zip(words, start_times, end_times): self.ctx_timestamps.append((word, start, end)) if audio: @@ -209,10 +209,11 @@ def get_message_cutoff_ctx(message, seconds, words_per_minute=150): if closest_index: # Check if they're less than 2 seconds apart, fall back to words per minute otherwise if self.ctx_timestamps[closest_index][2] - seconds < 2: - return " ".join([word for word, *_ in self.ctx_timestamps[:closest_index + 1]]) + return " ".join( + [word for word, *_ in self.ctx_timestamps[: closest_index + 1]] + ) return self.get_message_cutoff_from_voice_speed(message, seconds, words_per_minute) - return SynthesisResult( chunk_generator=chunk_generator(self.ctx), get_message_up_to=lambda seconds: get_message_cutoff_ctx(self.ctx_message, seconds), From 86a9febb0c8427e1e3512d68fe7c683e075ff80d Mon Sep 17 00:00:00 2001 From: Timothy Luong Date: Thu, 5 Sep 2024 11:23:26 -0700 Subject: [PATCH 3/4] Adding typing to --- vocode/streaming/synthesizer/cartesia_synthesizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py index d1661eb226..b5d42d7c1b 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -2,6 +2,7 @@ import hashlib from loguru import logger +from typing import List, Tuple from vocode import getenv from vocode.streaming.models.audio import AudioEncoding, SamplingRate @@ -91,7 +92,7 @@ def __init__( self.ws = None self.ctx = None self.ctx_message = BaseMessage(text="") - self.ctx_timestamps = [] + self.ctx_timestamps: List[Tuple[str, float, float]] = [] self.no_more_inputs_task = None self.no_more_inputs_lock = asyncio.Lock() From 57b52dc5c494f34e4d87e3215eb33de9734b6aef Mon Sep 17 00:00:00 2001 From: Ajay Raj Date: Fri, 6 Sep 2024 15:34:36 -0700 Subject: [PATCH 4/4] fix lint --- vocode/streaming/synthesizer/cartesia_synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py index b5d42d7c1b..e9387253ef 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -1,8 +1,8 @@ import asyncio import hashlib +from typing import List, Tuple from loguru import logger -from typing import List, Tuple from vocode import getenv from vocode.streaming.models.audio import AudioEncoding, SamplingRate