Skip to content

Commit

Permalink
introduce PipelineParams audio input/output sample rates
Browse files Browse the repository at this point in the history
  • Loading branch information
aconchillo committed Feb 4, 2025
1 parent cc54255 commit ab45e48
Show file tree
Hide file tree
Showing 61 changed files with 570 additions and 402 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added new fields to `PipelineParams` to control audio input and output sample
rates for the whole pipeline. This allows controlling sample rates from a
single place instead of having to specify sample rates in each
service. Setting a sample rate on a service is still possible and will
override the value from `PipelineParams`.

- Introduce audio resamplers (`BaseAudioResampler`). This is just a base class
to implement audio resamplers. Currently, two implementations are provided:
`SOXRAudioResampler` and `ResampyResampler`. A new
Expand Down
18 changes: 10 additions & 8 deletions examples/bot-ready-signalling/server/signalling_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pipecat.frames.frames import AudioRawFrame, EndFrame, OutputAudioRawFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport

Expand All @@ -31,16 +31,15 @@
class SilenceFrame(OutputAudioRawFrame):
def __init__(
self,
audio: bytes = None,
sample_rate: int = 16000,
num_channels: int = 1,
duration: float = 0.1,
*,
sample_rate: int,
duration: float,
):
# Initialize the parent class with the silent frame's data
super().__init__(
audio=self.create_silent_audio_frame(sample_rate, num_channels, duration).audio,
audio=self.create_silent_audio_frame(sample_rate, 1, duration).audio,
sample_rate=sample_rate,
num_channels=num_channels,
num_channels=1,
)

@staticmethod
Expand Down Expand Up @@ -80,7 +79,10 @@ async def on_app_message(transport, message, sender):
return
await task.queue_frames(
[
SilenceFrame(duration=0.5),
SilenceFrame(
sample_rate=task.params.audio_out_sample_rate,
duration=0.5,
),
TTSSpeakFrame(f"Hello there, how are you doing today ?"),
EndFrame(),
]
Expand Down
1 change: 0 additions & 1 deletion examples/foundational/07g-interruptible-openai-tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ async def main():
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
Expand Down
1 change: 0 additions & 1 deletion examples/foundational/07k-interruptible-lmnt.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ async def main():
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
Expand Down
1 change: 0 additions & 1 deletion examples/foundational/07n-interruptible-google.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ async def main():
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
Expand Down
7 changes: 4 additions & 3 deletions examples/foundational/09-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.services.daily import DailyParams, DailyTransport

Expand Down Expand Up @@ -61,7 +61,6 @@ async def main():
"Test",
DailyParams(
audio_in_enabled=True,
audio_in_sample_rate=24000,
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_is_live=True,
Expand All @@ -78,7 +77,9 @@ async def on_first_participant_joined(transport, participant):

runner = PipelineRunner()

task = PipelineTask(pipeline)
task = PipelineTask(
pipeline, PipelineParams(audio_in_sample_rate=24000, audio_out_sample_rate=24000)
)

await runner.run(task)

Expand Down
8 changes: 5 additions & 3 deletions examples/foundational/09a-local-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
Expand Down Expand Up @@ -62,7 +62,7 @@ async def main():
tk_root.title("Local Mirror")

daily_transport = DailyTransport(
room_url, token, "Test", DailyParams(audio_in_enabled=True, audio_in_sample_rate=24000)
room_url, token, "Test", DailyParams(audio_in_enabled=True)
)

tk_transport = TkLocalTransport(
Expand All @@ -82,7 +82,9 @@ async def on_first_participant_joined(transport, participant):

pipeline = Pipeline([daily_transport.input(), MirrorProcessor(), tk_transport.output()])

task = PipelineTask(pipeline)
task = PipelineTask(
pipeline, PipelineParams(audio_in_sample_rate=24000, audio_out_sample_rate=24000)
)

async def run_tk():
while not task.has_finished():
Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/18-gstreamer-filesrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ async def main():
out_params=GStreamerPipelineSource.OutputParams(
video_width=1280,
video_height=720,
audio_sample_rate=24000,
audio_channels=1,
),
)

Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/19-openai-realtime-beta.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ async def main():
"Respond bot",
DailyParams(
audio_in_enabled=True,
audio_in_sample_rate=24000,
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,7 @@ async def main():
"Respond bot",
DailyParams(
audio_in_enabled=True,
audio_in_sample_rate=24000,
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
Expand Down
4 changes: 4 additions & 0 deletions examples/foundational/21-tavus-layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ async def main():
task = PipelineTask(
pipeline,
PipelineParams(
# We just use 16000 because that's what Tavus is expecting and
# we avoid resampling.
audio_in_sample_rate=16000,
audio_out_sample_rate=16000,
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,6 @@ async def main():
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
audio_in_sample_rate=16000,
),
)

Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/26-gemini-multimodal-live.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
4 changes: 0 additions & 4 deletions examples/foundational/26c-gemini-multimodal-live-video.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand All @@ -47,8 +45,6 @@ async def main():
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
start_audio_paused=True,
start_video_paused=True,
),
)

Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/26d-gemini-multimodal-live-text.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
7 changes: 0 additions & 7 deletions examples/foundational/29-livekit-audio-chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

DESIRED_SAMPLE_RATE = 16000


def generate_token(room_name: str, participant_name: str, api_key: str, api_secret: str) -> str:
token = api.AccessToken(api_key, api_secret)
Expand Down Expand Up @@ -114,11 +112,8 @@ async def main():
token=token,
room_name=room_name,
params=LiveKitParams(
audio_in_channels=1,
audio_in_enabled=True,
audio_out_enabled=True,
audio_in_sample_rate=DESIRED_SAMPLE_RATE,
audio_out_sample_rate=DESIRED_SAMPLE_RATE,
vad_analyzer=SileroVADAnalyzer(),
vad_enabled=True,
vad_audio_passthrough=True,
Expand All @@ -128,7 +123,6 @@ async def main():
stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"),
live_options=LiveOptions(
sample_rate=DESIRED_SAMPLE_RATE,
vad_events=True,
),
)
Expand All @@ -138,7 +132,6 @@ async def main():
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
sample_rate=DESIRED_SAMPLE_RATE,
)

messages = [
Expand Down
2 changes: 0 additions & 2 deletions examples/simple-chatbot/server/bot-gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,6 @@ async def main():
token,
"Chatbot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_width=1024,
Expand Down
9 changes: 6 additions & 3 deletions examples/studypal/studypal.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ async def main():
token,
"studypal",
DailyParams(
audio_out_sample_rate=44100,
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
Expand All @@ -124,7 +123,6 @@ async def main():
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"),
# British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9
sample_rate=44100,
)

llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini")
Expand Down Expand Up @@ -155,7 +153,12 @@ async def main():
]
)

task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
task = PipelineTask(
pipeline,
PipelineParams(
audio_out_sample_rate=44100, allow_interruptions=True, enable_metrics=True
),
)

@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
Expand Down
26 changes: 10 additions & 16 deletions examples/twilio-chatbot/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import wave

import aiofiles
from deepgram import LiveOptions
from dotenv import load_dotenv
from fastapi import WebSocket
from loguru import logger
Expand All @@ -36,8 +35,6 @@
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

SAMPLE_RATE = 8000


async def save_audio(server_name: str, audio: bytes, sample_rate: int, num_channels: int):
if len(audio) > 0:
Expand All @@ -63,29 +60,21 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):
params=FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
audio_out_sample_rate=SAMPLE_RATE,
add_wav_header=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(sample_rate=SAMPLE_RATE),
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
serializer=TwilioFrameSerializer(
stream_sid, TwilioFrameSerializer.InputParams(sample_rate=SAMPLE_RATE)
),
serializer=TwilioFrameSerializer(stream_sid),
),
)

llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"),
live_options=LiveOptions(sample_rate=SAMPLE_RATE),
audio_passthrough=True,
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"), audio_passthrough=True)

tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
sample_rate=SAMPLE_RATE,
push_silence_after_stop=testing,
)

Expand All @@ -101,7 +90,7 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):

# NOTE: Watch out! This will save all the conversation in memory. You can
# pass `buffer_size` to get periodic callbacks.
audiobuffer = AudioBufferProcessor(sample_rate=SAMPLE_RATE)
audiobuffer = AudioBufferProcessor()

pipeline = Pipeline(
[
Expand All @@ -116,7 +105,12 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):
]
)

task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
task = PipelineTask(
pipeline,
params=PipelineParams(
audio_in_sample_rate=8000, audio_out_sample_rate=8000, allow_interruptions=True
),
)

@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
Expand Down
Loading

0 comments on commit ab45e48

Please sign in to comment.