Skip to content

Commit

Permalink
introduce PipelineParams audio input/output sample rates
Browse files Browse the repository at this point in the history
  • Loading branch information
aconchillo committed Feb 4, 2025
1 parent cc54255 commit ab45e48
Show file tree
Hide file tree
Showing 61 changed files with 570 additions and 402 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added new fields to `PipelineParams` to control audio input and output sample
rates for the whole pipeline. This allows controlling sample rates from a
single place instead of having to specify sample rates in each
service. Setting a sample rate on a service is still possible and will
override the value from `PipelineParams`.

- Introduce audio resamplers (`BaseAudioResampler`). This is just a base class
to implement audio resamplers. Currently, two implementations are provided:
`SOXRAudioResampler` and `ResampyResampler`. A new
Expand Down
18 changes: 10 additions & 8 deletions examples/bot-ready-signalling/server/signalling_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pipecat.frames.frames import AudioRawFrame, EndFrame, OutputAudioRawFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport

Expand All @@ -31,16 +31,15 @@
class SilenceFrame(OutputAudioRawFrame):
def __init__(
self,
audio: bytes = None,
sample_rate: int = 16000,
num_channels: int = 1,
duration: float = 0.1,
*,
sample_rate: int,
duration: float,
):
# Initialize the parent class with the silent frame's data
super().__init__(
audio=self.create_silent_audio_frame(sample_rate, num_channels, duration).audio,
audio=self.create_silent_audio_frame(sample_rate, 1, duration).audio,
sample_rate=sample_rate,
num_channels=num_channels,
num_channels=1,
)

@staticmethod
Expand Down Expand Up @@ -80,7 +79,10 @@ async def on_app_message(transport, message, sender):
return
await task.queue_frames(
[
SilenceFrame(duration=0.5),
SilenceFrame(
sample_rate=task.params.audio_out_sample_rate,
duration=0.5,
),
TTSSpeakFrame(f"Hello there, how are you doing today ?"),
EndFrame(),
]
Expand Down
1 change: 0 additions & 1 deletion examples/foundational/07g-interruptible-openai-tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ async def main():
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
Expand Down
1 change: 0 additions & 1 deletion examples/foundational/07k-interruptible-lmnt.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ async def main():
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
Expand Down
1 change: 0 additions & 1 deletion examples/foundational/07n-interruptible-google.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ async def main():
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=24000,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
Expand Down
7 changes: 4 additions & 3 deletions examples/foundational/09-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.services.daily import DailyParams, DailyTransport

Expand Down Expand Up @@ -61,7 +61,6 @@ async def main():
"Test",
DailyParams(
audio_in_enabled=True,
audio_in_sample_rate=24000,
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_is_live=True,
Expand All @@ -78,7 +77,9 @@ async def on_first_participant_joined(transport, participant):

runner = PipelineRunner()

task = PipelineTask(pipeline)
task = PipelineTask(
pipeline, PipelineParams(audio_in_sample_rate=24000, audio_out_sample_rate=24000)
)

await runner.run(task)

Expand Down
8 changes: 5 additions & 3 deletions examples/foundational/09a-local-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.local.tk import TkLocalTransport
Expand Down Expand Up @@ -62,7 +62,7 @@ async def main():
tk_root.title("Local Mirror")

daily_transport = DailyTransport(
room_url, token, "Test", DailyParams(audio_in_enabled=True, audio_in_sample_rate=24000)
room_url, token, "Test", DailyParams(audio_in_enabled=True)
)

tk_transport = TkLocalTransport(
Expand All @@ -82,7 +82,9 @@ async def on_first_participant_joined(transport, participant):

pipeline = Pipeline([daily_transport.input(), MirrorProcessor(), tk_transport.output()])

task = PipelineTask(pipeline)
task = PipelineTask(
pipeline, PipelineParams(audio_in_sample_rate=24000, audio_out_sample_rate=24000)
)

async def run_tk():
while not task.has_finished():
Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/18-gstreamer-filesrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ async def main():
out_params=GStreamerPipelineSource.OutputParams(
video_width=1280,
video_height=720,
audio_sample_rate=24000,
audio_channels=1,
),
)

Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/19-openai-realtime-beta.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ async def main():
"Respond bot",
DailyParams(
audio_in_enabled=True,
audio_in_sample_rate=24000,
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,7 @@ async def main():
"Respond bot",
DailyParams(
audio_in_enabled=True,
audio_in_sample_rate=24000,
audio_out_enabled=True,
audio_out_sample_rate=24000,
transcription_enabled=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.8)),
Expand Down
4 changes: 4 additions & 0 deletions examples/foundational/21-tavus-layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ async def main():
task = PipelineTask(
pipeline,
PipelineParams(
# We just use 16000 because that's what Tavus is expecting and
# we avoid resampling.
audio_in_sample_rate=16000,
audio_out_sample_rate=16000,
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,6 @@ async def main():
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
audio_in_sample_rate=16000,
),
)

Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/26-gemini-multimodal-live.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
4 changes: 0 additions & 4 deletions examples/foundational/26c-gemini-multimodal-live-video.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand All @@ -47,8 +45,6 @@ async def main():
# matter because we can only use the Multimodal Live API's phrase
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
start_audio_paused=True,
start_video_paused=True,
),
)

Expand Down
2 changes: 0 additions & 2 deletions examples/foundational/26d-gemini-multimodal-live-text.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ async def main():
token,
"Respond bot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
vad_enabled=True,
vad_audio_passthrough=True,
Expand Down
7 changes: 0 additions & 7 deletions examples/foundational/29-livekit-audio-chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

DESIRED_SAMPLE_RATE = 16000


def generate_token(room_name: str, participant_name: str, api_key: str, api_secret: str) -> str:
token = api.AccessToken(api_key, api_secret)
Expand Down Expand Up @@ -114,11 +112,8 @@ async def main():
token=token,
room_name=room_name,
params=LiveKitParams(
audio_in_channels=1,
audio_in_enabled=True,
audio_out_enabled=True,
audio_in_sample_rate=DESIRED_SAMPLE_RATE,
audio_out_sample_rate=DESIRED_SAMPLE_RATE,
vad_analyzer=SileroVADAnalyzer(),
vad_enabled=True,
vad_audio_passthrough=True,
Expand All @@ -128,7 +123,6 @@ async def main():
stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"),
live_options=LiveOptions(
sample_rate=DESIRED_SAMPLE_RATE,
vad_events=True,
),
)
Expand All @@ -138,7 +132,6 @@ async def main():
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
sample_rate=DESIRED_SAMPLE_RATE,
)

messages = [
Expand Down
2 changes: 0 additions & 2 deletions examples/simple-chatbot/server/bot-gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,6 @@ async def main():
token,
"Chatbot",
DailyParams(
audio_in_sample_rate=16000,
audio_out_sample_rate=24000,
audio_out_enabled=True,
camera_out_enabled=True,
camera_out_width=1024,
Expand Down
9 changes: 6 additions & 3 deletions examples/studypal/studypal.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ async def main():
token,
"studypal",
DailyParams(
audio_out_sample_rate=44100,
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
Expand All @@ -124,7 +123,6 @@ async def main():
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id=os.getenv("CARTESIA_VOICE_ID", "4d2fd738-3b3d-4368-957a-bb4805275bd9"),
# British Narration Lady: 4d2fd738-3b3d-4368-957a-bb4805275bd9
sample_rate=44100,
)

llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini")
Expand Down Expand Up @@ -155,7 +153,12 @@ async def main():
]
)

task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
task = PipelineTask(
pipeline,
PipelineParams(
audio_out_sample_rate=44100, allow_interruptions=True, enable_metrics=True
),
)

@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
Expand Down
26 changes: 10 additions & 16 deletions examples/twilio-chatbot/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import wave

import aiofiles
from deepgram import LiveOptions
from dotenv import load_dotenv
from fastapi import WebSocket
from loguru import logger
Expand All @@ -36,8 +35,6 @@
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

SAMPLE_RATE = 8000


async def save_audio(server_name: str, audio: bytes, sample_rate: int, num_channels: int):
if len(audio) > 0:
Expand All @@ -63,29 +60,21 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):
params=FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
audio_out_sample_rate=SAMPLE_RATE,
add_wav_header=False,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(sample_rate=SAMPLE_RATE),
vad_analyzer=SileroVADAnalyzer(),
vad_audio_passthrough=True,
serializer=TwilioFrameSerializer(
stream_sid, TwilioFrameSerializer.InputParams(sample_rate=SAMPLE_RATE)
),
serializer=TwilioFrameSerializer(stream_sid),
),
)

llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

stt = DeepgramSTTService(
api_key=os.getenv("DEEPGRAM_API_KEY"),
live_options=LiveOptions(sample_rate=SAMPLE_RATE),
audio_passthrough=True,
)
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"), audio_passthrough=True)

tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
sample_rate=SAMPLE_RATE,
push_silence_after_stop=testing,
)

Expand All @@ -101,7 +90,7 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):

# NOTE: Watch out! This will save all the conversation in memory. You can
# pass `buffer_size` to get periodic callbacks.
audiobuffer = AudioBufferProcessor(sample_rate=SAMPLE_RATE)
audiobuffer = AudioBufferProcessor()

pipeline = Pipeline(
[
Expand All @@ -116,7 +105,12 @@ async def run_bot(websocket_client: WebSocket, stream_sid: str, testing: bool):
]
)

task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
task = PipelineTask(
pipeline,
params=PipelineParams(
audio_in_sample_rate=8000, audio_out_sample_rate=8000, allow_interruptions=True
),
)

@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
Expand Down
Loading

0 comments on commit ab45e48

Please sign in to comment.