diff --git a/crates/export/src/lib.rs b/crates/export/src/lib.rs index 978281260..e0b92ebfd 100644 --- a/crates/export/src/lib.rs +++ b/crates/export/src/lib.rs @@ -1,6 +1,6 @@ use cap_editor::Segment; use cap_media::{ - data::{AudioInfo, RawVideoFormat, VideoInfo}, + data::{cast_f32_slice_to_bytes, AudioInfo, RawVideoFormat, VideoInfo}, encoders::{MP4Encoder, MP4Input}, feeds::{AudioData, AudioFrameBuffer}, MediaError, @@ -207,17 +207,11 @@ where .buffer .next_frame_data(samples, project.timeline.as_ref().map(|t| t)) { - let mut frame = audio_info.wrap_frame(&frame_data, 0); + let mut frame = audio_info + .wrap_frame(unsafe { cast_f32_slice_to_bytes(&frame_data) }, 0); let pts = (frame_count as f64 * f64::from(audio_info.sample_rate) / f64::from(fps)) as i64; frame.set_pts(Some(pts)); - // println!( - // "Export: Sending audio frame {} with PTS: {:?}, samples: {}, data size: {}", - // frame_count, - // frame.pts(), - // samples, - // frame_data.len() - // ); Some(frame) } else { None @@ -226,11 +220,6 @@ where None }; - // println!( - // "Export: Processing frame {} (size: {}x{}, padded_bytes_per_row: {})", - // frame_count, frame.width, frame.height, frame.padded_bytes_per_row - // ); - let mut video_frame = VideoInfo::from_raw( RawVideoFormat::Rgba, self.output_size.0, @@ -244,12 +233,6 @@ where ); video_frame.set_pts(Some(frame_count as i64)); - // println!( - // "Export: Sending frame {} to encoder (PTS: {:?})", - // frame_count, - // video_frame.pts() - // ); - frame_tx .send(MP4Input { audio: audio_frame, diff --git a/crates/media/src/data.rs b/crates/media/src/data.rs index 758b094e7..2e1b23f14 100644 --- a/crates/media/src/data.rs +++ b/crates/media/src/data.rs @@ -204,6 +204,10 @@ impl AudioInfo { } } +pub unsafe fn cast_f32_slice_to_bytes(slice: &[f32]) -> &[u8] { + std::slice::from_raw_parts(slice.as_ptr() as *const u8, slice.len() * f32::BYTE_SIZE) +} + #[derive(Debug, Copy, Clone)] pub struct VideoInfo { pub pixel_format: Pixel, diff --git 
a/crates/media/src/encoders/mp4.rs b/crates/media/src/encoders/mp4.rs index 8730b4b92..36f760f33 100644 --- a/crates/media/src/encoders/mp4.rs +++ b/crates/media/src/encoders/mp4.rs @@ -2,6 +2,7 @@ use crate::{ data::{ AudioInfo, FFAudio, FFPacket, FFRational, FFVideo, PlanarData, RawVideoFormat, VideoInfo, }, + feeds::AudioData, pipeline::{audio_buffer::AudioBuffer, task::PipelineSinkTask}, MediaError, }; @@ -110,7 +111,7 @@ impl MP4Encoder { let resampler = software::resampler( ( - Sample::F64(format::sample::Type::Packed), + AudioData::FORMAT, audio_config.channel_layout(), audio_config.sample_rate, ), @@ -211,6 +212,7 @@ impl MP4Encoder { }; let mut output = ffmpeg::util::frame::Audio::empty(); + audio.resampler.run(&buffered_frame, &mut output).unwrap(); // Preserve PTS from input frame @@ -218,24 +220,12 @@ impl MP4Encoder { output.set_pts(Some(pts)); } - // println!( - // "MP4Encoder: Sending audio frame with PTS: {:?}, samples: {}", - // output.pts(), - // output.samples() - // ); - // Send frame to encoder audio.encoder.send_frame(&output).unwrap(); // Process any encoded packets let mut encoded_packet = FFPacket::empty(); while audio.encoder.receive_packet(&mut encoded_packet).is_ok() { - // println!( - // "MP4Encoder: Writing audio packet with PTS: {:?}, size: {}", - // encoded_packet.pts(), - // encoded_packet.size() - // ); - encoded_packet.set_stream(1); encoded_packet.rescale_ts( audio.encoder.time_base(), @@ -257,21 +247,11 @@ impl MP4Encoder { .receive_packet(&mut encoded_packet) .is_ok() { - // println!( - // "MP4Encoder: Got encoded packet with PTS: {:?}, DTS: {:?}", - // encoded_packet.pts(), - // encoded_packet.dts() - // ); encoded_packet.set_stream(0); // Video is stream 0 encoded_packet.rescale_ts( self.video.encoder.time_base(), self.output_ctx.stream(0).unwrap().time_base(), ); - // println!( - // "MP4Encoder: Writing packet with rescaled PTS: {:?}, DTS: {:?}", - // encoded_packet.pts(), - // encoded_packet.dts() - // ); encoded_packet 
.write_interleaved(&mut self.output_ctx) .unwrap(); diff --git a/crates/media/src/feeds/audio.rs b/crates/media/src/feeds/audio.rs index b0a94c5ed..9a64c978d 100644 --- a/crates/media/src/feeds/audio.rs +++ b/crates/media/src/feeds/audio.rs @@ -1,7 +1,11 @@ use cap_project::TimelineConfiguration; use ffmpeg::{ codec::{context, decoder}, - format::sample::{Sample, Type}, + format::{ + self, + sample::{Sample, Type}, + }, + frame, software::resampling, }; use ringbuf::{ @@ -11,21 +15,21 @@ use ringbuf::{ use std::{path::PathBuf, sync::Arc}; use crate::{ - data::{AudioInfo, FFAudio, FromSampleBytes}, + data::{cast_f32_slice_to_bytes, AudioInfo, FFAudio, FromSampleBytes}, MediaError, }; -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq)] pub struct AudioData { - pub buffer: Arc>, + pub buffer: Arc>, pub info: AudioInfo, } impl AudioData { - pub const FORMAT: Sample = Sample::F64(Type::Packed); + pub const FORMAT: Sample = Sample::F32(Type::Packed); pub fn from_file(path: PathBuf) -> Result { - let input_ctx = ffmpeg::format::input(&path)?; + let mut input_ctx = ffmpeg::format::input(&path)?; let input_stream = input_ctx .streams() .best(ffmpeg::media::Type::Audio) @@ -36,31 +40,84 @@ impl AudioData { decoder.set_parameters(input_stream.parameters())?; decoder.set_packet_time_base(input_stream.time_base()); - let input_info = AudioInfo::from_decoder(&decoder)?; - let mut output_info = input_info; - output_info.sample_format = Self::FORMAT; + let mut info = AudioInfo::from_decoder(&decoder)?; + info.sample_format = Self::FORMAT; - let resampler = AudioResampler::new(input_info, output_info)?; + let stream_index = input_stream.index(); + Ok(Self { + buffer: Arc::new(decode_audio_to_f32( + &mut decoder, + &mut input_ctx, + stream_index, + )), + info, + }) + } +} - let reader = AudioFileReader { - stream_index: input_stream.index(), - info: input_info, - resampler, - decoder, - first: true, - }; +fn decode_audio_to_f32( + decoder: &mut decoder::Audio, + input_ctx: 
&mut format::context::Input, + stream_index: usize, +) -> Vec { + let mut resampler = F32Resampler::new(&decoder); + + let decoder_time_base = decoder.time_base(); + run_audio_decoder( + decoder, + input_ctx.packets().filter_map(|(s, mut p)| { + if s.index() == stream_index { + p.rescale_ts(s.time_base(), decoder_time_base); + Some(p) + } else { + None + } + }), + |frame| { + let ts = frame.timestamp(); + frame.set_pts(ts); + + resampler.ingest_frame(&frame); + }, + ); - reader.read(input_ctx) + resampler.finish().0 +} + +fn run_audio_decoder( + decoder: &mut decoder::Audio, + packets: impl Iterator, + mut on_frame: impl FnMut(&mut frame::Audio), +) { + let mut decoder_frame = frame::Audio::empty(); + let mut decode_packets = |decoder: &mut decoder::Audio| { + while decoder.receive_frame(&mut decoder_frame).is_ok() { + on_frame(&mut decoder_frame); + } + }; + + for packet in packets { + decoder.send_packet(&packet).unwrap(); + decode_packets(decoder); } + + decoder.send_eof().unwrap(); + decode_packets(decoder); } pub struct AudioFrameBuffer { data: Vec, - cursor: (usize, usize), + cursor: AudioFrameBufferCursor, elapsed_samples: usize, sample_size: usize, } +#[derive(Clone, Copy, Debug)] +pub struct AudioFrameBufferCursor { + segment_index: usize, + samples: usize, +} + impl AudioFrameBuffer { pub fn new(data: Vec) -> Self { let info = data[0].info; @@ -68,7 +125,10 @@ impl AudioFrameBuffer { Self { data, - cursor: (0, 0), + cursor: AudioFrameBufferCursor { + segment_index: 0, + samples: 0, + }, elapsed_samples: 0, sample_size, } @@ -86,11 +146,20 @@ impl AudioFrameBuffer { Some(timeline) => match timeline.get_recording_time(playhead) { Some((time, segment)) => { let index = segment.unwrap_or(0) as usize; - (index, self.playhead_to_samples(time) * self.sample_size) + AudioFrameBufferCursor { + segment_index: index, + samples: self.playhead_to_samples(time), + } } - None => (0, self.data[0].buffer.len()), + None => AudioFrameBufferCursor { + segment_index: 0, + 
samples: self.data[0].buffer.len(), + }, + }, + None => AudioFrameBufferCursor { + segment_index: 0, + samples: self.elapsed_samples, }, - None => (0, self.elapsed_samples * self.sample_size), }; } @@ -102,15 +171,18 @@ impl AudioFrameBuffer { // (corresponding to a trim or split point). Currently this change is at least 0.2 seconds // - not sure we offer that much precision in the editor even! let new_cursor = match timeline.get_recording_time(playhead) { - Some((time, segment)) => ( - segment.unwrap_or(0) as usize, - self.playhead_to_samples(time) * self.sample_size, - ), - None => (0, self.data[0].buffer.len()), + Some((time, segment)) => AudioFrameBufferCursor { + segment_index: segment.unwrap_or(0) as usize, + samples: self.playhead_to_samples(time), + }, + None => AudioFrameBufferCursor { + segment_index: 0, + samples: self.data[0].buffer.len(), + }, }; - let cursor_diff = new_cursor.1 as isize - self.cursor.1 as isize; - if new_cursor.0 != self.cursor.0 + let cursor_diff = new_cursor.samples as isize - self.cursor.samples as isize; + if new_cursor.segment_index != self.cursor.segment_index || cursor_diff.unsigned_abs() > (self.info().sample_rate as usize) / 5 { self.cursor = new_cursor; @@ -139,7 +211,8 @@ impl AudioFrameBuffer { .map(move |(samples, data)| { let mut raw_frame = FFAudio::new(format, samples, channels); raw_frame.set_rate(sample_rate); - raw_frame.data_mut(0)[0..data.len()].copy_from_slice(data); + raw_frame.data_mut(0)[0..data.len() * f32::BYTE_SIZE] + .copy_from_slice(unsafe { cast_f32_slice_to_bytes(data) }); raw_frame }) @@ -147,31 +220,23 @@ impl AudioFrameBuffer { pub fn next_frame_data<'a>( &'a mut self, - mut samples: usize, + samples: usize, maybe_timeline: Option<&TimelineConfiguration>, - ) -> Option<(usize, &'a [u8])> { + ) -> Option<(usize, &'a [f32])> { if let Some(timeline) = maybe_timeline { self.adjust_cursor(timeline); } - let buffer = &self.data[self.cursor.0].buffer; - if self.cursor.1 >= buffer.len() { + let buffer = 
&self.data[self.cursor.segment_index].buffer; if self.cursor.samples >= buffer.len() { self.elapsed_samples += samples; return None; } - let mut bytes_size = self.sample_size * samples; - - let remaining_data = buffer.len() - self.cursor.1; - if remaining_data < bytes_size { - bytes_size = remaining_data; - samples = remaining_data / self.sample_size; - } - let start = self.cursor; self.elapsed_samples += samples; - self.cursor.1 += bytes_size; - Some((samples, &buffer[start.1..self.cursor.1])) + self.cursor.samples += samples; + Some((samples, &buffer[start.samples..self.cursor.samples])) } } @@ -251,68 +316,6 @@ impl AudioPlaybackBuffer { } } -struct AudioFileReader { - decoder: decoder::Audio, - resampler: AudioResampler, - stream_index: usize, - info: AudioInfo, - first: bool, -} - -impl AudioFileReader { - fn read( - mut self, - mut input_ctx: ffmpeg::format::context::Input, - ) -> Result { - let mut buffer = Vec::new(); - let output_info = self.resampler.output; - - for (stream, mut packet) in input_ctx.packets() { - if stream.index() == self.stream_index { - packet.rescale_ts(stream.time_base(), self.info.time_base); - self.decoder.send_packet(&packet).unwrap(); - self.decode_packets(&mut buffer); - } - } - - self.finish_resampling(&mut buffer); - - Ok(AudioData { - buffer: Arc::new(buffer), - info: output_info, - }) - } - - fn decode_packets(&mut self, data: &mut Vec) { - let mut decoded_frame = FFAudio::empty(); - - while self.decoder.receive_frame(&mut decoded_frame).is_ok() { - let timestamp = decoded_frame.timestamp(); - if self.first { - println!( - "First timestamp: {timestamp:?}, time base {}", - self.decoder.time_base() - ); - self.first = false; - } - decoded_frame.set_pts(timestamp); - let resampled = self.resampler.queue_and_process_frame(&decoded_frame); - // println!("Resampled: {:?}", resampled); - data.extend_from_slice(resampled); - decoded_frame = FFAudio::empty(); - } - } - - fn finish_resampling(&mut self, data: &mut Vec) { -
self.decoder.send_eof().unwrap(); - self.decode_packets(data); - - while let Some(resampled) = self.resampler.flush_frame() { - data.extend_from_slice(resampled); - } - } -} - pub struct AudioResampler { context: resampling::Context, output_frame: FFAudio, @@ -369,3 +372,76 @@ impl AudioResampler { Some(self.current_frame_data()) } } + +fn write_f32_ne_bytes(bytes: &[u8], buf: &mut Vec) { + buf.extend( + bytes + .chunks(4) + .map(|c| f32::from_ne_bytes([c[0], c[1], c[2], c[3]])), + ); +} + +struct F32Resampler { + resampler: ffmpeg::software::resampling::Context, + buf: Vec, + resampled_frame: frame::Audio, + resampled_samples: usize, +} + +impl F32Resampler { + pub fn new(decoder: &ffmpeg::codec::decoder::Audio) -> Self { + let resampler = ffmpeg::software::resampler( + (decoder.format(), decoder.channel_layout(), decoder.rate()), + (AudioData::FORMAT, decoder.channel_layout(), decoder.rate()), + ) + .unwrap(); + + Self { + resampler, + buf: Vec::new(), + resampled_frame: frame::Audio::empty(), + resampled_samples: 0, + } + } + + pub fn ingest_frame(&mut self, frame: &frame::Audio) { + let resample_delay = self + .resampler + .run(&frame, &mut self.resampled_frame) + .unwrap(); + + self.resampled_samples += self.resampled_frame.samples(); + + write_f32_ne_bytes( + &self.resampled_frame.data(0)[0..self.resampled_frame.samples() * f32::BYTE_SIZE], + &mut self.buf, + ); + + if resample_delay.is_some() { + self.flush(); + } + } + + fn flush(&mut self) { + loop { + let delay = self.resampler.flush(&mut self.resampled_frame).unwrap(); + + self.resampled_samples += self.resampled_frame.samples(); + + write_f32_ne_bytes( + &self.resampled_frame.data(0)[0..self.resampled_frame.samples() * f32::BYTE_SIZE], + &mut self.buf, + ); + + if delay.is_none() { + break; + } + } + } + + pub fn finish(mut self) -> (Vec, usize) { + self.flush(); + + (self.buf, self.resampled_samples) + } +}