From f98a2461d65da225b9793002d7c420f76b78300c Mon Sep 17 00:00:00 2001 From: Nathan Jaremko Date: Thu, 30 Jan 2025 13:55:58 -0500 Subject: [PATCH] Release 0.4.0 --- CHANGELOG.md | 5 ++ Gemfile.lock | 2 +- README.md | 5 +- ext/osv/src/csv/builder.rs | 33 ++++-------- ext/osv/src/csv/parser.rs | 88 +++++++++++++++++++++----------- ext/osv/src/csv/record_reader.rs | 27 +++++++--- ext/osv/src/reader.rs | 13 +++-- ext/osv/src/utils.rs | 16 +++--- lib/osv.rbi | 7 +-- lib/osv/version.rb | 2 +- test/basic_test.rb | 63 ++++------------------- 11 files changed, 121 insertions(+), 140 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ae24f7..4aa1ccc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## 0.4.0 + +- Added `lossy` option to `for_each` that allows replacing invalid UTF-8 characters with a replacement character +- Removed `flexible_default` option from `for_each` + ## 0.3.21 - Fix bug where `ignore_null_bytes` was not being respected in enumerators. diff --git a/Gemfile.lock b/Gemfile.lock index ddd1f64..c66ea37 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - osv (0.3.22) + osv (0.4.0) rb_sys (~> 0.9.39) GEM diff --git a/README.md b/README.md index e4c565d..9dd9c6c 100644 --- a/README.md +++ b/README.md @@ -84,11 +84,10 @@ OSV.for_each("data.csv", # Parsing behavior flexible: false, # Allow varying number of fields (default: false) - flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields. - # Implicitly enables flexible mode if set. trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil) buffer_size: 1024, # Number of rows to buffer in memory (default: 1024) ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false) + lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false) ) ``` @@ -103,9 +102,9 @@ OSV.for_each("data.csv", - `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024) - `result_type`: String specifying the output format ("hash" or "array" or :hash or :array) - `flexible`: Boolean specifying if the parser should be flexible (default: false) -- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`) - `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields) - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false) +- `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false) When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc. diff --git a/ext/osv/src/csv/builder.rs b/ext/osv/src/csv/builder.rs index 4d24862..e65fdab 100644 --- a/ext/osv/src/csv/builder.rs +++ b/ext/osv/src/csv/builder.rs @@ -79,9 +79,9 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> { quote_char: u8, null_string: Option, flexible: bool, - flexible_default: Option, trim: csv::Trim, ignore_null_bytes: bool, + lossy: bool, _phantom: PhantomData, _phantom_a: PhantomData<&'a ()>, } @@ -97,9 +97,9 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> { quote_char: b'"', null_string: None, flexible: false, - flexible_default: None, trim: csv::Trim::None, ignore_null_bytes: false, + lossy: false, _phantom: PhantomData, _phantom_a: PhantomData, } @@ -140,13 +140,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> { self } - /// Sets the default value for missing fields when in flexible mode. - #[must_use] - pub fn flexible_default(mut self, flexible_default: Option) -> Self { - self.flexible_default = flexible_default; - self - } - /// Sets the trimming mode for fields. #[must_use] pub fn trim(mut self, trim: csv::Trim) -> Self { @@ -160,6 +153,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> { self } + #[must_use] + pub fn lossy(mut self, lossy: bool) -> Self { + self.lossy = lossy; + self + } + /// Handles reading from a file descriptor. fn handle_file_descriptor(&self) -> Result, ReaderError> { let raw_value = self.to_read.as_raw(); @@ -202,7 +201,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> { build_ruby_reader(&self.ruby, self.to_read)? }; - let flexible = self.flexible || self.flexible_default.is_some(); + let flexible = self.flexible; let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable); let mut reader = csv::ReaderBuilder::new() @@ -220,18 +219,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> { } let static_headers = StringCache::intern_many(&headers)?; - // We intern both of these to get static string references we can reuse throughout the parser. - let flexible_default = self - .flexible_default - .map(|s| { - RString::new(&s) - .to_interned_str() - .as_str() - .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e))) - }) - .transpose()? - .map(Cow::Borrowed); - let null_string = self .null_string .map(|s| { @@ -247,8 +234,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> { reader, static_headers, null_string, - flexible_default, self.ignore_null_bytes, + self.lossy, )) } } diff --git a/ext/osv/src/csv/parser.rs b/ext/osv/src/csv/parser.rs index bcb085d..cc7c782 100644 --- a/ext/osv/src/csv/parser.rs +++ b/ext/osv/src/csv/parser.rs @@ -5,14 +5,18 @@ use std::hash::BuildHasher; use super::header_cache::StringCacheKey; use super::CowStr; +pub enum CsvRecordType { + String(csv::StringRecord), + Byte(csv::ByteRecord), +} + pub trait RecordParser<'a> { type Output; fn parse( headers: &[StringCacheKey], - record: &csv::StringRecord, + record: &CsvRecordType, null_string: Option>, - flexible_default: Option>, ignore_null_bytes: bool, ) -> Self::Output; } @@ -25,20 +29,18 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a> #[inline] fn parse( headers: &[StringCacheKey], - record: &csv::StringRecord, + record: &CsvRecordType, null_string: Option>, - flexible_default: Option>, ignore_null_bytes: bool, ) -> Self::Output { let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default()); let shared_empty = Cow::Borrowed(""); - let shared_default = flexible_default.map(CowStr); + headers.iter().enumerate().for_each(|(i, header)| { - let value = record.get(i).map_or_else( - || shared_default.clone(), - |field| { - if null_string.as_deref() == Some(field) { + let value = match record { + CsvRecordType::String(s) => s.get(i).and_then(|field| { + if null_string.as_deref() == Some(field.as_ref()) { None } else if field.is_empty() { Some(CowStr(shared_empty.clone())) @@ -47,8 +49,22 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a> } else { Some(CowStr(Cow::Owned(field.to_string()))) } - }, - ); + }), + + CsvRecordType::Byte(b) => b.get(i).and_then(|field| { + let field = String::from_utf8_lossy(field); + if null_string.as_deref() == Some(field.as_ref()) { + None + } else if field.is_empty() { + Some(CowStr(shared_empty.clone())) + } else if ignore_null_bytes { + Some(CowStr(Cow::Owned(field.replace("\0", "")))) + } else { + Some(CowStr(Cow::Owned(field.to_string()))) + } + }), + }; + map.insert(*header, value); }); map @@ -61,35 +77,47 @@ impl<'a> RecordParser<'a> for Vec>> { #[inline] fn parse( headers: &[StringCacheKey], - record: &csv::StringRecord, + record: &CsvRecordType, null_string: Option>, - flexible_default: Option>, ignore_null_bytes: bool, ) -> Self::Output { let target_len = headers.len(); let mut vec = Vec::with_capacity(target_len); let shared_empty = Cow::Borrowed(""); - let shared_default = flexible_default.map(CowStr); - - for field in record.iter() { - let value = if Some(field) == null_string.as_deref() { - None - } else if field.is_empty() { - Some(CowStr(shared_empty.clone())) - } else if ignore_null_bytes { - Some(CowStr(Cow::Owned(field.replace("\0", "")))) - } else { - Some(CowStr(Cow::Owned(field.to_string()))) - }; - vec.push(value); - } - if vec.len() < target_len { - if let Some(default) = shared_default { - vec.resize_with(target_len, || Some(default.clone())); + match record { + CsvRecordType::String(record) => { + for field in record.iter() { + let value = if Some(field.as_ref()) == null_string.as_deref() { + None + } else if field.is_empty() { + Some(CowStr(shared_empty.clone())) + } else if ignore_null_bytes { + Some(CowStr(Cow::Owned(field.replace("\0", "")))) + } else { + Some(CowStr(Cow::Owned(field.to_string()))) + }; + vec.push(value); + } + } + CsvRecordType::Byte(record) => { + for field in record.iter() { + let field = String::from_utf8_lossy(field); + let value = if Some(field.as_ref()) == null_string.as_deref() { + None + } else if field.is_empty() { + Some(CowStr(shared_empty.clone())) + } else if ignore_null_bytes { + Some(CowStr(Cow::Owned(field.replace("\0", "")))) + } else { + Some(CowStr(Cow::Owned(field.to_string()))) + }; + vec.push(value); + } } } + vec } } diff --git a/ext/osv/src/csv/record_reader.rs b/ext/osv/src/csv/record_reader.rs index f78a45c..7a8731c 100644 --- a/ext/osv/src/csv/record_reader.rs +++ b/ext/osv/src/csv/record_reader.rs @@ -1,6 +1,6 @@ use super::builder::ReaderError; use super::header_cache::StringCacheKey; -use super::parser::RecordParser; +use super::parser::{CsvRecordType, RecordParser}; use super::{header_cache::StringCache, ruby_reader::SeekableRead}; use magnus::{Error, Ruby}; use std::borrow::Cow; @@ -16,8 +16,7 @@ pub struct RecordReader<'a, T: RecordParser<'a>> { reader: csv::Reader>>, headers: Vec, null_string: Option>, - flexible_default: Option>, - string_record: csv::StringRecord, + string_record: CsvRecordType, parser: std::marker::PhantomData, ignore_null_bytes: bool, } @@ -57,16 +56,25 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> { reader: csv::Reader>>, headers: Vec, null_string: Option>, - flexible_default: Option>, ignore_null_bytes: bool, + lossy: bool, ) -> Self { let headers_len = headers.len(); Self { reader, headers, null_string, - flexible_default, - string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len), + string_record: if lossy { + CsvRecordType::Byte(csv::ByteRecord::with_capacity( + READ_BUFFER_SIZE, + headers_len, + )) + } else { + CsvRecordType::String(csv::StringRecord::with_capacity( + READ_BUFFER_SIZE, + headers_len, + )) + }, parser: std::marker::PhantomData, ignore_null_bytes, } @@ -74,12 +82,15 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> { /// Attempts to read the next record, returning any errors encountered. fn try_next(&mut self) -> Result, ReaderError> { - if self.reader.read_record(&mut self.string_record)? { + let record = match self.string_record { + CsvRecordType::String(ref mut record) => self.reader.read_record(record), + CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record), + }?; + if record { Ok(Some(T::parse( &self.headers, &self.string_record, self.null_string.clone(), - self.flexible_default.clone(), self.ignore_null_bytes, ))) } else { diff --git a/ext/osv/src/reader.rs b/ext/osv/src/reader.rs index 8e1ad4a..96ef6e9 100644 --- a/ext/osv/src/reader.rs +++ b/ext/osv/src/reader.rs @@ -34,9 +34,9 @@ struct EnumeratorArgs { null_string: Option, result_type: String, flexible: bool, - flexible_default: Option, trim: Option, ignore_null_bytes: bool, + lossy: bool, } /// Parses a CSV file with the given configuration. @@ -56,9 +56,9 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result { null_string, result_type, flexible, - flexible_default, trim, ignore_null_bytes, + lossy, } = parse_read_csv_args(&ruby, args)?; if !ruby.block_given() { @@ -71,7 +71,6 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result { null_string, result_type, flexible, - flexible_default, trim: match trim { Trim::All => Some("all".to_string()), Trim::Headers => Some("headers".to_string()), @@ -79,6 +78,7 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result { _ => None, }, ignore_null_bytes, + lossy, }) .map(|yield_enum| yield_enum.into_value_with(&ruby)); } @@ -97,12 +97,12 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result { >::new(ruby, to_read) .has_headers(has_headers) .flexible(flexible) - .flexible_default(flexible_default) .trim(trim) .delimiter(delimiter) .quote_char(quote_char) .null_string(null_string) .ignore_null_bytes(ignore_null_bytes) + .lossy(lossy) .build()?; let ruby = unsafe { Ruby::get_unchecked() }; @@ -115,12 +115,12 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result { let builder = RecordReaderBuilder::>>>::new(ruby, to_read) .has_headers(has_headers) .flexible(flexible) - .flexible_default(flexible_default) .trim(trim) .delimiter(delimiter) .quote_char(quote_char) .null_string(null_string) .ignore_null_bytes(ignore_null_bytes) + .lossy(lossy) .build()?; let ruby = unsafe { Ruby::get_unchecked() }; @@ -150,10 +150,9 @@ fn create_enumerator(args: EnumeratorArgs) -> Result kwargs.aset(Symbol::new("nil_string"), args.null_string)?; kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?; kwargs.aset(Symbol::new("flexible"), args.flexible)?; - kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?; kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?; kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?; - + kwargs.aset(Symbol::new("lossy"), args.lossy)?; Ok(args .rb_self .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)))) diff --git a/ext/osv/src/utils.rs b/ext/osv/src/utils.rs index eaaa991..7045699 100644 --- a/ext/osv/src/utils.rs +++ b/ext/osv/src/utils.rs @@ -34,9 +34,9 @@ pub struct ReadCsvArgs { pub null_string: Option, pub result_type: String, pub flexible: bool, - pub flexible_default: Option, pub trim: csv::Trim, pub ignore_null_bytes: bool, + pub lossy: bool, } /// Parse common arguments for CSV parsing @@ -54,9 +54,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result>, Option>, Option>, - Option>>, Option>, Option>, + Option>, ), (), >( @@ -69,9 +69,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result Result Result csv::Trim::None, }; - let ignore_null_bytes = kwargs.optional.8.flatten().unwrap_or_default(); + let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default(); + + let lossy = kwargs.optional.8.flatten().unwrap_or_default(); Ok(ReadCsvArgs { to_read, @@ -176,8 +176,8 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result "1", "name" => "��" }], actual + ensure + File.delete("test/invalid_utf8.csv") rescue nil + end + def test_parse_csv_with_headers_null expected = [ { "id" => "1", "age" => "25", "name" => "John" }, @@ -248,45 +257,6 @@ def test_parse_csv_with_missing_field_flexible_without_headers end end - def test_parse_csv_with_missing_field_flexible_default - Tempfile.create(%w[test .csv]) do |tempfile| - content = File.read("test/test.csv") - content += "4,oops\n" - tempfile.write(content) - tempfile.close - - expected = [ - { "id" => "1", "age" => "25", "name" => "John" }, - { "name" => "Jane", "id" => "2", "age" => "30" }, - { "name" => "Jim", "age" => "35", "id" => "3" }, - { "id" => "4", "name" => "oops", "age" => "" } - ] - actual = [] - OSV.for_each(tempfile.path, flexible_default: "") { |row| actual << row } - assert_equal expected, actual - end - end - - def test_parse_csv_with_missing_field_flexible_default_without_headers - Tempfile.create(%w[test .csv]) do |tempfile| - content = File.read("test/test.csv") - content += "4,oops\n" - tempfile.write(content) - tempfile.close - - expected = [ - { "c0" => "id", "c1" => "name", "c2" => "age" }, - { "c1" => "John", "c0" => "1", "c2" => "25" }, - { "c1" => "Jane", "c2" => "30", "c0" => "2" }, - { "c0" => "3", "c1" => "Jim", "c2" => "35" }, - { "c0" => "4", "c2" => "", "c1" => "oops" } - ] - actual = [] - OSV.for_each(tempfile.path, has_headers: false, flexible_default: "") { |row| actual << row } - assert_equal expected, actual - end - end - def test_parse_csv_with_missing_field_flexible_array Tempfile.create(%w[test .csv]) do |tempfile| content = File.read("test/test.csv") @@ -301,20 +271,6 @@ def test_parse_csv_with_missing_field_flexible_array end end - def test_parse_csv_with_missing_field_flexible_default_array - Tempfile.create(%w[test .csv]) do |tempfile| - content = File.read("test/test.csv") - content += "4,oops\n" - tempfile.write(content) - tempfile.close - - expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35], ["4", "oops", ""]] - actual = [] - OSV.for_each(tempfile.path, flexible_default: "", result_type: "array") { |row| actual << row } - assert_equal expected, actual - end - end - def test_parse_csv_compat_with_io_and_headers expected = [%w[1 John 25], %w[2 Jane 30], %w[3 Jim 35]] actual = [] @@ -629,7 +585,6 @@ def test_parse_csv_with_explicit_nil_kwargs nil_string: nil, result_type: nil, flexible: nil, - flexible_default: nil, trim: nil ) { |row| actual << row } end