Skip to content

Commit

Permalink
Release 0.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
njaremko committed Jan 30, 2025
1 parent 4804aa0 commit f98a246
Show file tree
Hide file tree
Showing 11 changed files with 121 additions and 140 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## 0.4.0

- Added `lossy` option to `for_each` that allows replacing invalid UTF-8 characters with a replacement character
- Removed `flexible_default` option from `for_each`

## 0.3.21

- Fix bug where `ignore_null_bytes` was not being respected in enumerators.
Expand Down
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
osv (0.3.22)
osv (0.4.0)
rb_sys (~> 0.9.39)

GEM
Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,10 @@ OSV.for_each("data.csv",

# Parsing behavior
flexible: false, # Allow varying number of fields (default: false)
flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
# Implicitly enables flexible mode if set.
trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false)
lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
)
```

Expand All @@ -103,9 +102,9 @@ OSV.for_each("data.csv",
- `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
- `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
- `flexible`: Boolean specifying if the parser should be flexible (default: false)
- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
- `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
- `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false)
- `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)

When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.

Expand Down
33 changes: 10 additions & 23 deletions ext/osv/src/csv/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
quote_char: u8,
null_string: Option<String>,
flexible: bool,
flexible_default: Option<String>,
trim: csv::Trim,
ignore_null_bytes: bool,
lossy: bool,
_phantom: PhantomData<T>,
_phantom_a: PhantomData<&'a ()>,
}
Expand All @@ -97,9 +97,9 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
quote_char: b'"',
null_string: None,
flexible: false,
flexible_default: None,
trim: csv::Trim::None,
ignore_null_bytes: false,
lossy: false,
_phantom: PhantomData,
_phantom_a: PhantomData,
}
Expand Down Expand Up @@ -140,13 +140,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
self
}

/// Sets the default value for missing fields when in flexible mode.
#[must_use]
pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
self.flexible_default = flexible_default;
self
}

/// Sets the trimming mode for fields.
#[must_use]
pub fn trim(mut self, trim: csv::Trim) -> Self {
Expand All @@ -160,6 +153,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
self
}

/// Sets whether invalid UTF-8 sequences should be replaced with the
/// Unicode replacement character instead of raising an error (default: false).
#[must_use]
pub fn lossy(mut self, lossy: bool) -> Self {
self.lossy = lossy;
self
}

/// Handles reading from a file descriptor.
fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
let raw_value = self.to_read.as_raw();
Expand Down Expand Up @@ -202,7 +201,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
build_ruby_reader(&self.ruby, self.to_read)?
};

let flexible = self.flexible || self.flexible_default.is_some();
let flexible = self.flexible;
let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);

let mut reader = csv::ReaderBuilder::new()
Expand All @@ -220,18 +219,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
}
let static_headers = StringCache::intern_many(&headers)?;

// We intern both of these to get static string references we can reuse throughout the parser.
let flexible_default = self
.flexible_default
.map(|s| {
RString::new(&s)
.to_interned_str()
.as_str()
.map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
})
.transpose()?
.map(Cow::Borrowed);

let null_string = self
.null_string
.map(|s| {
Expand All @@ -247,8 +234,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
reader,
static_headers,
null_string,
flexible_default,
self.ignore_null_bytes,
self.lossy,
))
}
}
88 changes: 58 additions & 30 deletions ext/osv/src/csv/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,18 @@ use std::hash::BuildHasher;
use super::header_cache::StringCacheKey;
use super::CowStr;

/// The record buffer used while reading: `String` holds a validated UTF-8
/// `csv::StringRecord` (strict mode, invalid UTF-8 is a read error), while
/// `Byte` holds a raw `csv::ByteRecord` whose fields are later converted
/// with `String::from_utf8_lossy` when the reader is built with `lossy`.
pub enum CsvRecordType {
String(csv::StringRecord),
Byte(csv::ByteRecord),
}

pub trait RecordParser<'a> {
type Output;

fn parse(
headers: &[StringCacheKey],
record: &csv::StringRecord,
record: &CsvRecordType,
null_string: Option<Cow<'a, str>>,
flexible_default: Option<Cow<'a, str>>,
ignore_null_bytes: bool,
) -> Self::Output;
}
Expand All @@ -25,20 +29,18 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
#[inline]
fn parse(
headers: &[StringCacheKey],
record: &csv::StringRecord,
record: &CsvRecordType,
null_string: Option<Cow<'a, str>>,
flexible_default: Option<Cow<'a, str>>,
ignore_null_bytes: bool,
) -> Self::Output {
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());

let shared_empty = Cow::Borrowed("");
let shared_default = flexible_default.map(CowStr);

headers.iter().enumerate().for_each(|(i, header)| {
let value = record.get(i).map_or_else(
|| shared_default.clone(),
|field| {
if null_string.as_deref() == Some(field) {
let value = match record {
CsvRecordType::String(s) => s.get(i).and_then(|field| {
if null_string.as_deref() == Some(field.as_ref()) {
None
} else if field.is_empty() {
Some(CowStr(shared_empty.clone()))
Expand All @@ -47,8 +49,22 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
} else {
Some(CowStr(Cow::Owned(field.to_string())))
}
},
);
}),

CsvRecordType::Byte(b) => b.get(i).and_then(|field| {
let field = String::from_utf8_lossy(field);
if null_string.as_deref() == Some(field.as_ref()) {
None
} else if field.is_empty() {
Some(CowStr(shared_empty.clone()))
} else if ignore_null_bytes {
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
} else {
Some(CowStr(Cow::Owned(field.to_string())))
}
}),
};

map.insert(*header, value);
});
map
Expand All @@ -61,35 +77,47 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
#[inline]
fn parse(
headers: &[StringCacheKey],
record: &csv::StringRecord,
record: &CsvRecordType,
null_string: Option<Cow<'a, str>>,
flexible_default: Option<Cow<'a, str>>,
ignore_null_bytes: bool,
) -> Self::Output {
let target_len = headers.len();
let mut vec = Vec::with_capacity(target_len);

let shared_empty = Cow::Borrowed("");
let shared_default = flexible_default.map(CowStr);

for field in record.iter() {
let value = if Some(field) == null_string.as_deref() {
None
} else if field.is_empty() {
Some(CowStr(shared_empty.clone()))
} else if ignore_null_bytes {
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
} else {
Some(CowStr(Cow::Owned(field.to_string())))
};
vec.push(value);
}

if vec.len() < target_len {
if let Some(default) = shared_default {
vec.resize_with(target_len, || Some(default.clone()));
match record {
CsvRecordType::String(record) => {
for field in record.iter() {
let value = if Some(field.as_ref()) == null_string.as_deref() {
None
} else if field.is_empty() {
Some(CowStr(shared_empty.clone()))
} else if ignore_null_bytes {
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
} else {
Some(CowStr(Cow::Owned(field.to_string())))
};
vec.push(value);
}
}
CsvRecordType::Byte(record) => {
for field in record.iter() {
let field = String::from_utf8_lossy(field);
let value = if Some(field.as_ref()) == null_string.as_deref() {
None
} else if field.is_empty() {
Some(CowStr(shared_empty.clone()))
} else if ignore_null_bytes {
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
} else {
Some(CowStr(Cow::Owned(field.to_string())))
};
vec.push(value);
}
}
}

vec
}
}
27 changes: 19 additions & 8 deletions ext/osv/src/csv/record_reader.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::builder::ReaderError;
use super::header_cache::StringCacheKey;
use super::parser::RecordParser;
use super::parser::{CsvRecordType, RecordParser};
use super::{header_cache::StringCache, ruby_reader::SeekableRead};
use magnus::{Error, Ruby};
use std::borrow::Cow;
Expand All @@ -16,8 +16,7 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
headers: Vec<StringCacheKey>,
null_string: Option<Cow<'a, str>>,
flexible_default: Option<Cow<'a, str>>,
string_record: csv::StringRecord,
string_record: CsvRecordType,
parser: std::marker::PhantomData<T>,
ignore_null_bytes: bool,
}
Expand Down Expand Up @@ -57,29 +56,41 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
headers: Vec<StringCacheKey>,
null_string: Option<Cow<'a, str>>,
flexible_default: Option<Cow<'a, str>>,
ignore_null_bytes: bool,
lossy: bool,
) -> Self {
let headers_len = headers.len();
Self {
reader,
headers,
null_string,
flexible_default,
string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
string_record: if lossy {
CsvRecordType::Byte(csv::ByteRecord::with_capacity(
READ_BUFFER_SIZE,
headers_len,
))
} else {
CsvRecordType::String(csv::StringRecord::with_capacity(
READ_BUFFER_SIZE,
headers_len,
))
},
parser: std::marker::PhantomData,
ignore_null_bytes,
}
}

/// Attempts to read the next record, returning any errors encountered.
fn try_next(&mut self) -> Result<Option<T::Output>, ReaderError> {
if self.reader.read_record(&mut self.string_record)? {
let record = match self.string_record {
CsvRecordType::String(ref mut record) => self.reader.read_record(record),
CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record),
}?;
if record {
Ok(Some(T::parse(
&self.headers,
&self.string_record,
self.null_string.clone(),
self.flexible_default.clone(),
self.ignore_null_bytes,
)))
} else {
Expand Down
Loading

0 comments on commit f98a246

Please sign in to comment.