diff --git a/crates/polars-io/src/csv/mod.rs b/crates/polars-io/src/csv/mod.rs
index 2d4de6ac21c5..33da2fc51ca5 100644
--- a/crates/polars-io/src/csv/mod.rs
+++ b/crates/polars-io/src/csv/mod.rs
@@ -18,7 +18,7 @@
 //!
 //!     CsvWriter::new(&mut file)
 //!         .has_header(true)
-//!         .with_delimiter(b',')
+//!         .with_separator(b',')
 //!         .finish(df)
 //! }
 //! ```
diff --git a/crates/polars-io/src/csv/parser.rs b/crates/polars-io/src/csv/parser.rs
index b89d5cbcb297..1b7880f1352e 100644
--- a/crates/polars-io/src/csv/parser.rs
+++ b/crates/polars-io/src/csv/parser.rs
@@ -30,20 +30,20 @@ pub(crate) fn next_line_position_naive(input: &[u8], eol_char: u8) -> Option<usize>
 pub(crate) fn next_line_position(
     input: &[u8],
     expected_fields: Option<usize>,
-    delimiter: u8,
+    separator: u8,
     quote_char: Option<u8>,
     eol_char: u8,
 ) -> Option<usize> {
     fn accept_line(
         line: &[u8],
         expected_fields: usize,
-        delimiter: u8,
+        separator: u8,
         eol_char: u8,
         quote_char: Option<u8>,
     ) -> bool {
         let mut count = 0usize;
-        for (field, _) in SplitFields::new(line, delimiter, quote_char, eol_char) {
-            if memchr2_iter(delimiter, eol_char, field).count() >= expected_fields {
+        for (field, _) in SplitFields::new(line, separator, quote_char, eol_char) {
+            if memchr2_iter(separator, eol_char, field).count() >= expected_fields {
                 return false;
             }
             count += 1;
@@ -95,10 +95,10 @@ pub(crate) fn next_line_position(
     match (line, expected_fields) {
         // count the fields, and determine if they are equal to what we expect from the schema
         (Some(line), Some(expected_fields)) => {
-            if accept_line(line, expected_fields, delimiter, eol_char, quote_char) {
+            if accept_line(line, expected_fields, separator, eol_char, quote_char) {
                 let mut valid = true;
                 for line in lines.take(2) {
-                    if !accept_line(line, expected_fields, delimiter, eol_char, quote_char) {
+                    if !accept_line(line, expected_fields, separator, eol_char, quote_char) {
                         valid = false;
                         break;
                     }
@@ -160,13 +160,13 @@ pub(crate) fn skip_whitespace(input: &[u8]) -> &[u8] {
 }
 
 #[inline]
-/// Can be used to skip whitespace, but exclude the delimiter
+/// Can be used to skip whitespace, but exclude the separator
 pub(crate) fn skip_whitespace_exclude(input: &[u8], exclude: u8) -> &[u8] {
     skip_condition(input, |b| b != exclude && (is_whitespace(b)))
 }
 
 #[inline]
-/// Can be used to skip whitespace, but exclude the delimiter
+/// Can be used to skip whitespace, but exclude the separator
 pub(crate) fn skip_whitespace_line_ending_exclude(
     input: &[u8],
     exclude: u8,
@@ -188,7 +188,7 @@ pub(crate) fn get_line_stats(
     n_lines: usize,
     eol_char: u8,
     expected_fields: usize,
-    delimiter: u8,
+    separator: u8,
     quote_char: Option<u8>,
 ) -> Option<(f32, f32)> {
     let mut lengths = Vec::with_capacity(n_lines);
@@ -204,7 +204,7 @@ pub(crate) fn get_line_stats(
         let pos = next_line_position(
             bytes_trunc,
             Some(expected_fields),
-            delimiter,
+            separator,
             quote_char,
             eol_char,
         )?;
@@ -350,7 +350,7 @@ fn skip_this_line(bytes: &[u8], quote: Option<u8>, eol_char: u8) -> &[u8] {
 pub(super) fn parse_lines<'a>(
     mut bytes: &'a [u8],
     offset: usize,
-    delimiter: u8,
+    separator: u8,
     comment_char: Option<u8>,
     quote_char: Option<u8>,
     eol_char: u8,
@@ -391,9 +391,9 @@ pub(super) fn parse_lines<'a>(
             // only when we have one column \n should not be skipped
             // other widths should have commas.
             bytes = if schema_len > 1 {
-                skip_whitespace_line_ending_exclude(bytes, delimiter, eol_char)
+                skip_whitespace_line_ending_exclude(bytes, separator, eol_char)
             } else {
-                skip_whitespace_exclude(bytes, delimiter)
+                skip_whitespace_exclude(bytes, separator)
             };
             if bytes.is_empty() {
                 return Ok(original_bytes_len);
             }
@@ -416,7 +416,7 @@ pub(super) fn parse_lines<'a>(
         let mut next_projected = unsafe { projection_iter.next().unwrap_unchecked() };
         let mut processed_fields = 0;
 
-        let mut iter = SplitFields::new(bytes, delimiter, quote_char, eol_char);
+        let mut iter = SplitFields::new(bytes, separator, quote_char, eol_char);
         let mut idx = 0u32;
         let mut read_sol = 0;
         loop {
diff --git a/crates/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs
index 8d5b4f67dc90..4d8527b70b80 100644
--- a/crates/polars-io/src/csv/read.rs
+++ b/crates/polars-io/src/csv/read.rs
@@ -109,7 +109,7 @@ where
     projection: Option<Vec<usize>>,
     /// Optional column names to project/ select.
     columns: Option<Vec<String>>,
-    delimiter: Option<u8>,
+    separator: Option<u8>,
     pub(crate) schema: Option<SchemaRef>,
     encoding: CsvEncoding,
     n_threads: Option<usize>,
@@ -204,9 +204,9 @@ where
         self
     }
 
-    /// Set the CSV file's column delimiter as a byte character
-    pub fn with_delimiter(mut self, delimiter: u8) -> Self {
-        self.delimiter = Some(delimiter);
+    /// Set the CSV file's column separator as a byte character
+    pub fn with_separator(mut self, separator: u8) -> Self {
+        self.separator = Some(separator);
         self
     }
 
@@ -310,8 +310,8 @@ where
     }
 
     /// Set the `char` used as quote char. The default is `b'"'`. If set to `[None]` quoting is disabled.
-    pub fn with_quote_char(mut self, quote: Option<u8>) -> Self {
-        self.quote_char = quote;
+    pub fn with_quote_char(mut self, quote_char: Option<u8>) -> Self {
+        self.quote_char = quote_char;
         self
     }
 
@@ -358,7 +358,7 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> {
             self.skip_rows_before_header,
             std::mem::take(&mut self.projection),
             self.max_records,
-            self.delimiter,
+            self.separator,
             self.has_header,
             self.ignore_errors,
             self.schema.clone(),
@@ -481,7 +481,7 @@ impl<'a> CsvReader<'a, Box<dyn MmapBytesReader>> {
 
         let (inferred_schema, _, _) = infer_file_schema(
             &reader_bytes,
-            self.delimiter.unwrap_or(b','),
+            self.separator.unwrap_or(b','),
             self.max_records,
             self.has_header,
             None,
@@ -510,7 +510,7 @@ impl<'a> CsvReader<'a, Box<dyn MmapBytesReader>> {
 
         let (inferred_schema, _, _) = infer_file_schema(
             &reader_bytes,
-            self.delimiter.unwrap_or(b','),
+            self.separator.unwrap_or(b','),
             self.max_records,
             self.has_header,
             None,
@@ -543,7 +543,7 @@ where
             max_records: Some(128),
             skip_rows_before_header: 0,
             projection: None,
-            delimiter: None,
+            separator: None,
             has_header: true,
             ignore_errors: false,
             schema: None,
diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs
index 93251de658bf..f0299ca40fe9 100644
--- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs
+++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs
@@ -13,7 +13,7 @@ pub(crate) fn get_file_chunks_iterator(
     chunk_size: usize,
     bytes: &[u8],
     expected_fields: usize,
-    delimiter: u8,
+    separator: u8,
     quote_char: Option<u8>,
     eol_char: u8,
 ) {
@@ -27,7 +27,7 @@ pub(crate) fn get_file_chunks_iterator(
         let end_pos = match next_line_position(
             &bytes[search_pos..],
             Some(expected_fields),
-            delimiter,
+            separator,
             quote_char,
             eol_char,
         ) {
@@ -49,7 +49,7 @@ struct ChunkOffsetIter<'a> {
     // not a promise, but something we want
    rows_per_batch: usize,
     expected_fields: usize,
-    delimiter: u8,
+    separator: u8,
     quote_char: Option<u8>,
     eol_char: u8,
 }
@@ -68,7 +68,7 @@ impl<'a> Iterator for
ChunkOffsetIter<'a> { let bytes_first_row = next_line_position( &self.bytes[self.last_offset + 2..], Some(self.expected_fields), - self.delimiter, + self.separator, self.quote_char, self.eol_char, ) @@ -84,7 +84,7 @@ impl<'a> Iterator for ChunkOffsetIter<'a> { self.rows_per_batch * bytes_first_row, self.bytes, self.expected_fields, - self.delimiter, + self.separator, self.quote_char, self.eol_char, ); @@ -124,7 +124,7 @@ impl<'a> CoreReader<'a> { n_chunks: offset_batch_size, rows_per_batch: self.chunk_size, expected_fields: self.schema.len(), - delimiter: self.delimiter, + separator: self.separator, quote_char: self.quote_char, eol_char: self.eol_char, }; @@ -164,7 +164,7 @@ impl<'a> CoreReader<'a> { truncate_ragged_lines: self.truncate_ragged_lines, n_rows: self.n_rows, encoding: self.encoding, - delimiter: self.delimiter, + separator: self.separator, schema: self.schema, rows_read: 0, _cat_lock, @@ -192,7 +192,7 @@ pub struct BatchedCsvReaderMmap<'a> { ignore_errors: bool, n_rows: Option, encoding: CsvEncoding, - delimiter: u8, + separator: u8, schema: SchemaRef, rows_read: IdxSize, #[cfg(feature = "dtype-categorical")] @@ -233,7 +233,7 @@ impl<'a> BatchedCsvReaderMmap<'a> { .map(|(bytes_offset_thread, stop_at_nbytes)| { let mut df = read_chunk( bytes, - self.delimiter, + self.separator, self.schema.as_ref(), self.ignore_errors, &self.projection, diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 7f6b94c579f1..9e8e6b6e6836 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -14,7 +14,7 @@ pub(crate) fn get_offsets( chunk_size: usize, bytes: &[u8], expected_fields: usize, - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, ) { @@ -29,7 +29,7 @@ pub(crate) fn get_offsets( let end_pos = match next_line_position( &bytes[search_pos..], Some(expected_fields), - delimiter, + separator, quote_char, eol_char, ) { @@ -57,7 +57,7 @@ struct ChunkReader<'a> { // not a promise, but something we want rows_per_batch: usize, expected_fields: usize, - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, } @@ -67,7 +67,7 @@ impl<'a> ChunkReader<'a> { file: &'a File, rows_per_batch: usize, expected_fields: usize, - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, page_size: u64, @@ -85,7 +85,7 @@ impl<'a> ChunkReader<'a> { n_chunks: 16, rows_per_batch, expected_fields, - delimiter, + separator, quote_char, eol_char, } @@ -132,7 +132,7 @@ impl<'a> ChunkReader<'a> { bytes_first_row = next_line_position( &self.buf[2..], Some(self.expected_fields), - self.delimiter, + self.separator, self.quote_char, self.eol_char, ); @@ -179,7 +179,7 @@ impl<'a> ChunkReader<'a> { self.rows_per_batch * bytes_first_row, &self.buf, self.expected_fields, - self.delimiter, + self.separator, self.quote_char, self.eol_char, ); @@ -206,7 +206,7 @@ impl<'a> CoreReader<'a> { file, self.chunk_size, self.schema.len(), - self.delimiter, + self.separator, self.quote_char, self.eol_char, 4096, @@ -247,7 +247,7 @@ impl<'a> CoreReader<'a> { truncate_ragged_lines: self.truncate_ragged_lines, n_rows: self.n_rows, encoding: self.encoding, - delimiter: self.delimiter, + separator: self.separator, schema: self.schema, rows_read: 0, _cat_lock, @@ -275,7 +275,7 @@ pub struct BatchedCsvReaderRead<'a> { truncate_ragged_lines: bool, n_rows: Option, encoding: CsvEncoding, - delimiter: u8, + separator: u8, schema: SchemaRef, rows_read: IdxSize, #[cfg(feature = 
"dtype-categorical")] @@ -330,7 +330,7 @@ impl<'a> BatchedCsvReaderRead<'a> { let stop_at_n_bytes = chunk.len(); let mut df = read_chunk( chunk, - self.delimiter, + self.separator, self.schema.as_ref(), self.ignore_errors, &self.projection, diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs index b7334619276b..3d9b43adc15c 100644 --- a/crates/polars-io/src/csv/read_impl/mod.rs +++ b/crates/polars-io/src/csv/read_impl/mod.rs @@ -110,7 +110,7 @@ pub(crate) struct CoreReader<'a> { encoding: CsvEncoding, n_threads: Option, has_header: bool, - delimiter: u8, + separator: u8, sample_size: usize, chunk_size: usize, low_memory: bool, @@ -191,7 +191,7 @@ impl<'a> CoreReader<'a> { mut skip_rows: usize, mut projection: Option>, max_records: Option, - delimiter: Option, + separator: Option, has_header: bool, ignore_errors: bool, schema: Option, @@ -228,7 +228,7 @@ impl<'a> CoreReader<'a> { } // check if schema should be inferred - let delimiter = delimiter.unwrap_or(b','); + let separator = separator.unwrap_or(b','); let mut schema = match schema { Some(schema) => schema, @@ -239,14 +239,14 @@ impl<'a> CoreReader<'a> { // again after decompression. #[cfg(any(feature = "decompress", feature = "decompress-fast"))] if let Some(b) = - decompress(&reader_bytes, n_rows, delimiter, quote_char, eol_char) + decompress(&reader_bytes, n_rows, separator, quote_char, eol_char) { reader_bytes = ReaderBytes::Owned(b); } let (inferred_schema, _, _) = infer_file_schema( &reader_bytes, - delimiter, + separator, max_records, has_header, schema_overwrite.as_deref(), @@ -300,7 +300,7 @@ impl<'a> CoreReader<'a> { encoding, n_threads, has_header, - delimiter, + separator, sample_size, chunk_size, low_memory, @@ -325,7 +325,7 @@ impl<'a> CoreReader<'a> { let starting_point_offset = bytes.as_ptr() as usize; // Skip all leading white space and the occasional utf8-bom - bytes = skip_whitespace_exclude(skip_bom(bytes), self.delimiter); + bytes = skip_whitespace_exclude(skip_bom(bytes), self.separator); // \n\n can be a empty string row of a single column // in other cases we skip it. if self.schema.len() > 1 { @@ -354,7 +354,7 @@ impl<'a> CoreReader<'a> { // we don't pass expected fields // as we want to skip all rows // no matter the no. 
of fields - _ => next_line_position(bytes, None, self.delimiter, self.quote_char, eol_char), + _ => next_line_position(bytes, None, self.separator, self.quote_char, eol_char), } .ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?; @@ -391,7 +391,7 @@ impl<'a> CoreReader<'a> { self.sample_size, self.eol_char, self.schema.len(), - self.delimiter, + self.separator, self.quote_char, ) { if logging { @@ -415,7 +415,7 @@ impl<'a> CoreReader<'a> { if let Some(pos) = next_line_position( &bytes[n_bytes..], Some(self.schema.len()), - self.delimiter, + self.separator, self.quote_char, self.eol_char, ) { @@ -471,7 +471,7 @@ impl<'a> CoreReader<'a> { bytes, n_file_chunks, self.schema.len(), - self.delimiter, + self.separator, self.quote_char, self.eol_char, ); @@ -569,7 +569,6 @@ impl<'a> CoreReader<'a> { file_chunks .into_par_iter() .map(|(bytes_offset_thread, stop_at_nbytes)| { - let delimiter = self.delimiter; let schema = self.schema.as_ref(); let ignore_errors = self.ignore_errors; let projection = &projection; @@ -599,7 +598,7 @@ impl<'a> CoreReader<'a> { read += parse_lines( local_bytes, offset, - delimiter, + self.separator, self.comment_char, self.quote_char, self.eol_char, @@ -665,7 +664,7 @@ impl<'a> CoreReader<'a> { .map(|(bytes_offset_thread, stop_at_nbytes)| { let mut df = read_chunk( bytes, - self.delimiter, + self.separator, self.schema.as_ref(), self.ignore_errors, &projection, @@ -717,7 +716,7 @@ impl<'a> CoreReader<'a> { parse_lines( remaining_bytes, 0, - self.delimiter, + self.separator, self.comment_char, self.quote_char, self.eol_char, @@ -795,7 +794,7 @@ fn update_string_stats( #[allow(clippy::too_many_arguments)] fn read_chunk( bytes: &[u8], - delimiter: u8, + separator: u8, schema: &Schema, ignore_errors: bool, projection: &[usize], @@ -836,7 +835,7 @@ fn read_chunk( read += parse_lines( local_bytes, offset, - delimiter, + separator, comment_char, quote_char, eol_char, diff --git a/crates/polars-io/src/csv/splitfields.rs b/crates/polars-io/src/csv/splitfields.rs index 7e00aefc53dd..1804cea8559e 100644 --- a/crates/polars-io/src/csv/splitfields.rs +++ b/crates/polars-io/src/csv/splitfields.rs @@ -4,7 +4,7 @@ mod inner { /// This exists solely because we cannot split the lines naively as pub(crate) struct SplitFields<'a> { v: &'a [u8], - delimiter: u8, + separator: u8, finished: bool, quote_char: u8, quoting: bool, @@ -14,13 +14,13 @@ mod inner { impl<'a> SplitFields<'a> { pub(crate) fn new( slice: &'a [u8], - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, ) -> Self { Self { v: slice, - delimiter, + separator, finished: false, quote_char: quote_char.unwrap_or(b'"'), quoting: quote_char.is_some(), @@ -44,7 +44,7 @@ mod inner { } fn eof_oel(&self, current_ch: u8) -> bool { - current_ch == self.delimiter || current_ch == self.eol_char + current_ch == self.separator || current_ch == self.eol_char } } @@ -59,7 +59,7 @@ mod inner { } let mut needs_escaping = false; - // There can be strings with delimiters: + // There can be strings with separators: // "Street, City", // Safety: @@ -157,33 +157,33 @@ mod inner { /// This exists solely because we cannot split the lines naively as pub(crate) struct SplitFields<'a> { pub v: &'a [u8], - delimiter: u8, + separator: u8, pub finished: bool, quote_char: u8, quoting: bool, eol_char: u8, - simd_delimiter: SimdVec, + simd_separator: SimdVec, simd_eol_char: SimdVec, } impl<'a> SplitFields<'a> { pub(crate) fn new( slice: &'a [u8], - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, ) -> Self { - let 
simd_delimiter = SimdVec::splat(delimiter); + let simd_separator = SimdVec::splat(separator); let simd_eol_char = SimdVec::splat(eol_char); Self { v: slice, - delimiter, + separator, finished: false, quote_char: quote_char.unwrap_or(b'"'), quoting: quote_char.is_some(), eol_char, - simd_delimiter, + simd_separator, simd_eol_char, } } @@ -204,7 +204,7 @@ mod inner { } fn eof_oel(&self, current_ch: u8) -> bool { - current_ch == self.delimiter || current_ch == self.eol_char + current_ch == self.separator || current_ch == self.eol_char } } @@ -219,7 +219,7 @@ mod inner { } let mut needs_escaping = false; - // There can be strings with delimiters: + // There can be strings with separators: // "Street, City", // Safety: @@ -279,8 +279,8 @@ mod inner { .unwrap_unchecked_release(); let simd_bytes = SimdVec::from(lane); let has_eol_char = simd_bytes.simd_eq(self.simd_eol_char); - let has_delimiter = simd_bytes.simd_eq(self.simd_delimiter); - let has_any = has_delimiter.bitor(has_eol_char); + let has_separator = simd_bytes.simd_eq(self.simd_separator); + let has_any = has_separator.bitor(has_eol_char); if has_any.any() { // soundness we can transmute because we have the same alignment let has_any = std::mem::transmute::< diff --git a/crates/polars-io/src/csv/utils.rs b/crates/polars-io/src/csv/utils.rs index e9aef318873a..ba8cc68ee63c 100644 --- a/crates/polars-io/src/csv/utils.rs +++ b/crates/polars-io/src/csv/utils.rs @@ -23,7 +23,7 @@ pub(crate) fn get_file_chunks( bytes: &[u8], n_chunks: usize, expected_fields: usize, - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, ) -> Vec<(usize, usize)> { @@ -41,7 +41,7 @@ pub(crate) fn get_file_chunks( let end_pos = match next_line_position( &bytes[search_pos..], Some(expected_fields), - delimiter, + separator, quote_char, eol_char, ) { @@ -134,7 +134,7 @@ pub(crate) fn parse_bytes_with_encoding( #[allow(clippy::too_many_arguments)] pub fn infer_file_schema_inner( reader_bytes: &ReaderBytes, - delimiter: u8, + separator: u8, max_read_rows: Option, has_header: bool, schema_overwrite: Option<&Schema>, @@ -199,7 +199,7 @@ pub fn infer_file_schema_inner( } } - let byterecord = SplitFields::new(header_line, delimiter, quote_char, eol_char); + let byterecord = SplitFields::new(header_line, separator, quote_char, eol_char); if has_header { let headers = byterecord .map(|(slice, needs_escaping)| { @@ -233,8 +233,8 @@ pub fn infer_file_schema_inner( .map(|(i, _s)| format!("column_{}", i + 1)) .collect(); // needed because SplitLines does not return the \n char, so SplitFields does not catch - // the latest value if ending with a delimiter. - if header_line.ends_with(&[delimiter]) { + // the latest value if ending with a separator. 
+ if header_line.ends_with(&[separator]) { column_names.push(format!("column_{}", column_names.len() + 1)) } column_names @@ -248,7 +248,7 @@ pub fn infer_file_schema_inner( return infer_file_schema_inner( &ReaderBytes::Owned(buf), - delimiter, + separator, max_read_rows, has_header, schema_overwrite, @@ -322,7 +322,7 @@ pub fn infer_file_schema_inner( } } - let mut record = SplitFields::new(line, delimiter, quote_char, eol_char); + let mut record = SplitFields::new(line, separator, quote_char, eol_char); for i in 0..header_length { if let Some((slice, needs_escaping)) = record.next() { @@ -434,7 +434,7 @@ pub fn infer_file_schema_inner( rb.push(eol_char); return infer_file_schema_inner( &ReaderBytes::Owned(rb), - delimiter, + separator, max_read_rows, has_header, schema_overwrite, @@ -465,7 +465,7 @@ pub fn infer_file_schema_inner( #[allow(clippy::too_many_arguments)] pub fn infer_file_schema( reader_bytes: &ReaderBytes, - delimiter: u8, + separator: u8, max_read_rows: Option, has_header: bool, schema_overwrite: Option<&Schema>, @@ -482,7 +482,7 @@ pub fn infer_file_schema( ) -> PolarsResult<(Schema, usize, usize)> { infer_file_schema_inner( reader_bytes, - delimiter, + separator, max_read_rows, has_header, schema_overwrite, @@ -516,7 +516,7 @@ pub fn is_compressed(bytes: &[u8]) -> bool { fn decompress_impl( decoder: &mut R, n_rows: Option, - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, ) -> Option> { @@ -548,7 +548,7 @@ fn decompress_impl( } // now that we have enough, we compute the number of fields (also takes embedding into account) expected_fields = - SplitFields::new(&out, delimiter, quote_char, eol_char).count(); + SplitFields::new(&out, separator, quote_char, eol_char).count(); break; } } @@ -561,7 +561,7 @@ fn decompress_impl( match next_line_position( &out[buf_pos + 1..], Some(expected_fields), - delimiter, + separator, quote_char, eol_char, ) { @@ -589,16 +589,16 @@ fn decompress_impl( pub(crate) fn decompress( bytes: &[u8], n_rows: Option, - delimiter: u8, + separator: u8, quote_char: Option, eol_char: u8, ) -> Option> { if bytes.starts_with(&GZIP) { let mut decoder = flate2::read::MultiGzDecoder::new(bytes); - decompress_impl(&mut decoder, n_rows, delimiter, quote_char, eol_char) + decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char) } else if bytes.starts_with(&ZLIB0) || bytes.starts_with(&ZLIB1) || bytes.starts_with(&ZLIB2) { let mut decoder = flate2::read::ZlibDecoder::new(bytes); - decompress_impl(&mut decoder, n_rows, delimiter, quote_char, eol_char) + decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char) } else { None } diff --git a/crates/polars-io/src/csv/write.rs b/crates/polars-io/src/csv/write.rs index f0db058c3855..0752cfac872b 100644 --- a/crates/polars-io/src/csv/write.rs +++ b/crates/polars-io/src/csv/write.rs @@ -9,7 +9,7 @@ pub enum QuoteStyle { /// This puts quotes around every field. Always. Always, /// This puts quotes around fields only when necessary. - // They are necessary when fields contain a quote, delimiter or record terminator. Quotes are also necessary when writing an empty record (which is indistinguishable from a record with one empty field). + // They are necessary when fields contain a quote, separator or record terminator. Quotes are also necessary when writing an empty record (which is indistinguishable from a record with one empty field). // This is the default. #[default] Necessary, @@ -69,9 +69,9 @@ where self } - /// Set the CSV file's column delimiter as a byte character. 
-    pub fn with_delimiter(mut self, delimiter: u8) -> Self {
-        self.options.delimiter = delimiter;
+    /// Set the CSV file's column separator as a byte character.
+    pub fn with_separator(mut self, separator: u8) -> Self {
+        self.options.separator = separator;
         self
     }
 
@@ -114,8 +114,8 @@ where
     }
 
     /// Set the single byte character used for quoting.
-    pub fn with_quoting_char(mut self, char: u8) -> Self {
-        self.options.quote = char;
+    pub fn with_quote_char(mut self, char: u8) -> Self {
+        self.options.quote_char = char;
         self
     }
 
diff --git a/crates/polars-io/src/csv/write_impl.rs b/crates/polars-io/src/csv/write_impl.rs
index 699bffbec1cf..edf15bc9c730 100644
--- a/crates/polars-io/src/csv/write_impl.rs
+++ b/crates/polars-io/src/csv/write_impl.rs
@@ -24,24 +24,24 @@ fn fmt_and_escape_str(f: &mut Vec<u8>, v: &str, options: &SerializeOptions) -> s
     if options.quote_style == QuoteStyle::Never {
         return write!(f, "{v}");
     }
-    let quote = options.quote as char;
+    let quote = options.quote_char as char;
     if v.is_empty() {
         return write!(f, "{quote}{quote}");
     }
-    let needs_escaping = memchr(options.quote, v.as_bytes()).is_some();
+    let needs_escaping = memchr(options.quote_char, v.as_bytes()).is_some();
     if needs_escaping {
         let replaced = unsafe {
             // Replace from single quote " to double quote "".
             v.replace(
-                std::str::from_utf8_unchecked(&[options.quote]),
-                std::str::from_utf8_unchecked(&[options.quote, options.quote]),
+                std::str::from_utf8_unchecked(&[options.quote_char]),
+                std::str::from_utf8_unchecked(&[options.quote_char, options.quote_char]),
             )
         };
         return write!(f, "{quote}{replaced}{quote}");
     }
     let surround_with_quotes = match options.quote_style {
         QuoteStyle::Always | QuoteStyle::NonNumeric => true,
-        QuoteStyle::Necessary => memchr2(options.delimiter, b'\n', v.as_bytes()).is_some(),
+        QuoteStyle::Necessary => memchr2(options.separator, b'\n', v.as_bytes()).is_some(),
         QuoteStyle::Never => false,
     };
 
@@ -86,7 +86,7 @@ unsafe fn write_anyvalue(
         },
         _ => {
             // Then we deal with the numeric types
-            let quote = options.quote as char;
+            let quote = options.quote_char as char;
 
             let mut end_with_quote = matches!(options.quote_style, QuoteStyle::Always);
             if end_with_quote {
@@ -238,10 +238,10 @@ pub struct SerializeOptions {
     pub datetime_format: Option<String>,
     /// Used for [`DataType::Float64`] and [`DataType::Float32`].
     pub float_precision: Option<usize>,
-    /// Used as separator/delimiter.
-    pub delimiter: u8,
+    /// Used as separator.
+    pub separator: u8,
     /// Quoting character.
-    pub quote: u8,
+    pub quote_char: u8,
     /// Null value representation.
     pub null: String,
     /// String appended after every row.
@@ -256,8 +256,8 @@ impl Default for SerializeOptions {
             time_format: None,
             datetime_format: None,
             float_precision: None,
-            delimiter: b',',
-            quote: b'"',
+            separator: b',',
+            quote_char: b'"',
             null: String::new(),
             line_terminator: "\n".into(),
             quote_style: Default::default(),
@@ -302,10 +302,10 @@ pub(crate) fn write(
     // Check that the double quote is valid UTF-8.
polars_ensure!( - std::str::from_utf8(&[options.quote, options.quote]).is_ok(), + std::str::from_utf8(&[options.quote_char, options.quote_char]).is_ok(), ComputeError: "quote char results in invalid utf-8", ); - let delimiter = char::from(options.delimiter); + let separator = char::from(options.separator); let (datetime_formats, time_zones): (Vec<&str>, Vec>) = df .get_columns() @@ -439,7 +439,7 @@ pub(crate) fn write( } let current_ptr = col as *const SeriesIter; if current_ptr != last_ptr { - write!(&mut write_buffer, "{delimiter}").unwrap() + write!(&mut write_buffer, "{separator}").unwrap() } } if !finished { @@ -488,7 +488,7 @@ pub(crate) fn write_header( } writer.write_all( escaped_names - .join(std::str::from_utf8(&[options.delimiter]).unwrap()) + .join(std::str::from_utf8(&[options.separator]).unwrap()) .as_bytes(), )?; writer.write_all(options.line_terminator.as_bytes())?; diff --git a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs index 80b2b2e3aa95..8542432b31bc 100644 --- a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs +++ b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs @@ -29,7 +29,7 @@ impl CsvExec { .unwrap() .has_header(self.options.has_header) .with_dtypes(Some(self.schema.clone())) - .with_delimiter(self.options.delimiter) + .with_separator(self.options.separator) .with_ignore_errors(self.options.ignore_errors) .with_skip_rows(self.options.skip_rows) .with_n_rows(n_rows) diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 54360f045f11..f044c96beab8 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -13,7 +13,7 @@ use crate::prelude::*; #[cfg(feature = "csv")] pub struct LazyCsvReader<'a> { path: PathBuf, - delimiter: u8, + separator: u8, has_header: bool, ignore_errors: bool, skip_rows: usize, @@ -42,7 +42,7 @@ impl<'a> LazyCsvReader<'a> { pub fn new(path: impl AsRef) -> Self { LazyCsvReader { path: path.as_ref().to_owned(), - delimiter: b',', + separator: b',', has_header: true, ignore_errors: false, skip_rows: 0, @@ -134,10 +134,10 @@ impl<'a> LazyCsvReader<'a> { self } - /// Set the CSV file's column delimiter as a byte character + /// Set the CSV file's column separator as a byte character #[must_use] - pub fn with_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = delimiter; + pub fn with_separator(mut self, separator: u8) -> Self { + self.separator = separator; self } @@ -239,7 +239,7 @@ impl<'a> LazyCsvReader<'a> { let (schema, _, _) = infer_file_schema( &reader_bytes, - self.delimiter, + self.separator, self.infer_schema_length, self.has_header, // we set it to None and modify them after the schema is updated @@ -270,7 +270,7 @@ impl LazyFileListReader for LazyCsvReader<'_> { fn finish_no_glob(self) -> PolarsResult { let mut lf: LazyFrame = LogicalPlanBuilder::scan_csv( self.path, - self.delimiter, + self.separator, self.has_header, self.ignore_errors, self.skip_rows, diff --git a/crates/polars-pipe/src/executors/sinks/file_sink.rs b/crates/polars-pipe/src/executors/sinks/file_sink.rs index 5c5ff46acbb9..67dea31c355b 100644 --- a/crates/polars-pipe/src/executors/sinks/file_sink.rs +++ b/crates/polars-pipe/src/executors/sinks/file_sink.rs @@ -182,9 +182,9 @@ impl CsvSink { let file = std::fs::File::create(path)?; let writer = CsvWriter::new(file) .has_header(options.has_header) - .with_delimiter(options.serialize_options.delimiter) + 
.with_separator(options.serialize_options.separator) .with_line_terminator(options.serialize_options.line_terminator) - .with_quoting_char(options.serialize_options.quote) + .with_quote_char(options.serialize_options.quote_char) .with_batch_size(options.batch_size) .with_datetime_format(options.serialize_options.datetime_format) .with_date_format(options.serialize_options.date_format) diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 1053ff1d236c..b1297c39e07c 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -63,7 +63,7 @@ impl CsvSource { .unwrap() .has_header(options.has_header) .with_dtypes(Some(self.schema.clone())) - .with_delimiter(options.delimiter) + .with_separator(options.separator) .with_ignore_errors(options.ignore_errors) .with_skip_rows(options.skip_rows) .with_n_rows(n_rows) diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs index 0194d4fb19a1..4327a8755902 100644 --- a/crates/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -265,7 +265,7 @@ impl LogicalPlanBuilder { #[cfg(feature = "csv")] pub fn scan_csv>( path: P, - delimiter: u8, + separator: u8, has_header: bool, ignore_errors: bool, mut skip_rows: usize, @@ -314,7 +314,7 @@ impl LogicalPlanBuilder { // this needs a way to estimated bytes/rows. let (mut inferred_schema, rows_read, bytes_read) = infer_file_schema( &reader_bytes, - delimiter, + separator, infer_schema_length, has_header, schema_overwrite, @@ -368,7 +368,7 @@ impl LogicalPlanBuilder { scan_type: FileScan::Csv { options: CsvParserOptions { has_header, - delimiter, + separator, ignore_errors, skip_rows, low_memory, diff --git a/crates/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs index 40e2d7ec0262..795107b49cf0 100644 --- a/crates/polars-plan/src/logical_plan/options.rs +++ b/crates/polars-plan/src/logical_plan/options.rs @@ -24,7 +24,7 @@ pub type FileCount = u32; #[derive(Clone, Debug, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct CsvParserOptions { - pub delimiter: u8, + pub separator: u8, pub comment_char: Option, pub quote_char: Option, pub eol_char: u8, diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index 0643f972fa92..8e2c91f050a5 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -577,7 +577,7 @@ //! // write DataFrame to file //! CsvWriter::new(&mut file) //! .has_header(true) -//! .with_delimiter(b',') +//! .with_separator(b',') //! .finish(df); //! # Ok(()) //! # } diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index 44b536914ce1..4b737fbc027d 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -114,7 +114,7 @@ //! //! let df = LazyCsvReader::new("reddit.csv") //! .has_header(true) -//! .with_delimiter(b',') +//! .with_separator(b',') //! .finish()? //! .group_by([col("comment_karma")]) //! 
.agg([col("name").n_unique().alias("unique_names"), col("link_karma").max()]) diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 9df2115ed8d8..710ad4712ae7 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -153,7 +153,7 @@ fn test_tab_sep() { let file = Cursor::new(csv); let df = CsvReader::new(file) .infer_schema(Some(100)) - .with_delimiter(b'\t') + .with_separator(b'\t') .has_header(false) .with_ignore_errors(true) .finish() @@ -472,7 +472,7 @@ fn test_skip_rows() -> PolarsResult<()> { let df = CsvReader::new(file) .has_header(false) .with_skip_rows(3) - .with_delimiter(b' ') + .with_separator(b' ') .finish()?; dbg!(&df); @@ -491,7 +491,7 @@ fn test_projection_idx() -> PolarsResult<()> { let df = CsvReader::new(file) .has_header(false) .with_projection(Some(vec![4, 5])) - .with_delimiter(b' ') + .with_separator(b' ') .finish()?; assert_eq!(df.width(), 2); @@ -501,7 +501,7 @@ fn test_projection_idx() -> PolarsResult<()> { let out = CsvReader::new(file) .has_header(false) .with_projection(Some(vec![4, 6])) - .with_delimiter(b' ') + .with_separator(b' ') .finish(); assert!(out.is_err()); @@ -788,7 +788,7 @@ fn test_infer_schema_eol() -> PolarsResult<()> { } #[test] -fn test_whitespace_delimiters() -> PolarsResult<()> { +fn test_whitespace_separators() -> PolarsResult<()> { let tsv = "\ta\tb\tc\n1\ta1\tb1\tc1\n2\ta2\tb2\tc2\n".to_string(); let contents = vec![ @@ -799,7 +799,7 @@ fn test_whitespace_delimiters() -> PolarsResult<()> { for (content, sep) in contents { let file = Cursor::new(&content); - let df = CsvReader::new(file).with_delimiter(sep).finish()?; + let df = CsvReader::new(file).with_separator(sep).finish()?; assert_eq!(df.shape(), (2, 4)); assert_eq!(df.get_column_names(), &["", "a", "b", "c"]); @@ -828,7 +828,7 @@ fn test_tsv_header_offset() -> PolarsResult<()> { let file = Cursor::new(csv); let df = CsvReader::new(file) .truncate_ragged_lines(true) - .with_delimiter(b'\t') + .with_separator(b'\t') .finish()?; assert_eq!(df.shape(), (3, 2)); @@ -859,7 +859,7 @@ fn test_null_values_infer_schema() -> PolarsResult<()> { fn test_comma_separated_field_in_tsv() -> PolarsResult<()> { let csv = "first\tsecond\n1\t2.3,2.4\n3\t4.5,4.6\n"; let file = Cursor::new(csv); - let df = CsvReader::new(file).with_delimiter(b'\t').finish()?; + let df = CsvReader::new(file).with_separator(b'\t').finish()?; assert_eq!(df.dtypes(), &[DataType::Int64, DataType::Utf8]); Ok(()) } @@ -1096,7 +1096,7 @@ fn test_try_parse_dates_3380() -> PolarsResult<()> { 46.685;7.953;2022-05-10T08:07:12Z;8.8;0.00"; let file = Cursor::new(csv); let df = CsvReader::new(file) - .with_delimiter(b';') + .with_separator(b';') .with_try_parse_dates(true) .finish()?; assert_eq!(df.column("validdate")?.null_count(), 0); diff --git a/docs/src/rust/getting-started/reading-writing.rs b/docs/src/rust/getting-started/reading-writing.rs index 4fe035d34f82..54b538ad93d0 100644 --- a/docs/src/rust/getting-started/reading-writing.rs +++ b/docs/src/rust/getting-started/reading-writing.rs @@ -22,7 +22,7 @@ fn main() -> Result<(), Box> { let mut file = File::create("docs/data/output.csv").expect("could not create file"); CsvWriter::new(&mut file) .has_header(true) - .with_delimiter(b',') + .with_separator(b',') .finish(&mut df); let df_csv = CsvReader::from_path("docs/data/output.csv")? 
.infer_schema(None) @@ -35,7 +35,7 @@ fn main() -> Result<(), Box> { let mut file = File::create("docs/data/output.csv").expect("could not create file"); CsvWriter::new(&mut file) .has_header(true) - .with_delimiter(b',') + .with_separator(b',') .finish(&mut df); let df_csv = CsvReader::from_path("docs/data/output.csv")? .infer_schema(None) diff --git a/examples/read_csv/src/main.rs b/examples/read_csv/src/main.rs index 7d1f555c8397..ca4fbbd7730c 100644 --- a/examples/read_csv/src/main.rs +++ b/examples/read_csv/src/main.rs @@ -6,7 +6,7 @@ fn main() -> PolarsResult<()> { .unwrap(); let file = Box::new(file) as Box; let _df = CsvReader::new(file) - .with_delimiter(b'|') + .with_separator(b'|') .has_header(false) .with_chunk_size(10) .batched_mmap(None) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index bec784afa305..e11318671575 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -55,6 +55,7 @@ from polars.exceptions import NoRowsReturnedError, TooManyRowsReturnedError from polars.functions import col, lit from polars.io._utils import _is_glob_pattern, _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte from polars.io.spreadsheet._write_utils import ( _unpack_multi_column_dict, _xl_apply_conditional_formats, @@ -657,7 +658,7 @@ def _read_csv( columns: Sequence[int] | Sequence[str] | None = None, separator: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, schema: None | SchemaDict = None, @@ -2443,7 +2444,7 @@ def write_csv( has_header: bool = ..., separator: str = ..., line_terminator: str = ..., - quote: str = ..., + quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., @@ -2462,7 +2463,7 @@ def write_csv( has_header: bool = ..., separator: str = ..., line_terminator: str = ..., - quote: str = ..., + quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., @@ -2473,6 +2474,7 @@ def write_csv( ) -> None: ... + @deprecate_renamed_parameter("quote", "quote_char", version="0.19.8") def write_csv( self, file: BytesIO | TextIOWrapper | str | Path | None = None, @@ -2480,7 +2482,7 @@ def write_csv( has_header: bool = True, separator: str = ",", line_terminator: str = "\n", - quote: str = '"', + quote_char: str = '"', batch_size: int = 1024, datetime_format: str | None = None, date_format: str | None = None, @@ -2503,7 +2505,7 @@ def write_csv( Separate CSV fields with this symbol. line_terminator String used to end each row. - quote + quote_char Byte to use as quoting character. batch_size Number of rows that will be processed per thread. @@ -2530,7 +2532,7 @@ def write_csv( Determines the quoting strategy used. - necessary (default): This puts quotes around fields only when necessary. They are necessary when fields contain a quote, - delimiter or record terminator. + separator or record terminator. Quotes are also necessary when writing an empty record (which is indistinguishable from a record with one empty field). This is the default. 
@@ -2558,10 +2560,8 @@ def write_csv( >>> df.write_csv(path, separator=",") """ - if len(separator) != 1: - raise ValueError("only single byte separator is allowed") - if len(quote) != 1: - raise ValueError("only single byte quote char is allowed") + _check_arg_is_1byte("separator", separator, can_be_empty=False) + _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True) if not null_value: null_value = None @@ -2579,7 +2579,7 @@ def write_csv( has_header, ord(separator), line_terminator, - ord(quote), + ord(quote_char), batch_size, datetime_format, date_format, diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 8d32415de850..db787162aed1 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -33,7 +33,7 @@ def __init__( columns: Sequence[int] | Sequence[str] | None = None, separator: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, null_values: str | Sequence[str] | dict[str, str] | None = None, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index fd4cc0cf654f..95c006d093ed 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -25,7 +25,7 @@ def read_csv( new_columns: Sequence[str] | None = None, separator: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, schema: SchemaDict | None = None, @@ -50,7 +50,7 @@ def read_csv( raise_if_empty: bool = True, truncate_ragged_lines: bool = False, ) -> DataFrame: - """ + r""" Read a CSV file into a DataFrame. Parameters @@ -73,7 +73,7 @@ def read_csv( list is shorter than the width of the DataFrame the remaining columns will have their original name. separator - Single byte character to use as delimiter in the file. + Single byte character to use as separator in the file. comment_char Single byte character that indicates the start of a comment line, for instance ``#``. @@ -159,7 +159,9 @@ def read_csv( Set the sample size. This is used to sample statistics to estimate the allocation needed. eol_char - Single byte end of line character. + Single byte end of line character (default: `\n`). When encountering a file + with windows line endings (`\r\n`), one can go with the default `\n`. The extra + `\r` will be removed when processed. raise_if_empty When there is no data in the source,``NoDataError`` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. @@ -404,7 +406,7 @@ def read_csv_batched( new_columns: Sequence[str] | None = None, separator: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, @@ -425,7 +427,7 @@ def read_csv_batched( eol_char: str = "\n", raise_if_empty: bool = True, ) -> BatchedCsvReader: - """ + r""" Read a CSV file in batches. Upon creation of the ``BatchedCsvReader``, Polars will gather statistics and @@ -452,7 +454,7 @@ def read_csv_batched( list is shorter than the width of the DataFrame the remaining columns will have their original name. 
separator - Single byte character to use as delimiter in the file. + Single byte character to use as separator in the file. comment_char Single byte character that indicates the start of a comment line, for instance ``#``. @@ -517,7 +519,9 @@ def read_csv_batched( Set the sample size. This is used to sample statistics to estimate the allocation needed. eol_char - Single byte end of line character. + Single byte end of line character (default: `\n`). When encountering a file + with windows line endings (`\r\n`), one can go with the default `\n`. The extra + `\r` will be removed when processed. raise_if_empty When there is no data in the source,``NoDataError`` is raised. If this parameter is set to False, ``None`` will be returned from ``next_batches(n)`` instead. @@ -533,7 +537,9 @@ def read_csv_batched( Examples -------- >>> reader = pl.read_csv_batched( - ... "./tpch/tables_scale_100/lineitem.tbl", separator="|", try_parse_dates=True + ... "./tpch/tables_scale_100/lineitem.tbl", + ... separator="|", + ... try_parse_dates=True, ... ) # doctest: +SKIP >>> batches = reader.next_batches(5) # doctest: +SKIP >>> for df in batches: # doctest: +SKIP @@ -694,7 +700,7 @@ def scan_csv( has_header: bool = True, separator: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: SchemaDict | Sequence[PolarsDataType] | None = None, schema: SchemaDict | None = None, @@ -717,7 +723,7 @@ def scan_csv( raise_if_empty: bool = True, truncate_ragged_lines: bool = False, ) -> LazyFrame: - """ + r""" Lazily read from a CSV file or multiple files via glob patterns. This allows the query optimizer to push down predicates and @@ -734,7 +740,7 @@ def scan_csv( following format: ``column_x``, with ``x`` being an enumeration over every column in the dataset starting at 1. separator - Single byte character to use as delimiter in the file. + Single byte character to use as separator in the file. comment_char Single byte character that indicates the start of a comment line, for instance ``#``. @@ -796,7 +802,9 @@ def scan_csv( can be inferred, as well as a handful of others. If this does not succeed, the column remains of data type ``pl.Utf8``. eol_char - Single byte end of line character + Single byte end of line character (default: `\n`). When encountering a file + with windows line endings (`\r\n`), one can go with the default `\n`. The extra + `\r` will be removed when processed. new_columns Provide an explicit list of string column names to use (for example, when scanning a headerless CSV file). 
If the given list is shorter than the width of diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 9fe0e7893b59..1cef1bfd50f3 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -46,6 +46,7 @@ ) from polars.dependencies import dataframe_api_compat, subprocess from polars.io._utils import _is_local_file, _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec from polars.lazyframe.group_by import LazyGroupBy @@ -316,7 +317,7 @@ def _scan_csv( has_header: bool = True, separator: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: SchemaDict | None = None, schema: SchemaDict | None = None, @@ -2026,6 +2027,7 @@ def sink_ipc( maintain_order=maintain_order, ) + @deprecate_renamed_parameter("quote", "quote_char", version="0.19.8") def sink_csv( self, path: str | Path, @@ -2033,7 +2035,7 @@ def sink_csv( has_header: bool = True, separator: str = ",", line_terminator: str = "\n", - quote: str = '"', + quote_char: str = '"', batch_size: int = 1024, datetime_format: str | None = None, date_format: str | None = None, @@ -2064,7 +2066,7 @@ def sink_csv( Separate CSV fields with this symbol. line_terminator String used to end each row. - quote + quote_char Byte to use as quoting character. batch_size Number of rows that will be processed per thread. @@ -2097,7 +2099,8 @@ def sink_csv( This is the default. - always: This puts quotes around every field. Always. - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the separator). + invalid CSV data (e.g.: by not quoting strings containing the + separator). - non_numeric: This puts quotes around all fields that are non-numeric. 
Namely, when writing a field that does not parse as a valid float or integer, then quotes will be used even if they aren`t strictly @@ -2128,10 +2131,8 @@ def sink_csv( >>> lf.sink_csv("out.csv") # doctest: +SKIP """ - if len(separator) != 1: - raise ValueError("only single byte separator is allowed") - if len(quote) != 1: - raise ValueError("only single byte quote char is allowed") + _check_arg_is_1byte("separator", separator, can_be_empty=False) + _check_arg_is_1byte("quote_char", quote_char, can_be_empty=False) if not null_value: null_value = None @@ -2149,7 +2150,7 @@ def sink_csv( has_header=has_header, separator=ord(separator), line_terminator=line_terminator, - quote=ord(quote), + quote_char=ord(quote_char), batch_size=batch_size, datetime_format=datetime_format, date_format=date_format, diff --git a/py-polars/src/batched_csv.rs b/py-polars/src/batched_csv.rs index 6114fb43a675..63482f5e1322 100644 --- a/py-polars/src/batched_csv.rs +++ b/py-polars/src/batched_csv.rs @@ -99,7 +99,7 @@ impl PyBatchedCsv { .infer_schema(infer_schema_length) .has_header(has_header) .with_n_rows(n_rows) - .with_delimiter(separator.as_bytes()[0]) + .with_separator(separator.as_bytes()[0]) .with_skip_rows(skip_rows) .with_ignore_errors(ignore_errors) .with_projection(projection) diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 6a3e24a51340..c94b48a025b7 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -229,7 +229,7 @@ impl PyDataFrame { .infer_schema(infer_schema_length) .has_header(has_header) .with_n_rows(n_rows) - .with_delimiter(separator.as_bytes()[0]) + .with_separator(separator.as_bytes()[0]) .with_skip_rows(skip_rows) .with_ignore_errors(ignore_errors) .with_projection(projection) @@ -589,7 +589,7 @@ impl PyDataFrame { has_header: bool, separator: u8, line_terminator: String, - quote: u8, + quote_char: u8, batch_size: usize, datetime_format: Option, date_format: Option, @@ -606,9 +606,9 @@ impl PyDataFrame { // No need for a buffered writer, because the csv writer does internal buffering. 
CsvWriter::new(f) .has_header(has_header) - .with_delimiter(separator) + .with_separator(separator) .with_line_terminator(line_terminator) - .with_quoting_char(quote) + .with_quote_char(quote_char) .with_batch_size(batch_size) .with_datetime_format(datetime_format) .with_date_format(date_format) @@ -623,9 +623,9 @@ impl PyDataFrame { let mut buf = get_file_like(py_f, true)?; CsvWriter::new(&mut buf) .has_header(has_header) - .with_delimiter(separator) + .with_separator(separator) .with_line_terminator(line_terminator) - .with_quoting_char(quote) + .with_quote_char(quote_char) .with_batch_size(batch_size) .with_datetime_format(datetime_format) .with_date_format(date_format) diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index a60feef70a58..d2f0bd3465d6 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -173,7 +173,7 @@ impl PyLazyFrame { let null_values = null_values.map(|w| w.0); let comment_char = comment_char.map(|s| s.as_bytes()[0]); let quote_char = quote_char.map(|s| s.as_bytes()[0]); - let delimiter = separator.as_bytes()[0]; + let separator = separator.as_bytes()[0]; let eol_char = eol_char.as_bytes()[0]; let row_count = row_count.map(|(name, offset)| RowCount { name, offset }); @@ -185,7 +185,7 @@ impl PyLazyFrame { }); let mut r = LazyCsvReader::new(path) .with_infer_schema_length(infer_schema_length) - .with_delimiter(delimiter) + .with_separator(separator) .has_header(has_header) .with_ignore_errors(ignore_errors) .with_skip_rows(skip_rows) @@ -542,7 +542,7 @@ impl PyLazyFrame { #[allow(clippy::too_many_arguments)] #[cfg(all(feature = "streaming", feature = "csv"))] - #[pyo3(signature = (path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style, maintain_order))] + #[pyo3(signature = (path, has_header, separator, line_terminator, quote_char, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style, maintain_order))] fn sink_csv( &self, py: Python, @@ -550,7 +550,7 @@ impl PyLazyFrame { has_header: bool, separator: u8, line_terminator: String, - quote: u8, + quote_char: u8, batch_size: usize, datetime_format: Option, date_format: Option, @@ -568,8 +568,8 @@ impl PyLazyFrame { time_format, datetime_format, float_precision, - delimiter: separator, - quote, + separator, + quote_char, null: null_value, line_terminator, quote_style, diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 0afa8b30b3cd..987edbf95841 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -584,7 +584,7 @@ def test_csv_quote_char() -> None: # non-standard quote char df = pl.DataFrame({"x": ["", "0*0", "xyz"]}) - csv_data = df.write_csv(quote="*") + csv_data = df.write_csv(quote_char="*") assert csv_data == "x\n**\n*0**0*\nxyz\n" assert_frame_equal(df, pl.read_csv(io.StringIO(csv_data), quote_char="*")) @@ -722,7 +722,7 @@ def test_empty_string_missing_round_trip() -> None: assert_frame_equal(df, df_read) -def test_write_csv_delimiter() -> None: +def test_write_csv_separator() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) f = io.BytesIO() df.write_csv(f, separator="\t") @@ -867,7 +867,7 @@ def test_glob_csv(df_no_lists: pl.DataFrame, tmp_path: Path) -> None: assert pl.read_csv(path_glob).shape == (3, 11) -def test_csv_whitespace_delimiter_at_start_do_not_skip() -> None: +def test_csv_whitespace_separator_at_start_do_not_skip() -> None: csv = 
"\t\t\t\t0\t1" assert pl.read_csv(csv.encode(), separator="\t", has_header=False).to_dict( False @@ -881,7 +881,7 @@ def test_csv_whitespace_delimiter_at_start_do_not_skip() -> None: } -def test_csv_whitespace_delimiter_at_end_do_not_skip() -> None: +def test_csv_whitespace_separator_at_end_do_not_skip() -> None: csv = "0\t1\t\t\t\t" assert pl.read_csv(csv.encode(), separator="\t", has_header=False).to_dict( False @@ -1504,7 +1504,9 @@ class TemporalFormats(TypedDict): "2.0,a,bc,2,false,,2077-07-05T03:01:00,03:01:00\n" ',"hello,3,,2077-07-05,2077-07-05T03:01:00,\n' ) - assert df.write_csv(quote_style="non_numeric", quote="8", **temporal_formats) == ( + assert df.write_csv( + quote_style="non_numeric", quote_char="8", **temporal_formats + ) == ( "8float8,8string8,8int8,8bool8,8date8,8datetime8,8time8\n" "1.0,8a8,1,8true8,82077-07-058,,803:01:008\n" "2.0,8a,bc8,2,8false8,,82077-07-05T03:01:008,803:01:008\n" diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index 304041f2f259..3f0ea1f77a45 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -128,7 +128,7 @@ def test_sink_csv_with_options() -> None: has_header=False, separator=";", line_terminator="|", - quote="$", + quote_char="$", batch_size=42, datetime_format="%Y", date_format="%d", @@ -144,7 +144,7 @@ def test_sink_csv_with_options() -> None: has_header=False, separator=ord(";"), line_terminator="|", - quote=ord("$"), + quote_char=ord("$"), batch_size=42, datetime_format="%Y", date_format="%d", @@ -159,15 +159,15 @@ def test_sink_csv_with_options() -> None: @pytest.mark.parametrize(("value"), ["abc", ""]) def test_sink_csv_exception_for_separator(value: str) -> None: df = pl.LazyFrame({"dummy": ["abc"]}) - with pytest.raises(ValueError, match="only single byte separator is allowed"): + with pytest.raises(ValueError, match="should be a single byte character, but is"): df.sink_csv("path", separator=value) @pytest.mark.parametrize(("value"), ["abc", ""]) def test_sink_csv_exception_for_quote(value: str) -> None: df = pl.LazyFrame({"dummy": ["abc"]}) - with pytest.raises(ValueError, match="only single byte quote char is allowed"): - df.sink_csv("path", quote=value) + with pytest.raises(ValueError, match="should be a single byte character, but is"): + df.sink_csv("path", quote_char=value) def test_scan_csv_only_header_10792(io_files_path: Path) -> None: