diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index 52f29ee0a128..7d2aa5d2455a 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -15,8 +15,8 @@ use rayon::prelude::*; use super::buffer::init_buffers; use super::options::{CommentPrefix, CsvEncoding, NullValues, NullValuesCompiled}; use super::parser::{ - is_comment_line, next_line_position, next_line_position_naive, parse_lines, skip_bom, - skip_line_ending, skip_this_line, CountLines, + is_comment_line, parse_lines, skip_bom, skip_line_ending, skip_this_line, CountLines, + SplitLines, }; use super::schema_inference::{check_decimal_comma, infer_file_schema}; #[cfg(any(feature = "decompress", feature = "decompress-fast"))] @@ -283,11 +283,19 @@ impl<'a> CoreReader<'a> { // skip 'n' leading rows if self.skip_rows_before_header > 0 { + let mut split_lines = SplitLines::new(bytes, quote_char, eol_char); + let mut current_line = &bytes[..0]; + for _ in 0..self.skip_rows_before_header { - let pos = next_line_position_naive(bytes, eol_char) + current_line = split_lines + .next() .ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?; - bytes = &bytes[pos..]; } + + current_line = split_lines + .next() + .unwrap_or(&current_line[current_line.len()..]); + bytes = &bytes[current_line.as_ptr() as usize - bytes.as_ptr() as usize..]; } // skip lines that are comments @@ -301,19 +309,19 @@ impl<'a> CoreReader<'a> { } // skip 'n' rows following the header if self.skip_rows_after_header > 0 { - for _ in 0..self.skip_rows_after_header { - let pos = if is_comment_line(bytes, self.comment_prefix.as_ref()) { - next_line_position_naive(bytes, eol_char) - } else { - // we don't pass expected fields - // as we want to skip all rows - // no matter the no. 
of fields - next_line_position(bytes, None, self.separator, self.quote_char, eol_char) - } - .ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?; + let mut split_lines = SplitLines::new(bytes, quote_char, eol_char); + let mut current_line = &bytes[..0]; - bytes = &bytes[pos..]; + for _ in 0..self.skip_rows_after_header { + current_line = split_lines + .next() + .ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?; } + + current_line = split_lines + .next() + .unwrap_or(&current_line[current_line.len()..]); + bytes = &bytes[current_line.as_ptr() as usize - bytes.as_ptr() as usize..]; } let starting_point_offset = if bytes.is_empty() { diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 22df34dad668..8d5a49bc536d 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -2307,3 +2307,18 @@ def test_csv_double_new_line() -> None: "column_2": ["b", None], "column_3": ["c", None], } + + +def test_csv_quoted_newlines_skip_rows_19535() -> None: + assert_frame_equal( + pl.read_csv( + b"""\ +"a\nb" +0 +""", + has_header=False, + skip_rows=1, + new_columns=["x"], + ), + pl.DataFrame({"x": 0}), + )