Skip to content

Commit

Permalink
fix: Ignore quoted newlines when skipping lines in CSV (pola-rs#19543)
Browse files Browse the repository at this point in the history
  • Loading branch information
nameexhaustion authored Oct 31, 2024
1 parent 569d5b7 commit 279c4b1
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 15 deletions.
38 changes: 23 additions & 15 deletions crates/polars-io/src/csv/read/read_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ use rayon::prelude::*;
use super::buffer::init_buffers;
use super::options::{CommentPrefix, CsvEncoding, NullValues, NullValuesCompiled};
use super::parser::{
is_comment_line, next_line_position, next_line_position_naive, parse_lines, skip_bom,
skip_line_ending, skip_this_line, CountLines,
is_comment_line, parse_lines, skip_bom, skip_line_ending, skip_this_line, CountLines,
SplitLines,
};
use super::schema_inference::{check_decimal_comma, infer_file_schema};
#[cfg(any(feature = "decompress", feature = "decompress-fast"))]
Expand Down Expand Up @@ -283,11 +283,19 @@ impl<'a> CoreReader<'a> {

// skip 'n' leading rows
if self.skip_rows_before_header > 0 {
let mut split_lines = SplitLines::new(bytes, quote_char, eol_char);
let mut current_line = &bytes[..0];

for _ in 0..self.skip_rows_before_header {
let pos = next_line_position_naive(bytes, eol_char)
current_line = split_lines
.next()
.ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?;
bytes = &bytes[pos..];
}

current_line = split_lines
.next()
.unwrap_or(&current_line[current_line.len()..]);
bytes = &bytes[current_line.as_ptr() as usize - bytes.as_ptr() as usize..];
}

// skip lines that are comments
Expand All @@ -301,19 +309,19 @@ impl<'a> CoreReader<'a> {
}
// skip 'n' rows following the header
if self.skip_rows_after_header > 0 {
for _ in 0..self.skip_rows_after_header {
let pos = if is_comment_line(bytes, self.comment_prefix.as_ref()) {
next_line_position_naive(bytes, eol_char)
} else {
// we don't pass expected fields
// as we want to skip all rows
// no matter the no. of fields
next_line_position(bytes, None, self.separator, self.quote_char, eol_char)
}
.ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?;
let mut split_lines = SplitLines::new(bytes, quote_char, eol_char);
let mut current_line = &bytes[..0];

bytes = &bytes[pos..];
for _ in 0..self.skip_rows_after_header {
current_line = split_lines
.next()
.ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?;
}

current_line = split_lines
.next()
.unwrap_or(&current_line[current_line.len()..]);
bytes = &bytes[current_line.as_ptr() as usize - bytes.as_ptr() as usize..];
}

let starting_point_offset = if bytes.is_empty() {
Expand Down
15 changes: 15 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2307,3 +2307,18 @@ def test_csv_double_new_line() -> None:
"column_2": ["b", None],
"column_3": ["c", None],
}


def test_csv_quoted_newlines_skip_rows_19535() -> None:
    """Check that ``skip_rows`` does not split a quoted field at its newline.

    The input's first record is a single quoted field containing an embedded
    line feed, so it spans two physical lines. With ``skip_rows=1`` the
    assertion expects only the value ``0`` to remain, in a column renamed
    to ``x``. (The ``19535`` suffix is presumably the upstream issue number.)
    """
    # The `\n` escape below yields a real line-feed byte inside the quotes,
    # so the record `"a<LF>b"` occupies two physical lines of the input.
    csv_payload = b"""\
"a\nb"
0
"""

    # Read without a header row, skip the first record, and rename the column.
    actual = pl.read_csv(
        csv_payload,
        has_header=False,
        skip_rows=1,
        new_columns=["x"],
    )

    expected = pl.DataFrame({"x": 0})
    assert_frame_equal(actual, expected)

0 comments on commit 279c4b1

Please sign in to comment.