From d6c3296b3a8acad72361a477da87dad2ff3f6245 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Tue, 22 Aug 2023 00:04:56 +0800 Subject: [PATCH] chore(rust): replace lexical by atoi to parse integer --- crates/polars-io/Cargo.toml | 6 +++--- crates/polars-io/src/csv/buffer.rs | 8 ++++---- py-polars/tests/unit/io/test_csv.py | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 5587fbb55ef3a..160ca571d7ffe 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -18,6 +18,7 @@ polars-utils = { version = "0.32.0", path = "../polars-utils" } ahash = { workspace = true } arrow = { workspace = true } +atoi = { workspace = true, optional = true} async-trait = { version = "0.1.59", optional = true } bytes = { version = "1.3" } chrono = { workspace = true, optional = true } @@ -25,7 +26,6 @@ chrono-tz = { workspace = true, optional = true } fast-float = { version = "0.2", optional = true } flate2 = { version = "1", features = ["zlib-ng"], optional = true, default-features = false } futures = { workspace = true, optional = true } -lexical = { version = "6", optional = true, default-features = false, features = ["std", "parse-integers"] } lexical-core = { version = "0.8", optional = true } memchr = { workspace = true } memmap = { package = "memmap2", version = "0.7", optional = true } @@ -51,10 +51,10 @@ tempdir = "0.3.7" # support for arrows json parsing json = [ "arrow/io_json_write", + "atoi", "polars-json", "simd-json", "memmap", - "lexical", "lexical-core", "serde_json", "dtype-struct", @@ -65,7 +65,7 @@ ipc = ["arrow/io_ipc", "arrow/io_ipc_compression", "memmap"] ipc_streaming = ["arrow/io_ipc", "arrow/io_ipc_compression"] # support for arrow avro parsing avro = ["arrow/io_avro", "arrow/io_avro_compression"] -csv = ["memmap", "lexical", "polars-core/rows", "lexical-core", "fast-float", "simdutf8"] +csv = ["atoi", "memmap", "polars-core/rows", 
"lexical-core", "fast-float", "simdutf8"] decompress = ["flate2/miniz_oxide"] decompress-fast = ["flate2/zlib-ng"] dtype-categorical = ["polars-core/dtype-categorical"] diff --git a/crates/polars-io/src/csv/buffer.rs b/crates/polars-io/src/csv/buffer.rs index 6eeb98e41e398..60f18b2543931 100644 --- a/crates/polars-io/src/csv/buffer.rs +++ b/crates/polars-io/src/csv/buffer.rs @@ -34,25 +34,25 @@ impl PrimitiveParser for Float64Type { impl PrimitiveParser for UInt32Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi::atoi::(bytes) } } impl PrimitiveParser for UInt64Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi::atoi::(bytes) } } impl PrimitiveParser for Int32Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi::atoi::(bytes) } } impl PrimitiveParser for Int64Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi::atoi::(bytes) } } diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index ca6574a310614..3391470531255 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1312,6 +1312,24 @@ def test_read_csv_chunked() -> None: assert df.filter(pl.col("count") < pl.col("count").shift(1)).is_empty() +def test_read_csv_parse_integer() -> None: + # 2147483647 is max value of i32 + csv = """value + 2147483647 + 2147483648 + 9589934591 + 9589934592 + 9999999999 + 10000000000 + """ + + assert pl.read_csv( + source=io.StringIO(csv), + dtypes={"value": pl.Int32}, + ignore_errors=True, + ).to_dict(False) == {"value": [2147483647, None, None, None, None, None]} + + def test_read_empty_csv(io_files_path: Path) -> None: with pytest.raises(NoDataError) as err: pl.read_csv(io_files_path / "empty.csv")