From fee47559498438e047bf71915f6ffa51319b742b Mon Sep 17 00:00:00 2001 From: Luke Shingles Date: Sat, 27 Jul 2024 23:47:23 +0100 Subject: [PATCH] Add support for LZMA (.xz) decompression of CSV/ndjson --- Cargo.lock | 21 +++++++++++++++++++++ Cargo.toml | 1 + crates/polars-io/Cargo.toml | 5 +++-- crates/polars-io/src/csv/read/utils.rs | 3 +++ crates/polars-io/src/utils/compression.rs | 2 ++ crates/polars-io/src/utils/other.rs | 4 ++++ py-polars/tests/unit/io/test_csv.py | 9 +++++++++ 7 files changed, 43 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 626ab52988d3..1a227a1d2ec7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2311,6 +2311,17 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "matrixmultiply" version = "0.3.8" @@ -3128,6 +3139,7 @@ dependencies = [ "tokio", "tokio-util", "url", + "xz2", "zstd", ] @@ -5332,6 +5344,15 @@ version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63658493314859b4dfdf3fb8c1defd61587839def09582db50b8a4e93afca6bb" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml index c10737d04b5a..60c09605ef03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -92,6 +92,7 @@ url = "2.4" uuid = { version = "1.7.0", features = ["v4"] } version_check = "0.9.4" xxhash-rust = { version = "0.8.6", features = ["xxh3"] } +xz2 = "0.1.7" zstd = "0.13" polars = { version = "0.41.3", path = "crates/polars", default-features = false } diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index aa2dc674f7a9..8d6b78c2bcd9 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -47,6 +47,7 @@ smartstring = { workspace = true } tokio = { workspace = true, features = ["fs", "net", "rt-multi-thread", "time", "sync"], optional = true } tokio-util = { workspace = true, features = ["io", "io-util"], optional = true } url = { workspace = true, optional = true } +xz2 = { workspace = true, optional = true } zstd = { workspace = true, optional = true } [target.'cfg(not(target_family = "wasm"))'.dependencies] @@ -75,8 +76,8 @@ ipc_streaming = ["arrow/io_ipc", "arrow/io_ipc_compression"] # support for arrow avro parsing avro = ["arrow/io_avro", "arrow/io_avro_compression"] csv = ["atoi_simd", "polars-core/rows", "itoa", "ryu", "fast-float", "simdutf8"] -decompress = ["flate2/rust_backend", "zstd"] -decompress-fast = ["flate2/zlib-ng", "zstd"] +decompress = ["flate2/rust_backend", "xz2", "zstd"] +decompress-fast = ["flate2/zlib-ng", "xz2", "zstd"] dtype-u8 = ["polars-core/dtype-u8"] dtype-u16 = ["polars-core/dtype-u16"] dtype-i8 = ["polars-core/dtype-i8"] diff --git a/crates/polars-io/src/csv/read/utils.rs b/crates/polars-io/src/csv/read/utils.rs index 33c6a6c8f290..887b0a86c138 100644 --- a/crates/polars-io/src/csv/read/utils.rs +++ b/crates/polars-io/src/csv/read/utils.rs @@ -136,6 +136,9 @@ pub(crate) fn decompress( } else if bytes.starts_with(&ZLIB0) || bytes.starts_with(&ZLIB1) || bytes.starts_with(&ZLIB2) { let mut decoder = flate2::read::ZlibDecoder::new(bytes); decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char) + } else if bytes.starts_with(&LZMA) { + let mut decoder = xz2::read::XzDecoder::new(bytes); + decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char) } else if bytes.starts_with(&ZSTD) { let mut decoder = zstd::Decoder::new(bytes).ok()?; decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char) diff --git a/crates/polars-io/src/utils/compression.rs b/crates/polars-io/src/utils/compression.rs index d771b4c6ca1e..12f8d08d2628 100644 --- a/crates/polars-io/src/utils/compression.rs +++ b/crates/polars-io/src/utils/compression.rs @@ -1,6 +1,7 @@ // magic numbers pub mod magic { pub const GZIP: [u8; 2] = [31, 139]; + pub const LZMA: [u8; 6] = [0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00]; pub const ZLIB0: [u8; 2] = [0x78, 0x01]; pub const ZLIB1: [u8; 2] = [0x78, 0x9C]; pub const ZLIB2: [u8; 2] = [0x78, 0xDA]; @@ -15,5 +16,6 @@ pub fn is_compressed(bytes: &[u8]) -> bool { || bytes.starts_with(&ZLIB1) || bytes.starts_with(&ZLIB2) || bytes.starts_with(&GZIP) + || bytes.starts_with(&LZMA) || bytes.starts_with(&ZSTD) } diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 9ab9772d955e..bc8f00b076bb 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -71,6 +71,10 @@ pub unsafe fn maybe_decompress_bytes<'a>( flate2::read::ZlibDecoder::new(bytes) .read_to_end(out) .map_err(to_compute_err)?; + } else if bytes.starts_with(&LZMA) { + xz2::read::XzDecoder::new(bytes) + .read_to_end(out) + .map_err(to_compute_err)?; } else if bytes.starts_with(&ZSTD) { zstd::Decoder::new(bytes)?.read_to_end(out)?; } else { diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index b4ccbed97db8..c96d653d1bc2 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -2,6 +2,7 @@ import gzip import io +import lzma import sys import textwrap import zlib @@ -572,6 +573,14 @@ def test_compressed_csv(io_files_path: Path, monkeypatch: pytest.MonkeyPatch) -> ) assert_frame_equal(out, expected) + # lzma (.xz) compression + csv_bytes = lzma.compress(csv.encode()) + out = pl.read_csv(csv_bytes) + expected = pl.DataFrame( + {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]} + ) + assert_frame_equal(out, expected) + # zstd compression csv_bytes = zstandard.compress(csv.encode()) out = pl.read_csv(csv_bytes)