Skip to content

Commit

Permalink
Add support for LZMA (.xz) decompression of CSV/ndjson
Browse files Browse the repository at this point in the history
  • Loading branch information
lukeshingles committed Jul 27, 2024
1 parent 865c768 commit fee4755
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 2 deletions.
21 changes: 21 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ url = "2.4"
uuid = { version = "1.7.0", features = ["v4"] }
version_check = "0.9.4"
xxhash-rust = { version = "0.8.6", features = ["xxh3"] }
xz2 = "0.1.7"
zstd = "0.13"

polars = { version = "0.41.3", path = "crates/polars", default-features = false }
Expand Down
5 changes: 3 additions & 2 deletions crates/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ smartstring = { workspace = true }
tokio = { workspace = true, features = ["fs", "net", "rt-multi-thread", "time", "sync"], optional = true }
tokio-util = { workspace = true, features = ["io", "io-util"], optional = true }
url = { workspace = true, optional = true }
xz2 = { workspace = true, optional = true }
zstd = { workspace = true, optional = true }

[target.'cfg(not(target_family = "wasm"))'.dependencies]
Expand Down Expand Up @@ -75,8 +76,8 @@ ipc_streaming = ["arrow/io_ipc", "arrow/io_ipc_compression"]
# support for arrow avro parsing
avro = ["arrow/io_avro", "arrow/io_avro_compression"]
csv = ["atoi_simd", "polars-core/rows", "itoa", "ryu", "fast-float", "simdutf8"]
decompress = ["flate2/rust_backend", "zstd"]
decompress-fast = ["flate2/zlib-ng", "zstd"]
decompress = ["flate2/rust_backend", "xz2", "zstd"]
decompress-fast = ["flate2/zlib-ng", "xz2", "zstd"]
dtype-u8 = ["polars-core/dtype-u8"]
dtype-u16 = ["polars-core/dtype-u16"]
dtype-i8 = ["polars-core/dtype-i8"]
Expand Down
3 changes: 3 additions & 0 deletions crates/polars-io/src/csv/read/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@ pub(crate) fn decompress(
} else if bytes.starts_with(&ZLIB0) || bytes.starts_with(&ZLIB1) || bytes.starts_with(&ZLIB2) {
let mut decoder = flate2::read::ZlibDecoder::new(bytes);
decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
} else if bytes.starts_with(&LZMA) {
let mut decoder = xz2::read::XzDecoder::new(bytes);
decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
} else if bytes.starts_with(&ZSTD) {
let mut decoder = zstd::Decoder::new(bytes).ok()?;
decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-io/src/utils/compression.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// magic numbers
pub mod magic {
pub const GZIP: [u8; 2] = [31, 139];
pub const LZMA: [u8; 6] = [0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00];
pub const ZLIB0: [u8; 2] = [0x78, 0x01];
pub const ZLIB1: [u8; 2] = [0x78, 0x9C];
pub const ZLIB2: [u8; 2] = [0x78, 0xDA];
Expand All @@ -15,5 +16,6 @@ pub fn is_compressed(bytes: &[u8]) -> bool {
|| bytes.starts_with(&ZLIB1)
|| bytes.starts_with(&ZLIB2)
|| bytes.starts_with(&GZIP)
|| bytes.starts_with(&LZMA)
|| bytes.starts_with(&ZSTD)
}
4 changes: 4 additions & 0 deletions crates/polars-io/src/utils/other.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ pub unsafe fn maybe_decompress_bytes<'a>(
flate2::read::ZlibDecoder::new(bytes)
.read_to_end(out)
.map_err(to_compute_err)?;
} else if bytes.starts_with(&LZMA) {
xz2::read::XzDecoder::new(bytes)
.read_to_end(out)
.map_err(to_compute_err)?;
} else if bytes.starts_with(&ZSTD) {
zstd::Decoder::new(bytes)?.read_to_end(out)?;
} else {
Expand Down
9 changes: 9 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import gzip
import io
import lzma
import sys
import textwrap
import zlib
Expand Down Expand Up @@ -572,6 +573,14 @@ def test_compressed_csv(io_files_path: Path, monkeypatch: pytest.MonkeyPatch) ->
)
assert_frame_equal(out, expected)

# lzma (.xz) compression
csv_bytes = lzma.compress(csv.encode())
out = pl.read_csv(csv_bytes)
expected = pl.DataFrame(
{"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]}
)
assert_frame_equal(out, expected)

# zstd compression
csv_bytes = zstandard.compress(csv.encode())
out = pl.read_csv(csv_bytes)
Expand Down

0 comments on commit fee4755

Please sign in to comment.