Skip to content

Commit

Permalink
Merge pull request #337 from martindurant/zip-codec
Browse files Browse the repository at this point in the history
Codec for ZIP files
  • Loading branch information
martindurant authored Jun 28, 2023
2 parents 8d8e84c + 613e9d2 commit f272c66
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 3 deletions.
33 changes: 33 additions & 0 deletions kerchunk/codecs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import ast
import io

import numcodecs
from numcodecs.abc import Codec
import numpy as np
Expand Down Expand Up @@ -206,3 +208,34 @@ def decode(self, buf, out=None):

def encode(self, buf):
raise NotImplementedError


class DeflateCodec(Codec):
    """Decoder for DEFLATE-compressed members of ZIP archives.

    The input buffer must contain the member's local file header (fixed
    part, file name and extra field) immediately followed by the compressed
    bytes. Only decoding is supported.
    """

    codec_id = "deflate"

    def decode(self, buf, out=None):
        """Return the decompressed content of a single ZIP member.

        Parameters
        ----------
        buf : bytes-like
            Local file header followed by the raw DEFLATE stream.
        out : optional
            Accepted for signature compatibility but not used; the result
            is always returned as a new ``bytes`` object.

        Returns
        -------
        bytes
            The decompressed member data.

        Raises
        ------
        ValueError
            If ``buf`` does not start with the ZIP local-file-header
            signature.
        """
        import struct
        import zipfile
        import zlib

        # General-purpose flag bit 3: the sizes (and CRC) are not stored in
        # the local header but in a data descriptor after the payload.
        data_descriptor_flag = 0x08

        head = buf[: zipfile.sizeFileHeader]
        fields = struct.unpack(zipfile.structFileHeader, head)
        signature, flags = fields[0], fields[3]
        csize, usize, fnsize, extra_size = fields[-4:]

        if signature != zipfile.stringFileHeader:
            raise ValueError(
                "Buffer does not start with a ZIP local file header"
            )

        b = io.BytesIO(buf)
        # skip the fixed header plus the variable-length name and extra field
        b.seek(zipfile.sizeFileHeader + fnsize + extra_size)

        if flags & data_descriptor_flag and csize == 0:
            # Streamed member: the header sizes are placeholders, so limiting
            # the read by them would yield nothing. A raw DEFLATE stream marks
            # its own end, so inflate whatever follows the header.
            inflater = zlib.decompressobj(-zlib.MAX_WBITS)
            return inflater.decompress(b.read()) + inflater.flush()

        zi = zipfile.ZipInfo()
        zi.compress_size = csize
        zi.file_size = usize
        zi.compress_type = zipfile.ZIP_DEFLATED

        with zipfile.ZipExtFile(b, mode="r", zipinfo=zi) as zf:
            return zf.read()

    def encode(self, buf):
        """Encoding is not implemented for this codec."""
        raise NotImplementedError
22 changes: 22 additions & 0 deletions kerchunk/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,25 @@ def test_archive(m, archive):
assert fs.cat("a") == b"stuff"
assert fs.cat("b") == data
assert fs.cat("c") == data[5:7]


def test_deflate_zip_archive(m):
    """Check that DeflateCodec decodes a dereferenced, compressed ZIP member.

    ``m`` is a fixture defined outside this block (presumably the in-memory
    filesystem; confirm against the test module's fixtures).
    """
    import zipfile
    from kerchunk.codecs import DeflateCodec

    payload = b"piece of data"

    # Write a one-member DEFLATE archive to the in-memory filesystem.
    with fsspec.open("memory://archive", "wb") as fobj:
        archive = zipfile.ZipFile(
            file=fobj, mode="w", compression=zipfile.ZIP_DEFLATED
        )
        archive.writestr("data1", payload)
        archive.close()

    references = {"b": ["zip://data1::memory://archive"]}

    # Dereferencing is expected to emit a UserWarning here.
    with pytest.warns(UserWarning):
        dereferenced = kerchunk.utils.dereference_archives(references)

    ref_fs = fsspec.filesystem("reference", fo=dereferenced)
    codec = DeflateCodec()
    assert codec.decode(ref_fs.cat("b")) == payload
24 changes: 21 additions & 3 deletions kerchunk/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import copy
import itertools
import warnings

import ujson

Expand Down Expand Up @@ -309,9 +310,26 @@ def dereference_archives(references, remote_options=None):
for zipinfo in zf.filelist:
if zipinfo.is_dir():
continue
# If the member is stored uncompressed, reference only its data bytes. If it is
# DEFLATE-compressed, also include the file header; such chunks must be decoded with DeflateCodec.
if zipinfo.compress_type == zipfile.ZIP_DEFLATED:
# TODO: find relevant .zarray and add filter directly
header = 0
warnings.warn(
"ZIP file contains compressed files, must use DeflateCodec"
)
tail = len(zipinfo.FileHeader())
elif zipinfo.compress_type == zipfile.ZIP_STORED:
header = len(zipinfo.FileHeader())
tail = 0
else:
comp = zipfile.compressor_names[zipinfo.compress_type]
raise ValueError(
f"ZIP compression method not supported: {comp}"
)
offsets[target][zipinfo.filename] = {
"offset": zipinfo.header_offset + len(zipinfo.FileHeader()),
"size": zipinfo.compress_size,
"offset": zipinfo.header_offset + header,
"size": zipinfo.compress_size + tail,
"comp": zipinfo.compress_type != zipfile.ZIP_STORED,
}

Expand All @@ -327,7 +345,7 @@ def dereference_archives(references, remote_options=None):
detail = offsets[target][infile]
if detail["comp"]:
# leave compressed member file alone
continue
pass
v[0] = target
if len(v) == 1:
v.append(detail["offset"])
Expand Down

0 comments on commit f272c66

Please sign in to comment.