From 1fa42d9038dd96f979a3a770c06f4fc009000a1d Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 6 Nov 2024 16:15:45 +0100 Subject: [PATCH 01/35] add default compressor to config --- src/zarr/core/config.py | 4 ++++ src/zarr/core/metadata/v2.py | 21 ++++++++++++++++++--- tests/test_config.py | 4 ++++ tests/test_v2.py | 22 ++++++++++++++++++++-- 4 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 29f5e139fe..9445e2a789 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -64,6 +64,10 @@ def reset(self) -> None: }, "buffer": "zarr.core.buffer.cpu.Buffer", "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", + "v2_dtype_kind_to_default_compressor": { + "biufcmM": "zstd", + "OSUV": "vlen-bytes", + }, } ], ) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index f18f2e4e8d..7d3eecb330 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -71,6 +71,8 @@ def __init__( shape_parsed = parse_shapelike(shape) dtype_parsed = parse_dtype(dtype) chunks_parsed = parse_shapelike(chunks) + if compressor is None: + compressor = _default_compressor(dtype_parsed) compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -238,15 +240,15 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> numcodecs.abc.Codec: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec): + if isinstance(data, numcodecs.abc.Codec): return data if isinstance(data, dict): return numcodecs.get_codec(data) - msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." + msg = f"Invalid compressor. 
Expected a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) @@ -326,3 +328,16 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return "" else: return dtype.type(0) + + +def _default_compressor(dtype: np.dtype[Any]) -> numcodecs.abc.Codec: + """Get the default compressor for a type. + + The config contains a mapping from numpy dtype kind to the default compressor. + https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html + """ + dtype_kind_to_default_compressor = config.get("v2_dtype_kind_to_default_compressor") + for dtype_kinds, compressor in dtype_kind_to_default_compressor.items(): + if dtype.kind in dtype_kinds: + return numcodecs.get_codec({"id": compressor}) + raise ValueError(f"No default compressor found for dtype {dtype} of kind {dtype.kind}") diff --git a/tests/test_config.py b/tests/test_config.py index ddabffb467..e1a15a5f8c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -63,6 +63,10 @@ def test_config_defaults_set() -> None: "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, + "v2_dtype_kind_to_default_compressor": { + "biufcmM": "zstd", + "OSUV": "vlen-bytes", + }, } ] assert config.get("array.order") == "C" diff --git a/tests/test_v2.py b/tests/test_v2.py index 3dd17848fb..777d96511b 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -10,7 +10,7 @@ import zarr import zarr.storage -from zarr import Array +from zarr import Array, config from zarr.storage import MemoryStore, StorePath @@ -96,7 +96,6 @@ async def test_v2_encode_decode(dtype): serialized = json.loads(result.to_bytes()) expected = { "chunks": [3], - "compressor": None, "dtype": f"{dtype}0", "fill_value": "WA==", "filters": None, @@ -105,6 +104,7 @@ async def test_v2_encode_decode(dtype): "zarr_format": 2, "dimension_separator": ".", } + del serialized["compressor"] assert serialized == expected data = 
zarr.open_array(store=store, path="foo")[:] @@ -130,3 +130,21 @@ def test_v2_filters_codecs(filters: Any) -> None: arr[:] = array_fixture result = arr[:] np.testing.assert_array_equal(result, array_fixture) + + +@pytest.mark.parametrize( + "dtype_compressor", + [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-bytes"]], +) +def test_default_compressors(dtype_compressor: Any) -> None: + with config.set( + { + "v2_dtype_kind_to_default_compressor": { + "biufcmM": "zstd", + "OSUV": "vlen-bytes", + }, + } + ): + dtype, expected_compressor = dtype_compressor + arr = zarr.create(shape=(10,), path="foo", store={}, zarr_format=2, dtype=dtype) + assert arr.metadata.compressor.codec_id == expected_compressor From 02053e9bf7f52de83e703d12273a35c4f5cf8276 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 6 Nov 2024 19:48:12 +0100 Subject: [PATCH 02/35] modify _default_compressor to _default_filters_and_compressor --- src/zarr/core/array.py | 8 ---- src/zarr/core/config.py | 6 +-- src/zarr/core/metadata/v2.py | 34 +++++++++------ tests/test_config.py | 6 +-- tests/test_v2.py | 82 ++++++++++++++++++++---------------- 5 files changed, 73 insertions(+), 63 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1646959cb5..933e9e2c85 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -492,14 +492,6 @@ async def create( order=order, ) elif zarr_format == 2: - if dtype is str or dtype == "str": - # another special case: zarr v2 added the vlen-utf8 codec - vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} - if filters and not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [vlen_codec] - else: - filters = [vlen_codec] - if codecs is not None: raise ValueError( "codecs cannot be used for arrays with version 2. Use filters and compressor instead." 
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 9445e2a789..3373d08958 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -64,9 +64,9 @@ def reset(self) -> None: }, "buffer": "zarr.core.buffer.cpu.Buffer", "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", - "v2_dtype_kind_to_default_compressor": { - "biufcmM": "zstd", - "OSUV": "vlen-bytes", + "v2_dtype_kind_to_default_filters_and_compressor": { + "biufcmM": ["zstd"], + "OSUV": ["vlen-utf8"], }, } ], diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 7d3eecb330..bcd23e24bc 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING, TypedDict, cast +from typing import TYPE_CHECKING, Any, TypedDict, cast from zarr.abc.metadata import Metadata @@ -71,8 +71,14 @@ def __init__( shape_parsed = parse_shapelike(shape) dtype_parsed = parse_dtype(dtype) chunks_parsed = parse_shapelike(chunks) - if compressor is None: - compressor = _default_compressor(dtype_parsed) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype_parsed) + if dtype is str or dtype == "str": + vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} + if filters and not any(x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [vlen_codec] + else: + filters = [vlen_codec] compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -240,15 +246,15 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec: +def parse_compressor(data: object) -> numcodecs.abc.Codec | None: """ Parse a potential compressor. 
""" - if isinstance(data, numcodecs.abc.Codec): + if data is None or isinstance(data, numcodecs.abc.Codec): return data if isinstance(data, dict): return numcodecs.get_codec(data) - msg = f"Invalid compressor. Expected a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." + msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) @@ -330,14 +336,18 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return dtype.type(0) -def _default_compressor(dtype: np.dtype[Any]) -> numcodecs.abc.Codec: - """Get the default compressor for a type. +def _default_filters_and_compressor( + dtype: np.dtype[Any], +) -> tuple[list[dict[str, str]], dict[str, str] | None]: + """Get the default filters and compressor for a dtype. The config contains a mapping from numpy dtype kind to the default compressor. https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ - dtype_kind_to_default_compressor = config.get("v2_dtype_kind_to_default_compressor") - for dtype_kinds, compressor in dtype_kind_to_default_compressor.items(): + dtype_kind_to_default_compressor = config.get("v2_dtype_kind_to_default_filters_and_compressor") + for dtype_kinds, filters_and_compressor in dtype_kind_to_default_compressor.items(): if dtype.kind in dtype_kinds: - return numcodecs.get_codec({"id": compressor}) - raise ValueError(f"No default compressor found for dtype {dtype} of kind {dtype.kind}") + filters = [{"id": f} for f in filters_and_compressor] + compressor = None + return filters, compressor + return [], None diff --git a/tests/test_config.py b/tests/test_config.py index e1a15a5f8c..2d158ebd9f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -63,9 +63,9 @@ def test_config_defaults_set() -> None: "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, - 
"v2_dtype_kind_to_default_compressor": { - "biufcmM": "zstd", - "OSUV": "vlen-bytes", + "v2_dtype_kind_to_default_filters_and_compressor": { + "biufcmM": ["zstd"], + "OSUV": ["vlen-utf8"], }, } ] diff --git a/tests/test_v2.py b/tests/test_v2.py index 777d96511b..86d54492a7 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -80,36 +80,43 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - store = zarr.storage.MemoryStore(mode="w") - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", - shape=(3,), - chunks=(3,), - dtype=dtype, - fill_value=b"X", - ) - - result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) - assert result is not None - - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "dtype": f"{dtype}0", - "fill_value": "WA==", - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - del serialized["compressor"] - assert serialized == expected + with config.set( + { + "v2_dtype_kind_to_default_filters_and_compressor": { + "OSUV": ["vlen-bytes"], + }, + } + ): + store = zarr.storage.MemoryStore(mode="w") + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", + shape=(3,), + chunks=(3,), + dtype=dtype, + fill_value=b"X", + ) + + result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": f"{dtype}0", + "fill_value": "WA==", + "filters": [{"id": "vlen-bytes"}], + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected - data = zarr.open_array(store=store, path="foo")[:] - expected = np.full((3,), b"X", dtype=dtype) - np.testing.assert_equal(data, expected) + data = zarr.open_array(store=store, path="foo")[:] + 
expected = np.full((3,), b"X", dtype=dtype) + np.testing.assert_equal(data, expected) @pytest.mark.parametrize("dtype", [str, "str"]) @@ -133,18 +140,19 @@ def test_v2_filters_codecs(filters: Any) -> None: @pytest.mark.parametrize( - "dtype_compressor", - [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-bytes"]], + "dtype_expected", + [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-utf8"], ["|U1", "vlen-utf8"]], ) -def test_default_compressors(dtype_compressor: Any) -> None: +def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { - "v2_dtype_kind_to_default_compressor": { - "biufcmM": "zstd", - "OSUV": "vlen-bytes", + "v2_dtype_kind_to_default_filters_and_compressor": { + "biufcmM": ["zstd"], + "OSUV": ["vlen-utf8"], }, } ): - dtype, expected_compressor = dtype_compressor + dtype, expected = dtype_expected arr = zarr.create(shape=(10,), path="foo", store={}, zarr_format=2, dtype=dtype) - assert arr.metadata.compressor.codec_id == expected_compressor + assert arr.metadata.filters[0].codec_id == expected + print(arr.metadata) From 6ac38eadd97a1e879fec697a3ca22b78c9865c6d Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 6 Nov 2024 20:01:39 +0100 Subject: [PATCH 03/35] fix test_metadata_to_dict --- tests/test_metadata/test_v2.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 089d5c98e1..8801bed4f6 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -11,7 +11,7 @@ from zarr.core.buffer import cpu from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata -from zarr.core.metadata.v2 import parse_zarr_format +from zarr.core.metadata.v2 import _default_filters_and_compressor, parse_zarr_format if TYPE_CHECKING: from typing import Any @@ -77,6 +77,15 @@ def test_metadata_to_dict( assert 
observed["dimension_separator"] == expected_dimension_sep observed.pop("dimension_separator") + if not filters and not compressor: + assert observed["filters"], observed["compressor"] == _default_filters_and_compressor( + np.dtype(data_type) + ) + observed.pop("filters") + observed.pop("compressor") + expected.pop("filters") + expected.pop("compressor") + assert observed == expected From 9507e1912c8b9e64fca0cf4a6bcf945c925ed4f4 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 6 Nov 2024 22:06:46 +0100 Subject: [PATCH 04/35] wip debugging --- src/zarr/codecs/_v2.py | 3 +++ src/zarr/core/config.py | 3 ++- tests/test_properties.py | 2 +- tests/test_v2.py | 9 +++++---- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 30504ad204..7f3c1ff8ec 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -46,7 +46,10 @@ async def _decode_single( # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening if chunk_spec.dtype != object: + print(chunk_spec.dtype, chunk.dtype) chunk = chunk.view(chunk_spec.dtype) + print("worked") + elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. 
# We cannot deal with object arrays unless there is an object diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 3373d08958..0391c714bc 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -66,7 +66,8 @@ def reset(self) -> None: "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", "v2_dtype_kind_to_default_filters_and_compressor": { "biufcmM": ["zstd"], - "OSUV": ["vlen-utf8"], + "SV": ["vlen-bytes"], + "OU": ["vlen-utf8"], }, } ], diff --git a/tests/test_properties.py b/tests/test_properties.py index f70753ceb5..8100181fef 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -6,7 +6,7 @@ import hypothesis.extra.numpy as npst # noqa: E402 import hypothesis.strategies as st # noqa: E402 -from hypothesis import assume, given # noqa: E402 +from hypothesis import assume, given, reproduce_failure # noqa: E402 from zarr.testing.strategies import arrays, basic_indices, numpy_arrays, zarr_formats # noqa: E402 diff --git a/tests/test_v2.py b/tests/test_v2.py index 86d54492a7..309a6ae9fe 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -141,18 +141,19 @@ def test_v2_filters_codecs(filters: Any) -> None: @pytest.mark.parametrize( "dtype_expected", - [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-utf8"], ["|U1", "vlen-utf8"]], + # [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-utf8"], ["|U1", "vlen-utf8"]], + [["|S1", "vlen-bytes"]], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "v2_dtype_kind_to_default_filters_and_compressor": { "biufcmM": ["zstd"], - "OSUV": ["vlen-utf8"], + "OSUV": ["vlen-bytes"], }, } ): dtype, expected = dtype_expected - arr = zarr.create(shape=(10,), path="foo", store={}, zarr_format=2, dtype=dtype) + arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) assert arr.metadata.filters[0].codec_id == expected - print(arr.metadata) + arr[:] = np.array(["a", "bb", "ccc"], dtype=dtype) From 
f93ced262040f5b119529a6b7e527673ac33433c Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 13 Nov 2024 12:34:56 +0100 Subject: [PATCH 05/35] format --- tests/test_properties.py | 2 +- tests/test_v2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_properties.py b/tests/test_properties.py index 8100181fef..f70753ceb5 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -6,7 +6,7 @@ import hypothesis.extra.numpy as npst # noqa: E402 import hypothesis.strategies as st # noqa: E402 -from hypothesis import assume, given, reproduce_failure # noqa: E402 +from hypothesis import assume, given # noqa: E402 from zarr.testing.strategies import arrays, basic_indices, numpy_arrays, zarr_formats # noqa: E402 diff --git a/tests/test_v2.py b/tests/test_v2.py index 3c783831f5..c99fd1742f 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -141,7 +141,7 @@ def test_v2_filters_codecs(filters: Any) -> None: @pytest.mark.parametrize( "dtype_expected", - # [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-utf8"], ["|U1", "vlen-utf8"]], + # [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-utf8"], ["|U1", "vlen-utf8"]], [["|S1", "vlen-bytes"]], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: From 07590ca12d11a17944e8fa948c373f4335dcd663 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 13 Nov 2024 14:49:49 +0100 Subject: [PATCH 06/35] fix v2 decode string dtype --- src/zarr/codecs/_v2.py | 10 +++++++--- src/zarr/core/config.py | 4 ++-- tests/test_v2.py | 37 +++++++++++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 7f3c1ff8ec..6ed64739e2 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING import numcodecs +import numpy as np from numcodecs.compat import ensure_ndarray_like from zarr.abc.codec import ArrayBytesCodec @@ -43,12 
+44,15 @@ async def _decode_single( # view as numpy array with correct dtype chunk = ensure_ndarray_like(chunk) + print(chunk) + print(chunk.dtype) # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening if chunk_spec.dtype != object: - print(chunk_spec.dtype, chunk.dtype) - chunk = chunk.view(chunk_spec.dtype) - print("worked") + try: + chunk = chunk.view(chunk_spec.dtype) + except TypeError: + chunk = np.array(chunk).astype(chunk_spec.dtype) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 0391c714bc..fa28258ba6 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -66,8 +66,8 @@ def reset(self) -> None: "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", "v2_dtype_kind_to_default_filters_and_compressor": { "biufcmM": ["zstd"], - "SV": ["vlen-bytes"], - "OU": ["vlen-utf8"], + "U": ["vlen-utf8"], + "OSV": ["vlen-bytes"], }, } ], diff --git a/tests/test_v2.py b/tests/test_v2.py index c99fd1742f..0da668ad89 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -83,7 +83,7 @@ async def test_v2_encode_decode(dtype): with config.set( { "v2_dtype_kind_to_default_filters_and_compressor": { - "OSUV": ["vlen-bytes"], + "SV": ["vlen-bytes"], }, } ): @@ -119,15 +119,37 @@ async def test_v2_encode_decode(dtype): np.testing.assert_equal(data, expected) +@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]]) +def test_v2_encode_decode_with_data(dtype_value): + dtype, value = dtype_value + with config.set( + { + "v2_dtype_kind_to_default_filters_and_compressor": { + "U": ["vlen-utf8"], + "OSV": ["vlen-bytes"], + }, + } + ): + expected = np.full((3,), value, dtype=dtype) + a = zarr.create( + shape=(3,), + zarr_format=2, + dtype=dtype, + ) + a[:] = expected + data = a[:] + np.testing.assert_equal(data, expected) + + @pytest.mark.parametrize("dtype", [str, "str"]) async def 
test_create_dtype_str(dtype: Any) -> None: arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) assert arr.dtype.kind == "O" assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) - arr[:] = ["a", "bb", "ccc"] + assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),) + arr[:] = [b"a", b"bb", b"ccc"] result = arr[:] - np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object")) + np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object")) @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: @pytest.mark.parametrize( "dtype_expected", - # [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-utf8"], ["|U1", "vlen-utf8"]], - [["|S1", "vlen-bytes"]], + [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "v2_dtype_kind_to_default_filters_and_compressor": { "biufcmM": ["zstd"], - "OSUV": ["vlen-bytes"], + "U": ["vlen-utf8"], + "OSV": ["vlen-bytes"], }, } ): dtype, expected = dtype_expected arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) assert arr.metadata.filters[0].codec_id == expected - arr[:] = np.array(["a", "bb", "ccc"], dtype=dtype) From 4e2a3bc5bb830759297d697914a687bdf29dad41 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 13 Nov 2024 16:06:43 +0100 Subject: [PATCH 07/35] fix config default tests --- tests/test_array.py | 2 ++ tests/test_config.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_array.py b/tests/test_array.py index 3948896186..4452c018da 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -4,6 +4,7 @@ from itertools import accumulate from typing import Any, Literal +import numcodecs import numpy as np import pytest @@ -431,6 +432,7 @@ def test_info_v2(self) -> None: _read_only=False, 
_store_type="MemoryStore", _count_bytes=128, + _filters= (numcodecs.Zstd(),) ) assert result == expected diff --git a/tests/test_config.py b/tests/test_config.py index da1ebfa5f3..7bfede2c43 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -65,7 +65,8 @@ def test_config_defaults_set() -> None: }, "v2_dtype_kind_to_default_filters_and_compressor": { "biufcmM": ["zstd"], - "OSUV": ["vlen-utf8"], + "U": ["vlen-utf8"], + "OSV": ["vlen-bytes"], }, } ] From 0fc7b2396ae3599cdce74157749285e8bb66b7e8 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 13 Nov 2024 16:08:30 +0100 Subject: [PATCH 08/35] format --- tests/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_array.py b/tests/test_array.py index 4452c018da..b0873f8469 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -432,7 +432,7 @@ def test_info_v2(self) -> None: _read_only=False, _store_type="MemoryStore", _count_bytes=128, - _filters= (numcodecs.Zstd(),) + _filters=(numcodecs.Zstd(),), ) assert result == expected From 8ec16e8c15ced582853333f6bf80e8599ba5a120 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 6 Dec 2024 17:33:22 +0100 Subject: [PATCH 09/35] Update src/zarr/codecs/_v2.py --- src/zarr/codecs/_v2.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 6ed64739e2..a20f4aea8d 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -44,8 +44,6 @@ async def _decode_single( # view as numpy array with correct dtype chunk = ensure_ndarray_like(chunk) - print(chunk) - print(chunk.dtype) # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening if chunk_spec.dtype != object: From d6dc14676a6b6e6b94ab5fa64cac41f882d43515 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 11 Dec 2024 14:04:50 +0100 Subject: [PATCH 10/35] rename v2_dtype_kind_to_default_filters_and_compressor to v2_default_compressors --- 
src/zarr/core/config.py | 8 ++++---- src/zarr/core/metadata/v2.py | 19 +++++++++++-------- tests/test_config.py | 8 ++++---- tests/test_v2.py | 16 ++++++++-------- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index fa28258ba6..e5ab29b6c9 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -64,10 +64,10 @@ def reset(self) -> None: }, "buffer": "zarr.core.buffer.cpu.Buffer", "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", - "v2_dtype_kind_to_default_filters_and_compressor": { - "biufcmM": ["zstd"], - "U": ["vlen-utf8"], - "OSV": ["vlen-bytes"], + "v2_default_compressors": { + "numeric": ["zstd"], + "unicode": ["vlen-utf8"], + "bytes": ["vlen-bytes"], }, } ], diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index bcd23e24bc..763aefbf7a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -341,13 +341,16 @@ def _default_filters_and_compressor( ) -> tuple[list[dict[str, str]], dict[str, str] | None]: """Get the default filters and compressor for a dtype. - The config contains a mapping from numpy dtype kind to the default compressor. 
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ - dtype_kind_to_default_compressor = config.get("v2_dtype_kind_to_default_filters_and_compressor") - for dtype_kinds, filters_and_compressor in dtype_kind_to_default_compressor.items(): - if dtype.kind in dtype_kinds: - filters = [{"id": f} for f in filters_and_compressor] - compressor = None - return filters, compressor - return [], None + default_compressors = config.get("v2_default_compressors") + if dtype.kind in "biufcmM": + dtype_key = "numeric" + elif dtype.kind in "U": + dtype_key = "unicode" + elif dtype.kind in "OSV": + dtype_key = "bytes" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + + return [{"id": f} for f in default_compressors[dtype_key]], None diff --git a/tests/test_config.py b/tests/test_config.py index 7bfede2c43..c46b456302 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -63,10 +63,10 @@ def test_config_defaults_set() -> None: "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, - "v2_dtype_kind_to_default_filters_and_compressor": { - "biufcmM": ["zstd"], - "U": ["vlen-utf8"], - "OSV": ["vlen-bytes"], + "v2_default_compressors": { + "numeric": ["zstd"], + "unicode": ["vlen-utf8"], + "bytes": ["vlen-bytes"], }, } ] diff --git a/tests/test_v2.py b/tests/test_v2.py index bb8ef624ff..1a6a179142 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -84,8 +84,8 @@ def test_codec_pipeline() -> None: async def test_v2_encode_decode(dtype): with config.set( { - "v2_dtype_kind_to_default_filters_and_compressor": { - "SV": ["vlen-bytes"], + "v2_default_compressors": { + "bytes": ["vlen-bytes"], }, } ): @@ -126,9 +126,9 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "v2_dtype_kind_to_default_filters_and_compressor": { - "U": ["vlen-utf8"], - "OSV": ["vlen-bytes"], + "v2_default_compressors": { + "unicode": ["vlen-utf8"], + "bytes": 
["vlen-bytes"], }, } ): @@ -171,9 +171,9 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "v2_dtype_kind_to_default_filters_and_compressor": { - "biufcmM": ["zstd"], - "U": ["vlen-utf8"], - "OSV": ["vlen-bytes"], + "numeric": ["zstd"], + "unicode": ["vlen-utf8"], + "bytes": ["vlen-bytes"], }, } ): From 15577ae9ee71adfc0a6f6d58143aeb64226bf7fe Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 11 Dec 2024 16:33:57 +0100 Subject: [PATCH 11/35] recover test_v2.py --- tests/test_v2.py | 179 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) diff --git a/tests/test_v2.py b/tests/test_v2.py index e69de29bb2..68c07e2024 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -0,0 +1,179 @@ +import json +from collections.abc import Iterator +from typing import Any, Literal + +import numcodecs.vlen +import numpy as np +import pytest +from numcodecs import Delta +from numcodecs.blosc import Blosc + +import zarr +import zarr.core.buffer +import zarr.storage +from zarr import Array +from zarr.storage import MemoryStore, StorePath + + +@pytest.fixture +async def store() -> Iterator[StorePath]: + return StorePath(await MemoryStore.open()) + + +def test_simple(store: StorePath) -> None: + data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) + + a = Array.create( + store / "simple_v2", + zarr_format=2, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + ) + + a[:, :] = data + assert np.array_equal(data, a[:, :]) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize( + ("dtype", "fill_value"), + [ + ("bool", False), + ("int64", 0), + ("float64", 0.0), + ("|S1", b""), + ("|U1", ""), + ("object", ""), + (str, ""), + ], +) +def test_implicit_fill_value(store: MemoryStore, dtype: str, fill_value: Any) -> None: + arr = zarr.create(store=store, shape=(4,), fill_value=None, zarr_format=2, dtype=dtype) + assert arr.metadata.fill_value is None + 
assert arr.metadata.to_dict()["fill_value"] is None + result = arr[:] + if dtype is str: + # special case + numpy_dtype = np.dtype(object) + else: + numpy_dtype = np.dtype(dtype) + expected = np.full(arr.shape, fill_value, dtype=numpy_dtype) + np.testing.assert_array_equal(result, expected) + + +def test_codec_pipeline() -> None: + # https://github.com/zarr-developers/zarr-python/issues/2243 + store = MemoryStore() + array = zarr.create( + store=store, + shape=(1,), + dtype="i4", + zarr_format=2, + filters=[Delta(dtype="i4").get_config()], + compressor=Blosc().get_config(), + ) + array[:] = 1 + result = array[:] + expected = np.ones(1) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["|S", "|V"]) +async def test_v2_encode_decode(dtype): + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", + shape=(3,), + chunks=(3,), + dtype=dtype, + fill_value=b"X", + ) + + result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": f"{dtype}0", + "fill_value": "WA==", + "filters": None, + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected + + data = zarr.open_array(store=store, path="foo")[:] + expected = np.full((3,), b"X", dtype=dtype) + np.testing.assert_equal(data, expected) + + +@pytest.mark.parametrize("dtype", [str, "str"]) +async def test_create_dtype_str(dtype: Any) -> None: + arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) + assert arr.dtype.kind == "O" + assert arr.metadata.to_dict()["dtype"] == "|O" + assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) + arr[:] = ["a", "bb", "ccc"] + result = arr[:] + np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object")) + + +@pytest.mark.parametrize("filters", 
[[], [numcodecs.Delta(dtype=" None: + array_fixture = [42] + arr = zarr.create(shape=1, dtype=" None: + arr = zarr.Array.create( + MemoryStore({}), + shape=(10, 8), + chunks=(3, 3), + fill_value=np.nan, + dtype="float64", + zarr_format=2, + exists_ok=True, + order=array_order, + ) + + # Non-contiguous write + a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=data_order) + arr[slice(6, 9, None), slice(3, 6, None)] = a[ + slice(6, 9, None), slice(3, 6, None) + ] # The slice on the RHS is important + np.testing.assert_array_equal( + arr[slice(6, 9, None), slice(3, 6, None)], a[slice(6, 9, None), slice(3, 6, None)] + ) + + arr = zarr.Array.create( + MemoryStore({}), + shape=(10, 8), + chunks=(3, 3), + fill_value=np.nan, + dtype="float64", + zarr_format=2, + exists_ok=True, + order=array_order, + ) + + # Contiguous write + a = np.arange(9).reshape((3, 3), order=data_order) + if data_order == "F": + assert a.flags.f_contiguous + else: + assert a.flags.c_contiguous + arr[slice(6, 9, None), slice(3, 6, None)] = a + np.testing.assert_array_equal(arr[slice(6, 9, None), slice(3, 6, None)], a) From 67010ce2ab526f733d0d678cff40b7ae7cc171bb Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 11 Dec 2024 18:39:02 +0100 Subject: [PATCH 12/35] incorporate feedback --- src/zarr/api/asynchronous.py | 15 ++- src/zarr/codecs/_v2.py | 6 ++ src/zarr/core/array.py | 9 ++ src/zarr/core/metadata/v2.py | 10 +- tests/test_array.py | 2 + tests/test_group.py | 2 + tests/test_metadata/test_consolidated.py | 2 + tests/test_metadata/test_v2.py | 11 +-- tests/test_v2.py | 116 ++++++++++++++++------- 9 files changed, 119 insertions(+), 54 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 26822f725b..6e8ef9ce8d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -17,10 +17,12 @@ ChunkCoords, MemoryOrder, ZarrFormat, + parse_dtype, ) from zarr.core.config import config from zarr.core.group import AsyncGroup, 
ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v2 import _default_filters_and_compressor from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -885,8 +887,17 @@ async def create( or _default_zarr_version() ) - if zarr_format == 2 and chunks is None: - chunks = shape + if zarr_format == 2: + if chunks is None: + chunks = shape + dtype = parse_dtype(dtype, zarr_format) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + if np.issubdtype(dtype, np.str_): + filters = filters or [] + if not any(x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [{"id": "vlen-utf8"}] + elif zarr_format == 3 and chunk_shape is None: if chunks is not None: chunk_shape = chunks diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 2d186b8878..53edc1f4a1 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -50,6 +50,12 @@ async def _decode_single( try: chunk = chunk.view(chunk_spec.dtype) except TypeError: + # this will happen if the dtype of the chunk + # does not match the dtype of the array spec i.g. if + # the dtype of the chunk_spec is a string dtype, but the chunk + # is an object array. In this case, we need to convert the object + # array to the correct dtype. + chunk = np.array(chunk).astype(chunk_spec.dtype) elif chunk.dtype != object: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 56e30e2715..c7cc49a07c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -77,6 +77,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) +from zarr.core.metadata.v2 import _default_filters_and_compressor from zarr.core.metadata.v3 import parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError @@ -617,6 +618,14 @@ async def _create_v2( if dimension_separator is None: dimension_separator = "." 
+ dtype = parse_dtype(dtype, 2) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + if np.issubdtype(dtype, np.str_): + filters = filters or [] + if not any(x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [{"id": "vlen-utf8"}] + metadata = ArrayV2Metadata( shape=shape, dtype=np.dtype(dtype), diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 763aefbf7a..1cbc4d79a9 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -71,14 +71,7 @@ def __init__( shape_parsed = parse_shapelike(shape) dtype_parsed = parse_dtype(dtype) chunks_parsed = parse_shapelike(chunks) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype_parsed) - if dtype is str or dtype == "str": - vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} - if filters and not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [vlen_codec] - else: - filters = [vlen_codec] + compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -343,6 +336,7 @@ def _default_filters_and_compressor( https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ + dtype = np.dtype(dtype) default_compressors = config.get("v2_default_compressors") if dtype.kind in "biufcmM": dtype_key = "numeric" diff --git a/tests/test_array.py b/tests/test_array.py index 58bc823068..eb138aa8cd 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -8,6 +8,7 @@ import numcodecs import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous from zarr import Array, AsyncArray, Group @@ -513,6 +514,7 @@ async def test_info_v2_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", + _filters=(Zstd(level=0),), _count_bytes=128, ) assert result == expected diff --git a/tests/test_group.py 
b/tests/test_group.py index afa290207d..3b7acc9b15 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr import zarr.api.asynchronous @@ -496,6 +497,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "shape": (1,), "chunks": (1,), "order": "C", + "filters": (Zstd(level=0),), "zarr_format": zarr_format, }, "subgroup": { diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 8ae9cc81fd..26e9904608 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous import zarr.api.synchronous @@ -486,6 +487,7 @@ async def test_consolidated_metadata_v2(self): attributes={"key": "a"}, chunks=(1,), fill_value=None, + filters=(Zstd(level=0),), order="C", ), "g1": GroupMetadata( diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 8c82eac20a..003aef331f 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -11,7 +11,7 @@ from zarr.core.buffer import cpu from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata -from zarr.core.metadata.v2 import _default_filters_and_compressor, parse_zarr_format +from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: from typing import Any @@ -77,15 +77,6 @@ def test_metadata_to_dict( assert observed["dimension_separator"] == expected_dimension_sep observed.pop("dimension_separator") - if not filters and not compressor: - assert observed["filters"], observed["compressor"] == _default_filters_and_compressor( - np.dtype(data_type) - ) - observed.pop("filters") - observed.pop("compressor") - expected.pop("filters") - expected.pop("compressor") - assert observed == expected diff --git a/tests/test_v2.py 
b/tests/test_v2.py index 68c07e2024..ba8544ff0f 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -11,7 +11,7 @@ import zarr import zarr.core.buffer import zarr.storage -from zarr import Array +from zarr import Array, config from zarr.storage import MemoryStore, StorePath @@ -82,36 +82,65 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - store = zarr.storage.MemoryStore() - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", - shape=(3,), - chunks=(3,), - dtype=dtype, - fill_value=b"X", - ) - - result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) - assert result is not None - - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "compressor": None, - "dtype": f"{dtype}0", - "fill_value": "WA==", - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - assert serialized == expected - - data = zarr.open_array(store=store, path="foo")[:] - expected = np.full((3,), b"X", dtype=dtype) - np.testing.assert_equal(data, expected) + with config.set( + { + "v2_default_compressors": { + "bytes": ["vlen-bytes"], + }, + } + ): + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", + shape=(3,), + chunks=(3,), + dtype=dtype, + fill_value=b"X", + ) + + result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": f"{dtype}0", + "fill_value": "WA==", + "filters": [{"id": "vlen-bytes"}], + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected + + data = zarr.open_array(store=store, path="foo")[:] + expected = np.full((3,), b"X", dtype=dtype) + np.testing.assert_equal(data, expected) + + 
+@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]]) +def test_v2_encode_decode_with_data(dtype_value): + dtype, value = dtype_value + with config.set( + { + "v2_default_compressors": { + "unicode": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + }, + } + ): + expected = np.full((3,), value, dtype=dtype) + a = zarr.create( + shape=(3,), + zarr_format=2, + dtype=dtype, + ) + a[:] = expected + data = a[:] + np.testing.assert_equal(data, expected) @pytest.mark.parametrize("dtype", [str, "str"]) @@ -119,10 +148,10 @@ async def test_create_dtype_str(dtype: Any) -> None: arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) assert arr.dtype.kind == "O" assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) - arr[:] = ["a", "bb", "ccc"] + assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),) + arr[:] = [b"a", b"bb", b"ccc"] result = arr[:] - np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object")) + np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object")) @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: + with config.set( + { + "v2_dtype_kind_to_default_filters_and_compressor": { + "numeric": ["zstd"], + "unicode": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + }, + } + ): + dtype, expected = dtype_expected + arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) + assert arr.metadata.filters[0].codec_id == expected From f6b98c3d1f8b06803a2ba7898223fa5ddbea9790 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 11 Dec 2024 18:39:02 +0100 Subject: [PATCH 13/35] incorporate feedback --- src/zarr/api/asynchronous.py | 18 +++- src/zarr/codecs/_v2.py | 6 ++ src/zarr/core/array.py | 9 ++ src/zarr/core/metadata/v2.py | 10 +- tests/test_array.py | 2 + tests/test_group.py | 2 + tests/test_metadata/test_consolidated.py | 2 + tests/test_metadata/test_v2.py | 11 +-- tests/test_v2.py | 116 
++++++++++++++++------- 9 files changed, 121 insertions(+), 55 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 26822f725b..dc199f28ff 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -17,10 +17,12 @@ ChunkCoords, MemoryOrder, ZarrFormat, + parse_dtype, ) from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v2 import _default_filters_and_compressor from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -815,7 +817,12 @@ async def create( dtype : str or dtype, optional NumPy dtype. compressor : Codec, optional - Primary compressor. + Primary compressor for `zarr_format=2`. + If neither `compressor` nor `filters` are provided, a default compressor will be used: + - For numeric arrays, the default is `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `v2_default_compressors` variable in the Zarr config. fill_value : object Default value to use for uninitialized portions of the array. 
order : {'C', 'F'}, optional @@ -885,8 +892,13 @@ async def create( or _default_zarr_version() ) - if zarr_format == 2 and chunks is None: - chunks = shape + if zarr_format == 2: + if chunks is None: + chunks = shape + dtype = parse_dtype(dtype, zarr_format) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + elif zarr_format == 3 and chunk_shape is None: if chunks is not None: chunk_shape = chunks diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 2d186b8878..53edc1f4a1 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -50,6 +50,12 @@ async def _decode_single( try: chunk = chunk.view(chunk_spec.dtype) except TypeError: + # this will happen if the dtype of the chunk + # does not match the dtype of the array spec i.g. if + # the dtype of the chunk_spec is a string dtype, but the chunk + # is an object array. In this case, we need to convert the object + # array to the correct dtype. + chunk = np.array(chunk).astype(chunk_spec.dtype) elif chunk.dtype != object: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 56e30e2715..c7cc49a07c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -77,6 +77,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) +from zarr.core.metadata.v2 import _default_filters_and_compressor from zarr.core.metadata.v3 import parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError @@ -617,6 +618,14 @@ async def _create_v2( if dimension_separator is None: dimension_separator = "." 
+ dtype = parse_dtype(dtype, 2) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + if np.issubdtype(dtype, np.str_): + filters = filters or [] + if not any(x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [{"id": "vlen-utf8"}] + metadata = ArrayV2Metadata( shape=shape, dtype=np.dtype(dtype), diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 763aefbf7a..1cbc4d79a9 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -71,14 +71,7 @@ def __init__( shape_parsed = parse_shapelike(shape) dtype_parsed = parse_dtype(dtype) chunks_parsed = parse_shapelike(chunks) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype_parsed) - if dtype is str or dtype == "str": - vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} - if filters and not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [vlen_codec] - else: - filters = [vlen_codec] + compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -343,6 +336,7 @@ def _default_filters_and_compressor( https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ + dtype = np.dtype(dtype) default_compressors = config.get("v2_default_compressors") if dtype.kind in "biufcmM": dtype_key = "numeric" diff --git a/tests/test_array.py b/tests/test_array.py index 58bc823068..eb138aa8cd 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -8,6 +8,7 @@ import numcodecs import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous from zarr import Array, AsyncArray, Group @@ -513,6 +514,7 @@ async def test_info_v2_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", + _filters=(Zstd(level=0),), _count_bytes=128, ) assert result == expected diff --git a/tests/test_group.py 
b/tests/test_group.py index afa290207d..3b7acc9b15 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr import zarr.api.asynchronous @@ -496,6 +497,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "shape": (1,), "chunks": (1,), "order": "C", + "filters": (Zstd(level=0),), "zarr_format": zarr_format, }, "subgroup": { diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 8ae9cc81fd..26e9904608 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous import zarr.api.synchronous @@ -486,6 +487,7 @@ async def test_consolidated_metadata_v2(self): attributes={"key": "a"}, chunks=(1,), fill_value=None, + filters=(Zstd(level=0),), order="C", ), "g1": GroupMetadata( diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 8c82eac20a..003aef331f 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -11,7 +11,7 @@ from zarr.core.buffer import cpu from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata -from zarr.core.metadata.v2 import _default_filters_and_compressor, parse_zarr_format +from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: from typing import Any @@ -77,15 +77,6 @@ def test_metadata_to_dict( assert observed["dimension_separator"] == expected_dimension_sep observed.pop("dimension_separator") - if not filters and not compressor: - assert observed["filters"], observed["compressor"] == _default_filters_and_compressor( - np.dtype(data_type) - ) - observed.pop("filters") - observed.pop("compressor") - expected.pop("filters") - expected.pop("compressor") - assert observed == expected diff --git a/tests/test_v2.py 
b/tests/test_v2.py index 68c07e2024..ba8544ff0f 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -11,7 +11,7 @@ import zarr import zarr.core.buffer import zarr.storage -from zarr import Array +from zarr import Array, config from zarr.storage import MemoryStore, StorePath @@ -82,36 +82,65 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - store = zarr.storage.MemoryStore() - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", - shape=(3,), - chunks=(3,), - dtype=dtype, - fill_value=b"X", - ) - - result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) - assert result is not None - - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "compressor": None, - "dtype": f"{dtype}0", - "fill_value": "WA==", - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - assert serialized == expected - - data = zarr.open_array(store=store, path="foo")[:] - expected = np.full((3,), b"X", dtype=dtype) - np.testing.assert_equal(data, expected) + with config.set( + { + "v2_default_compressors": { + "bytes": ["vlen-bytes"], + }, + } + ): + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", + shape=(3,), + chunks=(3,), + dtype=dtype, + fill_value=b"X", + ) + + result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": f"{dtype}0", + "fill_value": "WA==", + "filters": [{"id": "vlen-bytes"}], + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected + + data = zarr.open_array(store=store, path="foo")[:] + expected = np.full((3,), b"X", dtype=dtype) + np.testing.assert_equal(data, expected) + + 
+@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]]) +def test_v2_encode_decode_with_data(dtype_value): + dtype, value = dtype_value + with config.set( + { + "v2_default_compressors": { + "unicode": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + }, + } + ): + expected = np.full((3,), value, dtype=dtype) + a = zarr.create( + shape=(3,), + zarr_format=2, + dtype=dtype, + ) + a[:] = expected + data = a[:] + np.testing.assert_equal(data, expected) @pytest.mark.parametrize("dtype", [str, "str"]) @@ -119,10 +148,10 @@ async def test_create_dtype_str(dtype: Any) -> None: arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) assert arr.dtype.kind == "O" assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) - arr[:] = ["a", "bb", "ccc"] + assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),) + arr[:] = [b"a", b"bb", b"ccc"] result = arr[:] - np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object")) + np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object")) @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: + with config.set( + { + "v2_dtype_kind_to_default_filters_and_compressor": { + "numeric": ["zstd"], + "unicode": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + }, + } + ): + dtype, expected = dtype_expected + arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) + assert arr.metadata.filters[0].codec_id == expected From fcbae8bdeda8aba34145f83bf65fb07608734804 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 11 Dec 2024 19:27:34 +0100 Subject: [PATCH 14/35] fix mypy --- src/zarr/api/asynchronous.py | 3 +-- src/zarr/core/metadata/v2.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index dc199f28ff..1cd7125c55 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -898,8 
+898,7 @@ async def create( dtype = parse_dtype(dtype, zarr_format) if not filters and not compressor: filters, compressor = _default_filters_and_compressor(dtype) - - elif zarr_format == 3 and chunk_shape is None: + elif zarr_format == 3 and chunk_shape is None: #type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks chunks = None diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 1cbc4d79a9..abbadfe00d 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -331,7 +331,7 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: def _default_filters_and_compressor( dtype: np.dtype[Any], -) -> tuple[list[dict[str, str]], dict[str, str] | None]: +) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]: """Get the default filters and compressor for a dtype. https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html From a77fb0d7b122b4222746f175403005e4775b7362 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 11 Dec 2024 19:37:44 +0100 Subject: [PATCH 15/35] allow only one default compressor --- src/zarr/api/asynchronous.py | 4 ++-- src/zarr/core/config.py | 8 ++++---- src/zarr/core/metadata/v2.py | 6 +++--- tests/test_config.py | 8 ++++---- tests/test_v2.py | 18 +++++++++--------- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 1cd7125c55..7aabe416e1 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -822,7 +822,7 @@ async def create( - For numeric arrays, the default is `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. - These defaults can be changed using the `v2_default_compressors` variable in the Zarr config. + These defaults can be changed using the `v2_default_compressor` variable in the Zarr config. fill_value : object Default value to use for uninitialized portions of the array. 
order : {'C', 'F'}, optional @@ -898,7 +898,7 @@ async def create( dtype = parse_dtype(dtype, zarr_format) if not filters and not compressor: filters, compressor = _default_filters_and_compressor(dtype) - elif zarr_format == 3 and chunk_shape is None: #type: ignore[redundant-expr] + elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks chunks = None diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index e5ab29b6c9..1d64ae2056 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -64,10 +64,10 @@ def reset(self) -> None: }, "buffer": "zarr.core.buffer.cpu.Buffer", "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", - "v2_default_compressors": { - "numeric": ["zstd"], - "unicode": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", }, } ], diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index abbadfe00d..04dee87ca4 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -337,14 +337,14 @@ def _default_filters_and_compressor( https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ dtype = np.dtype(dtype) - default_compressors = config.get("v2_default_compressors") + default_compressor = config.get("v2_default_compressor") if dtype.kind in "biufcmM": dtype_key = "numeric" elif dtype.kind in "U": - dtype_key = "unicode" + dtype_key = "string" elif dtype.kind in "OSV": dtype_key = "bytes" else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - return [{"id": f} for f in default_compressors[dtype_key]], None + return [{"id": default_compressor[dtype_key]}], None diff --git a/tests/test_config.py b/tests/test_config.py index c46b456302..d24bd23333 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -63,10 +63,10 @@ def test_config_defaults_set() -> None: "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", 
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, - "v2_default_compressors": { - "numeric": ["zstd"], - "unicode": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", }, } ] diff --git a/tests/test_v2.py b/tests/test_v2.py index ba8544ff0f..9811a576d1 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -84,8 +84,8 @@ def test_codec_pipeline() -> None: async def test_v2_encode_decode(dtype): with config.set( { - "v2_default_compressors": { - "bytes": ["vlen-bytes"], + "v2_default_compressor": { + "bytes": "vlen-bytes", }, } ): @@ -126,9 +126,9 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "v2_default_compressors": { - "unicode": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "v2_default_compressor": { + "string": "vlen-utf8", + "bytes": "vlen-bytes", }, } ): @@ -215,10 +215,10 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { - "v2_dtype_kind_to_default_filters_and_compressor": { - "numeric": ["zstd"], - "unicode": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", }, } ): From 876e67dbee73a2ca3c78e9ad9349e0390b0b6245 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 17:01:59 +0100 Subject: [PATCH 16/35] put `v2_default_compressor` under `array` --- src/zarr/core/config.py | 15 +++++++++------ src/zarr/core/metadata/v2.py | 2 +- tests/test_config.py | 14 ++++++++------ tests/test_v2.py | 8 +++----- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 1d64ae2056..163f122f8b 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -42,7 +42,14 @@ def reset(self) -> None: defaults=[ { "default_zarr_version": 3, - 
"array": {"order": "C"}, + "array": { + "order": "C", + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, "json_indent": 2, @@ -64,11 +71,7 @@ def reset(self) -> None: }, "buffer": "zarr.core.buffer.cpu.Buffer", "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", - "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", - }, + } ], ) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index d1546d16b2..b66e41bf0f 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -337,7 +337,7 @@ def _default_filters_and_compressor( https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ dtype = np.dtype(dtype) - default_compressor = config.get("v2_default_compressor") + default_compressor = config.get("array.v2_default_compressor") if dtype.kind in "biufcmM": dtype_key = "numeric" elif dtype.kind in "U": diff --git a/tests/test_config.py b/tests/test_config.py index d24bd23333..437b2a56b8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -41,7 +41,14 @@ def test_config_defaults_set() -> None: assert config.defaults == [ { "default_zarr_version": 3, - "array": {"order": "C"}, + "array": { + "order": "C", + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, "json_indent": 2, @@ -63,11 +70,6 @@ def test_config_defaults_set() -> None: "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, - "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", - }, } ] assert config.get("array.order") == "C" diff --git a/tests/test_v2.py b/tests/test_v2.py index b95b491a41..defd86a685 100644 --- a/tests/test_v2.py 
+++ b/tests/test_v2.py @@ -84,9 +84,7 @@ def test_codec_pipeline() -> None: async def test_v2_encode_decode(dtype): with config.set( { - "v2_default_compressor": { - "bytes": "vlen-bytes", - }, + "array.v2_default_compressor.bytes": "vlen-bytes", } ): store = zarr.storage.MemoryStore() @@ -126,7 +124,7 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "v2_default_compressor": { + "array.v2_default_compressor": { "string": "vlen-utf8", "bytes": "vlen-bytes", }, @@ -215,7 +213,7 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { - "v2_default_compressor": { + "array.v2_default_compressor": { "numeric": "zstd", "string": "vlen-utf8", "bytes": "vlen-bytes", From 12dfaf43195ccbbaf611f87759110e2b0fe07a14 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 17:38:31 +0100 Subject: [PATCH 17/35] deprecate zarr.storage.default_compressor --- src/zarr/core/config.py | 7 +++---- src/zarr/storage/__init__.py | 22 ++++++++++++++++++++++ tests/test_v2.py | 11 ++++++----- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 163f122f8b..9b3b20e5a6 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -31,10 +31,10 @@ def reset(self) -> None: # The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. # For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations # in the registry and then select them in the config. -# e.g. an implementation of the bytes codec in a class "NewBytesCodec", requires the value of codecs.bytes.name to be -# "NewBytesCodec". +# e.g. an implementation of the bytes codec in a class "your.module.NewBytesCodec", requires the value of codecs.bytes +# to be "your.module.NewBytesCodec". 
# Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations -# e.g. export ZARR_CODECS__BYTES__NAME="NewBytesCodec" +# e.g. export ZARR_CODECS__BYTES="your.module.NewBytesCodec" # (for more information see github.com/pytroll/donfig) # Default values below point to the standard implementations of zarr-python config = Config( @@ -71,7 +71,6 @@ def reset(self) -> None: }, "buffer": "zarr.core.buffer.cpu.Buffer", "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", - } ], ) diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index 17b11f54a6..282f8ed9c3 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -1,3 +1,8 @@ +import sys +import warnings +from types import ModuleType +from typing import Any + from zarr.storage.common import StoreLike, StorePath, make_store_path from zarr.storage.local import LocalStore from zarr.storage.logging import LoggingStore @@ -17,3 +22,20 @@ "ZipStore", "make_store_path", ] + + +class VerboseModule(ModuleType): + def __setattr__(self, attr: str, value: Any) -> None: + if attr == "default_compressor": + warnings.warn( + "setting zarr.storage.default_compressor is deprecated, use " + "zarr.config to configure array.v2_default_compressor " + "e.g. 
config.set({'codecs.zstd':'your.module.Zstd', 'array.v2_default_compressor.numeric': 'zstd'})", + DeprecationWarning, + stacklevel=1, + ) + else: + super().__setattr__(attr, value) + + +sys.modules[__name__].__class__ = VerboseModule diff --git a/tests/test_v2.py b/tests/test_v2.py index defd86a685..205b0fdf52 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -82,11 +82,7 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - with config.set( - { - "array.v2_default_compressor.bytes": "vlen-bytes", - } - ): + with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( @@ -206,6 +202,11 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" np.testing.assert_array_equal(arr[slice(6, 9, None), slice(3, 6, None)], a) +def test_default_compressor_deprecation_warning(): + with pytest.warns(DeprecationWarning): + zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()" + + @pytest.mark.parametrize( "dtype_expected", [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]], From 6954b601124859c34729e494e01029c8c60b8e6e Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 18:01:00 +0100 Subject: [PATCH 18/35] test v3_default_codecs --- src/zarr/core/config.py | 5 +++++ src/zarr/core/metadata/v2.py | 1 - tests/test_config.py | 18 +++++++++++++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 9b3b20e5a6..809d23d86c 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -49,6 +49,11 @@ def reset(self) -> None: "string": "vlen-utf8", "bytes": "vlen-bytes", }, + "v3_default_codecs": { + "numeric": ["bytes", "zstd"], + "string": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + }, }, "async": {"concurrency": 10, "timeout": None}, 
"threading": {"max_workers": None}, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index b66e41bf0f..bd0fbecf4a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -336,7 +336,6 @@ def _default_filters_and_compressor( https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html """ - dtype = np.dtype(dtype) default_compressor = config.get("array.v2_default_compressor") if dtype.kind in "biufcmM": dtype_key = "numeric" diff --git a/tests/test_config.py b/tests/test_config.py index 437b2a56b8..56b20d4b41 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,7 +11,8 @@ from zarr import Array, zeros from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store -from zarr.codecs import BloscCodec, BytesCodec, Crc32cCodec, ShardingCodec +from zarr.codecs import BloscCodec, BytesCodec, Crc32cCodec, ShardingCodec, TransposeCodec, GzipCodec, VLenBytesCodec, \ + VLenUTF8Codec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline @@ -239,3 +240,18 @@ def test_config_buffer_implementation() -> None: ) arr_Crc32c[:] = data2d assert np.array_equal(arr_Crc32c[:], data2d) + +@pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) +def test_default_codecs(dtype:str) -> None: + with config.set({"array.v3_default_codecs": { + "numeric": ["bytes", "gzip"], # test setting non-standard codecs + "string": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + }}): + arr = zeros(shape=(100), store=StoreExpectingTestBuffer(), dtype=dtype) + if dtype == "int": + assert arr.metadata.codecs == [BytesCodec(), GzipCodec()] + elif dtype == "bytes": + assert arr.metadata.codecs == [VLenBytesCodec()] + elif dtype == "str": + assert arr.metadata.codecs == [VLenUTF8Codec()] From 80dfc40059f4c86b5a72be2949b5d4b216ef58cb Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 18:21:53 +0100 
Subject: [PATCH 19/35] use v3_default_codecs --- src/zarr/codecs/__init__.py | 18 --------- src/zarr/core/array.py | 24 ++++++++---- tests/test_config.py | 76 +++++++++++++++++++++---------------- 3 files changed, 61 insertions(+), 57 deletions(-) diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index e407d94892..165dbe476d 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -1,10 +1,5 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - import numpy as np - from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec, Endian from zarr.codecs.crc32c_ import Crc32cCodec @@ -13,7 +8,6 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec -from zarr.core.metadata.v3 import DataType __all__ = [ "BloscCname", @@ -30,15 +24,3 @@ "VLenUTF8Codec", "ZstdCodec", ] - - -def _get_default_array_bytes_codec( - np_dtype: np.dtype[Any], -) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec: - dtype = DataType.from_numpy(np_dtype) - if dtype == DataType.string: - return VLenUTF8Codec() - elif dtype == DataType.bytes: - return VLenBytesCodec() - else: - return BytesCodec() diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 32d30562fa..5b255cd13f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -12,7 +12,6 @@ from zarr._compat import _deprecate_positional_args from zarr.abc.store import Store, set_or_delete -from zarr.codecs import _get_default_array_bytes_codec from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo from zarr.core.attributes import Attributes @@ -78,7 +77,7 @@ T_ArrayMetadata, ) from zarr.core.metadata.v2 import _default_filters_and_compressor -from zarr.core.metadata.v3 import parse_node_type_array +from zarr.core.metadata.v3 import DataType, parse_node_type_array from 
zarr.core.sync import sync from zarr.errors import MetadataValidationError from zarr.registry import get_pipeline_class @@ -556,11 +555,7 @@ async def _create_v3( await ensure_no_existing_node(store_path, zarr_format=3) shape = parse_shapelike(shape) - codecs = ( - list(codecs) - if codecs is not None - else [_get_default_array_bytes_codec(np.dtype(dtype))] - ) + codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) if chunk_key_encoding is None: chunk_key_encoding = ("default", "/") @@ -3318,3 +3313,18 @@ def _build_parents( ) return parents + + +def _get_default_codecs( + np_dtype: np.dtype[Any], +) -> list[dict[str, JSON]]: + default_codecs = config.get("array.v3_default_codecs") + dtype = DataType.from_numpy(np_dtype) + if dtype == DataType.string: + dtype_key = "string" + elif dtype == DataType.bytes: + dtype_key = "bytes" + else: + dtype_key = "numeric" + + return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] diff --git a/tests/test_config.py b/tests/test_config.py index 56b20d4b41..eeeedec3fd 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,8 +11,15 @@ from zarr import Array, zeros from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store -from zarr.codecs import BloscCodec, BytesCodec, Crc32cCodec, ShardingCodec, TransposeCodec, GzipCodec, VLenBytesCodec, \ - VLenUTF8Codec +from zarr.codecs import ( + BloscCodec, + BytesCodec, + Crc32cCodec, + GzipCodec, + ShardingCodec, + VLenBytesCodec, + VLenUTF8Codec, +) from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline @@ -216,39 +223,44 @@ def test_config_buffer_implementation() -> None: arr[:] = np.arange(100) register_buffer(TestBuffer) - config.set({"buffer": fully_qualified_name(TestBuffer)}) - assert get_buffer_class() == TestBuffer - - # no error using TestBuffer - data = 
np.arange(100) - arr[:] = np.arange(100) - assert np.array_equal(arr[:], data) - - data2d = np.arange(1000).reshape(100, 10) - arr_sharding = zeros( - shape=(100, 10), - store=StoreExpectingTestBuffer(), - codecs=[ShardingCodec(chunk_shape=(10, 10))], - ) - arr_sharding[:] = data2d - assert np.array_equal(arr_sharding[:], data2d) + with config.set({"buffer": fully_qualified_name(TestBuffer)}): + assert get_buffer_class() == TestBuffer + + # no error using TestBuffer + data = np.arange(100) + arr[:] = np.arange(100) + assert np.array_equal(arr[:], data) + + data2d = np.arange(1000).reshape(100, 10) + arr_sharding = zeros( + shape=(100, 10), + store=StoreExpectingTestBuffer(), + codecs=[ShardingCodec(chunk_shape=(10, 10))], + ) + arr_sharding[:] = data2d + assert np.array_equal(arr_sharding[:], data2d) + + arr_Crc32c = zeros( + shape=(100, 10), + store=StoreExpectingTestBuffer(), + codecs=[BytesCodec(), Crc32cCodec()], + ) + arr_Crc32c[:] = data2d + assert np.array_equal(arr_Crc32c[:], data2d) - arr_Crc32c = zeros( - shape=(100, 10), - store=StoreExpectingTestBuffer(), - codecs=[BytesCodec(), Crc32cCodec()], - ) - arr_Crc32c[:] = data2d - assert np.array_equal(arr_Crc32c[:], data2d) @pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) -def test_default_codecs(dtype:str) -> None: - with config.set({"array.v3_default_codecs": { - "numeric": ["bytes", "gzip"], # test setting non-standard codecs - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], - }}): - arr = zeros(shape=(100), store=StoreExpectingTestBuffer(), dtype=dtype) +def test_default_codecs(dtype: str) -> None: + with config.set( + { + "array.v3_default_codecs": { + "numeric": ["bytes", "gzip"], # test setting non-standard codecs + "string": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + } + } + ): + arr = zeros(shape=(100), dtype=dtype) if dtype == "int": assert arr.metadata.codecs == [BytesCodec(), GzipCodec()] elif dtype == "bytes": From 6001e93789cd74317d0b76b474357609ba0f3c11 Mon Sep 17 00:00:00 2001 
From: brokkoli71 Date: Sat, 14 Dec 2024 19:09:14 +0100 Subject: [PATCH 20/35] fix tests that expected codecs==["bytes"] --- tests/test_array.py | 18 +++++++++++------- tests/test_config.py | 5 +++++ tests/test_group.py | 5 ++++- tests/test_metadata/test_consolidated.py | 10 ++++++++-- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index b6d82a95ac..feebbc687b 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -12,7 +12,7 @@ import zarr.api.asynchronous from zarr import Array, AsyncArray, Group -from zarr.codecs import BytesCodec, VLenBytesCodec +from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array import chunks_initialized from zarr.core.buffer import default_buffer_prototype @@ -376,7 +376,7 @@ async def test_chunks_initialized() -> None: def test_nbytes_stored() -> None: - arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()]) result = arr.nbytes_stored() assert result == 366 # the size of the metadata document. This is a fragile test. arr[:50] = 1 @@ -388,7 +388,9 @@ def test_nbytes_stored() -> None: async def test_nbytes_stored_async() -> None: - arr = await zarr.api.asynchronous.create(shape=(100,), chunks=(10,), dtype="i4") + arr = await zarr.api.asynchronous.create( + shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()] + ) result = await arr.nbytes_stored() assert result == 366 # the size of the metadata document. This is a fragile test. 
await arr.setitem(slice(50), 1) @@ -473,13 +475,13 @@ def test_info_v3(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=[BytesCodec(), ZstdCodec()], _count_bytes=128, ) assert result == expected def test_info_complete(self) -> None: - arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()]) result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, @@ -530,13 +532,15 @@ async def test_info_v3_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=[BytesCodec(), ZstdCodec()], _count_bytes=128, ) assert result == expected async def test_info_complete_async(self) -> None: - arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + arr = await zarr.api.asynchronous.create( + shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()] + ) result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, diff --git a/tests/test_config.py b/tests/test_config.py index eeeedec3fd..6860427908 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -56,6 +56,11 @@ def test_config_defaults_set() -> None: "string": "vlen-utf8", "bytes": "vlen-bytes", }, + "v3_default_codecs": { + "bytes": ["vlen-bytes"], + "numeric": ["bytes", "zstd"], + "string": ["vlen-utf8"], + }, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, diff --git a/tests/test_group.py b/tests/test_group.py index ef5196067b..e0bc304b9b 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -523,7 +523,10 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": 
{}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": fill_value, "node_type": "array", diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 26e9904608..7f0c49338e 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -72,7 +72,10 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", @@ -216,7 +219,10 @@ def test_consolidated_sync(self, memory_store): "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", From ff766179df6b02b9fb222e1de88970bbf2317721 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 19:33:45 +0100 Subject: [PATCH 21/35] fix test_default_codecs --- tests/test_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 6860427908..6952c9c7fe 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -254,7 +254,7 @@ def test_config_buffer_implementation() -> None: assert np.array_equal(arr_Crc32c[:], data2d) -@pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) +@pytest.mark.parametrize("dtype", ["int", "bytes", str]) def test_default_codecs(dtype: str) -> None: with config.set( { @@ -265,7 +265,7 @@ def test_default_codecs(dtype: str) -> None: } } ): - arr = zeros(shape=(100), dtype=dtype) + arr = zeros(shape=(100), 
dtype=np.dtype(dtype), zarr_format=3) if dtype == "int": assert arr.metadata.codecs == [BytesCodec(), GzipCodec()] elif dtype == "bytes": From f04e0e6c52a00cd3b65699d89a56e046f9f2fe3a Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 19:48:38 +0100 Subject: [PATCH 22/35] fail-fast: false --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1c25dcb1f4..770241c59c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,6 +19,7 @@ jobs: name: os=${{ matrix.os }}, py=${{ matrix.python-version }}, np=${{ matrix.numpy-version }}, deps=${{ matrix.dependency-set }} strategy: + fail-fast: false matrix: python-version: ['3.11', '3.12', '3.13'] numpy-version: ['1.25', '2.1'] From f63bb671d6530e279a9d1aea488bdc25436b05d7 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 20:25:17 +0100 Subject: [PATCH 23/35] fix string codecs for np1.25 --- .github/workflows/test.yml | 1 - src/zarr/core/metadata/v3.py | 6 +++++- tests/test_config.py | 25 ++++++++++++++----------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 770241c59c..1c25dcb1f4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,7 +19,6 @@ jobs: name: os=${{ matrix.os }}, py=${{ matrix.python-version }}, np=${{ matrix.numpy-version }}, deps=${{ matrix.dependency-set }} strategy: - fail-fast: false matrix: python-version: ['3.11', '3.12', '3.13'] numpy-version: ['1.25', '2.1'] diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index b800ae4d73..eeaab217c3 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -37,7 +37,7 @@ ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes -from zarr.core.strings import _STRING_DTYPE as STRING_NP_DTYPE +from zarr.core.strings import _STRING_DTYPE as 
STRING_NP_DTYPE, _NUMPY_SUPPORTS_VLEN_STRING from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class @@ -606,6 +606,10 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType: return DataType.string elif dtype.kind == "S": return DataType.bytes + elif not _NUMPY_SUPPORTS_VLEN_STRING and dtype.kind == "O": + # numpy < 2.0 does not support vlen string dtype + # so we fall back on object array of strings + return DataType.string dtype_to_data_type = { "|b1": "bool", "bool": "bool", diff --git a/tests/test_config.py b/tests/test_config.py index 6952c9c7fe..93c2acd37d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,8 +8,8 @@ import pytest import zarr -from zarr import Array, zeros -from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline +from zarr import Array, zeros, AsyncArray +from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline, Codec from zarr.abc.store import ByteSetter, Store from zarr.codecs import ( BloscCodec, @@ -25,6 +25,7 @@ from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config from zarr.core.indexing import SelectorTuple +from zarr.core.strings import _STRING_DTYPE from zarr.registry import ( fully_qualified_name, get_buffer_class, @@ -36,6 +37,7 @@ register_ndbuffer, register_pipeline, ) +from zarr.storage import MemoryStore from zarr.testing.buffer import ( NDBufferUsingTestNDArrayLike, StoreExpectingTestBuffer, @@ -254,8 +256,14 @@ def test_config_buffer_implementation() -> None: assert np.array_equal(arr_Crc32c[:], data2d) -@pytest.mark.parametrize("dtype", ["int", "bytes", str]) -def test_default_codecs(dtype: str) -> None: +@pytest.mark.parametrize(("dtype", "expected_codecs"), + [ + ("int", [BytesCodec(), GzipCodec()]), + ("bytes", [VLenBytesCodec()]), + ("str", [VLenUTF8Codec()]), + ] + ) +async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: with config.set( { 
"array.v3_default_codecs": { @@ -265,10 +273,5 @@ def test_default_codecs(dtype: str) -> None: } } ): - arr = zeros(shape=(100), dtype=np.dtype(dtype), zarr_format=3) - if dtype == "int": - assert arr.metadata.codecs == [BytesCodec(), GzipCodec()] - elif dtype == "bytes": - assert arr.metadata.codecs == [VLenBytesCodec()] - elif dtype == "str": - assert arr.metadata.codecs == [VLenUTF8Codec()] + arr = await AsyncArray.create(shape=(100,), chunk_shape=(100,),dtype=np.dtype(dtype), zarr_format=3, store=MemoryStore()) + assert arr.metadata.codecs == expected_codecs From 00e241ead3ee1a42f58bbe34c3502e0f413dfe4c Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 14 Dec 2024 20:37:29 +0100 Subject: [PATCH 24/35] format --- src/zarr/core/metadata/v3.py | 3 ++- tests/test_config.py | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index eeaab217c3..3f8c5def64 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -37,7 +37,8 @@ ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes -from zarr.core.strings import _STRING_DTYPE as STRING_NP_DTYPE, _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.strings import _STRING_DTYPE as STRING_NP_DTYPE from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class diff --git a/tests/test_config.py b/tests/test_config.py index 93c2acd37d..d7b04eb21c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,8 +8,8 @@ import pytest import zarr -from zarr import Array, zeros, AsyncArray -from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline, Codec +from zarr import Array, AsyncArray, zeros +from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store from zarr.codecs import ( BloscCodec, @@ -25,7 
+25,6 @@ from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config from zarr.core.indexing import SelectorTuple -from zarr.core.strings import _STRING_DTYPE from zarr.registry import ( fully_qualified_name, get_buffer_class, @@ -256,13 +255,14 @@ def test_config_buffer_implementation() -> None: assert np.array_equal(arr_Crc32c[:], data2d) -@pytest.mark.parametrize(("dtype", "expected_codecs"), +@pytest.mark.parametrize( + ("dtype", "expected_codecs"), [ ("int", [BytesCodec(), GzipCodec()]), ("bytes", [VLenBytesCodec()]), ("str", [VLenUTF8Codec()]), - ] - ) + ], +) async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: with config.set( { @@ -273,5 +273,11 @@ async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: } } ): - arr = await AsyncArray.create(shape=(100,), chunk_shape=(100,),dtype=np.dtype(dtype), zarr_format=3, store=MemoryStore()) + arr = await AsyncArray.create( + shape=(100,), + chunk_shape=(100,), + dtype=np.dtype(dtype), + zarr_format=3, + store=MemoryStore(), + ) assert arr.metadata.codecs == expected_codecs From 58406c813813f6860fd61d8b4e10a355e9b4094e Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 12:58:02 +0100 Subject: [PATCH 25/35] add docstrings to create in asynchronous.py and array.py --- src/zarr/api/asynchronous.py | 38 ++++++++++++---- src/zarr/core/array.py | 86 +++++++++++++++++++++++++++--------- 2 files changed, 94 insertions(+), 30 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index ff89b6e4a5..a19f56e06d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -394,7 +394,7 @@ async def save_array( arr : ndarray NumPy array with data to save. zarr_format : {2, 3, None}, optional - The zarr format to use when saving. + The zarr format to use when saving (default is 3). path : str or None, optional The path within the store where the array will be saved. 
storage_options : dict @@ -810,24 +810,40 @@ async def create( shape : int or tuple of ints Array shape. chunks : int or tuple of ints, optional - Chunk shape. If True, will be guessed from `shape` and `dtype`. If - False, will be set to `shape`, i.e., single chunk for the whole array. - If an int, the chunk size in each dimension will be given by the value - of `chunks`. Default is True. + The shape of the array's chunks. + V2 only. V3 arrays should use `chunk_shape` instead. + Default values are guessed based on the shape and dtype. dtype : str or dtype, optional NumPy dtype. + chunk_shape : int or tuple of ints, optional + The shape of the Array's chunks (default is None). + V3 only. V2 arrays should use `chunks` instead. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ("default", "/"). + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations thereof. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use `filters` and `compressor` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. compressor : Codec, optional - Primary compressor for `zarr_format=2`. + Primary compressor to compress chunk data. + V2 only. V3 arrays should use `codecs` instead. If neither `compressor` nor `filters` are provided, a default compressor will be used: - For numeric arrays, the default is `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. 
- These defaults can be changed using the `v2_default_compressor` variable in the Zarr config. + These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. - Default is set in Zarr's config (`array.order`). + Default is specified in the Zarr config `array.order`. store : Store or str Store or path to directory in file system or name of zip file. synchronizer : object, optional @@ -842,6 +858,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. + V2 only. If neither `compressor` nor `filters` are provided, a default + compressor will be used. (see `compressor` for details) cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. If False, array metadata will be reloaded @@ -857,7 +875,8 @@ async def create( A codec to encode object arrays, only needed if dtype=object. dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - + V2 only. V3 arrays should use `chunk_key_encoding` instead. + Default is ".". .. versionadded:: 2.8 write_empty_chunks : bool, optional @@ -873,6 +892,7 @@ async def create( zarr_format : {2, 3, None}, optional The zarr format to use when saving. + Default is 3. meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 5b255cd13f..990d7b0936 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -408,27 +408,47 @@ async def create( attributes : dict[str, JSON], optional The attributes of the array (default is None). 
chunk_shape : ChunkCoords, optional - The shape of the array's chunks (default is None). + The shape of the array's chunks + V3 only. V2 arrays should use `chunks` instead. + Default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding (default is None). - codecs : Iterable[Codec | dict[str, JSON]], optional - The codecs used to encode the data (default is None). + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ("default", "/"). + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations thereof. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use `filters` and `compressor` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str], optional The names of the dimensions (default is None). + V3 only. V2 arrays should not use this parameter. chunks : ShapeLike, optional - The shape of the array's chunks (default is None). - V2 only. V3 arrays should not have 'chunks' parameter. + The shape of the array's chunks. + V2 only. V3 arrays should use `chunk_shape` instead. + Default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional - The dimension separator (default is None). - V2 only. V3 arrays cannot have a dimension separator. + The dimension separator (default is "."). + V2 only. V3 arrays should use `chunk_key_encoding` instead. order : Literal["C", "F"], optional - The order of the array (default is None). 
+ The order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]], optional - The filters used to compress the data (default is None). - V2 only. V3 arrays should not have 'filters' parameter. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use `codecs` instead. If neither `compressor` + nor `filters` are provided, a default compressor will be used. (see + `compressor` for details) compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). - V2 only. V3 arrays should not have 'compressor' parameter. + V2 only. V3 arrays should use `codecs` instead. + If neither `compressor` nor `filters` are provided, a default compressor will be used: + - For numeric arrays, the default is `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional @@ -1472,23 +1492,47 @@ def create( dtype : npt.DTypeLike The data type of the array. chunk_shape : ChunkCoords, optional - The shape of the Array's chunks (default is None). + The shape of the Array's chunks. + V3 only. V2 arrays should use `chunks` instead. + Default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding (default is None). - codecs : Iterable[Codec | dict[str, JSON]], optional - The codecs used to encode the data (default is None). + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ("default", "/"). + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations thereof. 
The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use `filters` and `compressor` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str], optional The names of the dimensions (default is None). + V3 only. V2 arrays should not use this parameter. chunks : ChunkCoords, optional - The shape of the Array's chunks (default is None). + The shape of the array's chunks. + V2 only. V3 arrays should use `chunk_shape` instead. + Default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional - The dimension separator (default is None). + The dimension separator (default is "."). + V2 only. V3 arrays should use `chunk_key_encoding` instead. order : Literal["C", "F"], optional - The order of the array (default is None). + The order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]], optional - The filters used to compress the data (default is None). + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use `codecs` instead. If neither `compressor` + nor `filters` are provided, a default compressor will be used. (see + `compressor` for details) compressor : dict[str, JSON], optional - The compressor used to compress the data (default is None). + Primary compressor to compress chunk data. + V2 only. V3 arrays should use `codecs` instead. + If neither `compressor` nor `filters` are provided, a default compressor will be used: + - For numeric arrays, the default is `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. 
+ - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). From fc0998923ccd68652b4ea25f9beacf6789b621c4 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 13:19:30 +0100 Subject: [PATCH 26/35] add docstrings to creation in group.py --- src/zarr/core/group.py | 109 +++++++++++++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index f46c5126b2..8adce41c64 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1034,24 +1034,46 @@ async def create_array( dtype : np.DtypeLike = float64 The data type of the array. chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. V3 only. + The shape of the chunks of the array. + V3 only. V2 arrays should use `chunks` instead. + Default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ("default", "/"). codecs : Iterable[Codec | dict[str, JSON]] | None = None An iterable of Codec or dict serializations thereof. The elements of this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use `filters` and `compressor` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. 
dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None - The shape of the chunks of the array. V2 only. + The shape of the chunks of the array. + V2 only. V3 arrays should use `chunk_shape` instead. + Default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. + The delimiter used for the chunk keys. (default: ".") + V2 only. V3 arrays should use `chunk_key_encoding` instead. order : Literal["C", "F"] | None = None - The memory order of the array. + The memory order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]] | None = None - Filters for the array. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use `codecs` instead. If neither `compressor` + nor `filters` are provided, a default compressor will be used. (see + `compressor` for details) compressor : dict[str, JSON] | None = None - The compressor for the array. + The compressor used to compress the data (default is None). + V2 only. V3 arrays should use `codecs` instead. + If neither `compressor` nor `filters` are provided, a default compressor will be used: + - For numeric arrays, the default is `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is @@ -2222,7 +2244,7 @@ def create_array( ) -> Array: """Create a zarr array within this AsyncGroup. - This method lightly wraps AsyncArray.create. + This method lightly wraps `AsyncArray.create`. 
Parameters ---------- @@ -2233,24 +2255,46 @@ def create_array( dtype : np.DtypeLike = float64 The data type of the array. chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. V3 only. + The shape of the chunks of the array. + V3 only. V2 arrays should use `chunks` instead. + Default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ("default", "/"). codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations thereof. The elements of this collection - specify the transformation from array values to stored bytes. + An iterable of Codec or dict serializations thereof. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use `filters` and `compressor` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None - The shape of the chunks of the array. V2 only. + The shape of the chunks of the array. + V2 only. V3 arrays should use `chunk_shape` instead. + Default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. + The delimiter used for the chunk keys. (default: ".") + V2 only. V3 arrays should use `chunk_key_encoding` instead. 
order : Literal["C", "F"] | None = None - The memory order of the array. + The memory order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]] | None = None - Filters for the array. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use `codecs` instead. If neither `compressor` + nor `filters` are provided, a default compressor will be used. (see + `compressor` for details) compressor : dict[str, JSON] | None = None - The compressor for the array. + The compressor used to compress the data (default is None). + V2 only. V3 arrays should use `codecs` instead. + If neither `compressor` nor `filters` are provided, a default compressor will be used: + - For numeric arrays, the default is `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is @@ -2260,6 +2304,7 @@ def create_array( Returns ------- + Array """ @@ -2574,24 +2619,46 @@ def array( dtype : np.DtypeLike = float64 The data type of the array. chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. V3 only. + The shape of the chunks of the array. + V3 only. V2 arrays should use `chunks` instead. + Default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ("default", "/"). 
codecs : Iterable[Codec | dict[str, JSON]] | None = None An iterable of Codec or dict serializations thereof. The elements of this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use `filters` and `compressor` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. + - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None - The shape of the chunks of the array. V2 only. + The shape of the chunks of the array. + V2 only. V3 arrays should use `chunk_shape` instead. + Default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. + The delimiter used for the chunk keys. (default: ".") + V2 only. V3 arrays should use `chunk_key_encoding` instead. order : Literal["C", "F"] | None = None - The memory order of the array. + The memory order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]] | None = None - Filters for the array. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use `codecs` instead. If neither `compressor` + nor `filters` are provided, a default compressor will be used. (see + `compressor` for details) compressor : dict[str, JSON] | None = None - The compressor for the array. + The compressor used to compress the data (default is None). + V2 only. V3 arrays should use `codecs` instead. + If neither `compressor` nor `filters` are provided, a default compressor will be used: + - For numeric arrays, the default is `ZstdCodec`. 
+ - For Unicode strings, the default is `VLenUTF8Codec`. + - For bytes or objects, the default is `VLenBytesCodec`. + These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is From c62aff53ac226fe825028aa56b0e26a8a1f38cac Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:12:49 +0100 Subject: [PATCH 27/35] Apply suggestions from code review Co-authored-by: David Stansby --- src/zarr/api/asynchronous.py | 20 ++++++++++---------- src/zarr/core/array.py | 2 +- src/zarr/storage/__init__.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index b47bc07f22..ba4acf5cbc 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -403,7 +403,7 @@ async def save_array( arr : ndarray NumPy array with data to save. zarr_format : {2, 3, None}, optional - The zarr format to use when saving (default is 3). + The zarr format to use when saving (default is 3 if not specified). path : str or None, optional The path within the store where the array will be saved. storage_options : dict @@ -821,7 +821,7 @@ async def create( chunks : int or tuple of ints, optional The shape of the array's chunks. V2 only. V3 arrays should use `chunk_shape` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. dtype : str or dtype, optional NumPy dtype. chunk_shape : int or tuple of ints, optional @@ -830,9 +830,9 @@ async def create( chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. V3 only. V2 arrays should use `dimension_separator` instead. - Default is ("default", "/"). 
+ Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use `filters` and `compressor` instead. If no codecs are provided, default codecs will be used: @@ -842,8 +842,8 @@ async def create( These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. compressor : Codec, optional Primary compressor to compress chunk data. - V2 only. V3 arrays should use `codecs` instead. - If neither `compressor` nor `filters` are provided, a default compressor will be used: + V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - For numeric arrays, the default is `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. @@ -852,7 +852,7 @@ async def create( Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. - Default is specified in the Zarr config `array.order`. + If not specified, default is taken from the Zarr config ``array.order``. store : Store or str Store or path to directory in file system or name of zip file. synchronizer : object, optional @@ -867,8 +867,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. If neither `compressor` nor `filters` are provided, a default - compressor will be used. (see `compressor` for details) + V2 only. If neither ``compressor`` nor ``filters`` are provided, a default + compressor will be used. (see ``compressor`` for details). 
cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. If False, array metadata will be reloaded @@ -884,7 +884,7 @@ async def create( A codec to encode object arrays, only needed if dtype=object. dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - V2 only. V3 arrays should use `chunk_key_encoding` instead. + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. Default is ".". .. versionadded:: 2.8 diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0372d67026..4001ff6e2d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -642,7 +642,7 @@ async def _create_v2( if dimension_separator is None: dimension_separator = "." - dtype = parse_dtype(dtype, 2) + dtype = parse_dtype(dtype, zarr_format=2) if not filters and not compressor: filters, compressor = _default_filters_and_compressor(dtype) if np.issubdtype(dtype, np.str_): diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index 9172f8c9ce..514361bd6b 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -30,7 +30,7 @@ def __setattr__(self, attr: str, value: Any) -> None: warnings.warn( "setting zarr.storage.default_compressor is deprecated, use " "zarr.config to configure array.v2_default_compressor " - "e.g. config.set({'codecs.zstd':'your.module.Zstd', 'array.v2_default_compressor.numeric': 'zstd'})", + "e.g. 
config.set({'codecs.zstd':'numcodecs.Zstd', 'array.v2_default_compressor.numeric': 'zstd'})", DeprecationWarning, stacklevel=1, ) From 48c74485e5c8bdd3abe0111e9071842a87065e6d Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 15:17:29 +0100 Subject: [PATCH 28/35] apply suggestions from review --- src/zarr/api/asynchronous.py | 2 +- src/zarr/core/array.py | 44 ++++++++++++------------ src/zarr/core/group.py | 66 ++++++++++++++++++------------------ tests/test_v2.py | 2 +- 4 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index ba4acf5cbc..726149e351 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -834,7 +834,7 @@ async def create( codecs : Sequence of Codecs or dicts, optional An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use `filters` and `compressor` instead. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4001ff6e2d..195d676326 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -411,15 +411,15 @@ async def create( chunk_shape : ChunkCoords, optional The shape of the array's chunks V3 only. V2 arrays should use `chunks` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. V3 only. V2 arrays should use `dimension_separator` instead. - Default is ("default", "/"). + Default is ``("default", "/")``.
codecs : Sequence of Codecs or dicts, optional - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use `filters` and `compressor` instead. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. @@ -431,21 +431,21 @@ async def create( chunks : ShapeLike, optional The shape of the array's chunks. V2 only. V3 arrays should use `chunk_shape` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). - V2 only. V3 arrays should use `chunk_key_encoding` instead. + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional The order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use `codecs` instead. If neither `compressor` - nor `filters` are provided, a default compressor will be used. (see - `compressor` for details) + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). - V2 only. V3 arrays should use `codecs` instead. - If neither `compressor` nor `filters` are provided, a default compressor will be used: + V2 only. V3 arrays should use ``codecs`` instead.
+ If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - For numeric arrays, the default is `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. @@ -1503,15 +1503,15 @@ def create( chunk_shape : ChunkCoords, optional The shape of the Array's chunks. V3 only. V2 arrays should use `chunks` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. V3 only. V2 arrays should use `dimension_separator` instead. - Default is ("default", "/"). + Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use `filters` and `compressor` instead. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. @@ -1523,21 +1523,21 @@ def create( chunks : ChunkCoords, optional The shape of the array's chunks. V2 only. V3 arrays should use `chunk_shape` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). - V2 only. V3 arrays should use `chunk_key_encoding` instead. + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional The order of the array (default is specified in the Zarr config `array.order`).
filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use `codecs` instead. If neither `compressor` - nor `filters` are provided, a default compressor will be used. (see - `compressor` for details) + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON], optional Primary compressor to compress chunk data. - V2 only. V3 arrays should use `codecs` instead. - If neither `compressor` nor `filters` are provided, a default compressor will be used: + V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - For numeric arrays, the default is `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 8adce41c64..1bf5bd3e58 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1036,15 +1036,15 @@ async def create_array( chunk_shape : tuple[int, ...] | None = None The shape of the chunks of the array. V3 only. V2 arrays should use `chunks` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. V3 only. V2 arrays should use `dimension_separator` instead. - Default is ("default", "/"). + Default is ``("default", "/")``. codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs.
The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use `filters` and `compressor` instead. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. @@ -1055,21 +1055,21 @@ async def create_array( chunks : ChunkCoords | None = None The shape of the chunks of the array. V2 only. V3 arrays should use `chunk_shape` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use `chunk_key_encoding` instead. + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None The memory order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]] | None = None Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use `codecs` instead. If neither `compressor` - nor `filters` are provided, a default compressor will be used. (see - `compressor` for details) + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON] | None = None The compressor used to compress the data (default is None). - V2 only. V3 arrays should use `codecs` instead. - If neither `compressor` nor `filters` are provided, a default compressor will be used: + V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - For numeric arrays, the default is `ZstdCodec`.
- For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. @@ -2257,15 +2257,15 @@ def create_array( chunk_shape : tuple[int, ...] | None = None The shape of the chunks of the array. V3 only. V2 arrays should use `chunks` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. V3 only. V2 arrays should use `dimension_separator` instead. - Default is ("default", "/"). + Default is ``("default", "/")``. codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use `filters` and `compressor` instead. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. @@ -2276,21 +2276,21 @@ def create_array( chunks : ChunkCoords | None = None The shape of the chunks of the array. V2 only. V3 arrays should use `chunk_shape` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use `chunk_key_encoding` instead. + V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
order : Literal["C", "F"] | None = None The memory order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]] | None = None Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use `codecs` instead. If neither `compressor` - nor `filters` are provided, a default compressor will be used. (see - `compressor` for details) + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON] | None = None The compressor used to compress the data (default is None). - V2 only. V3 arrays should use `codecs` instead. - If neither `compressor` nor `filters` are provided, a default compressor will be used: + V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - For numeric arrays, the default is `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. @@ -2621,15 +2621,15 @@ def array( chunk_shape : tuple[int, ...] | None = None The shape of the chunks of the array. V3 only. V2 arrays should use `chunks` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. V3 only. V2 arrays should use `dimension_separator` instead. - Default is ("default", "/"). + Default is ``("default", "/")``. codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs.
The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use `filters` and `compressor` instead. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - For Unicode strings, the default is `VLenUTF8Codec`. @@ -2640,21 +2640,21 @@ def array( chunks : ChunkCoords | None = None The shape of the chunks of the array. V2 only. V3 arrays should use `chunk_shape` instead. - Default values are guessed based on the shape and dtype. + If not specified, default values are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use `chunk_key_encoding` instead. + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None The memory order of the array (default is specified in the Zarr config `array.order`). filters : list[dict[str, JSON]] | None = None Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use `codecs` instead. If neither `compressor` - nor `filters` are provided, a default compressor will be used. (see - `compressor` for details) + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON] | None = None The compressor used to compress the data (default is None). - V2 only. V3 arrays should use `codecs` instead. - If neither `compressor` nor `filters` are provided, a default compressor will be used: + V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - For numeric arrays, the default is `ZstdCodec`.
- For Unicode strings, the default is `VLenUTF8Codec`. - For bytes or objects, the default is `VLenBytesCodec`. diff --git a/tests/test_v2.py b/tests/test_v2.py index 205b0fdf52..ef06c13e26 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -203,7 +203,7 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" def test_default_compressor_deprecation_warning(): - with pytest.warns(DeprecationWarning): + with pytest.warns(DeprecationWarning, match="default_compressor is deprecated"): zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()" From 083c4cbd4f364e7b7e4773f891e6dc534b19a06d Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 15:32:01 +0100 Subject: [PATCH 29/35] correct code double backticks --- src/zarr/api/asynchronous.py | 12 +++++------ src/zarr/core/array.py | 28 ++++++++++++------------ src/zarr/core/group.py | 42 ++++++++++++++++++------------------ 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 726149e351..3e14fea4f0 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -836,17 +836,17 @@ async def create( this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. compressor : Codec, optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. 
If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - For numeric arrays, the default is `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. fill_value : object Default value to use for uninitialized portions of the array. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 195d676326..c487eb3f1a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -421,16 +421,16 @@ async def create( this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. chunks : ShapeLike, optional The shape of the array's chunks. - V2 only. V3 arrays should use `chunk_shape` instead. + V2 only. V3 arrays should use :func:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). 
@@ -446,9 +446,9 @@ async def create( The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - For numeric arrays, the default is `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). @@ -1513,16 +1513,16 @@ def create( this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. chunks : ChunkCoords, optional The shape of the array's chunks. - V2 only. V3 arrays should use `chunk_shape` instead. + V2 only. V3 arrays should use :func:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). 
@@ -1538,9 +1538,9 @@ def create( Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - For numeric arrays, the default is `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 1bf5bd3e58..eb8cf70f93 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1046,15 +1046,15 @@ async def create_array( this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use `chunk_shape` instead. + V2 only. V3 arrays should use :func:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. 
dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") @@ -1070,9 +1070,9 @@ async def create_array( The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - For numeric arrays, the default is `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool = False If True, a pre-existing array or group at the path of this array will @@ -2267,15 +2267,15 @@ def create_array( this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use `chunk_shape` instead. + V2 only. V3 arrays should use :func:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. 
dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") @@ -2291,9 +2291,9 @@ def create_array( The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - For numeric arrays, the default is `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool = False If True, a pre-existing array or group at the path of this array will @@ -2631,15 +2631,15 @@ def array( this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: - - For numeric arrays, the default is `BytesCodec` and `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use `chunk_shape` instead. + V2 only. V3 arrays should use :func:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. 
dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") @@ -2655,9 +2655,9 @@ def array( The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - For numeric arrays, the default is `ZstdCodec`. - - For Unicode strings, the default is `VLenUTF8Codec`. - - For bytes or objects, the default is `VLenBytesCodec`. + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. overwrite : bool = False If True, a pre-existing array or group at the path of this array will From 500bc7b469b295658b5ffe7becda096c344dc63a Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 15:47:22 +0100 Subject: [PATCH 30/35] correct attribute links in docstring --- src/zarr/core/array.py | 4 ++-- src/zarr/core/group.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index c487eb3f1a..7214a062fb 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -430,7 +430,7 @@ async def create( V3 only. V2 arrays should not use this parameter. chunks : ShapeLike, optional The shape of the array's chunks. - V2 only. V3 arrays should use :func:`chunk_shape` instead. + V2 only. V3 arrays should use :attr:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). @@ -1522,7 +1522,7 @@ def create( V3 only. V2 arrays should not use this parameter. chunks : ChunkCoords, optional The shape of the array's chunks. - V2 only. V3 arrays should use :func:`chunk_shape` instead. + V2 only. 
V3 arrays should use :attr:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index eb8cf70f93..8affa75a8f 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1054,7 +1054,7 @@ async def create_array( The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use :func:`chunk_shape` instead. + V2 only. V3 arrays should use :attr:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") @@ -2275,7 +2275,7 @@ def create_array( The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use :func:`chunk_shape` instead. + V2 only. V3 arrays should use :attr:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") @@ -2639,7 +2639,7 @@ def array( The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use :func:`chunk_shape` instead. + V2 only. V3 arrays should use :attr:`chunk_shape` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. 
(default: ".") From cdf55429a0be97c3b93cba4076a9e62f9da9489a Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 16:09:24 +0100 Subject: [PATCH 31/35] link zarr.core.config in docstrings --- src/zarr/api/asynchronous.py | 6 +++--- src/zarr/core/array.py | 16 ++++++++-------- src/zarr/core/config.py | 23 +++++++++++++---------- src/zarr/core/group.py | 24 ++++++++++++------------ 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 3e14fea4f0..80a854ead8 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -839,7 +839,7 @@ async def create( - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. + These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. compressor : Codec, optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. @@ -847,12 +847,12 @@ async def create( - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. + These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. - If not specified, default is taken from the Zarr config ``array.order``. + If not specified, default is taken from the Zarr config ```array.order```. store : Store or str Store or path to directory in file system or name of zip file. 
synchronizer : object, optional diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7214a062fb..afc46866c6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -424,19 +424,19 @@ async def create( - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. + These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. chunks : ShapeLike, optional The shape of the array's chunks. - V2 only. V3 arrays should use :attr:`chunk_shape` instead. + V2 only. V3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is specified in the Zarr config `array.order`). + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` @@ -449,7 +449,7 @@ async def create( - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. + These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. 
overwrite : bool, optional Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional @@ -1516,19 +1516,19 @@ def create( - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. + These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. chunks : ChunkCoords, optional The shape of the array's chunks. - V2 only. V3 arrays should use :attr:`chunk_shape` instead. + V2 only. V3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is specified in the Zarr config `array.order`). + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` @@ -1541,7 +1541,7 @@ def create( - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. + These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. 
overwrite : bool, optional Whether to raise an error if the store already exists (default is False). diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 809d23d86c..b3ff8c6ceb 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -1,3 +1,15 @@ +""" +The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. +For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations +in the registry and then select them in the config. +e.g. an implementation of the bytes codec in a class "your.module.NewBytesCodec", requires the value of codecs.bytes +to be "your.module.NewBytesCodec". +Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations +e.g. export ZARR_CODECS__BYTES="your.module.NewBytesCodec" +(for more information see github.com/pytroll/donfig) +Default values below point to the standard implementations of zarr-python +""" + from __future__ import annotations from typing import Any, Literal, cast @@ -10,7 +22,7 @@ class BadConfigError(ValueError): class Config(DConfig): # type: ignore[misc] - """Will collect configuration from config files and environment variables + """The Config will collect configuration from config files and environment variables Example environment variables: Grabs environment variables of the form "ZARR_FOO__BAR_BAZ=123" and @@ -28,15 +40,6 @@ def reset(self) -> None: self.refresh() -# The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. -# For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations -# in the registry and then select them in the config. -# e.g. an implementation of the bytes codec in a class "your.module.NewBytesCodec", requires the value of codecs.bytes -# to be "your.module.NewBytesCodec". 
-# Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations -# e.g. export ZARR_CODECS__BYTES="your.module.NewBytesCodec" -# (for more information see github.com/pytroll/donfig) -# Default values below point to the standard implementations of zarr-python config = Config( "zarr", defaults=[ diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 8affa75a8f..ceb9c9a77b 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1049,18 +1049,18 @@ async def create_array( - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. + These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use :attr:`chunk_shape` instead. + V2 only. V3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None - The memory order of the array (default is specified in the Zarr config `array.order`). + The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]] | None = None Sequence of filters to use to encode chunk data prior to compression. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` @@ -1073,7 +1073,7 @@ async def create_array( - For numeric arrays, the default is ``ZstdCodec``. 
- For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. + These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is @@ -2270,18 +2270,18 @@ def create_array( - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. + These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use :attr:`chunk_shape` instead. + V2 only. V3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None - The memory order of the array (default is specified in the Zarr config `array.order`). + The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]] | None = None Sequence of filters to use to encode chunk data prior to compression. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` @@ -2294,7 +2294,7 @@ def create_array( - For numeric arrays, the default is ``ZstdCodec``. 
- For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. + These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is @@ -2634,18 +2634,18 @@ def array( - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v3_default_codecs` variable in the Zarr config. + These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None The shape of the chunks of the array. - V2 only. V3 arrays should use :attr:`chunk_shape` instead. + V2 only. V3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None The delimiter used for the chunk keys. (default: ".") V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None - The memory order of the array (default is specified in the Zarr config `array.order`). + The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]] | None = None Sequence of filters to use to encode chunk data prior to compression. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` @@ -2658,7 +2658,7 @@ def array( - For numeric arrays, the default is ``ZstdCodec``. 
- For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the `array.v2_default_compressor` variable in the Zarr config. + These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is From 390c4354eb49ec00c9b7745de195ac2e6e381153 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 16:29:06 +0100 Subject: [PATCH 32/35] improve docstring readability --- src/zarr/api/asynchronous.py | 11 ++++++++--- src/zarr/core/array.py | 20 ++++++++++++++++---- src/zarr/core/config.py | 29 ++++++++++++++++++++++------- src/zarr/core/group.py | 30 ++++++++++++++++++++++++------ 4 files changed, 70 insertions(+), 20 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 80a854ead8..cccbc27e0e 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -835,20 +835,25 @@ async def create( An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. 
+ If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. - fill_value : object + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index afc46866c6..3dc186846c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -420,11 +420,14 @@ async def create( An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. @@ -445,11 +448,14 @@ async def create( compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. 
+ If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional @@ -1512,11 +1518,14 @@ def create( An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). V3 only. V2 arrays should not use this parameter. @@ -1537,11 +1546,14 @@ def create( compressor : dict[str, JSON], optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. 
- These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index b3ff8c6ceb..f9db5ab90f 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -1,13 +1,28 @@ """ -The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. +The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations in the registry and then select them in the config. -e.g. an implementation of the bytes codec in a class "your.module.NewBytesCodec", requires the value of codecs.bytes -to be "your.module.NewBytesCodec". -Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations -e.g. export ZARR_CODECS__BYTES="your.module.NewBytesCodec" -(for more information see github.com/pytroll/donfig) -Default values below point to the standard implementations of zarr-python + +Example: + An implementation of the bytes codec in a class `your.module.NewBytesCodec` requires the value of `codecs.bytes` + to be `your.module.NewBytesCodec`. + + ```python + from your.module import NewBytesCodec + from zarr.core.config import register_codec, config + + register_codec("bytes", NewBytesCodec) + config.set({"codecs.bytes": "your.module.NewBytesCodec"}) + ``` + +Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations. 
+For example, to set the bytes codec via an environment variable: + + export ZARR_CODECS__BYTES="your.module.NewBytesCodec" + +For more information, see the Donfig documentation at https://github.com/pytroll/donfig. + +Default values below point to the standard implementations of zarr-python. """ from __future__ import annotations diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index ceb9c9a77b..2d7a21911a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1045,11 +1045,14 @@ async def create_array( An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None @@ -1069,11 +1072,14 @@ async def create_array( compressor : dict[str, JSON] | None = None The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. 
+ + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is @@ -2266,11 +2272,14 @@ def create_array( An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None @@ -2290,11 +2299,14 @@ def create_array( compressor : dict[str, JSON] | None = None The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. 
If False, the presence of a pre-existing array or group is @@ -2630,11 +2642,14 @@ def array( An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + If no codecs are provided, default codecs will be used: + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v3_default_codecs`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None @@ -2654,11 +2669,14 @@ def array( compressor : dict[str, JSON] | None = None The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed using the ``array.v2_default_compressor`` variable in :mod:`zarr.core.config`. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. 
If False, the presence of a pre-existing array or group is From 35e35c4868533bc36053d76961a957dc5b1a30ef Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 16:43:46 +0100 Subject: [PATCH 33/35] correct config docstring --- src/zarr/core/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index f9db5ab90f..a57fdfc077 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -4,20 +4,20 @@ in the registry and then select them in the config. Example: - An implementation of the bytes codec in a class `your.module.NewBytesCodec` requires the value of `codecs.bytes` - to be `your.module.NewBytesCodec`. + An implementation of the bytes codec in a class ``your.module.NewBytesCodec`` requires the value of ``codecs.bytes`` + to be ``your.module.NewBytesCodec``. - ```python +.. code-block:: python from your.module import NewBytesCodec from zarr.core.config import register_codec, config register_codec("bytes", NewBytesCodec) config.set({"codecs.bytes": "your.module.NewBytesCodec"}) - ``` Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations. For example, to set the bytes codec via an environment variable: +.. code-block:: bash export ZARR_CODECS__BYTES="your.module.NewBytesCodec" For more information, see the Donfig documentation at https://github.com/pytroll/donfig. From 92de85caf3b6cc8295eecf068c86d64e8767b586 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 16:46:46 +0100 Subject: [PATCH 34/35] correct config docstring --- src/zarr/core/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index a57fdfc077..9fd0490862 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -8,6 +8,7 @@ to be ``your.module.NewBytesCodec``. .. 
code-block:: python + from your.module import NewBytesCodec from zarr.core.config import register_codec, config @@ -18,6 +19,7 @@ For example, to set the bytes codec via an environment variable: .. code-block:: bash + export ZARR_CODECS__BYTES="your.module.NewBytesCodec" For more information, see the Donfig documentation at https://github.com/pytroll/donfig. From 6fd3f25c108f5cd39b43b5f0837b73901d16cf67 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 18 Dec 2024 17:16:22 +0100 Subject: [PATCH 35/35] improve config docstring --- src/zarr/core/config.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 9fd0490862..1feb4a6c2f 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -5,26 +5,26 @@ Example: An implementation of the bytes codec in a class ``your.module.NewBytesCodec`` requires the value of ``codecs.bytes`` - to be ``your.module.NewBytesCodec``. + to be ``your.module.NewBytesCodec``. Donfig can be configured programmatically, by environment variables, or from + YAML files in standard locations. -.. code-block:: python + .. code-block:: python - from your.module import NewBytesCodec - from zarr.core.config import register_codec, config + from your.module import NewBytesCodec + from zarr.core.config import register_codec, config - register_codec("bytes", NewBytesCodec) - config.set({"codecs.bytes": "your.module.NewBytesCodec"}) + register_codec("bytes", NewBytesCodec) + config.set({"codecs.bytes": "your.module.NewBytesCodec"}) -Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations. -For example, to set the bytes codec via an environment variable: + Instead of setting the value programmatically with ``config.set``, you can also set the value with an environment + variable. The environment variable ``ZARR_CODECS__BYTES`` can be set to ``your.module.NewBytesCodec``. 
The double + underscore ``__`` is used to indicate nested access. -.. code-block:: bash + .. code-block:: bash - export ZARR_CODECS__BYTES="your.module.NewBytesCodec" + export ZARR_CODECS__BYTES="your.module.NewBytesCodec" For more information, see the Donfig documentation at https://github.com/pytroll/donfig. - -Default values below point to the standard implementations of zarr-python. """ from __future__ import annotations @@ -57,6 +57,7 @@ def reset(self) -> None: self.refresh() +# The default configuration for zarr config = Config( "zarr", defaults=[