diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index e859df44a6..8b20676e8b 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -17,10 +17,12 @@ ChunkCoords, MemoryOrder, ZarrFormat, + parse_dtype, ) from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v2 import _default_filters_and_compressor from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -401,7 +403,7 @@ async def save_array( arr : ndarray NumPy array with data to save. zarr_format : {2, 3, None}, optional - The zarr format to use when saving. + The zarr format to use when saving (default is 3 if not specified). path : str or None, optional The path within the store where the array will be saved. storage_options : dict @@ -817,19 +819,45 @@ async def create( shape : int or tuple of ints Array shape. chunks : int or tuple of ints, optional - Chunk shape. If True, will be guessed from `shape` and `dtype`. If - False, will be set to `shape`, i.e., single chunk for the whole array. - If an int, the chunk size in each dimension will be given by the value - of `chunks`. Default is True. + The shape of the array's chunks. + V2 only. V3 arrays should use `chunk_shape` instead. + If not specified, default values are guessed based on the shape and dtype. dtype : str or dtype, optional NumPy dtype. + chunk_shape : int or tuple of ints, optional + The shape of the Array's chunks (default is None). + V3 only. V2 arrays should use `chunks` instead. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations of Codecs. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional - Primary compressor. - fill_value : object + Primary compressor to compress chunk data. + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. - Default is set in Zarr's config (`array.order`). + If not specified, default is taken from the Zarr config ```array.order```. store : Store or str Store or path to directory in file system or name of zip file. 
synchronizer : object, optional @@ -844,6 +872,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. + V2 only. If neither ``compressor`` nor ``filters`` are provided, a default + compressor will be used (see ``compressor`` for details). cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. If False, array metadata will be reloaded @@ -859,7 +889,8 @@ async def create( A codec to encode object arrays, only needed if dtype=object. dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Default is ".". .. versionadded:: 2.8 write_empty_chunks : bool, optional @@ -875,6 +906,7 @@ async def create( zarr_format : {2, 3, None}, optional The zarr format to use when saving. + Default is 3. meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. @@ -894,9 +926,13 @@ async def create( or _default_zarr_version() ) - if zarr_format == 2 and chunks is None: - chunks = shape - elif zarr_format == 3 and chunk_shape is None: + if zarr_format == 2: + if chunks is None: + chunks = shape + dtype = parse_dtype(dtype, zarr_format) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks chunks = None diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index e407d94892..165dbe476d 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -1,10 +1,5 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - import numpy as np - from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec, Endian from zarr.codecs.crc32c_ import Crc32cCodec @@ -13,7 +8,6 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec -from zarr.core.metadata.v3 import DataType __all__ = [ "BloscCname", @@ -30,15 +24,3 @@ "VLenUTF8Codec", "ZstdCodec", ] - - -def _get_default_array_bytes_codec( - np_dtype: np.dtype[Any], -) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec: - dtype = DataType.from_numpy(np_dtype) - if dtype == DataType.string: - return VLenUTF8Codec() - elif dtype == DataType.bytes: - return VLenBytesCodec() - else: - return BytesCodec() diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index df0d8ecb0a..53edc1f4a1 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING import numcodecs +import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like from zarr.abc.codec import ArrayBytesCodec @@ -46,7 +47,17 @@ async def _decode_single( # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening if chunk_spec.dtype != object: - chunk = chunk.view(chunk_spec.dtype) + try: + chunk = chunk.view(chunk_spec.dtype) + except TypeError: + # this will happen if the dtype of the chunk + # does not match the dtype of the array spec, e.g. if + # the dtype of the chunk_spec is a string dtype, but the chunk + # is an object array.
In this case, we need to convert the object + # array to the correct dtype. + + chunk = np.array(chunk).astype(chunk_spec.dtype) + elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. # We cannot deal with object arrays unless there is an object diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2849907f98..07ed0e5069 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -13,7 +13,6 @@ from zarr._compat import _deprecate_positional_args from zarr.abc.store import Store, set_or_delete -from zarr.codecs import _get_default_array_bytes_codec from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo from zarr.core.attributes import Attributes @@ -78,7 +77,8 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.v3 import parse_node_type_array +from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError from zarr.registry import get_pipeline_class @@ -409,27 +409,53 @@ async def create( attributes : dict[str, JSON], optional The attributes of the array (default is None). chunk_shape : ChunkCoords, optional - The shape of the array's chunks (default is None). + The shape of the array's chunks. + V3 only. V2 arrays should use `chunks` instead. + If not specified, defaults are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding (default is None). - codecs : Iterable[Codec | dict[str, JSON]], optional - The codecs used to encode the data (default is None). + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations of Codecs. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). + V3 only. V2 arrays should not use this parameter. chunks : ShapeLike, optional - The shape of the array's chunks (default is None). - V2 only. V3 arrays should not have 'chunks' parameter. + The shape of the array's chunks. + V2 only. V3 arrays should use ``chunk_shape`` instead. + If not specified, defaults are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional - The dimension separator (default is None). - V2 only. V3 arrays cannot have a dimension separator. + The dimension separator (default is "."). + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is None). + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]], optional - The filters used to compress the data (default is None). - V2 only. V3 arrays should not have 'filters' parameter.
+ Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). - V2 only. V3 arrays should not have 'compressor' parameter. + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional @@ -494,14 +520,6 @@ async def create( order=order, ) elif zarr_format == 2: - if dtype is str or dtype == "str": - # another special case: zarr v2 added the vlen-utf8 codec - vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} - if filters and not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [vlen_codec] - else: - filters = [vlen_codec] - if codecs is not None: raise ValueError( "codecs cannot be used for arrays with version 2. Use filters and compressor instead." @@ -564,11 +582,7 @@ async def _create_v3( await ensure_no_existing_node(store_path, zarr_format=3) shape = parse_shapelike(shape) - codecs = ( - list(codecs) - if codecs is not None - else [_get_default_array_bytes_codec(np.dtype(dtype))] - ) + codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) if chunk_key_encoding is None: chunk_key_encoding = ("default", "/") @@ -634,6 +648,14 @@ async def _create_v2( if dimension_separator is None: dimension_separator = "." + dtype = parse_dtype(dtype, zarr_format=2) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + if np.issubdtype(dtype, np.str_): + filters = filters or [] + if not any(x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [{"id": "vlen-utf8"}] + metadata = ArrayV2Metadata( shape=shape, dtype=np.dtype(dtype), @@ -1493,23 +1515,53 @@ def create( dtype : npt.DTypeLike The data type of the array. chunk_shape : ChunkCoords, optional - The shape of the Array's chunks (default is None). + The shape of the Array's chunks. + V3 only. V2 arrays should use `chunks` instead. + If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding (default is None). - codecs : Iterable[Codec | dict[str, JSON]], optional - The codecs used to encode the data (default is None). + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations of Codecs. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. 
+ - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). + V3 only. V2 arrays should not use this parameter. chunks : ChunkCoords, optional - The shape of the Array's chunks (default is None). + The shape of the array's chunks. + V2 only. V3 arrays should use ``chunk_shape`` instead. + If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional - The dimension separator (default is None). + The dimension separator (default is "."). + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is None). + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]], optional - The filters used to compress the data (default is None). + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON], optional - The compressor used to compress the data (default is None). + Primary compressor to compress chunk data. + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). @@ -3342,3 +3394,18 @@ def _build_parents( ) return parents + + +def _get_default_codecs( + np_dtype: np.dtype[Any], +) -> list[dict[str, JSON]]: + default_codecs = config.get("array.v3_default_codecs") + dtype = DataType.from_numpy(np_dtype) + if dtype == DataType.string: + dtype_key = "string" + elif dtype == DataType.bytes: + dtype_key = "bytes" + else: + dtype_key = "numeric" + + return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 29f5e139fe..1feb4a6c2f 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -1,3 +1,32 @@ +""" +The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. +For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations +in the registry and then select them in the config. + +Example: + An implementation of the bytes codec in a class ``your.module.NewBytesCodec`` requires the value of ``codecs.bytes`` + to be ``your.module.NewBytesCodec``. Donfig can be configured programmatically, by environment variables, or from + YAML files in standard locations. + + .. 
code-block:: python + + from your.module import NewBytesCodec + from zarr.core.config import register_codec, config + + register_codec("bytes", NewBytesCodec) + config.set({"codecs.bytes": "your.module.NewBytesCodec"}) + + Instead of setting the value programmatically with ``config.set``, you can also set the value with an environment + variable. The environment variable ``ZARR_CODECS__BYTES`` can be set to ``your.module.NewBytesCodec``. The double + underscore ``__`` is used to indicate nested access. + + .. code-block:: bash + + export ZARR_CODECS__BYTES="your.module.NewBytesCodec" + +For more information, see the Donfig documentation at https://github.com/pytroll/donfig. +""" + from __future__ import annotations from typing import Any, Literal, cast @@ -10,7 +39,7 @@ class BadConfigError(ValueError): class Config(DConfig): # type: ignore[misc] - """Will collect configuration from config files and environment variables + """The Config will collect configuration from config files and environment variables Example environment variables: Grabs environment variables of the form "ZARR_FOO__BAR_BAZ=123" and @@ -28,21 +57,25 @@ def reset(self) -> None: self.refresh() -# The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. -# For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations -# in the registry and then select them in the config. -# e.g. an implementation of the bytes codec in a class "NewBytesCodec", requires the value of codecs.bytes.name to be -# "NewBytesCodec". -# Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations -# e.g. export ZARR_CODECS__BYTES__NAME="NewBytesCodec" -# (for more information see github.com/pytroll/donfig) -# Default values below point to the standard implementations of zarr-python +# The default configuration for zarr config = Config( "zarr", defaults=[ { "default_zarr_version": 3, - "array": {"order": "C"}, + "array": { + "order": "C", + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + "v3_default_codecs": { + "numeric": ["bytes", "zstd"], + "string": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + }, + }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, "json_indent": 2, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index f46c5126b2..2d7a21911a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1034,24 +1034,52 @@ async def create_array( dtype : np.DtypeLike = float64 The data type of the array. chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. V3 only. + The shape of the chunks of the array. + V3 only. V2 arrays should use `chunks` instead. + If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. + V3 only. 
V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None - The shape of the chunks of the array. V2 only. + The shape of the chunks of the array. + V2 only. V3 arrays should use ``chunk_shape`` instead. + If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. + The delimiter used for the chunk keys. (default: ".") + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None - The memory order of the array. + The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]] | None = None - Filters for the array. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON] | None = None - The compressor for the array. + The compressor used to compress the data (default is None). + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is @@ -2222,7 +2250,7 @@ def create_array( ) -> Array: """Create a zarr array within this AsyncGroup. - This method lightly wraps AsyncArray.create. + This method lightly wraps `AsyncArray.create`. Parameters ---------- @@ -2233,24 +2261,52 @@ def create_array( dtype : np.DtypeLike = float64 The data type of the array. chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. V3 only. + The shape of the chunks of the array. + V3 only. V2 arrays should use `chunks` instead. + If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations thereof. The elements of this collection - specify the transformation from array values to stored bytes. + An iterable of Codec or dict serializations of Codecs. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. 
+ + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None - The shape of the chunks of the array. V2 only. + The shape of the chunks of the array. + V2 only. V3 arrays should use ``chunk_shape`` instead. + If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. + The delimiter used for the chunk keys. (default: ".") + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None - The memory order of the array. + The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]] | None = None - Filters for the array. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON] | None = None - The compressor for the array. + The compressor used to compress the data (default is None). + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is @@ -2260,6 +2316,7 @@ def create_array( Returns ------- + Array """ @@ -2574,24 +2631,52 @@ def array( dtype : np.DtypeLike = float64 The data type of the array. chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. V3 only. + The shape of the chunks of the array. + V3 only. V2 arrays should use `chunks` instead. + If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations thereof. The elements of + An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. 
+ + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str] | None = None The names of the dimensions of the array. V3 only. chunks : ChunkCoords | None = None - The shape of the chunks of the array. V2 only. + The shape of the chunks of the array. + V2 only. V3 arrays should use ``chunk_shape`` instead. + If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. + The delimiter used for the chunk keys. (default: ".") + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"] | None = None - The memory order of the array. + The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]] | None = None - Filters for the array. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON] | None = None - The compressor for the array. + The compressor used to compress the data (default is None). + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 50f375203f..bd0fbecf4a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING, TypedDict, cast +from typing import TYPE_CHECKING, Any, TypedDict, cast from zarr.abc.metadata import Metadata @@ -71,6 +71,7 @@ def __init__( shape_parsed = parse_shapelike(shape) dtype_parsed = parse_dtype(dtype) chunks_parsed = parse_shapelike(chunks) + compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -326,3 +327,23 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return "" else: return dtype.type(0) + + +def _default_filters_and_compressor( + dtype: np.dtype[Any], +) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]: + """Get the default filters and compressor for a dtype. 
+ + https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html + """ + default_compressor = config.get("array.v2_default_compressor") + if dtype.kind in "biufcmM": + dtype_key = "numeric" + elif dtype.kind in "U": + dtype_key = "string" + elif dtype.kind in "OSV": + dtype_key = "bytes" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + + return [{"id": default_compressor[dtype_key]}], None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 3e925e08bd..8dcceb7f31 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -37,6 +37,7 @@ ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.strings import _STRING_DTYPE as STRING_NP_DTYPE from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class @@ -606,6 +607,10 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType: return DataType.string elif dtype.kind == "S": return DataType.bytes + elif not _NUMPY_SUPPORTS_VLEN_STRING and dtype.kind == "O": + # numpy < 2.0 does not support vlen string dtype + # so we fall back on object array of strings + return DataType.string dtype_to_data_type = { "|b1": "bool", "bool": "bool", diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index 6f3ec59b01..514361bd6b 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -1,3 +1,8 @@ +import sys +import warnings +from types import ModuleType +from typing import Any + from zarr.storage.common import StoreLike, StorePath, make_store_path from zarr.storage.fsspec import FsspecStore from zarr.storage.local import LocalStore @@ -17,3 +22,20 @@ "ZipStore", "make_store_path", ] + + +class VerboseModule(ModuleType): + def __setattr__(self, attr: str, value: Any) -> None: + if attr == "default_compressor": + warnings.warn( + "setting zarr.storage.default_compressor is deprecated, use " + "zarr.config to configure array.v2_default_compressor " + "e.g. config.set({'codecs.zstd':'numcodecs.Zstd', 'array.v2_default_compressor.numeric': 'zstd'})", + DeprecationWarning, + stacklevel=1, + ) + else: + super().__setattr__(attr, value) + + +sys.modules[__name__].__class__ = VerboseModule diff --git a/tests/test_array.py b/tests/test_array.py index cf722c7385..c89b6187c3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -5,12 +5,14 @@ from itertools import accumulate from typing import Any, Literal +import numcodecs import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous from zarr import Array, AsyncArray, Group -from zarr.codecs import BytesCodec, VLenBytesCodec +from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array import chunks_initialized from zarr.core.buffer import default_buffer_prototype @@ -374,7 +376,7 @@ async def test_chunks_initialized() -> None: def test_nbytes_stored() -> None: - arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()]) result = arr.nbytes_stored() assert result == 366 # the size of the metadata document. This is a fragile test. 
arr[:50] = 1 @@ -386,7 +388,9 @@ def test_nbytes_stored() -> None: async def test_nbytes_stored_async() -> None: - arr = await zarr.api.asynchronous.create(shape=(100,), chunks=(10,), dtype="i4") + arr = await zarr.api.asynchronous.create( + shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()] + ) result = await arr.nbytes_stored() assert result == 366 # the size of the metadata document. This is a fragile test. await arr.setitem(slice(50), 1) @@ -456,6 +460,7 @@ def test_info_v2(self) -> None: _read_only=False, _store_type="MemoryStore", _count_bytes=128, + _filters=(numcodecs.Zstd(),), ) assert result == expected @@ -470,13 +475,13 @@ def test_info_v3(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=[BytesCodec(), ZstdCodec()], _count_bytes=128, ) assert result == expected def test_info_complete(self) -> None: - arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()]) result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, @@ -511,6 +516,7 @@ async def test_info_v2_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", + _filters=(Zstd(level=0),), _count_bytes=128, ) assert result == expected @@ -526,13 +532,15 @@ async def test_info_v3_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=[BytesCodec(), ZstdCodec()], _count_bytes=128, ) assert result == expected async def test_info_complete_async(self) -> None: - arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + arr = await zarr.api.asynchronous.create( + shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()] + ) result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, diff --git a/tests/test_config.py b/tests/test_config.py index e3f5ec25e3..8dd15fb75b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,10 +8,18 @@ import pytest import zarr -from zarr import Array, zeros -from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline +from zarr import Array, AsyncArray, zeros +from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store -from zarr.codecs import BloscCodec, BytesCodec, Crc32cCodec, ShardingCodec +from zarr.codecs import ( + BloscCodec, + BytesCodec, + Crc32cCodec, + GzipCodec, + ShardingCodec, + VLenBytesCodec, + VLenUTF8Codec, +) from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline @@ -28,6 +36,7 @@ register_ndbuffer, register_pipeline, ) +from zarr.storage import MemoryStore from zarr.testing.buffer import ( NDBufferUsingTestNDArrayLike, StoreExpectingTestBuffer, @@ -41,7 +50,19 @@ def test_config_defaults_set() -> None: assert config.defaults == [ { "default_zarr_version": 3, - "array": {"order": "C"}, + "array": { + "order": "C", + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + "v3_default_codecs": { + "bytes": ["vlen-bytes"], + "numeric": ["bytes", "zstd"], + "string": ["vlen-utf8"], + }, + }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, "json_indent": 2, @@ -263,3 +284,31 @@ class NewCodec2(BytesCodec): # no warning if multiple implementations are available and one is selected in the config with config.set({"codecs.new_codec": 
fully_qualified_name(NewCodec)}): get_codec_class("new_codec") + + +@pytest.mark.parametrize( + ("dtype", "expected_codecs"), + [ + ("int", [BytesCodec(), GzipCodec()]), + ("bytes", [VLenBytesCodec()]), + ("str", [VLenUTF8Codec()]), + ], +) +async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: + with config.set( + { + "array.v3_default_codecs": { + "numeric": ["bytes", "gzip"], # test setting non-standard codecs + "string": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + } + } + ): + arr = await AsyncArray.create( + shape=(100,), + chunk_shape=(100,), + dtype=np.dtype(dtype), + zarr_format=3, + store=MemoryStore(), + ) + assert arr.metadata.codecs == expected_codecs diff --git a/tests/test_group.py b/tests/test_group.py index 416e10af9a..e0bc304b9b 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr import zarr.api.asynchronous @@ -496,6 +497,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "shape": (1,), "chunks": (1,), "order": "C", + "filters": (Zstd(level=0),), "zarr_format": zarr_format, }, "subgroup": { @@ -521,7 +523,10 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": fill_value, "node_type": "array", diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 8ae9cc81fd..7f0c49338e 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous import zarr.api.synchronous @@ -71,7 +72,10 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", @@ -215,7 +219,10 @@ def test_consolidated_sync(self, memory_store): "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", @@ -486,6 +493,7 @@ async def test_consolidated_metadata_v2(self): attributes={"key": "a"}, chunks=(1,), fill_value=None, + filters=(Zstd(level=0),), order="C", ), "g1": GroupMetadata( diff --git a/tests/test_v2.py b/tests/test_v2.py index 890d4039a3..ef06c13e26 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -11,7 +11,7 @@ import zarr import zarr.core.buffer import zarr.storage -from zarr import Array +from zarr import Array, config from zarr.storage import MemoryStore, StorePath @@ -82,36 +82,59 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - store = zarr.storage.MemoryStore() - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", - 
shape=(3,), - chunks=(3,), - dtype=dtype, - fill_value=b"X", - ) - - result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) - assert result is not None - - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "compressor": None, - "dtype": f"{dtype}0", - "fill_value": "WA==", - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - assert serialized == expected - - data = zarr.open_array(store=store, path="foo")[:] - expected = np.full((3,), b"X", dtype=dtype) - np.testing.assert_equal(data, expected) + with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", + shape=(3,), + chunks=(3,), + dtype=dtype, + fill_value=b"X", + ) + + result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": f"{dtype}0", + "fill_value": "WA==", + "filters": [{"id": "vlen-bytes"}], + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected + + data = zarr.open_array(store=store, path="foo")[:] + expected = np.full((3,), b"X", dtype=dtype) + np.testing.assert_equal(data, expected) + + +@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]]) +def test_v2_encode_decode_with_data(dtype_value): + dtype, value = dtype_value + with config.set( + { + "array.v2_default_compressor": { + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + } + ): + expected = np.full((3,), value, dtype=dtype) + a = zarr.create( + shape=(3,), + zarr_format=2, + dtype=dtype, + ) + a[:] = expected + data = a[:] + np.testing.assert_equal(data, expected) @pytest.mark.parametrize("dtype", [str, "str"]) @@ -119,10 +142,10 @@ async def test_create_dtype_str(dtype: Any) -> None: arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) assert arr.dtype.kind == "O" assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) - arr[:] = ["a", "bb", "ccc"] + assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),) + arr[:] = [b"a", b"bb", b"ccc"] result = arr[:] - np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object")) + np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object")) @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: + with config.set( + { + "array.v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + } + ): + dtype, expected = dtype_expected + arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) + assert arr.metadata.filters[0].codec_id == expected
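For orientation, here is a minimal usage sketch of the new ``array.v3_default_codecs`` and ``array.v2_default_compressor`` config keys. It is modelled on ``tests/test_config.py::test_default_codecs`` and ``tests/test_v2.py`` above; the dotted config keys, the registered codec names ("bytes", "gzip", "zstd"), and the expected values in the comments are taken from the test assertions and defaults added in this diff.

.. code-block:: python

    import zarr
    from zarr import config  # tests/test_v2.py imports config from the top-level package

    # Zarr v3: override the default codec chain used for numeric dtypes when
    # no explicit ``codecs`` argument is given.
    with config.set({"array.v3_default_codecs.numeric": ["bytes", "gzip"]}):
        arr = zarr.create(shape=(100,), chunks=(100,), dtype="int32", zarr_format=3)
        print(arr.metadata.codecs)  # expected: [BytesCodec(), GzipCodec()]

    # Zarr v2: choose the compressor applied when neither ``filters`` nor
    # ``compressor`` is passed. The default is surfaced through the metadata's
    # ``filters`` field (see ``_default_filters_and_compressor``).
    with config.set({"array.v2_default_compressor.numeric": "zstd"}):
        arr = zarr.create(shape=(100,), chunks=(100,), dtype="float64", zarr_format=2)
        print(arr.metadata.filters)  # expected: (Zstd(level=0),)

Note that ``_default_filters_and_compressor`` returns the configured codec as a filter and ``None`` for the compressor, which is why the updated v2 tests assert on ``metadata.filters`` rather than ``metadata.compressor``.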