From d2cced7ef33e69291af506fcb6ca2165655b7d0c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 15 Oct 2024 17:07:36 -0400 Subject: [PATCH] Add codespell support (config, workflow to detect/not fix) and make it fix few typos (#267) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add github action to codespell main on push and PRs * Add pre-commit definition for codespell * Fix manually not yet known to codespell Zar typo but not yet known for not long: https://github.com/codespell-project/codespell/pull/3568 * Add rudimentary codespell config * [DATALAD RUNCMD] run codespell throughout fixing typos automagically (but ignoring overall fail due to ambiguous ones) === Do not change lines below === { "chain": [], "cmd": "codespell -w || :", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." } ^^^ Do not change lines above ^^^ * [DATALAD RUNCMD] Do interactive fixing of some ambiguous typos === Do not change lines below === { "chain": [], "cmd": "codespell -w -i 3 -C 2", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "."
} ^^^ Do not change lines above ^^^ * Mark remaining ignores for codespell * Fix expected by rust spacing to inline comment Co-authored-by: Sebastián Galkin --------- Co-authored-by: Sebastián Galkin --- .codespellrc | 6 +++++ .github/workflows/codespell.yml | 25 +++++++++++++++++++ .pre-commit-config.yaml | 5 ++++ README.md | 6 ++--- docs/docs/contributing.md | 2 +- docs/docs/faq.md | 6 ++--- docs/docs/icechunk-python/version-control.md | 2 +- docs/docs/icechunk-python/virtual.md | 4 +-- docs/docs/overview.md | 6 ++--- docs/docs/spec.md | 6 ++--- .../notebooks/demo-dummy-data.ipynb | 2 +- icechunk-python/notebooks/demo-s3.ipynb | 10 ++++---- icechunk-python/python/icechunk/__init__.py | 10 ++++---- icechunk-python/src/errors.rs | 2 +- icechunk-python/tests/test_zarr/test_api.py | 6 ++--- icechunk/examples/low_level_dataset.rs | 2 +- icechunk/src/lib.rs | 2 +- icechunk/src/refs.rs | 2 +- icechunk/src/repository.rs | 4 +-- icechunk/src/storage/object_store.rs | 4 +-- icechunk/src/zarr.rs | 2 +- icechunk/tests/test_s3_storage.rs | 2 +- 22 files changed, 76 insertions(+), 40 deletions(-) create mode 100644 .codespellrc create mode 100644 .github/workflows/codespell.yml diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 00000000..f3f65c4d --- /dev/null +++ b/.codespellrc @@ -0,0 +1,6 @@ +[codespell] +# Ref: https://github.com/codespell-project/codespell#using-a-config-file +skip = .git*,*.svg,*.lock,*.css,.codespellrc +check-hidden = true +ignore-regex = ^\s*"image/\S+": ".*|\bND\b +ignore-words-list = crate diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 00000000..c59e0473 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,25 @@ +# Codespell configuration is within .codespellrc +--- +name: Codespell + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + 
steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Annotate locations with typos + uses: codespell-project/codespell-problem-matcher@v1 + - name: Codespell + uses: codespell-project/actions-codespell@v2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 52d64998..3eee0401 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,3 +7,8 @@ repos: entry: just pre-commit language: system pass_filenames: false + - repo: https://github.com/codespell-project/codespell + # Configuration for codespell is in .codespellrc + rev: v2.3.0 + hooks: + - id: codespell diff --git a/README.md b/README.md index ba82292d..e0d9b569 100644 --- a/README.md +++ b/README.md @@ -72,10 +72,10 @@ Arrays have two fundamental properties: In Zarr / Icechunk, arrays are split into **chunks**, A chunk is the minimum unit of data that must be read / written from storage, and thus choices about chunking have strong implications for performance. Zarr leaves this completely up to the user. -Chunk shape should be chosen based on the anticipated data access patten for each array +Chunk shape should be chosen based on the anticipated data access pattern for each array An Icechunk array is not bounded by an individual file and is effectively unlimited in size. -For further organization of data, Icechunk supports **groups** withing a single repo. +For further organization of data, Icechunk supports **groups** within a single repo. Group are like folders which contain multiple arrays and or other groups. Groups enable data to be organized into hierarchical trees. A common usage pattern is to store multiple arrays in a group representing a NetCDF-style dataset. @@ -86,7 +86,7 @@ Arbitrary JSON-style key-value metadata can be attached to both arrays and group Every update to an Icechunk store creates a new **snapshot** with a unique ID. Icechunk users must organize their updates into groups of related operations called **transactions**. 
-For example, appending a new time slice to mutliple arrays should be done as a single transaction, comprising the following steps +For example, appending a new time slice to multiple arrays should be done as a single transaction, comprising the following steps 1. Update the array metadata to resize the array to accommodate the new elements. 2. Write new chunks for each array in the group. diff --git a/docs/docs/contributing.md b/docs/docs/contributing.md index 6a9773a5..317c60b4 100644 --- a/docs/docs/contributing.md +++ b/docs/docs/contributing.md @@ -50,7 +50,7 @@ pip install -e icechunk@. ``` -### Rust Development Worflow +### Rust Development Workflow TODO diff --git a/docs/docs/faq.md b/docs/docs/faq.md index 783d7465..22b5d375 100644 --- a/docs/docs/faq.md +++ b/docs/docs/faq.md @@ -163,7 +163,7 @@ HDF is widely used in high-performance computing. --- Icechunk and HDF5 share the same data model: multidimensional arrays and metadata organized into a hierarchical tree structure. - This data model can accomodate a wide range of different use cases and workflows. + This data model can accommodate a wide range of different use cases and workflows. Both Icechunk and HDF5 use the concept of "chunking" to split large arrays into smaller storage units. @@ -238,7 +238,7 @@ The following table compares Zarr + Icechunk with TileDB Embedded in a few key a | *versioning* | snapshots, branches, tags | linear version history | Icechunk's data versioning model is closer to Git's. | | *unit of storage* | chunk | tile | (basically the same thing) | | *minimum write* | chunk | cell | TileDB allows atomic updates to individual cells, while Zarr requires writing an entire chunk. | -| *sparse arrays* | :material-close: | :material-check: | Zar + Icechunk do not currently support sparse arrays. | +| *sparse arrays* | :material-close: | :material-check: | Zarr + Icechunk do not currently support sparse arrays. 
| | *virtual chunk references* | :material-check: | :material-close: | Icechunk enables references to chunks in other file formats (HDF5, NetCDF, GRIB, etc.), while TileDB does not. | Beyond this list, there are numerous differences in the design, file layout, and implementation of Icechunk and TileDB embedded @@ -251,7 +251,7 @@ SafeTensors is a format developed by HuggingFace for storing tensors (arrays) sa By the same criteria Icechunk and Zarr are also "safe", in that it is impossible to trigger arbitrary code execution when reading data. SafeTensors is a single-file format, like HDF5, -SafeTensors optimizes for a simple on-disk layout that facilitates mem-map-based zero-copy reading in ML training pipleines, +SafeTensors optimizes for a simple on-disk layout that facilitates mem-map-based zero-copy reading in ML training pipelines, assuming that the data are being read from a local POSIX filesystem Zarr and Icechunk instead allow for flexible chunking and compression to optimize I/O against object storage. diff --git a/docs/docs/icechunk-python/version-control.md b/docs/docs/icechunk-python/version-control.md index 335b127a..410fe9f0 100644 --- a/docs/docs/icechunk-python/version-control.md +++ b/docs/docs/icechunk-python/version-control.md @@ -3,4 +3,4 @@ COMING SOON! In the meantime, you can read about [version control in Arraylake](https://docs.earthmover.io/arraylake/version-control), -which is very similar to version contol in Icechunk. \ No newline at end of file +which is very similar to version control in Icechunk. \ No newline at end of file diff --git a/docs/docs/icechunk-python/virtual.md b/docs/docs/icechunk-python/virtual.md index b9b30871..b86e70dd 100644 --- a/docs/docs/icechunk-python/virtual.md +++ b/docs/docs/icechunk-python/virtual.md @@ -4,7 +4,7 @@ While Icechunk works wonderfully with native chunks managed by Zarr, there is lo !!! 
warning - While virtual references are fully supported in Icechunk, creating virtual datasets currently relies on using experimental or pre-release versions of open source tools. For full instructions on how to install the required tools and ther current statuses [see the tracking issue on Github](https://github.com/earth-mover/icechunk/issues/197). + While virtual references are fully supported in Icechunk, creating virtual datasets currently relies on using experimental or pre-release versions of open source tools. For full instructions on how to install the required tools and their current statuses [see the tracking issue on Github](https://github.com/earth-mover/icechunk/issues/197). With time, these experimental features will make their way into the released packages. To create virtual Icechunk datasets with Python, the community utilizes the [kerchunk](https://fsspec.github.io/kerchunk/) and [VirtualiZarr](https://virtualizarr.readthedocs.io/en/latest/) packages. @@ -83,7 +83,7 @@ virtual_ds = xr.concat( # err (time, zlev, lat, lon) int16 64MB ManifestArray Self: store = cls.create(storage, mode, *args, **kwargs) assert(store) - # We dont want to call _open() becuase icechunk handles the opening, etc. + # We dont want to call _open() because icechunk handles the opening, etc. 
# if we have gotten this far we can mark it as open store._is_open = True @@ -293,7 +293,7 @@ def commit(self, message: str) -> str: This method will fail if: * there is no currently checked out branch - * some other writer updated the curret branch since the repository was checked out + * some other writer updated the current branch since the repository was checked out """ return self._store.commit(message) @@ -306,7 +306,7 @@ async def async_commit(self, message: str) -> str: This method will fail if: * there is no currently checked out branch - * some other writer updated the curret branch since the repository was checked out + * some other writer updated the current branch since the repository was checked out """ return await self._store.async_commit(message) @@ -321,7 +321,7 @@ def distributed_commit( This method will fail if: * there is no currently checked out branch - * some other writer updated the curret branch since the repository was checked out + * some other writer updated the current branch since the repository was checked out other_change_set_bytes must be generated as the output of calling `change_set_bytes` on other stores. The resulting commit will include changes from all stores. @@ -341,7 +341,7 @@ async def async_distributed_commit( This method will fail if: * there is no currently checked out branch - * some other writer updated the curret branch since the repository was checked out + * some other writer updated the current branch since the repository was checked out other_change_set_bytes must be generated as the output of calling `change_set_bytes` on other stores. The resulting commit will include changes from all stores. 
diff --git a/icechunk-python/src/errors.rs b/icechunk-python/src/errors.rs index 2323012c..8fd133dd 100644 --- a/icechunk-python/src/errors.rs +++ b/icechunk-python/src/errors.rs @@ -10,7 +10,7 @@ use thiserror::Error; /// A simple wrapper around the StoreError to make it easier to convert to a PyErr /// /// When you use the ? operator, the error is coerced. But if you return the value it is not. -/// So for now we just use the extra operation to get the coersion instead of manually mapping +/// So for now we just use the extra operation to get the coercion instead of manually mapping /// the errors where this is returned from a python class #[allow(clippy::enum_variant_names)] #[derive(Debug, Error)] diff --git a/icechunk-python/tests/test_zarr/test_api.py b/icechunk-python/tests/test_zarr/test_api.py index 1ffee7d8..baa88ffa 100644 --- a/icechunk-python/tests/test_zarr/test_api.py +++ b/icechunk-python/tests/test_zarr/test_api.py @@ -55,7 +55,7 @@ async def test_open_array(memory_store: IcechunkStore) -> None: assert z.shape == (100,) # open array, overwrite - # _store_dict wont currently work with IcechunkStore + # _store_dict won't currently work with IcechunkStore # TODO: Should it? pytest.xfail("IcechunkStore does not support _store_dict") store._store_dict = {} @@ -66,7 +66,7 @@ async def test_open_array(memory_store: IcechunkStore) -> None: # open array, read-only store_cls = type(store) - # _store_dict wont currently work with IcechunkStore + # _store_dict won't currently work with IcechunkStore # TODO: Should it? ro_store = store_cls.open(store_dict=store._store_dict, mode="r") @@ -96,7 +96,7 @@ async def test_open_group(memory_store: IcechunkStore) -> None: # open group, read-only store_cls = type(store) - # _store_dict wont currently work with IcechunkStore + # _store_dict won't currently work with IcechunkStore # TODO: Should it? 
pytest.xfail("IcechunkStore does not support _store_dict") ro_store = store_cls.open(store_dict=store._store_dict, mode="r") diff --git a/icechunk/examples/low_level_dataset.rs b/icechunk/examples/low_level_dataset.rs index 063ad193..9e6b9e9c 100644 --- a/icechunk/examples/low_level_dataset.rs +++ b/icechunk/examples/low_level_dataset.rs @@ -84,7 +84,7 @@ let zarr_meta1 = ZarrArrayMetadata {{ chunk_key_encoding: ChunkKeyEncoding::Slash, fill_value: FillValue::Int32(0), codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers("tranformers".to_string())), + storage_transformers: Some(StorageTransformers("transformers".to_string())), dimension_names: Some(vec![ Some("x".to_string()), Some("y".to_string()), diff --git a/icechunk/src/lib.rs b/icechunk/src/lib.rs index 204c40ae..e46b48cb 100644 --- a/icechunk/src/lib.rs +++ b/icechunk/src/lib.rs @@ -7,7 +7,7 @@ //! - There is a low level interface that speaks zarr keys and values, and is used to provide the //! zarr store that will be used from python. This is the [`zarr::Store`] type. //! - There is a translation language between low and high levels. When user writes to a zarr key, -//! we need to convert that key to the language of arrays and groups. This is implmented it the +//! we need to convert that key to the language of arrays and groups. This is implemented it the //! [`zarr`] module //! - There is an abstract type for loading and saving of the Arrow datastructures. //! This is the [`Storage`] trait. It knows how to fetch and write arrow. diff --git a/icechunk/src/refs.rs b/icechunk/src/refs.rs index e2399b81..bb52912a 100644 --- a/icechunk/src/refs.rs +++ b/icechunk/src/refs.rs @@ -310,7 +310,7 @@ mod tests { /// Execute the passed block with all test implementations of Storage. 
/// - /// Currently this function executes agains the in-memory and local filesystem object_store + /// Currently this function executes against the in-memory and local filesystem object_store /// implementations. async fn with_test_storages< R, diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index e89312e3..55eb1341 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -58,7 +58,7 @@ pub struct RepositoryConfig { pub inline_chunk_threshold_bytes: u16, // Unsafely overwrite refs on write. This is not recommended, users should only use it at their // own risk in object stores for which we don't support write-object-if-not-exists. There is - // teh posibility of race conditions if this variable is set to true and there are concurrent + // the possibility of race conditions if this variable is set to true and there are concurrent // commit attempts. pub unsafe_overwrite_refs: bool, } @@ -1399,7 +1399,7 @@ mod tests { let non_chunk = ds.get_chunk_ref(&new_array_path, &ChunkIndices(vec![1])).await?; assert_eq!(non_chunk, None); - // update old array use attriutes and check them + // update old array use attributes and check them ds.set_user_attributes( array1_path.clone(), Some(UserAttributes::try_new(br#"{"updated": true}"#).unwrap()), diff --git a/icechunk/src/storage/object_store.rs b/icechunk/src/storage/object_store.rs index c63e905f..b7ba8190 100644 --- a/icechunk/src/storage/object_store.rs +++ b/icechunk/src/storage/object_store.rs @@ -72,7 +72,7 @@ pub struct ObjectStorage { } impl ObjectStorage { - /// Create an in memory Storage implementantion + /// Create an in memory Storage implementation /// /// This implementation should not be used in production code. 
pub fn new_in_memory_store(prefix: Option) -> ObjectStorage { @@ -88,7 +88,7 @@ impl ObjectStorage { } } - /// Create an local filesystem Storage implementantion + /// Create an local filesystem Storage implementation /// /// This implementation should not be used in production code. pub fn new_local_store(prefix: &StdPath) -> Result { diff --git a/icechunk/src/zarr.rs b/icechunk/src/zarr.rs index 5b639bca..ea94ca61 100644 --- a/icechunk/src/zarr.rs +++ b/icechunk/src/zarr.rs @@ -754,7 +754,7 @@ impl Store { .list_prefix(prefix) .await? .map_ok(move |s| { - // If the prefix is "/", get rid of it. This can happend when prefix is missing + // If the prefix is "/", get rid of it. This can happen when prefix is missing // the trailing slash (as it does in zarr-python impl) let rem = &s[idx..].trim_start_matches('/'); let parent = rem.split_once('/').map_or(*rem, |(parent, _)| parent); diff --git a/icechunk/tests/test_s3_storage.rs b/icechunk/tests/test_s3_storage.rs index 8cf09f1d..0d35291d 100644 --- a/icechunk/tests/test_s3_storage.rs +++ b/icechunk/tests/test_s3_storage.rs @@ -75,7 +75,7 @@ pub async fn test_chunk_write_read() -> Result<(), Box> { assert_eq!(Bytes::from_static(b"ello"), back); let back = storage.fetch_chunk(&id, &ByteRange::to_offset(3)).await?; - assert_eq!(Bytes::from_static(b"hel"), back); + assert_eq!(Bytes::from_static(b"hel"), back); // codespell:ignore let back = storage.fetch_chunk(&id, &ByteRange::bounded(1, 4)).await?; assert_eq!(Bytes::from_static(b"ell"), back);