From 9d248178cd737d115bb96007d5e1ca54db53cead Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Mon, 4 Mar 2024 15:05:59 +0000
Subject: [PATCH 01/21] Add normalizations

---
 src/imzy/{ => _centroids}/_extract.py   |  49 +++-------
 src/imzy/_centroids/_hdf5_store.py      |  86 ++----------------
 src/imzy/_hdf5_mixin.py                 | 106 ++++++++++++++++++++++
 src/imzy/_normalizations/__init__.py    | 115 ++++++++++++++++++++++++
 src/imzy/_normalizations/_extract.py    |  19 ++++
 src/imzy/_normalizations/_hdf5_store.py |  22 +++++
 src/imzy/_readers/_base.py              |  35 ++++++-
 src/imzy/_typing.py                     |  12 +++
 src/imzy/utilities.py                   |  23 +++++
 tests/test_imzml.py                     |  10 +++
 tests/test_tdf.py                       |  11 +++
 tests/test_tsf.py                       |  11 +++
 12 files changed, 382 insertions(+), 117 deletions(-)
 rename src/imzy/{ => _centroids}/_extract.py (91%)
 create mode 100644 src/imzy/_hdf5_mixin.py
 create mode 100644 src/imzy/_normalizations/__init__.py
 create mode 100644 src/imzy/_normalizations/_extract.py
 create mode 100644 src/imzy/_normalizations/_hdf5_store.py
 create mode 100644 src/imzy/_typing.py

diff --git a/src/imzy/_extract.py b/src/imzy/_centroids/_extract.py
similarity index 91%
rename from src/imzy/_extract.py
rename to src/imzy/_centroids/_extract.py
index 4d945a7..dbc22d1 100644
--- a/src/imzy/_extract.py
+++ b/src/imzy/_centroids/_extract.py
@@ -1,4 +1,3 @@
-"""Methods to handle data extraction from any reader."""
 import typing as ty
 from pathlib import Path
 
@@ -8,13 +7,19 @@
 from koyo.utilities import chunks
 from tqdm import tqdm
 
+from imzy import get_reader
+from imzy._typing import SpatialInfo
+from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis
+from imzy.utilities import _safe_rmtree, parse_to_attribute
+
 try:
     import hdf5plugin
 except ImportError:
-    pass
 
-from imzy._readers import get_reader
-from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis
+    class hdf5plugin:
+        """Dummy class."""
+
+        LZ4 = lambda *args, **kwargs: {}
 
 
 def check_zarr() -> None:
@@ -154,15 +159,6 @@ def rechunk_zarr_array(
     _safe_rmtree(zarr_path)  # remove the temporary array
 
 
-def check_hdf5() -> None:
-    """Check whether Zarr, dask and rechunker are installed."""
-    try:
-        import h5py
-        import hdf5plugin
-    except ImportError:
-        raise ImportError("Please install `h5py` and `hdf5plugins` to continue. You can do `pip install imzy[hdf5]")
-
-
 def get_chunk_info(n_pixels: int, n_peaks: int, max_mem: float = 512) -> ty.Dict[int, np.ndarray]:
     """Get chunk size information for particular dataset."""
     import math
@@ -183,6 +179,7 @@ def create_centroids_hdf5(
     ppm: ty.Optional[float] = None,
     ys: ty.Optional[np.ndarray] = None,
     chunk_info: ty.Optional[ty.Dict[int, np.ndarray]] = None,
+    spatial_info: ty.Optional[SpatialInfo] = None,
 ) -> Path:
     """Create group with datasets inside."""
     from imzy._centroids import H5CentroidsStore
@@ -216,6 +213,9 @@ def create_centroids_hdf5(
         store._add_data_to_group(group, "mzs_max", mzs_max, maxshape=(None,), dtype=mzs_max.dtype)
         if ys is not None:
             store._add_data_to_group(group, "ys", ys, maxshape=(None,), dtype=ys.dtype)
+        if spatial_info is not None:
+            for key, value in spatial_info.items():
+                store._add_data_to_group(group, key, value, dtype=value.dtype)
         if chunk_info is None:
             store._add_data_to_group(
                 group,
@@ -288,26 +288,3 @@ def extract_centroids_hdf5(
             else:
                 temp[i] = accumulate_peaks_profile(extract_indices, y)
         store.update(temp, indices, chunk_id)
-
-
-def _safe_rmtree(path):
-    from shutil import rmtree
-
-    try:
-        rmtree(path)
-    except (OSError, FileNotFoundError):
-        pass
-
-
-def parse_from_attribute(attribute):
-    """Parse attribute from cache."""
-    if isinstance(attribute, str) and attribute == "__NONE__":
-        attribute = None
-    return attribute
-
-
-def parse_to_attribute(attribute):
-    """Parse attribute to cache."""
-    if attribute is None:
-        attribute = "__NONE__"
-    return attribute
diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py
index c3f7b50..ce84e4c 100644
--- a/src/imzy/_centroids/_hdf5_store.py
+++ b/src/imzy/_centroids/_hdf5_store.py
@@ -4,6 +4,8 @@
 
 import numpy as np
 
+from imzy._hdf5_mixin import HDF5Mixin
+
 try:
     import h5py
 except ImportError:
@@ -15,7 +17,7 @@
 from imzy._centroids._base import BaseCentroids
 
 
-class H5CentroidsStore(BaseCentroids):
+class H5CentroidsStore(HDF5Mixin, BaseCentroids):
     """HDF5-centroids."""
 
     # Private attributes
@@ -38,13 +40,11 @@ def __init__(
         mode: str = "a",
     ):
         super().__init__(xyz_coordinates, pixel_index, image_shape)
-        assert h5py is not None, "h5py is not installed."
-
-        self.path = path
-        self.mode = mode
+        self._init_hdf5(path, mode)
 
     @property
-    def is_low_mem(self):
+    def is_low_mem(self) -> bool:
+        """Get low memory flag."""
         return self._low_mem
 
     @is_low_mem.setter
@@ -55,6 +55,7 @@ def is_low_mem(self, value: bool):
 
     @property
     def is_chunked(self) -> bool:
+        """Get chunked flag."""
         if self._is_chunked is None:
             with self.open() as h5:
                 self._is_chunked = h5[self.PEAKS_KEY].attrs.get("is_chunked")
@@ -128,79 +129,6 @@ def update(self, array: np.ndarray, framelist: np.ndarray, chunk_id: ty.Optional
                 h5[self.PEAKS_ARRAY_KEY][framelist] = array
             h5.flush()
 
-    @contextmanager
-    def open(self, mode: ty.Optional[str] = None):
-        """Safely open storage."""
-        if mode is None:
-            mode = self.mode
-        try:
-            f_ptr = h5py.File(self.path, mode=mode, rdcc_nbytes=1024 * 1024 * 4)
-        except FileExistsError as err:
-            raise err
-        try:
-            yield f_ptr
-        finally:
-            f_ptr.close()
-
-    def flush(self):
-        """Flush data to disk."""
-        with self.open() as h5:
-            h5.flush()
-
-    @staticmethod
-    def _get_group(hdf, group_name: str, flush: bool = True):
-        try:
-            group = hdf[group_name]
-        except KeyError:
-            group = hdf.create_group(group_name)
-        if flush:
-            hdf.flush()
-        return group
-
-    @staticmethod
-    def _add_data_to_group(
-        group_obj,
-        dataset_name,
-        data,
-        dtype,
-        chunks=None,
-        maxshape=None,
-        compression=None,
-        compression_opts=None,
-        shape=None,
-    ):
-        """Add data to group."""
-        replaced_dataset = False
-
-        if dtype is None:
-            if hasattr(data, "dtype"):
-                dtype = data.dtype
-        if shape is None:
-            if hasattr(data, "shape"):
-                shape = data.shape
-
-        if dataset_name in list(group_obj.keys()):
-            if group_obj[dataset_name].dtype == dtype:
-                try:
-                    group_obj[dataset_name][:] = data
-                    replaced_dataset = True
-                except TypeError:
-                    del group_obj[dataset_name]
-            else:
-                del group_obj[dataset_name]
-
-        if not replaced_dataset:
-            group_obj.create_dataset(
-                dataset_name,
-                data=data,
-                dtype=dtype,
-                compression=compression,
-                chunks=chunks,
-                maxshape=maxshape,
-                compression_opts=compression_opts,
-                shape=shape,
-            )
-
 
 class LazyPeaksProxy:
     """Proxy class to enable similar interface to lazy-peaks."""
diff --git a/src/imzy/_hdf5_mixin.py b/src/imzy/_hdf5_mixin.py
new file mode 100644
index 0000000..555201e
--- /dev/null
+++ b/src/imzy/_hdf5_mixin.py
@@ -0,0 +1,106 @@
+"""Mixin class for HDF5 files."""
+from __future__ import annotations
+
+import typing as ty
+from contextlib import contextmanager
+
+try:
+    import h5py
+except ImportError:
+    h5py = None
+from koyo.typing import PathLike
+
+
+def check_hdf5() -> None:
+    """Check whether h5py and hdf5plugin are installed."""
+    try:
+        import h5py
+        import hdf5plugin
+    except ImportError:
+        raise ImportError("Please install `h5py` and `hdf5plugin` to continue. You can do `pip install imzy[hdf5]`.")
+
+
+class HDF5Mixin:
+    """Mixin class for HDF5 files."""
+
+    path: PathLike
+    mode: str
+
+    def _init_hdf5(self, path: PathLike, mode: str = "a"):
+        assert h5py is not None, "h5py is not installed."
+
+        self.path = path
+        self.mode = mode
+
+    @contextmanager
+    def open(self, mode: ty.Optional[str] = None) -> h5py.File:
+        """Safely open storage."""
+        if mode is None:
+            mode = self.mode
+        try:
+            f_ptr = h5py.File(self.path, mode=mode, rdcc_nbytes=1024 * 1024 * 4)
+        except FileExistsError as err:
+            raise err
+        try:
+            yield f_ptr
+        finally:
+            f_ptr.close()
+
+    def flush(self):
+        """Flush data to disk."""
+        with self.open() as h5:
+            h5.flush()
+
+    @staticmethod
+    def _get_group(hdf, group_name: str, flush: bool = True):
+        try:
+            group = hdf[group_name]
+        except KeyError:
+            group = hdf.create_group(group_name)
+        if flush:
+            hdf.flush()
+        return group
+
+    @staticmethod
+    def _add_data_to_group(
+        group_obj,
+        dataset_name,
+        data,
+        dtype,
+        chunks=None,
+        maxshape=None,
+        compression=None,
+        compression_opts=None,
+        shape=None,
+    ):
+        """Add data to group."""
+        replaced_dataset = False
+
+        if dtype is None:
+            if hasattr(data, "dtype"):
+                dtype = data.dtype
+        if shape is None:
+            if hasattr(data, "shape"):
+                shape = data.shape
+
+        if dataset_name in list(group_obj.keys()):
+            if group_obj[dataset_name].dtype == dtype:
+                try:
+                    group_obj[dataset_name][:] = data
+                    replaced_dataset = True
+                except TypeError:
+                    del group_obj[dataset_name]
+            else:
+                del group_obj[dataset_name]
+
+        if not replaced_dataset:
+            group_obj.create_dataset(
+                dataset_name,
+                data=data,
+                dtype=dtype,
+                compression=compression,
+                chunks=chunks,
+                maxshape=maxshape,
+                compression_opts=compression_opts,
+                shape=shape,
+            )
diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
new file mode 100644
index 0000000..f75415e
--- /dev/null
+++ b/src/imzy/_normalizations/__init__.py
@@ -0,0 +1,115 @@
+"""Normalizations."""
+from pathlib import Path
+
+import numba
+import numpy as np
+from koyo.typing import PathLike
+from tqdm import tqdm
+
+try:
+    import hdf5plugin
+except ImportError:
+
+    class hdf5plugin:
+        """Dummy class."""
+
+        LZ4 = lambda *args, **kwargs: {}
+
+
+from imzy._normalizations._extract import get_normalizations
+from imzy._normalizations._hdf5_store import H5NormalizationStore
+
+
+def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
+    """Create group with datasets inside."""
+    from imzy._readers import get_reader
+
+    reader = get_reader(input_dir)
+    n_pixels = reader.n_pixels
+
+    compression = hdf5plugin.LZ4()
+
+    if Path(hdf_path).suffix != ".h5":
+        hdf_path = Path(hdf_path).with_suffix(".h5")
+
+    store = H5NormalizationStore(hdf_path, mode="a")
+    with store.open() as h5:
+        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+        for normalization in get_normalizations():
+            group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression)
+        store.flush()
+    return hdf_path
+
+
+def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
+    """Extract normalizations from hdf5."""
+    normalization_names = get_normalizations()
+    normalizations = compute_normalizations(input_dir, silent=silent)
+
+    store = H5NormalizationStore(hdf_path, mode="a")
+    with store.open() as h5:
+        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+        for i, normalization in enumerate(normalization_names):
+            norm = normalizations[:, i]
+            group[normalization][:] = norm
+        store.flush()
+    return hdf_path
+
+
+def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
+    """Calculate normalizations for a set of frames."""
+    from imzy._readers import get_reader
+
+    reader = get_reader(input_dir)
+
+    # specify normalization names
+    names = get_normalizations()
+
+    # pre-assign array
+    n_frames = reader.n_pixels
+    framelist = reader.pixels
+    # start_frame, norm_array = check_cache()
+    norm_array = np.zeros((n_frames, len(names)), dtype=np.float32)
+    for i, (_, y) in enumerate(
+        tqdm(
+            reader.spectra_iter(framelist, silent=True),
+            disable=silent,
+            miniters=100,
+            mininterval=2,
+            total=len(framelist),
+            desc="Computing normalizations...",
+        )
+    ):
+        norm_array[i] = calculate_normalizations(y.astype(np.float32))
+    return norm_array
+
+
+@numba.njit(fastmath=True, cache=True)
+def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray:
+    """Calculate various normalizations.
+
+    This function expects spectrum in the form
+
+    """
+    px_norms = np.zeros(12, dtype=np.float32)
+    px_norms[0] = np.sum(spectrum)  # TIC
+    if px_norms[0] == 0:
+        return px_norms
+
+    px_norms[1] = np.sqrt(np.mean(np.square(spectrum)))  # RMS
+    px_norms[2] = np.median(spectrum[spectrum > 0])  # Median
+    _spectrum = spectrum[spectrum > 0]
+    if _spectrum.size > 1:
+        q95, q90, q10, q5 = np.nanquantile(_spectrum, [0.95, 0.9, 0.1, 0.05])
+    else:
+        q95, q90, q10, q5 = 0, 0, 0, 0
+    px_norms[3] = np.sum(spectrum[spectrum < q95])  # 0-95% TIC
+    px_norms[4] = np.sum(spectrum[spectrum < q90])  # 0-90% TIC
+    px_norms[5] = np.sum(spectrum[spectrum > q5])  # 5-100% TIC
+    px_norms[6] = np.sum(spectrum[spectrum > q10])  # 10-100% TIC
+    px_norms[7] = np.sum(spectrum[(spectrum > q5) & (spectrum < q95)])  # 5-95% TIC
+    px_norms[8] = np.sum(spectrum[(spectrum > q10) & (spectrum < q90)])  # 10-90% TIC
+    px_norms[9] = np.linalg.norm(spectrum, 0)  # 0-norm
+    px_norms[10] = np.linalg.norm(spectrum, 2)  # 2-norm
+    px_norms[11] = np.linalg.norm(spectrum, 3)  # 3-norm
+    return px_norms
diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py
new file mode 100644
index 0000000..e9857aa
--- /dev/null
+++ b/src/imzy/_normalizations/_extract.py
@@ -0,0 +1,19 @@
+"""Get normalization utilities."""
+
+
+def get_normalizations() -> list[str]:
+    """Get list of available normalizations."""
+    return [
+        "TIC",
+        "RMS",
+        "Median",
+        "0-95% TIC",
+        "0-90% TIC",
+        "5-100% TIC",
+        "10-100% TIC",
+        "5-95% TIC",
+        "10-90% TIC",
+        "0-norm",
+        "2-norm",
+        "3-norm",
+    ]
diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
new file mode 100644
index 0000000..89a523c
--- /dev/null
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -0,0 +1,22 @@
+"""HDF5 store for normalizations."""
+import numpy as np
+
+from imzy._hdf5_mixin import HDF5Mixin
+
+
+class H5NormalizationStore(HDF5Mixin):
+    """HDF5 store for normalizations."""
+
+    NORMALIZATIONS_KEY: str = "Normalizations"
+
+    def __init__(self, path: str, mode: str = "a"):
+        self._init_hdf5(path, mode)
+
+    def get_normalization(self) -> np.ndarray:
+        """Get normalization data."""
+
+    def get_normalization_list(self) -> list[str]:
+        """Get list of available normalizations."""
+        with self.open("r") as h5:
+            normalizations = list(h5[self.NORMALIZATIONS_KEY].keys())
+        return normalizations
diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index 3c6f4b4..c998898 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -298,7 +298,12 @@ def to_zarr(
         silent: bool = False,
     ) -> Path:
         """Export many ion images for specified m/z values (+ tolerance) to Zarr array."""
-        from imzy._extract import check_zarr, create_centroids_zarr, extract_centroids_zarr, rechunk_zarr_array
+        from imzy._centroids._extract import (
+            check_zarr,
+            create_centroids_zarr,
+            extract_centroids_zarr,
+            rechunk_zarr_array,
+        )
 
         if not as_flat:
             raise ValueError("Only flat images are supported at the moment.")
@@ -361,7 +366,8 @@ def to_hdf5(
         silent: bool = False,
     ) -> Path:
         """Export many ion images for specified m/z values (+ tolerance) to a HDF5 store."""
-        from imzy._extract import check_hdf5, create_centroids_hdf5, extract_centroids_hdf5, get_chunk_info
+        from imzy._centroids._extract import create_centroids_hdf5, extract_centroids_hdf5, get_chunk_info
+        from imzy._hdf5_mixin import check_hdf5
 
         if not as_flat:
             raise ValueError("Only flat images are supported at the moment.")
@@ -396,6 +402,11 @@ def to_hdf5(
             ppm=ppm,
             tol=tol,
             chunk_info=chunk_info,
+            spatial_info={
+                "x_coordinates": self.x_coordinates,
+                "y_coordinates": self.y_coordinates,
+                "shape": self.image_shape,
+            },
         )
         extract_centroids_hdf5(
             input_dir=self.path,
@@ -407,6 +418,26 @@ def to_hdf5(
         )
         return hdf_path
 
+    extract_centroids_hdf5 = to_hdf5
+
+    def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False):
+        """Extract normalizations."""
+        from imzy._hdf5_mixin import check_hdf5
+        from imzy._normalizations import create_normalizations_hdf5, extract_normalizations_hdf5
+
+        check_hdf5()
+        hdf_path = Path(hdf_path)
+        if not hdf_path.suffix == ".h5":
+            hdf_path = hdf_path.with_suffix(".h5")
+        if not hdf_path.exists():
+            hdf_path = create_normalizations_hdf5(self.path, hdf_path)
+        hdf_path = extract_normalizations_hdf5(
+            input_dir=self.path,
+            hdf_path=hdf_path,
+            silent=silent,
+        )
+        return hdf_path
+
     def spectra_iter(
         self, indices: ty.Optional[ty.Iterable[int]] = None, silent: bool = False
     ) -> ty.Generator[ty.Tuple[np.ndarray, np.ndarray], None, None]:
diff --git a/src/imzy/_typing.py b/src/imzy/_typing.py
new file mode 100644
index 0000000..952803a
--- /dev/null
+++ b/src/imzy/_typing.py
@@ -0,0 +1,12 @@
+"""Typing for the package."""
+import typing as ty
+
+import numpy as np
+
+
+class SpatialInfo(ty.TypedDict):
+    """Spatial data."""
+
+    x_coordinates: np.ndarray
+    y_coordinates: np.ndarray
+    shape: ty.Tuple[int, int]
diff --git a/src/imzy/utilities.py b/src/imzy/utilities.py
index 40adb1d..25d5e09 100644
--- a/src/imzy/utilities.py
+++ b/src/imzy/utilities.py
@@ -38,3 +38,26 @@ def get_rois_from_bruker_d(path: PathLike) -> ty.List[int]:
     cursor.close()
     conn.close()
     return list(range(0, last_roi + 1))
+
+
+def _safe_rmtree(path):
+    from shutil import rmtree
+
+    try:
+        rmtree(path)
+    except (OSError, FileNotFoundError):
+        pass
+
+
+def parse_from_attribute(attribute):
+    """Parse attribute from cache."""
+    if isinstance(attribute, str) and attribute == "__NONE__":
+        attribute = None
+    return attribute
+
+
+def parse_to_attribute(attribute):
+    """Parse attribute to cache."""
+    if attribute is None:
+        attribute = "__NONE__"
+    return attribute
diff --git a/tests/test_imzml.py b/tests/test_imzml.py
index f36f1f3..24a9dac 100644
--- a/tests/test_imzml.py
+++ b/tests/test_imzml.py
@@ -78,3 +78,13 @@ def test_to_h5(path, tmp_path):
     h5_path = reader.to_hdf5(h5_temp, mzs, tol=0.5)
     assert h5_path.exists()
     assert h5_temp != h5_path
+
+
+@pytest.mark.parametrize("path", get_imzml_data())
+def test_norms(path, tmp_path):
+    reader = IMZMLReader(path)
+
+    h5_temp = tmp_path / "output"  # forgot to include .h5 extension
+    h5_path = reader.extract_normalizations_hdf5(h5_temp)
+    assert h5_path.exists()
+    assert h5_temp != h5_path
diff --git a/tests/test_tdf.py b/tests/test_tdf.py
index 1f85a4b..5967137 100644
--- a/tests/test_tdf.py
+++ b/tests/test_tdf.py
@@ -83,3 +83,14 @@ def test_to_h5(path, tmp_path):
     h5_path = reader.to_hdf5(h5_temp, mzs, tol=0.5)
     assert h5_path.exists()
     assert h5_temp != h5_path
+
+
+@pytest.mark.skipif(IS_MAC, reason="Bruker reader is not supported on macOS.")
+@pytest.mark.parametrize("path", get_tdf_data())
+def test_norms(path, tmp_path):
+    reader = TDFReader(path)
+
+    h5_temp = tmp_path / "output"  # forgot to include .h5 extension
+    h5_path = reader.extract_normalizations_hdf5(h5_temp)
+    assert h5_path.exists()
+    assert h5_temp != h5_path
diff --git a/tests/test_tsf.py b/tests/test_tsf.py
index 43a0c2b..fbca5e2 100644
--- a/tests/test_tsf.py
+++ b/tests/test_tsf.py
@@ -81,3 +81,14 @@ def test_to_h5(path, tmp_path):
     h5_path = reader.to_hdf5(h5_temp, mzs, tol=0.5)
     assert h5_path.exists()
     assert h5_temp != h5_path
+
+
+@pytest.mark.skipif(IS_MAC, reason="Bruker reader is not supported on macOS.")
+@pytest.mark.parametrize("path", get_tsf_data())
+def test_norms(path, tmp_path):
+    reader = TSFReader(path)
+
+    h5_temp = tmp_path / "output"  # forgot to include .h5 extension
+    h5_path = reader.extract_normalizations_hdf5(h5_temp)
+    assert h5_path.exists()
+    assert h5_temp != h5_path

From 7ee584089683268f1dcd4ae5d6283de848ba7ddb Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Mon, 4 Mar 2024 18:18:58 +0000
Subject: [PATCH 02/21] Update __init__.py

---
 src/imzy/_normalizations/__init__.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index f75415e..44ab368 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -1,4 +1,5 @@
 """Normalizations."""
+import warnings
 from pathlib import Path
 
 import numba
@@ -41,6 +42,15 @@ def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
     return hdf_path
 
 
+def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray:
+    """Retrieve normalization and determine outliers."""
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", RuntimeWarning)
+        norm = np.log10(norm + 1)
+        mask = norm < (norm.max() - n_orders)
+    return mask
+
+
 def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
     """Extract normalizations from hdf5."""
     normalization_names = get_normalizations()
@@ -50,8 +60,13 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent:
     with store.open() as h5:
         group = store._get_group(h5, store.NORMALIZATIONS_KEY)
         for i, normalization in enumerate(normalization_names):
+            # get normalization
             norm = normalizations[:, i]
-            group[normalization][:] = norm
+            # remove outliers
+            mask = _get_outlier_mask(norm, 2)
+            norm[mask] = np.median(norm[mask])
+            # save the normalizations as 'multiplier' version so it's easier to apply
+            group[normalization][:] = 1 / (norm / np.median(norm))
         store.flush()
     return hdf_path

From 528ccf4158f7843bec059101fbd14fc10526e080 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Mon, 4 Mar 2024 20:36:14 +0100
Subject: [PATCH 03/21] Update _base.py

---
 src/imzy/_readers/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index c998898..6402ed4 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -430,7 +430,7 @@ def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False):
         if not hdf_path.suffix == ".h5":
             hdf_path = hdf_path.with_suffix(".h5")
         if not hdf_path.exists():
-            hdf_path = create_normalizations_hdf5(self.path, hdf_path)
+            # hdf_path = create_normalizations_hdf5(self.path, hdf_path)
         hdf_path = extract_normalizations_hdf5(
             input_dir=self.path,
             hdf_path=hdf_path,

From 9d9ab3fc25a8e62709297cbeb0ece3f3f2672838 Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Mon, 4 Mar 2024 19:42:12 +0000
Subject: [PATCH 04/21] Fixes

---
 src/imzy/_centroids/_extract.py      |  8 +++++---
 src/imzy/_centroids/_hdf5_store.py   |  1 +
 src/imzy/_normalizations/__init__.py |  3 ++-
 src/imzy/_readers/_base.py           | 11 ++++-------
 src/imzy/_typing.py                  |  3 ++-
 5 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py
index dbc22d1..7c879be 100644
--- a/src/imzy/_centroids/_extract.py
+++ b/src/imzy/_centroids/_extract.py
@@ -213,9 +213,6 @@ def create_centroids_hdf5(
         store._add_data_to_group(group, "mzs_max", mzs_max, maxshape=(None,), dtype=mzs_max.dtype)
         if ys is not None:
             store._add_data_to_group(group, "ys", ys, maxshape=(None,), dtype=ys.dtype)
-        if spatial_info is not None:
-            for key, value in spatial_info.items():
-                store._add_data_to_group(group, key, value, dtype=value.dtype)
         if chunk_info is None:
             store._add_data_to_group(
                 group,
@@ -240,6 +237,11 @@ def create_centroids_hdf5(
                 dtype=np.float32,
                 **compression,
             )
+        group = store._get_group(h5, store.SPATIAL_KEY)
+        if spatial_info is not None:
+            group.attrs["pixel_size"] = spatial_info.get("pixel_size", 1.0)
+            for key, value in spatial_info.items():
+                store._add_data_to_group(group, key, value, dtype=value.dtype)
     return Path(hdf_path)
diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py
index ce84e4c..235c6f8 100644
--- a/src/imzy/_centroids/_hdf5_store.py
+++ b/src/imzy/_centroids/_hdf5_store.py
@@ -23,6 +23,7 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids):
     # Private attributes
     PEAKS_KEY = "Array"
     PEAKS_ARRAY_KEY = "Array/array"
+    SPATIAL_KEY = "Misc/Spatial"
 
     # Cache attributes
     _chunk_info = None
diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 44ab368..24d8b17 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -67,7 +67,7 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent:
             norm[mask] = np.median(norm[mask])
             # save the normalizations as 'multiplier' version so it's easier to apply
             group[normalization][:] = 1 / (norm / np.median(norm))
-        store.flush()
+    store.flush()
     return hdf_path
@@ -96,6 +96,7 @@ def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
         )
     ):
         norm_array[i] = calculate_normalizations(y.astype(np.float32))
+    norm_array = np.nan_to_num(norm_array, nan=1.0)
    return norm_array
diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index 6402ed4..edf67cb 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -405,7 +405,8 @@ def to_hdf5(
             spatial_info={
                 "x_coordinates": self.x_coordinates,
                 "y_coordinates": self.y_coordinates,
-                "shape": self.image_shape,
+                "image_shape": self.image_shape,
+                "pixel_size": self.pixel_size,
             },
         )
         extract_centroids_hdf5(
@@ -430,12 +431,8 @@ def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False):
         if not hdf_path.suffix == ".h5":
             hdf_path = hdf_path.with_suffix(".h5")
         if not hdf_path.exists():
-            # hdf_path = create_normalizations_hdf5(self.path, hdf_path)
-        hdf_path = extract_normalizations_hdf5(
-            input_dir=self.path,
-            hdf_path=hdf_path,
-            silent=silent,
-        )
+            hdf_path = create_normalizations_hdf5(self.path, hdf_path)
+        hdf_path = extract_normalizations_hdf5(input_dir=self.path, hdf_path=hdf_path, silent=silent)
         return hdf_path
diff --git a/src/imzy/_typing.py b/src/imzy/_typing.py
index 952803a..ae8dffe 100644
--- a/src/imzy/_typing.py
+++ b/src/imzy/_typing.py
@@ -9,4 +9,5 @@ class SpatialInfo(ty.TypedDict):
 
     x_coordinates: np.ndarray
     y_coordinates: np.ndarray
-    shape: ty.Tuple[int, int]
+    image_shape: ty.Tuple[int, int]
+    pixel_size: float

From c9e95b3f48f3285376e75f6095bb6ac8a0126a03 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Mon, 4 Mar 2024 22:45:51 +0100
Subject: [PATCH 05/21] Update __init__.py

---
 src/imzy/_normalizations/__init__.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 24d8b17..d86c739 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -57,17 +57,18 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent:
     normalizations = compute_normalizations(input_dir, silent=silent)
 
     store = H5NormalizationStore(hdf_path, mode="a")
-    with store.open() as h5:
-        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
-        for i, normalization in enumerate(normalization_names):
-            # get normalization
-            norm = normalizations[:, i]
-            # remove outliers
-            mask = _get_outlier_mask(norm, 2)
-            norm[mask] = np.median(norm[mask])
-            # save the normalizations as 'multiplier' version so it's easier to apply
-            group[normalization][:] = 1 / (norm / np.median(norm))
+    with np.errstate(invalid="ignore", divide="ignore"):
+        with store.open() as h5:
+            group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+            for i, normalization in enumerate(normalization_names):
+                # get normalization
+                norm = normalizations[:, i]
+                # remove outliers
+                mask = _get_outlier_mask(norm, 2)
+                norm[mask] = np.median(norm[mask])
+                # save the normalizations as 'multiplier' version so it's easier to apply
+                group[normalization][:] = 1 / (norm / np.median(norm))
+            store.flush()
     store.flush()
     return hdf_path
@@ -104,8 +105,7 @@ def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray:
     """Calculate various normalizations.
 
-    This function expects spectrum in the form
-
+    This function expects float32 spectrum.
""" px_norms = np.zeros(12, dtype=np.float32) px_norms[0] = np.sum(spectrum) # TIC From b812daa7a58b897d69a148cb6ba9279c91abe0ac Mon Sep 17 00:00:00 2001 From: Lukasz Migas Date: Tue, 5 Mar 2024 09:35:44 +0100 Subject: [PATCH 06/21] Improvements --- src/imzy/_centroids/_extract.py | 6 ++-- src/imzy/_normalizations/__init__.py | 49 +++++++++++++++++++++++++++- src/imzy/_readers/_base.py | 2 +- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py index 7c879be..21b1a7c 100644 --- a/src/imzy/_centroids/_extract.py +++ b/src/imzy/_centroids/_extract.py @@ -239,7 +239,7 @@ def create_centroids_hdf5( ) group = store._get_group(h5, store.SPATIAL_KEY) if spatial_info is not None: - group.attrs["pixel_size"] = spatial_info.get("pixel_size", 1.0) + group.attrs["pixel_size"] = spatial_info.pop("pixel_size", 1.0) for key, value in spatial_info.items(): store._add_data_to_group(group, key, value, dtype=value.dtype) return Path(hdf_path) @@ -280,8 +280,8 @@ def extract_centroids_hdf5( indices, disable=silent, desc=f"Extracting {n_peaks} peaks (chunk={chunk_id+1}/{n_chunks})", - miniters=25, - mininterval=0.2, + miniters=250, + mininterval=2, ) ): x, y = reader[index] diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py index d86c739..0e48113 100644 --- a/src/imzy/_normalizations/__init__.py +++ b/src/imzy/_normalizations/__init__.py @@ -96,7 +96,7 @@ def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray: desc="Computing normalizations...", ) ): - norm_array[i] = calculate_normalizations(y.astype(np.float32)) + norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32)) norm_array = np.nan_to_num(norm_array, nan=1.0) return norm_array @@ -129,3 +129,50 @@ def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray: px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm return px_norms + + +@numba.njit(fastmath=True, cache=True) +def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray: + """Calculate various normalizations, optimized version. + + This function expects a float32 spectrum. 
+ """ + px_norms = np.zeros(12, dtype=np.float32) + + # Direct computation without conditions + px_norms[0] = np.sum(spectrum) # TIC + if px_norms[0] == 0: + return px_norms + + # Compute these norms directly without extra conditions + px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS + px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm + px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm + px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm + + # Filter positive values once and reuse + positive_spectrum = spectrum[spectrum > 0] + px_norms[2] = np.median(positive_spectrum) if positive_spectrum.size > 0 else 0 # Median + + # Calculating quantiles once for all needed + if positive_spectrum.size > 1: + q95, q90, q10, q5 = np.nanquantile(positive_spectrum, [0.95, 0.9, 0.1, 0.05]) + else: + q95, q90, q10, q5 = 0, 0, 0, 0 + + # Using logical indexing with boolean arrays might be faster due to numba optimization + condition_q95 = spectrum < q95 + condition_q90 = spectrum < q90 + condition_q5 = spectrum > q5 + condition_q10 = spectrum > q10 + + px_norms[3] = np.sum(spectrum[condition_q95]) # 0-95% TIC + px_norms[4] = np.sum(spectrum[condition_q90]) # 0-90% TIC + px_norms[5] = np.sum(spectrum[condition_q5]) # 5-100% TIC + px_norms[6] = np.sum(spectrum[condition_q10]) # 10-100% TIC + + # For ranges, we can combine conditions + px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC + px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC + + return px_norms diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index edf67cb..f83a976 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -405,7 +405,7 @@ def to_hdf5( spatial_info={ "x_coordinates": self.x_coordinates, "y_coordinates": self.y_coordinates, - "image_shape": self.image_shape, + "image_shape": np.asarray(self.image_shape), "pixel_size": self.pixel_size, }, ) From 09d4b7503c3dfa4b5c3bfcda1e9575eeeaddf8b8 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 5 Mar 2024 08:54:35 +0000 Subject: [PATCH 07/21] Update _hdf5_store.py --- src/imzy/_normalizations/_hdf5_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py index 89a523c..6b3800b 100644 --- a/src/imzy/_normalizations/_hdf5_store.py +++ b/src/imzy/_normalizations/_hdf5_store.py @@ -1,5 +1,6 @@ """HDF5 store for normalizations.""" import numpy as np +from koyo.typing import PathLike from imzy._hdf5_mixin import HDF5Mixin @@ -9,7 +10,7 @@ class H5NormalizationStore(HDF5Mixin): NORMALIZATIONS_KEY: str = "Normalizations" - def __init__(self, path: str, mode: str = "a"): + def __init__(self, path: PathLike, mode: str = "a"): self._init_hdf5(path, mode) def get_normalization(self) -> np.ndarray: From f0f7438a1e1221b0e8481735c6c5cbab60f29e11 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 5 Mar 2024 10:32:40 +0000 Subject: [PATCH 08/21] Fixes --- src/imzy/_centroids/_extract.py | 3 +- src/imzy/_centroids/_hdf5_store.py | 56 ++++++++++++++++++++++++ src/imzy/_hdf5_mixin.py | 27 ++++++++++++ src/imzy/_normalizations/_hdf5_store.py | 57 +++++++++++++++++++++++-- src/imzy/utilities.py | 13 +----- 5 files changed, 139 insertions(+), 17 deletions(-) diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py index 21b1a7c..ece9303 100644 --- a/src/imzy/_centroids/_extract.py +++ b/src/imzy/_centroids/_extract.py @@ -8,9 +8,10 @@ from tqdm import tqdm from imzy import get_reader +from 
imzy._hdf5_mixin import parse_to_attribute from imzy._typing import SpatialInfo from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis -from imzy.utilities import _safe_rmtree, parse_to_attribute +from imzy.utilities import _safe_rmtree try: import hdf5plugin diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py index 235c6f8..9a0edc5 100644 --- a/src/imzy/_centroids/_hdf5_store.py +++ b/src/imzy/_centroids/_hdf5_store.py @@ -17,6 +17,16 @@ from imzy._centroids._base import BaseCentroids +def format_ppm(mz: float, ppm: float): + """Create formatted ppm'¬s.""" + return f"{mz:.3f} Da ± {ppm:.1f}ppm" + + +def format_tol(mz: float, mda: float): + """Create formatted label.""" + return f"{mz:.3f} Da ± {mda:.3f}Da" + + class H5CentroidsStore(HDF5Mixin, BaseCentroids): """HDF5-centroids.""" @@ -31,6 +41,8 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids): _proxy = None _is_chunked = None _low_mem: bool = True + _ion_labels = None + _mz, _bins, _ppm, _eppm, _mda = None, None, None, None, None def __init__( self, @@ -105,6 +117,50 @@ def lazy_peaks(self): group = h5[self.PEAKS_KEY] yield group["array"] + @property + def tol(self) -> int: + """Get Da tolerance the data was extracted at.""" + if self._tol is None: + self._tol = self.get_attr(self.PEAKS_KEY, "tol") + return self._tol + + @property + def ppm(self) -> float: + """Get ppm/bins the data was extracted at.""" + if self._ppm is None: + self._ppm = self.get_attr(self.PEAKS_KEY, "ppm") + return self._ppm + + def _setup_labels(self): + """Setup labels.""" + if self._ion_labels is None: + mzs = self.xs + ppm = self.ppm + tol = self.tol + self._ion_labels = { + "ppm": [format_ppm(mz, ppm) for mz in mzs] if ppm else [], + "tol": [format_tol(mz, tol) for i, mz in enumerate(mzs)] if tol is not None else [], + } + if not self._ion_labels["ppm"]: + self._ion_labels["ppm"] = self._ion_labels["tol"] + + def index_to_name(self, index: int, effective: bool = True, mz: bool = False): + """Convert index to ion name.""" + if self._ion_labels is None: + self._setup_labels() + return self._ion_labels["ppm"][index] + + @property + def labels(self) -> ty.Tuple[np.ndarray, ty.Optional[np.ndarray], ty.Optional[np.ndarray]]: + """Return all available labels.""" + return self.xs, None, None + + def lazy_iter(self): + """Lazily yield ion images.""" + with self.lazy_peaks() as peaks: + for i, mz in enumerate(self.xs): + yield mz, peaks[:, i] + def get_ion(self, value: ty.Union[int, float]) -> np.ndarray: """Retrieve single ion.""" if isinstance(value, float): diff --git a/src/imzy/_hdf5_mixin.py b/src/imzy/_hdf5_mixin.py index 555201e..05d8351 100644 --- a/src/imzy/_hdf5_mixin.py +++ b/src/imzy/_hdf5_mixin.py @@ -11,6 +11,20 @@ from koyo.typing import PathLike +def parse_from_attribute(attribute): + """Parse attribute from cache.""" + if isinstance(attribute, str) and attribute == "__NONE__": + attribute = None + return attribute + + +def parse_to_attribute(attribute): + """Parse attribute to cache.""" + if attribute is None: + attribute = "__NONE__" + return attribute + + def check_hdf5() -> None: """Check whether Zarr, dask and rechunker are installed.""" try: @@ -104,3 +118,16 @@ def _add_data_to_group( compression_opts=compression_opts, shape=shape, ) + + def get_attr(self, dataset_name, attr: str, default=None): + """Safely retrieve 1 attribute.""" + with self.open("r") as h5: + group = self._get_group(h5, dataset_name) + value = parse_from_attribute(group.attrs.get(attr, 
default)) + return value + + def get_array(self, dataset_name: str, key: str) -> np.ndarray: + """Safely retrieve 1 array.""" + with self.open("r") as h5: + group = self._get_group(h5, dataset_name) + return group[key][:] diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py index 6b3800b..d881870 100644 --- a/src/imzy/_normalizations/_hdf5_store.py +++ b/src/imzy/_normalizations/_hdf5_store.py @@ -1,6 +1,7 @@ """HDF5 store for normalizations.""" import numpy as np from koyo.typing import PathLike +from loguru import logger from imzy._hdf5_mixin import HDF5Mixin @@ -8,16 +9,64 @@ class H5NormalizationStore(HDF5Mixin): """HDF5 store for normalizations.""" - NORMALIZATIONS_KEY: str = "Normalizations" + NORMALIZATION_KEY: str = "Normalization" def __init__(self, path: PathLike, mode: str = "a"): self._init_hdf5(path, mode) - def get_normalization(self) -> np.ndarray: - """Get normalization data.""" + def __getitem__(self, item: str) -> np.ndarray: + """Retrieve item.""" + return self.get_normalization(item) + + def __call__(self, array: np.ndarray, *args, **kwargs): + """Apply normalization.""" + return self.normalize(array, *args, **kwargs) def get_normalization_list(self) -> list[str]: """Get list of available normalizations.""" with self.open("r") as h5: - normalizations = list(h5[self.NORMALIZATIONS_KEY].keys()) + normalizations = list(h5[self.NORMALIZATION_KEY].keys()) return normalizations + + def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bool = True) -> np.ndarray: + """Get normalization array.""" + if not name.startswith(f"{self.NORMALIZATION_KEY}/"): + name = f"{self.NORMALIZATION_KEY}/{name}" + try: + normalization = self.get_array(name, "normalization") + except KeyError: + raise KeyError(f"Normalization '{name}' not found in store.") + return normalization + + def normalize(self, array: np.ndarray, name: str, as_median: bool = True, **kwargs) -> np.ndarray: + """Normalize array.""" + + def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarray: + """Apply normalization to an array. + + Parameters + ---------- + array : np.ndarray + array to be reshaped + norm : str + name of the normalization to be applied to the array + + Returns + ------- + array : np.ndarray + normalize array. If 'reshape' is True it will be automatically reshaped to a 2D representation + """ + if array.ndim > 2: + raise ValueError("Array should be 2D or 1D") + try: + norm_array = self.get_normalization(norm, as_multiplier=True) + array = np.multiply(array, norm_array) + except ValueError: + logger.warning(f"Normalization '{norm}' not found. 
Skipping normalization.") + return _postprocess(array) + + +def _postprocess(array: np.ndarray): + # array[np.isnan(array)] = 0 + array[np.isinf(array)] = 0 + return array diff --git a/src/imzy/utilities.py b/src/imzy/utilities.py index 25d5e09..b4989c0 100644 --- a/src/imzy/utilities.py +++ b/src/imzy/utilities.py @@ -49,15 +49,4 @@ def _safe_rmtree(path): pass -def parse_from_attribute(attribute): - """Parse attribute from cache.""" - if isinstance(attribute, str) and attribute == "__NONE__": - attribute = None - return attribute - - -def parse_to_attribute(attribute): - """Parse attribute to cache.""" - if attribute is None: - attribute = "__NONE__" - return attribute + From fdeb2b25bc57ef562ecd5317cd3388aaae3f7072 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 5 Mar 2024 10:37:59 +0000 Subject: [PATCH 09/21] Fixes --- src/imzy/_normalizations/__init__.py | 2 +- src/imzy/_normalizations/_hdf5_store.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py index 0e48113..22c2198 100644 --- a/src/imzy/_normalizations/__init__.py +++ b/src/imzy/_normalizations/__init__.py @@ -59,7 +59,7 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: store = H5NormalizationStore(hdf_path, mode="a") with np.errstate(invalid="ignore", divide="ignore"): with store.open() as h5: - group = store._get_group(h5, store.NORMALIZATIONS_KEY) + group = store._get_group(h5, store.NORMALIZATION_KEY) for i, normalization in enumerate(normalization_names): # get normalization norm = normalizations[:, i] diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py index d881870..0a1a934 100644 --- a/src/imzy/_normalizations/_hdf5_store.py +++ b/src/imzy/_normalizations/_hdf5_store.py @@ -38,8 +38,9 @@ def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bo raise KeyError(f"Normalization '{name}' not found in store.") return normalization - def normalize(self, array: np.ndarray, name: str, as_median: bool = True, **kwargs) -> np.ndarray: + def normalize(self, array: np.ndarray, name: str, **kwargs) -> np.ndarray: """Normalize array.""" + return self._single_normalize(array, name, **kwargs) def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarray: """Apply normalization to an array. 
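
With PATCH 09 in place, the normalization store is usable end to end. A minimal usage sketch under stated assumptions (the dataset path and the flat image array are hypothetical, not from the patches; the calls are the reader method from PATCH 01 and the store API finalized above; normalizations are stored as per-pixel multipliers, 1 / (norm / median), per PATCH 02):

    import numpy as np
    from imzy import get_reader
    from imzy._normalizations._hdf5_store import H5NormalizationStore

    # compute and cache per-pixel normalizations (hypothetical paths)
    reader = get_reader("example.imzML")
    norm_path = reader.extract_normalizations_hdf5("example_norms.h5")

    # open the store and inspect which normalizations were written
    store = H5NormalizationStore(norm_path, mode="r")
    print(store.get_normalization_list())  # e.g. ["0-90% TIC", ..., "TIC"]

    # applying a normalization is a per-pixel multiplication of a flat image
    flat_image = np.random.rand(reader.n_pixels).astype(np.float32)  # hypothetical ion image
    normalized = store.normalize(flat_image, "TIC")
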
From 7c8d173deea7cce5ae0659771256bd970fcdfcc3 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Tue, 5 Mar 2024 11:59:49 +0100
Subject: [PATCH 10/21] Update _hdf5_store.py

---
 src/imzy/_centroids/_hdf5_store.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py
index 9a0edc5..d912c7b 100644
--- a/src/imzy/_centroids/_hdf5_store.py
+++ b/src/imzy/_centroids/_hdf5_store.py
@@ -42,7 +42,8 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids):
     _is_chunked = None
     _low_mem: bool = True
     _ion_labels = None
-    _mz, _bins, _ppm, _eppm, _mda = None, None, None, None, None
+    _tol, _ppm = None, None
+    unique_id: str = ""
 
     def __init__(
         self,

From 98067d98f069fa0012b2f71fac93fda9a0231610 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Tue, 5 Mar 2024 12:00:15 +0100
Subject: [PATCH 11/21] Update __init__.py

---
 src/imzy/_normalizations/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 22c2198..68751ef 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -35,7 +35,7 @@ def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
 
     store = H5NormalizationStore(hdf_path, mode="a")
     with store.open() as h5:
-        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+        group = store._get_group(h5, store.NORMALIZATION_KEY)
         for normalization in get_normalizations():
             group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression)
         store.flush()

From 8132f94011a3f6c86f418ec8ac5d6abe4d058d49 Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Tue, 5 Mar 2024 11:23:09 +0000
Subject: [PATCH 12/21] Fixes

---
 src/imzy/_normalizations/__init__.py | 175 --------------------------
 src/imzy/_normalizations/_extract.py | 177 +++++++++++++++++++++++++++
 src/imzy/_readers/_base.py           |   3 +-
 3 files changed, 179 insertions(+), 176 deletions(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 68751ef..fba7cad 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -1,178 +1,3 @@
 """Normalizations."""
-import warnings
-from pathlib import Path
-
-import numba
-import numpy as np
-from koyo.typing import PathLike
-from tqdm import tqdm
-
-try:
-    import hdf5plugin
-except ImportError:
-
-    class hdf5plugin:
-        """Dummy class."""
-
-        LZ4 = lambda *args, **kwargs: {}
-
-
 from imzy._normalizations._extract import get_normalizations
 from imzy._normalizations._hdf5_store import H5NormalizationStore
-
-
-def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
-    """Create group with datasets inside."""
-    from imzy._readers import get_reader
-
-    reader = get_reader(input_dir)
-    n_pixels = reader.n_pixels
-
-    compression = hdf5plugin.LZ4()
-
-    if Path(hdf_path).suffix != ".h5":
-        hdf_path = Path(hdf_path).with_suffix(".h5")
-
-    store = H5NormalizationStore(hdf_path, mode="a")
-    with store.open() as h5:
-        group = store._get_group(h5, store.NORMALIZATION_KEY)
-        for normalization in get_normalizations():
-            group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression)
-        store.flush()
-    return hdf_path
-
-
-def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray:
-    """Retrieve normalization and determine outliers."""
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", RuntimeWarning)
-        norm = np.log10(norm + 1)
-        mask = norm < (norm.max() - n_orders)
-    return mask
-
-
-def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
-    """Extract normalizations from hdf5."""
-    normalization_names = get_normalizations()
-    normalizations = compute_normalizations(input_dir, silent=silent)
-
-    store = H5NormalizationStore(hdf_path, mode="a")
-    with np.errstate(invalid="ignore", divide="ignore"):
-        with store.open() as h5:
-            group = store._get_group(h5, store.NORMALIZATION_KEY)
-            for i, normalization in enumerate(normalization_names):
-                # get normalization
-                norm = normalizations[:, i]
-                # remove outliers
-                mask = _get_outlier_mask(norm, 2)
-                norm[mask] = np.median(norm[mask])
-                # save the normalizations as 'multiplier' version so it's easier to apply
-                group[normalization][:] = 1 / (norm / np.median(norm))
-            store.flush()
-    return hdf_path
-
-
-def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
-    """Calculate normalizations for a set of frames."""
-    from imzy._readers import get_reader
-
-    reader = get_reader(input_dir)
-
-    # specify normalization names
-    names = get_normalizations()
-
-    # pre-assign array
-    n_frames = reader.n_pixels
-    framelist = reader.pixels
-    # start_frame, norm_array = check_cache()
-    norm_array = np.zeros((n_frames, len(names)), dtype=np.float32)
-    for i, (_, y) in enumerate(
-        tqdm(
-            reader.spectra_iter(framelist, silent=True),
-            disable=silent,
-            miniters=100,
-            mininterval=2,
-            total=len(framelist),
-            desc="Computing normalizations...",
-        )
-    ):
-        norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32))
-    norm_array = np.nan_to_num(norm_array, nan=1.0)
-    return norm_array
-
-
-@numba.njit(fastmath=True, cache=True)
-def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray:
-    """Calculate various normalizations.
-
-    This function expects float32 spectrum.
-    """
-    px_norms = np.zeros(12, dtype=np.float32)
-    px_norms[0] = np.sum(spectrum)  # TIC
-    if px_norms[0] == 0:
-        return px_norms
-
-    px_norms[1] = np.sqrt(np.mean(np.square(spectrum)))  # RMS
-    px_norms[2] = np.median(spectrum[spectrum > 0])  # Median
-    _spectrum = spectrum[spectrum > 0]
-    if _spectrum.size > 1:
-        q95, q90, q10, q5 = np.nanquantile(_spectrum, [0.95, 0.9, 0.1, 0.05])
-    else:
-        q95, q90, q10, q5 = 0, 0, 0, 0
-    px_norms[3] = np.sum(spectrum[spectrum < q95])  # 0-95% TIC
-    px_norms[4] = np.sum(spectrum[spectrum < q90])  # 0-90% TIC
-    px_norms[5] = np.sum(spectrum[spectrum > q5])  # 5-100% TIC
-    px_norms[6] = np.sum(spectrum[spectrum > q10])  # 10-100% TIC
-    px_norms[7] = np.sum(spectrum[(spectrum > q5) & (spectrum < q95)])  # 5-95% TIC
-    px_norms[8] = np.sum(spectrum[(spectrum > q10) & (spectrum < q90)])  # 10-90% TIC
-    px_norms[9] = np.linalg.norm(spectrum, 0)  # 0-norm
-    px_norms[10] = np.linalg.norm(spectrum, 2)  # 2-norm
-    px_norms[11] = np.linalg.norm(spectrum, 3)  # 3-norm
-    return px_norms
-
-
-@numba.njit(fastmath=True, cache=True)
-def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray:
-    """Calculate various normalizations, optimized version.
-
-    This function expects a float32 spectrum.
- """ - px_norms = np.zeros(12, dtype=np.float32) - - # Direct computation without conditions - px_norms[0] = np.sum(spectrum) # TIC - if px_norms[0] == 0: - return px_norms - - # Compute these norms directly without extra conditions - px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS - px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm - px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm - px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm - - # Filter positive values once and reuse - positive_spectrum = spectrum[spectrum > 0] - px_norms[2] = np.median(positive_spectrum) if positive_spectrum.size > 0 else 0 # Median - - # Calculating quantiles once for all needed - if positive_spectrum.size > 1: - q95, q90, q10, q5 = np.nanquantile(positive_spectrum, [0.95, 0.9, 0.1, 0.05]) - else: - q95, q90, q10, q5 = 0, 0, 0, 0 - - # Using logical indexing with boolean arrays might be faster due to numba optimization - condition_q95 = spectrum < q95 - condition_q90 = spectrum < q90 - condition_q5 = spectrum > q5 - condition_q10 = spectrum > q10 - - px_norms[3] = np.sum(spectrum[condition_q95]) # 0-95% TIC - px_norms[4] = np.sum(spectrum[condition_q90]) # 0-90% TIC - px_norms[5] = np.sum(spectrum[condition_q5]) # 5-100% TIC - px_norms[6] = np.sum(spectrum[condition_q10]) # 10-100% TIC - - # For ranges, we can combine conditions - px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC - px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC - - return px_norms diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py index e9857aa..f6ddddf 100644 --- a/src/imzy/_normalizations/_extract.py +++ b/src/imzy/_normalizations/_extract.py @@ -1,4 +1,24 @@ """Get normalization utilities.""" +import warnings +from pathlib import Path + +import numba + +try: + import hdf5plugin +except ImportError: + + class hdf5plugin: + """Dummy class.""" + + LZ4 = lambda *args, **kwargs: {} + + +import numpy as np +from koyo.typing import PathLike +from tqdm import tqdm + +from imzy._normalizations import H5NormalizationStore def get_normalizations() -> list[str]: @@ -17,3 +37,160 @@ def get_normalizations() -> list[str]: "2-norm", "3-norm", ] + + +def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path: + """Create group with datasets inside.""" + from imzy._readers import get_reader + + reader = get_reader(input_dir) + n_pixels = reader.n_pixels + + compression = hdf5plugin.LZ4() + + if Path(hdf_path).suffix != ".h5": + hdf_path = Path(hdf_path).with_suffix(".h5") + + store = H5NormalizationStore(hdf_path, mode="a") + with store.open() as h5: + group = store._get_group(h5, store.NORMALIZATION_KEY) + for normalization in get_normalizations(): + group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression) + store.flush() + return hdf_path + + +def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray: + """Retrieve normalization and determine outliers.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + norm = np.log10(norm + 1) + mask = norm < (norm.max() - n_orders) + return mask + + +def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path: + """Extract normalizations from hdf5.""" + normalization_names = get_normalizations() + normalizations = compute_normalizations(input_dir, silent=silent) + + store = H5NormalizationStore(hdf_path, mode="a") + with np.errstate(invalid="ignore", divide="ignore"): + 
with store.open() as h5: + group = store._get_group(h5, store.NORMALIZATION_KEY) + for i, normalization in enumerate(normalization_names): + # get normalization + norm = normalizations[:, i] + # remove outliers + mask = _get_outlier_mask(norm, 2) + norm[mask] = np.median(norm[mask]) + # save the normalizations as 'multiplier' version so it's easier to apply + group[normalization][:] = 1 / (norm / np.median(norm)) + store.flush() + return hdf_path + + +def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray: + """Calculate normalizations for a set of frames.""" + from imzy._readers import get_reader + + reader = get_reader(input_dir) + + # specify normalization names + names = get_normalizations() + + # pre-assign array + n_frames = reader.n_pixels + framelist = reader.pixels + # start_frame, norm_array = check_cache() + norm_array = np.zeros((n_frames, len(names)), dtype=np.float32) + for i, (_, y) in enumerate( + tqdm( + reader.spectra_iter(framelist, silent=True), + disable=silent, + miniters=100, + mininterval=2, + total=len(framelist), + desc="Computing normalizations...", + ) + ): + norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32)) + norm_array = np.nan_to_num(norm_array, nan=1.0) + return norm_array + + +@numba.njit(fastmath=True, cache=True) +def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray: + """Calculate various normalizations. + + This function expects float32 spectrum. + """ + px_norms = np.zeros(12, dtype=np.float32) + px_norms[0] = np.sum(spectrum) # TIC + if px_norms[0] == 0: + return px_norms + + px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS + px_norms[2] = np.median(spectrum[spectrum > 0]) # Median + _spectrum = spectrum[spectrum > 0] + if _spectrum.size > 1: + q95, q90, q10, q5 = np.nanquantile(_spectrum, [0.95, 0.9, 0.1, 0.05]) + else: + q95, q90, q10, q5 = 0, 0, 0, 0 + px_norms[3] = np.sum(spectrum[spectrum < q95]) # 0-95% TIC + px_norms[4] = np.sum(spectrum[spectrum < q90]) # 0-90% TIC + px_norms[5] = np.sum(spectrum[spectrum > q5]) # 5-100% TIC + px_norms[6] = np.sum(spectrum[spectrum > q10]) # 10-100% TIC + px_norms[7] = np.sum(spectrum[(spectrum > q5) & (spectrum < q95)]) # 5-95% TIC + px_norms[8] = np.sum(spectrum[(spectrum > q10) & (spectrum < q90)]) # 10-90% TIC + px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm + px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm + px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm + return px_norms + + +@numba.njit(fastmath=True, cache=True) +def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray: + """Calculate various normalizations, optimized version. + + This function expects a float32 spectrum. 
+ """ + px_norms = np.zeros(12, dtype=np.float32) + + # Direct computation without conditions + px_norms[0] = np.sum(spectrum) # TIC + if px_norms[0] == 0: + return px_norms + + # Compute these norms directly without extra conditions + px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS + px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm + px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm + px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm + + # Filter positive values once and reuse + positive_spectrum = spectrum[spectrum > 0] + px_norms[2] = np.median(positive_spectrum) if positive_spectrum.size > 0 else 0 # Median + + # Calculating quantiles once for all needed + if positive_spectrum.size > 1: + q95, q90, q10, q5 = np.nanquantile(positive_spectrum, [0.95, 0.9, 0.1, 0.05]) + else: + q95, q90, q10, q5 = 0, 0, 0, 0 + + # Using logical indexing with boolean arrays might be faster due to numba optimization + condition_q95 = spectrum < q95 + condition_q90 = spectrum < q90 + condition_q5 = spectrum > q5 + condition_q10 = spectrum > q10 + + px_norms[3] = np.sum(spectrum[condition_q95]) # 0-95% TIC + px_norms[4] = np.sum(spectrum[condition_q90]) # 0-90% TIC + px_norms[5] = np.sum(spectrum[condition_q5]) # 5-100% TIC + px_norms[6] = np.sum(spectrum[condition_q10]) # 10-100% TIC + + # For ranges, we can combine conditions + px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC + px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC + + return px_norms diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index f83a976..292e392 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -424,7 +424,8 @@ def to_hdf5( def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False): """Extract normalizations.""" from imzy._hdf5_mixin import check_hdf5 - from imzy._normalizations import create_normalizations_hdf5, extract_normalizations_hdf5 + from imzy._normalizations._extract import extract_normalizations_hdf5 + from imzy._normalizations._extract import create_normalizations_hdf5 check_hdf5() hdf_path = Path(hdf_path) From a5a4f071b0ed0a6e8cf3e4092a5561f7accf3946 Mon Sep 17 00:00:00 2001 From: Lukasz Migas Date: Tue, 5 Mar 2024 15:54:31 +0100 Subject: [PATCH 13/21] Fixes --- src/imzy/_centroids/_extract.py | 4 +++- src/imzy/_centroids/_hdf5_store.py | 24 +++++++++++++++++++++++- src/imzy/_hdf5_mixin.py | 9 +++++++++ src/imzy/_normalizations/_hdf5_store.py | 25 ++++++++++++++++++++++++- 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py index ece9303..25fb9b7 100644 --- a/src/imzy/_centroids/_extract.py +++ b/src/imzy/_centroids/_extract.py @@ -166,7 +166,9 @@ def get_chunk_info(n_pixels: int, n_peaks: int, max_mem: float = 512) -> ty.Dict _max_mem = (float(n_pixels) * n_peaks * 4) / (1024**2) # assume 4 bytes per element n_tasks = math.ceil(_max_mem / max_mem) or 1 - return dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks)))) + chunk_info = dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks)))) + assert sum(len(v) for v in chunk_info.values()) == n_pixels, "Chunking failed" + return chunk_info def create_centroids_hdf5( diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py index d912c7b..0bdb3a4 100644 --- a/src/imzy/_centroids/_hdf5_store.py +++ b/src/imzy/_centroids/_hdf5_store.py @@ -43,6 +43,7 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids): _low_mem: bool = True 
     _ion_labels = None
     _tol, _ppm = None, None
+    _peaks = None
     unique_id: str = ""
 
     def __init__(
@@ -118,6 +119,27 @@ def lazy_peaks(self):
             group = h5[self.PEAKS_KEY]
             yield group["array"]
 
+    @property
+    def peaks(self) -> np.ndarray:
+        """Load peaks data.
+
+        This function uses custom data loading to speed things up. Because we chunk the ion images along one
+        dimension, it becomes very slow to read the entire dataset in one go. It can be optimized a little by
+        reading the data one image at a time and then building the 2D array.
+        """
+        if self._peaks is None:
+            if self.is_chunked:
+                if self._proxy is None:
+                    self._proxy = LazyPeaksProxy(self)
+                self._peaks = self._proxy.peaks()
+            else:
+                array = np.zeros(self.shape, dtype=self.dtype)
+                with self.open("r") as h5:
+                    for sl in h5[self.PEAKS_ARRAY_KEY].iter_chunks():
+                        array[sl] = h5[self.PEAKS_ARRAY_KEY][sl]
+                self._peaks = array
+        return self._peaks
+
     @property
     def tol(self) -> int:
         """Get Da tolerance the data was extracted at."""
@@ -216,7 +238,7 @@ def __getitem__(self, item):
     def shape(self) -> ty.Tuple[int, int]:
         """Return shape."""
         chunk_info = self.obj.chunk_info
-        n_px = chunk_info[max(chunk_info.keys())].max()
+        n_px = sum(len(indices) for indices in chunk_info.values())
         return n_px, self.obj.n_peaks
 
     @property
diff --git a/src/imzy/_hdf5_mixin.py b/src/imzy/_hdf5_mixin.py
index 05d8351..f434f2c 100644
--- a/src/imzy/_hdf5_mixin.py
+++ b/src/imzy/_hdf5_mixin.py
@@ -10,6 +10,15 @@
 h5py = None
 from koyo.typing import PathLike
 
+try:
+    import hdf5plugin
+except ImportError:
+
+    class hdf5plugin:
+        """Dummy class."""
+
+        LZ4 = lambda *args, **kwargs: {}
+
 
 def parse_from_attribute(attribute):
     """Parse attribute from cache."""
diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
index 0a1a934..85a9228 100644
--- a/src/imzy/_normalizations/_hdf5_store.py
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -40,7 +40,11 @@ def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bo
 
     def normalize(self, array: np.ndarray, name: str, **kwargs) -> np.ndarray:
         """Normalize array."""
-        return self._single_normalize(array, name, **kwargs)
+        array = np.asanyarray(array)
+        if array.ndim < 2:  # or is_single_2d(self.mobj, array):
+            return self._single_normalize(array, name, **kwargs)
+        else:
+            return self._batch_normalize(array, name, **kwargs)
 
     def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarray:
         """Apply normalization to an array.
@@ -66,6 +70,25 @@ def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarra
             logger.warning(f"Normalization '{norm}' not found. Skipping normalization.")
Skipping normalization.") return _postprocess(array) + def _batch_normalize( + self, + array: np.ndarray, + norm: str, + **kwargs, + ): + if array.ndim < 2: + raise ValueError("Expected two-dimensional array of 'N pixels * M peaks'.") + + norm_array = self.get_normalization( + norm, + as_multiplier=True, + ) + if array.ndim == 2: + if norm_array.shape[0] != array.shape[0]: + raise ValueError("The input array does not have the same number of pixels as the normalization vector.") + array = np.multiply(array.T, norm_array).T + return _postprocess(array) + def _postprocess(array: np.ndarray): # array[np.isnan(array)] = 0 From 6f34b8db99a0d7fa73b04e060a9501d78287b3d7 Mon Sep 17 00:00:00 2001 From: Lukasz Migas Date: Fri, 29 Mar 2024 11:38:06 +0100 Subject: [PATCH 14/21] Fixes --- src/imzy/_normalizations/_extract.py | 2 +- src/imzy/_readers/_base.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py index f6ddddf..84426b9 100644 --- a/src/imzy/_normalizations/_extract.py +++ b/src/imzy/_normalizations/_extract.py @@ -18,7 +18,7 @@ class hdf5plugin: from koyo.typing import PathLike from tqdm import tqdm -from imzy._normalizations import H5NormalizationStore +from imzy._normalizations._hdf5_store import H5NormalizationStore def get_normalizations() -> list[str]: diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index 292e392..78a40ae 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -424,8 +424,7 @@ def to_hdf5( def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False): """Extract normalizations.""" from imzy._hdf5_mixin import check_hdf5 - from imzy._normalizations._extract import extract_normalizations_hdf5 - from imzy._normalizations._extract import create_normalizations_hdf5 + from imzy._normalizations._extract import create_normalizations_hdf5, extract_normalizations_hdf5 check_hdf5() hdf_path = Path(hdf_path) From d50066bbeae9c948fbfb8365699da790f986d6b6 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 9 Apr 2024 15:27:30 +0100 Subject: [PATCH 15/21] Update _extract.py --- src/imzy/_normalizations/_extract.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py index f6ddddf..3dd3783 100644 --- a/src/imzy/_normalizations/_extract.py +++ b/src/imzy/_normalizations/_extract.py @@ -192,5 +192,4 @@ def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray: # For ranges, we can combine conditions px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC - return px_norms From d96aa001b568d1dc81fd8a906d1e2b5e2d219ad8 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 9 Apr 2024 15:35:44 +0100 Subject: [PATCH 16/21] Update _base.py --- src/imzy/_readers/_base.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index 78a40ae..3d2467e 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -1,4 +1,5 @@ """Base reader.""" + import os import typing as ty from contextlib import suppress @@ -131,6 +132,16 @@ def image_shape(self) -> ty.Tuple[int, int]: """Return shape of the image.""" return self.y_size, self.x_size + def index_to_xy_coordinates(self, index: int) -> ty.Tuple[int, int]: + """Convert index to x, y coordinates.""" + return self.x_coordinates[index], 
+
+    def xy_coordinates_to_index(self, x: int, y: int) -> int:
+        """Convert x, y coordinates to index."""
+        indices = np.where((self.x_coordinates == x) & (self.y_coordinates == y))[0]
+        assert indices.size > 0, f"No pixel found at coordinates ({x}, {y})."
+        return indices[0]
+
     @property
     def xyz_coordinates(self) -> np.ndarray:
         """Return xyz coordinates."""

From baabac1d57b99cd3c3e7f36c8d971b9621ea754d Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Mon, 9 Sep 2024 21:15:09 +0200
Subject: [PATCH 17/21] Fixes

---
 src/imzy/_readers/_base.py        |  4 ++++
 src/imzy/_readers/imzml/_imzml.py | 10 ++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index 3d2467e..6746266 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -127,6 +127,10 @@ def reshape_batch(self, array: np.ndarray, fill_value: float = 0) -> np.ndarray:
             im[i, self.y_coordinates, self.x_coordinates] = array[:, i]
         return im
 
+    def flatten(self, array: np.ndarray) -> np.ndarray:
+        """Flatten 2D image."""
+        return array[self.y_coordinates, self.x_coordinates]
+
     @property
     def image_shape(self) -> ty.Tuple[int, int]:
         """Return shape of the image."""
diff --git a/src/imzy/_readers/imzml/_imzml.py b/src/imzy/_readers/imzml/_imzml.py
index a89dbe8..bf1cd14 100644
--- a/src/imzy/_readers/imzml/_imzml.py
+++ b/src/imzy/_readers/imzml/_imzml.py
@@ -179,6 +179,11 @@ def reshape_batch(self, array: np.ndarray, fill_value: float = 0) -> np.ndarray:
             im[i, self.y_coordinates - 1, self.x_coordinates - 1] = array[:, i]
         return im
 
+    def flatten(self, image: np.ndarray) -> np.ndarray:
+        """Retrieve original vector of intensities from an image."""
+        array = image[self.y_coordinates - 1, self.x_coordinates - 1]
+        return array
+
     def get_summed_spectrum(self, indices: ty.Iterable[int], silent: bool = False) -> ty.Tuple[np.ndarray, np.ndarray]:
         """Sum pixel data to produce summed mass spectrum."""
         indices = np.asarray(indices)
@@ -269,8 +274,9 @@ def _read_spectra(
             mz_bytes = f_ptr.read(mz_l * self._mz_size)
             f_ptr.seek(int_o)
             int_bytes = f_ptr.read(int_l * self._int_size)
-            yield np.frombuffer(mz_bytes, dtype=self.mz_precision), np.frombuffer(
-                int_bytes, dtype=self.int_precision
+            yield (
+                np.frombuffer(mz_bytes, dtype=self.mz_precision),
+                np.frombuffer(int_bytes, dtype=self.int_precision),
             )

From abbed7a1e8d300b1b0c8a467318963eef97bc38a Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Thu, 28 Nov 2024 18:00:33 +0100
Subject: [PATCH 18/21] Update _hdf5_store.py

---
 src/imzy/_normalizations/_hdf5_store.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
index 85a9228..4fc2a7c 100644
--- a/src/imzy/_normalizations/_hdf5_store.py
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -34,6 +34,9 @@ def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bo
         name = f"{self.NORMALIZATION_KEY}/{name}"
         try:
             normalization = self.get_array(name, "normalization")
+        except ValueError:
+            _, name = name.split("/")
+            normalization = self.get_array(self.NORMALIZATION_KEY, name)
         except KeyError:
             raise KeyError(f"Normalization '{name}' not found in store.")
         return normalization

From 4795500c29be3bf915aafb3cb5f590582722dfb3 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Thu, 12 Dec 2024 23:12:25 +0100
Subject: [PATCH 19/21] Improvements

---
 src/imzy/_centroids/_base.py            | 6 ++++--
 src/imzy/_normalizations/_hdf5_store.py | 5 ++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/imzy/_centroids/_base.py b/src/imzy/_centroids/_base.py
index a3dec5e..fa340d0 100644
--- a/src/imzy/_centroids/_base.py
+++ b/src/imzy/_centroids/_base.py
@@ -56,13 +56,15 @@ def __getitem__(self, item):
 
     def _reshape_single(self, array, fill_value=np.nan):
         """Reshape single image."""
        if self._reshape_by_coordinates:
-            return reshape_array_from_coordinates(array, self.image_shape, self.xyz_coordinates, fill_value=fill_value)
+            return reshape_array_from_coordinates(
+                array, self.image_shape, self.xyz_coordinates, fill_value=fill_value, offset=1
+            )
         return reshape_array(array, self.image_shape, self.pixel_index, fill_value=fill_value)
 
     def _reshape_multiple(self, array, fill_value=np.nan):
         if self._reshape_by_coordinates:
             return reshape_array_batch_from_coordinates(
-                array, self.image_shape, self.xyz_coordinates, fill_value=fill_value
+                array, self.image_shape, self.xyz_coordinates, fill_value=fill_value, offset=1
             )
         return reshape_array_batch(array, self.image_shape, self.pixel_index, fill_value=fill_value)
diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
index 4fc2a7c..ccb5575 100644
--- a/src/imzy/_normalizations/_hdf5_store.py
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -88,7 +88,10 @@ def _batch_normalize(
         if array.ndim == 2:
             if norm_array.shape[0] != array.shape[0]:
-                raise ValueError("The input array does not have the same number of pixels as the normalization vector.")
+                raise ValueError(
+                    "The input array does not have the same number of pixels as the normalization vector."
+                    f" (norm={norm_array.shape}; array={array.shape})"
+                )
             array = np.multiply(array.T, norm_array).T
         return _postprocess(array)

From 02f010b60656a0d36917b75db6a4fd40f4efb73a Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Fri, 31 Jan 2025 13:54:12 +0000
Subject: [PATCH 20/21] Update _extract.py

---
 src/imzy/_normalizations/_extract.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py
index 89cd31e..a4c3d97 100644
--- a/src/imzy/_normalizations/_extract.py
+++ b/src/imzy/_normalizations/_extract.py
@@ -1,4 +1,5 @@
 """Get normalization utilities."""
+
 import warnings
 from pathlib import Path
 
@@ -72,25 +73,19 @@ def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray:
 
 def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
     """Extract normalizations from hdf5."""
     normalization_names = get_normalizations()
-    normalizations = compute_normalizations(input_dir, silent=silent)
+    normalizations = compute_normalizations(input_dir, clean=True, silent=silent)
     store = H5NormalizationStore(hdf_path, mode="a")
     with np.errstate(invalid="ignore", divide="ignore"):
         with store.open() as h5:
             group = store._get_group(h5, store.NORMALIZATION_KEY)
             for i, normalization in enumerate(normalization_names):
-                # get normalization
-                norm = normalizations[:, i]
-                # remove outliers
-                mask = _get_outlier_mask(norm, 2)
-                norm[mask] = np.median(norm[mask])
-                # save the normalizations as 'multiplier' version so it's easier to apply
-                group[normalization][:] = 1 / (norm / np.median(norm))
+                group[normalization][:] = normalizations[:, i]
     store.flush()
     return hdf_path
 
 
-def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
+def compute_normalizations(input_dir: Path, clean: bool = True, silent: bool = False) -> np.ndarray:
     """Calculate normalizations for a set of frames."""
     from imzy._readers import get_reader
@@ -116,6 +111,16 @@ def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
     ):
         norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32))
     norm_array = np.nan_to_num(norm_array, nan=1.0)
+    # clean-up normalizations
+    for i in range(norm_array.shape[1]):
+        # get normalization
+        norm = norm_array[:, i]
+        if clean:
+            # remove outliers
+            mask = _get_outlier_mask(norm, 2)
+            norm[mask] = np.median(norm[mask])
+        # save the normalizations as 'multiplier' version so it's easier to apply
+        norm_array[:, i] = 1 / (norm / np.median(norm))
     return norm_array

From c72795232b540c5a963ea480841dd5a024add252 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Fri, 31 Jan 2025 14:27:32 +0000
Subject: [PATCH 21/21] Update _extract.py

---
 src/imzy/_normalizations/_extract.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py
index a4c3d97..35d6202 100644
--- a/src/imzy/_normalizations/_extract.py
+++ b/src/imzy/_normalizations/_extract.py
@@ -112,15 +112,16 @@ def compute_normalizations(input_dir: Path, clean: bool = True, silent: bool = F
         norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32))
     norm_array = np.nan_to_num(norm_array, nan=1.0)
     # clean-up normalizations
-    for i in range(norm_array.shape[1]):
-        # get normalization
-        norm = norm_array[:, i]
-        if clean:
-            # remove outliers
-            mask = _get_outlier_mask(norm, 2)
-            norm[mask] = np.median(norm[mask])
-        # save the normalizations as 'multiplier' version so it's easier to apply
-        norm_array[:, i] = 1 / (norm / np.median(norm))
+    with np.errstate(invalid="ignore", divide="ignore"):
+        for i in range(norm_array.shape[1]):
+            # get normalization
+            norm = norm_array[:, i]
+            if clean:
+                # remove outliers
+                mask = _get_outlier_mask(norm, 2)
+                norm[mask] = np.median(norm[mask])
+            # save the normalizations as 'multiplier' version so it's easier to apply
+            norm_array[:, i] = 1 / (norm / np.median(norm))
     return norm_array
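
Taken together, patches 20 and 21 settle on one convention: compute_normalizations returns per-pixel factors already converted to "multiplier" form (1 / (norm / median(norm))), and _batch_normalize applies them with a single transposed multiply. Below is a minimal, self-contained sketch of that convention using a hypothetical TIC vector; the array values are invented for illustration, only NumPy is assumed, and the outlier clean-up step is omitted because the body of _get_outlier_mask is not shown in these hunks.

    import numpy as np

    # Hypothetical data: 4 pixels x 3 peaks. Rows 0-2 differ only by their TIC.
    intensities = np.array(
        [
            [1.0, 2.0, 3.0],
            [2.0, 4.0, 6.0],
            [4.0, 8.0, 12.0],
            [1.0, 1.0, 1.0],
        ],
        dtype=np.float32,
    )

    # Per-pixel TIC, i.e. px_norms[0] in calculate_normalizations_optimized.
    tic = intensities.sum(axis=1)

    # Store in "multiplier" form, as compute_normalizations does: dividing by
    # the median-scaled TIC becomes a single multiply when the store is queried.
    with np.errstate(invalid="ignore", divide="ignore"):
        multiplier = 1 / (tic / np.median(tic))

    # Apply the way _batch_normalize does: transpose so the pixel axis
    # broadcasts against the multiplier vector, then transpose back.
    normalized = np.multiply(intensities.T, multiplier).T

    # Rows 0-2 collapse onto the same values once TIC differences are removed.
    assert np.allclose(normalized[0], normalized[1])
    assert np.allclose(normalized[1], normalized[2])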
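
Patch 17's flatten methods are the inverse of the existing reshape path: the base reader indexes coordinates directly, while the imzML reader subtracts 1 because imzML coordinates are 1-based. A minimal round-trip sketch under that assumption, with invented coordinates and values:

    import numpy as np

    # Invented 1-based pixel coordinates, as imzML records them.
    x_coordinates = np.array([1, 2, 1, 2])
    y_coordinates = np.array([1, 1, 2, 2])
    values = np.array([10.0, 20.0, 30.0, 40.0], dtype=np.float32)

    # reshape: scatter the flat vector into a 2D image (0-based indexing).
    image = np.zeros((2, 2), dtype=np.float32)
    image[y_coordinates - 1, x_coordinates - 1] = values

    # flatten: gather the pixels back in acquisition order.
    roundtrip = image[y_coordinates - 1, x_coordinates - 1]
    assert np.array_equal(roundtrip, values)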