From 9d248178cd737d115bb96007d5e1ca54db53cead Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Mon, 4 Mar 2024 15:05:59 +0000
Subject: [PATCH 01/21] Add normalizations

---
 src/imzy/{ => _centroids}/_extract.py   |  49 +++-------
 src/imzy/_centroids/_hdf5_store.py      |  86 ++----------------
 src/imzy/_hdf5_mixin.py                 | 106 ++++++++++++++++++++++
 src/imzy/_normalizations/__init__.py    | 115 ++++++++++++++++++++++++
 src/imzy/_normalizations/_extract.py    |  19 ++++
 src/imzy/_normalizations/_hdf5_store.py |  22 +++++
 src/imzy/_readers/_base.py              |  35 ++++++-
 src/imzy/_typing.py                     |  12 +++
 src/imzy/utilities.py                   |  23 +++++
 tests/test_imzml.py                     |  10 +++
 tests/test_tdf.py                       |  11 +++
 tests/test_tsf.py                       |  11 +++
 12 files changed, 382 insertions(+), 117 deletions(-)
 rename src/imzy/{ => _centroids}/_extract.py (91%)
 create mode 100644 src/imzy/_hdf5_mixin.py
 create mode 100644 src/imzy/_normalizations/__init__.py
 create mode 100644 src/imzy/_normalizations/_extract.py
 create mode 100644 src/imzy/_normalizations/_hdf5_store.py
 create mode 100644 src/imzy/_typing.py

diff --git a/src/imzy/_extract.py b/src/imzy/_centroids/_extract.py
similarity index 91%
rename from src/imzy/_extract.py
rename to src/imzy/_centroids/_extract.py
index 4d945a7..dbc22d1 100644
--- a/src/imzy/_extract.py
+++ b/src/imzy/_centroids/_extract.py
@@ -1,4 +1,3 @@
-"""Methods to handle data extraction from any reader."""
 import typing as ty
 from pathlib import Path
 
@@ -8,13 +7,19 @@
 from koyo.utilities import chunks
 from tqdm import tqdm
 
+from imzy import get_reader
+from imzy._typing import SpatialInfo
+from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis
+from imzy.utilities import _safe_rmtree, parse_to_attribute
+
 try:
     import hdf5plugin
 except ImportError:
-    pass
 
-from imzy._readers import get_reader
-from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis
+    class hdf5plugin:
+        """Dummy class."""
+
+        LZ4 = lambda *args, **kwargs: {}
 
 
 def check_zarr() -> None:
@@ -154,15 +159,6 @@ def rechunk_zarr_array(
     _safe_rmtree(zarr_path)  # remove the temporary array
 
 
-def check_hdf5() -> None:
-    """Check whether Zarr, dask and rechunker are installed."""
-    try:
-        import h5py
-        import hdf5plugin
-    except ImportError:
-        raise ImportError("Please install `h5py` and `hdf5plugins` to continue. You can do `pip install imzy[hdf5]")
-
-
 def get_chunk_info(n_pixels: int, n_peaks: int, max_mem: float = 512) -> ty.Dict[int, np.ndarray]:
     """Get chunk size information for particular dataset."""
     import math
@@ -183,6 +179,7 @@ def create_centroids_hdf5(
     ppm: ty.Optional[float] = None,
     ys: ty.Optional[np.ndarray] = None,
     chunk_info: ty.Optional[ty.Dict[int, np.ndarray]] = None,
+    spatial_info: ty.Optional[SpatialInfo] = None,
 ) -> Path:
     """Create group with datasets inside."""
     from imzy._centroids import H5CentroidsStore
@@ -216,6 +213,9 @@ def create_centroids_hdf5(
         store._add_data_to_group(group, "mzs_max", mzs_max, maxshape=(None,), dtype=mzs_max.dtype)
         if ys is not None:
             store._add_data_to_group(group, "ys", ys, maxshape=(None,), dtype=ys.dtype)
+        if spatial_info is not None:
+            for key, value in spatial_info.items():
+                store._add_data_to_group(group, key, value, dtype=value.dtype)
         if chunk_info is None:
             store._add_data_to_group(
                 group,
@@ -288,26 +288,3 @@ def extract_centroids_hdf5(
             else:
                 temp[i] = accumulate_peaks_profile(extract_indices, y)
         store.update(temp, indices, chunk_id)
-
-
-def _safe_rmtree(path):
-    from shutil import rmtree
-
-    try:
-        rmtree(path)
-    except (OSError, FileNotFoundError):
-        pass
-
-
-def parse_from_attribute(attribute):
-    """Parse attribute from cache."""
-    if isinstance(attribute, str) and attribute == "__NONE__":
-        attribute = None
-    return attribute
-
-
-def parse_to_attribute(attribute):
-    """Parse attribute to cache."""
-    if attribute is None:
-        attribute = "__NONE__"
-    return attribute
diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py
index c3f7b50..ce84e4c 100644
--- a/src/imzy/_centroids/_hdf5_store.py
+++ b/src/imzy/_centroids/_hdf5_store.py
@@ -4,6 +4,8 @@
 
 import numpy as np
 
+from imzy._hdf5_mixin import HDF5Mixin
+
 try:
     import h5py
 except ImportError:
@@ -15,7 +17,7 @@
 from imzy._centroids._base import BaseCentroids
 
 
-class H5CentroidsStore(BaseCentroids):
+class H5CentroidsStore(HDF5Mixin, BaseCentroids):
     """HDF5-centroids."""
 
     # Private attributes
@@ -38,13 +40,11 @@ def __init__(
         mode: str = "a",
     ):
         super().__init__(xyz_coordinates, pixel_index, image_shape)
-        assert h5py is not None, "h5py is not installed."
-
-        self.path = path
-        self.mode = mode
+        self._init_hdf5(path, mode)
 
     @property
-    def is_low_mem(self):
+    def is_low_mem(self) -> bool:
+        """Get low memory flag."""
         return self._low_mem
 
     @is_low_mem.setter
@@ -55,6 +55,7 @@ def is_low_mem(self, value: bool):
 
     @property
     def is_chunked(self) -> bool:
+        """Get chunked flag."""
         if self._is_chunked is None:
             with self.open() as h5:
                 self._is_chunked = h5[self.PEAKS_KEY].attrs.get("is_chunked")
@@ -128,79 +129,6 @@ def update(self, array: np.ndarray, framelist: np.ndarray, chunk_id: ty.Optional
                 h5[self.PEAKS_ARRAY_KEY][framelist] = array
             h5.flush()
 
-    @contextmanager
-    def open(self, mode: ty.Optional[str] = None):
-        """Safely open storage."""
-        if mode is None:
-            mode = self.mode
-        try:
-            f_ptr = h5py.File(self.path, mode=mode, rdcc_nbytes=1024 * 1024 * 4)
-        except FileExistsError as err:
-            raise err
-        try:
-            yield f_ptr
-        finally:
-            f_ptr.close()
-
-    def flush(self):
-        """Flush data to disk."""
-        with self.open() as h5:
-            h5.flush()
-
-    @staticmethod
-    def _get_group(hdf, group_name: str, flush: bool = True):
-        try:
-            group = hdf[group_name]
-        except KeyError:
-            group = hdf.create_group(group_name)
-        if flush:
-            hdf.flush()
-        return group
-
-    @staticmethod
-    def _add_data_to_group(
-        group_obj,
-        dataset_name,
-        data,
-        dtype,
-        chunks=None,
-        maxshape=None,
-        compression=None,
-        compression_opts=None,
-        shape=None,
-    ):
-        """Add data to group."""
-        replaced_dataset = False
-
-        if dtype is None:
-            if hasattr(data, "dtype"):
-                dtype = data.dtype
-        if shape is None:
-            if hasattr(data, "shape"):
-                shape = data.shape
-
-        if dataset_name in list(group_obj.keys()):
-            if group_obj[dataset_name].dtype == dtype:
-                try:
-                    group_obj[dataset_name][:] = data
-                    replaced_dataset = True
-                except TypeError:
-                    del group_obj[dataset_name]
-            else:
-                del group_obj[dataset_name]
-
-        if not replaced_dataset:
-            group_obj.create_dataset(
-                dataset_name,
-                data=data,
-                dtype=dtype,
-                compression=compression,
-                chunks=chunks,
-                maxshape=maxshape,
-                compression_opts=compression_opts,
-                shape=shape,
-            )
-
 
 class LazyPeaksProxy:
     """Proxy class to enable similar interface to lazy-peaks."""
diff --git a/src/imzy/_hdf5_mixin.py b/src/imzy/_hdf5_mixin.py
new file mode 100644
index 0000000..555201e
--- /dev/null
+++ b/src/imzy/_hdf5_mixin.py
@@ -0,0 +1,106 @@
+"""Mixin class for HDF5 files."""
+from __future__ import annotations
+
+import typing as ty
+from contextlib import contextmanager
+
+try:
+    import h5py
+except ImportError:
+    h5py = None
+from koyo.typing import PathLike
+
+
+def check_hdf5() -> None:
+    """Check whether h5py and hdf5plugin are installed."""
+    try:
+        import h5py
+        import hdf5plugin
+    except ImportError:
+        raise ImportError("Please install `h5py` and `hdf5plugin` to continue. You can do `pip install imzy[hdf5]`.")
+
+
+class HDF5Mixin:
+    """Mixin class for HDF5 files."""
+
+    path: PathLike
+    mode: str
+
+    def _init_hdf5(self, path: PathLike, mode: str = "a"):
+        assert h5py is not None, "h5py is not installed."
+
+        self.path = path
+        self.mode = mode
+
+    @contextmanager
+    def open(self, mode: ty.Optional[str] = None) -> h5py.File:
+        """Safely open storage."""
+        if mode is None:
+            mode = self.mode
+        try:
+            f_ptr = h5py.File(self.path, mode=mode, rdcc_nbytes=1024 * 1024 * 4)
+        except FileExistsError as err:
+            raise err
+        try:
+            yield f_ptr
+        finally:
+            f_ptr.close()
+
+    def flush(self):
+        """Flush data to disk."""
+        with self.open() as h5:
+            h5.flush()
+
+    @staticmethod
+    def _get_group(hdf, group_name: str, flush: bool = True):
+        try:
+            group = hdf[group_name]
+        except KeyError:
+            group = hdf.create_group(group_name)
+        if flush:
+            hdf.flush()
+        return group
+
+    @staticmethod
+    def _add_data_to_group(
+        group_obj,
+        dataset_name,
+        data,
+        dtype,
+        chunks=None,
+        maxshape=None,
+        compression=None,
+        compression_opts=None,
+        shape=None,
+    ):
+        """Add data to group."""
+        replaced_dataset = False
+
+        if dtype is None:
+            if hasattr(data, "dtype"):
+                dtype = data.dtype
+        if shape is None:
+            if hasattr(data, "shape"):
+                shape = data.shape
+
+        if dataset_name in list(group_obj.keys()):
+            if group_obj[dataset_name].dtype == dtype:
+                try:
+                    group_obj[dataset_name][:] = data
+                    replaced_dataset = True
+                except TypeError:
+                    del group_obj[dataset_name]
+            else:
+                del group_obj[dataset_name]
+
+        if not replaced_dataset:
+            group_obj.create_dataset(
+                dataset_name,
+                data=data,
+                dtype=dtype,
+                compression=compression,
+                chunks=chunks,
+                maxshape=maxshape,
+                compression_opts=compression_opts,
+                shape=shape,
+            )
diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
new file mode 100644
index 0000000..f75415e
--- /dev/null
+++ b/src/imzy/_normalizations/__init__.py
@@ -0,0 +1,115 @@
+"""Normalizations."""
+from pathlib import Path
+
+import numba
+import numpy as np
+from koyo.typing import PathLike
+from tqdm import tqdm
+
+try:
+    import hdf5plugin
+except ImportError:
+
+    class hdf5plugin:
+        """Dummy class."""
+
+        LZ4 = lambda *args, **kwargs: {}
+
+
+from imzy._normalizations._extract import get_normalizations
+from imzy._normalizations._hdf5_store import H5NormalizationStore
+
+
+def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
+    """Create group with datasets inside."""
+    from imzy._readers import get_reader
+
+    reader = get_reader(input_dir)
+    n_pixels = reader.n_pixels
+
+    compression = hdf5plugin.LZ4()
+
+    if Path(hdf_path).suffix != ".h5":
+        hdf_path = Path(hdf_path).with_suffix(".h5")
+
+    store = H5NormalizationStore(hdf_path, mode="a")
+    with store.open() as h5:
+        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+        for normalization in get_normalizations():
+            group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression)
+        store.flush()
+    return hdf_path
+
+
+def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
+    """Extract normalizations from hdf5."""
+    normalization_names = get_normalizations()
+    normalizations = compute_normalizations(input_dir, silent=silent)
+
+    store = H5NormalizationStore(hdf_path, mode="a")
+    with store.open() as h5:
+        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+        for i, normalization in enumerate(normalization_names):
+            norm = normalizations[:, i]
+            group[normalization][:] = norm
+        store.flush()
+    return hdf_path
+
+
+def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
+    """Calculate normalizations for a set of frames."""
+    from imzy._readers import get_reader
+
+    reader = get_reader(input_dir)
+
+    # specify normalization names
+    names = get_normalizations()
+
+    # pre-assign array
+    n_frames = reader.n_pixels
+    framelist = reader.pixels
+    # start_frame, norm_array = check_cache()
+    norm_array = np.zeros((n_frames, len(names)), dtype=np.float32)
+    for i, (_, y) in enumerate(
+        tqdm(
+            reader.spectra_iter(framelist, silent=True),
+            disable=silent,
+            miniters=100,
+            mininterval=2,
+            total=len(framelist),
+            desc="Computing normalizations...",
+        )
+    ):
+        norm_array[i] = calculate_normalizations(y.astype(np.float32))
+    return norm_array
+
+
+@numba.njit(fastmath=True, cache=True)
+def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray:
+    """Calculate various normalizations.
+
+    This function expects spectrum in the form
+
+    """
+    px_norms = np.zeros(12, dtype=np.float32)
+    px_norms[0] = np.sum(spectrum)  # TIC
+    if px_norms[0] == 0:
+        return px_norms
+
+    px_norms[1] = np.sqrt(np.mean(np.square(spectrum)))  # RMS
+    px_norms[2] = np.median(spectrum[spectrum > 0])  # Median
+    _spectrum = spectrum[spectrum > 0]
+    if _spectrum.size > 1:
+        q95, q90, q10, q5 = np.nanquantile(_spectrum, [0.95, 0.9, 0.1, 0.05])
+    else:
+        q95, q90, q10, q5 = 0, 0, 0, 0
+    px_norms[3] = np.sum(spectrum[spectrum < q95])  # 0-95% TIC
+    px_norms[4] = np.sum(spectrum[spectrum < q90])  # 0-90% TIC
+    px_norms[5] = np.sum(spectrum[spectrum > q5])  # 5-100% TIC
+    px_norms[6] = np.sum(spectrum[spectrum > q10])  # 10-100% TIC
+    px_norms[7] = np.sum(spectrum[(spectrum > q5) & (spectrum < q95)])  # 5-95% TIC
+    px_norms[8] = np.sum(spectrum[(spectrum > q10) & (spectrum < q90)])  # 10-90% TIC
+    px_norms[9] = np.linalg.norm(spectrum, 0)  # 0-norm
+    px_norms[10] = np.linalg.norm(spectrum, 2)  # 2-norm
+    px_norms[11] = np.linalg.norm(spectrum, 3)  # 3-norm
+    return px_norms
diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py
new file mode 100644
index 0000000..e9857aa
--- /dev/null
+++ b/src/imzy/_normalizations/_extract.py
@@ -0,0 +1,19 @@
+"""Get normalization utilities."""
+
+
+def get_normalizations() -> list[str]:
+    """Get list of available normalizations."""
+    return [
+        "TIC",
+        "RMS",
+        "Median",
+        "0-95% TIC",
+        "0-90% TIC",
+        "5-100% TIC",
+        "10-100% TIC",
+        "5-95% TIC",
+        "10-90% TIC",
+        "0-norm",
+        "2-norm",
+        "3-norm",
+    ]
diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
new file mode 100644
index 0000000..89a523c
--- /dev/null
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -0,0 +1,22 @@
+"""HDF5 store for normalizations."""
+import numpy as np
+
+from imzy._hdf5_mixin import HDF5Mixin
+
+
+class H5NormalizationStore(HDF5Mixin):
+    """HDF5 store for normalizations."""
+
+    NORMALIZATIONS_KEY: str = "Normalizations"
+
+    def __init__(self, path: str, mode: str = "a"):
+        self._init_hdf5(path, mode)
+
+    def get_normalization(self) -> np.ndarray:
+        """Get normalization data."""
+
+    def get_normalization_list(self) -> list[str]:
+        """Get list of available normalizations."""
+        with self.open("r") as h5:
+            normalizations = list(h5[self.NORMALIZATIONS_KEY].keys())
+        return normalizations
diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index 3c6f4b4..c998898 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -298,7 +298,12 @@ def to_zarr(
         silent: bool = False,
     ) -> Path:
         """Export many ion images for specified m/z values (+ tolerance) to Zarr array."""
-        from imzy._extract import check_zarr, create_centroids_zarr, extract_centroids_zarr, rechunk_zarr_array
+        from imzy._centroids._extract import (
+            check_zarr,
+            create_centroids_zarr,
+            extract_centroids_zarr,
+            rechunk_zarr_array,
+        )
 
         if not as_flat:
             raise ValueError("Only flat images are supported at the moment.")
@@ -361,7 +366,8 @@ def to_hdf5(
         silent: bool = False,
     ) -> Path:
         """Export many ion images for specified m/z values (+ tolerance) to a HDF5 store."""
-        from imzy._extract import check_hdf5, create_centroids_hdf5, extract_centroids_hdf5, get_chunk_info
+        from imzy._centroids._extract import create_centroids_hdf5, extract_centroids_hdf5, get_chunk_info
+        from imzy._hdf5_mixin import check_hdf5
 
         if not as_flat:
             raise ValueError("Only flat images are supported at the moment.")
@@ -396,6 +402,11 @@ def to_hdf5(
             ppm=ppm,
             tol=tol,
             chunk_info=chunk_info,
+            spatial_info={
+                "x_coordinates": self.x_coordinates,
+                "y_coordinates": self.y_coordinates,
+                "shape": self.image_shape,
+            },
         )
         extract_centroids_hdf5(
             input_dir=self.path,
@@ -407,6 +418,26 @@ def to_hdf5(
         )
         return hdf_path
 
+    extract_centroids_hdf5 = to_hdf5
+
+    def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False):
+        """Extract normalizations."""
+        from imzy._hdf5_mixin import check_hdf5
+        from imzy._normalizations import create_normalizations_hdf5, extract_normalizations_hdf5
+
+        check_hdf5()
+        hdf_path = Path(hdf_path)
+        if not hdf_path.suffix == ".h5":
+            hdf_path = hdf_path.with_suffix(".h5")
+        if not hdf_path.exists():
+            hdf_path = create_normalizations_hdf5(self.path, hdf_path)
+        hdf_path = extract_normalizations_hdf5(
+            input_dir=self.path,
+            hdf_path=hdf_path,
+            silent=silent,
+        )
+        return hdf_path
+
     def spectra_iter(
         self, indices: ty.Optional[ty.Iterable[int]] = None, silent: bool = False
     ) -> ty.Generator[ty.Tuple[np.ndarray, np.ndarray], None, None]:
diff --git a/src/imzy/_typing.py b/src/imzy/_typing.py
new file mode 100644
index 0000000..952803a
--- /dev/null
+++ b/src/imzy/_typing.py
@@ -0,0 +1,12 @@
+"""Typing for the package."""
+import typing as ty
+
+import numpy as np
+
+
+class SpatialInfo(ty.TypedDict):
+    """Spatial data."""
+
+    x_coordinates: np.ndarray
+    y_coordinates: np.ndarray
+    shape: ty.Tuple[int, int]
diff --git a/src/imzy/utilities.py b/src/imzy/utilities.py
index 40adb1d..25d5e09 100644
--- a/src/imzy/utilities.py
+++ b/src/imzy/utilities.py
@@ -38,3 +38,26 @@ def get_rois_from_bruker_d(path: PathLike) -> ty.List[int]:
     cursor.close()
     conn.close()
     return list(range(0, last_roi + 1))
+
+
+def _safe_rmtree(path):
+    from shutil import rmtree
+
+    try:
+        rmtree(path)
+    except (OSError, FileNotFoundError):
+        pass
+
+
+def parse_from_attribute(attribute):
+    """Parse attribute from cache."""
+    if isinstance(attribute, str) and attribute == "__NONE__":
+        attribute = None
+    return attribute
+
+
+def parse_to_attribute(attribute):
+    """Parse attribute to cache."""
+    if attribute is None:
+        attribute = "__NONE__"
+    return attribute
diff --git a/tests/test_imzml.py b/tests/test_imzml.py
index f36f1f3..24a9dac 100644
--- a/tests/test_imzml.py
+++ b/tests/test_imzml.py
@@ -78,3 +78,13 @@ def test_to_h5(path, tmp_path):
     h5_path = reader.to_hdf5(h5_temp, mzs, tol=0.5)
     assert h5_path.exists()
     assert h5_temp != h5_path
+
+
+@pytest.mark.parametrize("path", get_imzml_data())
+def test_norms(path, tmp_path):
+    reader = IMZMLReader(path)
+
+    h5_temp = tmp_path / "output"  # forgot to include .h5 extension
+    h5_path = reader.extract_normalizations_hdf5(h5_temp)
+    assert h5_path.exists()
+    assert h5_temp != h5_path
diff --git a/tests/test_tdf.py b/tests/test_tdf.py
index 1f85a4b..5967137 100644
--- a/tests/test_tdf.py
+++ b/tests/test_tdf.py
@@ -83,3 +83,14 @@ def test_to_h5(path, tmp_path):
     h5_path = reader.to_hdf5(h5_temp, mzs, tol=0.5)
     assert h5_path.exists()
     assert h5_temp != h5_path
+
+
+@pytest.mark.skipif(IS_MAC, reason="Bruker reader is not supported on macOS.")
+@pytest.mark.parametrize("path", get_tdf_data())
+def test_norms(path, tmp_path):
+    reader = TDFReader(path)
+
+    h5_temp = tmp_path / "output"  # forgot to include .h5 extension
+    h5_path = reader.extract_normalizations_hdf5(h5_temp)
+    assert h5_path.exists()
+    assert h5_temp != h5_path
diff --git a/tests/test_tsf.py b/tests/test_tsf.py
index 43a0c2b..fbca5e2 100644
--- a/tests/test_tsf.py
+++ b/tests/test_tsf.py
@@ -81,3 +81,14 @@ def test_to_h5(path, tmp_path):
     h5_path = reader.to_hdf5(h5_temp, mzs, tol=0.5)
     assert h5_path.exists()
     assert h5_temp != h5_path
+
+
+@pytest.mark.skipif(IS_MAC, reason="Bruker reader is not supported on macOS.")
+@pytest.mark.parametrize("path", get_tsf_data())
+def test_norms(path, tmp_path):
+    reader = TSFReader(path)
+
+    h5_temp = tmp_path / "output"  # forgot to include .h5 extension
+    h5_path = reader.extract_normalizations_hdf5(h5_temp)
+    assert h5_path.exists()
+    assert h5_temp != h5_path

From 7ee584089683268f1dcd4ae5d6283de848ba7ddb Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Mon, 4 Mar 2024 18:18:58 +0000
Subject: [PATCH 02/21] Update __init__.py

---
 src/imzy/_normalizations/__init__.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index f75415e..44ab368 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -1,4 +1,5 @@
 """Normalizations."""
+import warnings
 from pathlib import Path
 
 import numba
@@ -41,6 +42,15 @@ def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
     return hdf_path
 
 
+def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray:
+    """Retrieve normalization and determine outliers."""
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", RuntimeWarning)
+        norm = np.log10(norm + 1)
+        mask = norm < (norm.max() - n_orders)
+    return mask
+
+
 def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
     """Extract normalizations from hdf5."""
     normalization_names = get_normalizations()
@@ -50,8 +60,13 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent:
     with store.open() as h5:
         group = store._get_group(h5, store.NORMALIZATIONS_KEY)
         for i, normalization in enumerate(normalization_names):
+            # get normalization
             norm = normalizations[:, i]
-            group[normalization][:] = norm
+            # remove outliers
+            mask = _get_outlier_mask(norm, 2)
+            norm[mask] = np.median(norm[mask])
+            # save the normalizations as 'multiplier' version so it's easier to apply
+            group[normalization][:] = 1 / (norm / np.median(norm))
         store.flush()
     return hdf_path

From 528ccf4158f7843bec059101fbd14fc10526e080 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Mon, 4 Mar 2024 20:36:14 +0100
Subject: [PATCH 03/21] Update _base.py

---
 src/imzy/_readers/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index c998898..6402ed4 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -430,7 +430,7 @@ def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False):
         if not hdf_path.suffix == ".h5":
             hdf_path = hdf_path.with_suffix(".h5")
         if not hdf_path.exists():
-            hdf_path = create_normalizations_hdf5(self.path, hdf_path)
+            # hdf_path = create_normalizations_hdf5(self.path, hdf_path)
         hdf_path = extract_normalizations_hdf5(
             input_dir=self.path,
             hdf_path=hdf_path,

From 9d9ab3fc25a8e62709297cbeb0ece3f3f2672838 Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Mon, 4 Mar 2024 19:42:12 +0000
Subject: [PATCH 04/21] Fixes

---
 src/imzy/_centroids/_extract.py      |  8 +++++---
 src/imzy/_centroids/_hdf5_store.py   |  1 +
 src/imzy/_normalizations/__init__.py |  3 ++-
 src/imzy/_readers/_base.py           | 11 ++++-------
 src/imzy/_typing.py                  |  3 ++-
 5 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py
index dbc22d1..7c879be 100644
--- a/src/imzy/_centroids/_extract.py
+++ b/src/imzy/_centroids/_extract.py
@@ -213,9 +213,6 @@ def create_centroids_hdf5(
         store._add_data_to_group(group, "mzs_max", mzs_max, maxshape=(None,), dtype=mzs_max.dtype)
         if ys is not None:
             store._add_data_to_group(group, "ys", ys, maxshape=(None,), dtype=ys.dtype)
-        if spatial_info is not None:
-            for key, value in spatial_info.items():
-                store._add_data_to_group(group, key, value, dtype=value.dtype)
         if chunk_info is None:
             store._add_data_to_group(
                 group,
@@ -240,6 +237,11 @@ def create_centroids_hdf5(
                 dtype=np.float32,
                 **compression,
             )
+        group = store._get_group(h5, store.SPATIAL_KEY)
+        if spatial_info is not None:
+            group.attrs["pixel_size"] = spatial_info.get("pixel_size", 1.0)
+            for key, value in spatial_info.items():
+                store._add_data_to_group(group, key, value, dtype=value.dtype)
     return Path(hdf_path)
diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py
index ce84e4c..235c6f8 100644
--- a/src/imzy/_centroids/_hdf5_store.py
+++ b/src/imzy/_centroids/_hdf5_store.py
@@ -23,6 +23,7 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids):
     # Private attributes
     PEAKS_KEY = "Array"
     PEAKS_ARRAY_KEY = "Array/array"
+    SPATIAL_KEY = "Misc/Spatial"
 
     # Cache attributes
     _chunk_info = None
diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 44ab368..24d8b17 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -67,7 +67,7 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent:
             norm[mask] = np.median(norm[mask])
             # save the normalizations as 'multiplier' version so it's easier to apply
             group[normalization][:] = 1 / (norm / np.median(norm))
-        store.flush()
+    store.flush()
     return hdf_path
@@ -96,6 +96,7 @@ def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
         )
     ):
         norm_array[i] = calculate_normalizations(y.astype(np.float32))
+    norm_array = np.nan_to_num(norm_array, nan=1.0)
    return norm_array
diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index 6402ed4..edf67cb 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -405,7 +405,8 @@ def to_hdf5(
             spatial_info={
                 "x_coordinates": self.x_coordinates,
                 "y_coordinates": self.y_coordinates,
-                "shape": self.image_shape,
+                "image_shape": self.image_shape,
+                "pixel_size": self.pixel_size,
             },
         )
         extract_centroids_hdf5(
@@ -430,12 +431,8 @@ def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False):
         if not hdf_path.suffix == ".h5":
             hdf_path = hdf_path.with_suffix(".h5")
         if not hdf_path.exists():
-            # hdf_path = create_normalizations_hdf5(self.path, hdf_path)
-        hdf_path = extract_normalizations_hdf5(
-            input_dir=self.path,
-            hdf_path=hdf_path,
-            silent=silent,
-        )
+            hdf_path = create_normalizations_hdf5(self.path, hdf_path)
+        hdf_path = extract_normalizations_hdf5(input_dir=self.path, hdf_path=hdf_path, silent=silent)
         return hdf_path
diff --git a/src/imzy/_typing.py b/src/imzy/_typing.py
index 952803a..ae8dffe 100644
--- a/src/imzy/_typing.py
+++ b/src/imzy/_typing.py
@@ -9,4 +9,5 @@ class SpatialInfo(ty.TypedDict):
 
     x_coordinates: np.ndarray
     y_coordinates: np.ndarray
-    shape: ty.Tuple[int, int]
+    image_shape: ty.Tuple[int, int]
+    pixel_size: float

From c9e95b3f48f3285376e75f6095bb6ac8a0126a03 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Mon, 4 Mar 2024 22:45:51 +0100
Subject: [PATCH 05/21] Update __init__.py

---
 src/imzy/_normalizations/__init__.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 24d8b17..d86c739 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -57,17 +57,18 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent:
     normalizations = compute_normalizations(input_dir, silent=silent)
 
     store = H5NormalizationStore(hdf_path, mode="a")
-    with store.open() as h5:
-        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
-        for i, normalization in enumerate(normalization_names):
-            # get normalization
-            norm = normalizations[:, i]
-            # remove outliers
-            mask = _get_outlier_mask(norm, 2)
-            norm[mask] = np.median(norm[mask])
-            # save the normalizations as 'multiplier' version so it's easier to apply
-            group[normalization][:] = 1 / (norm / np.median(norm))
+    with np.errstate(invalid="ignore", divide="ignore"):
+        with store.open() as h5:
+            group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+            for i, normalization in enumerate(normalization_names):
+                # get normalization
+                norm = normalizations[:, i]
+                # remove outliers
+                mask = _get_outlier_mask(norm, 2)
+                norm[mask] = np.median(norm[mask])
+                # save the normalizations as 'multiplier' version so it's easier to apply
+                group[normalization][:] = 1 / (norm / np.median(norm))
+            store.flush()
     store.flush()
     return hdf_path
@@ -104,8 +105,7 @@ def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray:
     """Calculate various normalizations.
 
-    This function expects spectrum in the form
-
+    This function expects float32 spectrum.
""" px_norms = np.zeros(12, dtype=np.float32) px_norms[0] = np.sum(spectrum) # TIC From b812daa7a58b897d69a148cb6ba9279c91abe0ac Mon Sep 17 00:00:00 2001 From: Lukasz Migas Date: Tue, 5 Mar 2024 09:35:44 +0100 Subject: [PATCH 06/21] Improvements --- src/imzy/_centroids/_extract.py | 6 ++-- src/imzy/_normalizations/__init__.py | 49 +++++++++++++++++++++++++++- src/imzy/_readers/_base.py | 2 +- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py index 7c879be..21b1a7c 100644 --- a/src/imzy/_centroids/_extract.py +++ b/src/imzy/_centroids/_extract.py @@ -239,7 +239,7 @@ def create_centroids_hdf5( ) group = store._get_group(h5, store.SPATIAL_KEY) if spatial_info is not None: - group.attrs["pixel_size"] = spatial_info.get("pixel_size", 1.0) + group.attrs["pixel_size"] = spatial_info.pop("pixel_size", 1.0) for key, value in spatial_info.items(): store._add_data_to_group(group, key, value, dtype=value.dtype) return Path(hdf_path) @@ -280,8 +280,8 @@ def extract_centroids_hdf5( indices, disable=silent, desc=f"Extracting {n_peaks} peaks (chunk={chunk_id+1}/{n_chunks})", - miniters=25, - mininterval=0.2, + miniters=250, + mininterval=2, ) ): x, y = reader[index] diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py index d86c739..0e48113 100644 --- a/src/imzy/_normalizations/__init__.py +++ b/src/imzy/_normalizations/__init__.py @@ -96,7 +96,7 @@ def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray: desc="Computing normalizations...", ) ): - norm_array[i] = calculate_normalizations(y.astype(np.float32)) + norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32)) norm_array = np.nan_to_num(norm_array, nan=1.0) return norm_array @@ -129,3 +129,50 @@ def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray: px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm return px_norms + + +@numba.njit(fastmath=True, cache=True) +def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray: + """Calculate various normalizations, optimized version. + + This function expects a float32 spectrum. 
+ """ + px_norms = np.zeros(12, dtype=np.float32) + + # Direct computation without conditions + px_norms[0] = np.sum(spectrum) # TIC + if px_norms[0] == 0: + return px_norms + + # Compute these norms directly without extra conditions + px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS + px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm + px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm + px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm + + # Filter positive values once and reuse + positive_spectrum = spectrum[spectrum > 0] + px_norms[2] = np.median(positive_spectrum) if positive_spectrum.size > 0 else 0 # Median + + # Calculating quantiles once for all needed + if positive_spectrum.size > 1: + q95, q90, q10, q5 = np.nanquantile(positive_spectrum, [0.95, 0.9, 0.1, 0.05]) + else: + q95, q90, q10, q5 = 0, 0, 0, 0 + + # Using logical indexing with boolean arrays might be faster due to numba optimization + condition_q95 = spectrum < q95 + condition_q90 = spectrum < q90 + condition_q5 = spectrum > q5 + condition_q10 = spectrum > q10 + + px_norms[3] = np.sum(spectrum[condition_q95]) # 0-95% TIC + px_norms[4] = np.sum(spectrum[condition_q90]) # 0-90% TIC + px_norms[5] = np.sum(spectrum[condition_q5]) # 5-100% TIC + px_norms[6] = np.sum(spectrum[condition_q10]) # 10-100% TIC + + # For ranges, we can combine conditions + px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC + px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC + + return px_norms diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index edf67cb..f83a976 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -405,7 +405,7 @@ def to_hdf5( spatial_info={ "x_coordinates": self.x_coordinates, "y_coordinates": self.y_coordinates, - "image_shape": self.image_shape, + "image_shape": np.asarray(self.image_shape), "pixel_size": self.pixel_size, }, ) From 09d4b7503c3dfa4b5c3bfcda1e9575eeeaddf8b8 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 5 Mar 2024 08:54:35 +0000 Subject: [PATCH 07/21] Update _hdf5_store.py --- src/imzy/_normalizations/_hdf5_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py index 89a523c..6b3800b 100644 --- a/src/imzy/_normalizations/_hdf5_store.py +++ b/src/imzy/_normalizations/_hdf5_store.py @@ -1,5 +1,6 @@ """HDF5 store for normalizations.""" import numpy as np +from koyo.typing import PathLike from imzy._hdf5_mixin import HDF5Mixin @@ -9,7 +10,7 @@ class H5NormalizationStore(HDF5Mixin): NORMALIZATIONS_KEY: str = "Normalizations" - def __init__(self, path: str, mode: str = "a"): + def __init__(self, path: PathLike, mode: str = "a"): self._init_hdf5(path, mode) def get_normalization(self) -> np.ndarray: From f0f7438a1e1221b0e8481735c6c5cbab60f29e11 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 5 Mar 2024 10:32:40 +0000 Subject: [PATCH 08/21] Fixes --- src/imzy/_centroids/_extract.py | 3 +- src/imzy/_centroids/_hdf5_store.py | 56 ++++++++++++++++++++++++ src/imzy/_hdf5_mixin.py | 27 ++++++++++++ src/imzy/_normalizations/_hdf5_store.py | 57 +++++++++++++++++++++++-- src/imzy/utilities.py | 13 +----- 5 files changed, 139 insertions(+), 17 deletions(-) diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py index 21b1a7c..ece9303 100644 --- a/src/imzy/_centroids/_extract.py +++ b/src/imzy/_centroids/_extract.py @@ -8,9 +8,10 @@ from tqdm import tqdm from imzy import get_reader +from 
imzy._hdf5_mixin import parse_to_attribute from imzy._typing import SpatialInfo from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis -from imzy.utilities import _safe_rmtree, parse_to_attribute +from imzy.utilities import _safe_rmtree try: import hdf5plugin diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py index 235c6f8..9a0edc5 100644 --- a/src/imzy/_centroids/_hdf5_store.py +++ b/src/imzy/_centroids/_hdf5_store.py @@ -17,6 +17,16 @@ from imzy._centroids._base import BaseCentroids +def format_ppm(mz: float, ppm: float): + """Create formatted ppm'¬s.""" + return f"{mz:.3f} Da ± {ppm:.1f}ppm" + + +def format_tol(mz: float, mda: float): + """Create formatted label.""" + return f"{mz:.3f} Da ± {mda:.3f}Da" + + class H5CentroidsStore(HDF5Mixin, BaseCentroids): """HDF5-centroids.""" @@ -31,6 +41,8 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids): _proxy = None _is_chunked = None _low_mem: bool = True + _ion_labels = None + _mz, _bins, _ppm, _eppm, _mda = None, None, None, None, None def __init__( self, @@ -105,6 +117,50 @@ def lazy_peaks(self): group = h5[self.PEAKS_KEY] yield group["array"] + @property + def tol(self) -> int: + """Get Da tolerance the data was extracted at.""" + if self._tol is None: + self._tol = self.get_attr(self.PEAKS_KEY, "tol") + return self._tol + + @property + def ppm(self) -> float: + """Get ppm/bins the data was extracted at.""" + if self._ppm is None: + self._ppm = self.get_attr(self.PEAKS_KEY, "ppm") + return self._ppm + + def _setup_labels(self): + """Setup labels.""" + if self._ion_labels is None: + mzs = self.xs + ppm = self.ppm + tol = self.tol + self._ion_labels = { + "ppm": [format_ppm(mz, ppm) for mz in mzs] if ppm else [], + "tol": [format_tol(mz, tol) for i, mz in enumerate(mzs)] if tol is not None else [], + } + if not self._ion_labels["ppm"]: + self._ion_labels["ppm"] = self._ion_labels["tol"] + + def index_to_name(self, index: int, effective: bool = True, mz: bool = False): + """Convert index to ion name.""" + if self._ion_labels is None: + self._setup_labels() + return self._ion_labels["ppm"][index] + + @property + def labels(self) -> ty.Tuple[np.ndarray, ty.Optional[np.ndarray], ty.Optional[np.ndarray]]: + """Return all available labels.""" + return self.xs, None, None + + def lazy_iter(self): + """Lazily yield ion images.""" + with self.lazy_peaks() as peaks: + for i, mz in enumerate(self.xs): + yield mz, peaks[:, i] + def get_ion(self, value: ty.Union[int, float]) -> np.ndarray: """Retrieve single ion.""" if isinstance(value, float): diff --git a/src/imzy/_hdf5_mixin.py b/src/imzy/_hdf5_mixin.py index 555201e..05d8351 100644 --- a/src/imzy/_hdf5_mixin.py +++ b/src/imzy/_hdf5_mixin.py @@ -11,6 +11,20 @@ from koyo.typing import PathLike +def parse_from_attribute(attribute): + """Parse attribute from cache.""" + if isinstance(attribute, str) and attribute == "__NONE__": + attribute = None + return attribute + + +def parse_to_attribute(attribute): + """Parse attribute to cache.""" + if attribute is None: + attribute = "__NONE__" + return attribute + + def check_hdf5() -> None: """Check whether Zarr, dask and rechunker are installed.""" try: @@ -104,3 +118,16 @@ def _add_data_to_group( compression_opts=compression_opts, shape=shape, ) + + def get_attr(self, dataset_name, attr: str, default=None): + """Safely retrieve 1 attribute.""" + with self.open("r") as h5: + group = self._get_group(h5, dataset_name) + value = parse_from_attribute(group.attrs.get(attr, 
default)) + return value + + def get_array(self, dataset_name: str, key: str) -> np.ndarray: + """Safely retrieve 1 array.""" + with self.open("r") as h5: + group = self._get_group(h5, dataset_name) + return group[key][:] diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py index 6b3800b..d881870 100644 --- a/src/imzy/_normalizations/_hdf5_store.py +++ b/src/imzy/_normalizations/_hdf5_store.py @@ -1,6 +1,7 @@ """HDF5 store for normalizations.""" import numpy as np from koyo.typing import PathLike +from loguru import logger from imzy._hdf5_mixin import HDF5Mixin @@ -8,16 +9,64 @@ class H5NormalizationStore(HDF5Mixin): """HDF5 store for normalizations.""" - NORMALIZATIONS_KEY: str = "Normalizations" + NORMALIZATION_KEY: str = "Normalization" def __init__(self, path: PathLike, mode: str = "a"): self._init_hdf5(path, mode) - def get_normalization(self) -> np.ndarray: - """Get normalization data.""" + def __getitem__(self, item: str) -> np.ndarray: + """Retrieve item.""" + return self.get_normalization(item) + + def __call__(self, array: np.ndarray, *args, **kwargs): + """Apply normalization.""" + return self.normalize(array, *args, **kwargs) def get_normalization_list(self) -> list[str]: """Get list of available normalizations.""" with self.open("r") as h5: - normalizations = list(h5[self.NORMALIZATIONS_KEY].keys()) + normalizations = list(h5[self.NORMALIZATION_KEY].keys()) return normalizations + + def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bool = True) -> np.ndarray: + """Get normalization array.""" + if not name.startswith(f"{self.NORMALIZATION_KEY}/"): + name = f"{self.NORMALIZATION_KEY}/{name}" + try: + normalization = self.get_array(name, "normalization") + except KeyError: + raise KeyError(f"Normalization '{name}' not found in store.") + return normalization + + def normalize(self, array: np.ndarray, name: str, as_median: bool = True, **kwargs) -> np.ndarray: + """Normalize array.""" + + def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarray: + """Apply normalization to an array. + + Parameters + ---------- + array : np.ndarray + array to be reshaped + norm : str + name of the normalization to be applied to the array + + Returns + ------- + array : np.ndarray + normalize array. If 'reshape' is True it will be automatically reshaped to a 2D representation + """ + if array.ndim > 2: + raise ValueError("Array should be 2D or 1D") + try: + norm_array = self.get_normalization(norm, as_multiplier=True) + array = np.multiply(array, norm_array) + except ValueError: + logger.warning(f"Normalization '{norm}' not found. 
Skipping normalization.") + return _postprocess(array) + + +def _postprocess(array: np.ndarray): + # array[np.isnan(array)] = 0 + array[np.isinf(array)] = 0 + return array diff --git a/src/imzy/utilities.py b/src/imzy/utilities.py index 25d5e09..b4989c0 100644 --- a/src/imzy/utilities.py +++ b/src/imzy/utilities.py @@ -49,15 +49,4 @@ def _safe_rmtree(path): pass -def parse_from_attribute(attribute): - """Parse attribute from cache.""" - if isinstance(attribute, str) and attribute == "__NONE__": - attribute = None - return attribute - - -def parse_to_attribute(attribute): - """Parse attribute to cache.""" - if attribute is None: - attribute = "__NONE__" - return attribute + From fdeb2b25bc57ef562ecd5317cd3388aaae3f7072 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 5 Mar 2024 10:37:59 +0000 Subject: [PATCH 09/21] Fixes --- src/imzy/_normalizations/__init__.py | 2 +- src/imzy/_normalizations/_hdf5_store.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py index 0e48113..22c2198 100644 --- a/src/imzy/_normalizations/__init__.py +++ b/src/imzy/_normalizations/__init__.py @@ -59,7 +59,7 @@ def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: store = H5NormalizationStore(hdf_path, mode="a") with np.errstate(invalid="ignore", divide="ignore"): with store.open() as h5: - group = store._get_group(h5, store.NORMALIZATIONS_KEY) + group = store._get_group(h5, store.NORMALIZATION_KEY) for i, normalization in enumerate(normalization_names): # get normalization norm = normalizations[:, i] diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py index d881870..0a1a934 100644 --- a/src/imzy/_normalizations/_hdf5_store.py +++ b/src/imzy/_normalizations/_hdf5_store.py @@ -38,8 +38,9 @@ def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bo raise KeyError(f"Normalization '{name}' not found in store.") return normalization - def normalize(self, array: np.ndarray, name: str, as_median: bool = True, **kwargs) -> np.ndarray: + def normalize(self, array: np.ndarray, name: str, **kwargs) -> np.ndarray: """Normalize array.""" + return self._single_normalize(array, name, **kwargs) def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarray: """Apply normalization to an array. 
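
With PATCH 09 in place, the normalization store is usable end to end. A minimal usage sketch under stated assumptions (the dataset path and the flat image array are hypothetical, not from the patches; the calls are the reader method from PATCH 01 and the store API finalized above; normalizations are stored as per-pixel multipliers, 1 / (norm / median), per PATCH 02):

    import numpy as np
    from imzy import get_reader
    from imzy._normalizations._hdf5_store import H5NormalizationStore

    # compute and cache per-pixel normalizations (hypothetical paths)
    reader = get_reader("example.imzML")
    norm_path = reader.extract_normalizations_hdf5("example_norms.h5")

    # open the store and inspect which normalizations were written
    store = H5NormalizationStore(norm_path, mode="r")
    print(store.get_normalization_list())  # e.g. ["0-90% TIC", ..., "TIC"]

    # applying a normalization is a per-pixel multiplication of a flat image
    flat_image = np.random.rand(reader.n_pixels).astype(np.float32)  # hypothetical ion image
    normalized = store.normalize(flat_image, "TIC")
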
From 7c8d173deea7cce5ae0659771256bd970fcdfcc3 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Tue, 5 Mar 2024 11:59:49 +0100
Subject: [PATCH 10/21] Update _hdf5_store.py

---
 src/imzy/_centroids/_hdf5_store.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py
index 9a0edc5..d912c7b 100644
--- a/src/imzy/_centroids/_hdf5_store.py
+++ b/src/imzy/_centroids/_hdf5_store.py
@@ -42,7 +42,8 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids):
     _is_chunked = None
     _low_mem: bool = True
     _ion_labels = None
-    _mz, _bins, _ppm, _eppm, _mda = None, None, None, None, None
+    _tol, _ppm = None, None
+    unique_id: str = ""
 
     def __init__(
         self,

From 98067d98f069fa0012b2f71fac93fda9a0231610 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Tue, 5 Mar 2024 12:00:15 +0100
Subject: [PATCH 11/21] Update __init__.py

---
 src/imzy/_normalizations/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 22c2198..68751ef 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -35,7 +35,7 @@ def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
 
     store = H5NormalizationStore(hdf_path, mode="a")
     with store.open() as h5:
-        group = store._get_group(h5, store.NORMALIZATIONS_KEY)
+        group = store._get_group(h5, store.NORMALIZATION_KEY)
         for normalization in get_normalizations():
             group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression)
         store.flush()

From 8132f94011a3f6c86f418ec8ac5d6abe4d058d49 Mon Sep 17 00:00:00 2001
From: lukasz-migas
Date: Tue, 5 Mar 2024 11:23:09 +0000
Subject: [PATCH 12/21] Fixes

---
 src/imzy/_normalizations/__init__.py | 175 --------------------------
 src/imzy/_normalizations/_extract.py | 177 +++++++++++++++++++++++++++
 src/imzy/_readers/_base.py           |   3 +-
 3 files changed, 179 insertions(+), 176 deletions(-)

diff --git a/src/imzy/_normalizations/__init__.py b/src/imzy/_normalizations/__init__.py
index 68751ef..fba7cad 100644
--- a/src/imzy/_normalizations/__init__.py
+++ b/src/imzy/_normalizations/__init__.py
@@ -1,178 +1,3 @@
 """Normalizations."""
-import warnings
-from pathlib import Path
-
-import numba
-import numpy as np
-from koyo.typing import PathLike
-from tqdm import tqdm
-
-try:
-    import hdf5plugin
-except ImportError:
-
-    class hdf5plugin:
-        """Dummy class."""
-
-        LZ4 = lambda *args, **kwargs: {}
-
-
 from imzy._normalizations._extract import get_normalizations
 from imzy._normalizations._hdf5_store import H5NormalizationStore
-
-
-def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path:
-    """Create group with datasets inside."""
-    from imzy._readers import get_reader
-
-    reader = get_reader(input_dir)
-    n_pixels = reader.n_pixels
-
-    compression = hdf5plugin.LZ4()
-
-    if Path(hdf_path).suffix != ".h5":
-        hdf_path = Path(hdf_path).with_suffix(".h5")
-
-    store = H5NormalizationStore(hdf_path, mode="a")
-    with store.open() as h5:
-        group = store._get_group(h5, store.NORMALIZATION_KEY)
-        for normalization in get_normalizations():
-            group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression)
-        store.flush()
-    return hdf_path
-
-
-def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray:
-    """Retrieve normalization and determine outliers."""
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", RuntimeWarning)
-        norm = np.log10(norm + 1)
-        mask = norm < (norm.max() - n_orders)
-    return mask
-
-
-def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
-    """Extract normalizations from hdf5."""
-    normalization_names = get_normalizations()
-    normalizations = compute_normalizations(input_dir, silent=silent)
-
-    store = H5NormalizationStore(hdf_path, mode="a")
-    with np.errstate(invalid="ignore", divide="ignore"):
-        with store.open() as h5:
-            group = store._get_group(h5, store.NORMALIZATION_KEY)
-            for i, normalization in enumerate(normalization_names):
-                # get normalization
-                norm = normalizations[:, i]
-                # remove outliers
-                mask = _get_outlier_mask(norm, 2)
-                norm[mask] = np.median(norm[mask])
-                # save the normalizations as 'multiplier' version so it's easier to apply
-                group[normalization][:] = 1 / (norm / np.median(norm))
-            store.flush()
-    return hdf_path
-
-
-def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
-    """Calculate normalizations for a set of frames."""
-    from imzy._readers import get_reader
-
-    reader = get_reader(input_dir)
-
-    # specify normalization names
-    names = get_normalizations()
-
-    # pre-assign array
-    n_frames = reader.n_pixels
-    framelist = reader.pixels
-    # start_frame, norm_array = check_cache()
-    norm_array = np.zeros((n_frames, len(names)), dtype=np.float32)
-    for i, (_, y) in enumerate(
-        tqdm(
-            reader.spectra_iter(framelist, silent=True),
-            disable=silent,
-            miniters=100,
-            mininterval=2,
-            total=len(framelist),
-            desc="Computing normalizations...",
-        )
-    ):
-        norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32))
-    norm_array = np.nan_to_num(norm_array, nan=1.0)
-    return norm_array
-
-
-@numba.njit(fastmath=True, cache=True)
-def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray:
-    """Calculate various normalizations.
-
-    This function expects float32 spectrum.
-    """
-    px_norms = np.zeros(12, dtype=np.float32)
-    px_norms[0] = np.sum(spectrum)  # TIC
-    if px_norms[0] == 0:
-        return px_norms
-
-    px_norms[1] = np.sqrt(np.mean(np.square(spectrum)))  # RMS
-    px_norms[2] = np.median(spectrum[spectrum > 0])  # Median
-    _spectrum = spectrum[spectrum > 0]
-    if _spectrum.size > 1:
-        q95, q90, q10, q5 = np.nanquantile(_spectrum, [0.95, 0.9, 0.1, 0.05])
-    else:
-        q95, q90, q10, q5 = 0, 0, 0, 0
-    px_norms[3] = np.sum(spectrum[spectrum < q95])  # 0-95% TIC
-    px_norms[4] = np.sum(spectrum[spectrum < q90])  # 0-90% TIC
-    px_norms[5] = np.sum(spectrum[spectrum > q5])  # 5-100% TIC
-    px_norms[6] = np.sum(spectrum[spectrum > q10])  # 10-100% TIC
-    px_norms[7] = np.sum(spectrum[(spectrum > q5) & (spectrum < q95)])  # 5-95% TIC
-    px_norms[8] = np.sum(spectrum[(spectrum > q10) & (spectrum < q90)])  # 10-90% TIC
-    px_norms[9] = np.linalg.norm(spectrum, 0)  # 0-norm
-    px_norms[10] = np.linalg.norm(spectrum, 2)  # 2-norm
-    px_norms[11] = np.linalg.norm(spectrum, 3)  # 3-norm
-    return px_norms
-
-
-@numba.njit(fastmath=True, cache=True)
-def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray:
-    """Calculate various normalizations, optimized version.
-
-    This function expects a float32 spectrum.
- """ - px_norms = np.zeros(12, dtype=np.float32) - - # Direct computation without conditions - px_norms[0] = np.sum(spectrum) # TIC - if px_norms[0] == 0: - return px_norms - - # Compute these norms directly without extra conditions - px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS - px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm - px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm - px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm - - # Filter positive values once and reuse - positive_spectrum = spectrum[spectrum > 0] - px_norms[2] = np.median(positive_spectrum) if positive_spectrum.size > 0 else 0 # Median - - # Calculating quantiles once for all needed - if positive_spectrum.size > 1: - q95, q90, q10, q5 = np.nanquantile(positive_spectrum, [0.95, 0.9, 0.1, 0.05]) - else: - q95, q90, q10, q5 = 0, 0, 0, 0 - - # Using logical indexing with boolean arrays might be faster due to numba optimization - condition_q95 = spectrum < q95 - condition_q90 = spectrum < q90 - condition_q5 = spectrum > q5 - condition_q10 = spectrum > q10 - - px_norms[3] = np.sum(spectrum[condition_q95]) # 0-95% TIC - px_norms[4] = np.sum(spectrum[condition_q90]) # 0-90% TIC - px_norms[5] = np.sum(spectrum[condition_q5]) # 5-100% TIC - px_norms[6] = np.sum(spectrum[condition_q10]) # 10-100% TIC - - # For ranges, we can combine conditions - px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC - px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC - - return px_norms diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py index e9857aa..f6ddddf 100644 --- a/src/imzy/_normalizations/_extract.py +++ b/src/imzy/_normalizations/_extract.py @@ -1,4 +1,24 @@ """Get normalization utilities.""" +import warnings +from pathlib import Path + +import numba + +try: + import hdf5plugin +except ImportError: + + class hdf5plugin: + """Dummy class.""" + + LZ4 = lambda *args, **kwargs: {} + + +import numpy as np +from koyo.typing import PathLike +from tqdm import tqdm + +from imzy._normalizations import H5NormalizationStore def get_normalizations() -> list[str]: @@ -17,3 +37,160 @@ def get_normalizations() -> list[str]: "2-norm", "3-norm", ] + + +def create_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike) -> Path: + """Create group with datasets inside.""" + from imzy._readers import get_reader + + reader = get_reader(input_dir) + n_pixels = reader.n_pixels + + compression = hdf5plugin.LZ4() + + if Path(hdf_path).suffix != ".h5": + hdf_path = Path(hdf_path).with_suffix(".h5") + + store = H5NormalizationStore(hdf_path, mode="a") + with store.open() as h5: + group = store._get_group(h5, store.NORMALIZATION_KEY) + for normalization in get_normalizations(): + group.create_dataset(normalization, shape=(n_pixels,), dtype=np.float32, **compression) + store.flush() + return hdf_path + + +def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray: + """Retrieve normalization and determine outliers.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + norm = np.log10(norm + 1) + mask = norm < (norm.max() - n_orders) + return mask + + +def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path: + """Extract normalizations from hdf5.""" + normalization_names = get_normalizations() + normalizations = compute_normalizations(input_dir, silent=silent) + + store = H5NormalizationStore(hdf_path, mode="a") + with np.errstate(invalid="ignore", divide="ignore"): + 
with store.open() as h5: + group = store._get_group(h5, store.NORMALIZATION_KEY) + for i, normalization in enumerate(normalization_names): + # get normalization + norm = normalizations[:, i] + # remove outliers + mask = _get_outlier_mask(norm, 2) + norm[mask] = np.median(norm[mask]) + # save the normalizations as 'multiplier' version so it's easier to apply + group[normalization][:] = 1 / (norm / np.median(norm)) + store.flush() + return hdf_path + + +def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray: + """Calculate normalizations for a set of frames.""" + from imzy._readers import get_reader + + reader = get_reader(input_dir) + + # specify normalization names + names = get_normalizations() + + # pre-assign array + n_frames = reader.n_pixels + framelist = reader.pixels + # start_frame, norm_array = check_cache() + norm_array = np.zeros((n_frames, len(names)), dtype=np.float32) + for i, (_, y) in enumerate( + tqdm( + reader.spectra_iter(framelist, silent=True), + disable=silent, + miniters=100, + mininterval=2, + total=len(framelist), + desc="Computing normalizations...", + ) + ): + norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32)) + norm_array = np.nan_to_num(norm_array, nan=1.0) + return norm_array + + +@numba.njit(fastmath=True, cache=True) +def calculate_normalizations(spectrum: np.ndarray) -> np.ndarray: + """Calculate various normalizations. + + This function expects float32 spectrum. + """ + px_norms = np.zeros(12, dtype=np.float32) + px_norms[0] = np.sum(spectrum) # TIC + if px_norms[0] == 0: + return px_norms + + px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS + px_norms[2] = np.median(spectrum[spectrum > 0]) # Median + _spectrum = spectrum[spectrum > 0] + if _spectrum.size > 1: + q95, q90, q10, q5 = np.nanquantile(_spectrum, [0.95, 0.9, 0.1, 0.05]) + else: + q95, q90, q10, q5 = 0, 0, 0, 0 + px_norms[3] = np.sum(spectrum[spectrum < q95]) # 0-95% TIC + px_norms[4] = np.sum(spectrum[spectrum < q90]) # 0-90% TIC + px_norms[5] = np.sum(spectrum[spectrum > q5]) # 5-100% TIC + px_norms[6] = np.sum(spectrum[spectrum > q10]) # 10-100% TIC + px_norms[7] = np.sum(spectrum[(spectrum > q5) & (spectrum < q95)]) # 5-95% TIC + px_norms[8] = np.sum(spectrum[(spectrum > q10) & (spectrum < q90)]) # 10-90% TIC + px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm + px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm + px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm + return px_norms + + +@numba.njit(fastmath=True, cache=True) +def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray: + """Calculate various normalizations, optimized version. + + This function expects a float32 spectrum. 
+ """ + px_norms = np.zeros(12, dtype=np.float32) + + # Direct computation without conditions + px_norms[0] = np.sum(spectrum) # TIC + if px_norms[0] == 0: + return px_norms + + # Compute these norms directly without extra conditions + px_norms[1] = np.sqrt(np.mean(np.square(spectrum))) # RMS + px_norms[9] = np.linalg.norm(spectrum, 0) # 0-norm + px_norms[10] = np.linalg.norm(spectrum, 2) # 2-norm + px_norms[11] = np.linalg.norm(spectrum, 3) # 3-norm + + # Filter positive values once and reuse + positive_spectrum = spectrum[spectrum > 0] + px_norms[2] = np.median(positive_spectrum) if positive_spectrum.size > 0 else 0 # Median + + # Calculating quantiles once for all needed + if positive_spectrum.size > 1: + q95, q90, q10, q5 = np.nanquantile(positive_spectrum, [0.95, 0.9, 0.1, 0.05]) + else: + q95, q90, q10, q5 = 0, 0, 0, 0 + + # Using logical indexing with boolean arrays might be faster due to numba optimization + condition_q95 = spectrum < q95 + condition_q90 = spectrum < q90 + condition_q5 = spectrum > q5 + condition_q10 = spectrum > q10 + + px_norms[3] = np.sum(spectrum[condition_q95]) # 0-95% TIC + px_norms[4] = np.sum(spectrum[condition_q90]) # 0-90% TIC + px_norms[5] = np.sum(spectrum[condition_q5]) # 5-100% TIC + px_norms[6] = np.sum(spectrum[condition_q10]) # 10-100% TIC + + # For ranges, we can combine conditions + px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC + px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC + + return px_norms diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index f83a976..292e392 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -424,7 +424,8 @@ def to_hdf5( def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False): """Extract normalizations.""" from imzy._hdf5_mixin import check_hdf5 - from imzy._normalizations import create_normalizations_hdf5, extract_normalizations_hdf5 + from imzy._normalizations._extract import extract_normalizations_hdf5 + from imzy._normalizations._extract import create_normalizations_hdf5 check_hdf5() hdf_path = Path(hdf_path) From a5a4f071b0ed0a6e8cf3e4092a5561f7accf3946 Mon Sep 17 00:00:00 2001 From: Lukasz Migas Date: Tue, 5 Mar 2024 15:54:31 +0100 Subject: [PATCH 13/21] Fixes --- src/imzy/_centroids/_extract.py | 4 +++- src/imzy/_centroids/_hdf5_store.py | 24 +++++++++++++++++++++++- src/imzy/_hdf5_mixin.py | 9 +++++++++ src/imzy/_normalizations/_hdf5_store.py | 25 ++++++++++++++++++++++++- 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/src/imzy/_centroids/_extract.py b/src/imzy/_centroids/_extract.py index ece9303..25fb9b7 100644 --- a/src/imzy/_centroids/_extract.py +++ b/src/imzy/_centroids/_extract.py @@ -166,7 +166,9 @@ def get_chunk_info(n_pixels: int, n_peaks: int, max_mem: float = 512) -> ty.Dict _max_mem = (float(n_pixels) * n_peaks * 4) / (1024**2) # assume 4 bytes per element n_tasks = math.ceil(_max_mem / max_mem) or 1 - return dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks)))) + chunk_info = dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks)))) + assert sum(len(v) for v in chunk_info.values()) == n_pixels, "Chunking failed" + return chunk_info def create_centroids_hdf5( diff --git a/src/imzy/_centroids/_hdf5_store.py b/src/imzy/_centroids/_hdf5_store.py index d912c7b..0bdb3a4 100644 --- a/src/imzy/_centroids/_hdf5_store.py +++ b/src/imzy/_centroids/_hdf5_store.py @@ -43,6 +43,7 @@ class H5CentroidsStore(HDF5Mixin, BaseCentroids): _low_mem: bool = True 
     _ion_labels = None
     _tol, _ppm = None, None
+    _peaks = None
     unique_id: str = ""
 
     def __init__(
@@ -118,6 +119,27 @@ def lazy_peaks(self):
             group = h5[self.PEAKS_KEY]
             yield group["array"]
 
+    @property
+    def peaks(self) -> np.ndarray:
+        """Load peaks data.
+
+        This function uses custom data loading to speed things up. Because we chunk the ion images along one
+        dimension, it becomes very slow to read the entire dataset in one go. It can be optimized a little by
+        reading the data one image at a time and then building the 2D array.
+        """
+        if self._peaks is None:
+            if self.is_chunked:
+                if self._proxy is None:
+                    self._proxy = LazyPeaksProxy(self)
+                self._peaks = self._proxy.peaks()
+            else:
+                array = np.zeros(self.shape, dtype=self.dtype)
+                with self.open("r") as h5:
+                    for sl in h5[self.PEAKS_ARRAY_KEY].iter_chunks():
+                        array[sl] = h5[self.PEAKS_ARRAY_KEY][sl]
+                self._peaks = array
+        return self._peaks
+
     @property
     def tol(self) -> int:
         """Get Da tolerance the data was extracted at."""
@@ -216,7 +238,7 @@ def __getitem__(self, item):
     def shape(self) -> ty.Tuple[int, int]:
         """Return shape."""
         chunk_info = self.obj.chunk_info
-        n_px = chunk_info[max(chunk_info.keys())].max()
+        n_px = sum(len(indices) for indices in chunk_info.values())
         return n_px, self.obj.n_peaks
 
     @property
diff --git a/src/imzy/_hdf5_mixin.py b/src/imzy/_hdf5_mixin.py
index 05d8351..f434f2c 100644
--- a/src/imzy/_hdf5_mixin.py
+++ b/src/imzy/_hdf5_mixin.py
@@ -10,6 +10,15 @@
 h5py = None
 from koyo.typing import PathLike
 
+try:
+    import hdf5plugin
+except ImportError:
+
+    class hdf5plugin:
+        """Dummy class."""
+
+        LZ4 = lambda *args, **kwargs: {}
+
 
 def parse_from_attribute(attribute):
     """Parse attribute from cache."""
diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
index 0a1a934..85a9228 100644
--- a/src/imzy/_normalizations/_hdf5_store.py
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -40,7 +40,11 @@ def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bo
 
     def normalize(self, array: np.ndarray, name: str, **kwargs) -> np.ndarray:
         """Normalize array."""
-        return self._single_normalize(array, name, **kwargs)
+        array = np.asanyarray(array)
+        if array.ndim < 2:  # or is_single_2d(self.mobj, array):
+            return self._single_normalize(array, name, **kwargs)
+        else:
+            return self._batch_normalize(array, name, **kwargs)
 
     def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarray:
         """Apply normalization to an array.
@@ -66,6 +70,25 @@ def _single_normalize(self, array: np.ndarray, norm: str, **kwargs) -> np.ndarra
             logger.warning(f"Normalization '{norm}' not found. Skipping normalization.")
Skipping normalization.") return _postprocess(array) + def _batch_normalize( + self, + array: np.ndarray, + norm: str, + **kwargs, + ): + if array.ndim < 2: + raise ValueError("Expected two-dimensional array of 'N pixels * M peaks'.") + + norm_array = self.get_normalization( + norm, + as_multiplier=True, + ) + if array.ndim == 2: + if norm_array.shape[0] != array.shape[0]: + raise ValueError("The input array does not have the same number of pixels as the normalization vector.") + array = np.multiply(array.T, norm_array).T + return _postprocess(array) + def _postprocess(array: np.ndarray): # array[np.isnan(array)] = 0 From 6f34b8db99a0d7fa73b04e060a9501d78287b3d7 Mon Sep 17 00:00:00 2001 From: Lukasz Migas Date: Fri, 29 Mar 2024 11:38:06 +0100 Subject: [PATCH 14/21] Fixes --- src/imzy/_normalizations/_extract.py | 2 +- src/imzy/_readers/_base.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py index f6ddddf..84426b9 100644 --- a/src/imzy/_normalizations/_extract.py +++ b/src/imzy/_normalizations/_extract.py @@ -18,7 +18,7 @@ class hdf5plugin: from koyo.typing import PathLike from tqdm import tqdm -from imzy._normalizations import H5NormalizationStore +from imzy._normalizations._hdf5_store import H5NormalizationStore def get_normalizations() -> list[str]: diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index 292e392..78a40ae 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -424,8 +424,7 @@ def to_hdf5( def extract_normalizations_hdf5(self, hdf_path: PathLike, silent: bool = False): """Extract normalizations.""" from imzy._hdf5_mixin import check_hdf5 - from imzy._normalizations._extract import extract_normalizations_hdf5 - from imzy._normalizations._extract import create_normalizations_hdf5 + from imzy._normalizations._extract import create_normalizations_hdf5, extract_normalizations_hdf5 check_hdf5() hdf_path = Path(hdf_path) From d50066bbeae9c948fbfb8365699da790f986d6b6 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 9 Apr 2024 15:27:30 +0100 Subject: [PATCH 15/21] Update _extract.py --- src/imzy/_normalizations/_extract.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py index f6ddddf..3dd3783 100644 --- a/src/imzy/_normalizations/_extract.py +++ b/src/imzy/_normalizations/_extract.py @@ -192,5 +192,4 @@ def calculate_normalizations_optimized(spectrum: np.ndarray) -> np.ndarray: # For ranges, we can combine conditions px_norms[7] = np.sum(spectrum[condition_q5 & condition_q95]) # 5-95% TIC px_norms[8] = np.sum(spectrum[condition_q10 & condition_q90]) # 10-90% TIC - return px_norms From d96aa001b568d1dc81fd8a906d1e2b5e2d219ad8 Mon Sep 17 00:00:00 2001 From: lukasz-migas Date: Tue, 9 Apr 2024 15:35:44 +0100 Subject: [PATCH 16/21] Update _base.py --- src/imzy/_readers/_base.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py index 78a40ae..3d2467e 100644 --- a/src/imzy/_readers/_base.py +++ b/src/imzy/_readers/_base.py @@ -1,4 +1,5 @@ """Base reader.""" + import os import typing as ty from contextlib import suppress @@ -131,6 +132,16 @@ def image_shape(self) -> ty.Tuple[int, int]: """Return shape of the image.""" return self.y_size, self.x_size + def index_to_xy_coordinates(self, index: int) -> ty.Tuple[int, int]: + """Convert index to x, y coordinates.""" + return self.x_coordinates[index], 
+
+    def xy_coordinates_to_index(self, x: int, y: int) -> int:
+        """Convert x, y coordinates to index."""
+        indices = np.where((self.x_coordinates == x) & (self.y_coordinates == y))[0]
+        assert indices.size > 0, f"No pixel found at coordinates ({x}, {y})."
+        return indices[0]
+
     @property
     def xyz_coordinates(self) -> np.ndarray:
         """Return xyz coordinates."""

From baabac1d57b99cd3c3e7f36c8d971b9621ea754d Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Mon, 9 Sep 2024 21:15:09 +0200
Subject: [PATCH 17/21] Fixes

---
 src/imzy/_readers/_base.py        |  4 ++++
 src/imzy/_readers/imzml/_imzml.py | 10 ++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/imzy/_readers/_base.py b/src/imzy/_readers/_base.py
index 3d2467e..6746266 100644
--- a/src/imzy/_readers/_base.py
+++ b/src/imzy/_readers/_base.py
@@ -127,6 +127,10 @@ def reshape_batch(self, array: np.ndarray, fill_value: float = 0) -> np.ndarray:
             im[i, self.y_coordinates, self.x_coordinates] = array[:, i]
         return im
 
+    def flatten(self, array: np.ndarray) -> np.ndarray:
+        """Flatten 2D image."""
+        return array[self.y_coordinates, self.x_coordinates]
+
     @property
     def image_shape(self) -> ty.Tuple[int, int]:
         """Return shape of the image."""
diff --git a/src/imzy/_readers/imzml/_imzml.py b/src/imzy/_readers/imzml/_imzml.py
index a89dbe8..bf1cd14 100644
--- a/src/imzy/_readers/imzml/_imzml.py
+++ b/src/imzy/_readers/imzml/_imzml.py
@@ -179,6 +179,11 @@ def reshape_batch(self, array: np.ndarray, fill_value: float = 0) -> np.ndarray:
             im[i, self.y_coordinates - 1, self.x_coordinates - 1] = array[:, i]
         return im
 
+    def flatten(self, image: np.ndarray) -> np.ndarray:
+        """Retrieve original vector of intensities from an image."""
+        array = image[self.y_coordinates - 1, self.x_coordinates - 1]
+        return array
+
     def get_summed_spectrum(self, indices: ty.Iterable[int], silent: bool = False) -> ty.Tuple[np.ndarray, np.ndarray]:
         """Sum pixel data to produce summed mass spectrum."""
         indices = np.asarray(indices)
@@ -269,8 +274,9 @@ def _read_spectra(
             mz_bytes = f_ptr.read(mz_l * self._mz_size)
             f_ptr.seek(int_o)
             int_bytes = f_ptr.read(int_l * self._int_size)
-            yield np.frombuffer(mz_bytes, dtype=self.mz_precision), np.frombuffer(
-                int_bytes, dtype=self.int_precision
+            yield (
+                np.frombuffer(mz_bytes, dtype=self.mz_precision),
+                np.frombuffer(int_bytes, dtype=self.int_precision),
             )

From abbed7a1e8d300b1b0c8a467318963eef97bc38a Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Thu, 28 Nov 2024 18:00:33 +0100
Subject: [PATCH 18/21] Update _hdf5_store.py

---
 src/imzy/_normalizations/_hdf5_store.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
index 85a9228..4fc2a7c 100644
--- a/src/imzy/_normalizations/_hdf5_store.py
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -34,6 +34,9 @@ def get_normalization(self, name: str, as_median: bool = True, as_multiplier: bo
         name = f"{self.NORMALIZATION_KEY}/{name}"
         try:
             normalization = self.get_array(name, "normalization")
+        except ValueError:
+            _, name = name.split("/")
+            normalization = self.get_array(self.NORMALIZATION_KEY, name)
         except KeyError:
             raise KeyError(f"Normalization '{name}' not found in store.")
         return normalization

From 4795500c29be3bf915aafb3cb5f590582722dfb3 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Thu, 12 Dec 2024 23:12:25 +0100
Subject: [PATCH 19/21] Improvements

---
 src/imzy/_centroids/_base.py            | 6 ++++--
 src/imzy/_normalizations/_hdf5_store.py | 5 ++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/imzy/_centroids/_base.py b/src/imzy/_centroids/_base.py
index a3dec5e..fa340d0 100644
--- a/src/imzy/_centroids/_base.py
+++ b/src/imzy/_centroids/_base.py
@@ -56,13 +56,15 @@ def __getitem__(self, item):
 
     def _reshape_single(self, array, fill_value=np.nan):
         """Reshape single image."""
        if self._reshape_by_coordinates:
-            return reshape_array_from_coordinates(array, self.image_shape, self.xyz_coordinates, fill_value=fill_value)
+            return reshape_array_from_coordinates(
+                array, self.image_shape, self.xyz_coordinates, fill_value=fill_value, offset=1
+            )
         return reshape_array(array, self.image_shape, self.pixel_index, fill_value=fill_value)
 
     def _reshape_multiple(self, array, fill_value=np.nan):
         if self._reshape_by_coordinates:
             return reshape_array_batch_from_coordinates(
-                array, self.image_shape, self.xyz_coordinates, fill_value=fill_value
+                array, self.image_shape, self.xyz_coordinates, fill_value=fill_value, offset=1
             )
         return reshape_array_batch(array, self.image_shape, self.pixel_index, fill_value=fill_value)
diff --git a/src/imzy/_normalizations/_hdf5_store.py b/src/imzy/_normalizations/_hdf5_store.py
index 4fc2a7c..ccb5575 100644
--- a/src/imzy/_normalizations/_hdf5_store.py
+++ b/src/imzy/_normalizations/_hdf5_store.py
@@ -88,7 +88,10 @@ def _batch_normalize(
         if array.ndim == 2:
             if norm_array.shape[0] != array.shape[0]:
-                raise ValueError("The input array does not have the same number of pixels as the normalization vector.")
+                raise ValueError(
+                    "The input array does not have the same number of pixels as the normalization vector."
+                    f" (norm={norm_array.shape}; array={array.shape})"
+                )
             array = np.multiply(array.T, norm_array).T
         return _postprocess(array)

From 02f010b60656a0d36917b75db6a4fd40f4efb73a Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Fri, 31 Jan 2025 13:54:12 +0000
Subject: [PATCH 20/21] Update _extract.py

---
 src/imzy/_normalizations/_extract.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py
index 89cd31e..a4c3d97 100644
--- a/src/imzy/_normalizations/_extract.py
+++ b/src/imzy/_normalizations/_extract.py
@@ -1,4 +1,5 @@
 """Get normalization utilities."""
+
 import warnings
 from pathlib import Path
 
@@ -72,25 +73,19 @@ def _get_outlier_mask(norm: np.ndarray, n_orders: int = 2) -> np.ndarray:
 
 def extract_normalizations_hdf5(input_dir: PathLike, hdf_path: PathLike, silent: bool = False) -> Path:
     """Extract normalizations from hdf5."""
     normalization_names = get_normalizations()
-    normalizations = compute_normalizations(input_dir, silent=silent)
+    normalizations = compute_normalizations(input_dir, clean=True, silent=silent)
     store = H5NormalizationStore(hdf_path, mode="a")
     with np.errstate(invalid="ignore", divide="ignore"):
         with store.open() as h5:
             group = store._get_group(h5, store.NORMALIZATION_KEY)
             for i, normalization in enumerate(normalization_names):
-                # get normalization
-                norm = normalizations[:, i]
-                # remove outliers
-                mask = _get_outlier_mask(norm, 2)
-                norm[mask] = np.median(norm[mask])
-                # save the normalizations as 'multiplier' version so it's easier to apply
-                group[normalization][:] = 1 / (norm / np.median(norm))
+                group[normalization][:] = normalizations[:, i]
     store.flush()
     return hdf_path
 
 
-def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
+def compute_normalizations(input_dir: Path, clean: bool = True, silent: bool = False) -> np.ndarray:
     """Calculate normalizations for a set of frames."""
     from imzy._readers import get_reader
@@ -116,6 +111,16 @@ def compute_normalizations(input_dir: Path, silent: bool = False) -> np.ndarray:
     ):
         norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32))
     norm_array = np.nan_to_num(norm_array, nan=1.0)
+    # clean-up normalizations
+    for i in range(norm_array.shape[1]):
+        # get normalization
+        norm = norm_array[:, i]
+        if clean:
+            # remove outliers
+            mask = _get_outlier_mask(norm, 2)
+            norm[mask] = np.median(norm[mask])
+        # save the normalizations as 'multiplier' version so it's easier to apply
+        norm_array[:, i] = 1 / (norm / np.median(norm))
     return norm_array

From c72795232b540c5a963ea480841dd5a024add252 Mon Sep 17 00:00:00 2001
From: Lukasz Migas
Date: Fri, 31 Jan 2025 14:27:32 +0000
Subject: [PATCH 21/21] Update _extract.py

---
 src/imzy/_normalizations/_extract.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/imzy/_normalizations/_extract.py b/src/imzy/_normalizations/_extract.py
index a4c3d97..35d6202 100644
--- a/src/imzy/_normalizations/_extract.py
+++ b/src/imzy/_normalizations/_extract.py
@@ -112,15 +112,16 @@ def compute_normalizations(input_dir: Path, clean: bool = True, silent: bool = F
         norm_array[i] = calculate_normalizations_optimized(y.astype(np.float32))
     norm_array = np.nan_to_num(norm_array, nan=1.0)
     # clean-up normalizations
-    for i in range(norm_array.shape[1]):
-        # get normalization
-        norm = norm_array[:, i]
-        if clean:
-            # remove outliers
-            mask = _get_outlier_mask(norm, 2)
-            norm[mask] = np.median(norm[mask])
-        # save the normalizations as 'multiplier' version so it's easier to apply
-        norm_array[:, i] = 1 / (norm / np.median(norm))
+    with np.errstate(invalid="ignore", divide="ignore"):
+        for i in range(norm_array.shape[1]):
+            # get normalization
+            norm = norm_array[:, i]
+            if clean:
+                # remove outliers
+                mask = _get_outlier_mask(norm, 2)
+                norm[mask] = np.median(norm[mask])
+            # save the normalizations as 'multiplier' version so it's easier to apply
+            norm_array[:, i] = 1 / (norm / np.median(norm))
     return norm_array
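
Taken together, patches 20 and 21 settle on one convention: compute_normalizations returns per-pixel factors already converted to "multiplier" form (1 / (norm / median(norm))), and _batch_normalize applies them with a single transposed multiply. Below is a minimal, self-contained sketch of that convention using a hypothetical TIC vector; the array values are invented for illustration, only NumPy is assumed, and the outlier clean-up step is omitted because the body of _get_outlier_mask is not shown in these hunks.

    import numpy as np

    # Hypothetical data: 4 pixels x 3 peaks. Rows 0-2 differ only by their TIC.
    intensities = np.array(
        [
            [1.0, 2.0, 3.0],
            [2.0, 4.0, 6.0],
            [4.0, 8.0, 12.0],
            [1.0, 1.0, 1.0],
        ],
        dtype=np.float32,
    )

    # Per-pixel TIC, i.e. px_norms[0] in calculate_normalizations_optimized.
    tic = intensities.sum(axis=1)

    # Store in "multiplier" form, as compute_normalizations does: dividing by
    # the median-scaled TIC becomes a single multiply when the store is queried.
    with np.errstate(invalid="ignore", divide="ignore"):
        multiplier = 1 / (tic / np.median(tic))

    # Apply the way _batch_normalize does: transpose so the pixel axis
    # broadcasts against the multiplier vector, then transpose back.
    normalized = np.multiply(intensities.T, multiplier).T

    # Rows 0-2 collapse onto the same values once TIC differences are removed.
    assert np.allclose(normalized[0], normalized[1])
    assert np.allclose(normalized[1], normalized[2])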
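
Patch 17's flatten methods are the inverse of the existing reshape path: the base reader indexes coordinates directly, while the imzML reader subtracts 1 because imzML coordinates are 1-based. A minimal round-trip sketch under that assumption, with invented coordinates and values:

    import numpy as np

    # Invented 1-based pixel coordinates, as imzML records them.
    x_coordinates = np.array([1, 2, 1, 2])
    y_coordinates = np.array([1, 1, 2, 2])
    values = np.array([10.0, 20.0, 30.0, 40.0], dtype=np.float32)

    # reshape: scatter the flat vector into a 2D image (0-based indexing).
    image = np.zeros((2, 2), dtype=np.float32)
    image[y_coordinates - 1, x_coordinates - 1] = values

    # flatten: gather the pixels back in acquisition order.
    roundtrip = image[y_coordinates - 1, x_coordinates - 1]
    assert np.array_equal(roundtrip, values)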