Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add norms #52

Merged
merged 22 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/imzy/_centroids/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@ def __getitem__(self, item):
def _reshape_single(self, array, fill_value=np.nan):
"""Reshape single image."""
if self._reshape_by_coordinates:
return reshape_array_from_coordinates(array, self.image_shape, self.xyz_coordinates, fill_value=fill_value)
return reshape_array_from_coordinates(
array, self.image_shape, self.xyz_coordinates, fill_value=fill_value, offset=1
)
return reshape_array(array, self.image_shape, self.pixel_index, fill_value=fill_value)

def _reshape_multiple(self, array, fill_value=np.nan):
if self._reshape_by_coordinates:
return reshape_array_batch_from_coordinates(
array, self.image_shape, self.xyz_coordinates, fill_value=fill_value
array, self.image_shape, self.xyz_coordinates, fill_value=fill_value, offset=1
)
return reshape_array_batch(array, self.image_shape, self.pixel_index, fill_value=fill_value)

Expand Down
60 changes: 21 additions & 39 deletions src/imzy/_extract.py → src/imzy/_centroids/_extract.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
"""Methods to handle data extraction from any reader."""
import typing as ty
from pathlib import Path

Expand All @@ -8,13 +7,20 @@
from koyo.utilities import chunks
from tqdm import tqdm

from imzy import get_reader
from imzy._hdf5_mixin import parse_to_attribute
from imzy._typing import SpatialInfo
from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis
from imzy.utilities import _safe_rmtree

try:
import hdf5plugin
except ImportError:
pass

from imzy._readers import get_reader
from imzy.processing import accumulate_peaks_centroid, accumulate_peaks_profile, optimize_chunks_along_axis
class hdf5plugin:
    """Fallback stand-in used when the optional ``hdf5plugin`` package is not installed."""

    @staticmethod
    def LZ4(*args, **kwargs):
        """Return empty compression options (no LZ4 filter available)."""
        return {}


def check_zarr() -> None:
Expand Down Expand Up @@ -154,22 +160,15 @@ def rechunk_zarr_array(
_safe_rmtree(zarr_path) # remove the temporary array


def check_hdf5() -> None:
    """Check whether `h5py` and `hdf5plugin` are installed.

    Raises
    ------
    ImportError
        If either optional dependency is missing, with installation instructions.
    """
    try:
        import h5py  # noqa: F401
        import hdf5plugin  # noqa: F401
    except ImportError as err:
        raise ImportError(
            "Please install `h5py` and `hdf5plugin` to continue. You can do `pip install imzy[hdf5]`."
        ) from err


def get_chunk_info(n_pixels: int, n_peaks: int, max_mem: float = 512) -> ty.Dict[int, np.ndarray]:
"""Get chunk size information for particular dataset."""
import math

_max_mem = (float(n_pixels) * n_peaks * 4) / (1024**2) # assume 4 bytes per element
n_tasks = math.ceil(_max_mem / max_mem) or 1
return dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks))))
chunk_info = dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks))))
assert sum(len(v) for v in chunk_info.values()) == n_pixels, "Chunking failed"
return chunk_info


def create_centroids_hdf5(
Expand All @@ -183,6 +182,7 @@ def create_centroids_hdf5(
ppm: ty.Optional[float] = None,
ys: ty.Optional[np.ndarray] = None,
chunk_info: ty.Optional[ty.Dict[int, np.ndarray]] = None,
spatial_info: ty.Optional[SpatialInfo] = None,
) -> Path:
"""Create group with datasets inside."""
from imzy._centroids import H5CentroidsStore
Expand Down Expand Up @@ -240,6 +240,11 @@ def create_centroids_hdf5(
dtype=np.float32,
**compression,
)
group = store._get_group(h5, store.SPATIAL_KEY)
if spatial_info is not None:
group.attrs["pixel_size"] = spatial_info.pop("pixel_size", 1.0)
for key, value in spatial_info.items():
store._add_data_to_group(group, key, value, dtype=value.dtype)
return Path(hdf_path)


Expand Down Expand Up @@ -278,8 +283,8 @@ def extract_centroids_hdf5(
indices,
disable=silent,
desc=f"Extracting {n_peaks} peaks (chunk={chunk_id+1}/{n_chunks})",
miniters=25,
mininterval=0.2,
miniters=250,
mininterval=2,
)
):
x, y = reader[index]
Expand All @@ -288,26 +293,3 @@ def extract_centroids_hdf5(
else:
temp[i] = accumulate_peaks_profile(extract_indices, y)
store.update(temp, indices, chunk_id)


def _safe_rmtree(path):
from shutil import rmtree

try:
rmtree(path)
except (OSError, FileNotFoundError):
pass


def parse_from_attribute(attribute):
    """Parse attribute from cache, mapping the ``"__NONE__"`` sentinel back to ``None``."""
    is_sentinel = isinstance(attribute, str) and attribute == "__NONE__"
    return None if is_sentinel else attribute


def parse_to_attribute(attribute):
    """Parse attribute to cache, storing ``None`` as the ``"__NONE__"`` sentinel string."""
    return "__NONE__" if attribute is None else attribute
168 changes: 88 additions & 80 deletions src/imzy/_centroids/_hdf5_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import numpy as np

from imzy._hdf5_mixin import HDF5Mixin

try:
import h5py
except ImportError:
Expand All @@ -15,19 +17,34 @@
from imzy._centroids._base import BaseCentroids


class H5CentroidsStore(BaseCentroids):
def format_ppm(mz: float, ppm: float) -> str:
    """Return an ion label of *mz* with its ppm tolerance, e.g. ``500.000 Da ± 3.0ppm``."""
    return f"{mz:.3f} Da ± {ppm:.1f}ppm"


def format_tol(mz: float, mda: float):
    """Create formatted label for *mz* with an absolute Da tolerance."""
    return "{:.3f} Da ± {:.3f}Da".format(mz, mda)


class H5CentroidsStore(HDF5Mixin, BaseCentroids):
"""HDF5-centroids."""

# Private attributes
PEAKS_KEY = "Array"
PEAKS_ARRAY_KEY = "Array/array"
SPATIAL_KEY = "Misc/Spatial"

# Cache attributes
_chunk_info = None
_xs = None
_proxy = None
_is_chunked = None
_low_mem: bool = True
_ion_labels = None
_tol, _ppm = None, None
_peaks = None
unique_id: str = ""

def __init__(
self,
Expand All @@ -38,13 +55,11 @@ def __init__(
mode: str = "a",
):
super().__init__(xyz_coordinates, pixel_index, image_shape)
assert h5py is not None, "h5py is not installed."

self.path = path
self.mode = mode
self._init_hdf5(path, mode)

@property
def is_low_mem(self):
def is_low_mem(self) -> bool:
"""Get low memory flag."""
return self._low_mem

@is_low_mem.setter
Expand All @@ -55,6 +70,7 @@ def is_low_mem(self, value: bool):

@property
def is_chunked(self) -> bool:
"""Get chunked flag."""
if self._is_chunked is None:
with self.open() as h5:
self._is_chunked = h5[self.PEAKS_KEY].attrs.get("is_chunked")
Expand Down Expand Up @@ -103,6 +119,71 @@ def lazy_peaks(self):
group = h5[self.PEAKS_KEY]
yield group["array"]

@property
def peaks(self) -> np.ndarray:
    """Return the full peaks matrix, loading and caching it on first access.

    Chunked stores are materialised through ``LazyPeaksProxy``. Contiguous
    stores are read one on-disk chunk at a time into a pre-allocated array,
    which is much faster than a single whole-dataset read because the data
    is chunked along one axis.
    """
    if self._peaks is not None:
        return self._peaks
    if self.is_chunked:
        if self._proxy is None:
            self._proxy = LazyPeaksProxy(self)
        self._peaks = self._proxy.peaks()
    else:
        out = np.zeros(self.shape, dtype=self.dtype)
        with self.open("r") as h5:
            dset = h5[self.PEAKS_ARRAY_KEY]
            for chunk_slice in dset.iter_chunks():
                out[chunk_slice] = dset[chunk_slice]
        self._peaks = out
    return self._peaks

@property
def tol(self) -> int:
    """Get the Da tolerance the data was extracted at (read once from HDF5 attrs, then cached)."""
    cached = self._tol
    if cached is None:
        cached = self._tol = self.get_attr(self.PEAKS_KEY, "tol")
    return cached

@property
def ppm(self) -> float:
    """Get the ppm/bins the data was extracted at (read once from HDF5 attrs, then cached)."""
    cached = self._ppm
    if cached is None:
        cached = self._ppm = self.get_attr(self.PEAKS_KEY, "ppm")
    return cached

def _setup_labels(self):
    """Build and cache ppm/tol ion labels for every m/z.

    When no ppm labels can be produced (falsy ``ppm``), the ppm entry falls
    back to the tol labels so downstream lookups always have something.
    """
    if self._ion_labels is not None:
        return
    mzs = self.xs
    ppm = self.ppm
    tol = self.tol
    self._ion_labels = {
        "ppm": [format_ppm(mz, ppm) for mz in mzs] if ppm else [],
        # fixed: dropped unused `enumerate` index from the original comprehension
        "tol": [format_tol(mz, tol) for mz in mzs] if tol is not None else [],
    }
    if not self._ion_labels["ppm"]:
        self._ion_labels["ppm"] = self._ion_labels["tol"]

def index_to_name(self, index: int, effective: bool = True, mz: bool = False):
    """Return the display label for the ion at *index*.

    NOTE(review): the ``effective`` and ``mz`` flags are accepted but currently
    unused — confirm intended behavior against callers.
    """
    self._setup_labels()
    labels = self._ion_labels["ppm"]
    return labels[index]

@property
def labels(self) -> ty.Tuple[np.ndarray, ty.Optional[np.ndarray], ty.Optional[np.ndarray]]:
    """Return all available labels: the m/z values plus two unused placeholder slots."""
    xs = self.xs
    return xs, None, None

def lazy_iter(self):
    """Lazily yield ``(m/z, ion image column)`` pairs without loading the full peaks array."""
    with self.lazy_peaks() as peaks:
        for index, mz in enumerate(self.xs):
            yield mz, peaks[:, index]

def get_ion(self, value: ty.Union[int, float]) -> np.ndarray:
"""Retrieve single ion."""
if isinstance(value, float):
Expand All @@ -128,79 +209,6 @@ def update(self, array: np.ndarray, framelist: np.ndarray, chunk_id: ty.Optional
h5[self.PEAKS_ARRAY_KEY][framelist] = array
h5.flush()

@contextmanager
def open(self, mode: ty.Optional[str] = None):
    """Safely open the HDF5 storage, yielding the file handle and always closing it.

    Parameters
    ----------
    mode : str, optional
        File mode passed to ``h5py.File``; defaults to the store's own mode.

    The original ``except FileExistsError as err: raise err`` was a no-op
    re-raise and has been removed — open errors propagate unchanged either way.
    """
    if mode is None:
        mode = self.mode
    # 4 MiB raw-data chunk cache speeds up repeated reads of chunked datasets.
    f_ptr = h5py.File(self.path, mode=mode, rdcc_nbytes=1024 * 1024 * 4)
    try:
        yield f_ptr
    finally:
        f_ptr.close()

def flush(self):
    """Write any buffered data through to the file on disk."""
    with self.open() as h5_file:
        h5_file.flush()

@staticmethod
def _get_group(hdf, group_name: str, flush: bool = True):
    """Return the *group_name* group from *hdf*, creating it when missing.

    When *flush* is true the file is flushed regardless of whether the group
    already existed (matching the original behavior).
    """
    if group_name in hdf:
        group = hdf[group_name]
    else:
        group = hdf.create_group(group_name)
    if flush:
        hdf.flush()
    return group

@staticmethod
def _add_data_to_group(
    group_obj,
    dataset_name,
    data,
    dtype,
    chunks=None,
    maxshape=None,
    compression=None,
    compression_opts=None,
    shape=None,
):
    """Add data to group.

    Overwrites an existing dataset in place when its dtype matches; otherwise
    (dtype mismatch, or in-place assignment fails) the old dataset is deleted
    and a fresh one is created with the supplied creation options.
    """
    replaced_dataset = False

    # Infer dtype/shape from the data object when not given explicitly.
    if dtype is None:
        if hasattr(data, "dtype"):
            dtype = data.dtype
    if shape is None:
        if hasattr(data, "shape"):
            shape = data.shape

    if dataset_name in list(group_obj.keys()):
        if group_obj[dataset_name].dtype == dtype:
            try:
                # In-place overwrite keeps the existing dataset's chunking/compression.
                group_obj[dataset_name][:] = data
                replaced_dataset = True
            except TypeError:
                # Incompatible data (e.g. shape mismatch) — drop and recreate below.
                del group_obj[dataset_name]
        else:
            del group_obj[dataset_name]

    if not replaced_dataset:
        group_obj.create_dataset(
            dataset_name,
            data=data,
            dtype=dtype,
            compression=compression,
            chunks=chunks,
            maxshape=maxshape,
            compression_opts=compression_opts,
            shape=shape,
        )


class LazyPeaksProxy:
"""Proxy class to enable similar interface to lazy-peaks."""
Expand Down Expand Up @@ -230,7 +238,7 @@ def __getitem__(self, item):
def shape(self) -> ty.Tuple[int, int]:
"""Return shape."""
chunk_info = self.obj.chunk_info
n_px = chunk_info[max(chunk_info.keys())].max()
n_px = sum(len(indices) for indices in chunk_info.values())
return n_px, self.obj.n_peaks

@property
Expand Down
Loading