Merge pull request #30 from vandeplaslab/improved-types

Improvements to types

lukasz-migas authored Oct 10, 2023
2 parents 71fbd8a + 7b77b1e commit 6e8e162

Showing 13 changed files with 287 additions and 221 deletions.
10 changes: 8 additions & 2 deletions pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
"tqdm",
"natsort",
"koyo",
"pluggy"
"pluggy",
]

# extras
@@ -155,8 +155,14 @@ disallow_any_generics = false
disallow_subclassing_any = false
show_error_codes = true
pretty = true
ignore_missing_imports = true
exclude = [
"venv",
"tests",
"src/imzy/_version.py"
]

# # module specific overrides
# module specific overrides
# [[tool.mypy.overrides]]
# module = ["numpy.*",]
# ignore_errors = true
7 changes: 5 additions & 2 deletions src/imzy/_centroids/__init__.py
@@ -1,4 +1,7 @@
"""Centroids."""
from imzy._centroids._hdf5_store import H5CentroidsStore # noqa F401
from imzy._centroids._memory_store import InMemoryStore # noqa F401
from imzy._centroids._zarr_store import ZarrCentroidsStore # noqa F401


__all__ = ["H5CentroidsStore", "InMemoryStore", "ZarrCentroidsStore"]
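
With `__all__` pinned down, the public surface of the subpackage is explicit. A quick sketch of what that enables (the names come straight from the imports above):

    # All three stores can be imported in one line, and star-imports
    # now pick up exactly these names and nothing else.
    from imzy._centroids import H5CentroidsStore, InMemoryStore, ZarrCentroidsStore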
37 changes: 21 additions & 16 deletions src/imzy/_extract.py
@@ -6,7 +6,7 @@
from koyo.spectrum import find_between_batch
from koyo.typing import PathLike
from koyo.utilities import chunks
from tqdm.auto import tqdm
from tqdm import tqdm

try:
import hdf5plugin
@@ -17,12 +17,12 @@
from imzy.utilities import accumulate_peaks_centroid, accumulate_peaks_profile


def check_zarr():
def check_zarr() -> None:
"""Check whether Zarr, dask and rechunker are installed."""
try:
import dask # noqa
import rechunker # noqa
import zarr # noqa
import dask
import rechunker
import zarr
except ImportError:
raise ImportError(
"Please install `zarr`, `dask` and `rechunker` to continue. You can do `pip install imzy[zarr]"
@@ -36,15 +36,17 @@ def create_centroids_zarr(
mzs: ty.Optional[np.ndarray] = None,
mzs_min: ty.Optional[np.ndarray] = None,
mzs_max: ty.Optional[np.ndarray] = None,
tol: float = 0,
ppm: float = 0,
tol: ty.Optional[float] = None,
ppm: ty.Optional[float] = None,
ys: ty.Optional[np.ndarray] = None,
):
"""Create group with datasets inside."""
import zarr

reader = get_reader(input_dir)
if tol is None and ppm is None:
raise ValueError("Either `tol` or `ppm` should be specified.")

reader = get_reader(input_dir)
store = zarr.DirectoryStore(str(zarr_path))
group = zarr.group(store=store)
# add metadata
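
With the defaults switched from 0 to None, at least one tolerance must now be supplied explicitly; passing neither fails immediately instead of silently proceeding with a zero-width window. A sketch, assuming the leading positional parameters are input_dir, zarr_path, and the peak count (inferred from the body above and the call site later in this diff):

    # Hypothetical call: a 3 ppm window, `tol` left as None.
    create_centroids_zarr(input_dir, zarr_path, len(mzs), mzs=mzs, ppm=3.0)

    # Neither tolerance given:
    create_centroids_zarr(input_dir, zarr_path, len(mzs), mzs=mzs)
    # -> ValueError: Either `tol` or `ppm` should be specified.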
@@ -152,11 +154,11 @@ def rechunk_zarr_array(
_safe_rmtree(zarr_path) # remove the temporary array


def check_hdf5():
def check_hdf5() -> None:
"""Check whether Zarr, dask and rechunker are installed."""
try:
import h5py # noqa
import hdf5plugin # noqa
import h5py
import hdf5plugin
except ImportError:
raise ImportError("Please install `h5py` and `hdf5plugins` to continue. You can do `pip install imzy[hdf5]")

Expand All @@ -165,7 +167,7 @@ def get_chunk_info(n_pixels: int, n_peaks: int, max_mem: float = 512) -> ty.Dict
"""Get chunk size information for particular dataset."""
import math

_max_mem = (float(n_pixels) * n_peaks * 4) / (1024 ** 2) # assume 4 bytes per element
_max_mem = (float(n_pixels) * n_peaks * 4) / (1024**2) # assume 4 bytes per element
n_tasks = math.ceil(_max_mem / max_mem) or 1
return dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks))))

@@ -177,15 +179,18 @@ def create_centroids_hdf5(
mzs: ty.Optional[np.ndarray] = None,
mzs_min: ty.Optional[np.ndarray] = None,
mzs_max: ty.Optional[np.ndarray] = None,
tol: float = 0,
ppm: float = 0,
tol: ty.Optional[float] = None,
ppm: ty.Optional[float] = None,
ys: ty.Optional[np.ndarray] = None,
chunk_info: ty.Optional[ty.Dict[int, np.ndarray]] = None,
):
) -> Path:
"""Create group with datasets inside."""
from imzy._centroids import H5CentroidsStore
from imzy.utilities import optimize_chunks_along_axis

if tol is None and ppm is None:
raise ValueError("Either `tol` or `ppm` should be specified.")

reader = get_reader(input_dir)
n_pixels = reader.n_pixels
array_shape = (n_pixels, n_peaks)
Expand Down Expand Up @@ -236,7 +241,7 @@ def create_centroids_hdf5(
dtype=np.float32,
**compression,
)
return hdf_path
return Path(hdf_path)
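
Coercing the return value through Path() means callers get pathlib semantics even when they passed the target as a plain string. A small sketch of the downstream benefit (argument order assumed, as above):

    # Hypothetical: the target is given as str, but the return is a Path.
    path = create_centroids_hdf5(input_dir, "centroids.h5", len(mzs), mzs=mzs, ppm=3.0)
    assert path.suffix == ".h5"  # no isinstance check needed by the caller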


def extract_centroids_hdf5(
68 changes: 41 additions & 27 deletions src/imzy/_readers/_base.py
@@ -25,7 +25,7 @@ class BaseReader:
def __init__(self, path: PathLike):
self.path = Path(path)

def _init(self, *args, **kwargs):
def _init(self, *args, **kwargs) -> None:
"""Method which is called to initialize the reader."""
raise NotImplementedError("Must implement method")

@@ -48,7 +48,7 @@ def get_spectrum(self, index: int):
"""Return mass spectrum."""
return self._read_spectrum(index)

def get_summed_spectrum(self, indices: ty.Iterable[int], silent: bool = False):
def get_summed_spectrum(self, indices: ty.Iterable[int], silent: bool = False) -> ty.Tuple[np.ndarray, np.ndarray]:
"""Sum pixel data to produce summed mass spectrum."""
raise NotImplementedError("Must implement method")

Expand All @@ -75,22 +75,22 @@ def y_pixel_size(self) -> float:

@property
@lru_cache
def x_size(self):
def x_size(self) -> int:
"""X-axis size."""
min_val, max_max = get_min_max(self.x_coordinates)
return max_max - min_val + 1
return int(max_max - min_val + 1)

@property
@lru_cache
def y_size(self):
def y_size(self) -> int:
"""Y-axis size."""
min_val, max_max = get_min_max(self.y_coordinates)
return max_max - min_val + 1
return int(max_max - min_val + 1)

def __iter__(self):
def __iter__(self) -> "BaseReader":
return self

def __next__(self):
def __next__(self) -> ty.Tuple[np.ndarray, np.ndarray]:
"""Get next spectrum."""
if self._current < self.n_pixels - 1:
self._current += 1
Expand All @@ -99,7 +99,7 @@ def __next__(self):
self._current = -1
raise StopIteration

def __getitem__(self, item: int):
def __getitem__(self, item: int) -> ty.Tuple[np.ndarray, np.ndarray]:
"""Retrieve spectrum."""
return self.get_spectrum(item)

@@ -133,6 +133,8 @@ def image_shape(self) -> ty.Tuple[int, int]:
@property
def xyz_coordinates(self) -> np.ndarray:
"""Return xyz coordinates."""
if self._xyz_coordinates is None:
raise ValueError("Coordinates have not been initialized.")
return self._xyz_coordinates

@property
Expand All @@ -156,7 +158,7 @@ def pixels(self) -> np.ndarray:
return np.arange(self.n_pixels)

@property
def n_pixels(self):
def n_pixels(self) -> int:
"""Return the total number of pixels in the dataset."""
return len(self.x_coordinates)

@@ -170,7 +172,7 @@ def pixel_size(self) -> float:
raise ValueError("Pixel size is not equal in both dimensions.")
return self.x_pixel_size

def get_chromatogram(self, indices: ty.Iterable[int]):
def get_chromatogram(self, indices: ty.Iterable[int]) -> np.ndarray:
"""Return chromatogram."""
indices = np.asarray(indices)
array = np.zeros(len(indices), dtype=np.float32)
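
Judging from the signature and the buffer allocated above, `get_chromatogram` returns one float32 value per requested pixel, i.e. a TIC-like trace across the selected pixels (that reading is an assumption; the rest of the body is collapsed in this diff). A sketch under that assumption:

    # Hypothetical: one summed value per pixel, for every pixel in the dataset.
    trace = reader.get_chromatogram(reader.pixels)
    assert trace.shape == (reader.n_pixels,)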
@@ -221,7 +223,7 @@ def _get_ions(
ppm: ty.Optional[float] = None,
fill_value: float = np.nan,
silent: bool = False,
):
) -> np.ndarray:
mzs = np.asarray(mzs)
mzs_min, mzs_max = get_mzs_for_tol(mzs, tol, ppm)
res = np.full((self.n_pixels, len(mzs)), dtype=np.float32, fill_value=fill_value)
@@ -255,7 +257,7 @@ def to_table(
ppm: ty.Optional[float] = None,
fill_value: float = np.nan,
silent: bool = False,
):
) -> np.ndarray:
"""Return many ion images for specified m/z values without reshaping."""
return self._get_ions(mzs, tol, ppm, fill_value, silent)

@@ -274,7 +276,11 @@ def to_zarr(

if not as_flat:
raise ValueError("Only flat images are supported at the moment.")
if tol is None and ppm is None or tol == 0 and ppm == 0:
raise ValueError("Please specify `tol` or `ppm`.")

check_zarr()
import dask.array as dsa

mzs = np.asarray(mzs)
if mzs.size == 0:
@@ -303,11 +309,16 @@
silent=silent,
)

import dask.array as dsa

ds = dsa.from_zarr(zarr_array_path)
ys = ds.sum(axis=0).compute()
create_centroids_zarr(self.path, zarr_path, len(mzs_min), ys=np.asarray(ys))
create_centroids_zarr(
self.path,
zarr_path,
len(mzs_min),
ys=np.asarray(ys),
ppm=ppm,
tol=tol,
)

target_path = str(zarr_path / "array")
rechunk_zarr_array(self.path, zarr_array_path, target_path, chunk_size=chunk_size)
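
`to_zarr` now rejects missing and zero tolerances up front, and forwards `ppm`/`tol` into the centroids metadata instead of dropping them. Note the guard parses as `(tol is None and ppm is None) or (tol == 0 and ppm == 0)` under Python's precedence rules, so a zero-width window is treated the same as no window. A sketch of both paths (m/z values illustrative):

    # Neither tolerance given -> fails before any extraction work.
    reader.to_zarr(mzs=np.array([885.55]), tol=None, ppm=None)
    # -> ValueError: Please specify `tol` or `ppm`.

    # Valid: a 5 ppm window around each m/z.
    reader.to_zarr(mzs=np.array([885.55]), ppm=5.0)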
@@ -370,17 +381,20 @@ def to_hdf5(
)
return hdf_path

def spectra_iter(self, indices: ty.Optional[ty.Iterable[int]] = None, silent: bool = False):
def spectra_iter(
self, indices: ty.Optional[ty.Iterable[int]] = None, silent: bool = False
) -> ty.Generator[ty.Tuple[np.ndarray, np.ndarray], None, None]:
"""Yield spectra."""
indices = self.pixels if indices is None else np.asarray(indices)
yield from tqdm(
self._read_spectra(indices), total=len(indices), disable=silent, miniters=500, desc="Iterating spectra..."
)

def _write_cache(self, filename: str, data: ty.Dict):
"""Loading of SQL data can be very slow for some datasets so we can cache it instead.
def _write_cache(self, filename: str, data: ty.Dict) -> None:
"""Sometimes, reading data from raw data can be very slow, so we can cache it instead.
We write some of the metadata to a cache directory that will be located inside of the `Bruker .d` folder.
Cache data is usually written inside the raw directory (e.g. inside Bruker .d or Waters .raw) or next to it
(e.g. when dealing with imzML).
Parameters
----------
Expand All @@ -391,25 +405,25 @@ def _write_cache(self, filename: str, data: ty.Dict):
"""
cache_dir_path = Path(self.path) / ".icache"
cache_dir_path.mkdir(exist_ok=True)
_filename = cache_dir_path / (filename + ".tmp.npz")
tmp_filename = cache_dir_path / (filename + ".tmp.npz")
filename = cache_dir_path / (filename + ".npz")
np.savez(_filename, **data)
np.savez(tmp_filename, **data)
try:
_filename.rename(filename)
tmp_filename.rename(filename)
except OSError:
with suppress(FileNotFoundError):
os.remove(filename)
_filename.rename(filename)
tmp_filename.rename(filename)

def _read_cache(self, filename: str, keys: ty.List[str]):
def _read_cache(self, filename: str, keys: ty.List[str]) -> ty.Dict[str, ty.Optional[np.ndarray]]:
"""Load cache metadata.
Parameters
----------
filename : str
name of the cache file without the .npz suffix
Name of the cache file without the .npz suffix
keys : list
list of keys to be read when cache file is loaded
Keys to be read when cache file is loaded
"""
cache_file_path = Path(self.path) / ".icache" / (filename + ".npz")

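The write path is atomic-by-rename: data is first serialized to a `.tmp.npz` file and then renamed over the final name, with the OSError branch covering platforms (notably Windows) where rename will not overwrite an existing file. A hypothetical round trip through both helpers, with illustrative names:

    # "coordinates" and "xyz" are illustrative, not keys used by the library.
    xyz_array = reader.xyz_coordinates  # any array-valued metadata works
    reader._write_cache("coordinates", {"xyz": xyz_array})  # -> .icache/coordinates.npz
    cached = reader._read_cache("coordinates", keys=["xyz"])
    xyz = cached["xyz"]  # per the annotation, values may be None (e.g. missing cache)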
1 change: 1 addition & 0 deletions src/imzy/_readers/bruker/_metadata.py
@@ -0,0 +1 @@
"""Metadata reader for Bruker .d (.tdf/.tsf) files on macOS."""