Merge pull request #30 from vandeplaslab/improved-types

Improvements to types

lukasz-migas authored Oct 10, 2023
2 parents 71fbd8a + 7b77b1e commit 6e8e162

Showing 13 changed files with 287 additions and 221 deletions.
10 changes: 8 additions & 2 deletions pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
"tqdm",
"natsort",
"koyo",
"pluggy"
"pluggy",
]

# extras
@@ -155,8 +155,14 @@ disallow_any_generics = false
disallow_subclassing_any = false
show_error_codes = true
pretty = true
ignore_missing_imports = true
exclude = [
"venv",
"tests",
"src/imzy/_version.py"
]

# # module specific overrides
# module specific overrides
# [[tool.mypy.overrides]]
# module = ["numpy.*",]
# ignore_errors = true
7 changes: 5 additions & 2 deletions src/imzy/_centroids/__init__.py
@@ -1,4 +1,7 @@
"""Centroids."""
from imzy._centroids._hdf5_store import H5CentroidsStore # noqa F401
from imzy._centroids._memory_store import InMemoryStore # noqa F401
from imzy._centroids._zarr_store import ZarrCentroidsStore # noqa F401


__all__ = ["H5CentroidsStore", "InMemoryStore", "ZarrCentroidsStore"]
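
With `__all__` pinned down, the public surface of the subpackage is explicit. A quick sketch of what that enables (the names come straight from the imports above):

    # All three stores can be imported in one line, and star-imports
    # now pick up exactly these names and nothing else.
    from imzy._centroids import H5CentroidsStore, InMemoryStore, ZarrCentroidsStore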
37 changes: 21 additions & 16 deletions src/imzy/_extract.py
@@ -6,7 +6,7 @@
from koyo.spectrum import find_between_batch
from koyo.typing import PathLike
from koyo.utilities import chunks
from tqdm.auto import tqdm
from tqdm import tqdm

try:
import hdf5plugin
@@ -17,12 +17,12 @@
from imzy.utilities import accumulate_peaks_centroid, accumulate_peaks_profile


def check_zarr():
def check_zarr() -> None:
"""Check whether Zarr, dask and rechunker are installed."""
try:
import dask # noqa
import rechunker # noqa
import zarr # noqa
import dask
import rechunker
import zarr
except ImportError:
raise ImportError(
"Please install `zarr`, `dask` and `rechunker` to continue. You can do `pip install imzy[zarr]"
@@ -36,15 +36,17 @@ def create_centroids_zarr(
mzs: ty.Optional[np.ndarray] = None,
mzs_min: ty.Optional[np.ndarray] = None,
mzs_max: ty.Optional[np.ndarray] = None,
tol: float = 0,
ppm: float = 0,
tol: ty.Optional[float] = None,
ppm: ty.Optional[float] = None,
ys: ty.Optional[np.ndarray] = None,
):
"""Create group with datasets inside."""
import zarr

reader = get_reader(input_dir)
if tol is None and ppm is None:
raise ValueError("Either `tol` or `ppm` should be specified.")

reader = get_reader(input_dir)
store = zarr.DirectoryStore(str(zarr_path))
group = zarr.group(store=store)
# add metadata
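
With the defaults switched from 0 to None, at least one tolerance must now be supplied explicitly; passing neither fails immediately instead of silently proceeding with a zero-width window. A sketch, assuming the leading positional parameters are input_dir, zarr_path, and the peak count (inferred from the body above and the call site later in this diff):

    # Hypothetical call: a 3 ppm window, `tol` left as None.
    create_centroids_zarr(input_dir, zarr_path, len(mzs), mzs=mzs, ppm=3.0)

    # Neither tolerance given:
    create_centroids_zarr(input_dir, zarr_path, len(mzs), mzs=mzs)
    # -> ValueError: Either `tol` or `ppm` should be specified.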
@@ -152,11 +154,11 @@ def rechunk_zarr_array(
_safe_rmtree(zarr_path) # remove the temporary array


def check_hdf5():
def check_hdf5() -> None:
"""Check whether Zarr, dask and rechunker are installed."""
try:
import h5py # noqa
import hdf5plugin # noqa
import h5py
import hdf5plugin
except ImportError:
raise ImportError("Please install `h5py` and `hdf5plugins` to continue. You can do `pip install imzy[hdf5]")

Expand All @@ -165,7 +167,7 @@ def get_chunk_info(n_pixels: int, n_peaks: int, max_mem: float = 512) -> ty.Dict
"""Get chunk size information for particular dataset."""
import math

_max_mem = (float(n_pixels) * n_peaks * 4) / (1024 ** 2) # assume 4 bytes per element
_max_mem = (float(n_pixels) * n_peaks * 4) / (1024**2) # assume 4 bytes per element
n_tasks = math.ceil(_max_mem / max_mem) or 1
return dict(enumerate(list(chunks(np.arange(n_pixels), n_tasks=n_tasks))))

@@ -177,15 +179,18 @@ def create_centroids_hdf5(
mzs: ty.Optional[np.ndarray] = None,
mzs_min: ty.Optional[np.ndarray] = None,
mzs_max: ty.Optional[np.ndarray] = None,
tol: float = 0,
ppm: float = 0,
tol: ty.Optional[float] = None,
ppm: ty.Optional[float] = None,
ys: ty.Optional[np.ndarray] = None,
chunk_info: ty.Optional[ty.Dict[int, np.ndarray]] = None,
):
) -> Path:
"""Create group with datasets inside."""
from imzy._centroids import H5CentroidsStore
from imzy.utilities import optimize_chunks_along_axis

if tol is None and ppm is None:
raise ValueError("Either `tol` or `ppm` should be specified.")

reader = get_reader(input_dir)
n_pixels = reader.n_pixels
array_shape = (n_pixels, n_peaks)
Expand Down Expand Up @@ -236,7 +241,7 @@ def create_centroids_hdf5(
dtype=np.float32,
**compression,
)
return hdf_path
return Path(hdf_path)
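
Coercing the return value through Path() means callers get pathlib semantics even when they passed the target as a plain string. A small sketch of the downstream benefit (argument order assumed, as above):

    # Hypothetical: the target is given as str, but the return is a Path.
    path = create_centroids_hdf5(input_dir, "centroids.h5", len(mzs), mzs=mzs, ppm=3.0)
    assert path.suffix == ".h5"  # no isinstance check needed by the caller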


def extract_centroids_hdf5(
68 changes: 41 additions & 27 deletions src/imzy/_readers/_base.py
@@ -25,7 +25,7 @@ class BaseReader:
def __init__(self, path: PathLike):
self.path = Path(path)

def _init(self, *args, **kwargs):
def _init(self, *args, **kwargs) -> None:
"""Method which is called to initialize the reader."""
raise NotImplementedError("Must implement method")

@@ -48,7 +48,7 @@ def get_spectrum(self, index: int):
"""Return mass spectrum."""
return self._read_spectrum(index)

def get_summed_spectrum(self, indices: ty.Iterable[int], silent: bool = False):
def get_summed_spectrum(self, indices: ty.Iterable[int], silent: bool = False) -> ty.Tuple[np.ndarray, np.ndarray]:
"""Sum pixel data to produce summed mass spectrum."""
raise NotImplementedError("Must implement method")

Expand All @@ -75,22 +75,22 @@ def y_pixel_size(self) -> float:

@property
@lru_cache
def x_size(self):
def x_size(self) -> int:
"""X-axis size."""
min_val, max_max = get_min_max(self.x_coordinates)
return max_max - min_val + 1
return int(max_max - min_val + 1)

@property
@lru_cache
def y_size(self):
def y_size(self) -> int:
"""Y-axis size."""
min_val, max_max = get_min_max(self.y_coordinates)
return max_max - min_val + 1
return int(max_max - min_val + 1)

def __iter__(self):
def __iter__(self) -> "BaseReader":
return self

def __next__(self):
def __next__(self) -> ty.Tuple[np.ndarray, np.ndarray]:
"""Get next spectrum."""
if self._current < self.n_pixels - 1:
self._current += 1
Expand All @@ -99,7 +99,7 @@ def __next__(self):
self._current = -1
raise StopIteration

def __getitem__(self, item: int):
def __getitem__(self, item: int) -> ty.Tuple[np.ndarray, np.ndarray]:
"""Retrieve spectrum."""
return self.get_spectrum(item)

@@ -133,6 +133,8 @@ def image_shape(self) -> ty.Tuple[int, int]:
@property
def xyz_coordinates(self) -> np.ndarray:
"""Return xyz coordinates."""
if self._xyz_coordinates is None:
raise ValueError("Coordinates have not been initialized.")
return self._xyz_coordinates

@property
Expand All @@ -156,7 +158,7 @@ def pixels(self) -> np.ndarray:
return np.arange(self.n_pixels)

@property
def n_pixels(self):
def n_pixels(self) -> int:
"""Return the total number of pixels in the dataset."""
return len(self.x_coordinates)

@@ -170,7 +172,7 @@ def pixel_size(self) -> float:
raise ValueError("Pixel size is not equal in both dimensions.")
return self.x_pixel_size

def get_chromatogram(self, indices: ty.Iterable[int]):
def get_chromatogram(self, indices: ty.Iterable[int]) -> np.ndarray:
"""Return chromatogram."""
indices = np.asarray(indices)
array = np.zeros(len(indices), dtype=np.float32)
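
Judging from the signature and the buffer allocated above, `get_chromatogram` returns one float32 value per requested pixel, i.e. a TIC-like trace across the selected pixels (that reading is an assumption; the rest of the body is collapsed in this diff). A sketch under that assumption:

    # Hypothetical: one summed value per pixel, for every pixel in the dataset.
    trace = reader.get_chromatogram(reader.pixels)
    assert trace.shape == (reader.n_pixels,)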
@@ -221,7 +223,7 @@ def _get_ions(
ppm: ty.Optional[float] = None,
fill_value: float = np.nan,
silent: bool = False,
):
) -> np.ndarray:
mzs = np.asarray(mzs)
mzs_min, mzs_max = get_mzs_for_tol(mzs, tol, ppm)
res = np.full((self.n_pixels, len(mzs)), dtype=np.float32, fill_value=fill_value)
@@ -255,7 +257,7 @@ def to_table(
ppm: ty.Optional[float] = None,
fill_value: float = np.nan,
silent: bool = False,
):
) -> np.ndarray:
"""Return many ion images for specified m/z values without reshaping."""
return self._get_ions(mzs, tol, ppm, fill_value, silent)

@@ -274,7 +276,11 @@ def to_zarr(

if not as_flat:
raise ValueError("Only flat images are supported at the moment.")
if tol is None and ppm is None or tol == 0 and ppm == 0:
raise ValueError("Please specify `tol` or `ppm`.")

check_zarr()
import dask.array as dsa

mzs = np.asarray(mzs)
if mzs.size == 0:
@@ -303,11 +309,16 @@
silent=silent,
)

import dask.array as dsa

ds = dsa.from_zarr(zarr_array_path)
ys = ds.sum(axis=0).compute()
create_centroids_zarr(self.path, zarr_path, len(mzs_min), ys=np.asarray(ys))
create_centroids_zarr(
self.path,
zarr_path,
len(mzs_min),
ys=np.asarray(ys),
ppm=ppm,
tol=tol,
)

target_path = str(zarr_path / "array")
rechunk_zarr_array(self.path, zarr_array_path, target_path, chunk_size=chunk_size)
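
`to_zarr` now rejects missing and zero tolerances up front, and forwards `ppm`/`tol` into the centroids metadata instead of dropping them. Note the guard parses as `(tol is None and ppm is None) or (tol == 0 and ppm == 0)` under Python's precedence rules, so a zero-width window is treated the same as no window. A sketch of both paths (m/z values illustrative):

    # Neither tolerance given -> fails before any extraction work.
    reader.to_zarr(mzs=np.array([885.55]), tol=None, ppm=None)
    # -> ValueError: Please specify `tol` or `ppm`.

    # Valid: a 5 ppm window around each m/z.
    reader.to_zarr(mzs=np.array([885.55]), ppm=5.0)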
@@ -370,17 +381,20 @@ def to_hdf5(
)
return hdf_path

def spectra_iter(self, indices: ty.Optional[ty.Iterable[int]] = None, silent: bool = False):
def spectra_iter(
self, indices: ty.Optional[ty.Iterable[int]] = None, silent: bool = False
) -> ty.Generator[ty.Tuple[np.ndarray, np.ndarray], None, None]:
"""Yield spectra."""
indices = self.pixels if indices is None else np.asarray(indices)
yield from tqdm(
self._read_spectra(indices), total=len(indices), disable=silent, miniters=500, desc="Iterating spectra..."
)

def _write_cache(self, filename: str, data: ty.Dict):
"""Loading of SQL data can be very slow for some datasets so we can cache it instead.
def _write_cache(self, filename: str, data: ty.Dict) -> None:
"""Sometimes, reading data from raw data can be very slow, so we can cache it instead.
We write some of the metadata to a cache directory that will be located inside of the `Bruker .d` folder.
Cache data is usually written inside the raw directory (e.g. inside Bruker .d or Waters .raw) or next to it
(e.g. when dealing with imzML).
Parameters
----------
Expand All @@ -391,25 +405,25 @@ def _write_cache(self, filename: str, data: ty.Dict):
"""
cache_dir_path = Path(self.path) / ".icache"
cache_dir_path.mkdir(exist_ok=True)
_filename = cache_dir_path / (filename + ".tmp.npz")
tmp_filename = cache_dir_path / (filename + ".tmp.npz")
filename = cache_dir_path / (filename + ".npz")
np.savez(_filename, **data)
np.savez(tmp_filename, **data)
try:
_filename.rename(filename)
tmp_filename.rename(filename)
except OSError:
with suppress(FileNotFoundError):
os.remove(filename)
_filename.rename(filename)
tmp_filename.rename(filename)

def _read_cache(self, filename: str, keys: ty.List[str]):
def _read_cache(self, filename: str, keys: ty.List[str]) -> ty.Dict[str, ty.Optional[np.ndarray]]:
"""Load cache metadata.
Parameters
----------
filename : str
name of the cache file without the .npz suffix
Name of the cache file without the .npz suffix
keys : list
list of keys to be read when cache file is loaded
Keys to be read when cache file is loaded
"""
cache_file_path = Path(self.path) / ".icache" / (filename + ".npz")

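The write path is atomic-by-rename: data is first serialized to a `.tmp.npz` file and then renamed over the final name, with the OSError branch covering platforms (notably Windows) where rename will not overwrite an existing file. A hypothetical round trip through both helpers, with illustrative names:

    # "coordinates" and "xyz" are illustrative, not keys used by the library.
    xyz_array = reader.xyz_coordinates  # any array-valued metadata works
    reader._write_cache("coordinates", {"xyz": xyz_array})  # -> .icache/coordinates.npz
    cached = reader._read_cache("coordinates", keys=["xyz"])
    xyz = cached["xyz"]  # per the annotation, values may be None (e.g. missing cache)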
1 change: 1 addition & 0 deletions src/imzy/_readers/bruker/_metadata.py
@@ -0,0 +1 @@
"""Metadata reader for Bruker .d (.tdf/.tsf) files on macOS."""