Skip to content

Commit

Permalink
feat(dataset): add env var to disable mmapped loading
Browse files Browse the repository at this point in the history
In case it causes problems in prod
  • Loading branch information
nfrasser committed Aug 26, 2024
1 parent f954c9d commit 8f9489b
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 13 deletions.
24 changes: 16 additions & 8 deletions cryosparc/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,10 +583,10 @@ def load(
file (str | Path | IO): Readable file path or handle. Must be
seekable if loading a dataset saved in the default
``NUMPY_FORMAT``
prefixes (list[str], optional): Which field prefixes to load. Loads
either all if not specified, or specified `fields`.
fields (list[str], optional): Which fields to load. If no specified,
loads either all or prefixes if prefixes is specified.
prefixes (list[str], optional): Which field prefixes to load. If
not specified, loads either all or specified `fields`.
fields (list[str], optional): Which fields to load. If not
specified, loads either all or specified `prefixes`.
cstrs (bool): If True, load internal string columns as C strings
instead of Python strings. Defaults to False.
Expand Down Expand Up @@ -627,15 +627,23 @@ def _load_numpy(
fields: Optional[Sequence[str]] = None,
cstrs: bool = False,
):
# Use mmap to avoid loading full record array into memory
# cast path to a string for older numpy/python
mmap_mode, f = ("r", str(file)) if isinstance(file, (str, PurePath)) else (None, file)
import os

# disable mmap by setting CRYOSPARC_DATASET_MMAP=false
if os.getenv("CRYOSPARC_DATASET_MMAP", "true").lower() == "true" and isinstance(file, (str, PurePath)):
# Use mmap to avoid loading full record array into memory
# cast path to a string for older numpy/python
mmap_mode, f = "r", str(file)
chunk_size = 2**14 # magic number optimizes memory and performance
else:
mmap_mode, f = None, file
chunk_size = 2**60 # huge enough number so you don't use chunks

indata = n.load(f, mmap_mode=mmap_mode, allow_pickle=False)
size = len(indata)
descr = filter_descr(indata.dtype.descr, keep_prefixes=prefixes, keep_fields=fields)
dset = cls.allocate(size, descr)
offset = 0
chunk_size = 2**14 # magic number optimizes memory and performance
while offset < size:
end = min(offset + chunk_size, size)
chunk = indata[offset:end]
Expand Down
10 changes: 5 additions & 5 deletions cryosparc/dtype.py
Original file line number Diff line number Diff line change
def normalize_field(name: str, dtype: "DTypeLike") -> Field:
    """
    Normalize a (name, dtype) pair into a canonical dataset ``Field`` tuple.

    Args:
        name (str): Field name. The special name ``"uid"`` is always stored
            as ``uint64``, regardless of the given dtype.
        dtype (DTypeLike): Any numpy-compatible dtype specification.

    Returns:
        Field: ``(name, dtype_str)`` for scalar fields, or
        ``(name, base_dtype_str, shape)`` for subarray fields. All Python
        string-like dtypes (object/bytes/unicode) are normalized to the
        numpy ``object`` dtype string.
    """
    # Note: field name "uid" is always uint64, regardless of given dtype
    dt = n.dtype(dtype)
    if name == "uid":
        return name, n.dtype(n.uint64).str
    elif dt.char in {"O", "S", "U"}:  # all python string object types
        return name, n.dtype(object).str
    elif dt.shape:
        # Subarray dtype (e.g. "3f4"): record base element type + shape
        return name, dt.base.str, dt.shape
    else:
        return name, dt.str


def fielddtype(field: Field) -> DType:
Expand Down

0 comments on commit 8f9489b

Please sign in to comment.