From 392dec6564b271cffdf97abaee9e6619e90be3db Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Mon, 19 Aug 2024 15:58:09 -0700
Subject: [PATCH] Add ability to configure standard datastore cache to be disabled

This allows code to use the normal cache with a disabled configuration
while allowing external environment variables to turn it on. This is not
possible if the code is using an explicit disabled cache manager class.
---
 .../configs/datastores/fileDatastore.yaml |  1 +
 .../daf/butler/datastore/cache_manager.py | 87 ++++++++++++++-----
 tests/test_datastore.py                   | 12 +++
 3 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/python/lsst/daf/butler/configs/datastores/fileDatastore.yaml b/python/lsst/daf/butler/configs/datastores/fileDatastore.yaml
index 59398b5e8f..9c0f7c8813 100644
--- a/python/lsst/daf/butler/configs/datastores/fileDatastore.yaml
+++ b/python/lsst/daf/butler/configs/datastores/fileDatastore.yaml
@@ -23,6 +23,7 @@ datastore:
     # Expiry mode and associated threshold.
     # Options are:
     # - null (no expiry)
+    # - disabled (no caching)
     # - files (threshold is number of files)
     # - datasets (threshold is number of datasets)
     # - size (threshold is size in bytes)
diff --git a/python/lsst/daf/butler/datastore/cache_manager.py b/python/lsst/daf/butler/datastore/cache_manager.py
index 3d4e03a76e..16eeb82c22 100644
--- a/python/lsst/daf/butler/datastore/cache_manager.py
+++ b/python/lsst/daf/butler/datastore/cache_manager.py
@@ -480,6 +480,54 @@ class DatastoreCacheManager(AbstractDatastoreCacheManager):
     def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
         super().__init__(config, universe)

+        # Expiration mode. Read from config but allow override from
+        # the environment.
+        expiration_mode = self.config.get(("expiry", "mode"))
+        threshold = self.config.get(("expiry", "threshold"))
+
+        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
+        if external_mode:
+            if external_mode == "disabled":
+                expiration_mode = external_mode
+                threshold = 0
+            elif "=" in external_mode:
+                expiration_mode, expiration_threshold = external_mode.split("=", 1)
+                threshold = int(expiration_threshold)
+            else:
+                log.warning(
+                    "Unrecognized form (%s) for DAF_BUTLER_CACHE_EXPIRATION_MODE environment variable. "
+                    "Ignoring.",
+                    external_mode,
+                )
+        if expiration_mode is None:
+            # Force to None to avoid confusion.
+            threshold = None
+
+        allowed = ("disabled", "datasets", "age", "size", "files")
+        if expiration_mode and expiration_mode not in allowed:
+            raise ValueError(
+                f"Unrecognized value for cache expiration mode. Got {expiration_mode} but expected "
+                + ",".join(allowed)
+            )
+
+        self._expiration_mode: str | None = expiration_mode
+        self._expiration_threshold: int | None = threshold
+        if self._expiration_threshold is None and self._expiration_mode is not None:
+            raise ValueError(
+                f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
+            )
+
+        # Files in cache, indexed by path within the cache directory.
+        self._cache_entries = CacheRegistry()
+
+        # No need for additional configuration if the cache has been disabled.
+        if self._expiration_mode == "disabled":
+            log.debug("Cache configured in disabled state.")
+            self._cache_directory: ResourcePath | None = ResourcePath(
+                "datastore_cache_disabled", forceAbsolute=False, forceDirectory=True
+            )
+            return
+
         # Set cache directory if it pre-exists, else defer creation until
         # requested. Allow external override from environment.
         root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")
@@ -510,35 +558,12 @@ def __init__(self, config: str | DatastoreCacheManagerConfig, universe: Dimensio
         # Default decision to for whether a dataset should be cached.
         self._caching_default = self.config.get("default", False)

-        # Expiration mode. Read from config but allow override from
-        # the environment.
-        expiration_mode = self.config.get(("expiry", "mode"))
-        threshold = self.config.get(("expiry", "threshold"))
-
-        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
-        if external_mode and "=" in external_mode:
-            expiration_mode, expiration_threshold = external_mode.split("=", 1)
-            threshold = int(expiration_threshold)
-        if expiration_mode is None:
-            # Force to None to avoid confusion.
-            threshold = None
-
-        self._expiration_mode: str | None = expiration_mode
-        self._expiration_threshold: int | None = threshold
-        if self._expiration_threshold is None and self._expiration_mode is not None:
-            raise ValueError(
-                f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
-            )
-
         log.debug(
             "Cache configuration:\n- root: %s\n- expiration mode: %s",
             self._cache_directory if self._cache_directory else "tmpdir",
             f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
         )

-        # Files in cache, indexed by path within the cache directory.
-        self._cache_entries = CacheRegistry()
-
     @property
     def cache_directory(self) -> ResourcePath:
         if self._cache_directory is None:
@@ -625,6 +650,9 @@ def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]:

     def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
         # Docstring inherited
+        if self._expiration_mode == "disabled":
+            return False
+
         matchName: LookupKey | str = f"{entity} (via default)"
         should_cache = self._caching_default

@@ -662,6 +690,8 @@ def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath

     def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
         # Docstring inherited
+        if self._expiration_mode == "disabled":
+            return None
         if not self.should_be_cached(ref):
             return None

@@ -696,6 +726,10 @@ def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | No
     @contextlib.contextmanager
     def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
         # Docstring inherited
+        if self._expiration_mode == "disabled":
+            yield None
+            return
+
         # Short circuit this if the cache directory has not been created yet.
         if self._cache_directory is None:
             yield None
@@ -810,6 +844,9 @@ def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool =

     def scan_cache(self) -> None:
         """Scan the cache directory and record information about files."""
+        if self._expiration_mode == "disabled":
+            return
+
         found = set()
         for file in ResourcePath.findFileResources([self.cache_directory]):
             assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"
@@ -865,6 +902,8 @@ def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
         This method does not force the cache to be re-scanned and so can miss
         cached datasets that have recently been written by other processes.
         """
+        if self._expiration_mode == "disabled":
+            return False
         if self._cache_directory is None:
             return False
         if self.file_count == 0:
@@ -1002,6 +1041,8 @@ def _sort_by_time(key: str) -> datetime.datetime:
         return sorted(self._cache_entries, key=_sort_by_time)

     def __str__(self) -> str:
+        if self._expiration_mode == "disabled":
+            return f"""{type(self).__name__}(disabled)"""
         cachedir = self._cache_directory if self._cache_directory else ""
         return (
             f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
diff --git a/tests/test_datastore.py b/tests/test_datastore.py
index 97b4ce0f7f..dd2171ef46 100644
--- a/tests/test_datastore.py
+++ b/tests/test_datastore.py
@@ -1896,6 +1896,18 @@ def testCacheExpirySize(self) -> None:
         self.assertExpiration(cache_manager, 10, 6)
         self.assertIn(f"{mode}={threshold}", str(cache_manager))

+    def testDisabledCache(self) -> None:
+        threshold = 0
+        mode = "disabled"
+        config_str = self._expiration_config(mode, threshold)
+        cache_manager = self._make_cache_manager(config_str)
+        for uri, ref in zip(self.files, self.refs, strict=True):
+            self.assertFalse(cache_manager.should_be_cached(ref))
+            self.assertIsNone(cache_manager.move_to_cache(uri, ref))
+            self.assertFalse(cache_manager.known_to_cache(ref))
+            with cache_manager.find_in_cache(ref, ".txt") as found:
+                self.assertIsNone(found, msg=f"{cache_manager}")
+
     def assertExpiration(
         self, cache_manager: DatastoreCacheManager, n_datasets: int, n_retained: int
     ) -> None: