From 065bc25ddd4456461ac49b52f9ad759efb334382 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Tue, 12 Dec 2023 18:15:25 +0100 Subject: [PATCH 01/74] Start work on preloading filehandlers First beginning of work to preload filehandlers before files are present. Not much implementation yet, just a skeleton on what it might look like in the YAMLReader. --- satpy/readers/__init__.py | 1 + satpy/readers/yaml_reader.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/satpy/readers/__init__.py b/satpy/readers/__init__.py index c8fc0a8b69..8069895cc8 100644 --- a/satpy/readers/__init__.py +++ b/satpy/readers/__init__.py @@ -645,6 +645,7 @@ def _get_reader_kwargs(reader, reader_kwargs): for (k, v) in reader_kwargs.items(): reader_kwargs_without_filter[k] = v.copy() reader_kwargs_without_filter[k].pop("filter_parameters", None) + reader_kwargs_without_filter[k].pop("preload", None) return (reader_kwargs, reader_kwargs_without_filter) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index ff3599052a..f6c20bd3ca 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1154,8 +1154,21 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): field which will be used if ``expected_segments`` is not defined. This will default to 1 segment. + This reader uses an optional ``preload`` keyword argument that can be + passed to :meth:`Scene.__init__`. This argument is intended for near real + time processing. When only one segment has arrived, the user can create a + scene using this one segment and ``preload=True``, and Satpy will populate + a scene based on all expected segments with dask delayed objects. When + computing the dask graphs, satpy will process each segment as soon as it + comes in, strongly reducing the timeliness for processing a full disc + image. As of Satpy 0.47, this feature is experimental. Use at own risk. 
""" + def __init__(self, *args, **kwargs): + """Initialise object.""" + self.preload = kwargs.pop("preload", False) + super().__init__(*args, **kwargs) + def create_filehandlers(self, filenames, fh_kwargs=None): """Create file handler objects and determine expected segments for each.""" created_fhs = super(GEOSegmentYAMLReader, self).create_filehandlers( @@ -1303,6 +1316,17 @@ def _get_new_areadef_heights(self, previous_area, previous_seg_size, **kwargs): return new_height_proj_coord, new_height_px + def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=None): + for (i, fh) in enumerate(super()._new_filehandler_instances( + filetype_info, filename_items, fh_kwargs=fh_kwargs)): + yield fh + if self.preload: + if i < fh.filetype_info["expected_segments"]: + yield from self._new_preloaded_filehandler_instances() + + def _new_preloaded_filehandler_instances(self): + raise NotImplementedError("Pre-loading not implemented yet") + def _stack_area_defs(area_def_dict): """Stack given dict of area definitions and return a StackedAreaDefinition.""" From d0f4e472b8dac4f58853675a5cfd31f42e8feafd Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 13 Dec 2023 12:43:24 +0100 Subject: [PATCH 02/74] Prepare yaml reader for preloading filehandlers Prepare GeoSegmentYAMLReader for preloading filehandlers for files corresponding to segments that don't exist yet. Early draft implementation that appears to work with limitations. Implementation in GEOSegmentYAMLReader still needs tweaking, tests need to be improved, and the corresponding file handler (for now just FCI) needs to be able to handle it. 
--- satpy/readers/yaml_reader.py | 76 +++++++++++++++++++++++++++++++-- satpy/tests/test_yaml_reader.py | 30 +++++++++++++ 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index f6c20bd3ca..bdd3e48aee 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -33,7 +33,7 @@ import yaml from pyresample.boundary import AreaDefBoundary, Boundary from pyresample.geometry import AreaDefinition, StackedAreaDefinition, SwathDefinition -from trollsift.parser import globify, parse +from trollsift.parser import Parser, globify, parse try: from yaml import CLoader as Loader @@ -1317,15 +1317,83 @@ def _get_new_areadef_heights(self, previous_area, previous_seg_size, **kwargs): return new_height_proj_coord, new_height_px def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=None): + """Get new filehandler instances. + + Gets new filehandler instances, either just for files that exist, or, + if self.preload is True, also for predicted files that don't exist, + as a glob pattern. + """ for (i, fh) in enumerate(super()._new_filehandler_instances( filetype_info, filename_items, fh_kwargs=fh_kwargs)): yield fh if self.preload: if i < fh.filetype_info["expected_segments"]: - yield from self._new_preloaded_filehandler_instances() + yield from self._new_preloaded_filehandler_instances( + filetype_info, fh) + + def _new_preloaded_filehandler_instances(self, filetype_info, fh): + """Get filehandler instances for non-existing files. + + Based on the filehandler for a single existing file, yield filehandler + instances for all files that we expect for a full disc cycle. Those + filehandler instances are usually based on glob patterns rather than on + the explicity filename, which may not be reliably predictable. 
+ """ + if filetype_info.get("requires"): + raise NotImplementedError("Pre-loading not implemented for " + "handlers with required segments.") + filetype_cls = filetype_info["file_reader"] + for (filename, filename_info) in self._predict_filenames(filetype_info, fh): + # FIXME: handle fh_kwargs + yield filetype_cls(filename, filename_info, filetype_info) + + def _predict_filenames(self, filetype_info, fh): + """Predict what filenames or glob patterns we should expect. + + Based on the filehandler for a single existing file, yield strings + for filenames or glob patterns for all files that we expect to make up + a scene in the end. + """ + for i in range(fh.filename_info["count_in_repeat_cycle"]+1, + fh.filetype_info["expected_segments"]): + yield self._predict_filename(fh, i) - def _new_preloaded_filehandler_instances(self): - raise NotImplementedError("Pre-loading not implemented yet") + def _select_pattern(self, filename): + """Choose the matching file pattern. + + Given a filename for a yamlreader with multiple file patterns, + return the matching file pattern. + """ + # FIXME: this should test to match the filename + return self.file_patterns[0] + + def _predict_filename(self, fh, i): + """Predict filename or glob pattern. + + Given a filehandler for an extant file, predict what the filename or + glob pattern will be for segment i. + """ + pat = self._select_pattern(fh.filename) + p = Parser(pat) + new_info = _predict_filename_info(fh, i) + new_filename = p.compose(new_info) + new_filename = new_filename.replace("000000", "??????") + return (new_filename, new_info) + + +def _predict_filename_info(fh, i): + """Predict filename info for file that doesn't exist yet. + + Taking an existing filehandler that refers to a real file with a segmented + reader, return a glob pattern that should match the file for segment number + i. 
+ """ + new_info = fh.filename_info.copy() + new_info[fh.filetype_info["segment_tag"]] = i + for tm in fh.filetype_info["time_tags"]: + new_info[tm] = new_info[tm].replace( + hour=0, minute=0, second=0) + return new_info def _stack_area_defs(area_def_dict): diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 41439a1ac6..5b61ae8327 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1500,3 +1500,33 @@ def test_get_empty_segment_with_height(self): new_height = 140 new_empty_segment = geswh(empty_segment, new_height, dim) assert new_empty_segment is empty_segment + + +def test_predict_filename_info(tmp_path): + """Test prediction of filename info.""" + from satpy.readers.yaml_reader import _predict_filename_info + + fake_filename = os.fspath(tmp_path / "M9G-a-21000101100000-21000101100100-0001.nc") + fake_filename_info = { + "platform": "M9G", + "start_time": datetime(2100, 1, 1, 10, 0, 0), + "end_time": datetime(2100, 1, 1, 10, 1, 0), + "segment": 1} + fake_filetype_info = { + "file_reader": BaseFileHandler, + "file_patterns": [ + "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%H%m%d%H%M%S}-{segment:>04d}", + "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%H%m%d%H%M%S}-{segment:>04d}"], + "expected_segments": 5, + "time_tags": ["start_time", "end_time"], + "segment_tag": "segment"} + fakehandler = BaseFileHandler( + fake_filename, + fake_filename_info, + fake_filetype_info) + info = _predict_filename_info(fakehandler, 3) + assert info == { + "platform": "M9G", + "start_time": datetime(2100, 1, 1, 0, 0, ), + "end_time": datetime(2100, 1, 1, 0, 0, ), + "segment": 3} From 36c71e99b1802b1094f37249f79ee4a38813170a Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 13 Dec 2023 14:47:39 +0100 Subject: [PATCH 03/74] Continue pre-loading implementation in yaml reader Continue the pre-loading implementation in the GEOSegmentedYAMLReader. Add unit tests. 
--- satpy/etc/readers/fci_l1c_nc.yaml | 5 ++ satpy/readers/yaml_reader.py | 27 +++++-- satpy/tests/test_yaml_reader.py | 112 ++++++++++++++++++++++++++---- 3 files changed, 124 insertions(+), 20 deletions(-) diff --git a/satpy/etc/readers/fci_l1c_nc.yaml b/satpy/etc/readers/fci_l1c_nc.yaml index d241b3fa9e..792d7bf4e1 100644 --- a/satpy/etc/readers/fci_l1c_nc.yaml +++ b/satpy/etc/readers/fci_l1c_nc.yaml @@ -64,6 +64,11 @@ file_types: - ir_105 - ir_123 - ir_133 + segment_tag: count_in_repeat_cycle + time_tags: + - processing_time + - start_time + - end_time fci_l1c_hrfi: file_reader: !!python/name:satpy.readers.fci_l1c_nc.FCIL1cNCFileHandler file_patterns: [ '{pflag}_{location_indicator},{data_designator},MTI{spacecraft_id:1d}+{data_source}-1C-RRAD-HRFI-{coverage}-{subsetting}-{component1}-BODY-{component3}-{purpose}-{format}_{oflag}_{originator}_{processing_time:%Y%m%d%H%M%S}_{facility_or_tool}_{environment}_{start_time:%Y%m%d%H%M%S}_{end_time:%Y%m%d%H%M%S}_{processing_mode}_{special_compression}_{disposition_mode}_{repeat_cycle_in_day:>04d}_{count_in_repeat_cycle:>04d}.nc' ] diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index bdd3e48aee..50c2a1ec58 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -21,6 +21,7 @@ import itertools import logging import os +import pathlib import warnings from abc import ABCMeta, abstractmethod from collections import OrderedDict, deque @@ -1161,7 +1162,10 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): a scene based on all expected segments with dask delayed objects. When computing the dask graphs, satpy will process each segment as soon as it comes in, strongly reducing the timeliness for processing a full disc - image. As of Satpy 0.47, this feature is experimental. Use at own risk. + image in near real time. This feature is experimental. Use at your own + risk. + + .. 
versionadded:: 0.46
""" pat = self._select_pattern(fh.filename) p = Parser(pat) new_info = _predict_filename_info(fh, i) new_filename = p.compose(new_info) + basedir = pathlib.Path(fh.filename).parent new_filename = new_filename.replace("000000", "??????") + new_filename = os.fspath(basedir / new_filename) return (new_filename, new_info) diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 5b61ae8327..2177ee950b 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1502,31 +1502,113 @@ def test_get_empty_segment_with_height(self): assert new_empty_segment is empty_segment -def test_predict_filename_info(tmp_path): - """Test prediction of filename info.""" - from satpy.readers.yaml_reader import _predict_filename_info - - fake_filename = os.fspath(tmp_path / "M9G-a-21000101100000-21000101100100-0001.nc") +_fake_filetype_info = { + "file_reader": BaseFileHandler, + "file_patterns": [ + "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc", + "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc"], + "expected_segments": 5, + "time_tags": ["start_time", "end_time"], + "segment_tag": "segment"} + + +def _get_fake_handler(parent, filename): + fake_filename = os.fspath(parent / filename) fake_filename_info = { "platform": "M9G", "start_time": datetime(2100, 1, 1, 10, 0, 0), "end_time": datetime(2100, 1, 1, 10, 1, 0), "segment": 1} - fake_filetype_info = { - "file_reader": BaseFileHandler, - "file_patterns": [ - "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%H%m%d%H%M%S}-{segment:>04d}", - "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%H%m%d%H%M%S}-{segment:>04d}"], - "expected_segments": 5, - "time_tags": ["start_time", "end_time"], - "segment_tag": "segment"} fakehandler = BaseFileHandler( fake_filename, fake_filename_info, - fake_filetype_info) - info = _predict_filename_info(fakehandler, 3) + _fake_filetype_info) + return fakehandler + + +def 
test_predict_filename_info(tmp_path): + """Test prediction of filename info.""" + from satpy.readers.yaml_reader import _predict_filename_info + + fh = _get_fake_handler( + tmp_path, + "M9G-a-21000101100000-21000101100100-0001.nc") + info = _predict_filename_info(fh, 3) assert info == { "platform": "M9G", "start_time": datetime(2100, 1, 1, 0, 0, ), "end_time": datetime(2100, 1, 1, 0, 0, ), "segment": 3} + + +@pytest.fixture() +def fake_gsyreader(): + """Create a fake GeoSegmentYAMLReader.""" + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + return GEOSegmentYAMLReader( + {"reader": { + "name": "alicudi"}, + "file_types": { + "m9g": _fake_filetype_info}}, preload=True) + + +def test_predict_filename(tmp_path, fake_gsyreader): + """Test predicting a filename.""" + fh = _get_fake_handler( + tmp_path, + "M9G-a-21000101100000-21000101100100-0001.nc") + newname = fake_gsyreader._predict_filename(fh, 4) + assert newname[0] == os.fspath(tmp_path / "M9G-a-21000101??????-21000101??????-0004.nc") + fh = _get_fake_handler( + tmp_path, + "M9G-b-21000101100000-21000101100100-0001.nc") + newname = fake_gsyreader._predict_filename(fh, 4) + assert newname[0] == os.fspath(tmp_path / "M9G-b-21000101??????-21000101??????-0004.nc") + + +def test_select_pattern(fake_gsyreader): + """Test selecting the appropriate pattern.""" + assert fake_gsyreader._select_pattern( + "M9G-a-21000101100000-21000101100100-0001.nc") == ( + "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc") + assert fake_gsyreader._select_pattern( + "M9G-b-21000101100000-21000101100100-0001.nc") == ( + "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc") + with pytest.raises(ValueError, match="Cannot predict filenames"): + fake_gsyreader._select_pattern( + "M9G-c-21000101100000-21000101100100-0001.nc") + + +def test_preloaded_instances(tmp_path, fake_gsyreader): + """That that preloaded instances are generated.""" + g = 
fake_gsyreader._new_filehandler_instances( + _fake_filetype_info, + [("M9G-a-21000101053000-21000101053100-0001.nc", + {"platform": "M9G", + "start_time": datetime(2100, 1, 1, 5, 30, ), + "end_time": datetime(2100, 1, 1, 5, 31, ), + "segment": 1})]) + total = list(g) + assert len(total) == 5 + + ffi2 = _fake_filetype_info.copy() + ffi2["requires"] = ["pergola"] + g = fake_gsyreader._new_filehandler_instances( + ffi2, + [("M9G-a-21000101053000-21000101053100-0001.nc", + {"platform": "M9G", + "start_time": datetime(2100, 1, 1, 5, 30, ), + "end_time": datetime(2100, 1, 1, 5, 31, ), + "segment": 1})]) + with pytest.raises(ValueError, match="Failed to create"): + list(g) + + g = fake_gsyreader._new_preloaded_filehandler_instances( + ffi2, + [("M9G-a-21000101053000-21000101053100-0001.nc", + {"platform": "M9G", + "start_time": datetime(2100, 1, 1, 5, 30, ), + "end_time": datetime(2100, 1, 1, 5, 31, ), + "segment": 1})]) + with pytest.raises(NotImplementedError, match="Pre-loading not implemented"): + list(g) From eeb05faa81537b71308b07d5e2d3ddaf75ab1826 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 13 Dec 2023 14:51:02 +0100 Subject: [PATCH 04/74] Don't fail to predict files if unasked Don't raise an error that we can't predict the remaining files if this functionality was not requested. --- satpy/readers/yaml_reader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 50c2a1ec58..e3ea1c77b9 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1331,10 +1331,10 @@ def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=No for (i, fh) in enumerate(super()._new_filehandler_instances( filetype_info, filename_items, fh_kwargs=fh_kwargs)): yield fh - if i == -1: - raise ValueError("Failed to create initial filehandler. 
" - "Cannot predict remaining files.") if self.preload: + if i == -1: + raise ValueError("Failed to create initial filehandler. " + "Cannot predict remaining files.") if i < fh.filetype_info["expected_segments"]: yield from self._new_preloaded_filehandler_instances( filetype_info, fh) From 56575f59542181f4b303ff395110b4241e25c064 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 14 Dec 2023 17:16:05 +0100 Subject: [PATCH 05/74] Progress on Preloadable Add a Preloadable class to netcdf_utils. This so far implements pre-loading filehandlers for to-be-expected files if a single one already exists, taking a defined set of data variables and their attributes from the first segment. Still to be implemented is to take other information from other repeat cycles, by on-disk caching. --- satpy/etc/readers/fci_l1c_nc.yaml | 88 +++++++++++++++++++++---------- satpy/readers/__init__.py | 1 - satpy/readers/fci_l1c_nc.py | 11 ++-- satpy/readers/netcdf_utils.py | 88 +++++++++++++++++++++++++++++++ satpy/readers/yaml_reader.py | 14 ++--- 5 files changed, 164 insertions(+), 38 deletions(-) diff --git a/satpy/etc/readers/fci_l1c_nc.yaml b/satpy/etc/readers/fci_l1c_nc.yaml index 792d7bf4e1..a3beda2dcc 100644 --- a/satpy/etc/readers/fci_l1c_nc.yaml +++ b/satpy/etc/readers/fci_l1c_nc.yaml @@ -19,33 +19,67 @@ file_types: file_patterns: [ '{pflag}_{location_indicator},{data_designator},MTI{spacecraft_id:1d}+{data_source}-1C-RRAD-FDHSI-{coverage}-{subsetting}-{component1}-BODY-{component3}-{purpose}-{format}_{oflag}_{originator}_{processing_time:%Y%m%d%H%M%S}_{facility_or_tool}_{environment}_{start_time:%Y%m%d%H%M%S}_{end_time:%Y%m%d%H%M%S}_{processing_mode}_{special_compression}_{disposition_mode}_{repeat_cycle_in_day:>04d}_{count_in_repeat_cycle:>04d}.nc' ] expected_segments: 40 required_netcdf_variables: - - attr/platform - - data/{channel_name}/measured/start_position_row - - data/{channel_name}/measured/end_position_row - - 
data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_wavenumber - - data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_a - - data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_b - - data/{channel_name}/measured/radiance_to_bt_conversion_constant_c1 - - data/{channel_name}/measured/radiance_to_bt_conversion_constant_c2 - - data/{channel_name}/measured/radiance_unit_conversion_coefficient - - data/{channel_name}/measured/channel_effective_solar_irradiance - - data/{channel_name}/measured/effective_radiance - - data/{channel_name}/measured/x - - data/{channel_name}/measured/y - - data/{channel_name}/measured/pixel_quality - - data/{channel_name}/measured/index_map - - data/mtg_geos_projection - - data/swath_direction - - data/swath_number - - index - - state/celestial/earth_sun_distance - - state/celestial/subsolar_latitude - - state/celestial/subsolar_longitude - - state/celestial/sun_satellite_distance - - state/platform/platform_altitude - - state/platform/subsatellite_latitude - - state/platform/subsatellite_longitude - - time + # key/value; keys are names, value is a list of string on how this may be + # cached between segments or between repeat cycles or neither + attr/platform: + - segment + - rc + data/{channel_name}/measured/start_position_row: + - rc + data/{channel_name}/measured/end_position_row: + - rc + data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_wavenumber: + - segment + - rc + data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_a: + - segment + - rc + data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_b: + - segment + - rc + data/{channel_name}/measured/radiance_to_bt_conversion_constant_c1: + - segment + - rc + data/{channel_name}/measured/radiance_to_bt_conversion_constant_c2: + - segment + - rc + data/{channel_name}/measured/radiance_unit_conversion_coefficient: + - segment + - rc + data/{channel_name}/measured/channel_effective_solar_irradiance: + - 
segment + - rc + data/{channel_name}/measured/effective_radiance: [] + data/{channel_name}/measured/x: + - segment + - rc + data/{channel_name}/measured/y: + - rc + data/{channel_name}/measured/pixel_quality: [] + data/{channel_name}/measured/index_map: [] + data/mtg_geos_projection: + - segment + - rc + data/swath_direction: + - segment + data/swath_number: + - rc + index: [] + state/celestial/earth_sun_distance: + - segment + state/celestial/subsolar_latitude: + - segment + state/celestial/subsolar_longitude: + - segment + state/celestial/sun_satellite_distance: + - segment + state/platform/platform_altitude: + - segment + state/platform/subsatellite_latitude: + - segment + state/platform/subsatellite_longitude: + - segment + time: [] variable_name_replacements: channel_name: - vis_04 diff --git a/satpy/readers/__init__.py b/satpy/readers/__init__.py index 8069895cc8..c8fc0a8b69 100644 --- a/satpy/readers/__init__.py +++ b/satpy/readers/__init__.py @@ -645,7 +645,6 @@ def _get_reader_kwargs(reader, reader_kwargs): for (k, v) in reader_kwargs.items(): reader_kwargs_without_filter[k] = v.copy() reader_kwargs_without_filter[k].pop("filter_parameters", None) - reader_kwargs_without_filter[k].pop("preload", None) return (reader_kwargs, reader_kwargs_without_filter) diff --git a/satpy/readers/fci_l1c_nc.py b/satpy/readers/fci_l1c_nc.py index 0c7b9fb8cc..4c40708672 100644 --- a/satpy/readers/fci_l1c_nc.py +++ b/satpy/readers/fci_l1c_nc.py @@ -124,7 +124,7 @@ from satpy.readers._geos_area import get_geos_area_naming from satpy.readers.eum_base import get_service_mode -from .netcdf_utils import NetCDF4FsspecFileHandler +from .netcdf_utils import NetCDF4FsspecFileHandler, Preloadable logger = logging.getLogger(__name__) @@ -175,7 +175,7 @@ def _get_channel_name_from_dsname(dsname): return channel_name -class FCIL1cNCFileHandler(NetCDF4FsspecFileHandler): +class FCIL1cNCFileHandler(Preloadable, NetCDF4FsspecFileHandler): """Class implementing the MTG FCI L1c Filehandler. 
This class implements the Meteosat Third Generation (MTG) Flexible @@ -200,12 +200,15 @@ class using the :mod:`~satpy.Scene.load` method with the reader "MTI3": "MTG-I3", "MTI4": "MTG-I4"} - def __init__(self, filename, filename_info, filetype_info): + def __init__(self, filename, filename_info, filetype_info, + *args, **kwargs): """Initialize file handler.""" super().__init__(filename, filename_info, filetype_info, + *args, cache_var_size=0, - cache_handle=True) + cache_handle=True, + **kwargs) logger.debug("Reading: {}".format(self.filename)) logger.debug("Start: {}".format(self.start_time)) logger.debug("End: {}".format(self.end_time)) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index cb5c38d1cf..da01bfb779 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -18,8 +18,10 @@ """Helpers for reading netcdf-based files.""" import logging +import time import dask.array as da +import dask.delayed import netCDF4 import numpy as np import xarray as xr @@ -431,3 +433,89 @@ def _get_attr(self, obj, key): if self._use_h5netcdf: return obj.attrs[key] return super()._get_attr(obj, key) + + +class Preloadable: + """Mixin class for pre-loading.""" + def __init__(self, *args, preload=False, ref_fh=None, **kwargs): + """Store attributes needed for preloading to work.""" + self.preload = preload + self.ref_fh = ref_fh + super().__init__(*args, **kwargs) + + def _collect_listed_variables(self, file_handle, listed_variables): + if self.preload: + self._preload_listed_variables(listed_variables) + else: + super()._collect_listed_variables(file_handle, listed_variables) + + def _preload_listed_variables(self, listed_variables): + variable_name_replacements = self.filetype_info.get("variable_name_replacements") + for raw_name in listed_variables: + for subst_name in self._get_required_variable_names( + [raw_name], variable_name_replacements): + if self._can_get_from_other_segment(raw_name): + self.file_content[subst_name] = 
self.ref_fh[subst_name] + self._copy_variable_info(subst_name) +# elif self._can_get_from_other_rc(raw_name): +# LOG.debug(f"Get {subst_name:s} from cached RC") +# raise NotImplementedError("RC-caching not implemented yet") + else: + self._collect_variable_delayed(subst_name) + # FIXME: collect attributes + + def _can_get_from_other_segment(self, itm): + return "segment" in self.filetype_info["required_netcdf_variables"][itm] + + def _can_get_from_other_rc(self, itm): + return "rc" in self.filetype_info["required_netcdf_variables"][itm] + + def _collect_variable_delayed(self, subst_name): + other = self.ref_fh[subst_name] + dade = dask.delayed(_get_delayed_value_from_nc)(self.filename, subst_name) + array = da.from_delayed(dade, shape=other.shape, dtype=other.dtype) + + # FIXME: can we safely reuse dimensions, attrs, and name? + # NO! Vertical extent varies between segments. + var_obj = xr.DataArray( + array, + dims=other.dims, + attrs=other.attrs, + name=other.name) +# self.file_content[subst_name] = var_obj + self._collect_variable_info(subst_name, var_obj) + + def _collect_variable_info(self, var_name, var_obj): + if self.preload: + self.file_content[var_name] = var_obj + self._copy_variable_info(var_name) + else: + super()._collect_variable_info(var_name, var_obj) + + def _copy_variable_info(self, var_name): + # Assume attributes and info can be copied from first segment + # FIXME: shape is NOT valid to copy on some variables, but + # could come from an older repeat cycle + to_copy = [k for k in self.ref_fh.file_content if var_name in k and var_name != k] + for var in to_copy: + self.file_content[var] = self.ref_fh[var] + + def _get_file_handle(self): + if self.preload: + return None + return super()._get_file_handle() + + +def _get_delayed_value_from_nc(fn, var, max_tries=10, wait=1): + LOG.debug(f"Waiting for {fn!s} to appear.") + for _ in range(max_tries): + # this should use glob + try: + nc = netCDF4.Dataset(fn, "r") + except FileNotFoundError: + 
time.sleep(wait) + else: + break + else: + raise TimeoutError("File failed to materialise") + return da.from_array(nc[var]) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index e3ea1c77b9..6367a389e7 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1328,13 +1328,14 @@ def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=No as a glob pattern. """ i = -1 + # on first call, don't pass preload + fh_kwargs_without_preload = fh_kwargs.copy() + fh_kwargs_without_preload.pop("preload", None) for (i, fh) in enumerate(super()._new_filehandler_instances( - filetype_info, filename_items, fh_kwargs=fh_kwargs)): + filetype_info, filename_items, + fh_kwargs=fh_kwargs_without_preload)): yield fh - if self.preload: - if i == -1: - raise ValueError("Failed to create initial filehandler. " - "Cannot predict remaining files.") + if self.preload and i >= 0: if i < fh.filetype_info["expected_segments"]: yield from self._new_preloaded_filehandler_instances( filetype_info, fh) @@ -1353,7 +1354,8 @@ def _new_preloaded_filehandler_instances(self, filetype_info, fh): filetype_cls = filetype_info["file_reader"] for (filename, filename_info) in self._predict_filenames(filetype_info, fh): # FIXME: handle fh_kwargs - yield filetype_cls(filename, filename_info, filetype_info) + yield filetype_cls(filename, filename_info, filetype_info, + preload=True, ref_fh=fh) def _predict_filenames(self, filetype_info, fh): """Predict what filenames or glob patterns we should expect. From 394be48714dc3f7c8f6fc4acd1eb45f3536def5e Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 15 Dec 2023 14:24:36 +0100 Subject: [PATCH 06/74] Add on-disk caching between repeat cycles Cache data variables between repeat cycles. 
--- satpy/etc/readers/fci_l1c_nc.yaml | 3 ++ satpy/readers/__init__.py | 30 ++++++++++++-- satpy/readers/netcdf_utils.py | 65 ++++++++++++++++++++++++------- satpy/readers/yaml_reader.py | 30 +++++++++++--- 4 files changed, 106 insertions(+), 22 deletions(-) diff --git a/satpy/etc/readers/fci_l1c_nc.yaml b/satpy/etc/readers/fci_l1c_nc.yaml index a3beda2dcc..b353e17219 100644 --- a/satpy/etc/readers/fci_l1c_nc.yaml +++ b/satpy/etc/readers/fci_l1c_nc.yaml @@ -98,11 +98,14 @@ file_types: - ir_105 - ir_123 - ir_133 + # information needed for preloading segment_tag: count_in_repeat_cycle time_tags: - processing_time - start_time - end_time + variable_tags: # other tags where RC caching doesn't care about variation + - repeat_cycle_in_day fci_l1c_hrfi: file_reader: !!python/name:satpy.readers.fci_l1c_nc.FCIL1cNCFileHandler file_patterns: [ '{pflag}_{location_indicator},{data_designator},MTI{spacecraft_id:1d}+{data_source}-1C-RRAD-HRFI-{coverage}-{subsetting}-{component1}-BODY-{component3}-{purpose}-{format}_{oflag}_{originator}_{processing_time:%Y%m%d%H%M%S}_{facility_or_tool}_{environment}_{start_time:%Y%m%d%H%M%S}_{end_time:%Y%m%d%H%M%S}_{processing_mode}_{special_compression}_{disposition_mode}_{repeat_cycle_in_day:>04d}_{count_in_repeat_cycle:>04d}.nc' ] diff --git a/satpy/readers/__init__.py b/satpy/readers/__init__.py index c8fc0a8b69..77d0b9c2d6 100644 --- a/satpy/readers/__init__.py +++ b/satpy/readers/__init__.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (c) 2015-2018 Satpy developers +# Copyright (c) 2015-2024 Satpy developers # # This file is part of satpy. 
# @@ -20,6 +18,7 @@ import logging import os +import pathlib import pickle # nosec B403 import warnings from datetime import datetime, timedelta @@ -784,3 +783,28 @@ def open_file_or_filename(unknown_file_thing): except AttributeError: f_obj = unknown_file_thing return f_obj + + +def create_preloadable_cache(reader_name, filenames): + """Create on-disk cache for preloadables. + + Some readers allow on-disk caching of metadata. This can be used to + preload data, creating file handlers and a scene object before data files + are available. This utility function creates the associated cache for + multiple filenames. + + Args: + reader_name (str): + Reader for which to create the cache. + filenames (List[str]): + Files for which to create the cache. Typically, this would be + the same set of files to create a single scene. + """ + reader_instances = load_readers(filenames, reader_name) + for (nm, reader_inst) in reader_instances.items(): + for (tp, handlers) in reader_inst.file_handlers.items(): + for handler in handlers: + filename = reader_inst._get_cache_filename(handler) + p = pathlib.Path(filename) + p.parent.mkdir(exist_ok=True, parents=True) + handler.store_cache(filename) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index da01bfb779..fa588a3ee9 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (c) 2016-2020 Satpy developers +# Copyright (c) 2016-2024 Satpy developers # # This file is part of satpy. # @@ -17,7 +15,9 @@ # satpy. If not, see . 
"""Helpers for reading netcdf-based files.""" +import glob import logging +import pickle # nosec import time import dask.array as da @@ -437,10 +437,11 @@ def _get_attr(self, obj, key): class Preloadable: """Mixin class for pre-loading.""" - def __init__(self, *args, preload=False, ref_fh=None, **kwargs): + def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): """Store attributes needed for preloading to work.""" self.preload = preload self.ref_fh = ref_fh + self.rc_cache = rc_cache super().__init__(*args, **kwargs) def _collect_listed_variables(self, file_handle, listed_variables): @@ -457,9 +458,14 @@ def _preload_listed_variables(self, listed_variables): if self._can_get_from_other_segment(raw_name): self.file_content[subst_name] = self.ref_fh[subst_name] self._copy_variable_info(subst_name) -# elif self._can_get_from_other_rc(raw_name): -# LOG.debug(f"Get {subst_name:s} from cached RC") -# raise NotImplementedError("RC-caching not implemented yet") + elif self._can_get_from_other_rc(raw_name): + try: + self.file_content[subst_name] = self._get_from_other_rc(subst_name) + except OSError as err: + raise OSError("Cannot read repeat cycle cache. " + "Generate using " + "satpy.utils.create_preloadable_cache " + "first. 
") from err else: self._collect_variable_delayed(subst_name) # FIXME: collect attributes @@ -470,6 +476,14 @@ def _can_get_from_other_segment(self, itm): def _can_get_from_other_rc(self, itm): return "rc" in self.filetype_info["required_netcdf_variables"][itm] + def _get_from_other_rc(self, itm): + cache_fn = self.rc_cache + return self._read_var_from_cache(cache_fn, itm) + + def _read_var_from_cache(self, cache_fn, itm): + with open(cache_fn, "rb") as fp: + return pickle.load(fp)[itm] # nosec + def _collect_variable_delayed(self, subst_name): other = self.ref_fh[subst_name] dade = dask.delayed(_get_delayed_value_from_nc)(self.filename, subst_name) @@ -505,17 +519,40 @@ def _get_file_handle(self): return None return super()._get_file_handle() + def store_cache(self, filename=None): + """Store RC-cachable data to cache.""" + if self.preload: + raise ValueError("Cannot store cache with pre-loaded handler") + listed_variables = self.filetype_info.get("required_netcdf_variables") + variable_name_replacements = self.filetype_info.get("variable_name_replacements") + to_store = {} + for raw_name in listed_variables: + for subst_name in self._get_required_variable_names( + [raw_name], variable_name_replacements): + if self._can_get_from_other_rc(raw_name): + try: + obj = self[subst_name].compute() + except AttributeError: # not a dask array + obj = self[subst_name] + to_store[subst_name] = obj + filename = filename or self.rc_cache + LOG.info(f"Storing cache to {filename:s}") + with open(filename, "wb") as fp: + # FIXME: potential security risk? Acceptable? 
+ pickle.dump(to_store, fp) + def _get_delayed_value_from_nc(fn, var, max_tries=10, wait=1): - LOG.debug(f"Waiting for {fn!s} to appear.") + LOG.debug(f"Waiting for {fn!s} to appear to get {var:s}.") for _ in range(max_tries): - # this should use glob - try: - nc = netCDF4.Dataset(fn, "r") - except FileNotFoundError: + fns = glob.glob(fn) + if len(fns) == 0: time.sleep(wait) - else: - break + continue + elif len(fns) > 1: + raise ValueError(f"Expected one matching file, found {len(fns):d}") + break else: raise TimeoutError("File failed to materialise") + nc = netCDF4.Dataset(fns[0], "r") return da.from_array(nc[var]) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 6367a389e7..cd93613996 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (c) 2016-2022 Satpy developers +# Copyright (c) 2016-2024 Satpy developers # # This file is part of satpy. # @@ -17,6 +15,7 @@ # satpy. If not, see . """Base classes and utilities for all readers configured by YAML files.""" +import datetime import glob import itertools import logging @@ -29,6 +28,7 @@ from fnmatch import fnmatch from weakref import WeakValueDictionary +import appdirs import numpy as np import xarray as xr import yaml @@ -1346,7 +1346,7 @@ def _new_preloaded_filehandler_instances(self, filetype_info, fh): Based on the filehandler for a single existing file, yield filehandler instances for all files that we expect for a full disc cycle. Those filehandler instances are usually based on glob patterns rather than on - the explicity filename, which may not be reliably predictable. + the explicit filename, which may not be reliably predictable. 
""" if filetype_info.get("requires"): raise NotImplementedError("Pre-loading not implemented for " @@ -1355,7 +1355,8 @@ def _new_preloaded_filehandler_instances(self, filetype_info, fh): for (filename, filename_info) in self._predict_filenames(filetype_info, fh): # FIXME: handle fh_kwargs yield filetype_cls(filename, filename_info, filetype_info, - preload=True, ref_fh=fh) + preload=True, ref_fh=fh, + rc_cache=self._get_cache_filename(fh)) def _predict_filenames(self, filetype_info, fh): """Predict what filenames or glob patterns we should expect. @@ -1399,6 +1400,25 @@ def _predict_filename(self, fh, i): new_filename = os.fspath(basedir / new_filename) return (new_filename, new_info) + def _get_cache_filename(self, fh): + """Get filename for inter-rc caching.""" + dirname = self._get_cache_dir(fh) + pat = self._select_pattern(fh.filename) + p = Parser(pat) + new_info = fh.filename_info.copy() + for tm in fh.filetype_info["time_tags"]: + new_info[tm] = datetime.datetime.min + for ct in fh.filetype_info["variable_tags"]: + new_info[ct] = 0 + name = p.compose(new_info) + pt = pathlib.Path(name) + pt = pt.with_suffix(".pkl") + return os.path.join(dirname, os.fspath(pt)) + + def _get_cache_dir(self, fh): + return os.path.join(appdirs.user_cache_dir(), "satpy", "preloadable", + type(fh).__name__) + def _predict_filename_info(fh, i): """Predict filename info for file that doesn't exist yet. From 437b0965157ce93a43b4cff3de97ec0dcbe07719 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 15 Dec 2023 18:10:02 +0100 Subject: [PATCH 07/74] More work on repeat-cycle caching Continue working on repeat-cycle caching. 
--- satpy/readers/__init__.py | 5 ++- satpy/readers/netcdf_utils.py | 84 +++++++++++++++-------------------- satpy/readers/yaml_reader.py | 9 ++-- 3 files changed, 45 insertions(+), 53 deletions(-) diff --git a/satpy/readers/__init__.py b/satpy/readers/__init__.py index 77d0b9c2d6..9c5e9afa48 100644 --- a/satpy/readers/__init__.py +++ b/satpy/readers/__init__.py @@ -804,7 +804,10 @@ def create_preloadable_cache(reader_name, filenames): for (nm, reader_inst) in reader_instances.items(): for (tp, handlers) in reader_inst.file_handlers.items(): for handler in handlers: - filename = reader_inst._get_cache_filename(handler) + filename = reader_inst._get_cache_filename( + handler.filename, + handler.filename_info, + handler) p = pathlib.Path(filename) p.parent.mkdir(exist_ok=True, parents=True) handler.store_cache(filename) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index fa588a3ee9..dce20bce15 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -452,29 +452,36 @@ def _collect_listed_variables(self, file_handle, listed_variables): def _preload_listed_variables(self, listed_variables): variable_name_replacements = self.filetype_info.get("variable_name_replacements") + with open(self.rc_cache, "rb") as fp: + self.file_content.update(pickle.load(fp)) # nosec for raw_name in listed_variables: for subst_name in self._get_required_variable_names( [raw_name], variable_name_replacements): if self._can_get_from_other_segment(raw_name): self.file_content[subst_name] = self.ref_fh[subst_name] - self._copy_variable_info(subst_name) - elif self._can_get_from_other_rc(raw_name): - try: - self.file_content[subst_name] = self._get_from_other_rc(subst_name) - except OSError as err: - raise OSError("Cannot read repeat cycle cache. " - "Generate using " - "satpy.utils.create_preloadable_cache " - "first. 
") from err - else: + elif not self._can_get_from_other_rc(raw_name): self._collect_variable_delayed(subst_name) - # FIXME: collect attributes def _can_get_from_other_segment(self, itm): return "segment" in self.filetype_info["required_netcdf_variables"][itm] def _can_get_from_other_rc(self, itm): - return "rc" in self.filetype_info["required_netcdf_variables"][itm] + # it's always safe to store variable attributes, shapes, dtype, dimensions + # between repeat cycles + for meta in ("/attr/", "/shape", "/dtype", "/dimension/", "/dimensions"): + if meta in itm: + return True + # need an inverted mapping so I can tell which variables I can store + listed_variables = self.filetype_info.get("required_netcdf_variables") + variable_name_replacements = self.filetype_info.get("variable_name_replacements") + invmap = {} + for raw_name in listed_variables: + for subst_name in self._get_required_variable_names([raw_name], + variable_name_replacements): + invmap[subst_name] = raw_name + if "rc" in self.filetype_info["required_netcdf_variables"][invmap.get(itm, itm)]: + return True + return False def _get_from_other_rc(self, itm): cache_fn = self.rc_cache @@ -485,34 +492,19 @@ def _read_var_from_cache(self, cache_fn, itm): return pickle.load(fp)[itm] # nosec def _collect_variable_delayed(self, subst_name): - other = self.ref_fh[subst_name] + md = self.ref_fh[subst_name] # some metadata from reference segment dade = dask.delayed(_get_delayed_value_from_nc)(self.filename, subst_name) - array = da.from_delayed(dade, shape=other.shape, dtype=other.dtype) + array = da.from_delayed( + dade, + shape=self[subst_name + "/shape"], # not safe from reference segment! + dtype=md.dtype) - # FIXME: can we safely reuse dimensions, attrs, and name? - # NO! Vertical extent varies between segments. 
var_obj = xr.DataArray( array, - dims=other.dims, - attrs=other.attrs, - name=other.name) -# self.file_content[subst_name] = var_obj - self._collect_variable_info(subst_name, var_obj) - - def _collect_variable_info(self, var_name, var_obj): - if self.preload: - self.file_content[var_name] = var_obj - self._copy_variable_info(var_name) - else: - super()._collect_variable_info(var_name, var_obj) - - def _copy_variable_info(self, var_name): - # Assume attributes and info can be copied from first segment - # FIXME: shape is NOT valid to copy on some variables, but - # could come from an older repeat cycle - to_copy = [k for k in self.ref_fh.file_content if var_name in k and var_name != k] - for var in to_copy: - self.file_content[var] = self.ref_fh[var] + dims=md.dims, # dimension /names/ should be safe + attrs=md.attrs, # FIXME: is this safe? More involved to gather otherwise + name=md.name) + self.file_content[subst_name] = var_obj def _get_file_handle(self): if self.preload: @@ -523,18 +515,14 @@ def store_cache(self, filename=None): """Store RC-cachable data to cache.""" if self.preload: raise ValueError("Cannot store cache with pre-loaded handler") - listed_variables = self.filetype_info.get("required_netcdf_variables") - variable_name_replacements = self.filetype_info.get("variable_name_replacements") to_store = {} - for raw_name in listed_variables: - for subst_name in self._get_required_variable_names( - [raw_name], variable_name_replacements): - if self._can_get_from_other_rc(raw_name): - try: - obj = self[subst_name].compute() - except AttributeError: # not a dask array - obj = self[subst_name] - to_store[subst_name] = obj + for key in self.file_content.keys(): + if self._can_get_from_other_rc(key): + try: + to_store[key] = self[key].compute() + except AttributeError: # not a dask array + to_store[key] = self[key] + filename = filename or self.rc_cache LOG.info(f"Storing cache to {filename:s}") with open(filename, "wb") as fp: @@ -555,4 +543,4 @@ def 
_get_delayed_value_from_nc(fn, var, max_tries=10, wait=1): else: raise TimeoutError("File failed to materialise") nc = netCDF4.Dataset(fns[0], "r") - return da.from_array(nc[var]) + return nc[var][:] diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index cd93613996..e821225a6c 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1356,7 +1356,8 @@ def _new_preloaded_filehandler_instances(self, filetype_info, fh): # FIXME: handle fh_kwargs yield filetype_cls(filename, filename_info, filetype_info, preload=True, ref_fh=fh, - rc_cache=self._get_cache_filename(fh)) + rc_cache=self._get_cache_filename( + filename, filename_info, fh)) def _predict_filenames(self, filetype_info, fh): """Predict what filenames or glob patterns we should expect. @@ -1400,12 +1401,12 @@ def _predict_filename(self, fh, i): new_filename = os.fspath(basedir / new_filename) return (new_filename, new_info) - def _get_cache_filename(self, fh): + def _get_cache_filename(self, filename, filename_info, fh): """Get filename for inter-rc caching.""" dirname = self._get_cache_dir(fh) - pat = self._select_pattern(fh.filename) + pat = self._select_pattern(filename) p = Parser(pat) - new_info = fh.filename_info.copy() + new_info = filename_info.copy() for tm in fh.filetype_info["time_tags"]: new_info[tm] = datetime.datetime.min for ct in fh.filetype_info["variable_tags"]: From c578a1d08fd3ca7987773bae048b2885dc18a557 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 18 Dec 2023 10:00:51 +0100 Subject: [PATCH 08/74] Bump version number for "version added" --- satpy/readers/yaml_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index e821225a6c..662dcbb8da 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1165,7 +1165,7 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): image in near real time. This feature is experimental. 
Use at your own risk. - .. versionadded: 0.46 + .. versionadded: 0.47 """ def __init__(self, *args, **kwargs): From 87010f8d972aef38ed8a88f6803c429d2cfcde69 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 18 Dec 2023 14:29:46 +0100 Subject: [PATCH 09/74] Unset automaskandscale when preloading When preloading, unset automaskandscale. --- satpy/readers/netcdf_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index dce20bce15..00708adef7 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -530,7 +530,7 @@ def store_cache(self, filename=None): pickle.dump(to_store, fp) -def _get_delayed_value_from_nc(fn, var, max_tries=10, wait=1): +def _get_delayed_value_from_nc(fn, var, max_tries=10, wait=1, auto_maskandscale=False): LOG.debug(f"Waiting for {fn!s} to appear to get {var:s}.") for _ in range(max_tries): fns = glob.glob(fn) @@ -543,4 +543,6 @@ def _get_delayed_value_from_nc(fn, var, max_tries=10, wait=1): else: raise TimeoutError("File failed to materialise") nc = netCDF4.Dataset(fns[0], "r") + if hasattr(nc, "set_auto_maskandscale"): + nc.set_auto_maskandscale(auto_maskandscale) return nc[var][:] From 75916ab8c7235ef991c882c65761faa3efe3f934 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 18 Dec 2023 15:46:57 +0100 Subject: [PATCH 10/74] Split delayed loading in waiting and loading Split the delayed loading from file into one delayed function that waits for the file and another that loads a variable from the file. 
--- satpy/readers/netcdf_utils.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 00708adef7..538b7113e9 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -493,7 +493,8 @@ def _read_var_from_cache(self, cache_fn, itm): def _collect_variable_delayed(self, subst_name): md = self.ref_fh[subst_name] # some metadata from reference segment - dade = dask.delayed(_get_delayed_value_from_nc)(self.filename, subst_name) + fn_matched = _wait_for_file(self.filename) + dade = _get_delayed_value_from_nc(fn_matched, subst_name) array = da.from_delayed( dade, shape=self[subst_name + "/shape"], # not safe from reference segment! @@ -530,8 +531,10 @@ def store_cache(self, filename=None): pickle.dump(to_store, fp) -def _get_delayed_value_from_nc(fn, var, max_tries=10, wait=1, auto_maskandscale=False): - LOG.debug(f"Waiting for {fn!s} to appear to get {var:s}.") +@dask.delayed +def _wait_for_file(fn, max_tries=300, wait=2): + """Wait for file to appear.""" + LOG.debug(f"Waiting for {fn!s} to appear.") for _ in range(max_tries): fns = glob.glob(fn) if len(fns) == 0: @@ -539,10 +542,14 @@ def _get_delayed_value_from_nc(fn, var, max_tries=10, wait=1, auto_maskandscale= continue elif len(fns) > 1: raise ValueError(f"Expected one matching file, found {len(fns):d}") - break + return fns[0] else: raise TimeoutError("File failed to materialise") - nc = netCDF4.Dataset(fns[0], "r") - if hasattr(nc, "set_auto_maskandscale"): - nc.set_auto_maskandscale(auto_maskandscale) - return nc[var][:] + + +@dask.delayed +def _get_delayed_value_from_nc(fn, var, max_tries=300, wait=2, auto_maskandscale=False): + with netCDF4.Dataset(fn, "r") as nc: + if hasattr(nc, "set_auto_maskandscale"): + nc.set_auto_maskandscale(auto_maskandscale) + return nc[var][:] From 2d3fbb6f084f60f5a50c40c1a3f7b0f5d307fd74 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 18 Dec 2023 
16:28:54 +0100 Subject: [PATCH 11/74] Add tests for waiting for file Add tests for the functionality to wait for a file to appear in a delayed object. --- satpy/tests/reader_tests/test_netcdf_utils.py | 71 ++++++++++++++++++- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 2d29288784..2765215797 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -17,17 +17,19 @@ # satpy. If not, see . """Module for testing the satpy.readers.netcdf_utils module.""" +import concurrent.futures import os +import time import unittest import numpy as np import pytest try: - from satpy.readers.netcdf_utils import NetCDF4FileHandler + from satpy.readers.netcdf_utils import NetCDF4FileHandler, Preloadable except ImportError: # fake the import so we can at least run the tests in this file - NetCDF4FileHandler = object # type: ignore + NetCDF4FileHandler = Preloadable = object # type: ignore class FakeNetCDF4FileHandler(NetCDF4FileHandler): @@ -293,3 +295,68 @@ def test_use_h5netcdf_for_file_not_accessible_locally(self): fh = NetCDF4FsspecFileHandler(fname, {}, {}) h5_file.assert_called_once() assert fh._use_h5netcdf + + +class FakePreloadableHandler(Preloadable, FakeNetCDF4FileHandler): + """Fake preloadable handler.""" + + +class TestPreloadableHandler: + """Test functionality related to preloading.""" + + def test_wait_for_file_already_exists(self, tmp_path): + """Test case where file already exists.""" + from satpy.readers.netcdf_utils import _wait_for_file + fn = tmp_path / "file1" + fn.parent.mkdir(exist_ok=True, parents=True) + fn.touch() + waiter = _wait_for_file(os.fspath(fn), max_tries=1, wait=0) + assert waiter.compute() == os.fspath(fn) + + def test_wait_for_file_appears(self, tmp_path): + """Test case where file appears after a bit.""" + from satpy.readers.netcdf_utils import _wait_for_file + def 
_wait_and_create(path): + time.sleep(0.1) + path.touch() + fn = tmp_path / "file2" + waiter = _wait_for_file(os.fspath(fn), max_tries=3, wait=0.05) + with concurrent.futures.ThreadPoolExecutor() as executor: + executor.submit(_wait_and_create, fn) + t1 = time.time() + res = waiter.compute() + t2 = time.time() + assert res == os.fspath(fn) + assert t2 - t1 > 0.05 + + def test_wait_for_file_not_appears(self, tmp_path): + """Test case where file fails to appear.""" + from satpy.readers.netcdf_utils import _wait_for_file + fn = tmp_path / "file3" + waiter = _wait_for_file(os.fspath(fn), max_tries=1, wait=0) + with pytest.raises(TimeoutError): + waiter.compute() + + def test_wait_for_file_multiple_appear(self, tmp_path): + """Test case where file fails to appear.""" + from satpy.readers.netcdf_utils import _wait_for_file + fn = tmp_path / "file?" + waiter = _wait_for_file(os.fspath(fn), max_tries=1, wait=0) + with pytest.raises(TimeoutError): + waiter.compute() + + def test_wait_for_file_multiple_appears(self, tmp_path): + """Test case where file appears after a bit.""" + from satpy.readers.netcdf_utils import _wait_for_file + def _wait_and_create(path1, path2): + path1.touch() + path2.touch() + fn4 = tmp_path / "file4" + fn5 = tmp_path / "file5" + pat = os.fspath(tmp_path / "file?") + waiter = _wait_for_file(pat, max_tries=2, wait=0.1) + with concurrent.futures.ThreadPoolExecutor() as executor: + executor.submit(_wait_and_create, fn4, fn5) + with pytest.raises(ValueError, + match="Expected one matching file, found 2"): + waiter.compute() From 1e1cb21dbf53b91e72df4dbb9225aea2a4fedbd7 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 18 Dec 2023 16:38:45 +0100 Subject: [PATCH 12/74] Add test method for getting delayed from a file Add a test method to test getting a delayed value from a (delayed) file --- satpy/tests/reader_tests/test_netcdf_utils.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git 
a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 2765215797..97ed917a5e 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -24,6 +24,7 @@ import numpy as np import pytest +import xarray as xr try: from satpy.readers.netcdf_utils import NetCDF4FileHandler, Preloadable @@ -304,7 +305,8 @@ class FakePreloadableHandler(Preloadable, FakeNetCDF4FileHandler): class TestPreloadableHandler: """Test functionality related to preloading.""" - def test_wait_for_file_already_exists(self, tmp_path): + @staticmethod + def test_wait_for_file_already_exists(tmp_path): """Test case where file already exists.""" from satpy.readers.netcdf_utils import _wait_for_file fn = tmp_path / "file1" @@ -313,7 +315,8 @@ def test_wait_for_file_already_exists(self, tmp_path): waiter = _wait_for_file(os.fspath(fn), max_tries=1, wait=0) assert waiter.compute() == os.fspath(fn) - def test_wait_for_file_appears(self, tmp_path): + @staticmethod + def test_wait_for_file_appears(tmp_path): """Test case where file appears after a bit.""" from satpy.readers.netcdf_utils import _wait_for_file def _wait_and_create(path): @@ -329,7 +332,8 @@ def _wait_and_create(path): assert res == os.fspath(fn) assert t2 - t1 > 0.05 - def test_wait_for_file_not_appears(self, tmp_path): + @staticmethod + def test_wait_for_file_not_appears(tmp_path): """Test case where file fails to appear.""" from satpy.readers.netcdf_utils import _wait_for_file fn = tmp_path / "file3" @@ -337,7 +341,8 @@ def test_wait_for_file_not_appears(self, tmp_path): with pytest.raises(TimeoutError): waiter.compute() - def test_wait_for_file_multiple_appear(self, tmp_path): + @staticmethod + def test_wait_for_file_multiple_appear(tmp_path): """Test case where file fails to appear.""" from satpy.readers.netcdf_utils import _wait_for_file fn = tmp_path / "file?" 
@@ -345,7 +350,8 @@ def test_wait_for_file_multiple_appear(self, tmp_path): with pytest.raises(TimeoutError): waiter.compute() - def test_wait_for_file_multiple_appears(self, tmp_path): + @staticmethod + def test_wait_for_file_multiple_appears(tmp_path): """Test case where file appears after a bit.""" from satpy.readers.netcdf_utils import _wait_for_file def _wait_and_create(path1, path2): @@ -360,3 +366,12 @@ def _wait_and_create(path1, path2): with pytest.raises(ValueError, match="Expected one matching file, found 2"): waiter.compute() + + def test_get_delayed_from_file(self, tmp_path): + """Get getting a value from a file (delayed).""" + from satpy.readers.netcdf_utils import _get_delayed_value_from_nc + ncname = tmp_path / "croatia.nc" + ds = xr.Dataset({"rijeka": (("y", "x"), np.zeros((3, 3)))}) + ds.to_netcdf(ncname) + var_del = _get_delayed_value_from_nc(ncname, "rijeka") + var_del.compute() From 19c8b3ced409fd2ab17f7ef8befe9daba71872f5 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 18 Dec 2023 18:12:12 +0100 Subject: [PATCH 13/74] Progress on tests for pre-loading Progress on unit tests for the purposes of delayed loading. --- satpy/readers/netcdf_utils.py | 44 ++++++++++++- satpy/tests/reader_tests/test_netcdf_utils.py | 66 +++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 538b7113e9..39ad62d4df 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -436,7 +436,42 @@ def _get_attr(self, obj, key): class Preloadable: - """Mixin class for pre-loading.""" + """Mixin class for pre-loading. + + This is a mixin class designed to use with file handlers that support + preloading. Subclasses deriving from this mixin are expected to be + geo-segmentable file handlers, and can be created based on a glob pattern + for a non-existing file (as well as a path or glob pattern to an existing + file).
The first segment of a repeat cycle is always processed only after + the file exists. For the remaining segments, metadata are collected as + much as possible before the file exists, from two possible sources: + + - From the first segment. For some file formats, many pieces of metadata + can be expected to be identical between segments. + - From the same segment number for a previous repeat cycle. In this case, + caching is done on disk. + + To implement a filehandler using this, make sure it derives from both + :class:`NetCDF4FileHandler` and this class, and make sure to pass keyword + arguments in `__init__` to the superclass. In the YAML file, + ``required_netcdf_variables`` must be defined as a dictionary. The keys + are variable names, and values are a list of strings describing what + assumptions can be made about caching, where ``"rc"`` means the value is + expected to be constant between repeat cycles, and ``"segment"`` means it is + expected to be constant between the segments within a repeat cycle. For + variables that are actually variable, define the empty list. + + Attributes (variable, global, and group), shapes, dtypes, and dimension names + are assumed to be always shareable between repeat cycles. + + To use preloading, pass ``preload=True`` to ``reader_kwargs`` when creating + the Scene. + + This feature is experimental. 
+ + ..versionadded: 0.47 + """ + def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): """Store attributes needed for preloading to work.""" self.preload = preload @@ -445,12 +480,14 @@ def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): super().__init__(*args, **kwargs) def _collect_listed_variables(self, file_handle, listed_variables): + """Collect listed variables, either preloaded or regular.""" if self.preload: self._preload_listed_variables(listed_variables) else: super()._collect_listed_variables(file_handle, listed_variables) def _preload_listed_variables(self, listed_variables): + """Preload listed variables, either from RC cache or segment cache.""" variable_name_replacements = self.filetype_info.get("variable_name_replacements") with open(self.rc_cache, "rb") as fp: self.file_content.update(pickle.load(fp)) # nosec @@ -463,9 +500,11 @@ def _preload_listed_variables(self, listed_variables): self._collect_variable_delayed(subst_name) def _can_get_from_other_segment(self, itm): + """Return true if variable is segment-cachable.""" return "segment" in self.filetype_info["required_netcdf_variables"][itm] def _can_get_from_other_rc(self, itm): + """Return true if variable is rc-cachable.""" # it's always safe to store variable attributes, shapes, dtype, dimensions # between repeat cycles for meta in ("/attr/", "/shape", "/dtype", "/dimension/", "/dimensions"): @@ -484,6 +523,7 @@ def _can_get_from_other_rc(self, itm): return False def _get_from_other_rc(self, itm): + """Obtain variable from repeat cycle cache.""" cache_fn = self.rc_cache return self._read_var_from_cache(cache_fn, itm) @@ -516,6 +556,8 @@ def store_cache(self, filename=None): """Store RC-cachable data to cache.""" if self.preload: raise ValueError("Cannot store cache with pre-loaded handler") + if "required_netcdf_variables" not in self.filetype_info: + raise ValueError("Caching needs required_netcdf_variables, but none is defined.") to_store 
= {} for key in self.file_content.keys(): if self._can_get_from_other_rc(key): diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 97ed917a5e..4937fe3251 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -19,9 +19,11 @@ import concurrent.futures import os +import pickle import time import unittest +import dask.array as da import numpy as np import pytest import xarray as xr @@ -301,6 +303,10 @@ def test_use_h5netcdf_for_file_not_accessible_locally(self): class FakePreloadableHandler(Preloadable, FakeNetCDF4FileHandler): """Fake preloadable handler.""" + def get_test_content(self, filename, filename_info, filetype_info): + """Get fake test content.""" + return {} + class TestPreloadableHandler: """Test functionality related to preloading.""" @@ -375,3 +381,63 @@ def test_get_delayed_from_file(self, tmp_path): ds.to_netcdf(ncname) var_del = _get_delayed_value_from_nc(ncname, "rijeka") var_del.compute() + + @pytest.fixture() + def preloadable_false(self): + """Return a preloadable filehandler with preload=False.""" + handler = FakePreloadableHandler( + "dummy", + {}, + {"required_netcdf_variables": { + "/iceland/reykjavík": ["rc"], + "/iceland/bolungarvík": ["rc"]}}, + preload=False) + return handler + + @pytest.fixture() + def preloadable_true(self, preloadable_false): + """Return a preloadable filehandler with preload=True.""" + handler = FakePreloadableHandler( + "dummy", + {}, + {"required_netcdf_variables": { + "/iceland/reykjavík": ["rc"], + "/iceland/bolungarvík": ["segment"], + "/iceland/{volcano}/lava": ["rc"], + "/iceland/{volcano}/magma": [], + }, + "variable_name_replacements": { + "volcano": ["eyjafjallajökull", "bardarbunga"] + }}, + preload=True, + ref_fh=preloadable_false) + return handler + + def test_store_cache(self, tmp_path, preloadable_false): + """Test that cache is stored as expected.""" + cf = os.fspath(tmp_path / 
"test.pkl") + preloadable_false.file_content["/iceland/reykjavík"] = da.from_array([0, 1, 2]) + preloadable_false.file_content["/iceland/bolungarvík"] = "012" + preloadable_false.store_cache(cf) + dt = pickle.load(open(cf, mode="rb")) + assert isinstance(dt["/iceland/reykjavík"], np.ndarray) # calculated + np.testing.assert_array_equal(dt["/iceland/reykjavík"], np.array([0, 1, 2])) + assert dt["/iceland/bolungarvík"] == "012" + + def test_can_get_from_other_segment(self, preloadable_true): + """Test if segment-cachable check works correctly.""" + assert preloadable_true._can_get_from_other_segment("/iceland/bolungarvík") + assert not preloadable_true._can_get_from_other_segment("/iceland/reykjavík") + + def test_can_get_from_other_rc(self, preloadable_true): + """Test if rc-cachable check works correctly.""" + assert not preloadable_true._can_get_from_other_rc("/iceland/bolungarvík") + assert preloadable_true._can_get_from_other_rc("/iceland/reykjavík") + assert preloadable_true._can_get_from_other_rc("/iceland/bolungarvík/shape") + assert not preloadable_true._can_get_from_other_rc("/iceland/eyjafjallajökull/magma") + assert preloadable_true._can_get_from_other_rc("/iceland/eyjafjallajökull/lava") + assert preloadable_true._can_get_from_other_rc("/iceland/bardarbunga/lava") + assert preloadable_true._can_get_from_other_rc("/iceland/eyjafjallajökull/magma/dtype") + + def test_get_from_other_segment(self, preloadable_true): + """Test that loading from a cached segment works.""" From 875dcce693b94cf8c4b04c87a410e274fc2692d7 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 18 Dec 2023 18:17:10 +0100 Subject: [PATCH 14/74] Update header in FCI source file --- satpy/readers/fci_l1c_nc.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/satpy/readers/fci_l1c_nc.py b/satpy/readers/fci_l1c_nc.py index 4c40708672..43724ca14b 100644 --- a/satpy/readers/fci_l1c_nc.py +++ b/satpy/readers/fci_l1c_nc.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python -# 
-*- coding: utf-8 -*- -# Copyright (c) 2017-2019 Satpy developers +# Copyright (c) 2017-2024 Satpy developers # # This file is part of satpy. # @@ -19,12 +17,10 @@ This module defines the :class:`FCIL1cNCFileHandler` file handler, to be used for reading Meteosat Third Generation (MTG) Flexible Combined -Imager (FCI) Level-1c data. FCI will fly -on the MTG Imager (MTG-I) series of satellites, with the first satellite (MTG-I1) -scheduled to be launched on the 13th of December 2022. -For more information about FCI, see `EUMETSAT`_. +Imager (FCI) Level-1c data. FCI flies on the MTG Imager (MTG-I) series +of satellites, with the first satellite (MTG-I1) launched on the 13th +of December 2022. For more information about FCI, see `EUMETSAT`_. -For simulated test data to be used with this reader, see `test data releases`_. For the Product User Guide (PUG) of the FCI L1c data, see `PUG`_. .. note:: From 1be2c79df803b5d59a27f2493bed9bc3dcd460f8 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Tue, 19 Dec 2023 16:09:28 +0100 Subject: [PATCH 15/74] Add tests and cleanup implementation Add more tests for the preloadable mixin in netcdf_utils. Cleanup some unused code and add checks/guards in this mixin. 
--- satpy/readers/netcdf_utils.py | 25 ++-- satpy/tests/reader_tests/test_netcdf_utils.py | 134 ++++++++++++++---- 2 files changed, 121 insertions(+), 38 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 39ad62d4df..4947457be6 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -17,6 +17,7 @@ import glob import logging +import os import pickle # nosec import time @@ -475,8 +476,17 @@ class Preloadable: def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): """Store attributes needed for preloading to work.""" self.preload = preload - self.ref_fh = ref_fh - self.rc_cache = rc_cache + if preload: + if not isinstance(ref_fh, BaseFileHandler): + raise TypeError( + "Expect reference filehandler when preloading, got " + f"{type(ref_fh)!s}") + self.ref_fh = ref_fh + if not isinstance(rc_cache, (str, bytes, os.PathLike)): + raise TypeError( + "Expect cache file when preloading, got " + f"{type(rc_cache)!s}") + self.rc_cache = rc_cache super().__init__(*args, **kwargs) def _collect_listed_variables(self, file_handle, listed_variables): @@ -522,15 +532,6 @@ def _can_get_from_other_rc(self, itm): return True return False - def _get_from_other_rc(self, itm): - """Obtain variable from repeat cycle cache.""" - cache_fn = self.rc_cache - return self._read_var_from_cache(cache_fn, itm) - - def _read_var_from_cache(self, cache_fn, itm): - with open(cache_fn, "rb") as fp: - return pickle.load(fp)[itm] # nosec - def _collect_variable_delayed(self, subst_name): md = self.ref_fh[subst_name] # some metadata from reference segment fn_matched = _wait_for_file(self.filename) @@ -567,7 +568,7 @@ def store_cache(self, filename=None): to_store[key] = self[key] filename = filename or self.rc_cache - LOG.info(f"Storing cache to {filename:s}") + LOG.info(f"Storing cache to {filename!s}") with open(filename, "wb") as fp: # FIXME: potential security risk? Acceptable? 
pickle.dump(to_store, fp) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 4937fe3251..e48e9b4e7f 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -303,6 +303,13 @@ def test_use_h5netcdf_for_file_not_accessible_locally(self): class FakePreloadableHandler(Preloadable, FakeNetCDF4FileHandler): """Fake preloadable handler.""" + def __init__(self, filename, filename_info, filetype_info, **kwargs): + """Initialise the fake preloadable handler.""" + super().__init__(filename, filename_info, filetype_info, **kwargs) + if self.preload: + self._collect_listed_variables(None, + filetype_info.get("required_netcdf_variables")) + def get_test_content(self, filename, filename_info, filetype_info): """Get fake test content.""" return {} @@ -383,46 +390,87 @@ def test_get_delayed_from_file(self, tmp_path): var_del.compute() @pytest.fixture() - def preloadable_false(self): + def fake_config(self): + """Get a fake required net vars configuration.""" + return {"required_netcdf_variables": { + "/iceland/reykjavík": ["rc"], + "/iceland/bolungarvík": ["segment"], + "/iceland/{volcano}/lava": ["rc"], + "/iceland/{volcano}/crater": ["segment"], + "/iceland/{volcano}/magma": [], + }, + "variable_name_replacements": { + "volcano": ["eyjafjallajökull", "sýlingarfell"] + }} + + @pytest.fixture() + def preloadable_false(self, fake_config, tmp_path): """Return a preloadable filehandler with preload=False.""" + ds = xr.Dataset() + ds.to_netcdf(tmp_path / "grimsvötn.nc") handler = FakePreloadableHandler( - "dummy", + os.fspath(tmp_path / "grimsvötn.nc"), {}, - {"required_netcdf_variables": { - "/iceland/reykjavík": ["rc"], - "/iceland/bolungarvík": ["rc"]}}, + fake_config, preload=False) + handler.file_content["/iceland/reykjavík"] = xr.DataArray( + da.from_array([[0, 1, 2]])) + handler.file_content["/iceland/bolungarvík"] = "012" + 
handler.file_content["/iceland/eyjafjallajökull/lava"] = xr.DataArray( + da.from_array([[1, 2, 3]])) + handler.file_content["/iceland/eyjafjallajökull/crater"] = xr.DataArray( + da.from_array([[2, 2, 3]])) + handler.file_content["/iceland/eyjafjallajökull/magma"] = xr.DataArray( + da.from_array([[3, 2, 3]])) + handler.file_content["/iceland/eyjafjallajökull/magma/shape"] = (1, 3) + handler.file_content["/iceland/sýlingarfell/lava"] = xr.DataArray( + da.from_array([[4, 2, 3]])) + handler.file_content["/iceland/sýlingarfell/crater"] = xr.DataArray( + da.from_array([[5, 2, 3]])) + handler.file_content["/iceland/sýlingarfell/magma"] = xr.DataArray( + da.from_array([[6, 2, 3]])) + handler.file_content["/iceland/sýlingarfell/magma/shape"] = (1, 3) return handler @pytest.fixture() - def preloadable_true(self, preloadable_false): + def cache_file(self, preloadable_false, tmp_path): + """Define a cache file.""" + cf = os.fspath(tmp_path / "test.pkl") + preloadable_false.store_cache(cf) + return cf + + @pytest.fixture() + def preloadable_true(self, preloadable_false, tmp_path, cache_file, + fake_config): """Return a preloadable filehandler with preload=True.""" handler = FakePreloadableHandler( "dummy", {}, - {"required_netcdf_variables": { - "/iceland/reykjavík": ["rc"], - "/iceland/bolungarvík": ["segment"], - "/iceland/{volcano}/lava": ["rc"], - "/iceland/{volcano}/magma": [], - }, - "variable_name_replacements": { - "volcano": ["eyjafjallajökull", "bardarbunga"] - }}, + fake_config, preload=True, - ref_fh=preloadable_false) + ref_fh=preloadable_false, + rc_cache=cache_file) return handler - def test_store_cache(self, tmp_path, preloadable_false): + def test_store_and_load_cache(self, tmp_path, preloadable_false, + cache_file, fake_config): """Test that cache is stored as expected.""" - cf = os.fspath(tmp_path / "test.pkl") - preloadable_false.file_content["/iceland/reykjavík"] = da.from_array([0, 1, 2]) - preloadable_false.file_content["/iceland/bolungarvík"] = "012" - 
preloadable_false.store_cache(cf) - dt = pickle.load(open(cf, mode="rb")) - assert isinstance(dt["/iceland/reykjavík"], np.ndarray) # calculated - np.testing.assert_array_equal(dt["/iceland/reykjavík"], np.array([0, 1, 2])) - assert dt["/iceland/bolungarvík"] == "012" + dt = pickle.load(open(cache_file, mode="rb")) + assert isinstance(dt["/iceland/reykjavík"].data, np.ndarray) # calculated + np.testing.assert_array_equal(dt["/iceland/reykjavík"], + np.array([[0, 1, 2]])) + # segment-only should not be stored + assert "/iceland/bolungarvík" not in dt + # and test loading + handler = FakePreloadableHandler( + "dummy", + {}, + fake_config, + preload=True, + ref_fh=preloadable_false, + rc_cache=cache_file) + np.testing.assert_array_equal(handler["/iceland/reykjavík"], + np.array([[0, 1, 2]])) def test_can_get_from_other_segment(self, preloadable_true): """Test if segment-cachable check works correctly.""" @@ -436,8 +484,42 @@ def test_can_get_from_other_rc(self, preloadable_true): assert preloadable_true._can_get_from_other_rc("/iceland/bolungarvík/shape") assert not preloadable_true._can_get_from_other_rc("/iceland/eyjafjallajökull/magma") assert preloadable_true._can_get_from_other_rc("/iceland/eyjafjallajökull/lava") - assert preloadable_true._can_get_from_other_rc("/iceland/bardarbunga/lava") + assert preloadable_true._can_get_from_other_rc("/iceland/sýlingarfell/lava") assert preloadable_true._can_get_from_other_rc("/iceland/eyjafjallajökull/magma/dtype") def test_get_from_other_segment(self, preloadable_true): """Test that loading from a cached segment works.""" + assert preloadable_true["/iceland/bolungarvík"] == "012" + + def test_get_from_other_rc(self, preloadable_true): + """Test that loading from a cached repeat cycle works.""" + assert preloadable_true["/iceland/bolungarvík"] == "012" + + def test_incorrect_usage(self, fake_config, preloadable_false, cache_file, + tmp_path): + """Test exceptions raised when usage is incorrect.""" + with 
pytest.raises(TypeError, match="Expect reference filehandler"): + FakePreloadableHandler("dummy", {}, fake_config, preload=True, + rc_cache=cache_file) + with pytest.raises(TypeError, match="Expect cache file"): + FakePreloadableHandler("dummy", {}, fake_config, preload=True, + ref_fh=preloadable_false) + fph = FakePreloadableHandler("dummy", {}, fake_config, preload=True, + rc_cache=cache_file, ref_fh=preloadable_false) + with pytest.raises(ValueError, match="Cannot store cache with pre-loaded handler"): + fph.store_cache(tmp_path / "nowhere-special.pkl") + fc2 = fake_config.copy() + del fc2["required_netcdf_variables"] + fph = FakePreloadableHandler("dummy", {}, fc2, preload=False, + rc_cache=cache_file) + with pytest.raises(ValueError, match="Caching needs required_netcdf_variables"): + fph.store_cache(tmp_path / "nowhere-special.pkl") + + def test_get_file_handle(self, preloadable_true, preloadable_false): + """Test getting the file handle.""" + assert preloadable_true._get_file_handle() is None + assert preloadable_false._get_file_handle() is not None + + def test_collect_vars(self, tmp_path, fake_config, preloadable_false): + """Test collecting variables is delegated.""" + preloadable_false._collect_listed_variables(None, {}) From c1a9cf4c90d66afcc4ab46624d057397f782504a Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Tue, 19 Dec 2023 17:22:29 +0100 Subject: [PATCH 16/74] Improve yaml reader tests Improve tests for the yaml reader in case of preloading file handlers. Tests fail. 
--- satpy/etc/readers/fci_l1c_nc.yaml | 37 ++++-------------- satpy/readers/yaml_reader.py | 2 + satpy/tests/test_yaml_reader.py | 64 +++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 30 deletions(-) diff --git a/satpy/etc/readers/fci_l1c_nc.yaml b/satpy/etc/readers/fci_l1c_nc.yaml index b353e17219..06349bbb93 100644 --- a/satpy/etc/readers/fci_l1c_nc.yaml +++ b/satpy/etc/readers/fci_l1c_nc.yaml @@ -18,7 +18,7 @@ file_types: file_reader: !!python/name:satpy.readers.fci_l1c_nc.FCIL1cNCFileHandler file_patterns: [ '{pflag}_{location_indicator},{data_designator},MTI{spacecraft_id:1d}+{data_source}-1C-RRAD-FDHSI-{coverage}-{subsetting}-{component1}-BODY-{component3}-{purpose}-{format}_{oflag}_{originator}_{processing_time:%Y%m%d%H%M%S}_{facility_or_tool}_{environment}_{start_time:%Y%m%d%H%M%S}_{end_time:%Y%m%d%H%M%S}_{processing_mode}_{special_compression}_{disposition_mode}_{repeat_cycle_in_day:>04d}_{count_in_repeat_cycle:>04d}.nc' ] expected_segments: 40 - required_netcdf_variables: + required_netcdf_variables: &req # key/value; keys are names, value is a list of string on how this may be # cached between segments or between repeat cycles or neither attr/platform: @@ -100,7 +100,7 @@ file_types: - ir_133 # information needed for preloading segment_tag: count_in_repeat_cycle - time_tags: + time_tags: &tt - processing_time - start_time - end_time @@ -110,34 +110,11 @@ file_types: file_reader: !!python/name:satpy.readers.fci_l1c_nc.FCIL1cNCFileHandler file_patterns: [ '{pflag}_{location_indicator},{data_designator},MTI{spacecraft_id:1d}+{data_source}-1C-RRAD-HRFI-{coverage}-{subsetting}-{component1}-BODY-{component3}-{purpose}-{format}_{oflag}_{originator}_{processing_time:%Y%m%d%H%M%S}_{facility_or_tool}_{environment}_{start_time:%Y%m%d%H%M%S}_{end_time:%Y%m%d%H%M%S}_{processing_mode}_{special_compression}_{disposition_mode}_{repeat_cycle_in_day:>04d}_{count_in_repeat_cycle:>04d}.nc' ] expected_segments: 40 - required_netcdf_variables: - - 
attr/platform - - data/{channel_name}/measured/start_position_row - - data/{channel_name}/measured/end_position_row - - data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_wavenumber - - data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_a - - data/{channel_name}/measured/radiance_to_bt_conversion_coefficient_b - - data/{channel_name}/measured/radiance_to_bt_conversion_constant_c1 - - data/{channel_name}/measured/radiance_to_bt_conversion_constant_c2 - - data/{channel_name}/measured/radiance_unit_conversion_coefficient - - data/{channel_name}/measured/channel_effective_solar_irradiance - - data/{channel_name}/measured/effective_radiance - - data/{channel_name}/measured/x - - data/{channel_name}/measured/y - - data/{channel_name}/measured/pixel_quality - - data/{channel_name}/measured/index_map - - data/mtg_geos_projection - - data/swath_direction - - data/swath_number - - index - - state/celestial/earth_sun_distance - - state/celestial/subsolar_latitude - - state/celestial/subsolar_longitude - - state/celestial/sun_satellite_distance - - state/platform/platform_altitude - - state/platform/subsatellite_latitude - - state/platform/subsatellite_longitude - - time + required_netcdf_variables: *req + segment_tag: count_in_repeat_cycle + time_tags: *tt + variable_tags: + - repeat_cycle_in_day variable_name_replacements: channel_name: - vis_06_hr diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 662dcbb8da..91c7ed9f52 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1327,6 +1327,8 @@ def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=No if self.preload is True, also for predicted files that don't exist, as a glob pattern. 
""" + if fh_kwargs is None: + fh_kwargs = {} i = -1 # on first call, don't pass preload fh_kwargs_without_preload = fh_kwargs.copy() diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 2177ee950b..67bf0c6d3c 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1612,3 +1612,67 @@ def test_preloaded_instances(tmp_path, fake_gsyreader): "segment": 1})]) with pytest.raises(NotImplementedError, match="Pre-loading not implemented"): list(g) + + +def test_get_cache_filename(tmp_path): + """Test getting cache filename.""" + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + + fn = tmp_path / "a-01.nc" + fn_info = {"segment": 1} + ft_info_simple = { + "file_reader": BaseFileHandler, + "file_patterns": ["a-{segment:>02d}.nc"], + "segment_tag": "segment", + "expected_segments": 5} + + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "filicudi"}, + "file_types": { + "m9g": ft_info_simple}}, preload=True) + fh = _get_fake_handler(fn, os.fspath(fn)) + + with unittest.mock.patch("appdirs.user_cache_dir") as au: + au.return_value = os.fspath(tmp_path / "cache") + cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) + assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / + "BaseFileHandler" / "a-01.pkz") + + fn = tmp_path / "a-04-01.nc" + fn_info = {"rc": 4, "segment": 1} + ft_info = ft_info_simple.copy() + ft_info["file_patterns"] = ["a-{rc:>02d}-{segment:>02d}.nc"] + + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "salina"}, + "file_types": { + "m9g": ft_info}}, preload=True) + fh = _get_fake_handler(fn, os.fspath(fn)) + + with unittest.mock.patch("appdirs.user_cache_dir") as au: + au.return_value = os.fspath(tmp_path / "cache") + cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) + assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / + "BaseFileHandler" / "a-00-01.pkz") + + fn = tmp_path / "a-20421015-234500-234600-04-01.nc" + fn_info = 
{"start_time": datetime(2042, 10, 15, 23, 45), + "end_time": datetime(2042, 10, 15, 23, 46), "rc": 4, "segment": 1} + ft_info = ft_info_simple.copy() + ft_info["file_patterns"] = ["a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{rc:>02d}-{segment:>02d}.nc"] + ft_info["time_tags"] = ["start_time", "end_time"] + + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "salina"}, + "file_types": { + "m9g": ft_info}}, preload=True) + fh = _get_fake_handler(fn, os.fspath(fn)) + + with unittest.mock.patch("appdirs.user_cache_dir") as au: + au.return_value = os.fspath(tmp_path / "cache") + cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) + assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / + "BaseFileHandler" / "a-010101000000-010101000000-00-01.pkz") From c2f23cd4893d2ea4e904497baca9df26044efa47 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 20 Dec 2023 11:23:27 +0100 Subject: [PATCH 17/74] Improve tests for yaml reader and netcdf utils Improve the tests for the YAML reader and the NetCDF utils. Tolerate absence of time_tags and variable_tags for preloadables. Verify presence of required_netcdf_filehandlers on creation rather than on caching time. 
--- satpy/readers/netcdf_utils.py | 9 +- satpy/readers/yaml_reader.py | 6 +- satpy/tests/reader_tests/test_netcdf_utils.py | 16 ++-- satpy/tests/test_yaml_reader.py | 86 ++++++++++++++----- 4 files changed, 82 insertions(+), 35 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 4947457be6..89721e7247 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -113,7 +113,7 @@ def __init__(self, filename, filename_info, filetype_info, self._set_xarray_kwargs(xarray_kwargs, auto_maskandscale) listed_variables = filetype_info.get("required_netcdf_variables") - if listed_variables: + if listed_variables is not None: self._collect_listed_variables(file_handle, listed_variables) else: self.collect_metadata("", file_handle) @@ -488,6 +488,9 @@ def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): f"{type(rc_cache)!s}") self.rc_cache = rc_cache super().__init__(*args, **kwargs) + if "required_netcdf_variables" not in self.filetype_info: + raise ValueError("For preloadable filehandlers, " + "required_netcdf_variables is mandatory.") def _collect_listed_variables(self, file_handle, listed_variables): """Collect listed variables, either preloaded or regular.""" @@ -550,15 +553,13 @@ def _collect_variable_delayed(self, subst_name): def _get_file_handle(self): if self.preload: - return None + return open("/dev/null", "r") return super()._get_file_handle() def store_cache(self, filename=None): """Store RC-cachable data to cache.""" if self.preload: raise ValueError("Cannot store cache with pre-loaded handler") - if "required_netcdf_variables" not in self.filetype_info: - raise ValueError("Caching needs required_netcdf_variables, but none is defined.") to_store = {} for key in self.file_content.keys(): if self._can_get_from_other_rc(key): diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 91c7ed9f52..f4a6584773 100644 --- a/satpy/readers/yaml_reader.py +++ 
b/satpy/readers/yaml_reader.py @@ -1327,6 +1327,8 @@ def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=No if self.preload is True, also for predicted files that don't exist, as a glob pattern. """ + if self.preload and "requires" in filetype_info: + raise ValueError("Unable to preload with required types") if fh_kwargs is None: fh_kwargs = {} i = -1 @@ -1409,9 +1411,9 @@ def _get_cache_filename(self, filename, filename_info, fh): pat = self._select_pattern(filename) p = Parser(pat) new_info = filename_info.copy() - for tm in fh.filetype_info["time_tags"]: + for tm in fh.filetype_info.get("time_tags", []): new_info[tm] = datetime.datetime.min - for ct in fh.filetype_info["variable_tags"]: + for ct in fh.filetype_info.get("variable_tags", []): new_info[ct] = 0 name = p.compose(new_info) pt = pathlib.Path(name) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index e48e9b4e7f..c9eef6fe17 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -510,15 +510,17 @@ def test_incorrect_usage(self, fake_config, preloadable_false, cache_file, fph.store_cache(tmp_path / "nowhere-special.pkl") fc2 = fake_config.copy() del fc2["required_netcdf_variables"] - fph = FakePreloadableHandler("dummy", {}, fc2, preload=False, - rc_cache=cache_file) - with pytest.raises(ValueError, match="Caching needs required_netcdf_variables"): - fph.store_cache(tmp_path / "nowhere-special.pkl") + with pytest.raises(ValueError, + match="For preloadable filehandlers, " + "required_netcdf_variables is mandatory"): + fph = FakePreloadableHandler("dummy", {}, fc2, preload=False, + rc_cache=cache_file) - def test_get_file_handle(self, preloadable_true, preloadable_false): + def test_get_file_handle(self, preloadable_true, preloadable_false, + tmp_path): """Test getting the file handle.""" - assert preloadable_true._get_file_handle() is None - assert 
preloadable_false._get_file_handle() is not None + preloadable_true._get_file_handle() + preloadable_false._get_file_handle() def test_collect_vars(self, tmp_path, fake_config, preloadable_false): """Test collecting variables is delegated.""" diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 67bf0c6d3c..a994fa7505 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -18,6 +18,7 @@ """Testing the yaml_reader module.""" import os +import pickle import random import unittest from datetime import datetime @@ -1503,7 +1504,7 @@ def test_get_empty_segment_with_height(self): _fake_filetype_info = { - "file_reader": BaseFileHandler, + "file_reader": DummyReader, "file_patterns": [ "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc", "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc"], @@ -1581,26 +1582,67 @@ def test_select_pattern(fake_gsyreader): def test_preloaded_instances(tmp_path, fake_gsyreader): """That that preloaded instances are generated.""" - g = fake_gsyreader._new_filehandler_instances( - _fake_filetype_info, - [("M9G-a-21000101053000-21000101053100-0001.nc", - {"platform": "M9G", - "start_time": datetime(2100, 1, 1, 5, 30, ), - "end_time": datetime(2100, 1, 1, 5, 31, ), - "segment": 1})]) - total = list(g) - assert len(total) == 5 + from satpy.readers.netcdf_utils import NetCDF4FileHandler, Preloadable + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): + pass + + ft_info = { + "file_reader": DummyPreloadableHandler, + "file_patterns": [ + "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc", + "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc"], + "expected_segments": 5, + "time_tags": ["start_time", "end_time"], + "segment_tag": "segment", + "required_netcdf_variables": []} - ffi2 = 
_fake_filetype_info.copy() + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "alicudi"}, + "file_types": { + "m9g": ft_info}}, preload=True) + + nm = tmp_path / "M9G-a-21000101053000-21000101053100-0001.nc" + nm.parent.mkdir(exist_ok=True, parents=True) + ds = xr.Dataset() + ds.to_netcdf(nm) + + with unittest.mock.patch("appdirs.user_cache_dir") as au: + au.return_value = os.fspath(tmp_path / "cache") + for i in range(2, 6): # disk cache except for nr. 1 + fn = (tmp_path / "cache" / "satpy" / "preloadable" / + "DummyPreloadableHandler" / + f"M9G-a-10101000000-10101000000-{i:>04d}.pkl") + fn.parent.mkdir(exist_ok=True, parents=True) + with fn.open(mode="wb") as fp: + pickle.dump({}, fp) + + g = gsyr._new_filehandler_instances( + ft_info, + [(os.fspath(nm), + {"platform": "M9G", + "start_time": datetime(2100, 1, 1, 5, 30, ), + "end_time": datetime(2100, 1, 1, 5, 31, ), + "segment": 1})]) + total = list(g) + assert len(total) == 5 + + ffi2 = ft_info.copy() ffi2["requires"] = ["pergola"] - g = fake_gsyreader._new_filehandler_instances( + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "alicudi"}, + "file_types": { + "m9g": ffi2}}, preload=True) + g = gsyr._new_filehandler_instances( ffi2, - [("M9G-a-21000101053000-21000101053100-0001.nc", + [(os.fspath(nm), {"platform": "M9G", "start_time": datetime(2100, 1, 1, 5, 30, ), "end_time": datetime(2100, 1, 1, 5, 31, ), "segment": 1})]) - with pytest.raises(ValueError, match="Failed to create"): + with pytest.raises(ValueError, match="Unable to preload"): list(g) g = fake_gsyreader._new_preloaded_filehandler_instances( @@ -1631,13 +1673,13 @@ def test_get_cache_filename(tmp_path): "name": "filicudi"}, "file_types": { "m9g": ft_info_simple}}, preload=True) - fh = _get_fake_handler(fn, os.fspath(fn)) + fh = BaseFileHandler(fn, fn_info, ft_info_simple) with unittest.mock.patch("appdirs.user_cache_dir") as au: au.return_value = os.fspath(tmp_path / "cache") cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) 
assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / - "BaseFileHandler" / "a-01.pkz") + "BaseFileHandler" / "a-01.pkl") fn = tmp_path / "a-04-01.nc" fn_info = {"rc": 4, "segment": 1} @@ -1649,19 +1691,19 @@ def test_get_cache_filename(tmp_path): "name": "salina"}, "file_types": { "m9g": ft_info}}, preload=True) - fh = _get_fake_handler(fn, os.fspath(fn)) + fh = BaseFileHandler(fn, fn_info, ft_info) with unittest.mock.patch("appdirs.user_cache_dir") as au: au.return_value = os.fspath(tmp_path / "cache") cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / - "BaseFileHandler" / "a-00-01.pkz") + "BaseFileHandler" / "a-04-01.pkl") - fn = tmp_path / "a-20421015-234500-234600-04-01.nc" + fn = tmp_path / "a-20421015234500-234600-04-01.nc" fn_info = {"start_time": datetime(2042, 10, 15, 23, 45), "end_time": datetime(2042, 10, 15, 23, 46), "rc": 4, "segment": 1} ft_info = ft_info_simple.copy() - ft_info["file_patterns"] = ["a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{rc:>02d}-{segment:>02d}.nc"] + ft_info["file_patterns"] = ["a-{start_time:%Y%m%d%H%M%S}-{end_time:%H%M%S}-{rc:>02d}-{segment:>02d}.nc"] ft_info["time_tags"] = ["start_time", "end_time"] gsyr = GEOSegmentYAMLReader( @@ -1669,10 +1711,10 @@ def test_get_cache_filename(tmp_path): "name": "salina"}, "file_types": { "m9g": ft_info}}, preload=True) - fh = _get_fake_handler(fn, os.fspath(fn)) + fh = BaseFileHandler(fn, fn_info, ft_info) with unittest.mock.patch("appdirs.user_cache_dir") as au: au.return_value = os.fspath(tmp_path / "cache") cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / - "BaseFileHandler" / "a-010101000000-010101000000-00-01.pkz") + "BaseFileHandler" / "a-10101000000-000000-04-01.pkl") From 3093aff79b304f44e39a09e934f73a6c0259b93b Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 20 Dec 2023 12:44:16 +0100 Subject: 
[PATCH 18/74] Use max rather than min date for mocking Using datetime.min for an artificial date leads to different strftime results between platforms (see https://bugs.python.org/msg307401). Use datetime.max instead. --- satpy/readers/yaml_reader.py | 2 +- satpy/tests/test_yaml_reader.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index f4a6584773..dd69ec2203 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1412,7 +1412,7 @@ def _get_cache_filename(self, filename, filename_info, fh): p = Parser(pat) new_info = filename_info.copy() for tm in fh.filetype_info.get("time_tags", []): - new_info[tm] = datetime.datetime.min + new_info[tm] = datetime.datetime.max for ct in fh.filetype_info.get("variable_tags", []): new_info[ct] = 0 name = p.compose(new_info) diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index a994fa7505..c1c97af279 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1613,7 +1613,7 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): for i in range(2, 6): # disk cache except for nr. 
1 fn = (tmp_path / "cache" / "satpy" / "preloadable" / "DummyPreloadableHandler" / - f"M9G-a-10101000000-10101000000-{i:>04d}.pkl") + f"M9G-a-99991231235959-99991231235959-{i:>04d}.pkl") fn.parent.mkdir(exist_ok=True, parents=True) with fn.open(mode="wb") as fp: pickle.dump({}, fp) @@ -1717,4 +1717,4 @@ def test_get_cache_filename(tmp_path): au.return_value = os.fspath(tmp_path / "cache") cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / - "BaseFileHandler" / "a-10101000000-000000-04-01.pkl") + "BaseFileHandler" / "a-99991231235959-235959-04-01.pkl") From 4ffe742f2b68307498784f25cd34ee8fc7e171fb Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 20 Dec 2023 13:22:55 +0100 Subject: [PATCH 19/74] Simplify method and repair windows support Simplify _new_filehandler_instances. Hopefully this will satisfy codescene. Replace "/dev/null" by os.devnull for cross-platform support. --- satpy/readers/netcdf_utils.py | 2 +- satpy/readers/yaml_reader.py | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 89721e7247..57046b79b8 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -553,7 +553,7 @@ def _collect_variable_delayed(self, subst_name): def _get_file_handle(self): if self.preload: - return open("/dev/null", "r") + return open(os.devnull, "r") return super()._get_file_handle() def store_cache(self, filename=None): diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index dd69ec2203..0a6cd64a23 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1327,22 +1327,23 @@ def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=No if self.preload is True, also for predicted files that don't exist, as a glob pattern. 
""" - if self.preload and "requires" in filetype_info: - raise ValueError("Unable to preload with required types") if fh_kwargs is None: fh_kwargs = {} - i = -1 - # on first call, don't pass preload fh_kwargs_without_preload = fh_kwargs.copy() fh_kwargs_without_preload.pop("preload", None) - for (i, fh) in enumerate(super()._new_filehandler_instances( - filetype_info, filename_items, - fh_kwargs=fh_kwargs_without_preload)): - yield fh - if self.preload and i >= 0: - if i < fh.filetype_info["expected_segments"]: - yield from self._new_preloaded_filehandler_instances( - filetype_info, fh) + fh_it = super()._new_filehandler_instances(filetype_info, + filename_items, + fh_kwargs=fh_kwargs_without_preload) + if not self.preload: + yield from fh_it + return + if "requires" in filetype_info: + raise ValueError("Unable to preload with required types") + fh_first = next(fh_it) + yield fh_first + yield from fh_it + yield from self._new_preloaded_filehandler_instances( + filetype_info, fh_first) def _new_preloaded_filehandler_instances(self, filetype_info, fh): """Get filehandler instances for non-existing files. From 558ff3a91a24065cedfa68b351da27f7118c11e4 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 20 Dec 2023 15:19:12 +0100 Subject: [PATCH 20/74] Wait longer in test In test waiting for file to appear, wait longer. Maybe this will fix the "file not found" on 3.11 problem. 
--- satpy/tests/reader_tests/test_netcdf_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index c9eef6fe17..45e02dd055 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -333,10 +333,10 @@ def test_wait_for_file_appears(tmp_path): """Test case where file appears after a bit.""" from satpy.readers.netcdf_utils import _wait_for_file def _wait_and_create(path): - time.sleep(0.1) + time.sleep(0.08) path.touch() fn = tmp_path / "file2" - waiter = _wait_for_file(os.fspath(fn), max_tries=3, wait=0.05) + waiter = _wait_for_file(os.fspath(fn), max_tries=8, wait=0.07) with concurrent.futures.ThreadPoolExecutor() as executor: executor.submit(_wait_and_create, fn) t1 = time.time() From 690eedf119b9dc966e081ef69634c8bdc832641a Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 20 Dec 2023 15:22:36 +0100 Subject: [PATCH 21/74] Control max tries with kw arguments When waiting for a file to appear, control max tries and wait between tries with keyword arguments to the reader. 
--- satpy/readers/netcdf_utils.py | 10 +++++++--- satpy/readers/yaml_reader.py | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 57046b79b8..4dac6eeb1f 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -473,9 +473,12 @@ class Preloadable: ..versionadded: 0.47 """ - def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): + def __init__(self, *args, preload=False, preload_step=2, preload_tries=300, + ref_fh=None, rc_cache=None, **kwargs): """Store attributes needed for preloading to work.""" self.preload = preload + self.preload_tries = preload_tries + self.preload_step = preload_step if preload: if not isinstance(ref_fh, BaseFileHandler): raise TypeError( @@ -537,7 +540,8 @@ def _can_get_from_other_rc(self, itm): def _collect_variable_delayed(self, subst_name): md = self.ref_fh[subst_name] # some metadata from reference segment - fn_matched = _wait_for_file(self.filename) + fn_matched = _wait_for_file(self.filename, self.preload_tries, + self.preload_step) dade = _get_delayed_value_from_nc(fn_matched, subst_name) array = da.from_delayed( dade, @@ -592,7 +596,7 @@ def _wait_for_file(fn, max_tries=300, wait=2): @dask.delayed -def _get_delayed_value_from_nc(fn, var, max_tries=300, wait=2, auto_maskandscale=False): +def _get_delayed_value_from_nc(fn, var, auto_maskandscale=False): with netCDF4.Dataset(fn, "r") as nc: if hasattr(nc, "set_auto_maskandscale"): nc.set_auto_maskandscale(auto_maskandscale) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 0a6cd64a23..0f3a8eab75 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1162,8 +1162,10 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): a scene based on all expected segments with dask delayed objects. 
When computing the dask graphs, satpy will process each segment as soon as it comes in, strongly reducing the timeliness for processing a full disc - image in near real time. This feature is experimental. Use at your own - risk. + image in near real time. Checking for new files can be controlled + with the argumentst ``preload_step`` (time in seconds) and + ``preload_tries`` (how many tries before giving up). + This feature is experimental. Use at your own risk. .. versionadded: 0.47 """ From 31815dcc6861379ca0b4ab183a21dbfbca7bc635 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 20 Dec 2023 16:49:29 +0100 Subject: [PATCH 22/74] Added test for create_preloadable_cache Added a test for create_preloadable_cache. --- satpy/tests/test_readers.py | 35 +++++++++++++++++++++++++++++++++ satpy/tests/test_yaml_reader.py | 2 +- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/satpy/tests/test_readers.py b/satpy/tests/test_readers.py index d91e2b6fed..e1b0c7fb14 100644 --- a/satpy/tests/test_readers.py +++ b/satpy/tests/test_readers.py @@ -19,13 +19,16 @@ import builtins import os +import pickle import sys import unittest import warnings from contextlib import suppress from unittest import mock +import dask.array as da import pytest +import xarray as xr from satpy.dataset.data_dict import get_key from satpy.dataset.dataid import DataID, ModifierTuple, WavelengthRange @@ -1088,3 +1091,35 @@ def test_hash(self): assert len({hash(FSFile(fn, fs)) for fn in {self.local_filename, self.local_filename2} for fs in [None, lfs, zfs, cfs]}) == 2*4 + + +def test_create_preloadable_cache(tmp_path): + """Test utility function creating a test for preloading.""" + from satpy.readers import create_preloadable_cache + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + from satpy.tests.reader_tests.test_netcdf_utils import FakePreloadableHandler + fake_config = {"reader": { + "name": "tartupaluk"}, + "file_types": { + "m9g": { + "file_reader": FakePreloadableHandler, 
+ "file_patterns": ["a-{segment:d}.nc"], + "expected_segments": 3, + "required_netcdf_variables": {"/iceland/reykjavík": ["rc"]}}}} + dph = FakePreloadableHandler( + os.fspath(tmp_path / "a-0.nc"), + {"segment": 0}, fake_config["file_types"]["m9g"], + preload=False, rc_cache=tmp_path / "test.pkl") + dph.file_content["/iceland/reykjavík"] = xr.DataArray(da.from_array([[0, 1, 2]])) + gsyr = GEOSegmentYAMLReader(fake_config, preload=True) + gsyr.file_handlers["handler"] = [dph] + + with unittest.mock.patch("satpy.readers.load_readers") as srl: + with unittest.mock.patch("appdirs.user_cache_dir") as au: + au.return_value = os.fspath(tmp_path / "cache") + srl.return_value = {"tartupaluk": gsyr} + create_preloadable_cache("tartupaluk", [tmp_path / "a-0.nc"]) + with (tmp_path / "cache" / "satpy" / "preloadable" / + "FakePreloadableHandler" / "a-0.pkl").open(mode="rb") as fp: + data = pickle.load(fp) + assert data.keys() diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index c1c97af279..8302444aba 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1595,7 +1595,7 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): "expected_segments": 5, "time_tags": ["start_time", "end_time"], "segment_tag": "segment", - "required_netcdf_variables": []} + "required_netcdf_variables": {}} gsyr = GEOSegmentYAMLReader( {"reader": { From fe2e9922bd42f40a61b8f41b71cdf1ec27b97829 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 20 Dec 2023 17:25:56 +0100 Subject: [PATCH 23/74] Add documentation on pre-loading/eager processing --- doc/source/reading.rst | 47 +++++++++++++++++++++++++++++++++++ satpy/readers/netcdf_utils.py | 2 +- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/source/reading.rst b/doc/source/reading.rst index b7264eeb6e..f137e22e0a 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -286,6 +286,53 @@ are automatically added. 
Other possible coordinates you may see: * ``acq_time``: Instrument data acquisition time per scan or row of data. +Reading data before they're available +===================================== + +Normally, data need to exist before they can be read. This requirement +impacts data processing timeliness. For data arriving in segments, +Satpy can process each segment immediately as it comes in. +This feature currently only works with the :mod:`~satpy.readers.fci_l1c_nc` reader. +This is experimental and likely to be instable and might change. + +Consider a near real time data reception situation where FCI segments are +delivered one by one. Clasically, to produce full disc imagery, +users would wait for all needed segments to arrive, before they +start processing any data by passing all segments to the :class:`~satpy.Scene`. +For a more timely imagery production, users can create the Scene, load the data, resample, and even call :method:`~satpy.Scene.save_datasets` (as long as ``compute=False``) before there the data are complete. +When they then trigger the computation, much of the overhead in Satpy internals has already been completed, and Satpy will process each segment as it comes in. + +To do so, Satpy caches a selection data and metadata between segments +and between repeat cycles. Caching between segments happens in-memory +and needs no farther preparation from the user, but where data are cached +between repeat cycles, the user needs to create this cache first:: + + >>> from satpy.readers import create_preloadable_cache + >>> create_preloadable_cache("fci_l1c_nc", fci_files) + +For one full disc of FDHSI or HRFI data. This needs to be done only once, or +possibly again if there is an unexpected change in the data that are cached +between repeat cycles (as defined in the reader YAML file). +To make use of eager processing, the Scene object should be called passing +``preload=True``, passing _only_ the path to the first segment:: + + >>> sc = Scene( + ... 
filenames=[path_to_first_segment], + ... reader="fci_l1c_nc", + ... reader_kwargs={"preload": True}) + +Satpy will figure out the names of the remaining segments and find them as +they come in. If the data are already available, processing is similar to +the regular case. If the data are not yet available, Satpy will wait during +the computation of the dask graphs until data become available. + +For more technical background reading including hints +on how this could be extended to other readers, see +:class:`~satpy.readers.netcdf_utils.Preloadable` and +:class:`~satpy.readers.yaml_reader.GEOSegmentYAMLReader`. + +.. versionadded: 0.47 + Adding a Reader to Satpy ======================== diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 4dac6eeb1f..6c59f1b4f9 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -470,7 +470,7 @@ class Preloadable: This feature is experimental. - ..versionadded: 0.47 + .. versionadded: 0.47 """ def __init__(self, *args, preload=False, preload_step=2, preload_tries=300, From d5365f153ec7626f748c7a36334677808fafd3d7 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 21 Dec 2023 09:52:15 +0100 Subject: [PATCH 24/74] Small doc fixes Small fixes in the sphinx syntax --- doc/source/reading.rst | 22 +++++++++++++--------- satpy/readers/netcdf_utils.py | 4 ++-- satpy/readers/yaml_reader.py | 26 ++++++++++++++------------ 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/doc/source/reading.rst b/doc/source/reading.rst index f137e22e0a..d76f899706 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -296,15 +296,19 @@ This feature currently only works with the :mod:`~satpy.readers.fci_l1c_nc` read This is experimental and likely to be instable and might change. Consider a near real time data reception situation where FCI segments are -delivered one by one. 
Clasically, to produce full disc imagery, -users would wait for all needed segments to arrive, before they -start processing any data by passing all segments to the :class:`~satpy.Scene`. -For a more timely imagery production, users can create the Scene, load the data, resample, and even call :method:`~satpy.Scene.save_datasets` (as long as ``compute=False``) before there the data are complete. -When they then trigger the computation, much of the overhead in Satpy internals has already been completed, and Satpy will process each segment as it comes in. - -To do so, Satpy caches a selection data and metadata between segments +delivered one by one. Classically, to produce full disc imagery, users +would wait for all needed segments to arrive, before they start processing +any data by passing all segments to the :class:`~satpy.scene.Scene`. +For a more timely imagery production, users can create the Scene, load the +data, resample, and even call :meth:`~satpy.scene.Scene.save_datasets` +(as long as ``compute=False``) before there the data are complete. +When they then trigger the computation, much of the overhead in Satpy +internals has already been completed, and Satpy will process each segment +as it comes in. + +To do so, Satpy caches a selection of data and metadata between segments and between repeat cycles. Caching between segments happens in-memory -and needs no farther preparation from the user, but where data are cached +and needs no preparation from the user, but where data are cached between repeat cycles, the user needs to create this cache first:: >>> from satpy.readers import create_preloadable_cache @@ -331,7 +335,7 @@ on how this could be extended to other readers, see :class:`~satpy.readers.netcdf_utils.Preloadable` and :class:`~satpy.readers.yaml_reader.GEOSegmentYAMLReader`. -.. versionadded: 0.47 +.. 
versionadded:: 0.47 Adding a Reader to Satpy ======================== diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 6c59f1b4f9..812d4f88eb 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -454,7 +454,7 @@ class Preloadable: To implement a filehandler using this, make sure it derives from both :class:`NetCDF4FileHandler` and this class, and make sure to pass keyword - arguments in `__init__` to the superclass. In the YAML file, + arguments in ``__init__`` to the superclass. In the YAML file, ``required_netcdf_variables`` must be defined as a dictionary. The keys are variable names, and values are a list of strings describing what assumptions can be made about caching, where ``"rc"`` means the value is @@ -470,7 +470,7 @@ class Preloadable: This feature is experimental. - .. versionadded: 0.47 + .. versionadded:: 0.47 """ def __init__(self, *args, preload=False, preload_step=2, preload_tries=300, diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 0f3a8eab75..84d098c5ff 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1144,8 +1144,8 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): This reader pads the data to full geostationary disk if necessary. This reader uses an optional ``pad_data`` keyword argument that can be - passed to :meth:`Scene.load` to control if padding is done (True by - default). Passing `pad_data=False` will return data unpadded. + passed to :meth:`~satpy.scene.Scene.load` to control if padding is done (True by + default). Passing ``pad_data=False`` will return data unpadded. When using this class in a reader's YAML configuration, segmented file types (files that may have multiple segments) should specify an extra @@ -1156,18 +1156,20 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): will default to 1 segment. 
This reader uses an optional ``preload`` keyword argument that can be - passed to :meth:`Scene.__init__`. This argument is intended for near real - time processing. When only one segment has arrived, the user can create a - scene using this one segment and ``preload=True``, and Satpy will populate - a scene based on all expected segments with dask delayed objects. When - computing the dask graphs, satpy will process each segment as soon as it - comes in, strongly reducing the timeliness for processing a full disc - image in near real time. Checking for new files can be controlled - with the argumentst ``preload_step`` (time in seconds) and - ``preload_tries`` (how many tries before giving up). + passed to the :meth:`~satpy.scene.Scene` class upon instantiation. + This argument is intended for near real time processing. When only + one segment has arrived, the user can create a scene using this one + segment and ``preload=True``, and Satpy will populate a scene based on + all expected segments with dask delayed objects. When computing the + dask graphs, satpy will process each segment as soon as it comes in, + strongly reducing the timeliness for processing a full disc image in + near real time. Checking for new files can be controlled with the + arguments ``preload_step`` (time in seconds) and ``preload_tries`` + (how many tries before giving up). + This feature is experimental. Use at your own risk. - .. versionadded: 0.47 + .. 
versionadded:: 0.47 """ def __init__(self, *args, **kwargs): From 8de6a4d55dd8ada201857bf39f70b0c38f9c686b Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 21 Dec 2023 11:46:41 +0100 Subject: [PATCH 25/74] Improve tests further --- satpy/tests/test_yaml_reader.py | 50 +++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 8302444aba..3584a6f431 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1590,43 +1590,53 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): ft_info = { "file_reader": DummyPreloadableHandler, "file_patterns": [ - "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc", - "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>04d}.nc"], - "expected_segments": 5, + "{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc", + "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc"], + "expected_segments": 3, "time_tags": ["start_time", "end_time"], "segment_tag": "segment", - "required_netcdf_variables": {}} + "required_netcdf_variables": { + "panarea": ["segment"], + "strómboli": ["rc"], + "salina": []}} gsyr = GEOSegmentYAMLReader( {"reader": { - "name": "alicudi"}, + "name": "island-reader"}, "file_types": { "m9g": ft_info}}, preload=True) - nm = tmp_path / "M9G-a-21000101053000-21000101053100-0001.nc" + # create a dummy file + + nm = tmp_path / "M9G-a-21000101053000-21000101053100-01.nc" nm.parent.mkdir(exist_ok=True, parents=True) ds = xr.Dataset() + ds["panarea"] = xr.DataArray(np.array([[0, 1, 2]]), dims=["y", "x"]) + ds["strómboli"] = xr.DataArray(np.array([[1, 1, 2]]), dims=["y", "x"]) + ds["salina"] = xr.DataArray(np.array([[2, 1, 2]]), dims=["y", "x"]) ds.to_netcdf(nm) with unittest.mock.patch("appdirs.user_cache_dir") as au: au.return_value = os.fspath(tmp_path / "cache") 
- for i in range(2, 6): # disk cache except for nr. 1 + # prepare cache files + for i in range(2, 4): # disk cache except for nr. 1 fn = (tmp_path / "cache" / "satpy" / "preloadable" / "DummyPreloadableHandler" / - f"M9G-a-99991231235959-99991231235959-{i:>04d}.pkl") + f"M9G-a-99991231235959-99991231235959-{i:>02d}.pkl") fn.parent.mkdir(exist_ok=True, parents=True) with fn.open(mode="wb") as fp: - pickle.dump({}, fp) - - g = gsyr._new_filehandler_instances( - ft_info, - [(os.fspath(nm), - {"platform": "M9G", - "start_time": datetime(2100, 1, 1, 5, 30, ), - "end_time": datetime(2100, 1, 1, 5, 31, ), - "segment": 1})]) + pickle.dump( + {"strómboli": ds["strómboli"]}, fp) + + g = gsyr.create_filehandlers([os.fspath(nm)]) +# ft_info, +# [(os.fspath(nm), +# {"platform": "M9G", +# "start_time": datetime(2100, 1, 1, 5, 30, ), +# "end_time": datetime(2100, 1, 1, 5, 31, ), +# "segment": 1})]) total = list(g) - assert len(total) == 5 + assert len(total) == 3 ffi2 = ft_info.copy() ffi2["requires"] = ["pergola"] @@ -1718,3 +1728,7 @@ def test_get_cache_filename(tmp_path): cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / "BaseFileHandler" / "a-99991231235959-235959-04-01.pkl") + + +def test_preloaded_instances_files_absent(tmp_path): + """Test preloaded instances when subsequent files are absent.""" From fd1f7cf2856f02e769cc19e49d354a81cd70e6c8 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 21 Dec 2023 12:42:00 +0100 Subject: [PATCH 26/74] Expanded preloading test Added preloading test in case of multiple filetypes, which reproduces a bug I introduced. 
--- satpy/tests/test_yaml_reader.py | 46 +++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 3584a6f431..477638d0d2 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -18,7 +18,6 @@ """Testing the yaml_reader module.""" import os -import pickle import random import unittest from datetime import datetime @@ -1596,15 +1595,29 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): "time_tags": ["start_time", "end_time"], "segment_tag": "segment", "required_netcdf_variables": { - "panarea": ["segment"], - "strómboli": ["rc"], - "salina": []}} + "grp/panarea": ["segment"], # put in group to avoid https://github.com/pytroll/satpy/issues/2704 + "grp/strómboli": ["rc"], + "grp/salina": []}} + + ft_info_2 = { + "file_reader": DummyPreloadableHandler, + "file_patterns": [ + "{platform}-c-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc", + "{platform}-d-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc"], + "expected_segments": 3, + "time_tags": ["start_time", "end_time"], + "segment_tag": "segment", + "required_netcdf_variables": { + "grp/panarea": ["segment"], + "grp/strómboli": ["rc"], + "grp/salina": []}} gsyr = GEOSegmentYAMLReader( {"reader": { "name": "island-reader"}, "file_types": { - "m9g": ft_info}}, preload=True) + "m9g": ft_info, + "mag": ft_info_2}}, preload=True) # create a dummy file @@ -1614,29 +1627,24 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): ds["panarea"] = xr.DataArray(np.array([[0, 1, 2]]), dims=["y", "x"]) ds["strómboli"] = xr.DataArray(np.array([[1, 1, 2]]), dims=["y", "x"]) ds["salina"] = xr.DataArray(np.array([[2, 1, 2]]), dims=["y", "x"]) - ds.to_netcdf(nm) + ds.to_netcdf(nm, group="/grp") + + fn_info = {"platform": "M9a", "start_time": datetime(2100, 1, 1, 5, 30), + "end_time": datetime(2100, 1, 1, 5, 31), "segment": 1} with 
unittest.mock.patch("appdirs.user_cache_dir") as au: au.return_value = os.fspath(tmp_path / "cache") # prepare cache files + dph = DummyPreloadableHandler(os.fspath(nm), fn_info, ft_info) for i in range(2, 4): # disk cache except for nr. 1 fn = (tmp_path / "cache" / "satpy" / "preloadable" / "DummyPreloadableHandler" / f"M9G-a-99991231235959-99991231235959-{i:>02d}.pkl") fn.parent.mkdir(exist_ok=True, parents=True) - with fn.open(mode="wb") as fp: - pickle.dump( - {"strómboli": ds["strómboli"]}, fp) - - g = gsyr.create_filehandlers([os.fspath(nm)]) -# ft_info, -# [(os.fspath(nm), -# {"platform": "M9G", -# "start_time": datetime(2100, 1, 1, 5, 30, ), -# "end_time": datetime(2100, 1, 1, 5, 31, ), -# "segment": 1})]) - total = list(g) - assert len(total) == 3 + dph.store_cache(os.fspath(fn)) + + fhs = gsyr.create_filehandlers([os.fspath(nm)]) + assert len(fhs["m9g"]) == 3 ffi2 = ft_info.copy() ffi2["requires"] = ["pergola"] From 1ef18ee8a7a6b6dcc4c75a4f4a67c79847b75b43 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 21 Dec 2023 14:26:35 +0100 Subject: [PATCH 27/74] Guard against unmatching files Guard against non-matching files, such as is common with multilpe filetypes with different file patterns. 
--- satpy/readers/yaml_reader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 84d098c5ff..045e4635f5 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1343,7 +1343,10 @@ def _new_filehandler_instances(self, filetype_info, filename_items, fh_kwargs=No return if "requires" in filetype_info: raise ValueError("Unable to preload with required types") - fh_first = next(fh_it) + try: + fh_first = next(fh_it) + except StopIteration: + return yield fh_first yield from fh_it yield from self._new_preloaded_filehandler_instances( From 5808c9fccae03d4e32bff0eb103d9ed835f167de Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 21 Dec 2023 15:15:26 +0100 Subject: [PATCH 28/74] Use different time sentinel When producing the glob pattern, use a different time sentinel. Use 235959 rather than 000000, as the former is less likely to appear accidentally in a filename. --- satpy/readers/netcdf_utils.py | 1 + satpy/readers/yaml_reader.py | 4 ++-- satpy/tests/test_yaml_reader.py | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 812d4f88eb..185f12b8bf 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -590,6 +590,7 @@ def _wait_for_file(fn, max_tries=300, wait=2): continue elif len(fns) > 1: raise ValueError(f"Expected one matching file, found {len(fns):d}") + LOG.debug(f"Found {fn!s}!") return fns[0] else: raise TimeoutError("File failed to materialise") diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 045e4635f5..0f9c379245 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1409,7 +1409,7 @@ def _predict_filename(self, fh, i): new_info = _predict_filename_info(fh, i) new_filename = p.compose(new_info) basedir = pathlib.Path(fh.filename).parent - new_filename = 
new_filename.replace("000000", "??????") + new_filename = new_filename.replace("235959", "??????") new_filename = os.fspath(basedir / new_filename) return (new_filename, new_info) @@ -1444,7 +1444,7 @@ def _predict_filename_info(fh, i): new_info[fh.filetype_info["segment_tag"]] = i for tm in fh.filetype_info["time_tags"]: new_info[tm] = new_info[tm].replace( - hour=0, minute=0, second=0) + hour=23, minute=59, second=59) return new_info diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 477638d0d2..556f363a74 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -20,7 +20,7 @@ import os import random import unittest -from datetime import datetime +from datetime import datetime, timedelta from tempfile import mkdtemp from unittest.mock import MagicMock, call, patch @@ -1536,8 +1536,8 @@ def test_predict_filename_info(tmp_path): info = _predict_filename_info(fh, 3) assert info == { "platform": "M9G", - "start_time": datetime(2100, 1, 1, 0, 0, ), - "end_time": datetime(2100, 1, 1, 0, 0, ), + "start_time": datetime(2100, 1, 1, 23, 59, 59), + "end_time": datetime(2100, 1, 1, 23, 59, 59), "segment": 3} @@ -1565,6 +1565,18 @@ def test_predict_filename(tmp_path, fake_gsyreader): newname = fake_gsyreader._predict_filename(fh, 4) assert newname[0] == os.fspath(tmp_path / "M9G-b-21000101??????-21000101??????-0004.nc") + st = datetime(2023, 12, 20, 15, 8, 49) + et = st + timedelta(minutes=5) + fn = f"M9G-b-{st:%Y%m%d%H%M%S}-{et:%Y%m%d%H%M%S}-0001.nc" + pt = "M9G-b-20231220??????-20231220??????-0004.nc" + fake_filename_info = {"platform": "M9G", "start_time": st, "end_time": et, "segment": 1} + fakehandler = BaseFileHandler( + os.fspath(tmp_path / fn), + fake_filename_info, + _fake_filetype_info) + newname = fake_gsyreader._predict_filename(fakehandler, 4) + assert newname[0] == os.fspath(tmp_path / pt) + def test_select_pattern(fake_gsyreader): """Test selecting the appropriate pattern.""" From 
e0a38ccc34dee3755bc3da60b4605a52ad4916d7 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 21 Dec 2023 15:34:14 +0100 Subject: [PATCH 29/74] Add more logging --- satpy/readers/netcdf_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 185f12b8bf..6d7b5a54cf 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -586,14 +586,16 @@ def _wait_for_file(fn, max_tries=300, wait=2): for _ in range(max_tries): fns = glob.glob(fn) if len(fns) == 0: + if _ % 60 == 0: + LOG.debug(f"Still waiting for {fn!s}") time.sleep(wait) continue elif len(fns) > 1: raise ValueError(f"Expected one matching file, found {len(fns):d}") - LOG.debug(f"Found {fn!s}!") + LOG.debug(f"Found {fns[0]!s} matching {fn!s}!") return fns[0] else: - raise TimeoutError("File failed to materialise") + raise TimeoutError(f"File matching {fn!s} failed to materialise") @dask.delayed From 68dc9d8bc66549d8fef1ada41f37db0998fdc018 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 21 Dec 2023 17:54:44 +0100 Subject: [PATCH 30/74] Use xarray rather than netCDF4 for opening Use xarray rather than netCDF4 for retrieving the variable name. Maybe this will help to resolve the concurrency problem. 
--- satpy/readers/netcdf_utils.py | 10 ++++++---- satpy/tests/reader_tests/test_netcdf_utils.py | 8 +++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 6d7b5a54cf..c3ee48164c 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -586,7 +586,7 @@ def _wait_for_file(fn, max_tries=300, wait=2): for _ in range(max_tries): fns = glob.glob(fn) if len(fns) == 0: - if _ % 60 == 0: + if _ % 60 == 30: LOG.debug(f"Still waiting for {fn!s}") time.sleep(wait) continue @@ -600,7 +600,9 @@ def _wait_for_file(fn, max_tries=300, wait=2): @dask.delayed def _get_delayed_value_from_nc(fn, var, auto_maskandscale=False): - with netCDF4.Dataset(fn, "r") as nc: - if hasattr(nc, "set_auto_maskandscale"): - nc.set_auto_maskandscale(auto_maskandscale) + if "/" in var: + (grp, var) = var.rsplit("/", maxsplit=1) + else: + (grp, var) = (None, var) + with xr.open_dataset(fn, group=grp, mask_and_scale=auto_maskandscale) as nc: return nc[var][:] diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 45e02dd055..ab9197dc5a 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -387,7 +387,13 @@ def test_get_delayed_from_file(self, tmp_path): ds = xr.Dataset({"rijeka": (("y", "x"), np.zeros((3, 3)))}) ds.to_netcdf(ncname) var_del = _get_delayed_value_from_nc(ncname, "rijeka") - var_del.compute() + np.testing.assert_array_equal(var_del.compute(), np.zeros((3, 3))) + + ncname = tmp_path / "liechtenstein.nc" + ds = xr.Dataset({"vaduz": (("y", "x"), np.ones((2, 2)))}) + ds.to_netcdf(ncname, group="/earth/europe") + var_del = _get_delayed_value_from_nc(ncname, "earth/europe/vaduz") + np.testing.assert_array_equal(var_del.compute(), np.ones((2, 2))) @pytest.fixture() def fake_config(self): From c78d2464971d05ed638ae184fab857701a086497 Mon Sep 17 00:00:00 2001 From: Gerrit Holl 
Date: Thu, 21 Dec 2023 18:25:16 +0100 Subject: [PATCH 31/74] Use h5netcdf backend. --- doc/source/reading.rst | 2 ++ satpy/readers/netcdf_utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/reading.rst b/doc/source/reading.rst index d76f899706..b28c30106d 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -330,6 +330,8 @@ they come in. If the data are already available, processing is similar to the regular case. If the data are not yet available, Satpy will wait during the computation of the dask graphs until data become available. +Note that this uses the ``h5netcdf`` backend for opening NetCDF files. + For more technical background reading including hints on how this could be extended to other readers, see :class:`~satpy.readers.netcdf_utils.Preloadable` and diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index c3ee48164c..db60ff02c1 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -604,5 +604,5 @@ def _get_delayed_value_from_nc(fn, var, auto_maskandscale=False): (grp, var) = var.rsplit("/", maxsplit=1) else: (grp, var) = (None, var) - with xr.open_dataset(fn, group=grp, mask_and_scale=auto_maskandscale) as nc: + with xr.open_dataset(fn, group=grp, mask_and_scale=auto_maskandscale, engine="h5netcdf") as nc: return nc[var][:] From c8d5ba44bf444dcbae38a6898c8c760ee704b5fd Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 22 Dec 2023 10:05:00 +0100 Subject: [PATCH 32/74] Adapt docs and logging Write about limitations in the documentation. Adapt how things are logged. --- doc/source/reading.rst | 12 ++++++++++++ satpy/readers/netcdf_utils.py | 12 +++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/doc/source/reading.rst b/doc/source/reading.rst index b28c30106d..38c8b7f20c 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -330,6 +330,18 @@ they come in. 
If the data are already available, processing is similar to the regular case. If the data are not yet available, Satpy will wait during the computation of the dask graphs until data become available. +Some limitations that may be resolved in the future: + +- Mixing different file types for the same reader is not yet supported. + For FCI, that means it is not yet possible to mix FDHSI and HRFI data. +- Only nominal case +- Dask may not order the processing of the chunks optimally. That means some + dask workers may be waiting for chunks 33–40 as chunks 1–32 are coming in + and are not being processed. +- Currently, Satpy merely checks the existence of a file and not whether it + has been completely written. This may lead to incomplete files being read, + which might lead to failures. + Note that this uses the ``h5netcdf`` backend for opening NetCDF files. For more technical background reading including hints diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index db60ff02c1..44e2ba2011 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -582,17 +582,19 @@ def store_cache(self, filename=None): @dask.delayed def _wait_for_file(fn, max_tries=300, wait=2): """Wait for file to appear.""" - LOG.debug(f"Waiting for {fn!s} to appear.") - for _ in range(max_tries): + for i in range(max_tries): fns = glob.glob(fn) if len(fns) == 0: - if _ % 60 == 30: + if i == 0: # only log if not yet present + LOG.debug(f"Waiting for {fn!s} to appear.") + if i % 60 == 30: LOG.debug(f"Still waiting for {fn!s}") time.sleep(wait) continue - elif len(fns) > 1: + if len(fns) > 1: raise ValueError(f"Expected one matching file, found {len(fns):d}") - LOG.debug(f"Found {fns[0]!s} matching {fn!s}!") + if i > 0: # don't log if found immediately + LOG.debug(f"Found {fns[0]!s} matching {fn!s}!") return fns[0] else: raise TimeoutError(f"File matching {fn!s} failed to materialise") From 795d882a50568ae116157da74769909f3e5edfc2 Mon Sep 17 00:00:00 2001 
From: Gerrit Holl Date: Tue, 13 Feb 2024 18:23:18 +0100 Subject: [PATCH 33/74] Refactor unit test test_preloaded_intsances Refactor the unit test test_preloaded_instances into multiple unit tests testing different cases. This should hopefully improve the code health as measured by CodeScene. --- satpy/tests/test_yaml_reader.py | 105 +++++++++++++++++++------------- 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index e4b75faf49..6ccbbe33df 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1618,15 +1618,37 @@ def test_select_pattern(fake_gsyreader): "M9G-c-21000101100000-21000101100100-0001.nc") -def test_preloaded_instances(tmp_path, fake_gsyreader): - """That that preloaded instances are generated.""" +@pytest.fixture() +def fake_simple_nc_file(tmp_path): + """Create a small dummy NetCDF file for testing preloaded instances. + + Returns the filename. + """ + nm = tmp_path / "M9G-a-21000101053000-21000101053100-01.nc" + nm.parent.mkdir(exist_ok=True, parents=True) + ds = xr.Dataset() + ds["panarea"] = xr.DataArray(np.array([[0, 1, 2]]), dims=["y", "x"]) + ds["strómboli"] = xr.DataArray(np.array([[1, 1, 2]]), dims=["y", "x"]) + ds["salina"] = xr.DataArray(np.array([[2, 1, 2]]), dims=["y", "x"]) + ds.to_netcdf(nm, group="/grp") + + return nm + + +@pytest.fixture() +def dummy_preloadable_handler(): + """Return a dummy preloadable netcdf4-based filehandler.""" from satpy.readers.netcdf_utils import NetCDF4FileHandler, Preloadable - from satpy.readers.yaml_reader import GEOSegmentYAMLReader class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): pass + return DummyPreloadableHandler + +@pytest.fixture() +def fake_filetype_info(dummy_preloadable_handler): + """Return a fake filetype info dict.""" ft_info = { - "file_reader": DummyPreloadableHandler, + "file_reader": dummy_preloadable_handler, "file_patterns": [ 
"{platform}-a-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc", "{platform}-b-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc"], @@ -1638,35 +1660,29 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): "grp/strómboli": ["rc"], "grp/salina": []}} - ft_info_2 = { - "file_reader": DummyPreloadableHandler, + return ft_info + + +def test_preloaded_instances_works( + tmp_path, fake_gsyreader, fake_simple_nc_file, + dummy_preloadable_handler, fake_filetype_info): + """That that preloaded instances are generated.""" + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + + ft_info_2 = {**fake_filetype_info, "file_patterns": [ "{platform}-c-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc", - "{platform}-d-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc"], - "expected_segments": 3, - "time_tags": ["start_time", "end_time"], - "segment_tag": "segment", - "required_netcdf_variables": { - "grp/panarea": ["segment"], - "grp/strómboli": ["rc"], - "grp/salina": []}} + "{platform}-d-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc"]} gsyr = GEOSegmentYAMLReader( {"reader": { "name": "island-reader"}, "file_types": { - "m9g": ft_info, + "m9g": fake_filetype_info, "mag": ft_info_2}}, preload=True) - # create a dummy file - nm = tmp_path / "M9G-a-21000101053000-21000101053100-01.nc" - nm.parent.mkdir(exist_ok=True, parents=True) - ds = xr.Dataset() - ds["panarea"] = xr.DataArray(np.array([[0, 1, 2]]), dims=["y", "x"]) - ds["strómboli"] = xr.DataArray(np.array([[1, 1, 2]]), dims=["y", "x"]) - ds["salina"] = xr.DataArray(np.array([[2, 1, 2]]), dims=["y", "x"]) - ds.to_netcdf(nm, group="/grp") + # filename info belonging to fake_simple_nc_file fn_info = {"platform": "M9a", "start_time": datetime(2100, 1, 1, 5, 30), "end_time": datetime(2100, 1, 1, 5, 31), "segment": 1} @@ -1674,7 +1690,8 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): with 
unittest.mock.patch("appdirs.user_cache_dir") as au: au.return_value = os.fspath(tmp_path / "cache") # prepare cache files - dph = DummyPreloadableHandler(os.fspath(nm), fn_info, ft_info) + dph = dummy_preloadable_handler(os.fspath(fake_simple_nc_file), + fn_info, fake_filetype_info) for i in range(2, 4): # disk cache except for nr. 1 fn = (tmp_path / "cache" / "satpy" / "preloadable" / "DummyPreloadableHandler" / @@ -1682,19 +1699,27 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): fn.parent.mkdir(exist_ok=True, parents=True) dph.store_cache(os.fspath(fn)) - fhs = gsyr.create_filehandlers([os.fspath(nm)]) + fhs = gsyr.create_filehandlers([os.fspath(fake_simple_nc_file)]) assert len(fhs["m9g"]) == 3 - ffi2 = ft_info.copy() - ffi2["requires"] = ["pergola"] + +def test_preloaded_instances_requirement( + tmp_path, fake_gsyreader, fake_simple_nc_file, + dummy_preloadable_handler, fake_filetype_info): + """Test that pre-loading instances fails if there is a required tag.""" + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + + ft_info = {**fake_filetype_info, + "requires": ["pergola"]} + gsyr = GEOSegmentYAMLReader( {"reader": { "name": "alicudi"}, "file_types": { - "m9g": ffi2}}, preload=True) + "m9g": ft_info}}, preload=True) g = gsyr._new_filehandler_instances( - ffi2, - [(os.fspath(nm), + ft_info, + [(os.fspath(fake_simple_nc_file), {"platform": "M9G", "start_time": datetime(2100, 1, 1, 5, 30, ), "end_time": datetime(2100, 1, 1, 5, 31, ), @@ -1702,13 +1727,15 @@ class DummyPreloadableHandler(Preloadable, NetCDF4FileHandler): with pytest.raises(ValueError, match="Unable to preload"): list(g) - g = fake_gsyreader._new_preloaded_filehandler_instances( - ffi2, - [("M9G-a-21000101053000-21000101053100-0001.nc", - {"platform": "M9G", - "start_time": datetime(2100, 1, 1, 5, 30, ), - "end_time": datetime(2100, 1, 1, 5, 31, ), - "segment": 1})]) + +def test_preloaded_instances_not_implemented(tmp_path, fake_gsyreader, + fake_filetype_info): + 
"""Test that pre-loading instances fails if it is not implemented.""" + ft_info = {**fake_filetype_info, "requires": ["pergola"]} + + # Second argument irrelevant, as this part of the method should not be + # reached + g = fake_gsyreader._new_preloaded_filehandler_instances(ft_info, None) with pytest.raises(NotImplementedError, match="Pre-loading not implemented"): list(g) @@ -1775,7 +1802,3 @@ def test_get_cache_filename(tmp_path): cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / "BaseFileHandler" / "a-99991231235959-235959-04-01.pkl") - - -def test_preloaded_instances_files_absent(tmp_path): - """Test preloaded instances when subsequent files are absent.""" From a2887990e1bee09643be7519c38b604383bcc744 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 14 Feb 2024 10:47:03 +0100 Subject: [PATCH 34/74] Change fallback Preloadable --- satpy/tests/reader_tests/test_netcdf_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index ab9197dc5a..d0cdd6ea82 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -32,7 +32,12 @@ from satpy.readers.netcdf_utils import NetCDF4FileHandler, Preloadable except ImportError: # fake the import so we can at least run the tests in this file - NetCDF4FileHandler = Preloadable = object # type: ignore + NetCDF4FileHandler = object # type: ignore + # setting Preloadable to object leads to an MRO error when defining + # FakePreloadableHandler, therefore define own class + class Preloadable: # type: ignore + """Backup preloadable class if import fails.""" + pass class FakeNetCDF4FileHandler(NetCDF4FileHandler): From da7d566d5ffe1e74d545dd96622dfaf6337184b7 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 14 Feb 2024 11:04:08 +0100 Subject: [PATCH 35/74] Refactor 
_wait_for_file Refactor _wait_for_file to reduce the cyclomatic complexity --- satpy/readers/netcdf_utils.py | 40 ++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 44e2ba2011..2fde4aa337 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -580,24 +580,34 @@ def store_cache(self, filename=None): @dask.delayed -def _wait_for_file(fn, max_tries=300, wait=2): +def _wait_for_file(pat, max_tries=300, wait=2): """Wait for file to appear.""" for i in range(max_tries): - fns = glob.glob(fn) - if len(fns) == 0: - if i == 0: # only log if not yet present - LOG.debug(f"Waiting for {fn!s} to appear.") - if i % 60 == 30: - LOG.debug(f"Still waiting for {fn!s}") - time.sleep(wait) - continue - if len(fns) > 1: - raise ValueError(f"Expected one matching file, found {len(fns):d}") - if i > 0: # don't log if found immediately - LOG.debug(f"Found {fns[0]!s} matching {fn!s}!") - return fns[0] + if (match := _check_for_matching_file(i, pat, wait)): + if i > 0: # only log if not found immediately + LOG.debug(f"Found {match!s} matching {pat!s}!") + return match else: - raise TimeoutError(f"File matching {fn!s} failed to materialise") + raise TimeoutError(f"File matching {pat!s} failed to materialise") + + +def _check_for_matching_file(i, pat, wait): + fns = glob.glob(pat) + if len(fns) == 0: + _log_and_wait(i, pat, wait) + return + if len(fns) > 1: + raise ValueError(f"Expected one matching file, found {len(fns):d}") + return fns[0] + + +def _log_and_wait(i, pat, wait): + """Maybe log that we're waiting for pat, then wait.""" + if i == 0: # only log if not yet present + LOG.debug(f"Waiting for {pat!s} to appear.") + if i % 60 == 30: + LOG.debug(f"Still waiting for {pat!s}") + time.sleep(wait) @dask.delayed From 9bafbbc634e9b2f2e488a16786e03cb7541b914f Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 14 Feb 2024 11:10:21 +0100 Subject: 
[PATCH 36/74] Refactor _can_get_from_other_rc Refactor to simplify conditional logic. --- satpy/readers/netcdf_utils.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 2fde4aa337..a0b7cf5595 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -523,10 +523,30 @@ def _can_get_from_other_rc(self, itm): """Return true if variable is rc-cachable.""" # it's always safe to store variable attributes, shapes, dtype, dimensions # between repeat cycles + if self._is_safely_shareable_metadata(itm): + return True + # need an inverted mapping so I can tell which variables I can store + invmap = self._get_inv_name_map() + +# listed_variables = self.filetype_info.get("required_netcdf_variables") +# variable_name_replacements = self.filetype_info.get("variable_name_replacements") +# invmap = {} +# for raw_name in listed_variables: +# for subst_name in self._get_required_variable_names([raw_name], +# variable_name_replacements): +# invmap[subst_name] = raw_name + if "rc" in self.filetype_info["required_netcdf_variables"][invmap.get(itm, itm)]: + return True + return False + + def _is_safely_shareable_metadata(self, itm): + """Check if item refers to safely shareable metadata.""" for meta in ("/attr/", "/shape", "/dtype", "/dimension/", "/dimensions"): if meta in itm: return True - # need an inverted mapping so I can tell which variables I can store + + def _get_inv_name_map(self): + """Get inverted mapping for variable name replacements.""" listed_variables = self.filetype_info.get("required_netcdf_variables") variable_name_replacements = self.filetype_info.get("variable_name_replacements") invmap = {} @@ -534,9 +554,7 @@ def _can_get_from_other_rc(self, itm): for subst_name in self._get_required_variable_names([raw_name], variable_name_replacements): invmap[subst_name] = raw_name - if "rc" in 
self.filetype_info["required_netcdf_variables"][invmap.get(itm, itm)]: - return True - return False + return invmap def _collect_variable_delayed(self, subst_name): md = self.ref_fh[subst_name] # some metadata from reference segment From fd4b225c2e3a410971f09be88e6512e4456bac4e Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 16 Feb 2024 17:34:37 +0100 Subject: [PATCH 37/74] Remove dead code --- satpy/readers/netcdf_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index a0b7cf5595..cfd7673c27 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -527,14 +527,6 @@ def _can_get_from_other_rc(self, itm): return True # need an inverted mapping so I can tell which variables I can store invmap = self._get_inv_name_map() - -# listed_variables = self.filetype_info.get("required_netcdf_variables") -# variable_name_replacements = self.filetype_info.get("variable_name_replacements") -# invmap = {} -# for raw_name in listed_variables: -# for subst_name in self._get_required_variable_names([raw_name], -# variable_name_replacements): -# invmap[subst_name] = raw_name if "rc" in self.filetype_info["required_netcdf_variables"][invmap.get(itm, itm)]: return True return False From 205121b8df25a916ca04b0f649cd73dfb5ec6cf6 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 12 Jun 2024 11:10:29 +0200 Subject: [PATCH 38/74] Doc fixes + remove accidentally removed lines Small fixes in the documentation. 
Restore lines in YAML that were accidentally removed while resolving the merge conflict in commit 445fc5614df9d8042a38e6dfbe5b678f380d28c4 --- doc/source/reading.rst | 17 +++++++++-------- satpy/etc/readers/fci_l1c_nc.yaml | 4 ++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/reading.rst b/doc/source/reading.rst index 38c8b7f20c..c71d1dfa99 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -301,7 +301,7 @@ would wait for all needed segments to arrive, before they start processing any data by passing all segments to the :class:`~satpy.scene.Scene`. For a more timely imagery production, users can create the Scene, load the data, resample, and even call :meth:`~satpy.scene.Scene.save_datasets` -(as long as ``compute=False``) before there the data are complete. +(as long as ``compute=False``) before the data are complete. When they then trigger the computation, much of the overhead in Satpy internals has already been completed, and Satpy will process each segment as it comes in. @@ -309,14 +309,15 @@ as it comes in. To do so, Satpy caches a selection of data and metadata between segments and between repeat cycles. Caching between segments happens in-memory and needs no preparation from the user, but where data are cached -between repeat cycles, the user needs to create this cache first:: +between repeat cycles, the user needs to create this cache first from +a repeat cycle that is available completely:: >>> from satpy.readers import create_preloadable_cache >>> create_preloadable_cache("fci_l1c_nc", fci_files) -For one full disc of FDHSI or HRFI data. This needs to be done only once, or -possibly again if there is an unexpected change in the data that are cached -between repeat cycles (as defined in the reader YAML file). +This needs to be done only once as long as data or metadata cached +between repeat cycles does not change (for example, the rows at which +each repeat cycle starts and ends). 
To make use of eager processing, the Scene object should be called passing ``preload=True``, passing _only_ the path to the first segment:: @@ -334,10 +335,10 @@ Some limitations that may be resolved in the future: - Mixing different file types for the same reader is not yet supported. For FCI, that means it is not yet possible to mix FDHSI and HRFI data. -- Only nominal case +- Only the nominal case has been tested. Missing segments are not yet supported. - Dask may not order the processing of the chunks optimally. That means some dask workers may be waiting for chunks 33–40 as chunks 1–32 are coming in - and are not being processed. + and are not being processed. A suboptimal workaround is to use 40 workers. - Currently, Satpy merely checks the existence of a file and not whether it has been completely written. This may lead to incomplete files being read, which might lead to failures. @@ -349,7 +350,7 @@ on how this could be extended to other readers, see :class:`~satpy.readers.netcdf_utils.Preloadable` and :class:`~satpy.readers.yaml_reader.GEOSegmentYAMLReader`. -.. versionadded:: 0.47 +.. 
versionadded:: 0.50 Adding a Reader to Satpy ======================== diff --git a/satpy/etc/readers/fci_l1c_nc.yaml b/satpy/etc/readers/fci_l1c_nc.yaml index 96f0d194cc..26a191050a 100644 --- a/satpy/etc/readers/fci_l1c_nc.yaml +++ b/satpy/etc/readers/fci_l1c_nc.yaml @@ -114,6 +114,10 @@ file_types: - "{pflag}_{location_indicator},{data_designator},MTI{spacecraft_id:1d}+{data_source}-1C-RRAD-HRFI-{coverage}-{subsetting}-{component1}-BODY-{component3}-{purpose}-{format}_{oflag}_{originator}_{processing_time:%Y%m%d%H%M%S}_{facility_or_tool}_{environment}_{start_time:%Y%m%d%H%M%S}_{end_time:%Y%m%d%H%M%S}_{processing_mode}_{special_compression}_{disposition_mode}_{repeat_cycle_in_day:>04d}_{count_in_repeat_cycle:>04d}.nc" expected_segments: 40 required_netcdf_variables: *required-variables + segment_tag: count_in_repeat_cycle + time_tags: *tt + variable_tags: + - repeat_cycle_in_day variable_name_replacements: channel_name: - vis_06_hr From d362ff919b7ef367a1a8b41a4593e412aa0cef6a Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 12 Jun 2024 11:59:01 +0200 Subject: [PATCH 39/74] Rename Preloadable class Rename Preloadable class to be called PreloadableSegments. --- doc/source/reading.rst | 2 +- satpy/readers/fci_l1c_nc.py | 4 ++-- satpy/readers/netcdf_utils.py | 17 +++++++++-------- satpy/tests/reader_tests/test_netcdf_utils.py | 6 +++--- satpy/tests/test_yaml_reader.py | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/doc/source/reading.rst b/doc/source/reading.rst index c71d1dfa99..eb29d15df7 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -347,7 +347,7 @@ Note that this uses the ``h5netcdf`` backend for opening NetCDF files. For more technical background reading including hints on how this could be extended to other readers, see -:class:`~satpy.readers.netcdf_utils.Preloadable` and +:class:`~satpy.readers.netcdf_utils.PreloadableSegments` and :class:`~satpy.readers.yaml_reader.GEOSegmentYAMLReader`. .. 
versionadded:: 0.50 diff --git a/satpy/readers/fci_l1c_nc.py b/satpy/readers/fci_l1c_nc.py index 203dd11f63..72bc268ecb 100644 --- a/satpy/readers/fci_l1c_nc.py +++ b/satpy/readers/fci_l1c_nc.py @@ -124,7 +124,7 @@ from satpy.readers._geos_area import get_geos_area_naming from satpy.readers.eum_base import get_service_mode -from .netcdf_utils import NetCDF4FsspecFileHandler, Preloadable +from .netcdf_utils import NetCDF4FsspecFileHandler, PreloadableSegments logger = logging.getLogger(__name__) @@ -177,7 +177,7 @@ def _get_channel_name_from_dsname(dsname): return channel_name -class FCIL1cNCFileHandler(Preloadable, NetCDF4FsspecFileHandler): +class FCIL1cNCFileHandler(PreloadableSegments, NetCDF4FsspecFileHandler): """Class implementing the MTG FCI L1c Filehandler. This class implements the Meteosat Third Generation (MTG) Flexible diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index f0bc4554ad..a68fcf8f8c 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -448,16 +448,17 @@ def _get_attr(self, obj, key): return super()._get_attr(obj, key) -class Preloadable: - """Mixin class for pre-loading. +class PreloadableSegments: + """Mixin class for pre-loading segments. This is a mixin class designed to use with file handlers that support - preloading. Subclasses deriving from this mixin are expected to be - geo-segmentable file handlers, and can be created based on a glob pattern - for a non-existing file (as well as a path or glob pattern to an existing - file). The first segment of a repeat cycle is always processed only after - the file exists. For the remaining segments, metadata are collected as - much as possible before the file exists, from two possible sources: + preloading segments. Subclasses deriving from this mixin are expected + to be geo-segmentable file handlers, and can be created based on + a glob pattern for a non-existing file (as well as a path or glob + pattern to an existing file). 
The first segment of a repeat cycle + is always processed only after the file exists. For the remaining + segments, metadata are collected as much as possible before the file + exists, from two possible sources: - From the first segment. For some file formats, many pieces of metadata can be expected to be identical between segments. diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 2ee81821fb..f677d36f04 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -27,13 +27,13 @@ import xarray as xr try: - from satpy.readers.netcdf_utils import NetCDF4FileHandler, Preloadable + from satpy.readers.netcdf_utils import NetCDF4FileHandler, PreloadableSegments except ImportError: # fake the import so we can at least run the tests in this file NetCDF4FileHandler = object # type: ignore # setting Preloadable to object leads to an MRO error when defining # FakePreloadableHandler, therefore define own class - class Preloadable: # type: ignore + class PreloadableSegments: # type: ignore """Backup preloadable class if import fails.""" pass @@ -303,7 +303,7 @@ def test_use_h5netcdf_for_file_not_accessible_locally(self): assert fh._use_h5netcdf -class FakePreloadableHandler(Preloadable, FakeNetCDF4FileHandler): +class FakePreloadableHandler(PreloadableSegments, FakeNetCDF4FileHandler): """Fake preloadable handler.""" def __init__(self, filename, filename_info, filetype_info, **kwargs): diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 8ed2d61856..c8e3593629 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1640,8 +1640,8 @@ def fake_simple_nc_file(tmp_path): @pytest.fixture() def dummy_preloadable_handler(): """Return a dummy preloadable netcdf4-based filehandler.""" - from satpy.readers.netcdf_utils import NetCDF4FileHandler, Preloadable - class DummyPreloadableHandler(Preloadable, 
NetCDF4FileHandler): + from satpy.readers.netcdf_utils import NetCDF4FileHandler, PreloadableSegments + class DummyPreloadableHandler(PreloadableSegments, NetCDF4FileHandler): pass return DummyPreloadableHandler From 30e9755ff33645a43a1e9d8890743e80317f875e Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 12 Jun 2024 12:09:29 +0200 Subject: [PATCH 40/74] Split test case in three Split the test function for getting the cache filename in three separate tests for three different behaviours. --- satpy/tests/test_yaml_reader.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index c8e3593629..1a4d2536a4 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1742,13 +1742,13 @@ def test_preloaded_instances_not_implemented(tmp_path, fake_gsyreader, list(g) -def test_get_cache_filename(tmp_path): - """Test getting cache filename.""" +def test_get_cache_filename_segment_only(tmp_path): + """Test getting cache filename, segment only.""" from satpy.readers.yaml_reader import GEOSegmentYAMLReader fn = tmp_path / "a-01.nc" fn_info = {"segment": 1} - ft_info_simple = { + ft_info = { "file_reader": BaseFileHandler, "file_patterns": ["a-{segment:>02d}.nc"], "segment_tag": "segment", @@ -1758,8 +1758,8 @@ def test_get_cache_filename(tmp_path): {"reader": { "name": "filicudi"}, "file_types": { - "m9g": ft_info_simple}}, preload=True) - fh = BaseFileHandler(fn, fn_info, ft_info_simple) + "m9g": ft_info}}, preload=True) + fh = BaseFileHandler(fn, fn_info, ft_info) with unittest.mock.patch("appdirs.user_cache_dir") as au: au.return_value = os.fspath(tmp_path / "cache") @@ -1767,10 +1767,18 @@ def test_get_cache_filename(tmp_path): assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / "BaseFileHandler" / "a-01.pkl") + +def test_get_cache_filename_cache_and_segment(tmp_path): + """Test getting the cache filename with 
segment and repeat cycle.""" + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + fn = tmp_path / "a-04-01.nc" fn_info = {"rc": 4, "segment": 1} - ft_info = ft_info_simple.copy() - ft_info["file_patterns"] = ["a-{rc:>02d}-{segment:>02d}.nc"] + ft_info = { + "file_reader": BaseFileHandler, + "file_patterns": ["a-{rc:>02d}-{segment:>02d}.nc"], + "segment_tag": "segment", + "expected_segments": 5} gsyr = GEOSegmentYAMLReader( {"reader": { @@ -1785,12 +1793,20 @@ def test_get_cache_filename(tmp_path): assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / "BaseFileHandler" / "a-04-01.pkl") + +def test_get_cache_filename_including_time(tmp_path): + """Test getting the cache filename including a dummpy time.""" + from satpy.readers.yaml_reader import GEOSegmentYAMLReader + fn = tmp_path / "a-20421015234500-234600-04-01.nc" fn_info = {"start_time": dt.datetime(2042, 10, 15, 23, 45), "end_time": dt.datetime(2042, 10, 15, 23, 46), "rc": 4, "segment": 1} - ft_info = ft_info_simple.copy() - ft_info["file_patterns"] = ["a-{start_time:%Y%m%d%H%M%S}-{end_time:%H%M%S}-{rc:>02d}-{segment:>02d}.nc"] - ft_info["time_tags"] = ["start_time", "end_time"] + ft_info = { + "file_reader": BaseFileHandler, + "file_patterns": ["a-{start_time:%Y%m%d%H%M%S}-{end_time:%H%M%S}-{rc:>02d}-{segment:>02d}.nc"], + "segment_tag": "segment", + "expected_segments": 5, + "time_tags": ["start_time", "end_time"]} gsyr = GEOSegmentYAMLReader( {"reader": { From 38431d382c187d84a16983d02ee0abfda3c6a468 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 12 Jun 2024 14:56:43 +0200 Subject: [PATCH 41/74] Use configuration parameters for preloading Use configuration parameters to decide whether to preload or not. 
--- doc/source/config.rst | 23 ++++ doc/source/reading.rst | 12 ++- satpy/_config.py | 3 + satpy/readers/netcdf_utils.py | 18 ++-- satpy/readers/yaml_reader.py | 9 +- satpy/tests/reader_tests/test_netcdf_utils.py | 2 +- satpy/tests/test_readers.py | 7 +- satpy/tests/test_yaml_reader.py | 101 ++++++++++-------- 8 files changed, 107 insertions(+), 68 deletions(-) diff --git a/doc/source/config.rst b/doc/source/config.rst index 2279e10fe5..82b25512b8 100644 --- a/doc/source/config.rst +++ b/doc/source/config.rst @@ -274,6 +274,29 @@ Clipping of negative radiances is currently implemented for the following reader * ``abi_l1b`` +Preloading segments +^^^^^^^^^^^^^^^^^^^ + +* **Environment variable**: ``SATPY_READERS__PRELOAD_SEGMENTS`` +* **YAML-Config Key**: ``readers.preload_segments`` +* **Default**: False + +Preload segments for those readers where it is supported. See +:ref:`Reading data before they're available` for details. + +* **Environment variable**: ``SATPY_READERS__PRELOAD_STEP`` +* **YAML-Config Key**: ``readers.preload_step`` +* **Default**: 2 + +When preloading, internal time in seconds to check if a file has become +available. + +* **Environment variable**: ``SATPY_READERS__PRELOAD_TRIES``. +* **YAML-Config Key**: ``readers.preload_tries``. +* **Default**: 300 + +When preloading, how many times to check if a file has become available +before giving up. Temporary Directory ^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/reading.rst b/doc/source/reading.rst index eb29d15df7..6639fa32f9 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -317,20 +317,22 @@ a repeat cycle that is available completely:: This needs to be done only once as long as data or metadata cached between repeat cycles does not change (for example, the rows at which -each repeat cycle starts and ends). 
-To make use of eager processing, the Scene object should be called passing -``preload=True``, passing _only_ the path to the first segment:: +each repeat cycle starts and ends). To make use of eager processing, set +the configuration variable ``readers.preload_segments``. When creating +the scene, pass only the path to the first segment:: + >>> satpy.config.set({"readers.preload_segments": True}) >>> sc = Scene( ... filenames=[path_to_first_segment], - ... reader="fci_l1c_nc", - ... reader_kwargs={"preload": True}) + ... reader="fci_l1c_nc") Satpy will figure out the names of the remaining segments and find them as they come in. If the data are already available, processing is similar to the regular case. If the data are not yet available, Satpy will wait during the computation of the dask graphs until data become available. +For additional configuration parameters, see :ref:`Settings`. + Some limitations that may be resolved in the future: - Mixing different file types for the same reader is not yet supported. diff --git a/satpy/_config.py b/satpy/_config.py index fbfcb0c0d5..656b15cbfb 100644 --- a/satpy/_config.py +++ b/satpy/_config.py @@ -53,6 +53,9 @@ "sensor_angles_position_preference": "actual", "readers": { "clip_negative_radiances": False, + "preload_segments": False, + "preload_step": 2, + "preload_tries": 300, }, } diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index a68fcf8f8c..637c4adff4 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -27,6 +27,7 @@ import numpy as np import xarray as xr +import satpy from satpy.readers import open_file_or_filename from satpy.readers.file_handlers import BaseFileHandler from satpy.readers.utils import np2str @@ -478,20 +479,21 @@ class PreloadableSegments: Attributes (variable, global, and group), shapes, dtypes, and dimension names are assumed to be always shareable between repeat cycles. 
- To use preloading, pass ``preload=True`` to ``reader_kwargs`` when creating - the Scene. + To use preloading, set the satpy configuration variable + ``readers.preload_segments`` to True. The initialisation parameter + for this filehandler might still be False, because the first segment (the + reference segment) of a repeat cycle is loaded normally. This feature is experimental. - .. versionadded:: 0.47 + .. versionadded:: 0.50 """ - def __init__(self, *args, preload=False, preload_step=2, preload_tries=300, - ref_fh=None, rc_cache=None, **kwargs): + def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): """Store attributes needed for preloading to work.""" self.preload = preload - self.preload_tries = preload_tries - self.preload_step = preload_step + self.preload_tries = satpy.config.get("readers.preload_tries") + self.preload_step = satpy.config.get("readers.preload_step") if preload: if not isinstance(ref_fh, BaseFileHandler): raise TypeError( @@ -500,7 +502,7 @@ def __init__(self, *args, preload=False, preload_step=2, preload_tries=300, self.ref_fh = ref_fh if not isinstance(rc_cache, (str, bytes, os.PathLike)): raise TypeError( - "Expect cache file when preloading, got " + "Expected cache file when preloading, got " f"{type(rc_cache)!s}") self.rc_cache = rc_cache super().__init__(*args, **kwargs) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index ad5670fc0a..19b0a35286 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -36,6 +36,8 @@ from pyresample.geometry import AreaDefinition, StackedAreaDefinition, SwathDefinition from trollsift.parser import Parser, globify, parse +import satpy + try: from yaml import CLoader as Loader except ImportError: @@ -1168,8 +1170,7 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): field which will be used if ``expected_segments`` is not defined. This will default to 1 segment. 
- This reader uses an optional ``preload`` keyword argument that can be - passed to the :meth:`~satpy.scene.Scene` class upon instantiation. + This reader uses the ``readers.preload_segments`` configuration setting. This argument is intended for near real time processing. When only one segment has arrived, the user can create a scene using this one segment and ``preload=True``, and Satpy will populate a scene based on @@ -1181,13 +1182,11 @@ class GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): (how many tries before giving up). This feature is experimental. Use at your own risk. - - .. versionadded:: 0.47 """ def __init__(self, *args, **kwargs): """Initialise object.""" - self.preload = kwargs.pop("preload", False) + self.preload = satpy.config.get("readers.preload_segments") super().__init__(*args, **kwargs) def create_filehandlers(self, filenames, fh_kwargs=None): diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index f677d36f04..4bf07b8466 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -510,7 +510,7 @@ def test_incorrect_usage(self, fake_config, preloadable_false, cache_file, with pytest.raises(TypeError, match="Expect reference filehandler"): FakePreloadableHandler("dummy", {}, fake_config, preload=True, rc_cache=cache_file) - with pytest.raises(TypeError, match="Expect cache file"): + with pytest.raises(TypeError, match="Expected cache file"): FakePreloadableHandler("dummy", {}, fake_config, preload=True, ref_fh=preloadable_false) fph = FakePreloadableHandler("dummy", {}, fake_config, preload=True, diff --git a/satpy/tests/test_readers.py b/satpy/tests/test_readers.py index f7c414b089..c18d50afb3 100644 --- a/satpy/tests/test_readers.py +++ b/satpy/tests/test_readers.py @@ -34,6 +34,7 @@ import xarray as xr from pytest_lazy_fixtures import lf as lazy_fixture +import satpy from satpy.dataset.data_dict import get_key from 
satpy.dataset.dataid import DataID, ModifierTuple, WavelengthRange from satpy.readers import FSFile, find_files_and_readers, open_file_or_filename @@ -1119,9 +1120,11 @@ def test_create_preloadable_cache(tmp_path): dph = FakePreloadableHandler( os.fspath(tmp_path / "a-0.nc"), {"segment": 0}, fake_config["file_types"]["m9g"], - preload=False, rc_cache=tmp_path / "test.pkl") + rc_cache=tmp_path / "test.pkl", + preload=False) dph.file_content["/iceland/reykjavík"] = xr.DataArray(da.from_array([[0, 1, 2]])) - gsyr = GEOSegmentYAMLReader(fake_config, preload=True) + with satpy.config.set({"readers.preload_segments": True}): + gsyr = GEOSegmentYAMLReader(fake_config) gsyr.file_handlers["handler"] = [dph] with unittest.mock.patch("satpy.readers.load_readers") as srl: diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index 1a4d2536a4..ff366b8a2d 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -29,6 +29,7 @@ import pytest import xarray as xr +import satpy import satpy.readers.yaml_reader as yr from satpy._compat import cache from satpy.dataset import DataQuery @@ -1574,11 +1575,12 @@ def test_predict_filename_info(tmp_path): def fake_gsyreader(): """Create a fake GeoSegmentYAMLReader.""" from satpy.readers.yaml_reader import GEOSegmentYAMLReader - return GEOSegmentYAMLReader( - {"reader": { - "name": "alicudi"}, - "file_types": { - "m9g": _fake_filetype_info}}, preload=True) + with satpy.config.set({"readers.preload_segments": True}): + return GEOSegmentYAMLReader( + {"reader": { + "name": "alicudi"}, + "file_types": { + "m9g": _fake_filetype_info}}) def test_predict_filename(tmp_path, fake_gsyreader): @@ -1676,33 +1678,34 @@ def test_preloaded_instances_works( "{platform}-c-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc", "{platform}-d-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc"]} - gsyr = GEOSegmentYAMLReader( - {"reader": { - "name": "island-reader"}, - 
"file_types": { - "m9g": fake_filetype_info, - "mag": ft_info_2}}, preload=True) + with satpy.config.set({"readers.preload_segments": True}): + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "island-reader"}, + "file_types": { + "m9g": fake_filetype_info, + "mag": ft_info_2}}) - # filename info belonging to fake_simple_nc_file + # filename info belonging to fake_simple_nc_file - fn_info = {"platform": "M9a", "start_time": dt.datetime(2100, 1, 1, 5, 30), - "end_time": dt.datetime(2100, 1, 1, 5, 31), "segment": 1} + fn_info = {"platform": "M9a", "start_time": dt.datetime(2100, 1, 1, 5, 30), + "end_time": dt.datetime(2100, 1, 1, 5, 31), "segment": 1} - with unittest.mock.patch("appdirs.user_cache_dir") as au: - au.return_value = os.fspath(tmp_path / "cache") - # prepare cache files - dph = dummy_preloadable_handler(os.fspath(fake_simple_nc_file), - fn_info, fake_filetype_info) - for i in range(2, 4): # disk cache except for nr. 1 - fn = (tmp_path / "cache" / "satpy" / "preloadable" / - "DummyPreloadableHandler" / - f"M9G-a-99991231235959-99991231235959-{i:>02d}.pkl") - fn.parent.mkdir(exist_ok=True, parents=True) - dph.store_cache(os.fspath(fn)) + with unittest.mock.patch("appdirs.user_cache_dir") as au: + au.return_value = os.fspath(tmp_path / "cache") + # prepare cache files + dph = dummy_preloadable_handler(os.fspath(fake_simple_nc_file), + fn_info, fake_filetype_info) + for i in range(2, 4): # disk cache except for nr. 
1 + fn = (tmp_path / "cache" / "satpy" / "preloadable" / + "DummyPreloadableHandler" / + f"M9G-a-99991231235959-99991231235959-{i:>02d}.pkl") + fn.parent.mkdir(exist_ok=True, parents=True) + dph.store_cache(os.fspath(fn)) - fhs = gsyr.create_filehandlers([os.fspath(fake_simple_nc_file)]) - assert len(fhs["m9g"]) == 3 + fhs = gsyr.create_filehandlers([os.fspath(fake_simple_nc_file)]) + assert len(fhs["m9g"]) == 3 def test_preloaded_instances_requirement( @@ -1714,11 +1717,12 @@ def test_preloaded_instances_requirement( ft_info = {**fake_filetype_info, "requires": ["pergola"]} - gsyr = GEOSegmentYAMLReader( - {"reader": { - "name": "alicudi"}, - "file_types": { - "m9g": ft_info}}, preload=True) + with satpy.config.set({"readers.preload_segments": True}): + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "alicudi"}, + "file_types": { + "m9g": ft_info}}) g = gsyr._new_filehandler_instances( ft_info, [(os.fspath(fake_simple_nc_file), @@ -1754,11 +1758,12 @@ def test_get_cache_filename_segment_only(tmp_path): "segment_tag": "segment", "expected_segments": 5} - gsyr = GEOSegmentYAMLReader( - {"reader": { - "name": "filicudi"}, - "file_types": { - "m9g": ft_info}}, preload=True) + with satpy.config.set({"readers.preload_segments": True}): + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "filicudi"}, + "file_types": { + "m9g": ft_info}}) fh = BaseFileHandler(fn, fn_info, ft_info) with unittest.mock.patch("appdirs.user_cache_dir") as au: @@ -1780,11 +1785,12 @@ def test_get_cache_filename_cache_and_segment(tmp_path): "segment_tag": "segment", "expected_segments": 5} - gsyr = GEOSegmentYAMLReader( - {"reader": { - "name": "salina"}, - "file_types": { - "m9g": ft_info}}, preload=True) + with satpy.config.set({"readers.preload_segments": True}): + gsyr = GEOSegmentYAMLReader( + {"reader": { + "name": "salina"}, + "file_types": { + "m9g": ft_info}}) fh = BaseFileHandler(fn, fn_info, ft_info) with unittest.mock.patch("appdirs.user_cache_dir") as au: @@ -1808,11 
+1814,12 @@ def test_get_cache_filename_including_time(tmp_path):
          "expected_segments": 5,
          "time_tags": ["start_time", "end_time"]}
 
-    gsyr = GEOSegmentYAMLReader(
-        {"reader": {
-            "name": "salina"},
-         "file_types": {
-             "m9g": ft_info}}, preload=True)
+    with satpy.config.set({"readers.preload_segments": True}):
+        gsyr = GEOSegmentYAMLReader(
+            {"reader": {
+                "name": "salina"},
+             "file_types": {
+                 "m9g": ft_info}})
 
     fh = BaseFileHandler(fn, fn_info, ft_info)
     with unittest.mock.patch("appdirs.user_cache_dir") as au:

From cda858141730635ec11afbdd75a78956b2cf5b67 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Thu, 13 Jun 2024 10:28:28 +0200
Subject: [PATCH 42/74] secede/rejoin

---
 satpy/readers/netcdf_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py
index 637c4adff4..2a53401dbc 100644
--- a/satpy/readers/netcdf_utils.py
+++ b/satpy/readers/netcdf_utils.py
@@ -22,7 +22,7 @@
 import time
 
 import dask.array as da
-import dask.delayed
+import dask.distributed
 import netCDF4
 import numpy as np
 import xarray as xr
@@ -632,7 +632,9 @@ def _log_and_wait(i, pat, wait):
         LOG.debug(f"Waiting for {pat!s} to appear.")
     if i % 60 == 30:
         LOG.debug(f"Still waiting for {pat!s}")
+    dask.distributed.secede()
     time.sleep(wait)
+    dask.distributed.rejoin()
 
 
 @dask.delayed

From 7f6a8d467cecc47dc3bd7127e25d44c292fb698b Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Fri, 14 Jun 2024 10:02:49 +0200
Subject: [PATCH 43/74] Add test to reproduce GH 2815

---
 satpy/tests/reader_tests/test_netcdf_utils.py | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py
index 5e0bcc44a1..f48141c635 100644
--- a/satpy/tests/reader_tests/test_netcdf_utils.py
+++ b/satpy/tests/reader_tests/test_netcdf_utils.py
@@ -392,3 +392,40 @@ def test_get_data_as_xarray_scalar_h5netcdf(tmp_path):
     res =
get_data_as_xarray(fid["test_data"]) np.testing.assert_equal(res.data, np.array(data)) assert res.attrs == NC_ATTRS + + +@pytest.fixture() +def dummy_nc(tmp_path): + """Fixture to create a dummy NetCDF file and return its path.""" + import xarray as xr + + fn = tmp_path / "sjaunja.nc" + ds = xr.Dataset(data_vars={"kaitum": (["x"], np.arange(10))}) + ds.to_netcdf(fn) + return fn + + +def test_caching_distributed(dummy_nc): + """Test that the distributed scheduler works with file handle caching. + + This is a test for GitHub issue 2815. + """ + from dask.distributed import Client + + from satpy.readers.netcdf_utils import NetCDF4FileHandler + + fh = NetCDF4FileHandler(dummy_nc, {}, {}, cache_handle=True) + + Client() + + def doubler(x): + return x * 2 + + # As documented in GH issue 2815, using dask distributed with the file + # handle cacher might fail in non-trivial ways, such as giving incorrect + # results. Testing map_blocks is one way to reproduce the problem + # reliably, even though the problem also manifests itself (in different + # ways) without map_blocks. 
+ + dask_doubler = fh["kaitum"].map_blocks(doubler) + dask_doubler.compute() From 6d31c20ab3d914711eda2506083053b9fcf071ba Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 14 Jun 2024 11:10:52 +0200 Subject: [PATCH 44/74] make sure distributed client is local --- satpy/tests/reader_tests/test_netcdf_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index f48141c635..c381d37d4a 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -416,8 +416,6 @@ def test_caching_distributed(dummy_nc): fh = NetCDF4FileHandler(dummy_nc, {}, {}, cache_handle=True) - Client() - def doubler(x): return x * 2 @@ -427,5 +425,7 @@ def doubler(x): # reliably, even though the problem also manifests itself (in different # ways) without map_blocks. - dask_doubler = fh["kaitum"].map_blocks(doubler) - dask_doubler.compute() + + with Client(): + dask_doubler = fh["kaitum"].map_blocks(doubler) + dask_doubler.compute() From 1e26d1a1c1846e0d2a03d41466bb59aae89fc974 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 14 Jun 2024 11:32:10 +0200 Subject: [PATCH 45/74] Start utility function for distributed friendly Start work on a utility function to get a dask array from a dataset variable in a way that is friendly to dask.distributed. 
--- satpy/readers/utils.py | 30 +++++++++++++++++++++++ satpy/tests/reader_tests/test_utils.py | 34 ++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index c1bf7c7497..72bc26bf30 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -29,6 +29,7 @@ from shutil import which from subprocess import PIPE, Popen # nosec +import dask.array as da import numpy as np import pyproj import xarray as xr @@ -474,3 +475,32 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): with xr.set_options(keep_attrs=True): reflectance = reflectance / reflectance.dtype.type(sun_earth_dist * sun_earth_dist) return reflectance + + +def get_distributed_friendly_dask_array(manager, varname): + """Construct a dask array from a variable for dask distributed. + + When we construct a dask array using da.array and use that to create an + xarray dataarray, the result is not serialisable and dask graphs using + this dataarray cannot be computed when the dask distributed scheduler + is in use. To circumvent this problem, xarray provides the + CachingFileManager. See GH#2815 for more information. + + Args: + manager (xarray.backends.CachingFileManager): + Instance of xarray.backends.CachingFileManager encapsulating the + dataset to be read. + varname (str): + Name of the variable. 
+ """ + def get_chunk(block_info): + with manager.acquire_context() as nc: + loc = block_info[None]["array-location"][0] + rv = nc[varname][loc[0]:loc[1]] + return rv + + return da.map_blocks( + get_chunk, + chunks=(10,), + meta=np.array([]), + dtype="i4") diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index ba43688b76..3a0d85baed 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -512,3 +512,37 @@ def test_generic_open_binary(tmp_path, data, filename, mode): read_binary_data = f.read() assert read_binary_data == dummy_data + + +@pytest.fixture() +def dummy_nc(tmp_path): + """Fixture to create a dummy NetCDF file and return its path.""" + import xarray as xr + + fn = tmp_path / "sjaunja.nc" + ds = xr.Dataset(data_vars={"kaitum": (["x"], np.arange(10, dtype="i4"))}) + ds.to_netcdf(fn) + return fn + + +def test_get_distributed_friendly_dask_array(dummy_nc): + """Test getting a dask distributed friendly dask array.""" + import netCDF4 + from dask.distributed import Client + from xarray.backends import CachingFileManager + + cfm = CachingFileManager(netCDF4.Dataset, dummy_nc, mode="r") + arr = hf.get_distributed_friendly_dask_array(cfm, "kaitum") + + # As documented in GH issue 2815, using dask distributed with the file + # handle cacher might fail in non-trivial ways, such as giving incorrect + # results. Testing map_blocks is one way to reproduce the problem + # reliably, even though the problem also manifests itself (in different + # ways) without map_blocks. + + def doubler(x): + return x * 2 + + with Client(): + dask_doubler = arr.map_blocks(doubler) + dask_doubler.compute() From be40c5ba5b1210317e95c3a6680ec7c61d81d057 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 14 Jun 2024 14:11:47 +0200 Subject: [PATCH 46/74] Parameterise test and simplify implementation For the distributed-friendly dask array helper, parameterise the test to cover more cases. 
Simplify the implementation.

---
 satpy/readers/utils.py                 | 24 ++++++++++++------
 satpy/tests/reader_tests/test_utils.py | 33 +++++++++++++++-----------
 2 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py
index 72bc26bf30..e39b46c0e6 100644
--- a/satpy/readers/utils.py
+++ b/satpy/readers/utils.py
@@ -477,7 +477,7 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None):
     return reflectance
 
 
-def get_distributed_friendly_dask_array(manager, varname):
+def get_distributed_friendly_dask_array(manager, varname, chunks=None):
     """Construct a dask array from a variable for dask distributed.
 
     When we construct a dask array using da.array and use that to create an
@@ -486,21 +486,29 @@ def get_distributed_friendly_dask_array(manager, varname):
     is in use. To circumvent this problem, xarray provides the
     CachingFileManager. See GH#2815 for more information.
 
+    Should have at least one dimension.
+
+    Example::
+
+        >>> import netCDF4
+        >>> from xarray.backends import CachingFileManager
+        >>> cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r")
+        >>> arr = get_distributed_friendly_dask_array(cfm, "my_var")
+
     Args:
         manager (xarray.backends.CachingFileManager):
             Instance of xarray.backends.CachingFileManager encapsulating the
             dataset to be read.
         varname (str):
             Name of the variable.
+        chunks (tuple or None, optional):
+            Chunks to use when creating the dask array.
""" - def get_chunk(block_info): + def get_chunk(): with manager.acquire_context() as nc: - loc = block_info[None]["array-location"][0] - rv = nc[varname][loc[0]:loc[1]] - return rv + return nc[varname][:] return da.map_blocks( get_chunk, - chunks=(10,), - meta=np.array([]), - dtype="i4") + chunks=chunks, + meta=np.array([])) diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index 3a0d85baed..1b8af70e10 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -514,24 +514,23 @@ def test_generic_open_binary(tmp_path, data, filename, mode): assert read_binary_data == dummy_data -@pytest.fixture() -def dummy_nc(tmp_path): - """Fixture to create a dummy NetCDF file and return its path.""" - import xarray as xr - - fn = tmp_path / "sjaunja.nc" - ds = xr.Dataset(data_vars={"kaitum": (["x"], np.arange(10, dtype="i4"))}) - ds.to_netcdf(fn) - return fn - - -def test_get_distributed_friendly_dask_array(dummy_nc): +@pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4)]) +@pytest.mark.parametrize("dtype", ["i4", "f4", "f8"]) +def test_get_distributed_friendly_dask_array(tmp_path, shape, dtype): """Test getting a dask distributed friendly dask array.""" import netCDF4 from dask.distributed import Client from xarray.backends import CachingFileManager - cfm = CachingFileManager(netCDF4.Dataset, dummy_nc, mode="r") + fn = tmp_path / "sjaunja.nc" + ds = xr.Dataset( + data_vars={ + "kaitum": (["x", "y", "z"][:len(shape)], + np.arange(np.prod(shape), + dtype=dtype).reshape(shape))}) + ds.to_netcdf(fn) + + cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") arr = hf.get_distributed_friendly_dask_array(cfm, "kaitum") # As documented in GH issue 2815, using dask distributed with the file @@ -543,6 +542,12 @@ def test_get_distributed_friendly_dask_array(dummy_nc): def doubler(x): return x * 2 + # FIXME: setting up the client is slow, taking more than one second — + # consider putting it in a 
class-scoped fixture and putting this test + # in a class (so it is still shared between parameterised runs) with Client(): dask_doubler = arr.map_blocks(doubler) - dask_doubler.compute() + res = dask_doubler.compute() + assert res.shape == shape + assert res.dtype == dtype + np.testing.assert_array_equal(res, np.arange(np.prod(shape)).reshape(shape)*2) From cbd00f0439e09432bc4555458b7045693f827c71 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 14 Jun 2024 14:40:42 +0200 Subject: [PATCH 47/74] Force shape and dtype. First working prototype. We need to force the shape and the dtype when getting the dask-distributed-friendly xarray-dataarray. Seems to have a first working prototype now. --- satpy/readers/netcdf_utils.py | 38 ++++++++++++++++---------- satpy/readers/utils.py | 7 +++-- satpy/tests/reader_tests/test_utils.py | 9 ++++-- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index c8b8a3f85f..a6ca0bccef 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -19,14 +19,13 @@ import logging -import dask.array as da import netCDF4 import numpy as np import xarray as xr from satpy.readers import open_file_or_filename from satpy.readers.file_handlers import BaseFileHandler -from satpy.readers.utils import np2str +from satpy.readers.utils import get_distributed_friendly_dask_array, np2str from satpy.utils import get_legacy_chunk_size LOG = logging.getLogger(__name__) @@ -85,10 +84,12 @@ class NetCDF4FileHandler(BaseFileHandler): xarray_kwargs (dict): Addition arguments to `xarray.open_dataset` cache_var_size (int): Cache variables smaller than this size. cache_handle (bool): Keep files open for lifetime of filehandler. + Uses xarray.backends.CachingFileManager, which uses a least + recently used cache. 
""" - file_handle = None + manager = None def __init__(self, filename, filename_info, filetype_info, auto_maskandscale=False, xarray_kwargs=None, @@ -118,7 +119,8 @@ def __init__(self, filename, filename_info, filetype_info, self.collect_cache_vars(cache_var_size) if cache_handle: - self.file_handle = file_handle + self.manager = xr.backends.CachingFileManager( + netCDF4.Dataset, self.filename, mode="r") else: file_handle.close() @@ -196,9 +198,9 @@ def _get_required_variable_names(listed_variables, variable_name_replacements): def __del__(self): """Delete the file handler.""" - if self.file_handle is not None: + if self.manager is not None: try: - self.file_handle.close() + self.manager.close() except RuntimeError: # presumably closed already pass @@ -289,8 +291,8 @@ def _get_variable(self, key, val): group, key = parts else: group = None - if self.file_handle is not None: - val = self._get_var_from_filehandle(group, key) + if self.manager is not None: + val = self._get_var_from_manager(group, key) else: val = self._get_var_from_xr(group, key) return val @@ -319,18 +321,26 @@ def _get_var_from_xr(self, group, key): val.load() return val - def _get_var_from_filehandle(self, group, key): + def _get_var_from_manager(self, group, key): # Not getting coordinates as this is more work, therefore more # overhead, and those are not used downstream. 
+ with self.manager.acquire_context() as ds: + if group is not None: + v = ds[group][key] + else: + v = ds[key] if group is None: - g = self.file_handle + dv = get_distributed_friendly_dask_array( + self.manager, key, + chunks=v.shape, dtype=v.dtype) else: - g = self.file_handle[group] - v = g[key] + dv = get_distributed_friendly_dask_array( + self.manager, key, group=group, + chunks=v.shape, dtype=v.dtype) attrs = self._get_object_attrs(v) x = xr.DataArray( - da.from_array(v), dims=v.dimensions, attrs=attrs, - name=v.name) + dv, + dims=v.dimensions, attrs=attrs, name=v.name) return x def __contains__(self, item): diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index e39b46c0e6..97c8b2f4ca 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -477,7 +477,7 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): return reflectance -def get_distributed_friendly_dask_array(manager, varname, chunks=None): +def get_distributed_friendly_dask_array(manager, varname, chunks, dtype): """Construct a dask array from a variable for dask distributed. When we construct a dask array using da.array and use that to create an @@ -501,8 +501,10 @@ def get_distributed_friendly_dask_array(manager, varname, chunks=None): dataset to be read. varname (str): Name of the variable. - chunks (tuple or None, optional): + chunks (tuple): Chunks to use when creating the dask array. + dtype (dtype): + What dtype to use. 
""" def get_chunk(): with manager.acquire_context() as nc: @@ -511,4 +513,5 @@ def get_chunk(): return da.map_blocks( get_chunk, chunks=chunks, + dtype=dtype, meta=np.array([])) diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index 1b8af70e10..876922477a 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -531,7 +531,8 @@ def test_get_distributed_friendly_dask_array(tmp_path, shape, dtype): ds.to_netcdf(fn) cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") - arr = hf.get_distributed_friendly_dask_array(cfm, "kaitum") + arr = hf.get_distributed_friendly_dask_array(cfm, "kaitum", + chunks=shape, dtype=dtype) # As documented in GH issue 2815, using dask distributed with the file # handle cacher might fail in non-trivial ways, such as giving incorrect @@ -548,6 +549,8 @@ def doubler(x): with Client(): dask_doubler = arr.map_blocks(doubler) res = dask_doubler.compute() - assert res.shape == shape - assert res.dtype == dtype + assert shape == dask_doubler.shape # we will need dtype before compute + assert shape == res.shape + assert dtype == dask_doubler.dtype + assert dtype == res.dtype np.testing.assert_array_equal(res, np.arange(np.prod(shape)).reshape(shape)*2) From af4ee66a1c424f3d6747026dfab95bd98c74172e Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 20 Jun 2024 09:24:16 +0200 Subject: [PATCH 48/74] Add group support and speed up tests Add group support for getting a dask distributed friendly dask array. Speed up the related tests by sharing the dask distributed client setup and breakdown. 
--- satpy/readers/utils.py | 7 +- satpy/tests/reader_tests/test_utils.py | 92 +++++++++++++++----------- 2 files changed, 59 insertions(+), 40 deletions(-) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index 97c8b2f4ca..10de2b364c 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -477,7 +477,8 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): return reflectance -def get_distributed_friendly_dask_array(manager, varname, chunks, dtype): +def get_distributed_friendly_dask_array(manager, varname, chunks, dtype, + group="/"): """Construct a dask array from a variable for dask distributed. When we construct a dask array using da.array and use that to create an @@ -505,10 +506,12 @@ def get_distributed_friendly_dask_array(manager, varname, chunks, dtype): Chunks to use when creating the dask array. dtype (dtype): What dtype to use. + group (str): + What group to read the variable from. """ def get_chunk(): with manager.acquire_context() as nc: - return nc[varname][:] + return nc["/".join([group, varname])][:] return da.map_blocks( get_chunk, diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index 876922477a..2de91e6d4b 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -514,43 +514,59 @@ def test_generic_open_binary(tmp_path, data, filename, mode): assert read_binary_data == dummy_data -@pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4)]) -@pytest.mark.parametrize("dtype", ["i4", "f4", "f8"]) -def test_get_distributed_friendly_dask_array(tmp_path, shape, dtype): - """Test getting a dask distributed friendly dask array.""" - import netCDF4 - from dask.distributed import Client - from xarray.backends import CachingFileManager - - fn = tmp_path / "sjaunja.nc" - ds = xr.Dataset( - data_vars={ - "kaitum": (["x", "y", "z"][:len(shape)], - np.arange(np.prod(shape), - dtype=dtype).reshape(shape))}) - ds.to_netcdf(fn) - - cfm = 
CachingFileManager(netCDF4.Dataset, fn, mode="r") - arr = hf.get_distributed_friendly_dask_array(cfm, "kaitum", - chunks=shape, dtype=dtype) - - # As documented in GH issue 2815, using dask distributed with the file - # handle cacher might fail in non-trivial ways, such as giving incorrect - # results. Testing map_blocks is one way to reproduce the problem - # reliably, even though the problem also manifests itself (in different - # ways) without map_blocks. - - def doubler(x): - return x * 2 - - # FIXME: setting up the client is slow, taking more than one second — - # consider putting it in a class-scoped fixture and putting this test - # in a class (so it is still shared between parameterised runs) - with Client(): +class TestDistributed: + """Distributed-related tests. + + Distributed-related tests are grouped so that they can share a class-scoped + fixture setting up the distributed client, as this setup is relatively + slow. + """ + + @pytest.fixture(scope="class") + def dask_dist_client(self): + """Set up and close a dask distributed client.""" + from dask.distributed import Client + cl = Client() + yield cl + cl.close() + + + @pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4)]) + @pytest.mark.parametrize("dtype", ["i4", "f4", "f8"]) + @pytest.mark.parametrize("grp", ["/", "/in/a/group"]) + def test_get_distributed_friendly_dask_array(self, tmp_path, dask_dist_client, shape, dtype, grp): + """Test getting a dask distributed friendly dask array.""" + import netCDF4 + from xarray.backends import CachingFileManager + + fn = tmp_path / "sjaunja.nc" + ds = xr.Dataset( + data_vars={ + "kaitum": (["x", "y", "z"][:len(shape)], + np.arange(np.prod(shape), + dtype=dtype).reshape(shape))}) + ds.to_netcdf(fn, group=grp) + + cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") + arr = hf.get_distributed_friendly_dask_array(cfm, "kaitum", + chunks=shape, dtype=dtype, + group=grp) + + # As documented in GH issue 2815, using dask distributed with the file + # 
handle cacher might fail in non-trivial ways, such as giving incorrect
+        # results. Testing map_blocks is one way to reproduce the problem
+        # reliably, even though the problem also manifests itself (in different
+        # ways) without map_blocks.
+
+        def doubler(x):
+            return x * 2
+
         dask_doubler = arr.map_blocks(doubler)
         res = dask_doubler.compute()
-        assert shape == dask_doubler.shape  # we will need dtype before compute
-        assert shape == res.shape
-        assert dtype == dask_doubler.dtype
-        assert dtype == res.dtype
-        np.testing.assert_array_equal(res, np.arange(np.prod(shape)).reshape(shape)*2)
+        # test before and after computation, as to confirm we have the correct
+        # shape and dtype and that computing doesn't change them
+        assert shape == dask_doubler.shape
+        assert shape == res.shape
+        assert dtype == dask_doubler.dtype
+        assert dtype == res.dtype
+        np.testing.assert_array_equal(res, np.arange(np.prod(shape)).reshape(shape)*2)

From dad3b1418dc971962596fa2ce6ff98138ccfc87a Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Thu, 20 Jun 2024 09:50:57 +0200
Subject: [PATCH 49/74] Add partial backward-compatibility for file handle

Add partial backward compatibility for accessing the file handle
attribute when using caching with a NetCDF4FileHandler base class.

Backward compatibility is not 100%.  Deleting the FileHandler closes
the manager and therefore the ``file_handle`` property, however, when
accessing the ``file_handle`` property after deleting the
``FileHandler``, it is reopened.  Therefore, calling ``__del__()`` manually
and then accessing ``fh.file_handle`` will now return an open file (was
a closed file).  This should not happen in any sane use scenario.
--- satpy/readers/netcdf_utils.py | 11 +++++++++++ satpy/tests/reader_tests/test_netcdf_utils.py | 2 -- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index a6ca0bccef..e4ddf5ccf7 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -18,6 +18,7 @@ """Helpers for reading netcdf-based files.""" import logging +import warnings import netCDF4 import numpy as np @@ -127,6 +128,16 @@ def __init__(self, filename, filename_info, filetype_info, def _get_file_handle(self): return netCDF4.Dataset(self.filename, "r") + @property + def file_handle(self): + """Backward-compatible way for file handle caching.""" + warnings.warn( + "attribute .file_handle is deprecated, use .manager instead", + DeprecationWarning) + if self.manager is None: + return None + return self.manager.acquire() + @staticmethod def _set_file_handle_auto_maskandscale(file_handle, auto_maskandscale): if hasattr(file_handle, "set_auto_maskandscale"): diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index c381d37d4a..6ff2b81b7f 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -226,8 +226,6 @@ def test_caching(self): np.testing.assert_array_equal( h["ds2_f"], np.arange(10. * 100).reshape((10, 100))) - h.__del__() - assert not h.file_handle.isopen() def test_filenotfound(self): """Test that error is raised when file not found.""" From fc58ca41c2b8a991151ec94f7fa87aebe7789b0b Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 20 Jun 2024 11:17:11 +0200 Subject: [PATCH 50/74] Respect auto_maskandscale with new caching With the new dask-distributed-friendly caching, make sure we are respecting auto_maskandscale and are not applying scale factors twice. 
--- satpy/readers/netcdf_utils.py | 8 ++++++-- satpy/readers/utils.py | 9 ++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index e4ddf5ccf7..d66d89b6a5 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -101,6 +101,7 @@ def __init__(self, filename, filename_info, filetype_info, self.file_content = {} self.cached_file_content = {} self._use_h5netcdf = False + self._auto_maskandscale = auto_maskandscale try: file_handle = self._get_file_handle() except IOError: @@ -335,6 +336,7 @@ def _get_var_from_xr(self, group, key): def _get_var_from_manager(self, group, key): # Not getting coordinates as this is more work, therefore more # overhead, and those are not used downstream. + with self.manager.acquire_context() as ds: if group is not None: v = ds[group][key] @@ -343,11 +345,13 @@ def _get_var_from_manager(self, group, key): if group is None: dv = get_distributed_friendly_dask_array( self.manager, key, - chunks=v.shape, dtype=v.dtype) + chunks=v.shape, dtype=v.dtype, + auto_maskandscale=self._auto_maskandscale) else: dv = get_distributed_friendly_dask_array( self.manager, key, group=group, - chunks=v.shape, dtype=v.dtype) + chunks=v.shape, dtype=v.dtype, + auto_maskandscale=self._auto_maskandscale) attrs = self._get_object_attrs(v) x = xr.DataArray( dv, diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index 10de2b364c..e59bf8d7ac 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -478,7 +478,7 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): def get_distributed_friendly_dask_array(manager, varname, chunks, dtype, - group="/"): + group="/", auto_maskandscale=None): """Construct a dask array from a variable for dask distributed. 
When we construct a dask array using da.array and use that to create an @@ -508,9 +508,16 @@ def get_distributed_friendly_dask_array(manager, varname, chunks, dtype, What dtype to use. group (str): What group to read the variable from. + auto_maskandscale (bool, optional): + Apply automatic masking and scaling. This will only + work if CachingFileManager.acquire returns a handler with a + method set_auto_maskandscale, such as is the case for + NetCDF4.Dataset. """ def get_chunk(): with manager.acquire_context() as nc: + if auto_maskandscale is not None: + nc.set_auto_maskandscale(auto_maskandscale) return nc["/".join([group, varname])][:] return da.map_blocks( From 09c821ad19fff8adc72a1b2f34aa113737d9472f Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 20 Jun 2024 12:23:13 +0200 Subject: [PATCH 51/74] Remove needless except block Remove a dead code except block that should never be reached. --- satpy/readers/netcdf_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index d66d89b6a5..4645d9b11f 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -211,10 +211,7 @@ def _get_required_variable_names(listed_variables, variable_name_replacements): def __del__(self): """Delete the file handler.""" if self.manager is not None: - try: - self.manager.close() - except RuntimeError: # presumably closed already - pass + self.manager.close() def _collect_global_attrs(self, obj): """Collect all the global attributes for the provided file object.""" From 4f9c5edfdd568597d06fd628ba3dc56b90f2c4e8 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 20 Jun 2024 14:19:34 +0200 Subject: [PATCH 52/74] Test refactoring Migrate TestNetCDF4FileHandler from unittest.TestCase to a regular class. Use a pytest fixture for the temporary NetCDF file. 
--- satpy/tests/reader_tests/test_netcdf_utils.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 6ff2b81b7f..99a5e7cdb5 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -18,7 +18,6 @@ """Module for testing the satpy.readers.netcdf_utils module.""" import os -import unittest import numpy as np import pytest @@ -71,13 +70,15 @@ def get_test_content(self, filename, filename_info, filetype_info): raise NotImplementedError("Fake File Handler subclass must implement 'get_test_content'") -class TestNetCDF4FileHandler(unittest.TestCase): +class TestNetCDF4FileHandler: """Test NetCDF4 File Handler Utility class.""" - def setUp(self): + @pytest.fixture() + def dummy_nc_file(self, tmp_path): """Create a test NetCDF4 file.""" from netCDF4 import Dataset - with Dataset("test.nc", "w") as nc: + fn = tmp_path / "test.nc" + with Dataset(fn, "w") as nc: # Create dimensions nc.createDimension("rows", 10) nc.createDimension("cols", 100) @@ -116,17 +117,14 @@ def setUp(self): d.test_attr_str = "test_string" d.test_attr_int = 0 d.test_attr_float = 1.2 + return fn - def tearDown(self): - """Remove the previously created test file.""" - os.remove("test.nc") - - def test_all_basic(self): + def test_all_basic(self, dummy_nc_file): """Test everything about the NetCDF4 class.""" import xarray as xr from satpy.readers.netcdf_utils import NetCDF4FileHandler - file_handler = NetCDF4FileHandler("test.nc", {}, {}) + file_handler = NetCDF4FileHandler(dummy_nc_file, {}, {}) assert file_handler["/dimension/rows"] == 10 assert file_handler["/dimension/cols"] == 100 @@ -165,7 +163,7 @@ def test_all_basic(self): assert file_handler.file_handle is None assert file_handler["ds2_sc"] == 42 - def test_listed_variables(self): + def test_listed_variables(self, dummy_nc_file): """Test that only listed 
variables/attributes area collected.""" from satpy.readers.netcdf_utils import NetCDF4FileHandler @@ -175,12 +173,12 @@ def test_listed_variables(self): "attr/test_attr_str", ] } - file_handler = NetCDF4FileHandler("test.nc", {}, filetype_info) + file_handler = NetCDF4FileHandler(dummy_nc_file, {}, filetype_info) assert len(file_handler.file_content) == 2 assert "test_group/attr/test_attr_str" in file_handler.file_content assert "attr/test_attr_str" in file_handler.file_content - def test_listed_variables_with_composing(self): + def test_listed_variables_with_composing(self, dummy_nc_file): """Test that composing for listed variables is performed.""" from satpy.readers.netcdf_utils import NetCDF4FileHandler @@ -199,7 +197,7 @@ def test_listed_variables_with_composing(self): ], } } - file_handler = NetCDF4FileHandler("test.nc", {}, filetype_info) + file_handler = NetCDF4FileHandler(dummy_nc_file, {}, filetype_info) assert len(file_handler.file_content) == 3 assert "test_group/ds1_f/attr/test_attr_str" in file_handler.file_content assert "test_group/ds1_i/attr/test_attr_str" in file_handler.file_content @@ -208,10 +206,10 @@ def test_listed_variables_with_composing(self): assert not any("another_parameter" in var for var in file_handler.file_content) assert "test_group/attr/test_attr_str" in file_handler.file_content - def test_caching(self): + def test_caching(self, dummy_nc_file): """Test that caching works as intended.""" from satpy.readers.netcdf_utils import NetCDF4FileHandler - h = NetCDF4FileHandler("test.nc", {}, {}, cache_var_size=1000, + h = NetCDF4FileHandler(dummy_nc_file, {}, {}, cache_var_size=1000, cache_handle=True) assert h.file_handle is not None assert h.file_handle.isopen() @@ -234,21 +232,21 @@ def test_filenotfound(self): with pytest.raises(IOError, match=".*No such file or directory.*"): NetCDF4FileHandler("/thisfiledoesnotexist.nc", {}, {}) - def test_get_and_cache_npxr_is_xr(self): + def test_get_and_cache_npxr_is_xr(self, dummy_nc_file): 
"""Test that get_and_cache_npxr() returns xr.DataArray.""" import xarray as xr from satpy.readers.netcdf_utils import NetCDF4FileHandler - file_handler = NetCDF4FileHandler("test.nc", {}, {}, cache_handle=True) + file_handler = NetCDF4FileHandler(dummy_nc_file, {}, {}, cache_handle=True) data = file_handler.get_and_cache_npxr("test_group/ds1_f") assert isinstance(data, xr.DataArray) - def test_get_and_cache_npxr_data_is_cached(self): + def test_get_and_cache_npxr_data_is_cached(self, dummy_nc_file): """Test that the data are cached when get_and_cache_npxr() is called.""" from satpy.readers.netcdf_utils import NetCDF4FileHandler - file_handler = NetCDF4FileHandler("test.nc", {}, {}, cache_handle=True) + file_handler = NetCDF4FileHandler(dummy_nc_file, {}, {}, cache_handle=True) data = file_handler.get_and_cache_npxr("test_group/ds1_f") # Delete the dataset from the file content dict, it should be available from the cache @@ -262,7 +260,6 @@ class TestNetCDF4FsspecFileHandler: def test_default_to_netcdf4_lib(self): """Test that the NetCDF4 backend is used by default.""" - import os import tempfile import h5py From ec76fa6ed5c654b470e5207c98e69141cf7d2660 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 20 Jun 2024 14:50:56 +0200 Subject: [PATCH 53/74] Broaden test match string for test_filenotfound Broaden the string that is matched against in TestNetCDF4FileHandler.test_filenotfound. On Linux and MacOS the expected failure gives "No such file or directory". On Windows it gives "Invalid file format". 
--- satpy/tests/reader_tests/test_netcdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/satpy/tests/reader_tests/test_netcdf_utils.py b/satpy/tests/reader_tests/test_netcdf_utils.py index 99a5e7cdb5..3c2bff4229 100644 --- a/satpy/tests/reader_tests/test_netcdf_utils.py +++ b/satpy/tests/reader_tests/test_netcdf_utils.py @@ -229,7 +229,7 @@ def test_filenotfound(self): """Test that error is raised when file not found.""" from satpy.readers.netcdf_utils import NetCDF4FileHandler - with pytest.raises(IOError, match=".*No such file or directory.*"): + with pytest.raises(IOError, match=".* file .*"): NetCDF4FileHandler("/thisfiledoesnotexist.nc", {}, {}) def test_get_and_cache_npxr_is_xr(self, dummy_nc_file): From 11e0d0f4117ffefd075d132d5f5559e606cf84bb Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 20 Jun 2024 16:59:43 +0200 Subject: [PATCH 54/74] Allow to use distributed or not Add a configuration option so the user can choose whether dask distributed is assumed for eager processing or not. Also document the limitations in using dask distributed. --- doc/source/config.rst | 10 ++++++++++ doc/source/reading.rst | 15 +++++++++++---- satpy/_config.py | 1 + satpy/readers/netcdf_utils.py | 20 ++++++++++++-------- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/doc/source/config.rst b/doc/source/config.rst index 82b25512b8..c36b8856e1 100644 --- a/doc/source/config.rst +++ b/doc/source/config.rst @@ -298,6 +298,16 @@ available. When preloading, how many times to check if a file has become available before giving up. +* **Environment variable**: ``SATPY_READERS__PRELOAD_DASK_DISTRIBUTED`` +* **YAML-Config Key**: ``readers.preload_dask_distributed``. +* **Default**: False + +When preloading, assume we are working in a dask distributed environment. +When active, Satpy workers will secede and rejoin while waiting for files. 
+This avoids the problem that tasks waiting for later files are blocking +workers, while tasks working on earlier files are needlessly waiting in +the queue. However, Satpy has limited compatibility with dask distributed. + Temporary Directory ^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/reading.rst b/doc/source/reading.rst index 6639fa32f9..07e7395157 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -333,20 +333,27 @@ the computation of the dask graphs until data become available. For additional configuration parameters, see :ref:`Settings`. -Some limitations that may be resolved in the future: +Known limitations as of Satpy 0.50: - Mixing different file types for the same reader is not yet supported. For FCI, that means it is not yet possible to mix FDHSI and HRFI data. - Only the nominal case has been tested. Missing segments are not yet supported. - Dask may not order the processing of the chunks optimally. That means some dask workers may be waiting for chunks 33–40 as chunks 1–32 are coming in - and are not being processed. A suboptimal workaround is to use 40 workers. + and are not being processed. Possible workarounds: + - Use 40 workers. + - Use the dask distributed scheduler using + ``from dask.distributed import Client; Client()``. This has only + limited support in Satpy. It is possible to read FCI L1C data, resample it + using the gradient search resampler, and write the resulting data using the + ``simple_image`` writer. The nearest neighbour resampler or the GeoTIFF + writer do not currently work (see https://github.com/pytroll/satpy/issues/1762) + If you use this scheduler, set the configuration variable + ``readers.preload_dask_distributed`` to True. - Currently, Satpy merely checks the existence of a file and not whether it has been completely written. This may lead to incomplete files being read, which might lead to failures. -Note that this uses the ``h5netcdf`` backend for opening NetCDF files. 
- For more technical background reading including hints on how this could be extended to other readers, see :class:`~satpy.readers.netcdf_utils.PreloadableSegments` and diff --git a/satpy/_config.py b/satpy/_config.py index 656b15cbfb..af6843c8c5 100644 --- a/satpy/_config.py +++ b/satpy/_config.py @@ -56,6 +56,7 @@ "preload_segments": False, "preload_step": 2, "preload_tries": 300, + "preload_dask_distributed": False, }, } diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 951caf9a14..8cbbc80b66 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -517,6 +517,7 @@ def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): self.preload = preload self.preload_tries = satpy.config.get("readers.preload_tries") self.preload_step = satpy.config.get("readers.preload_step") + self.use_distributed = satpy.config.get("readers.preload_dask_distributed") if preload: if not isinstance(ref_fh, BaseFileHandler): raise TypeError( @@ -589,7 +590,8 @@ def _get_inv_name_map(self): def _collect_variable_delayed(self, subst_name): md = self.ref_fh[subst_name] # some metadata from reference segment fn_matched = _wait_for_file(self.filename, self.preload_tries, - self.preload_step) + self.preload_step, + self.use_distributed) dade = _get_delayed_value_from_nc(fn_matched, subst_name) array = da.from_delayed( dade, @@ -628,10 +630,10 @@ def store_cache(self, filename=None): @dask.delayed -def _wait_for_file(pat, max_tries=300, wait=2): +def _wait_for_file(pat, max_tries=300, wait=2, use_distributed=False): """Wait for file to appear.""" for i in range(max_tries): - if (match := _check_for_matching_file(i, pat, wait)): + if (match := _check_for_matching_file(i, pat, wait, use_distributed)): if i > 0: # only log if not found immediately LOG.debug(f"Found {match!s} matching {pat!s}!") return match @@ -639,25 +641,27 @@ def _wait_for_file(pat, max_tries=300, wait=2): raise TimeoutError(f"File matching {pat!s} 
failed to materialise") -def _check_for_matching_file(i, pat, wait): +def _check_for_matching_file(i, pat, wait, use_distributed=False): fns = glob.glob(pat) if len(fns) == 0: - _log_and_wait(i, pat, wait) + _log_and_wait(i, pat, wait, use_distributed) return if len(fns) > 1: raise ValueError(f"Expected one matching file, found {len(fns):d}") return fns[0] -def _log_and_wait(i, pat, wait): +def _log_and_wait(i, pat, wait, use_distributed=False): """Maybe log that we're waiting for pat, then wait.""" if i == 0: # only log if not yet present LOG.debug(f"Waiting for {pat!s} to appear.") if i % 60 == 30: LOG.debug(f"Still waiting for {pat!s}") - dask.distributed.secede() + if use_distributed: + dask.distributed.secede() time.sleep(wait) - dask.distributed.rejoin() + if use_distributed: + dask.distributed.rejoin() @dask.delayed From 06d8811eaa82c9286909cb55d1420d081f21b797 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 24 Jul 2024 11:37:25 +0200 Subject: [PATCH 55/74] fix docstring example spelling Fix the spelling in the docstring example using netCDF4. 
Co-authored-by: David Hoese --- satpy/readers/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index e59bf8d7ac..487e10fa3b 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -491,9 +491,9 @@ def get_distributed_friendly_dask_array(manager, varname, chunks, dtype, Example:: - >>> import NetCDF4 + >>> import netCDF4 >>> from xarray.backends import CachingFileManager - >>> cfm = CachingFileManager(NetCDF4.Dataset, fn, mode="r") + >>> cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") >>> arr = get_distributed_friendly_dask_array(cfm, "my_var") Args: From aaf91b9ac53abb6a5c4c3f132b63ad57b6583bdb Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 24 Jul 2024 16:23:53 +0200 Subject: [PATCH 56/74] Prevent unexpected type promotion in unit test Add a workaround to prevent an unexpected type promotion in the unit test for dask distributed friendly dask arrays. --- satpy/tests/reader_tests/test_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index 2de91e6d4b..08501f9f14 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -559,9 +559,10 @@ def test_get_distributed_friendly_dask_array(self, tmp_path, dask_dist_client, s # ways) without map_blocks. 
def doubler(x): - return x * 2 + # with a workaround for https://github.com/numpy/numpy/issues/27029 + return x * x.dtype.type(2) - dask_doubler = arr.map_blocks(doubler) + dask_doubler = arr.map_blocks(doubler, dtype=arr.dtype) res = dask_doubler.compute() # test before and after computation, as to confirm we have the correct # shape and dtype and that computing doesn't change them From a2ad42f972c42c1a3cde795d4c142ad2629ee1d4 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 24 Jul 2024 16:58:11 +0200 Subject: [PATCH 57/74] Use block info getting a dd-friendly da When getting a dask-distributed friendly dask array from a NetCDF file using the CachingFileManager, use the information provided in block_info on the array location in case we are not reading the entire variable. --- satpy/readers/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index 487e10fa3b..04ced56ee9 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -514,11 +514,13 @@ def get_distributed_friendly_dask_array(manager, varname, chunks, dtype, method set_auto_maskandscale, such as is the case for NetCDF4.Dataset. """ - def get_chunk(): + def get_chunk(block_info=None): + arrloc = block_info[None]["array-location"] with manager.acquire_context() as nc: if auto_maskandscale is not None: nc.set_auto_maskandscale(auto_maskandscale) - return nc["/".join([group, varname])][:] + var = nc["/".join([group, varname])] + return var[tuple(slice(*x) for x in arrloc)] return da.map_blocks( get_chunk, From 9126bbe69e1d996f59e11a27e271b9e1c17e7310 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 25 Jul 2024 09:59:57 +0200 Subject: [PATCH 58/74] Rename to serialisable and remove group argument Rename get_distributed_friendly_dask_array to get_serialisable_dask_array and remove the group argument, moving the responsibility for handling groups to the caller. 
--- satpy/readers/netcdf_utils.py | 8 ++++---- satpy/readers/utils.py | 12 +++++------- satpy/tests/reader_tests/test_utils.py | 7 +++---- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 4645d9b11f..7dbf6f45d6 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -26,7 +26,7 @@ from satpy.readers import open_file_or_filename from satpy.readers.file_handlers import BaseFileHandler -from satpy.readers.utils import get_distributed_friendly_dask_array, np2str +from satpy.readers.utils import get_serialisable_dask_array, np2str from satpy.utils import get_legacy_chunk_size LOG = logging.getLogger(__name__) @@ -340,13 +340,13 @@ def _get_var_from_manager(self, group, key): else: v = ds[key] if group is None: - dv = get_distributed_friendly_dask_array( + dv = get_serialisable_dask_array( self.manager, key, chunks=v.shape, dtype=v.dtype, auto_maskandscale=self._auto_maskandscale) else: - dv = get_distributed_friendly_dask_array( - self.manager, key, group=group, + dv = get_serialisable_dask_array( + self.manager, "/".join([group, key]), chunks=v.shape, dtype=v.dtype, auto_maskandscale=self._auto_maskandscale) attrs = self._get_object_attrs(v) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index 04ced56ee9..4ce1ac795c 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -477,9 +477,9 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): return reflectance -def get_distributed_friendly_dask_array(manager, varname, chunks, dtype, - group="/", auto_maskandscale=None): - """Construct a dask array from a variable for dask distributed. +def get_serialisable_dask_array(manager, varname, chunks, dtype, + auto_maskandscale=None): + """Construct a serialisable dask array from a variable. 
When we construct a dask array using da.array and use that to create an xarray dataarray, the result is not serialisable and dask graphs using @@ -501,13 +501,11 @@ def get_distributed_friendly_dask_array(manager, varname, chunks, dtype, Instance of xarray.backends.CachingFileManager encapsulating the dataset to be read. varname (str): - Name of the variable. + Name of the variable (possibly including a group path). chunks (tuple): Chunks to use when creating the dask array. dtype (dtype): What dtype to use. - group (str): - What group to read the variable from. auto_maskandscale (bool, optional): Apply automatic masking and scaling. This will only work if CachingFileManager.acquire returns a handler with a @@ -519,7 +517,7 @@ def get_chunk(block_info=None): with manager.acquire_context() as nc: if auto_maskandscale is not None: nc.set_auto_maskandscale(auto_maskandscale) - var = nc["/".join([group, varname])] + var = nc[varname] #"/".join([group, varname])] return var[tuple(slice(*x) for x in arrloc)] return da.map_blocks( diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index 08501f9f14..13663ddb3a 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -534,7 +534,7 @@ def dask_dist_client(self): @pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4)]) @pytest.mark.parametrize("dtype", ["i4", "f4", "f8"]) @pytest.mark.parametrize("grp", ["/", "/in/a/group"]) - def test_get_distributed_friendly_dask_array(self, tmp_path, dask_dist_client, shape, dtype, grp): + def test_get_serialisable_dask_array(self, tmp_path, dask_dist_client, shape, dtype, grp): """Test getting a dask distributed friendly dask array.""" import netCDF4 from xarray.backends import CachingFileManager @@ -548,9 +548,8 @@ def test_get_distributed_friendly_dask_array(self, tmp_path, dask_dist_client, s ds.to_netcdf(fn, group=grp) cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") - arr = 
hf.get_distributed_friendly_dask_array(cfm, "kaitum", - chunks=shape, dtype=dtype, - group=grp) + arr = hf.get_serialisable_dask_array(cfm, "/".join([grp, "kaitum"]), + chunks=shape, dtype=dtype) # As documented in GH issue 2815, using dask distributed with the file # handle cacher might fail in non-trivial ways, such as giving incorrect From 5e576f91853ec24702f0d8872cbc7608d1262906 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 25 Jul 2024 11:00:14 +0200 Subject: [PATCH 59/74] Use wrapper class for auto_maskandscale --- satpy/readers/netcdf_utils.py | 26 +++++++++++++++++++++++++- satpy/readers/utils.py | 14 +++----------- satpy/tests/reader_tests/test_utils.py | 2 +- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 7dbf6f45d6..722f69d137 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -17,6 +17,7 @@ # satpy. If not, see . """Helpers for reading netcdf-based files.""" +import functools import logging import warnings @@ -122,7 +123,9 @@ def __init__(self, filename, filename_info, filetype_info, if cache_handle: self.manager = xr.backends.CachingFileManager( - netCDF4.Dataset, self.filename, mode="r") + functools.partial(_NCDatasetWrapper, + auto_maskandscale=auto_maskandscale), + self.filename, mode="r") else: file_handle.close() @@ -465,3 +468,24 @@ def _get_attr(self, obj, key): if self._use_h5netcdf: return obj.attrs[key] return super()._get_attr(obj, key) + +class _NCDatasetWrapper(netCDF4.Dataset): + """Wrap netcdf4.Dataset setting auto_maskandscale globally. + + Helper class that wraps netcdf4.Dataset while setting extra parameters. + By encapsulating this in a helper class, we can + pass it to CachingFileManager directly. Currently sets + auto_maskandscale globally (for all variables). 
+ """ + + def __init__(self, *args, auto_maskandscale=False, **kwargs): + """Initialise object.""" + super().__init__(*args, **kwargs) + self._set_extra_settings(auto_maskandscale=auto_maskandscale) + + def _set_extra_settings(self, auto_maskandscale): + """Set our own custom settings. + + Currently only applies set_auto_maskandscale. + """ + self.set_auto_maskandscale(auto_maskandscale) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index 4ce1ac795c..b7f630b117 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -477,8 +477,7 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): return reflectance -def get_serialisable_dask_array(manager, varname, chunks, dtype, - auto_maskandscale=None): +def get_serialisable_dask_array(manager, varname, chunks, dtype): """Construct a serialisable dask array from a variable. When we construct a dask array using da.array and use that to create an @@ -494,7 +493,7 @@ def get_serialisable_dask_array(manager, varname, chunks, dtype, >>> import netCDF4 >>> from xarray.backends import CachingFileManager >>> cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") - >>> arr = get_distributed_friendly_dask_array(cfm, "my_var") + >>> arr = get_serialisable_dask_array(cfm, "my_var") Args: manager (xarray.backends.CachingFileManager): @@ -506,18 +505,11 @@ def get_serialisable_dask_array(manager, varname, chunks, dtype, Chunks to use when creating the dask array. dtype (dtype): What dtype to use. - auto_maskandscale (bool, optional): - Apply automatic masking and scaling. This will only - work if CachingFileManager.acquire returns a handler with a - method set_auto_maskandscale, such as is the case for - NetCDF4.Dataset. 
""" def get_chunk(block_info=None): arrloc = block_info[None]["array-location"] with manager.acquire_context() as nc: - if auto_maskandscale is not None: - nc.set_auto_maskandscale(auto_maskandscale) - var = nc[varname] #"/".join([group, varname])] + var = nc[varname] return var[tuple(slice(*x) for x in arrloc)] return da.map_blocks( diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index 13663ddb3a..2445e51f75 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -535,7 +535,7 @@ def dask_dist_client(self): @pytest.mark.parametrize("dtype", ["i4", "f4", "f8"]) @pytest.mark.parametrize("grp", ["/", "/in/a/group"]) def test_get_serialisable_dask_array(self, tmp_path, dask_dist_client, shape, dtype, grp): - """Test getting a dask distributed friendly dask array.""" + """Test getting a dask distributed friendly serialisable dask array.""" import netCDF4 from xarray.backends import CachingFileManager From 63e75073c98a751e7a935a9faa80f1ed40fc4605 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 25 Jul 2024 11:05:31 +0200 Subject: [PATCH 60/74] GB -> US spelling Pytroll uses US spelling. Rename serializable to serialisable. Remove removed keyword argument from call. 
--- satpy/readers/netcdf_utils.py | 12 +++++------- satpy/readers/utils.py | 8 ++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 722f69d137..e9ae2d24db 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -27,7 +27,7 @@ from satpy.readers import open_file_or_filename from satpy.readers.file_handlers import BaseFileHandler -from satpy.readers.utils import get_serialisable_dask_array, np2str +from satpy.readers.utils import get_serializable_dask_array, np2str from satpy.utils import get_legacy_chunk_size LOG = logging.getLogger(__name__) @@ -343,15 +343,13 @@ def _get_var_from_manager(self, group, key): else: v = ds[key] if group is None: - dv = get_serialisable_dask_array( + dv = get_serializable_dask_array( self.manager, key, - chunks=v.shape, dtype=v.dtype, - auto_maskandscale=self._auto_maskandscale) + chunks=v.shape, dtype=v.dtype) else: - dv = get_serialisable_dask_array( + dv = get_serializable_dask_array( self.manager, "/".join([group, key]), - chunks=v.shape, dtype=v.dtype, - auto_maskandscale=self._auto_maskandscale) + chunks=v.shape, dtype=v.dtype) attrs = self._get_object_attrs(v) x = xr.DataArray( dv, diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index b7f630b117..6e1b3dc0ae 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -477,11 +477,11 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): return reflectance -def get_serialisable_dask_array(manager, varname, chunks, dtype): - """Construct a serialisable dask array from a variable. +def get_serializable_dask_array(manager, varname, chunks, dtype): + """Construct a serializable dask array from a variable. 
When we construct a dask array using da.array and use that to create an - xarray dataarray, the result is not serialisable and dask graphs using + xarray dataarray, the result is not serializable and dask graphs using this dataarray cannot be computed when the dask distributed scheduler is in use. To circumvent this problem, xarray provides the CachingFileManager. See GH#2815 for more information. @@ -493,7 +493,7 @@ def get_serialisable_dask_array(manager, varname, chunks, dtype): >>> import netCDF4 >>> from xarray.backends import CachingFileManager >>> cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") - >>> arr = get_serialisable_dask_array(cfm, "my_var") + >>> arr = get_serializable_dask_array(cfm, "my_var") Args: manager (xarray.backends.CachingFileManager): From ea0459593c4e1f5b1c8d08bd52bc215207959bbe Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 25 Jul 2024 11:11:53 +0200 Subject: [PATCH 61/74] Ensure meta dtype Ensure that the meta we pass to map_blocks also has the right dtype. Not sure if this is necessary when map_blocks already has the right dtype, but it can't hurt. --- satpy/readers/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index 6e1b3dc0ae..0c02db8ae3 100644 --- a/satpy/readers/utils.py +++ b/satpy/readers/utils.py @@ -497,7 +497,7 @@ def get_serializable_dask_array(manager, varname, chunks, dtype): Args: manager (xarray.backends.CachingFileManager): - Instance of xarray.backends.CachingFileManager encapsulating the + Instance of :class:`~xarray.backends.CachingFileManager` encapsulating the dataset to be read. varname (str): Name of the variable (possibly including a group path). 
@@ -516,4 +516,4 @@ def get_chunk(block_info=None): get_chunk, chunks=chunks, dtype=dtype, - meta=np.array([])) + meta=np.array([], dtype=dtype)) From fde3896fc50f0a7420bc7bd28f3e038025e9a953 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 25 Jul 2024 13:15:01 +0200 Subject: [PATCH 62/74] Fix spelling in test --- satpy/tests/reader_tests/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/satpy/tests/reader_tests/test_utils.py b/satpy/tests/reader_tests/test_utils.py index 2485598826..40a872db29 100644 --- a/satpy/tests/reader_tests/test_utils.py +++ b/satpy/tests/reader_tests/test_utils.py @@ -534,7 +534,7 @@ def dask_dist_client(self): @pytest.mark.parametrize("shape", [(2,), (2, 3), (2, 3, 4)]) @pytest.mark.parametrize("dtype", ["i4", "f4", "f8"]) @pytest.mark.parametrize("grp", ["/", "/in/a/group"]) - def test_get_serialisable_dask_array(self, tmp_path, dask_dist_client, shape, dtype, grp): + def test_get_serializable_dask_array(self, tmp_path, dask_dist_client, shape, dtype, grp): """Test getting a dask distributed friendly serialisable dask array.""" import netCDF4 from xarray.backends import CachingFileManager @@ -548,7 +548,7 @@ def test_get_serialisable_dask_array(self, tmp_path, dask_dist_client, shape, dt ds.to_netcdf(fn, group=grp) cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") - arr = hf.get_serialisable_dask_array(cfm, "/".join([grp, "kaitum"]), + arr = hf.get_serializable_dask_array(cfm, "/".join([grp, "kaitum"]), chunks=shape, dtype=dtype) # As documented in GH issue 2815, using dask distributed with the file From 5b137e8e7df8da7ca3223c7ae8cb5730b71d4123 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 26 Jul 2024 11:05:00 +0200 Subject: [PATCH 63/74] Clarify docstring --- satpy/readers/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/satpy/readers/utils.py b/satpy/readers/utils.py index 80794dad80..4766a1c897 100644 --- a/satpy/readers/utils.py +++ 
b/satpy/readers/utils.py @@ -480,11 +480,11 @@ def remove_earthsun_distance_correction(reflectance, utc_date=None): def get_serializable_dask_array(manager, varname, chunks, dtype): """Construct a serializable dask array from a variable. - When we construct a dask array using da.array and use that to create an - xarray dataarray, the result is not serializable and dask graphs using - this dataarray cannot be computed when the dask distributed scheduler - is in use. To circumvent this problem, xarray provides the - CachingFileManager. See GH#2815 for more information. + When we construct a dask array using da.array from a file, and use + that to create an xarray dataarray, the result is not serializable + and dask graphs using this dataarray cannot be computed when the dask + distributed scheduler is in use. To circumvent this problem, xarray + provides the CachingFileManager. See GH#2815 for more information. Should have at least one dimension. @@ -492,8 +492,8 @@ def get_serializable_dask_array(manager, varname, chunks, dtype): >>> import netCDF4 >>> from xarray.backends import CachingFileManager - >>> cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r") - >>> arr = get_serializable_dask_array(cfm, "my_var") + >>> cfm = CachingFileManager(netCDF4.Dataset, filename, mode="r") + >>> arr = get_serializable_dask_array(cfm, "my_var", 1024, "f4") Args: manager (xarray.backends.CachingFileManager): From c2b153332036c9e26452f7230ede67d47712e415 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 26 Jul 2024 12:42:32 +0200 Subject: [PATCH 64/74] Use cache already in scene creation When caching, make sure we use the CachingFileManager already upon scene creation and not only by the time we are loading. 
--- satpy/readers/netcdf_utils.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index e9ae2d24db..1e912be592 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -103,15 +103,22 @@ def __init__(self, filename, filename_info, filetype_info, self.cached_file_content = {} self._use_h5netcdf = False self._auto_maskandscale = auto_maskandscale - try: - file_handle = self._get_file_handle() - except IOError: - LOG.exception( - "Failed reading file %s. Possibly corrupted file", self.filename) - raise + if cache_handle: + self.manager = xr.backends.CachingFileManager( + functools.partial(_NCDatasetWrapper, + auto_maskandscale=auto_maskandscale), + self.filename, mode="r") + file_handle = self.manager.acquire() + else: + try: + file_handle = self._get_file_handle() + except IOError: + LOG.exception( + "Failed reading file %s. Possibly corrupted file", self.filename) + raise - self._set_file_handle_auto_maskandscale(file_handle, auto_maskandscale) - self._set_xarray_kwargs(xarray_kwargs, auto_maskandscale) + self._set_file_handle_auto_maskandscale(file_handle, auto_maskandscale) + self._set_xarray_kwargs(xarray_kwargs, auto_maskandscale) listed_variables = filetype_info.get("required_netcdf_variables") if listed_variables: @@ -121,12 +128,7 @@ def __init__(self, filename, filename_info, filetype_info, self.collect_dimensions("", file_handle) self.collect_cache_vars(cache_var_size) - if cache_handle: - self.manager = xr.backends.CachingFileManager( - functools.partial(_NCDatasetWrapper, - auto_maskandscale=auto_maskandscale), - self.filename, mode="r") - else: + if not cache_handle: file_handle.close() def _get_file_handle(self): From 9fce5a76416327ccf53d46f9c87facb144909599 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 26 Jul 2024 18:04:40 +0200 Subject: [PATCH 65/74] Use helper function rather than subclass Don't subclass 
netCDF4.Dataset, rather just return an instance from a helper function. Seems good enough and gets rid of the weird error messages upon exit. --- satpy/readers/netcdf_utils.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 1e912be592..3f31cf5ee9 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -105,7 +105,7 @@ def __init__(self, filename, filename_info, filetype_info, self._auto_maskandscale = auto_maskandscale if cache_handle: self.manager = xr.backends.CachingFileManager( - functools.partial(_NCDatasetWrapper, + functools.partial(_nc_dataset_wrapper, auto_maskandscale=auto_maskandscale), self.filename, mode="r") file_handle = self.manager.acquire() @@ -469,23 +469,14 @@ def _get_attr(self, obj, key): return obj.attrs[key] return super()._get_attr(obj, key) -class _NCDatasetWrapper(netCDF4.Dataset): +def _nc_dataset_wrapper(*args, auto_maskandscale, **kwargs): """Wrap netcdf4.Dataset setting auto_maskandscale globally. - Helper class that wraps netcdf4.Dataset while setting extra parameters. - By encapsulating this in a helper class, we can + Helper function that wraps netcdf4.Dataset while setting extra parameters. + By encapsulating this in a helper function, we can pass it to CachingFileManager directly. Currently sets auto_maskandscale globally (for all variables). """ - - def __init__(self, *args, auto_maskandscale=False, **kwargs): - """Initialise object.""" - super().__init__(*args, **kwargs) - self._set_extra_settings(auto_maskandscale=auto_maskandscale) - - def _set_extra_settings(self, auto_maskandscale): - """Set our own custom settings. - - Currently only applies set_auto_maskandscale. 
- """ - self.set_auto_maskandscale(auto_maskandscale) + nc = netCDF4.Dataset(*args, **kwargs) + nc.set_auto_maskandscale(auto_maskandscale) + return nc From 4993b657be3034ad2996eef2a074e21d144ed6cf Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 26 Jul 2024 18:16:42 +0200 Subject: [PATCH 66/74] restore non-cached group retrieval Some readers read entire groups; this needs xarray kwargs to be set even if caching is used. --- satpy/readers/netcdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 3f31cf5ee9..59bf1829d2 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -118,7 +118,7 @@ def __init__(self, filename, filename_info, filetype_info, raise self._set_file_handle_auto_maskandscale(file_handle, auto_maskandscale) - self._set_xarray_kwargs(xarray_kwargs, auto_maskandscale) + self._set_xarray_kwargs(xarray_kwargs, auto_maskandscale) listed_variables = filetype_info.get("required_netcdf_variables") if listed_variables: From aca09ff686c5689cc9646c8b27341a2ee405baf4 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Tue, 30 Jul 2024 16:20:46 +0200 Subject: [PATCH 67/74] Guard against file handle caching when preloading Guard against file handle caching with the xarray manager when preloading. We can't cache very well when data are not there yet. 
--- satpy/readers/netcdf_utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 42c4a6a05e..a366eb7096 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -109,11 +109,7 @@ def __init__(self, filename, filename_info, filetype_info, self._use_h5netcdf = False self._auto_maskandscale = auto_maskandscale if cache_handle: - self.manager = xr.backends.CachingFileManager( - functools.partial(_nc_dataset_wrapper, - auto_maskandscale=auto_maskandscale), - self.filename, mode="r") - file_handle = self.manager.acquire() + file_handle = self._get_cached_file_handle(auto_maskandscale) else: try: file_handle = self._get_file_handle() @@ -139,6 +135,13 @@ def __init__(self, filename, filename_info, filetype_info, def _get_file_handle(self): return netCDF4.Dataset(self.filename, "r") + def _get_cached_file_handle(self, auto_maskandscale): + self.manager = xr.backends.CachingFileManager( + functools.partial(_nc_dataset_wrapper, + auto_maskandscale=auto_maskandscale), + self.filename, mode="r") + return self.manager.acquire() + @property def file_handle(self): """Backward-compatible way for file handle caching.""" @@ -612,6 +615,11 @@ def _get_file_handle(self): return open(os.devnull, "r") return super()._get_file_handle() + def _get_cached_file_handle(self, auto_maskandscale): + if self.preload: + return open(os.devnull, "r") + return super()._get_cached_file_handle(auto_maskandscale) + def store_cache(self, filename=None): """Store RC-cachable data to cache.""" if self.preload: From ec49a6fec3f76873acf51cd6081bca86f4ad59b7 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Wed, 31 Jul 2024 11:56:32 +0200 Subject: [PATCH 68/74] Preload netCDF4.Variable, not xarray.DataArray, from reference filehandler When segment-shareable data are obtained from the reference filehandler, get this directly from its ``file_content`` attribute and not via the 
filehandlers ``__getitem__`` method. This solves the performance problem using preloading with dask distributed. Details: ``NetCDF4FileHandler.__getitem__`` turns a netCDF4.Variable into an xarray.DataArray encapsulating a dask array. Therefore, a small read, such as the FCI L1C NC file handler needs to perform upon ``Scene.load``, becomes a small compute. Those small computes have a small negative impact when using the normal dask scheduler, but a terrible negative impact when using the dask distributed scheduler, where in one case I tested, ``Scene.load`` increases from 10 seconds to 375 seconds. Time for ``Scene.load`` with preloaded segments with the regular scheduler, before this commit: 10.8 seconds Time for ``Scene.load`` with preloaded segments with the distributed scheduler, before this commit: 376 seconds Time for ``Scene.load`` with preloaded segments with the regular scheduler, after this commit: 6.5 seconds Time for ``Scene.load`` with preloaded segments with the distributed scheduler, after this commit: 6.9 seconds --- satpy/readers/netcdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index a366eb7096..5c36780a72 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -555,7 +555,7 @@ def _preload_listed_variables(self, listed_variables): for subst_name in self._get_required_variable_names( [raw_name], variable_name_replacements): if self._can_get_from_other_segment(raw_name): - self.file_content[subst_name] = self.ref_fh[subst_name] + self.file_content[subst_name] = self.ref_fh.file_content[subst_name] elif not self._can_get_from_other_rc(raw_name): self._collect_variable_delayed(subst_name) From 2b022286f192b5be0d3ff9a270f88030c850d104 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 2 Aug 2024 11:19:26 +0200 Subject: [PATCH 69/74] Reorganise preloading documentation Move preloading documentations to its own section under 
"Advanced topics". Fix internal reference links. --- doc/source/advanced/index.rst | 94 +++++++++++++++++++++++++++++++++++ doc/source/config.rst | 17 +++++-- doc/source/index.rst | 1 + doc/source/reading.rst | 75 ---------------------------- satpy/readers/netcdf_utils.py | 5 +- 5 files changed, 109 insertions(+), 83 deletions(-) create mode 100644 doc/source/advanced/index.rst diff --git a/doc/source/advanced/index.rst b/doc/source/advanced/index.rst new file mode 100644 index 0000000000..a06b2bf2e1 --- /dev/null +++ b/doc/source/advanced/index.rst @@ -0,0 +1,94 @@ +Advanced topics +=============== + +This section of the documentation contains advanced topics that are not relevant +for most users. + +.. _preload: + +Preloading segments for improved timeliness +------------------------------------------- + +Normally, data need to exist before they can be read. This requirement +impacts data processing timeliness. For data arriving in segments, +Satpy can process each segment immediately as it comes in. +This feature currently only works with the :mod:`~satpy.readers.fci_l1c_nc` reader. +This is experimental and likely to be instable and might change. + +Consider a near real time data reception situation where FCI segments are +delivered one by one. Classically, to produce full disc imagery, users +would wait for all needed segments to arrive, before they start processing +any data by passing all segments to the :class:`~satpy.scene.Scene`. +For a more timely imagery production, users can create the Scene, load the +data, resample, and even call :meth:`~satpy.scene.Scene.save_datasets` +before the data are complete (:meth:`~satpy.scene.Scene.`save_datasets` will wait until the data +are available, unless ``compute=False``). +Upon computation, much of the overhead in Satpy +internals has already been completed, and Satpy will process each segment +as it comes in. + +To do so, Satpy caches a selection of data and metadata between segments +and between repeat cycles. 
Caching between segments happens in-memory +and needs no preparation from the user, but where data are cached +between repeat cycles, the user needs to create this cache first from +a repeat cycle that is available completely:: + + >>> from satpy.readers import create_preloadable_cache + >>> create_preloadable_cache("fci_l1c_nc", fci_files) + +This needs to be done only once as long as data or metadata cached +between repeat cycles does not change (for example, the rows at which +each repeat cycle starts and ends). To make use of eager processing, set +the configuration variable ``readers.preload_segments``. When creating +the scene, pass only the path to the first segment:: + + >>> satpy.config.set({"readers.preload_segments": True}) + >>> sc = Scene( + ... filenames=[path_to_first_segment], + ... reader="fci_l1c_nc") + +Satpy will figure out the names of the remaining segments and find them as +they come in. If the data are already available, processing is similar to +the regular case. If the data are not yet available, Satpy will wait during +the computation of the dask graphs until data become available. + +For additional configuration parameters, see the :ref:`configuration documentation `. + +Known limitations as of Satpy 0.51: + +- Mixing different file types for the same reader is not yet supported. + For FCI, that means it is not yet possible to mix FDHSI and HRFI data. +- When segments are missing, processing times out and no image will be produced. + There is currently no way to produce an incomplete image with the missing + segment left out. +- Dask may not order the processing of the chunks optimally. That means some + dask workers may be waiting for chunks 33–40 as chunks 1–32 are coming in + and are not being processed. Possible workarounds: + - Use as many workers are there are chunks (for example, 40). + - Use the dask distributed scheduler using + ``from dask.distributed import Client; Client()``. 
This has only + limited support in Satpy and is highly experimental. It should be possible + to read FCI L1C data, resample it + using the gradient search resampler, and write the resulting data using the + ``simple_image`` writer. The nearest neighbour resampler or the GeoTIFF + writer do not currently work (see https://github.com/pytroll/satpy/issues/1762). + If you use this scheduler, set the configuration variable + ``readers.preload_dask_distributed`` to True. + This is not currently recommended in a production environment. Any feedback + is highly welcome. +- Currently, Satpy merely checks the existence of a file and not whether it + has been completely written. This may lead to incomplete files being read, + which might lead to failures. + +For more technical background reading including hints +on how this could be extended to other readers, see the API documentations for +:class:`~satpy.readers.netcdf_utils.PreloadableSegments` and +:class:`~satpy.readers.yaml_reader.GEOSegmentYAMLReader`. + +.. versionadded:: 0.51 + +.. toctree:: + :hidden: + :maxdepth: 1 + + preloaded_reading diff --git a/doc/source/config.rst b/doc/source/config.rst index 1205eb2fb5..85c13f3ef2 100644 --- a/doc/source/config.rst +++ b/doc/source/config.rst @@ -274,15 +274,21 @@ Clipping of negative radiances is currently implemented for the following reader * ``abi_l1b``, ``ami_l1b`` +.. _preload_settings: + Preloading segments ^^^^^^^^^^^^^^^^^^^ +.. note:: + + Preloading is an advanced topic and experimental. See :ref:`preload` + for details. + * **Environment variable**: ``SATPY_READERS__PRELOAD_SEGMENTS`` * **YAML-Config Key**: ``readers.preload_segments`` * **Default**: False -Preload segments for those readers where it is supported. See -:ref:`Reading data before they're available` for details. +Preload segments for those readers where it is supported. 
* **Environment variable**: ``SATPY_READERS__PRELOAD_STEP`` * **YAML-Config Key**: ``readers.preload_step`` @@ -304,9 +310,10 @@ before giving up. When preloading, assume we are working in a dask distributed environment. When active, Satpy workers will secede and rejoin while waiting for files. -This avoids the problem that tasks waiting for later files are blocking -workers, while tasks working on earlier files are needlessly waiting in -the queue. However, Satpy has limited compatibility with dask distributed. +This might partially avoid the problem that tasks waiting for later files +are blocking workers, while tasks working on earlier files are needlessly +waiting in the queue. However, Satpy has limited compatibility with +dask distributed. Temporary Directory ^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/index.rst b/doc/source/index.rst index 66a069fcda..919ff56e10 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -70,6 +70,7 @@ Documentation enhancements writing multiscene + advanced/index dev_guide/index .. toctree:: diff --git a/doc/source/reading.rst b/doc/source/reading.rst index 87c30a6e33..0dbea74046 100644 --- a/doc/source/reading.rst +++ b/doc/source/reading.rst @@ -404,81 +404,6 @@ are automatically added. Other possible coordinates you may see: * ``acq_time``: Instrument data acquisition time per scan or row of data. -Reading data before they're available -===================================== - -Normally, data need to exist before they can be read. This requirement -impacts data processing timeliness. For data arriving in segments, -Satpy can process each segment immediately as it comes in. -This feature currently only works with the :mod:`~satpy.readers.fci_l1c_nc` reader. -This is experimental and likely to be instable and might change. - -Consider a near real time data reception situation where FCI segments are -delivered one by one. 
Classically, to produce full disc imagery, users -would wait for all needed segments to arrive, before they start processing -any data by passing all segments to the :class:`~satpy.scene.Scene`. -For a more timely imagery production, users can create the Scene, load the -data, resample, and even call :meth:`~satpy.scene.Scene.save_datasets` -(as long as ``compute=False``) before the data are complete. -When they then trigger the computation, much of the overhead in Satpy -internals has already been completed, and Satpy will process each segment -as it comes in. - -To do so, Satpy caches a selection of data and metadata between segments -and between repeat cycles. Caching between segments happens in-memory -and needs no preparation from the user, but where data are cached -between repeat cycles, the user needs to create this cache first from -a repeat cycle that is available completely:: - - >>> from satpy.readers import create_preloadable_cache - >>> create_preloadable_cache("fci_l1c_nc", fci_files) - -This needs to be done only once as long as data or metadata cached -between repeat cycles does not change (for example, the rows at which -each repeat cycle starts and ends). To make use of eager processing, set -the configuration variable ``readers.preload_segments``. When creating -the scene, pass only the path to the first segment:: - - >>> satpy.config.set({"readers.preload_segments": True}) - >>> sc = Scene( - ... filenames=[path_to_first_segment], - ... reader="fci_l1c_nc") - -Satpy will figure out the names of the remaining segments and find them as -they come in. If the data are already available, processing is similar to -the regular case. If the data are not yet available, Satpy will wait during -the computation of the dask graphs until data become available. - -For additional configuration parameters, see :ref:`Settings`. - -Known limitations as of Satpy 0.50: - -- Mixing different file types for the same reader is not yet supported. 
- For FCI, that means it is not yet possible to mix FDHSI and HRFI data. -- Only the nominal case has been tested. Missing segments are not yet supported. -- Dask may not order the processing of the chunks optimally. That means some - dask workers may be waiting for chunks 33–40 as chunks 1–32 are coming in - and are not being processed. Possible workarounds: - - Use 40 workers. - - Use the dask distributed scheduler using - ``from dask.distributed import Client; Client()``. This has only - limited support in Satpy. It is possible to read FCI L1C data, resample it - using the gradient search resampler, and write the resulting data using the - ``simple_image`` writer. The nearest neighbour resampler or the GeoTIFF - writer do not currently work (see https://github.com/pytroll/satpy/issues/1762) - If you use this scheduler, set the configuration variable - ``readers.preload_dask_distributed`` to True. -- Currently, Satpy merely checks the existence of a file and not whether it - has been completely written. This may lead to incomplete files being read, - which might lead to failures. - -For more technical background reading including hints -on how this could be extended to other readers, see -:class:`~satpy.readers.netcdf_utils.PreloadableSegments` and -:class:`~satpy.readers.yaml_reader.GEOSegmentYAMLReader`. - -.. 
versionadded:: 0.50 - Adding a Reader to Satpy ======================== diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 5c36780a72..acddefec5a 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -644,8 +644,7 @@ def _wait_for_file(pat, max_tries=300, wait=2, use_distributed=False): """Wait for file to appear.""" for i in range(max_tries): if (match := _check_for_matching_file(i, pat, wait, use_distributed)): - if i > 0: # only log if not found immediately - LOG.debug(f"Found {match!s} matching {pat!s}!") + LOG.debug(f"Found {match!s} matching {pat!s}!") return match else: raise TimeoutError(f"File matching {pat!s} failed to materialise") @@ -663,7 +662,7 @@ def _check_for_matching_file(i, pat, wait, use_distributed=False): def _log_and_wait(i, pat, wait, use_distributed=False): """Maybe log that we're waiting for pat, then wait.""" - if i == 0: # only log if not yet present + if i == 0: LOG.debug(f"Waiting for {pat!s} to appear.") if i % 60 == 30: LOG.debug(f"Still waiting for {pat!s}") From a1847b755329cfd7763bb069e5edb1672b24cce6 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 2 Aug 2024 11:23:25 +0200 Subject: [PATCH 70/74] Fix typo in documentation --- doc/source/advanced/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/advanced/index.rst b/doc/source/advanced/index.rst index a06b2bf2e1..b64eb19c02 100644 --- a/doc/source/advanced/index.rst +++ b/doc/source/advanced/index.rst @@ -21,7 +21,7 @@ would wait for all needed segments to arrive, before they start processing any data by passing all segments to the :class:`~satpy.scene.Scene`. 
For a more timely imagery production, users can create the Scene, load the data, resample, and even call :meth:`~satpy.scene.Scene.save_datasets` -before the data are complete (:meth:`~satpy.scene.Scene.`save_datasets` will wait until the data +before the data are complete (:meth:`~satpy.scene.Scene.save_datasets` will wait until the data are available, unless ``compute=False``). Upon computation, much of the overhead in Satpy internals has already been completed, and Satpy will process each segment From 32f2ff7ac7c3c9842293c0ce59a15afe39afcf7a Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 2 Aug 2024 11:46:11 +0200 Subject: [PATCH 71/74] Rename preload configuration parameters Rename the preloading configuration parameters to use their own subsection. --- doc/source/config.rst | 19 ++++++++++--------- satpy/_config.py | 10 ++++++---- satpy/readers/netcdf_utils.py | 8 ++++---- satpy/readers/yaml_reader.py | 2 +- satpy/tests/test_readers.py | 2 +- satpy/tests/test_yaml_reader.py | 12 ++++++------ 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/doc/source/config.rst b/doc/source/config.rst index 85c13f3ef2..5fa52c7fdc 100644 --- a/doc/source/config.rst +++ b/doc/source/config.rst @@ -284,28 +284,28 @@ Preloading segments Preloading is an advanced topic and experimental. See :ref:`preload` for details. -* **Environment variable**: ``SATPY_READERS__PRELOAD_SEGMENTS`` -* **YAML-Config Key**: ``readers.preload_segments`` +* **Environment variable**: ``SATPY_READERS__PRELOAD__ENABLE`` +* **YAML-Config Key**: ``readers.preload.enable`` * **Default**: False Preload segments for those readers where it is supported. -* **Environment variable**: ``SATPY_READERS__PRELOAD_STEP`` -* **YAML-Config Key**: ``readers.preload_step`` +* **Environment variable**: ``SATPY_READERS__PRELOAD__STEP`` +* **YAML-Config Key**: ``readers.preload.step`` * **Default**: 2 When preloading, internal time in seconds to check if a file has become available. 
-* **Environment variable**: ``SATPY_READERS__PRELOAD_TRIES``. -* **YAML-Config Key**: ``readers.preload_tries``. +* **Environment variable**: ``SATPY_READERS__PRELOAD__ATTEMPTS``. +* **YAML-Config Key**: ``readers.preload.attempts``. * **Default**: 300 When preloading, how many times to check if a file has become available before giving up. -* **Environment variable**: ``SATPY_READERS__PRELOAD_DASK_DISTRIBUTED`` -* **YAML-Config Key**: ``readers.preload_dask_distributed``. +* **Environment variable**: ``SATPY_READERS__PRELOAD__ASSUME_DISTRIBUTED`` +* **YAML-Config Key**: ``readers.preload.assume_distributed``. * **Default**: False When preloading, assume we are working in a dask distributed environment. @@ -313,7 +313,8 @@ When active, Satpy workers will secede and rejoin while waiting for files. This might partially avoid the problem that tasks waiting for later files are blocking workers, while tasks working on earlier files are needlessly waiting in the queue. However, Satpy has limited compatibility with -dask distributed. +dask distributed. Please refer to the note at :ref:`preload` before +considering this option. 
Temporary Directory ^^^^^^^^^^^^^^^^^^^ diff --git a/satpy/_config.py b/satpy/_config.py index af6843c8c5..c5f8635c50 100644 --- a/satpy/_config.py +++ b/satpy/_config.py @@ -53,10 +53,12 @@ "sensor_angles_position_preference": "actual", "readers": { "clip_negative_radiances": False, - "preload_segments": False, - "preload_step": 2, - "preload_tries": 300, - "preload_dask_distributed": False, + "preload": { + "enable": False, + "step": 2, + "attempts": 300, + "assume_distributed": False, + }, }, } diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index acddefec5a..7c42048a05 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -520,9 +520,9 @@ class PreloadableSegments: def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): """Store attributes needed for preloading to work.""" self.preload = preload - self.preload_tries = satpy.config.get("readers.preload_tries") - self.preload_step = satpy.config.get("readers.preload_step") - self.use_distributed = satpy.config.get("readers.preload_dask_distributed") + self.preload_attempts = satpy.config.get("readers.preload.attempts") + self.preload_step = satpy.config.get("readers.preload.step") + self.use_distributed = satpy.config.get("readers.preload.assume_distributed") if preload: if not isinstance(ref_fh, BaseFileHandler): raise TypeError( @@ -594,7 +594,7 @@ def _get_inv_name_map(self): def _collect_variable_delayed(self, subst_name): md = self.ref_fh[subst_name] # some metadata from reference segment - fn_matched = _wait_for_file(self.filename, self.preload_tries, + fn_matched = _wait_for_file(self.filename, self.preload_attempts, self.preload_step, self.use_distributed) dade = _get_delayed_value_from_nc(fn_matched, subst_name) diff --git a/satpy/readers/yaml_reader.py b/satpy/readers/yaml_reader.py index 19b0a35286..5620d27f83 100644 --- a/satpy/readers/yaml_reader.py +++ b/satpy/readers/yaml_reader.py @@ -1186,7 +1186,7 @@ class 
GEOSegmentYAMLReader(GEOFlippableFileYAMLReader): def __init__(self, *args, **kwargs): """Initialise object.""" - self.preload = satpy.config.get("readers.preload_segments") + self.preload = satpy.config.get("readers.preload.enable") super().__init__(*args, **kwargs) def create_filehandlers(self, filenames, fh_kwargs=None): diff --git a/satpy/tests/test_readers.py b/satpy/tests/test_readers.py index d6db47b08f..9663b643e8 100644 --- a/satpy/tests/test_readers.py +++ b/satpy/tests/test_readers.py @@ -1135,7 +1135,7 @@ def test_create_preloadable_cache(tmp_path): rc_cache=tmp_path / "test.pkl", preload=False) dph.file_content["/iceland/reykjavík"] = xr.DataArray(da.from_array([[0, 1, 2]])) - with satpy.config.set({"readers.preload_segments": True}): + with satpy.config.set({"readers.preload.enable": True}): gsyr = GEOSegmentYAMLReader(fake_config) gsyr.file_handlers["handler"] = [dph] diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index ff366b8a2d..b83c06643a 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1575,7 +1575,7 @@ def test_predict_filename_info(tmp_path): def fake_gsyreader(): """Create a fake GeoSegmentYAMLReader.""" from satpy.readers.yaml_reader import GEOSegmentYAMLReader - with satpy.config.set({"readers.preload_segments": True}): + with satpy.config.set({"readers.preload.enable": True}): return GEOSegmentYAMLReader( {"reader": { "name": "alicudi"}, @@ -1678,7 +1678,7 @@ def test_preloaded_instances_works( "{platform}-c-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc", "{platform}-d-{start_time:%Y%m%d%H%M%S}-{end_time:%Y%m%d%H%M%S}-{segment:>02d}.nc"]} - with satpy.config.set({"readers.preload_segments": True}): + with satpy.config.set({"readers.preload.enable": True}): gsyr = GEOSegmentYAMLReader( {"reader": { "name": "island-reader"}, @@ -1717,7 +1717,7 @@ def test_preloaded_instances_requirement( ft_info = {**fake_filetype_info, "requires": ["pergola"]} - with 
satpy.config.set({"readers.preload_segments": True}): + with satpy.config.set({"readers.preload.enable": True}): gsyr = GEOSegmentYAMLReader( {"reader": { "name": "alicudi"}, @@ -1758,7 +1758,7 @@ def test_get_cache_filename_segment_only(tmp_path): "segment_tag": "segment", "expected_segments": 5} - with satpy.config.set({"readers.preload_segments": True}): + with satpy.config.set({"readers.preload.enable": True}): gsyr = GEOSegmentYAMLReader( {"reader": { "name": "filicudi"}, @@ -1785,7 +1785,7 @@ def test_get_cache_filename_cache_and_segment(tmp_path): "segment_tag": "segment", "expected_segments": 5} - with satpy.config.set({"readers.preload_segments": True}): + with satpy.config.set({"readers.preload.enable": True}): gsyr = GEOSegmentYAMLReader( {"reader": { "name": "salina"}, @@ -1814,7 +1814,7 @@ def test_get_cache_filename_including_time(tmp_path): "expected_segments": 5, "time_tags": ["start_time", "end_time"]} - with satpy.config.set({"readers.preload_segments": True}): + with satpy.config.set({"readers.preload.enable": True}): gsyr = GEOSegmentYAMLReader( {"reader": { "name": "salina"}, From 269036e6c5dfceb196dd9126106c6864fe34f276 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Fri, 2 Aug 2024 13:44:36 +0200 Subject: [PATCH 72/74] Refactor test for getting the cache filename Merge three very similar test functions to a single parametrised one. 
--- satpy/tests/test_yaml_reader.py | 84 +++++++++------------------------ 1 file changed, 23 insertions(+), 61 deletions(-) diff --git a/satpy/tests/test_yaml_reader.py b/satpy/tests/test_yaml_reader.py index b83c06643a..5ebed47b1b 100644 --- a/satpy/tests/test_yaml_reader.py +++ b/satpy/tests/test_yaml_reader.py @@ -1746,17 +1746,35 @@ def test_preloaded_instances_not_implemented(tmp_path, fake_gsyreader, list(g) -def test_get_cache_filename_segment_only(tmp_path): - """Test getting cache filename, segment only.""" +@pytest.mark.parametrize("include", [(), ("rc",), ("rc", "time")]) +def test_get_cache_filename(tmp_path, include): + """Test getting the pre-loading cache filename.""" from satpy.readers.yaml_reader import GEOSegmentYAMLReader - fn = tmp_path / "a-01.nc" + fn = fp = "a-" fn_info = {"segment": 1} ft_info = { "file_reader": BaseFileHandler, - "file_patterns": ["a-{segment:>02d}.nc"], "segment_tag": "segment", "expected_segments": 5} + if "time" in include: + fn += "20421015234500-234600-" + fp += "{start_time:%Y%m%d%H%M%S}-{end_time:%H%M%S}-" + fn_info["start_time"] = dt.datetime(2042, 10, 15, 23, 45) + fn_info["end_time"] = dt.datetime(2042, 10, 15, 23, 46) + ft_info["time_tags"] = ["start_time", "end_time"] + if "rc" in include: + fn += "04-" + fp += "{rc:>02d}-" + fn_info["rc"] = 4 + fn += "01.nc" + fp += "{segment:>02d}.nc" + + ft_info["file_patterns"] = [fp] + if "time" in include: + ref_fn = "a-99991231235959-235959-04-01.pkl" + else: + ref_fn = fn[:-2] + "pkl" with satpy.config.set({"readers.preload.enable": True}): gsyr = GEOSegmentYAMLReader( @@ -1770,60 +1788,4 @@ def test_get_cache_filename_segment_only(tmp_path): au.return_value = os.fspath(tmp_path / "cache") cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / - "BaseFileHandler" / "a-01.pkl") - - -def test_get_cache_filename_cache_and_segment(tmp_path): - """Test getting the cache filename with segment and repeat 
cycle.""" - from satpy.readers.yaml_reader import GEOSegmentYAMLReader - - fn = tmp_path / "a-04-01.nc" - fn_info = {"rc": 4, "segment": 1} - ft_info = { - "file_reader": BaseFileHandler, - "file_patterns": ["a-{rc:>02d}-{segment:>02d}.nc"], - "segment_tag": "segment", - "expected_segments": 5} - - with satpy.config.set({"readers.preload.enable": True}): - gsyr = GEOSegmentYAMLReader( - {"reader": { - "name": "salina"}, - "file_types": { - "m9g": ft_info}}) - fh = BaseFileHandler(fn, fn_info, ft_info) - - with unittest.mock.patch("appdirs.user_cache_dir") as au: - au.return_value = os.fspath(tmp_path / "cache") - cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) - assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / - "BaseFileHandler" / "a-04-01.pkl") - - -def test_get_cache_filename_including_time(tmp_path): - """Test getting the cache filename including a dummpy time.""" - from satpy.readers.yaml_reader import GEOSegmentYAMLReader - - fn = tmp_path / "a-20421015234500-234600-04-01.nc" - fn_info = {"start_time": dt.datetime(2042, 10, 15, 23, 45), - "end_time": dt.datetime(2042, 10, 15, 23, 46), "rc": 4, "segment": 1} - ft_info = { - "file_reader": BaseFileHandler, - "file_patterns": ["a-{start_time:%Y%m%d%H%M%S}-{end_time:%H%M%S}-{rc:>02d}-{segment:>02d}.nc"], - "segment_tag": "segment", - "expected_segments": 5, - "time_tags": ["start_time", "end_time"]} - - with satpy.config.set({"readers.preload.enable": True}): - gsyr = GEOSegmentYAMLReader( - {"reader": { - "name": "salina"}, - "file_types": { - "m9g": ft_info}}) - fh = BaseFileHandler(fn, fn_info, ft_info) - - with unittest.mock.patch("appdirs.user_cache_dir") as au: - au.return_value = os.fspath(tmp_path / "cache") - cf = gsyr._get_cache_filename(os.fspath(fn), fn_info, fh) - assert cf == os.fspath(tmp_path / "cache" / "satpy" / "preloadable" / - "BaseFileHandler" / "a-99991231235959-235959-04-01.pkl") + "BaseFileHandler" / ref_fn) From 41637fa25db565925932d750b729938ffe5c8294 Mon 
Sep 17 00:00:00 2001 From: Gerrit Holl Date: Mon, 5 Aug 2024 11:49:20 +0200 Subject: [PATCH 73/74] Update versionadded to show 0.51, not 0.50 --- satpy/readers/netcdf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index 7c42048a05..d4106e4a97 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -514,7 +514,7 @@ class PreloadableSegments: This feature is experimental. - .. versionadded:: 0.50 + .. versionadded:: 0.51 """ def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs): From bde842e911a53f30c37b260b35e1af1f5760ee66 Mon Sep 17 00:00:00 2001 From: Gerrit Holl Date: Thu, 15 Aug 2024 16:10:05 +0200 Subject: [PATCH 74/74] version added: 0.52 --- doc/source/advanced/index.rst | 2 +- satpy/readers/netcdf_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/advanced/index.rst b/doc/source/advanced/index.rst index b64eb19c02..876037eb5d 100644 --- a/doc/source/advanced/index.rst +++ b/doc/source/advanced/index.rst @@ -85,7 +85,7 @@ on how this could be extended to other readers, see the API documentations for :class:`~satpy.readers.netcdf_utils.PreloadableSegments` and :class:`~satpy.readers.yaml_reader.GEOSegmentYAMLReader`. -.. versionadded:: 0.51 +.. versionadded:: 0.52 .. toctree:: :hidden: diff --git a/satpy/readers/netcdf_utils.py b/satpy/readers/netcdf_utils.py index d4106e4a97..01aeafbf1b 100644 --- a/satpy/readers/netcdf_utils.py +++ b/satpy/readers/netcdf_utils.py @@ -514,7 +514,7 @@ class PreloadableSegments: This feature is experimental. - .. versionadded:: 0.51 + .. versionadded:: 0.52 """ def __init__(self, *args, preload=False, ref_fh=None, rc_cache=None, **kwargs):