From f19215b24fa721fe484508ac9f1f5189c237777b Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 23 Jan 2025 18:24:58 -0500 Subject: [PATCH 01/10] Refactor missing methods --- CHANGELOG.rst | 5 + src/xclim/core/indicator.py | 20 +- src/xclim/core/missing.py | 797 ++++++++++++++++++------------------ src/xclim/core/options.py | 19 +- tests/test_missing.py | 20 +- 5 files changed, 421 insertions(+), 440 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4b7570b3b..7a3ec3706 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,11 +16,16 @@ New features and enhancements * New function ``ensemble.partition.general_partition`` (:pull:`2035`) * Added a new ``xclim.indices.generic.bivariate_count_occurrences`` function to count instances where operations and performed and validated for two variables. (:pull:`2030`). * `xclim` now tracks energy usage and carbon emissions ("last run", "average", and "total") during CI workflows using the `eco-ci-energy-estimation` GitHub Action. (:pull:`2046`). +* Missing values method "pct" and "at_least_n" now accept a new "subfreq" option that allows to compute the missing mask in two steps. When given, the algorithm is applied at this "subfreq" resampling frequency first and then the result is resampled at the target indicator frequency. In the output, a period is invalid if any of its subgroup where flagged as invalid by the chosen method. Internal changes ^^^^^^^^^^^^^^^^ * `sphinx-codeautolink` and `pygments` have been temporarily pinned due to breaking API changes. (:pull:`2030`). * Adjusted the ``TestOfficialYaml`` test to use a dynamic method for finding the installed location of `xclim`. (:pull:`2028`). +* Refactor of the ``xclim.core.missing`` module, usage of the ``Missing`` objects has been broken. + - Objects are initialized with their options and then called with the data, input frequency, target frequency and indexer. + - Subclasses receive non-resampled DataArray in their ``is_missing`` methods. + - ``MissingWMO`` now uses ``xclim.indices.helpers.resample_map`` which should greatly improve performance in a dask context. v0.54.0 (2024-12-16) -------------------- diff --git a/src/xclim/core/indicator.py b/src/xclim/core/indicator.py index 37543fec6..367805192 100644 --- a/src/xclim/core/indicator.py +++ b/src/xclim/core/indicator.py @@ -1524,12 +1524,6 @@ def __init__(self, **kwds): "Cannot set `missing_options` with `missing` method being from context." ) - # Validate hard-coded missing options - kls = MISSING_METHODS[self.missing] - self._missing = kls.execute - if self.missing_options: - kls.validate(**self.missing_options) - super().__init__(**kwds) def _history_string(self, das, params): @@ -1541,6 +1535,8 @@ def _history_string(self, das, params): if missing != "skip": mopts = self.missing_options or OPTIONS[MISSING_OPTIONS].get(missing) + if mopts.get("subfreq", "absent") is None: + mopts.pop("subfreq") # impertinent default if mopts: opt_str += f", missing_options={mopts}" @@ -1555,17 +1551,19 @@ def _postprocess(self, outs, das, params): outs = super()._postprocess(outs, das, params) freq = self._get_missing_freq(params) - if self.missing != "skip" or freq is False: + method = ( + self.missing if self.missing != "from_context" else OPTIONS[CHECK_MISSING] + ) + if method != "skip" and freq is not False: # Mask results that do not meet criteria defined by the `missing` method. # This means all outputs must have the same dimensions as the broadcasted inputs (excluding time) - options = self.missing_options or OPTIONS[MISSING_OPTIONS].get( - self.missing, {} - ) + options = self.missing_options or OPTIONS[MISSING_OPTIONS].get(method, {}) + misser = MISSING_METHODS[method](**options) # We flag periods according to the missing method. skip variables without a time coordinate. src_freq = self.src_freq if isinstance(self.src_freq, str) else None miss = ( - self._missing(da, freq, src_freq, options, params.get("indexer", {})) + misser(da, freq, src_freq, **params.get("indexer", {})) for da in das.values() if "time" in da.coords ) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index ed644b3ce..6aa82ecdf 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -9,18 +9,17 @@ consecutive and overall missing values per month. `xclim` has a registry of missing value detection algorithms that can be extended by users to customize the behavior -of indicators. Once registered, algorithms can be used within indicators by setting the `missing` attribute of an -`Indicator` subclass. By default, `xclim` registers the following algorithms: +of indicators. Once registered, algorithms can be used by setting the global option as ``xc.set_options(check_missing="method")`` +or within indicators by setting the `missing` attribute of an `Indicator` subclass. +By default, `xclim` registers the following algorithms: * `any`: A result is missing if any input value is missing. * `at_least_n`: A result is missing if less than a given number of valid values are present. * `pct`: A result is missing if more than a given fraction of values are missing. * `wmo`: A result is missing if 11 days are missing, or 5 consecutive values are missing in a month. - * `skip`: Skip missing value detection. - * `from_context`: Look-up the missing value algorithm from options settings. See :py:func:`xclim.set_options`. To define another missing value algorithm, subclass :py:class:`MissingBase` and decorate it with -:py:func:`xclim.core.options.register_missing_method`. +:py:func:`xclim.core.options.register_missing_method`. See subclassing guidelines in ``MissingBase``'s doc. """ from __future__ import annotations @@ -29,7 +28,7 @@ import xarray as xr from xclim.core.calendar import ( - get_calendar, + compare_offsets, is_offset_divisor, parse_offset, select_time, @@ -41,9 +40,12 @@ OPTIONS, register_missing_method, ) +from xclim.indices import run_length as rl +from xclim.indices.helpers import resample_map __all__ = [ "at_least_n_valid", + "expected_count", "missing_any", "missing_from_context", "missing_pct", @@ -51,111 +53,158 @@ "register_missing_method", ] -_np_timedelta64 = {"D": "timedelta64[D]", "h": "timedelta64[h]"} +# Mapping from sub-monthly CFtime freq strings to numpy timedelta64 units +# Only "minute" is different between the two +_freq_to_timedelta = {"min": "m"} -class MissingBase: - r"""Base class used to determined where Indicator outputs should be masked. - Subclasses should implement `is_missing` and `validate` methods. +def expected_count( + time: xr.DataArray, + freq: str | None = None, + src_timestep: str | None = None, + **indexer, +) -> xr.DataArray: + """ + Get expected number of step of length ``src_timestep`` per each resampling period + ``freq`` that ``time`` covers. - Decorate subclasses with `xclim.core.options.register_missing_method` to add them - to the registry before using them in an Indicator. + The determination of the resampling periods intersecting with the input array are + done following xarray's and pandas' heuristics. The input coordinate needs not be + continuous if `src_timestep` is given. Parameters ---------- - da : xr.DataArray - Input data. - freq : str - Resampling frequency. + time : xr.DataArray, optional + Input time coordinate from which the final resample time coordinate is guessed. + freq : str, optional. + Resampling frequency. If absent, the count for the full time range is returned. src_timestep : str, Optional The expected input frequency. If not given, it will be inferred from the input array. **indexer : Indexer Time attribute and values over which to subset the array. For example, use season='DJF' to select winter values, month=1 to select January, or month=[6,7,8] to select summer months. If not indexer is given, all values are considered. - """ + See :py:func:`xc.core.calendar.select_time`. - def __init__(self, da, freq, src_timestep, **indexer): + Returns + ------- + xr.DataArray + Integer array at the resampling frequency with the number of expected elements in each period. + """ + if src_timestep is None: + src_timestep = xr.infer_freq(time) if src_timestep is None: - src_timestep = xr.infer_freq(da.time) - if src_timestep is None: - raise ValueError( - "`src_timestep` must be given as it cannot be inferred." - ) - self.null, self.count = self.prepare(da, freq, src_timestep, **indexer) + raise ValueError( + "A src_timestep must be passed when it can't be inferred from the data." + ) - @classmethod - def execute( - cls, - da: xr.DataArray, - freq: str, - src_timestep: str, - options: dict, - indexer: dict, - ) -> xr.DataArray: - """Create the instance and call it in one operation. + if freq is not None and not is_offset_divisor(src_timestep, freq): + raise NotImplementedError( + "Missing checks not implemented for timeseries resampled to a frequency that is not " + f"aligned with the source timestep. {src_timestep} is not a divisor of {freq}." + ) - Parameters - ---------- - da : xr.DataArray - Input data. - freq : str - Resampling frequency. - src_timestep : str - The expected input frequency. If not given, it will be inferred from the input array. - options : dict - Dictionary of options to pass to the instance. - indexer : dict - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter - values, month=1 to select January, or month=[6,7,8] to select summer months. - If no indexer is given, all values are considered. + # Ensure a DataArray constructed like we expect + time = xr.DataArray( + time.values, dims=("time",), coords={"time": time.values}, name="time" + ) - Returns - ------- - xr.DataArray - The executed DataArray. - """ - obj = cls(da, freq, src_timestep, **indexer) - return obj(**options) + if freq: + resamp = time.resample(time=freq).first() + resamp_time = resamp.indexes["time"] + _, _, is_start, _ = parse_offset(freq) + if is_start: + start_time = resamp_time + end_time = start_time.shift(1, freq=freq) + else: + end_time = resamp_time + start_time = end_time.shift(-1, freq=freq) + else: # freq=None, means the whole timeseries as a single period + i = time.indexes["time"] + start_time = i[:1] + end_time = i[-1:] + + # don't forget : W is converted to 7D + mult, base, _, _ = parse_offset(src_timestep) + if indexer or base in "YAQM": + # Create a full synthetic time series and compare the number of days with the original series. + t = xr.date_range( + start_time[0], + end_time[-1], + freq=src_timestep, + calendar=time.dt.calendar, + use_cftime=(start_time.dtype == "O"), + ) + + sda = xr.DataArray(data=np.ones(len(t)), coords={"time": t}, dims=("time",)) + indexer.update({"drop": True}) + st = select_time(sda, **indexer) + if freq: + count = st.notnull().resample(time=freq).sum(dim="time") + else: + count = st.notnull().sum(dim="time") + else: # simpler way for sub monthly without indexer. + delta = end_time - start_time + unit = _freq_to_timedelta.get(base, base) + n = delta.values.astype(f"timedelta64[{unit}]").astype(float) / mult + + if freq: + count = xr.DataArray(n, coords={"time": resamp.time}, dims="time") + else: + count = xr.DataArray(n[0] + 1) + return count + + +class MissingBase: + r"""Base class used to determined where Indicator outputs should be masked. + + Subclasses should implement the ``is_missing``, ``validate`` and ``__init__`` + methods. The ``__init__`` is to be implement in order to change the docstring + and signature but is not expected to do anything other than the validation + of the options, everything else should happen in the call (i.e. ``is_missing``). + Subclasses can also override the ``_validate_src_timestep`` method to add restrictions + on allowed values. That method should return False on invalid ``src_timestep``. + + Decorate subclasses with `xclim.core.options.register_missing_method` to add them + to the registry before using them in an Indicator. + """ + + def __init__(self, **options): + if not self.validate(**options): + raise ValueError( + f"Options {options} are not valid for {self.__class__.__name__}." + ) + self.options = options @staticmethod - def split_freq(freq: str) -> list[str] | tuple[str, None]: - """Split the frequency into a base frequency and a suffix. + def validate(**options): + r"""Validates optional arguments. Parameters ---------- - freq : str - The frequency string. + **options : dict + Optional arguments. Returns ------- - list of str or (str, None) - A list of the base frequency and the suffix, or a tuple with the base frequency and None. + bool + False if the options are not valid. """ - if freq is None: - return "", None - - if "-" in freq: - return freq.split("-") - - return freq, None + return True @staticmethod - def is_null( - da: xr.DataArray, freq: str, **indexer - ) -> xr.DataArray | xr.DataArray.resample: + def is_null(da: xr.DataArray, **indexer) -> xr.DataArray: r"""Return a boolean array indicating which values are null. Parameters ---------- da : xr.DataArray Input data. - freq : str - Resampling frequency, from the periods defined in :ref:`timeseries.resampling`. **indexer : {dim: indexer}, optional The time attribute and values over which to subset the array. For example, use season='DJF' to select winter values, month=1 to select January, or month=[6,7,8] to select summer months. + See :py:func:`xclim.core.calendar.select_time`. Returns ------- @@ -173,126 +222,89 @@ def is_null( raise ValueError("No data for selected period.") null = selected.isnull() - if freq: - return null.resample(time=freq) - return null - def prepare( - self, da: xr.DataArray, freq: str, src_timestep: str, **indexer - ) -> tuple[xr.DataArray, xr.DataArray]: - r"""Prepare arrays to be fed to the `is_missing` function. + def _validate_src_timestep(self, src_timestep): + return True + + def is_missing( + self, + null: xr.DataArray, + count: xr.DataArray, + freq: str | None, + ) -> xr.DataArray: + """Return whether the values within each period should be considered missing or not. + + Must be implemented by subclasses. Parameters ---------- - da : xr.DataArray - Input data. - freq : str - Resampling frequency, from the periods defined in :ref:`timeseries.resampling`. - src_timestep : str - Expected input frequency, from the periods defined in :ref:`timeseries.resampling`. - **indexer : {dim: indexer}, optional - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter - values, month=1 to select January, or month=[6,7,8] to select summer months. If not indexer is given, - all values are considered. + null : DataArray + Boolean array of invalid values (that has already been indexed). + count : DataArray + Indexer-aware integer array of expected elements at the resampling frequency. + freq : str or None + The resampling frequency, or None if the temporal dimension is to be collapsed. Returns ------- - xr.DataArray, xr.DataArray - Boolean array indicating which values are null, array of expected number of valid values. - - Raises - ------ - NotImplementedError - If no frequency is provided and the source timestep is not a divisor of the resampling frequency. - - Notes - ----- - If `freq=None` and an indexer is given, then missing values during period at the start or end of array won't be - flagged. + DataArray + Boolean array at the resampled frequency, + True on the periods that should be considered missing. """ - # This function can probably be made simpler once CFPeriodIndex is implemented. - null = self.is_null(da, freq, **indexer) - - p_freq, _ = self.split_freq(freq) - - c = null.sum(dim="time") - - # Otherwise, simply use the start and end dates to find the expected number of days. - if p_freq.endswith("S"): - start_time = c.indexes["time"] - end_time = start_time.shift(1, freq=freq) - elif p_freq: - end_time = c.indexes["time"] - start_time = end_time.shift(-1, freq=freq) - else: - i = da.time.to_index() - start_time = i[:1] - end_time = i[-1:] - - if freq is not None and not is_offset_divisor(src_timestep, freq): - raise NotImplementedError( - "Missing checks not implemented for timeseries resampled to a frequency that is not " - f"aligned with the source timestep. {src_timestep} is not a divisor of {freq}." - ) - - offset = parse_offset(src_timestep) - if indexer or offset[1] in "YAQM": - # Create a full synthetic time series and compare the number of days with the original series. - t = xr.date_range( - start_time[0], - end_time[-1], - freq=src_timestep, - calendar=get_calendar(da), - use_cftime=(start_time.dtype == "O"), - ) - - sda = xr.DataArray(data=np.ones(len(t)), coords={"time": t}, dims=("time",)) - indexer.update({"drop": True}) - st = select_time(sda, **indexer) - if freq: - count = st.notnull().resample(time=freq).sum(dim="time") - else: - count = st.notnull().sum(dim="time") - - else: - delta = end_time - start_time - n = ( - delta.values.astype(_np_timedelta64[offset[1]]).astype(float) - / offset[0] - ) - - if freq: - count = xr.DataArray(n, coords={"time": c.time}, dims="time") - else: - count = xr.DataArray(n[0] + 1) - - return null, count - - def is_missing(self, null, count, **kwargs): # numpydoc ignore=PR01 - """Return whether the values within each period should be considered missing or not.""" raise NotImplementedError() - @staticmethod - def validate(**kwargs) -> bool: - r"""Return whether options arguments are valid or not. + def __call__( + self, + da: xr.DataArray, + freq: str | None = None, + src_timestep: str | None = None, + **indexer, + ) -> xr.DataArray: + """ + Compute the missing period mask according to the object's algorithm. Parameters ---------- - **kwargs : dict - Options arguments. + da : xr.DataArray + Input data, must have a "time" coordinate. + freq : str, optional + Resampling frequency. If absent, a collapse of the temporal dimension is assumed. + src_timestep : str, optional + The expected source input frequency. If not given, it will be inferred from the input array. + **indexer : Indexer + Time attribute and values over which to subset the array. For example, use season='DJF' to select winter + values, month=1 to select January, or month=[6,7,8] to select summer months. + If not indexer is given, all values are considered. + See :py:func:`xclim.core.calendar.select_time`. Returns ------- - bool - Whether the options are valid or not. + DataArray + Boolean array at the resampled frequency, + True on the periods that should be considered missing or invalid. """ - return True + if src_timestep is None: + src_timestep = xr.infer_freq(da.time) + if src_timestep is None: + raise ValueError( + "The source timestep can't be inferred from the data, but it is required" + " to compute the missing values mask." + ) - def __call__(self, **kwargs): - if not self.validate(**kwargs): - raise ValueError("Invalid arguments") - return self.is_missing(self.null, self.count, **kwargs) + if not self._validate_src_timestep(src_timestep): + raise ValueError( + f"Input source timestep {src_timestep} is invalid " + f"for missing method {self.__class__.__name__}." + ) + + count = expected_count(da.time, freq=freq, src_timestep=src_timestep, **indexer) + null = self.is_null(da, **indexer) + return self.is_missing(null, count, freq) + + def __repr__(self): + opt_str = ", ".join([f"{k}={v}" for k, v in self.options.items()]) + return f"<{self.__class__.__name__}({opt_str})>" # ----------------------------------------------- @@ -302,57 +314,81 @@ def __call__(self, **kwargs): @register_missing_method("any") class MissingAny(MissingBase): - r"""Return whether there are missing days in the array. + """Mask periods as missing if in any of its elements is missing or invalid.""" - Parameters - ---------- - da : xr.DataArray - Input array. - freq : str - Resampling frequency. - src_timestep : {"D", "h", "M"} - Expected input frequency. - **indexer : {dim: indexer, }, optional - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter - values, month=1 to select January, or month=[6,7,8] to select summer months. - If not indexer is given, all values are considered. + def __init__(self): + """Create a MissingAny object.""" + super().__init__() - Returns - ------- - xr.DataArray - A boolean array set to True if period has missing values. - """ + def is_missing( + self, null: xr.DataArray, count: xr.DataArray, freq: str | None + ) -> xr.DataArray: + if freq is not None: + null = null.resample(time=freq) + cond0 = null.count(dim="time") != count # Check total number of days + cond1 = null.sum(dim="time") > 0 # Check if any is missing + return cond0 | cond1 - def __init__(self, da, freq, src_timestep, **indexer): - super().__init__(da, freq, src_timestep, **indexer) - def is_missing( - self, null: xr.DataArray, count: xr.DataArray, **kwargs - ) -> xr.DataArray: # noqa - r"""Return whether the values within each period should be considered missing or not. +class MissingTwoSteps(MissingBase): + r"""Base class used to determined where Indicator outputs should be masked, + in a two step process. + + In addition to what :py:class:`MissingBase` does, subclasses first perform the mask + determination at some frequency and then resample at the (coarser) target frequency. + This allows the application of specific methods at a finer resolution than the target one. + The sub-groups are merged using the "Any" method : a group is invalid if any of its + sub-groups are invalid. + + The first resampling frequency should be implemented as an additional "subfreq" option. + A value of None means that only one resampling is done at the request target frequency. + """ + + def __call__( + self, + da: xr.DataArray, + freq: str | None = None, + src_timestep: str | None = None, + **indexer, + ) -> xr.DataArray: + """ + Compute the missing period mask according to the object's algorithm. Parameters ---------- - null : xr.DataArray - Boolean array indicating which values are null. - count : xr.DataArray - Array of expected number of valid values. - **kwargs : dict - Additional arguments. - - Returns - ------- - xr.DataArray - A boolean array set to True if period has missing values. + da : xr.DataArray + Input data, must have a "time" coordinate. + freq : str, optional + Target resampling frequency. If absent, a collapse of the temporal dimension is assumed. + src_timestep : str, optional + The expected source input frequency. If not given, it will be inferred from the input array. + **indexer : Indexer + Time attribute and values over which to subset the array. For example, use season='DJF' to select winter + values, month=1 to select January, or month=[6,7,8] to select summer months. + If not indexer is given, all values are considered. + See :py:func:`xclim.core.calendar.select_time`. """ - cond0 = null.count(dim="time") != count # Check total number of days - cond1 = null.sum(dim="time") > 0 # Check if any is missing - return cond0 | cond1 + subfreq = self.options["subfreq"] or freq + if ( + subfreq is not None + and freq is not None + and compare_offsets(freq, "<", subfreq) + ): + raise ValueError( + "The target resampling frequency cannot be finer than the first-step " + f"frequency. Got : {subfreq} > {freq}." + ) + miss = super().__call__(da, freq=subfreq, src_timestep=src_timestep, **indexer) + if subfreq != freq: + miss = MissingAny()( + miss.where(~miss), freq, src_timestep=subfreq, **indexer + ) + return miss @register_missing_method("wmo") -class MissingWMO(MissingAny): - r"""Return whether a series fails WMO criteria for missing days. +class MissingWMO(MissingTwoSteps): + """Mask periods as missing using the WMO criteria for missing days. The World Meteorological Organisation recommends that where monthly means are computed from daily values, it should be considered missing if either of these two criteria are met: @@ -363,219 +399,110 @@ class MissingWMO(MissingAny): Stricter criteria are sometimes used in practice, with a tolerance of 5 missing values or 3 consecutive missing values. - Parameters - ---------- - da : DataArray - Input array. - freq : str - Resampling frequency. - nm : int - Number of missing values per month that should not be exceeded. - nc : int - Number of consecutive missing values per month that should not be exceeded. - src_timestep : {"D"} - Expected input frequency. Only daily values are supported. - **indexer : {dim: indexer, }, optional - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter - values, month=1 to select January, or month=[6,7,8] to select summer months. - If not indexer is given, all values are considered. - - Returns - ------- - xr.DataArray - A boolean array set to True if period has missing values. - Notes ----- If used at frequencies larger than a month, for example on an annual or seasonal basis, the function will return - True if any month within a period is missing. + True if any month within a period is masked. """ - def __init__(self, da, freq, src_timestep, **indexer): - # Force computation on monthly frequency - if src_timestep != "D": - raise ValueError( - "WMO method to estimate missing data is only defined for daily series." - ) - - if not freq.startswith("M"): - raise ValueError - - super().__init__(da, freq, src_timestep, **indexer) - - @classmethod - def execute( - cls, - da: xr.DataArray, - freq: str, - src_timestep: str, - options: dict, - indexer: dict, - ) -> xr.DataArray: - """Create the instance and call it in one operation. + def __init__(self, nm: int = 11, nc: int = 5): + """Create a MissingWMO object. Parameters ---------- - da : xr.DataArray - Input data. - freq : str - Resampling frequency. - src_timestep : str - The expected input frequency. If not given, it will be inferred from the input array. - options : dict - Dictionary of options to pass to the instance. - indexer : dict - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter - values, month=1 to select January, or month=[6,7,8] to select summer months. - If not indexer is given, all values are considered. - - Returns - ------- - xr.DataArray - The executed WMO-missing standards-compliant DataArray. + nm : int + Minimal number of missing elements for a month to be masked. + nc : int + Minimal number of consecutive missing elements for a month to be masked. """ - if freq[0] not in ["Y", "A", "Q", "M"]: - raise ValueError( - "MissingWMO can only be used with Monthly or longer frequencies." - ) - obj = cls(da, "ME", src_timestep, **indexer) - miss = obj(**options) - # Replace missing months by NaNs - mda = miss.where(miss == 0) - return MissingAny(mda, freq, "ME", **indexer)() - - def is_missing(self, null, count, nm=11, nc=5): - from xclim.indices import ( - run_length as rl, # pylint: disable=import-outside-toplevel - ) + super().__init__(nm=nm, nc=nc, subfreq="MS") + + @staticmethod + def validate(nm: int, nc: int, subfreq: str | None = None): + return nm < 31 and nc < 31 + + def _validate_src_timestep(self, src_timestep): + return src_timestep == "D" + + def is_missing( + self, null: xr.DataArray, count: xr.DataArray, freq: str + ) -> xr.DataArray: + nullr = null.resample(time=freq) # Check total number of days - cond0 = null.count(dim="time") != count + cond0 = nullr.count(dim="time") != count # Check if more than threshold is missing - cond1 = null.sum(dim="time") >= nm + cond1 = nullr.sum(dim="time") >= self.options["nm"] # Check for consecutive missing values - cond2 = null.map(rl.longest_run, dim="time") >= nc + longest_run = resample_map(null, "time", freq, rl.longest_run, map_blocks=True) + cond2 = longest_run >= self.options["nc"] return cond0 | cond1 | cond2 - @staticmethod - def validate(nm, nc): - return nm < 31 and nc < 31 - @register_missing_method("pct") -class MissingPct(MissingBase): - r"""Return whether there are more missing days in the array than a given percentage. - - Parameters - ---------- - da : DataArray - Input array. - freq : str - Resampling frequency. - tolerance : float - Fraction of missing values that are tolerated [0,1]. - src_timestep : {"D", "h"} - Expected input frequency. - **indexer : {dim: indexer, }, optional - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter values, - month=1 to select January, or month=[6,7,8] to select summer months. - If not indexer is given, all values are considered. - - Returns - ------- - xr.DataArray - A boolean array set to True if period has missing values. - """ +class MissingPct(MissingTwoSteps): + """Mask periods as missing when there are more then a given percentage of missing days.""" - def is_missing(self, null, count, tolerance=0.1): - if tolerance < 0 or tolerance > 1: - raise ValueError("tolerance should be between 0 and 1.") + def __init__(self, tolerance: float = 0.1, subfreq: str | None = None): + """Create a MissingPct object. - n = count - null.count(dim="time").fillna(0) + null.sum(dim="time").fillna(0) - return n / count >= tolerance + Parameters + ---------- + tolerance: float + The maximum tolerated proportion of missing values, + given as a number between 0 and 1. + subfreq : str, optional + If given, compute a mask at this frequency using this method and + then resample at the target frequency using the "any" method on sub-groups. + """ + super().__init__(tolerance=tolerance, subfreq=subfreq) @staticmethod - def validate(tolerance): + def validate(tolerance: float, subfreq: str | None = None): return 0 <= tolerance <= 1 + def is_missing( + self, null: xr.DataArray, count: xr.DataArray, freq: str | None + ) -> xr.DataArray: + if freq is not None: + null = null.resample(time=freq) -@register_missing_method("at_least_n") -class AtLeastNValid(MissingBase): - r"""Return whether there are at least a given number of valid values. - - Parameters - ---------- - da : xr.DataArray - Input array. - freq : str - Resampling frequency. - n : int - Minimum of valid values required. - src_timestep : {"D", "h"} - Expected input frequency. - indexer : {dim: indexer, }, optional - Time attribute and values over which to subset the array. For example, use season='DJF' to select winter - values, month=1 to select January, or month=[6,7,8] to select summer months. - If not indexer is given, all values are considered. + n = count - null.count(dim="time").fillna(0) + null.sum(dim="time").fillna(0) + return n / count >= self.options["tolerance"] - Returns - ------- - xr.DataArray - A boolean array set to True if period has missing values. - """ - def __init__(self, da, freq, src_timestep, **indexer): - # No need to compute count, so no check required on `src_timestep`. - self.null = self.is_null(da, freq, **indexer) - self.count = None # Not needed +@register_missing_method("at_least_n") +class AtLeastNValid(MissingTwoSteps): + r"""Mask periods as missing if they don't have at least a given number of valid values (ignoring the expected count of elements).""" - def is_missing(self, null, count, n: int = 20): - """Check for missing results after a reduction operation. + def __init__(self, n: int = 20, subfreq: str | None = None): + """Create a AtLeastNValid object. - The result of a reduction operation is considered missing if less than `n` values are valid. + Parameters + ---------- + n: float + The minimum number of valid values needed. + subfreq : str, optional + If given, compute a mask at this frequency using this method and + then resample at the target frequency using the "any" method on sub-groups. """ - nvalid = null.count(dim="time").fillna(0) - null.sum(dim="time").fillna(0) - return nvalid < n + super().__init__(n=n, subfreq=subfreq) @staticmethod - def validate(n: int) -> bool: + def validate(n: int, subfreq: str | None = None): return n > 0 - -@register_missing_method("skip") -class Skip(MissingBase): # pylint: disable=missing-class-docstring - def __init__(self, da, freq=None, src_timestep=None, **indexer): - pass - - def is_missing(self, null, count): - """Return whether the values within each period should be considered missing or not.""" - return False - - def __call__(self): - return False - - -@register_missing_method("from_context") -class FromContext(MissingBase): - """Return whether each element of the resampled da should be considered missing according to the currently set options in `xclim.set_options`. - - See Also - -------- - xclim.set_options : For modifying run configurations - xclim.core.options.register_missing_method : For adding new missing value detection algorithms. - """ - - @classmethod - def execute(cls, da, freq, src_timestep, options, indexer): - name = OPTIONS[CHECK_MISSING] - kls = MISSING_METHODS[name] - opts = OPTIONS[MISSING_OPTIONS][name] - - return kls.execute(da, freq, src_timestep, opts, indexer) + def is_missing( + self, null: xr.DataArray, count: xr.DataArray, freq: str | None + ) -> xr.DataArray: + valid = ~null + if freq is not None: + valid = valid.resample(time=freq) + nvalid = valid.sum(dim="time") + return nvalid < self.options["n"] # -------------------------- @@ -589,8 +516,7 @@ def missing_any( # noqa: D103 # numpydoc ignore=GL08 da: xr.DataArray, freq: str, src_timestep: str | None = None, **indexer ) -> xr.DataArray: """Return whether there are missing days in the array.""" - src_timestep = src_timestep or xr.infer_freq(da.time) - return MissingAny(da, freq, src_timestep, **indexer)() + return MissingAny()(da, freq, src_timestep, **indexer) def missing_wmo( # noqa: D103 # numpydoc ignore=GL08 @@ -601,10 +527,7 @@ def missing_wmo( # noqa: D103 # numpydoc ignore=GL08 src_timestep: str | None = None, **indexer, ) -> xr.DataArray: - src_timestep = src_timestep or xr.infer_freq(da.time) - return MissingWMO.execute( - da, freq, src_timestep, options={"nm": nm, "nc": nc}, indexer=indexer - ) + return MissingWMO(nm=nm, nc=nc)(da, freq, src_timestep, **indexer) def missing_pct( # noqa: D103 # numpydoc ignore=GL08 @@ -612,28 +535,84 @@ def missing_pct( # noqa: D103 # numpydoc ignore=GL08 freq: str, tolerance: float, src_timestep: str | None = None, + subfreq: str | None = None, **indexer, ) -> xr.DataArray: - src_timestep = src_timestep or xr.infer_freq(da.time) - return MissingPct(da, freq, src_timestep, **indexer)(tolerance=tolerance) + return MissingPct(tolerance=tolerance, subfreq=subfreq)( + da, freq, src_timestep, **indexer + ) def at_least_n_valid( # noqa: D103 # numpydoc ignore=GL08 - da: xr.DataArray, freq: str, n: int = 1, src_timestep: str | None = None, **indexer + da: xr.DataArray, + freq: str, + n: int = 20, + src_timestep: str | None = None, + subfreq: str | None = None, + **indexer, ) -> xr.DataArray: - src_timestep = src_timestep or xr.infer_freq(da.time) - return AtLeastNValid(da, freq, src_timestep, **indexer)(n=n) + return AtLeastNValid(n=n, subfreq=subfreq)(da, freq, src_timestep, **indexer) def missing_from_context( # noqa: D103 # numpydoc ignore=GL08 da: xr.DataArray, freq: str, src_timestep: str | None = None, **indexer ) -> xr.DataArray: - src_timestep = src_timestep or xr.infer_freq(da.time) - return FromContext.execute(da, freq, src_timestep, options={}, indexer=indexer) + """Mask periods as missing according to the algorithm and options set in xclim's global options. + The options can be manipulated with :py:func:`xclim.core.options.set_options`. -missing_any.__doc__ = MissingAny.__doc__ -missing_wmo.__doc__ = MissingWMO.__doc__ -missing_pct.__doc__ = MissingPct.__doc__ -at_least_n_valid.__doc__ = AtLeastNValid.__doc__ -missing_from_context.__doc__ = FromContext.__doc__ + Parameters + ---------- + da : xr.DataArray + Input data, must have a "time" coordinate. + freq : str, optional + Resampling frequency. If absent, a collapse of the temporal dimension is assumed. + src_timestep : str, optional + The expected source input frequency. If not given, it will be inferred from the input array. + **indexer : Indexer + Time attribute and values over which to subset the array. For example, use season='DJF' to select winter + values, month=1 to select January, or month=[6,7,8] to select summer months. + If not indexer is given, all values are considered. + See :py:func:`xclim.core.calendar.select_time`. + + Returns + ------- + DataArray + Boolean array at the resampled frequency, + True on the periods that should be considered missing or invalid. + """ + method = OPTIONS[CHECK_MISSING] + MissCls = MISSING_METHODS[method] + opts = OPTIONS[MISSING_OPTIONS].get(method, {}) + return MissCls(**opts)(da, freq, src_timestep, **indexer) + + +def _get_convenient_doc(cls): + maindoc = cls.__doc__ + initdoc = cls.__init__.__doc__ + calldoc = cls.__call__.__doc__ + + params = [] + ip = 10000 + for i, line in enumerate(initdoc.split("\n")): + if line.strip() == "Parameters": + ip = i + if i >= ip + 2: + params.append(line) + + doc = [maindoc, ""] + ip = 10000 + for i, line in enumerate(calldoc.split("\n")): + if line.strip() == "Parameters": + ip = i + if i >= ip: + doc.append(line) + if i >= ip + 1: + doc.extend(params) + return "\n".join(doc) + + +missing_any.__doc__ = _get_convenient_doc(MissingAny) +missing_wmo.__doc__ = _get_convenient_doc(MissingWMO) +missing_pct.__doc__ = _get_convenient_doc(MissingPct) +at_least_n_valid.__doc__ = _get_convenient_doc(AtLeastNValid) diff --git a/src/xclim/core/options.py b/src/xclim/core/options.py index dcfd8cf3b..9a718c4c9 100644 --- a/src/xclim/core/options.py +++ b/src/xclim/core/options.py @@ -51,13 +51,12 @@ def _valid_missing_options(mopts): """Check if all methods and their options in mopts are valid.""" return all( - meth in MISSING_METHODS # Ensure the method is registered in MISSING_METHODS - and all( - opt in OPTIONS[MISSING_OPTIONS][meth] for opt in opts.keys() - ) # Check if all options provided for the method are valid - and MISSING_METHODS[meth].validate( - **opts - ) # Validate the options using the method's validator; defaults to True if no validation is needed + # Ensure the method is registered in MISSING_METHODS + meth in MISSING_METHODS + # Check if all options provided for the method are valid + and all(opt in OPTIONS[MISSING_OPTIONS][meth] for opt in opts.keys()) + # Validate the options using the method's validator; defaults to True if no validation is needed + and MISSING_METHODS[meth].validate(**(OPTIONS[MISSING_OPTIONS][meth] | opts)) for meth, opts in mopts.items() # Iterate over each method and its options in mopts ) @@ -66,7 +65,7 @@ def _valid_missing_options(mopts): METADATA_LOCALES: _valid_locales, DATA_VALIDATION: _LOUDNESS_OPTIONS.__contains__, CF_COMPLIANCE: _LOUDNESS_OPTIONS.__contains__, - CHECK_MISSING: lambda meth: meth != "from_context" and meth in MISSING_METHODS, + CHECK_MISSING: lambda meth: meth in MISSING_METHODS or meth == "skip", MISSING_OPTIONS: _valid_missing_options, RUN_LENGTH_UFUNC: _RUN_LENGTH_UFUNC_OPTIONS.__contains__, SDBA_EXTRA_OUTPUT: lambda opt: isinstance(opt, bool), @@ -111,11 +110,11 @@ def register_missing_method(name: str) -> Callable: """ def _register_missing_method(cls): - sig = signature(cls.is_missing) + sig = signature(cls.__init__) opts = { key: param.default if param.default != param.empty else None for key, param in sig.parameters.items() - if key not in ["self", "null", "count"] + if key not in ["self"] } MISSING_METHODS[name] = cls diff --git a/tests/test_missing.py b/tests/test_missing.py index b6523de08..4be89d2f6 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -10,7 +10,7 @@ K2C = 273.15 -class TestMissingBase: +class test_expected_count: """The base class is well tested for daily input through the subclasses.""" def test_3hourly_input(self, random): @@ -18,39 +18,39 @@ def test_3hourly_input(self, random): n = 21 * 8 time = xr.cftime_range(start="2002-01-01", periods=n, freq="3h") ts = xr.DataArray(random.random(n), dims="time", coords={"time": time}) - mb = missing.MissingBase(ts, freq="MS", src_timestep="3h") + count = missing.expected_count(ts, freq="MS", src_timestep="3h") # Make sure count is 31 * 8, because we're requesting a MS freq. - assert mb.count == 31 * 8 + assert count == 31 * 8 def test_monthly_input(self, random): """Creating array with 11 months.""" n = 11 time = xr.cftime_range(start="2002-01-01", periods=n, freq="ME") ts = xr.DataArray(random.random(n), dims="time", coords={"time": time}) - mb = missing.MissingBase(ts, freq="YS", src_timestep="ME") + count = missing.expected_count(ts, freq="YS", src_timestep="ME") # Make sure count is 12, because we're requesting a YS freq. - assert mb.count == 12 + assert count == 12 n = 5 time = xr.cftime_range(start="2002-06-01", periods=n, freq="MS") ts = xr.DataArray(random.random(n), dims="time", coords={"time": time}) - mb = missing.MissingBase(ts, freq="YS", src_timestep="MS", season="JJA") - assert mb.count == 3 + count = missing.expected_count(ts, freq="YS", src_timestep="MS", season="JJA") + assert count == 3 def test_seasonal_input(self, random): """Creating array with 11 seasons.""" n = 11 time = xr.cftime_range(start="2002-04-01", periods=n, freq="QS-JAN") ts = xr.DataArray(random.random(n), dims="time", coords={"time": time}) - mb = missing.MissingBase(ts, freq="YS", src_timestep="QS-JAN") + count = missing.expected_count(ts, freq="YS", src_timestep="QS-JAN") # Make sure count is 12, because we're requesting a YS freq. - np.testing.assert_array_equal(mb.count, [4, 4, 4, 1]) + np.testing.assert_array_equal(count, [4, 4, 4, 1]) with pytest.raises( NotImplementedError, match="frequency that is not aligned with the source timestep.", ): - missing.MissingBase(ts, freq="YS", src_timestep="QS-DEC") + missing.expect_count(ts, freq="YS", src_timestep="QS-DEC") class TestMissingAnyFills: From 5a969261c481195ddad862450b05b5684dc3d422 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 24 Jan 2025 10:44:11 -0500 Subject: [PATCH 02/10] Avoid circular imports --- src/xclim/core/missing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index 6aa82ecdf..7413cb890 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -40,8 +40,6 @@ OPTIONS, register_missing_method, ) -from xclim.indices import run_length as rl -from xclim.indices.helpers import resample_map __all__ = [ "at_least_n_valid", @@ -427,6 +425,9 @@ def _validate_src_timestep(self, src_timestep): def is_missing( self, null: xr.DataArray, count: xr.DataArray, freq: str ) -> xr.DataArray: + from xclim.indices import run_length as rl + from xclim.indices.helpers import resample_map + nullr = null.resample(time=freq) # Check total number of days From cd7c81cf2938a28f36b248fe1127950ff161de60 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 24 Jan 2025 10:44:44 -0500 Subject: [PATCH 03/10] Avoid circular imports --- src/xclim/core/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index 7413cb890..0370c92d8 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -555,7 +555,7 @@ def at_least_n_valid( # noqa: D103 # numpydoc ignore=GL08 return AtLeastNValid(n=n, subfreq=subfreq)(da, freq, src_timestep, **indexer) -def missing_from_context( # noqa: D103 # numpydoc ignore=GL08 +def missing_from_context( da: xr.DataArray, freq: str, src_timestep: str | None = None, **indexer ) -> xr.DataArray: """Mask periods as missing according to the algorithm and options set in xclim's global options. From bb53816212245bc73f10a6addcdb2393c738e9e6 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 24 Jan 2025 10:58:21 -0500 Subject: [PATCH 04/10] Better docstrings --- src/xclim/core/missing.py | 61 ++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index 0370c92d8..0ba4ed7a2 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -24,6 +24,8 @@ from __future__ import annotations +import textwrap + import numpy as np import xarray as xr @@ -155,7 +157,8 @@ def expected_count( class MissingBase: - r"""Base class used to determined where Indicator outputs should be masked. + r""" + Base class used to determined where Indicator outputs should be masked. Subclasses should implement the ``is_missing``, ``validate`` and ``__init__`` methods. The ``__init__`` is to be implement in order to change the docstring @@ -177,7 +180,8 @@ def __init__(self, **options): @staticmethod def validate(**options): - r"""Validates optional arguments. + r""" + Validates optional arguments. Parameters ---------- @@ -193,7 +197,8 @@ def validate(**options): @staticmethod def is_null(da: xr.DataArray, **indexer) -> xr.DataArray: - r"""Return a boolean array indicating which values are null. + r""" + Return a boolean array indicating which values are null. Parameters ---------- @@ -231,7 +236,8 @@ def is_missing( count: xr.DataArray, freq: str | None, ) -> xr.DataArray: - """Return whether the values within each period should be considered missing or not. + """ + Return whether the values within each period should be considered missing or not. Must be implemented by subclasses. @@ -329,7 +335,8 @@ def is_missing( class MissingTwoSteps(MissingBase): - r"""Base class used to determined where Indicator outputs should be masked, + r""" + Base class used to determined where Indicator outputs should be masked, in a two step process. In addition to what :py:class:`MissingBase` does, subclasses first perform the mask @@ -365,6 +372,12 @@ def __call__( values, month=1 to select January, or month=[6,7,8] to select summer months. If not indexer is given, all values are considered. See :py:func:`xclim.core.calendar.select_time`. + + Returns + ------- + DataArray + Boolean array at the resampled frequency, + True on the periods that should be considered missing or invalid. """ subfreq = self.options["subfreq"] or freq if ( @@ -386,7 +399,8 @@ def __call__( @register_missing_method("wmo") class MissingWMO(MissingTwoSteps): - """Mask periods as missing using the WMO criteria for missing days. + """ + Mask periods as missing using the WMO criteria for missing days. The World Meteorological Organisation recommends that where monthly means are computed from daily values, it should be considered missing if either of these two criteria are met: @@ -404,7 +418,8 @@ class MissingWMO(MissingTwoSteps): """ def __init__(self, nm: int = 11, nc: int = 5): - """Create a MissingWMO object. + """ + Create a MissingWMO object. Parameters ---------- @@ -448,7 +463,8 @@ class MissingPct(MissingTwoSteps): """Mask periods as missing when there are more then a given percentage of missing days.""" def __init__(self, tolerance: float = 0.1, subfreq: str | None = None): - """Create a MissingPct object. + """ + Create a MissingPct object. Parameters ---------- @@ -480,7 +496,8 @@ class AtLeastNValid(MissingTwoSteps): r"""Mask periods as missing if they don't have at least a given number of valid values (ignoring the expected count of elements).""" def __init__(self, n: int = 20, subfreq: str | None = None): - """Create a AtLeastNValid object. + """ + Create a AtLeastNValid object. Parameters ---------- @@ -523,9 +540,9 @@ def missing_any( # noqa: D103 # numpydoc ignore=GL08 def missing_wmo( # noqa: D103 # numpydoc ignore=GL08 da: xr.DataArray, freq: str, + src_timestep: str | None = None, nm: int = 11, nc: int = 5, - src_timestep: str | None = None, **indexer, ) -> xr.DataArray: return MissingWMO(nm=nm, nc=nc)(da, freq, src_timestep, **indexer) @@ -534,8 +551,8 @@ def missing_wmo( # noqa: D103 # numpydoc ignore=GL08 def missing_pct( # noqa: D103 # numpydoc ignore=GL08 da: xr.DataArray, freq: str, - tolerance: float, src_timestep: str | None = None, + tolerance: float = 0.1, subfreq: str | None = None, **indexer, ) -> xr.DataArray: @@ -547,8 +564,8 @@ def missing_pct( # noqa: D103 # numpydoc ignore=GL08 def at_least_n_valid( # noqa: D103 # numpydoc ignore=GL08 da: xr.DataArray, freq: str, - n: int = 20, src_timestep: str | None = None, + n: int = 20, subfreq: str | None = None, **indexer, ) -> xr.DataArray: @@ -558,7 +575,8 @@ def at_least_n_valid( # noqa: D103 # numpydoc ignore=GL08 def missing_from_context( da: xr.DataArray, freq: str, src_timestep: str | None = None, **indexer ) -> xr.DataArray: - """Mask periods as missing according to the algorithm and options set in xclim's global options. + """ + Mask periods as missing according to the algorithm and options set in xclim's global options. The options can be manipulated with :py:func:`xclim.core.options.set_options`. @@ -589,27 +607,30 @@ def missing_from_context( def _get_convenient_doc(cls): - maindoc = cls.__doc__ - initdoc = cls.__init__.__doc__ - calldoc = cls.__call__.__doc__ + maindoc = textwrap.dedent(cls.__doc__) + initdoc = textwrap.dedent(cls.__init__.__doc__) + calldoc = textwrap.dedent(cls.__call__.__doc__) params = [] ip = 10000 for i, line in enumerate(initdoc.split("\n")): if line.strip() == "Parameters": ip = i - if i >= ip + 2: + if i >= ip + 2 and line.strip(): params.append(line) - doc = [maindoc, ""] + doc = [maindoc] + if "\n" not in maindoc: + doc.append("") + ip = 10000 for i, line in enumerate(calldoc.split("\n")): if line.strip() == "Parameters": ip = i + elif "**indexer" in line: + doc.extend(params) if i >= ip: doc.append(line) - if i >= ip + 1: - doc.extend(params) return "\n".join(doc) From ec6ebf66ab15a41e0804734d5e10e925d7772c30 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 24 Jan 2025 11:26:54 -0500 Subject: [PATCH 05/10] Remove unnecessary crit in WMO - upd changes --- CHANGELOG.rst | 10 +++++++--- src/xclim/core/missing.py | 5 +---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7a3ec3706..9b7cd1448 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,7 +4,11 @@ Changelog v0.55.0 (unreleased) -------------------- -Contributors to this version: Juliette Lavoie (:user:`juliettelavoie`), Trevor James Smith (:user:`Zeitsperre`). +Contributors to this version: Juliette Lavoie (:user:`juliettelavoie`), Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`). + +Breaking changes +^^^^^^^^^^^^^^^^ +* Missing value method "WMO" was modified to remove the criterion that the timeseries needs to be continuous (without holes). (:pull:`2058`). New indicators ^^^^^^^^^^^^^^ @@ -16,13 +20,13 @@ New features and enhancements * New function ``ensemble.partition.general_partition`` (:pull:`2035`) * Added a new ``xclim.indices.generic.bivariate_count_occurrences`` function to count instances where operations and performed and validated for two variables. (:pull:`2030`). * `xclim` now tracks energy usage and carbon emissions ("last run", "average", and "total") during CI workflows using the `eco-ci-energy-estimation` GitHub Action. (:pull:`2046`). -* Missing values method "pct" and "at_least_n" now accept a new "subfreq" option that allows to compute the missing mask in two steps. When given, the algorithm is applied at this "subfreq" resampling frequency first and then the result is resampled at the target indicator frequency. In the output, a period is invalid if any of its subgroup where flagged as invalid by the chosen method. +* Missing values method "pct" and "at_least_n" now accept a new "subfreq" option that allows to compute the missing mask in two steps. When given, the algorithm is applied at this "subfreq" resampling frequency first and then the result is resampled at the target indicator frequency. In the output, a period is invalid if any of its subgroup where flagged as invalid by the chosen method. (:pull:`2058`, :issue:`1820`). Internal changes ^^^^^^^^^^^^^^^^ * `sphinx-codeautolink` and `pygments` have been temporarily pinned due to breaking API changes. (:pull:`2030`). * Adjusted the ``TestOfficialYaml`` test to use a dynamic method for finding the installed location of `xclim`. (:pull:`2028`). -* Refactor of the ``xclim.core.missing`` module, usage of the ``Missing`` objects has been broken. +* Refactor of the ``xclim.core.missing`` module, usage of the ``Missing`` objects has been broken. (:pull:`2058`, :issue:`1820`, :issue:`2000`). - Objects are initialized with their options and then called with the data, input frequency, target frequency and indexer. - Subclasses receive non-resampled DataArray in their ``is_missing`` methods. - ``MissingWMO`` now uses ``xclim.indices.helpers.resample_map`` which should greatly improve performance in a dask context. diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index 0ba4ed7a2..69fda5efe 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -445,9 +445,6 @@ def is_missing( nullr = null.resample(time=freq) - # Check total number of days - cond0 = nullr.count(dim="time") != count - # Check if more than threshold is missing cond1 = nullr.sum(dim="time") >= self.options["nm"] @@ -455,7 +452,7 @@ def is_missing( longest_run = resample_map(null, "time", freq, rl.longest_run, map_blocks=True) cond2 = longest_run >= self.options["nc"] - return cond0 | cond1 | cond2 + return cond1 | cond2 @register_missing_method("pct") From 5c27e2d416a76a7dd70e5ca01965a0b7db0150dc Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 24 Jan 2025 11:43:23 -0500 Subject: [PATCH 06/10] Fix tests - fix wmo --- src/xclim/core/missing.py | 4 +++- tests/test_indicators.py | 4 ---- tests/test_missing.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index 69fda5efe..ff9e64628 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -445,8 +445,10 @@ def is_missing( nullr = null.resample(time=freq) + # Total number of missing or invalid days + missing_days = (count - nullr.count(dim="time")) + nullr.sum(dim="time") # Check if more than threshold is missing - cond1 = nullr.sum(dim="time") >= self.options["nm"] + cond1 = missing_days >= self.options["nm"] # Check for consecutive missing values longest_run = resample_map(null, "time", freq, rl.longest_run, map_blocks=True) diff --git a/tests/test_indicators.py b/tests/test_indicators.py index 20f71fdac..d53e7dc73 100644 --- a/tests/test_indicators.py +++ b/tests/test_indicators.py @@ -417,10 +417,6 @@ def test_missing(tas_series): assert not m[0].isnull() assert "check_missing=pct, missing_options={'tolerance': 0.05}" in m.history - with xclim.set_options(check_missing="wmo"): - m = uniIndTemp(a, freq="YS") - assert m[0].isnull() - # With freq=None c = uniClim(a) assert c.isnull() diff --git a/tests/test_missing.py b/tests/test_missing.py index 4be89d2f6..1eb389db4 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -185,7 +185,7 @@ def test_missing_days(self, tas_series): assert out[2] def test_missing_days_in_quarter(self, tas_series): - a = np.arange(360.0) + a = np.arange(350.0) a[5:16] = np.nan # Number of missing values under acceptable limit in a month ts = tas_series(a) out = missing.missing_wmo(ts, freq="QS-JAN") From fe1d814eef0ed3a4847ddbc9aa836dc27f7bab16 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 24 Jan 2025 11:47:06 -0500 Subject: [PATCH 07/10] Fix missing wmo again - it is problematic still --- src/xclim/core/missing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index ff9e64628..de44ecc7a 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -450,7 +450,8 @@ def is_missing( # Check if more than threshold is missing cond1 = missing_days >= self.options["nm"] - # Check for consecutive missing values + # Check for consecutive invalid values + # FIXME: This does not take holes in consideration longest_run = resample_map(null, "time", freq, rl.longest_run, map_blocks=True) cond2 = longest_run >= self.options["nc"] From ba0b45cb798106bb3679051ab1b3faad34438169 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 24 Jan 2025 11:59:04 -0500 Subject: [PATCH 08/10] fix test --- tests/test_options.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_options.py b/tests/test_options.py index c80f9e27c..e7fa6e1da 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -20,8 +20,11 @@ ("check_missing", "wmo"), ("check_missing", "any"), ("missing_options", {"wmo": {"nm": 10, "nc": 3}}), - ("missing_options", {"pct": {"tolerance": 0.1}}), - ("missing_options", {"wmo": {"nm": 10, "nc": 3}, "pct": {"tolerance": 0.1}}), + ("missing_options", {"pct": {"subfreq": None, "tolerance": 0.1}}), + ( + "missing_options", + {"wmo": {"nm": 10, "nc": 3}, "pct": {"subfreq": None, "tolerance": 0.1}}, + ), ], ) def test_set_options_valid(option, value): From db7986ad139d0a1f7d1cb76323517505fa68dc0f Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 11 Feb 2025 09:51:10 -0500 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: David Huard --- src/xclim/core/missing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index de44ecc7a..a4300b3fd 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -78,7 +78,7 @@ def expected_count( time : xr.DataArray, optional Input time coordinate from which the final resample time coordinate is guessed. freq : str, optional. - Resampling frequency. If absent, the count for the full time range is returned. + Resampling frequency. If not given or None, the count for the full time range is returned. src_timestep : str, Optional The expected input frequency. If not given, it will be inferred from the input array. **indexer : Indexer @@ -161,7 +161,7 @@ class MissingBase: Base class used to determined where Indicator outputs should be masked. Subclasses should implement the ``is_missing``, ``validate`` and ``__init__`` - methods. The ``__init__`` is to be implement in order to change the docstring + methods. The ``__init__`` is to be implemented in order to change the docstring and signature but is not expected to do anything other than the validation of the options, everything else should happen in the call (i.e. ``is_missing``). Subclasses can also override the ``_validate_src_timestep`` method to add restrictions @@ -181,7 +181,7 @@ def __init__(self, **options): @staticmethod def validate(**options): r""" - Validates optional arguments. + Validate optional arguments. Parameters ---------- @@ -246,7 +246,7 @@ def is_missing( null : DataArray Boolean array of invalid values (that has already been indexed). count : DataArray - Indexer-aware integer array of expected elements at the resampling frequency. + Indexer-aware integer array of number of expected elements at the resampling frequency. freq : str or None The resampling frequency, or None if the temporal dimension is to be collapsed. @@ -273,7 +273,7 @@ def __call__( da : xr.DataArray Input data, must have a "time" coordinate. freq : str, optional - Resampling frequency. If absent, a collapse of the temporal dimension is assumed. + Resampling frequency. If None, a collapse of the temporal dimension is assumed. src_timestep : str, optional The expected source input frequency. If not given, it will be inferred from the input array. **indexer : Indexer @@ -318,7 +318,7 @@ def __repr__(self): @register_missing_method("any") class MissingAny(MissingBase): - """Mask periods as missing if in any of its elements is missing or invalid.""" + """Mask periods as missing if any of its elements is missing or invalid.""" def __init__(self): """Create a MissingAny object.""" @@ -364,13 +364,13 @@ def __call__( da : xr.DataArray Input data, must have a "time" coordinate. freq : str, optional - Target resampling frequency. If absent, a collapse of the temporal dimension is assumed. + Target resampling frequency. If None, a collapse of the temporal dimension is assumed. src_timestep : str, optional The expected source input frequency. If not given, it will be inferred from the input array. **indexer : Indexer Time attribute and values over which to subset the array. For example, use season='DJF' to select winter values, month=1 to select January, or month=[6,7,8] to select summer months. - If not indexer is given, all values are considered. + If no indexer is given, all values are considered. See :py:func:`xclim.core.calendar.select_time`. Returns From 34e8170b40bbdd410edaa59497b11472145d179a Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 11 Feb 2025 09:55:54 -0500 Subject: [PATCH 10/10] add comment about todo --- src/xclim/core/missing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xclim/core/missing.py b/src/xclim/core/missing.py index a4300b3fd..61349e230 100644 --- a/src/xclim/core/missing.py +++ b/src/xclim/core/missing.py @@ -334,6 +334,7 @@ def is_missing( return cond0 | cond1 +# TODO: Make coarser method controllable. class MissingTwoSteps(MissingBase): r""" Base class used to determined where Indicator outputs should be masked,