diff --git a/examples/benchmark.md b/examples/benchmark.md index be5a73bf..e22b8991 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -305,6 +305,32 @@ except ModuleNotFoundError: For the example, we use a simple MLP model with 3 layers of neurons. Then we train the model without taking a group on the stations +```python +import numpy as np +from qolmat.imputations.imputers_pytorch import ImputerDiffusion +from qolmat.imputations.diffusions.ddpms import TabDDPM + +X = np.array([[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]]) +imputer = ImputerDiffusion(model=TabDDPM(random_state=11), epochs=50, batch_size=1) + +imputer.fit_transform(X) +``` + +```python +import numpy as np +from qolmat.imputations.imputers_pytorch import ImputerDiffusion +from qolmat.imputations.diffusions.ddpms import TabDDPM + +X = np.array([[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]]) +imputer = ImputerDiffusion(model=TabDDPM(random_state=11), epochs=50, batch_size=1) + +imputer.fit_transform(X) +``` + +```python +1.33573675, 1.40472937 +``` + ```python fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols)) for i_station, (station, df) in enumerate(df_data.groupby("station")): diff --git a/qolmat/analysis/__init__.py b/qolmat/analysis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qolmat/analysis/holes_characterization.py b/qolmat/analysis/holes_characterization.py index 8bd85b9d..5669ac7b 100644 --- a/qolmat/analysis/holes_characterization.py +++ b/qolmat/analysis/holes_characterization.py @@ -34,8 +34,9 @@ class LittleTest(McarTest): imputer : Optional[ImputerEM] Imputer based on the EM algorithm. The 'model' attribute must be equal to 'multinormal'. If None, the default ImputerEM is taken. - random_state : Union[None, int, np.random.RandomState], optional - Controls the randomness of the fit_transform, by default None + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. """ def __init__( diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py index bd3818f8..65b6d6ea 100644 --- a/qolmat/benchmark/missing_patterns.py +++ b/qolmat/benchmark/missing_patterns.py @@ -63,8 +63,9 @@ class _HoleGenerator: Names of the columns for which holes must be created, by default None ratio_masked : Optional[float] Ratio of values ​​to mask, by default 0.05. - random_state : Optional[int] - The seed used by the random number generator, by default 42. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data """ @@ -150,8 +151,9 @@ class UniformHoleGenerator(_HoleGenerator): Names of the columns for which holes must be created, by default None ratio_masked : Optional[float], optional Ratio of masked values ​​to add, by default 0.05. - random_state : Optional[int], optional - The seed used by the random number generator, by default 42. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. sample_proportional: bool, optional If True, generates holes in target columns with same equal frequency. If False, reproduces the empirical proportions between the variables. @@ -215,8 +217,9 @@ class _SamplerHoleGenerator(_HoleGenerator): Names of the columns for which holes must be created, by default None ratio_masked : Optional[float], optional Ratio of masked values ​​to add, by default 0.05. - random_state : Optional[int], optional - The seed used by the random number generator, by default 42. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data """ @@ -321,8 +324,9 @@ class GeometricHoleGenerator(_SamplerHoleGenerator): Names of the columns for which holes must be created, by default None ratio_masked : Optional[float], optional Ratio of masked values ​​to add, by default 0.05. - random_state : Union[None, int, np.random.RandomState], optional - The seed used by the random number generator, by default 42. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data """ @@ -390,8 +394,9 @@ class EmpiricalHoleGenerator(_SamplerHoleGenerator): Names of the columns for which holes must be created, by default None ratio_masked : Optional[float], optional Ratio of masked values ​​to add, by default 0.05. - random_state : Optional[int], optional - The seed used by the random number generator, by default 42. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data """ @@ -485,8 +490,9 @@ class MultiMarkovHoleGenerator(_HoleGenerator): Names of the columns for which holes must be created, by default None ratio_masked : Optional[float], optional Ratio of masked values to add, by default 0.05 - random_state : Optional[int], optional - The seed used by the random number generator, by default 42. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data """ @@ -634,8 +640,9 @@ class GroupedHoleGenerator(_HoleGenerator): Names of the columns for which holes must be created, by default None ratio_masked : Optional[float], optional Ratio of masked to add, by default 0.05 - random_state : Optional[int], optional - The seed used by the random number generator, by default 42. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. groups : Tuple[str, ...] Names of the columns forming the groups, by default [] """ diff --git a/qolmat/imputations/diffusions/ddpms.py b/qolmat/imputations/diffusions/ddpms.py index e4401272..231f870e 100644 --- a/qolmat/imputations/diffusions/ddpms.py +++ b/qolmat/imputations/diffusions/ddpms.py @@ -1,16 +1,17 @@ -from typing import Dict, List, Callable, Tuple +from typing import Dict, List, Callable, Tuple, Union from typing_extensions import Self -import math +import sys import numpy as np import pandas as pd import time from datetime import timedelta from tqdm import tqdm -import gc import torch from torch.utils.data import DataLoader, TensorDataset from sklearn import preprocessing +from sklearn import utils as sku + from qolmat.imputations.diffusions.base import AutoEncoder, ResidualBlock, ResidualBlockTS from qolmat.imputations.diffusions.utils import get_num_params @@ -39,6 +40,7 @@ def __init__( p_dropout: float = 0.0, num_sampling: int = 1, is_clip: bool = True, + random_state: Union[None, int, np.random.RandomState] = None, ): """Diffusion model for tabular data based on Denoising Diffusion Probabilistic Models (DDPM) of @@ -68,6 +70,9 @@ def __init__( Dropout probability, by default 0.0 num_sampling : int, optional Number of samples generated for each cell, by default 1 + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. """ self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") @@ -108,6 +113,9 @@ def __init__( self.is_clip = is_clip self.normalizer_x = preprocessing.StandardScaler() + self.random_state = sku.check_random_state(random_state) + seed_torch = self.random_state.randint(2**31 - 1) + torch.manual_seed(seed_torch) def _q_sample(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Section 3.2, algorithm 1 formula implementation. Forward process, defined by `q`. @@ -345,7 +353,6 @@ def fit( round: int = 10, cols_imputed: Tuple[str, ...] = (), ) -> Self: - """Fit data Parameters @@ -537,6 +544,7 @@ def __init__( p_dropout: float = 0.0, num_sampling: int = 1, is_rolling: bool = False, + random_state: Union[None, int, np.random.RandomState] = None, ): """Diffusion model for time-series data based on the works of Ho et al., 2020 (https://arxiv.org/abs/2006.11239), @@ -575,6 +583,9 @@ def __init__( Number of samples generated for each cell, by default 1 is_rolling : bool, optional Use pandas.DataFrame.rolling for preprocessing data, by default False + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. """ super().__init__( num_noise_steps, @@ -586,6 +597,7 @@ def __init__( num_blocks, p_dropout, num_sampling, + random_state=random_state, ) self.dim_feedforward = dim_feedforward diff --git a/qolmat/imputations/imputers_pytorch.py b/qolmat/imputations/imputers_pytorch.py index c2ee8a4a..1cf7d5d3 100644 --- a/qolmat/imputations/imputers_pytorch.py +++ b/qolmat/imputations/imputers_pytorch.py @@ -568,6 +568,17 @@ def __init__( freq_str : str Frequency string of DateOffset of Pandas. It is for processing time-series data, used in diffusion models e.g., TsDDPM. + + Examples + -------- + >>> import numpy as np + >>> from qolmat.imputations.imputers_pytorch import ImputerDiffusion + >>> from qolmat.imputations.diffusions.ddpms import TabDDPM + >>> + >>> X = np.array([[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]]) + >>> imputer = ImputerDiffusion(model=TabDDPM(random_state=11), epochs=50, batch_size=1) + >>> + >>> df_imputed = imputer.fit_transform(X) """ super().__init__(groups=groups, columnwise=False) self.model = model