From e8e3d13aebc5426985a8c665e6d302300ee06f05 Mon Sep 17 00:00:00 2001
From: Ivo Facoco
Date: Thu, 20 Feb 2025 13:03:45 +0000
Subject: [PATCH 1/7] Reworked TimeSeries feature extraction to follow project structure

fix: implemented BaseTSExtractor feature extractor parent class
fix: derived TSFEL child class from BaseTSExtractor
fix: added TimeSeries feature extractor factory class
---
 src/pymdma/api/hooks.py                   |  4 +-
 src/pymdma/common/definitions.py          |  6 +-
 src/pymdma/time_series/input_layer.py     | 14 +--
 src/pymdma/time_series/models/__init__.py |  4 +
 .../extractor.py}                         | 86 +++----------------
 src/pymdma/time_series/models/features.py | 24 ++++++
 src/pymdma/time_series/models/tsfel.py    | 64 ++++++++++++++
 tests/conftest.py                         |  6 +-
 8 files changed, 124 insertions(+), 84 deletions(-)
 rename src/pymdma/time_series/{utils/extract_features.py => models/extractor.py} (51%)
 create mode 100644 src/pymdma/time_series/models/features.py
 create mode 100644 src/pymdma/time_series/models/tsfel.py

diff --git a/src/pymdma/api/hooks.py b/src/pymdma/api/hooks.py
index c261a47..113ff6b 100644
--- a/src/pymdma/api/hooks.py
+++ b/src/pymdma/api/hooks.py
@@ -1,12 +1,12 @@
 from loguru import logger
 
 from ..image.models.features import ExtractorFactory as ImageFeatureExtractor
-from ..time_series.utils.extract_features import FeatureExtractor as TimeSeriesFeatureExtractor
+from ..time_series.models.features import ExtractorFactory as TimeSeriesFeatureExtractor
 
 
 def load_models_hook(ml_models, device="cpu"):
     logger.info("Loading ml models")
     # feature extractors
     ml_models["dino_vits8"] = ImageFeatureExtractor.model_from_name("dino_vits8").to(device)
-    ml_models["tsfel"] = TimeSeriesFeatureExtractor("tsfel", device)
+    ml_models["tsfel"] = TimeSeriesFeatureExtractor.model_from_name("tsfel")
 
     logger.info("Models loaded successfully")
diff --git a/src/pymdma/common/definitions.py b/src/pymdma/common/definitions.py
index eb31468..20a933c 100644
--- a/src/pymdma/common/definitions.py
+++ b/src/pymdma/common/definitions.py
@@ -79,5 +79,9 @@ def __init__(self, name: str) -> None:
         self.name = name
 
     @abstractmethod
-    def _extract_features_dataloader(self, dataloader):
+    def extract_features_from_files(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def _extract_features_dataloader(self, dataloader, **kwargs):
         pass
diff --git a/src/pymdma/time_series/input_layer.py b/src/pymdma/time_series/input_layer.py
index e63dee5..67c3436 100644
--- a/src/pymdma/time_series/input_layer.py
+++ b/src/pymdma/time_series/input_layer.py
@@ -10,7 +10,7 @@
 from pymdma.constants import ReferenceType, ValidationDomain
 
 from .data.simple_dataset import SimpleDataset
-from .utils.extract_features import FeatureExtractor
+from .models.features import ExtractorFactory
 
 # Get the absolute path of the parent directory
 parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -176,16 +176,18 @@ def get_embeddings(
         if model_instances is not None:
             if model_name in model_instances:
                 extractor = model_instances[model_name]
-            elif model_name == "default" and FeatureExtractor.default in model_instances:
-                extractor = model_instances[FeatureExtractor.default]
-        extractor = FeatureExtractor(model_name, device=self.device) if extractor is None else extractor
+            elif model_name == "default" and ExtractorFactory.default in model_instances:
+                extractor = model_instances[ExtractorFactory.default]
+        if extractor is None:
+            model_name = ExtractorFactory.default if model_name == "default" else model_name
+            extractor = ExtractorFactory.model_from_name(model_name) if extractor
is None else extractor - reference_features, _labels, _ = extractor.extract_features_dataloader( + reference_features, _labels, _ = extractor._extract_features_dataloader( self.reference_loader, self.reference_loader.dataset.fs, self.reference_loader.dataset.dims, ) - synth_features, _labels, self.instance_ids = extractor.extract_features_dataloader( + synth_features, _labels, self.instance_ids = extractor._extract_features_dataloader( self.target_loader, self.target_loader.dataset.fs, self.target_loader.dataset.dims, diff --git a/src/pymdma/time_series/models/__init__.py b/src/pymdma/time_series/models/__init__.py index e69de29..900d5f1 100644 --- a/src/pymdma/time_series/models/__init__.py +++ b/src/pymdma/time_series/models/__init__.py @@ -0,0 +1,4 @@ +from .features import ExtractorFactory +from .tsfel import TSFEL + +__all__ = ["ExtractorFactory", "TSFEL"] diff --git a/src/pymdma/time_series/utils/extract_features.py b/src/pymdma/time_series/models/extractor.py similarity index 51% rename from src/pymdma/time_series/utils/extract_features.py rename to src/pymdma/time_series/models/extractor.py index 24583d4..c8c1016 100644 --- a/src/pymdma/time_series/utils/extract_features.py +++ b/src/pymdma/time_series/models/extractor.py @@ -1,48 +1,29 @@ from pathlib import Path -from typing import List +from typing import Callable, List, Union import numpy as np +import torch import tsfel +from torch import nn from torch.utils.data import DataLoader +from pymdma.common.definitions import EmbedderInterface + from ..data.simple_dataset import _read_sig_file -class FeatureExtractor: +class BaseTSExtractor(nn.Module, EmbedderInterface): default: str = "tsfel" + extractor: Union[nn.Module, Callable] = None def __init__( self, - name: str, - device: str = "cpu", **kwargs, ): - """Initializes the feature extractor with the given parameters. - - Parameters - ---------- - name: str - identifier of the extractor to be used. - device: str - model device. Defaults to "cpu". - **kwargs: Additional keyword arguments. - - Raises - ------ - ValueError - if invalid variable "name" is provided for the extractor. - """ - self.name = name if name != "default" else "tsfel" - self.device = device - - if self.name == "tsfel": - self.extractor = TSFEL() - else: - raise ValueError(f"Invalid extractor name: {self.name}") - - if self.name != "tsfel": - self.extractor._model.to(device) + super().__init__() + pass + @torch.no_grad() def extract_features_from_files(self, files: List[Path], fs: int, dims: List, batch_size: int = 4): """Extract features from a list of image files. @@ -72,12 +53,13 @@ def extract_features_from_files(self, files: List[Path], fs: int, dims: List, ba for bsize in batch_sizes: end = start + bsize signals = [_read_sig_file(f) for f in files[start:end]] - batch = self.extractor.extract(signals, fs, dims) + batch = self(signals, fs, dims) act_array.append(batch) start += bsize return np.concatenate(act_array, axis=0) - def extract_features_dataloader(self, dataloader: DataLoader, fs: int, dims: List): + @torch.no_grad() + def _extract_features_dataloader(self, dataloader: DataLoader, fs: int, dims: List): """Use selected approach to extract features from all signals in the dataloader. 
@@ -101,50 +83,10 @@ def extract_features_dataloader(self, dataloader: DataLoader, fs: int, dims: Lis
         ids_array = []
 
         for batch, labels, signal_ids in dataloader:
-            batch_feat = self.extractor.extract(batch, fs, dims)
+            batch_feat = self(batch, fs, dims)
             act_array.append(batch_feat)
             labels_array.extend(labels)
             ids_array.extend(signal_ids)
 
         features = np.concatenate(act_array, axis=0)
-
         return features, labels_array, ids_array
-
-
-class TSFEL:
-    def __init__(self, domains=None):
-        # Generate default domain value
-        if domains is None:
-            domains = ["temporal", "statistical", "spectral"]
-        self.domains = domains
-
-    def extract(self, batch_windows, fs, dims):
-        """Extracts features from a batch of samples.
-
-        Parameters
-        ----------
-        batch_windows: List
-            Batch of signals with len(dims) chans.
-        fs: int
-            Sampling frequency
-        dims: List(str)
-            list with the names of each signal dimension/channel ex: name of each ECG Lead
-
-        Returns
-        -------
-        features: DataFrame
-            DataFrame with the features from each batch.
-        """
-        cfg_file = {}
-        for domain in self.domains:
-            cfg_file.update(tsfel.get_features_by_domain(domain))
-
-        features = tsfel.time_series_features_extractor(
-            cfg_file,
-            batch_windows,
-            fs=fs,
-            window_size=None,
-            header_names=dims,
-        )
-
-        return features
diff --git a/src/pymdma/time_series/models/features.py b/src/pymdma/time_series/models/features.py
new file mode 100644
index 0000000..198006c
--- /dev/null
+++ b/src/pymdma/time_series/models/features.py
@@ -0,0 +1,24 @@
+from typing import List, Optional
+
+from .tsfel import TSFEL
+
+
+class ExtractorFactory:
+    default = "tsfel"
+
+    @staticmethod
+    def model_from_name(
+        name: str,
+        domains: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """Initializes the feature extractor with the given parameters.
+
+        Args:
+            name (str): identifier of the extractor to be used.
+            domains (Optional[List[str]]): feature domains to extract. Defaults to all TSFEL domains.
+        """
+        if name == "tsfel":
+            return TSFEL(domains, **kwargs)
+        else:
+            raise ValueError(f"Model {name} not available.")
diff --git a/src/pymdma/time_series/models/tsfel.py b/src/pymdma/time_series/models/tsfel.py
new file mode 100644
index 0000000..d12d9e7
--- /dev/null
+++ b/src/pymdma/time_series/models/tsfel.py
@@ -0,0 +1,64 @@
+from typing import List, Optional
+
+import tsfel
+
+from .extractor import BaseTSExtractor
+
+
+class TSFEL(BaseTSExtractor):
+    def __init__(
+        self,
+        domains: Optional[List[str]] = None,
+        verbose: bool = False,
+    ):
+        """Initializes the TSFEL feature extractor with the specified domains
+        and verbosity.
+
+        Parameters
+        ----------
+        domains : Optional[List[str]]
+            A list of domains to extract features from. If None, the default domains
+            ["temporal", "statistical", "spectral"] will be used.
+        verbose : bool
+            If True, enables verbose output during feature extraction.
+        """
+
+        # Generate default domain value
+        if domains is None:
+            domains = ["temporal", "statistical", "spectral"]
+        self.domains = domains
+        self.verbose = verbose
+
+        # update domain configurations
+        self.cfg_file = {}
+        for domain in self.domains:
+            self.cfg_file.update(tsfel.get_features_by_domain(domain))
+
+    def __call__(self, batch_windows, fs, dims):
+        """Extracts features from a batch of samples.
+
+        Parameters
+        ----------
+        batch_windows: List
+            Batch of signals, each with len(dims) channels.
+        fs: int
+            Sampling frequency.
+        dims: List[str]
+            Names of each signal dimension/channel, e.g. the name of each ECG lead.
+
+        Returns
+        -------
+        features: DataFrame
+            DataFrame with the features from each batch.
+        """
+        print(self.verbose, int(self.verbose))
+        features = tsfel.time_series_features_extractor(
+            self.cfg_file,
+            batch_windows,
+            fs=fs,
+            window_size=None,
+            header_names=dims,
+            verbose=int(self.verbose),
+        )
+
+        return features
diff --git a/tests/conftest.py b/tests/conftest.py
index db053bb..f294b9c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,7 +18,7 @@
 from pymdma.image.models.features import ExtractorFactory as ImageFeatureExtractor
 from pymdma.time_series.data.simple_dataset import _read_sig_file
 from pymdma.time_series.input_layer import _get_data_files_path
-from pymdma.time_series.utils.extract_features import FeatureExtractor as TimeSeriesFeatureExtractor
+from pymdma.time_series.models.features import ExtractorFactory as TimeSeriesFeatureExtractor
 
 MODALITIES = ["image", "tabular", "time_series"]
 VALIDATION_TYPES = ["input_val", "synthesis_val"]
@@ -73,7 +73,7 @@ def get_transforms(input_size: Tuple[int], interpolation: int = Image.BILINEAR):
         [
             transforms.Resize(input_size, interpolation=interpolation),
             transforms.ToTensor(),
-        ]
+        ],
     )
 
     return get_transforms
@@ -99,7 +99,7 @@ def synth_ts_filenames():
 @pytest.fixture()
 def ts_feature_extractor():
     def get_extractor(name):
-        return TimeSeriesFeatureExtractor(name)
+        return TimeSeriesFeatureExtractor.model_from_name(name)
 
     return get_extractor
 

From b3da40ef60a03b367009d86fdf8195a5432a9527 Mon Sep 17 00:00:00 2001
From: Ivo Facoco
Date: Thu, 20 Feb 2025 13:05:47 +0000
Subject: [PATCH 2/7] fix: removed input layer logic from time_series examples notebook

fix: changed pymdma install directive in time_series notebook
---
 notebooks/time_series_examples.ipynb | 113 ++++++++++++++++++++++-----
 1 file changed, 94 insertions(+), 19 deletions(-)

diff --git a/notebooks/time_series_examples.ipynb b/notebooks/time_series_examples.ipynb
index fef7d95..9149369 100644
--- a/notebooks/time_series_examples.ipynb
+++ b/notebooks/time_series_examples.ipynb
@@ -6,7 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install \"pymdma[time_series] @ https://github.com/fraunhoferportugal/pymdma.git\" --find-links \"https://download.pytorch.org/whl/cpu/torch_stable.html\""
+    "%pip install \"pymdma[time_series]\" --find-links \"https://download.pytorch.org/whl/cpu/torch_stable.html\""
    ]
   },
   {
@@ -34,8 +34,66 @@
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
+    "import wfdb\n",
    "\n",
-    "from pymdma.time_series.input_layer import TimeSeriesInputLayer"
+    "from pymdma.time_series.models.features import ExtractorFactory"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_sig_file(file_path: Path):\n",
+    "    \"\"\"Read a signal file from the supported file extensions.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    file_path: Union[str, Path]\n",
+    "        Path to the file.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    np.ndarray\n",
+    "        Signal array read from the file.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If the file extension is not supported.\n",
+    "    \"\"\"\n",
+    "    file_path = Path(file_path)\n",
+    "    # Check if the file has a supported extension\n",
+    "    if file_path.suffix in [\".mat\", \".dat\"]:\n",
+    "        directory_path = file_path.parent\n",
+    "        return wfdb.rdsamp(directory_path / file_path.stem)[0]\n",
+    "    else:\n",
+    "        # Raise a ValueError for files with unsupported extensions\n",
+    "        raise ValueError(f\"Unsupported file extension: {Path(file_path).suffix} (file: {file_path})\")\n",
"\n", + "\n", + "def extract_fs_dims(file_path):\n", + " \"\"\"Extracts the sampling frequency and the dimension names of the signal\n", + " from a header file. Only works for this specific .hea file structure.\n", + "\n", + " Parameters\n", + " ----------\n", + " file_path: str\n", + " The path to the header file.\n", + "\n", + " Returns\n", + " -------\n", + " fs : int\n", + " Sampling frequency.\n", + " dims: List(str)\n", + " Names of the signal dimensions.\n", + " \"\"\"\n", + " with open(file_path) as f:\n", + " lines = f.readlines()\n", + " dims = [lines[i].strip().split(\" \")[-1] for i in range(1, 13)]\n", + " fs = lines[0].strip().split(\" \")[2]\n", + " return int(fs), dims" ] }, { @@ -46,23 +104,15 @@ "source": [ "parent_dir = os.path.dirname(os.getcwd())\n", "\n", - "validation_domain = \"synthesis_val\"\n", - "reference_type = \"dataset\"\n", + "# List signal files from source dirs\n", "target_data_path = Path(parent_dir + \"/data/test/time_series/synthesis_val/dataset/\")\n", "reference_data_path = Path(parent_dir + \"/data/test/time_series/synthesis_val/reference/\")\n", - "batch_size = 5\n", - "\n", - "ts_input_layer = TimeSeriesInputLayer(\n", - " validation_domain == validation_domain,\n", - " reference_type=reference_type,\n", - " target_data=target_data_path,\n", - " reference_data=reference_data_path,\n", - " batch_size=batch_size,\n", - ")\n", + "ref_sig_files = [sig for sig in reference_data_path.glob(\"**/*\") if sig.suffix in {\".mat\", \".dat\", \".csv\"}]\n", + "target_sig_files = [sig for sig in target_data_path.glob(\"**/*\") if sig.suffix in {\".mat\", \".dat\", \".csv\"}]\n", "\n", - "\n", - "# Get raw data for input validation\n", - "ref_data, target_data = ts_input_layer.get_full_samples()" + "# Read signal files\n", + "ref_data = np.array([read_sig_file(sig_file) for sig_file in ref_sig_files])\n", + "target_data = np.array([read_sig_file(sig_file) for sig_file in target_sig_files])" ] }, { @@ -104,7 +154,7 @@ " fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3))\n", " for ax, signal, score in zip(axs.flat, signals, scores):\n", " ax.plot(signal[:, 0]) # ploting only Lead I of the ECG signal\n", - " ax.set_title(f\"{metric}: {score:.2f}\")\n", + " ax.set_title(f\"{metric}: {score:.3f}\")\n", " ax.axis(\"off\")\n", " ax.set_aspect(\"auto\")\n", " # Add a title to the entire figure\n", @@ -124,6 +174,21 @@ "This section demonstrates how to use the input validation functions with the signal-to-noise ratio (`SNR`) as an example." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pymdma.time_series.measures.input_val import Uniqueness\n", + "\n", + "uniqueness = Uniqueness()\n", + "uniqueness_result = uniqueness.compute(ref_data) # compute the metric\n", + "_dataset_level, instance_level = uniqueness_result.value # fetch the instance level results\n", + "\n", + "plot_instances_score(ref_data, \"Uniqueness\", instance_level, n_cols=5)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -178,8 +243,18 @@ "metadata": {}, "outputs": [], "source": [ + "# Extract the sampling frequency and the dimension names of the signal from a header file\n", + "hea_ref = ref_sig_files[0].parent / f\"{ref_sig_files[0].stem}.hea\"\n", + "hea_target = target_sig_files[0].parent / f\"{target_sig_files[0].stem}.hea\"\n", + "ref_fs, ref_dim = extract_fs_dims(hea_ref)\n", + "target_fs, target_dim = extract_fs_dims(hea_target)\n", + "\n", + "\n", "# Get features for synthetic data quality metrics computation\n", - "ref_features, target_features = ts_input_layer.get_embeddings(\"tsfel\")\n", + "tsfel = ExtractorFactory.model_from_name(\"tsfel\", verbose=False)\n", + "ref_features = tsfel.extract_features_from_files(ref_sig_files, ref_fs, ref_dim)\n", + "target_features = tsfel.extract_features_from_files(target_sig_files, target_fs, target_dim)\n", + "# ref_features, target_features = ts_input_layer.get_embeddings(\"tsfel\")\n", "\n", "print(\"Reference features shape:\", ref_features.shape)\n", "print(\"Synthetic features shape:\", target_features.shape)" @@ -344,7 +419,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.20" + "version": "3.11.11" } }, "nbformat": 4, From 49e2bc58284f7bb61f1a5bd4481c67e7c44f6974 Mon Sep 17 00:00:00 2001 From: Ivo Facoco Date: Thu, 20 Feb 2025 13:59:17 +0000 Subject: [PATCH 3/7] fix: moved extractor import to synthesis part of notebook --- Makefile | 2 +- notebooks/image_examples.ipynb | 27 ++++++++++++++++++++++++++- notebooks/time_series_examples.ipynb | 7 +++---- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index d276c42..dacfbd0 100644 --- a/Makefile +++ b/Makefile @@ -127,7 +127,7 @@ setup-all: @echo -e "$(INFO) Creating development virtual environment...$(TERMINATOR)" && \ python3 -m venv .venv-dev && \ source .venv-dev/bin/activate && \ - pip install -U poetry<2.0.0 && \ + pip install -U "poetry<2.0.0" && \ poetry run pip install --upgrade pip setuptools && \ poetry install --with dev --all-extras && \ echo -e "$(SUCCESS) Virtual environment created successfully!$(TERMINATOR)" && \ diff --git a/notebooks/image_examples.ipynb b/notebooks/image_examples.ipynb index 8a83648..3d36199 100644 --- a/notebooks/image_examples.ipynb +++ b/notebooks/image_examples.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install \"pymdma[image] @ https://github.com/fraunhoferportugal/pymdma.git\" --extra-index-url \"https://download.pytorch.org/whl/cpu/torch_stable.html\"" + "%pip install \"pymdma[image]\" --extra-index-url \"https://download.pytorch.org/whl/cpu/torch_stable.html\"" ] }, { @@ -250,6 +250,7 @@ "source": [ "from pymdma.image.measures.input_val import MSSSIM\n", "\n", + "\n", "def generate_full_ref_dataset(dataset):\n", " distorted = []\n", " for idx, img in enumerate(dataset):\n", @@ -260,6 +261,7 @@ " distorted.append((dst).astype(np.uint8))\n", " return [np.array(x) for x in distorted]\n", "\n", + "\n", "distorted = 
generate_full_ref_dataset(dataset)\n", "\n", "mssim = MSSSIM()\n", @@ -341,6 +343,7 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", "from umap import UMAP\n", "\n", "umap = UMAP(n_components=2, random_state=10, n_jobs=1)\n", @@ -454,6 +457,28 @@ "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_idx = np.argsort(giqa_instance)[::-1][:200]\n", + "best_samples = [np.asarray(Image.open(images_synth[i])) for i in best_idx]\n", + "\n", + "best_fig = plot_instances_grid(best_samples, n_cols=25)\n", + "best_fig.suptitle(\"CIFAKE Best samples\", fontsize=16)\n", + "plt.show()\n", + "\n", + "\n", + "worst_idx = np.argsort(giqa_instance)[:200]\n", + "worst_samples = [np.asarray(Image.open(images_synth[i])) for i in worst_idx]\n", + "\n", + "worst_fig = plot_instances_grid(worst_samples, n_cols=25)\n", + "worst_fig.suptitle(\"CIFAKE Worst samples\", fontsize=16)\n", + "plt.show()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/time_series_examples.ipynb b/notebooks/time_series_examples.ipynb index 9149369..056778b 100644 --- a/notebooks/time_series_examples.ipynb +++ b/notebooks/time_series_examples.ipynb @@ -34,9 +34,7 @@ "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import wfdb\n", - "\n", - "from pymdma.time_series.models.features import ExtractorFactory" + "import wfdb" ] }, { @@ -243,6 +241,8 @@ "metadata": {}, "outputs": [], "source": [ + "from pymdma.time_series.models.features import ExtractorFactory\n", + "\n", "# Extract the sampling frequency and the dimension names of the signal from a header file\n", "hea_ref = ref_sig_files[0].parent / f\"{ref_sig_files[0].stem}.hea\"\n", "hea_target = target_sig_files[0].parent / f\"{target_sig_files[0].stem}.hea\"\n", @@ -254,7 +254,6 @@ "tsfel = ExtractorFactory.model_from_name(\"tsfel\", verbose=False)\n", "ref_features = tsfel.extract_features_from_files(ref_sig_files, ref_fs, ref_dim)\n", "target_features = tsfel.extract_features_from_files(target_sig_files, target_fs, target_dim)\n", - "# ref_features, target_features = ts_input_layer.get_embeddings(\"tsfel\")\n", "\n", "print(\"Reference features shape:\", ref_features.shape)\n", "print(\"Synthetic features shape:\", target_features.shape)" From fd04bbcc50bd2ee82b3e67a601da0e57052a27b4 Mon Sep 17 00:00:00 2001 From: Ivo Facoco Date: Fri, 21 Feb 2025 10:55:03 +0000 Subject: [PATCH 4/7] fix: removed debug print in TSFEL class --- src/pymdma/time_series/models/tsfel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pymdma/time_series/models/tsfel.py b/src/pymdma/time_series/models/tsfel.py index d12d9e7..1ab81ef 100644 --- a/src/pymdma/time_series/models/tsfel.py +++ b/src/pymdma/time_series/models/tsfel.py @@ -51,7 +51,6 @@ def __call__(self, batch_windows, fs, dims): features: DataFrame DataFrame with the features from each batch. 
""" - print(self.verbose, int(self.verbose)) features = tsfel.time_series_features_extractor( self.cfg_file, batch_windows, From 404e59ebd175d167f78076b79f5ff4a3f9547396 Mon Sep 17 00:00:00 2001 From: Ivo Facoco Date: Fri, 21 Feb 2025 16:37:37 +0000 Subject: [PATCH 5/7] fix: remove change from source installation to pypi on notebooks --- notebooks/tabular_examples.ipynb | 354 +++++++++++++------------------ 1 file changed, 145 insertions(+), 209 deletions(-) diff --git a/notebooks/tabular_examples.ipynb b/notebooks/tabular_examples.ipynb index c393baf..d1136a6 100644 --- a/notebooks/tabular_examples.ipynb +++ b/notebooks/tabular_examples.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install \"pymdma[tabular] @ https://github.com/fraunhoferportugal/pymdma.git\" --find-links \"https://download.pytorch.org/whl/cpu/torch_stable.html\"" + "%pip install \"pymdma[tabular]\" --find-links \"https://download.pytorch.org/whl/cpu/torch_stable.html\"" ] }, { @@ -22,16 +22,15 @@ "metadata": {}, "outputs": [], "source": [ + "from typing import Callable, List, Tuple\n", + "\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from typing import Tuple, Callable, List\n", - "from scipy.stats import gaussian_kde\n", - "from sklearn.neighbors import NearestNeighbors\n", "from matplotlib.offsetbox import AnchoredText\n", - "\n", - "from sklearn.datasets import make_classification" + "from scipy.stats import gaussian_kde\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.neighbors import NearestNeighbors" ] }, { @@ -48,7 +47,9 @@ "outputs": [], "source": [ "# Nearest Neighbor Model\n", - "def _get_nn_model(train: np.ndarray, distance_type: str = 'euclidean'):\n", + "\n", + "\n", + "def _get_nn_model(train: np.ndarray, distance_type: str = \"euclidean\"):\n", " \"\"\"\n", " Find nearest neighbors of test in train with first categoric_slice-many variables being categorical.\n", "\n", @@ -70,14 +71,10 @@ "\n", "# Distances\n", "def _get_nn_distances(\n", - " tgt_emb: np.ndarray, \n", - " syn_emb: np.ndarray,\n", - " distance_type: dict = 'euclidean',\n", - " size: int = None\n", - ") -> Tuple[np.ndarray]: \n", + " tgt_emb: np.ndarray, syn_emb: np.ndarray, distance_type: dict = \"euclidean\", size: int = None\n", + ") -> Tuple[np.ndarray]:\n", " # checkpoint\n", - " assert tgt_emb.shape[1] == syn_emb.shape[1], \\\n", - " \"Train and Syn have mismatched columns\"\n", + " assert tgt_emb.shape[1] == syn_emb.shape[1], \"Train and Syn have mismatched columns\"\n", "\n", " # split into tgt_train, tgt_query, and syn_query\n", " if size is None:\n", @@ -86,7 +83,7 @@ " tgt_size, syn_size = size, size\n", "\n", " # train and query from target\n", - " tgt_query = tgt_emb[-int(tgt_size):]\n", + " tgt_query = tgt_emb[-int(tgt_size) :]\n", "\n", " # syn_train is not needed\n", " # if sample_size = synthetic_size, syn_query is all syn dataset\n", @@ -99,7 +96,7 @@ " # target\n", " tgt_query_nn, _ = nn_model.kneighbors(tgt_query, n_neighbors=3)\n", " tgt_query_nn = tgt_query_nn[:, 1:] # except the closest (itself)\n", - " \n", + "\n", " # synthetic\n", " syn_query_nn, _ = nn_model.kneighbors(syn_query, n_neighbors=2)\n", "\n", @@ -111,55 +108,51 @@ " for label, query in query_dict.items():\n", " # closest neighbor\n", " aux_dcr = query[:, 0]\n", - " \n", + "\n", " # normalized closest neighbor distances\n", - " aux_nndr = aux_dcr / (query[:, 1] + 1e-10) \n", - " \n", + " aux_nndr = aux_dcr / 
(query[:, 1] + 1e-10)\n", + "\n", " # assign\n", " dcr[label] = aux_dcr\n", " nndr[label] = aux_nndr\n", - " \n", + "\n", " return dcr, nndr\n", "\n", "\n", "# Probability Density Function\n", "def _get_nn_pdf(\n", - " tgt_dist: np.ndarray, \n", + " tgt_dist: np.ndarray,\n", " syn_dist: np.ndarray,\n", ") -> Tuple[np.ndarray]:\n", - " \n", + "\n", " # get distributions bins\n", " t_min, t_max = min(tgt_dist), max(tgt_dist)\n", " s_min, s_max = min(syn_dist), max(syn_dist)\n", - " bins = np.linspace(\n", - " min([t_min, s_min]), \n", - " max([t_max, s_max]), \n", - " 600\n", - " )\n", + " bins = np.linspace(min([t_min, s_min]), max([t_max, s_max]), 600)\n", "\n", " # get distributions\n", " # tgt pdf dists\n", - " pdf_tgt = gaussian_kde(\n", - " tgt_dist\n", - " ).pdf(bins)\n", + " pdf_tgt = gaussian_kde(tgt_dist).pdf(bins)\n", " pdf_tgt /= sum(pdf_tgt)\n", - " \n", + "\n", " # syn pdf dists\n", - " pdf_syn = gaussian_kde(\n", - " syn_dist\n", - " ).pdf(bins)\n", + " pdf_syn = gaussian_kde(syn_dist).pdf(bins)\n", " pdf_syn /= sum(pdf_syn)\n", - " \n", + "\n", " return pdf_tgt, pdf_syn, bins\n", "\n", "\n", "def subplot_dim_optm(dim: int):\n", " import math\n", + "\n", " matrix_n, matrix_m = int(np.sqrt(dim)), int(np.sqrt(dim))\n", - " matrix_n += math.ceil((dim - matrix_m ** 2) / matrix_n)\n", + " matrix_n += math.ceil((dim - matrix_m**2) / matrix_n)\n", " return matrix_n, matrix_m\n", "\n", + "\n", "# Plot Generative Quality\n", + "\n", + "\n", "def plot_generative_quality(\n", " real_data_list: List[np.ndarray],\n", " fake_data_list: List[np.ndarray],\n", @@ -171,7 +164,7 @@ "):\n", " # plot matrix dim\n", " n_dim, m_dim = subplot_dim_optm(dim=len(real_data_list))\n", - " \n", + "\n", " # figures\n", " fig1, axes_emb = plt.figure(n_dim, m_dim, figsize=(12, 8))\n", " fig2, axes_dist = plt.figure(n_dim, m_dim, figsize=(12, 8))\n", @@ -179,59 +172,37 @@ " # flatten axes array\n", " axes_emb = axes_emb.flatten()\n", " axes_dist = axes_dist.flatten()\n", - " \n", + "\n", " # loop\n", - " for real_data, fake_data, real_pdf, fake_pdf, bins, name, ax_emb, ax_dist in zip(real_data_list, fake_data_list, real_pdf_list, fake_pdf_list, bins_list, names, axes_emb, axes_dist):\n", + " for real_data, fake_data, real_pdf, fake_pdf, bins, name, ax_emb, ax_dist in zip(\n", + " real_data_list, fake_data_list, real_pdf_list, fake_pdf_list, bins_list, names, axes_emb, axes_dist\n", + " ):\n", " # embeddings\n", " tgt_emb2d = emb_obj.transform(real_data)\n", - " syn_emb2d = emb_obj.transform(fake_data) \n", - " \n", - " ax_emb.scatter(\n", - " tgt_emb2d[:, 0],\n", - " tgt_emb2d[:, 1],\n", - " color='forestgreen',\n", - " marker='o',\n", - " label='Real',\n", - " alpha=0.7\n", - " )\n", - " ax_emb.scatter(\n", - " syn_emb2d[:, 0],\n", - " syn_emb2d[:, 1],\n", - " color='darkred',\n", - " marker='*',\n", - " label='Fake',\n", - " alpha=0.7\n", - " )\n", - " \n", + " syn_emb2d = emb_obj.transform(fake_data)\n", + "\n", + " ax_emb.scatter(tgt_emb2d[:, 0], tgt_emb2d[:, 1], color=\"forestgreen\", marker=\"o\", label=\"Real\", alpha=0.7)\n", + " ax_emb.scatter(syn_emb2d[:, 0], syn_emb2d[:, 1], color=\"darkred\", marker=\"*\", label=\"Fake\", alpha=0.7)\n", + "\n", " # set settings\n", " ax_emb.legend()\n", - " ax_emb.set_xlabel('Embedding nr. 0')\n", - " ax_emb.set_ylabel('Embedding nr. 1')\n", + " ax_emb.set_xlabel(\"Embedding nr. 0\")\n", + " ax_emb.set_ylabel(\"Embedding nr. 
1\")\n", "\n", " # set title\n", " ax_emb.set_title(name)\n", - " \n", - " # distances plot \n", - " ax_dist.plot(\n", - " bins,\n", - " real_pdf,\n", - " color='forestgreen',\n", - " label='Real'\n", - " )\n", - " ax_dist.fill_between(bins, real_pdf, 0, color='forestgreen', alpha=.1)\n", - " \n", - " ax_dist.plot(\n", - " bins,\n", - " fake_pdf,\n", - " color='darkred',\n", - " label='Fake'\n", - " )\n", - " ax_dist.fill_between(bins, fake_pdf, 0, color='darkred', alpha=.1)\n", - " \n", + "\n", + " # distances plot\n", + " ax_dist.plot(bins, real_pdf, color=\"forestgreen\", label=\"Real\")\n", + " ax_dist.fill_between(bins, real_pdf, 0, color=\"forestgreen\", alpha=0.1)\n", + "\n", + " ax_dist.plot(bins, fake_pdf, color=\"darkred\", label=\"Fake\")\n", + " ax_dist.fill_between(bins, fake_pdf, 0, color=\"darkred\", alpha=0.1)\n", + "\n", " # set settings\n", " ax_dist.legend()\n", - " ax_dist.set_xlabel('Distances')\n", - " ax_dist.set_ylabel('Relative Frequency')\n", + " ax_dist.set_xlabel(\"Distances\")\n", + " ax_dist.set_ylabel(\"Relative Frequency\")\n", "\n", " # set title\n", " ax_dist.set_title(name)\n", @@ -252,7 +223,7 @@ "\n", " # dataframe conversion\n", " X_df = pd.DataFrame(X, columns=cols)\n", - " X_df['tgt'] = y\n", + " X_df[\"tgt\"] = y\n", "\n", " return X_df\n", "\n", @@ -268,37 +239,30 @@ " # plot matrix dim\n", " dim = len(dataset_list)\n", " n_dim, m_dim = subplot_dim_optm(dim=dim)\n", - " \n", + "\n", " # figures\n", " if share_ax:\n", " fig, axes = plt.subplots(n_dim, m_dim, figsize=(12, 8), sharex=True, sharey=True)\n", " else:\n", " fig, axes = plt.subplots(n_dim, m_dim, figsize=(12, 8))\n", - " \n", + "\n", " # flatten axes array\n", " axes = axes.flatten()\n", "\n", " # choose a color map\n", - " colors = plt.cm.get_cmap('tab10', dim).colors\n", - " \n", + " colors = plt.cm.get_cmap(\"tab10\", dim).colors\n", + "\n", " # loop\n", " for dataset, name, color, ax in zip(dataset_list, names, colors, axes):\n", " # embeddings\n", " data_emb = emb_obj.fit_transform(dataset) if with_fit else emb_obj.transform(dataset)\n", - " \n", + "\n", " # scatter plot\n", - " ax.scatter(\n", - " data_emb[:, 0],\n", - " data_emb[:, 1],\n", - " facecolors=color,\n", - " edgecolors=color,\n", - " marker='o',\n", - " alpha=0.7\n", - " )\n", - " \n", + " ax.scatter(data_emb[:, 0], data_emb[:, 1], facecolors=color, edgecolors=color, marker=\"o\", alpha=0.7)\n", + "\n", " # set settings\n", - " ax.set_xlabel('Embedding nr. 0')\n", - " ax.set_ylabel('Embedding nr. 1')\n", + " ax.set_xlabel(\"Embedding nr. 0\")\n", + " ax.set_ylabel(\"Embedding nr. 
1\")\n", "\n", " # set title\n", " ax.set_title(name)\n", @@ -311,81 +275,69 @@ "\n", " return fig\n", "\n", + "\n", "def _get_1d_pdf(data: np.ndarray, n_bins: int = 300):\n", " # get distributions bins\n", " d_min, d_max = min(data), max(data)\n", - " bins = np.linspace(\n", - " d_min, \n", - " d_max, \n", - " n_bins\n", - " )\n", + " bins = np.linspace(d_min, d_max, n_bins)\n", "\n", " # get distributions\n", " # tgt pdf dists\n", - " pdf = gaussian_kde(\n", - " data.astype(float)\n", - " ).pdf(bins)\n", + " pdf = gaussian_kde(data.astype(float)).pdf(bins)\n", " pdf /= sum(pdf)\n", - " \n", + "\n", " return pdf, bins\n", "\n", - "def plot_kde(reference: np.ndarray, target_list: List[np.ndarray], column_names: List[str] = None, tag_names: List[str] = None, annots: np.ndarray = None):\n", + "\n", + "def plot_kde(\n", + " reference: np.ndarray,\n", + " target_list: List[np.ndarray],\n", + " column_names: List[str] = None,\n", + " tag_names: List[str] = None,\n", + " annots: np.ndarray = None,\n", + "):\n", " num_columns = reference.shape[1]\n", " num_datasets = len(target_list)\n", - " \n", + "\n", " # default feature names if not provided\n", " if column_names is None:\n", " column_names = [f\"Col {i+1}\" for i in range(reference.shape[-1])]\n", "\n", " if tag_names is None:\n", " tag_names = [f\"Dataset {i+1}\" for i in range(len(target_list))]\n", - " \n", + "\n", " # set up the plot grid\n", " fig, axes = plt.subplots(num_columns, num_datasets, figsize=(16, 10))\n", - " \n", + "\n", " for i in range(num_columns):\n", " # iterate through each feature (row in subplot grid)\n", " ref_pdf, ref_bins = _get_1d_pdf(reference[:, i], n_bins=400)\n", "\n", - " # set ylabel \n", + " # set ylabel\n", " axes[i, 0].set_ylabel(column_names[i])\n", "\n", " # iterate\n", " for j, target in enumerate(target_list):\n", " # plot KDEs for each target dataset (columns in subplot grid)\n", - " at = AnchoredText(str(annots[i, j]), prop=dict(size=7), frameon=False, loc='upper right')\n", - " \n", + " at = AnchoredText(str(annots[i, j]), prop=dict(size=7), frameon=False, loc=\"upper right\")\n", + "\n", " # target\n", " tgt_pdf, tgt_bins = _get_1d_pdf(target[:, i], n_bins=400)\n", "\n", " # plot the reference KDE on each row\n", - " axes[i, j].plot(\n", - " ref_bins,\n", - " ref_pdf,\n", - " color='forestgreen',\n", - " label='Real',\n", - " ls='--',\n", - " alpha=.3\n", - " )\n", - " axes[i, j].fill_between(ref_bins, ref_pdf, 0, color='forestgreen', alpha=.1)\n", + " axes[i, j].plot(ref_bins, ref_pdf, color=\"forestgreen\", label=\"Real\", ls=\"--\", alpha=0.3)\n", + " axes[i, j].fill_between(ref_bins, ref_pdf, 0, color=\"forestgreen\", alpha=0.1)\n", "\n", " # plot the target KDE on each row\n", - " axes[i, j].plot(\n", - " tgt_bins,\n", - " tgt_pdf,\n", - " color='indianred',\n", - " label='Target',\n", - " ls='--',\n", - " alpha=.3\n", - " )\n", - " axes[i, j].fill_between(tgt_bins, tgt_pdf, 0, color='indianred', alpha=.1)\n", + " axes[i, j].plot(tgt_bins, tgt_pdf, color=\"indianred\", label=\"Target\", ls=\"--\", alpha=0.3)\n", + " axes[i, j].fill_between(tgt_bins, tgt_pdf, 0, color=\"indianred\", alpha=0.1)\n", "\n", " # add annotation\n", " axes[i, j].add_artist(at)\n", - " \n", + "\n", " # window params\n", " if not i:\n", - " axes[i, j].set_title(f'{tag_names[j]}')\n", + " axes[i, j].set_title(f\"{tag_names[j]}\")\n", "\n", " axes[i, j].set_yticks([])\n", " axes[i, j].set_xticks([])\n", @@ -419,9 +371,9 @@ "outputs": [], "source": [ "## Input validation\n", - "# A --> High Volume of Samples + 
Informative Features \n", - "tag_in1 = 'A'\n", - "name_in1 = 'A - High Vol. + Inform.'\n", + "# A --> High Volume of Samples + Informative Features\n", + "tag_in1 = \"A\"\n", + "name_in1 = \"A - High Vol. + Inform.\"\n", "dataset_in1 = make_dataset(\n", " n_samples=2000,\n", " n_features=10,\n", @@ -435,12 +387,12 @@ " shift=0.0,\n", " scale=3.0,\n", " shuffle=True,\n", - " random_state=42\n", + " random_state=42,\n", ").to_numpy()\n", "\n", "# B --> High Volume of Samples + Non-Informative Features\n", - "tag_in2 = 'B'\n", - "name_in2 = 'B - High Vol. + Non-Inform.'\n", + "tag_in2 = \"B\"\n", + "name_in2 = \"B - High Vol. + Non-Inform.\"\n", "dataset_in2 = make_dataset(\n", " n_samples=2000,\n", " n_features=10,\n", @@ -454,12 +406,12 @@ " shift=0.0,\n", " scale=3.0,\n", " shuffle=True,\n", - " random_state=42\n", + " random_state=42,\n", ").to_numpy()\n", "\n", "# C --> Small Volume of Samples + High Dimensionality + Informative Features\n", - "tag_in3 = 'C'\n", - "name_in3 = 'C - Small Vol. + Inform.'\n", + "tag_in3 = \"C\"\n", + "name_in3 = \"C - Small Vol. + Inform.\"\n", "dataset_in3 = make_dataset(\n", " n_samples=200,\n", " n_features=100,\n", @@ -473,12 +425,12 @@ " shift=0.0,\n", " scale=3.0,\n", " shuffle=True,\n", - " random_state=42\n", + " random_state=42,\n", ").to_numpy()\n", "\n", "# D --> Small Volume of Samples + High Dimensionality + Non-informative Features\n", - "tag_in4 = 'D'\n", - "name_in4 = 'D - Small Vol. + Non-Inform.'\n", + "tag_in4 = \"D\"\n", + "name_in4 = \"D - Small Vol. + Non-Inform.\"\n", "dataset_in4 = make_dataset(\n", " n_samples=200,\n", " n_features=100,\n", @@ -492,13 +444,13 @@ " shift=0.0,\n", " scale=3.0,\n", " shuffle=True,\n", - " random_state=42\n", + " random_state=42,\n", ").to_numpy()\n", "\n", "## Synthesis Evaluation\n", "# reference dataset\n", - "tag_ref = 'R'\n", - "name_ref = 'R - Reference Dataset'\n", + "tag_ref = \"R\"\n", + "name_ref = \"R - Reference Dataset\"\n", "dataset_ref = make_dataset(\n", " n_samples=2000,\n", " n_features=10,\n", @@ -512,35 +464,35 @@ " shift=0.0,\n", " scale=3.0,\n", " shuffle=True,\n", - " random_state=42\n", + " random_state=42,\n", ").to_numpy()\n", "\n", "# A --> random dataset\n", - "tag_syn1 = 'A'\n", - "name_syn1 = 'A - Random'\n", + "tag_syn1 = \"A\"\n", + "name_syn1 = \"A - Random\"\n", "dataset_s1 = np.random.random(dataset_ref.shape) * 3.0\n", "\n", "# B --> cumulative small distortion\n", - "tag_syn2 = 'B'\n", - "name_syn2 = 'B - Small Add Distortion'\n", + "tag_syn2 = \"B\"\n", + "name_syn2 = \"B - Small Add Distortion\"\n", "rnd = np.random.random(dataset_ref.shape) * 0.01\n", "dataset_s2 = np.copy(dataset_ref) + rnd\n", "\n", "# C --> cumulative large distortion\n", - "tag_syn3 = 'C'\n", - "name_syn3 = 'C - Large Add Distortion'\n", + "tag_syn3 = \"C\"\n", + "name_syn3 = \"C - Large Add Distortion\"\n", "rnd = np.random.random(dataset_ref.shape) * 100\n", "dataset_s3 = np.copy(dataset_ref) + rnd\n", "\n", "# D --> small multiplicative distortion\n", - "tag_syn4 = 'D'\n", - "name_syn4 = 'D - Small Mult. Distortion'\n", + "tag_syn4 = \"D\"\n", + "name_syn4 = \"D - Small Mult. Distortion\"\n", "rnd = 0.7\n", "dataset_s4 = np.copy(dataset_ref) * rnd\n", "\n", "# E --> large multiplicative distortion\n", - "tag_syn5 = 'E'\n", - "name_syn5 = 'E - Large Mult. Distortion'\n", + "tag_syn5 = \"E\"\n", + "name_syn5 = \"E - Large Mult. 
Distortion\"\n", "rnd = 100\n", "dataset_s5 = np.copy(dataset_ref) * rnd" ] @@ -572,7 +524,7 @@ "dataset_norm = list(map(lambda x: scale_obj.fit_transform(x), dataset_list))\n", "names = [name_in1, name_in2, name_in3, name_in4]\n", "\n", - "# plot \n", + "# plot\n", "_ = plot_datasets(dataset_norm, names, emb_obj, with_fit=True, share_ax=False)" ] }, @@ -607,7 +559,7 @@ "dataset_norm = list(map(lambda x: scale_obj.transform(x), dataset_list))\n", "names = [name_ref, name_syn1, name_syn2, name_syn3, name_syn4, name_syn5]\n", "\n", - "# plot \n", + "# plot\n", "_ = plot_datasets(dataset_norm, names, emb_obj, with_fit=False, share_ax=True)" ] }, @@ -663,14 +615,14 @@ "metadata": {}, "outputs": [], "source": [ - "from pymdma.tabular.measures.input_val import KAnonymityScore\n", "from pymdma.tabular.embeddings.embed import UMAPEmbedder\n", + "from pymdma.tabular.measures.input_val import KAnonymityScore\n", "\n", "# umap embedder\n", "emb_obj = UMAPEmbedder(n_components=2)\n", "\n", "# privacy\n", - "score_name = 'KAnonimity'\n", + "score_name = \"KAnonimity\"\n", "k_anom = KAnonymityScore(column_names=None, qi_names=None) # K-anonimity\n", "\n", "# score list\n", @@ -683,9 +635,9 @@ "\n", " # append\n", " scores.append(f\"{tag} - {score_name} = {aux_score}\")\n", - " \n", "\n", - "# plot \n", + "\n", + "# plot\n", "_ = plot_datasets(dataset_list, scores, emb_obj, with_fit=True, share_ax=False)" ] }, @@ -702,14 +654,14 @@ "metadata": {}, "outputs": [], "source": [ - "from pymdma.tabular.measures.input_val import VIFactorScore\n", "from pymdma.tabular.embeddings.embed import UMAPEmbedder\n", + "from pymdma.tabular.measures.input_val import VIFactorScore\n", "\n", "# umap embedder\n", "emb_obj = UMAPEmbedder(n_components=2)\n", "\n", "# quality\n", - "score_name = 'VIF Score'\n", + "score_name = \"VIF Score\"\n", "vif = VIFactorScore(column_names=None) # VIF\n", "\n", "# score list\n", @@ -722,9 +674,9 @@ "\n", " # append\n", " scores.append(f\"{tag} - {score_name} = {aux_score}\")\n", - " \n", "\n", - "# plot \n", + "\n", + "# plot\n", "_ = plot_datasets(dataset_list, scores, emb_obj, with_fit=True, share_ax=False)" ] }, @@ -734,14 +686,14 @@ "metadata": {}, "outputs": [], "source": [ - "from pymdma.tabular.measures.input_val import DimCurseScore\n", "from pymdma.tabular.embeddings.embed import UMAPEmbedder\n", + "from pymdma.tabular.measures.input_val import DimCurseScore\n", "\n", "# umap embedder\n", "emb_obj = UMAPEmbedder(n_components=2)\n", "\n", "# quality\n", - "score_name = 'Dim. Curse'\n", + "score_name = \"Dim. 
Curse\"\n", "dimc = DimCurseScore() # Dimensionality Curse\n", "\n", "# score list\n", @@ -754,9 +706,9 @@ "\n", " # append\n", " scores.append(f\"{tag} - {score_name} = {aux_score}\")\n", - " \n", "\n", - "# plot \n", + "\n", + "# plot\n", "_ = plot_datasets(dataset_list, scores, emb_obj, with_fit=True, share_ax=False)" ] }, @@ -766,14 +718,14 @@ "metadata": {}, "outputs": [], "source": [ - "from pymdma.tabular.measures.input_val import UniformityScore\n", "from pymdma.tabular.embeddings.embed import UMAPEmbedder\n", + "from pymdma.tabular.measures.input_val import UniformityScore\n", "\n", "# umap embedder\n", "emb_obj = UMAPEmbedder(n_components=2)\n", "\n", "# quality\n", - "score_name = 'Uniformity'\n", + "score_name = \"Uniformity\"\n", "unif = UniformityScore(column_names=None) # Uniformity\n", "\n", "# score list\n", @@ -785,14 +737,9 @@ " aux_score = list(unif.compute(dataset).stats[0].values())\n", "\n", " # append\n", - " scores.append(\n", - " f\"{tag} - {score_name} = \" +\n", - " f\"{round(aux_score[0], 1)}\" +\n", - " u'\\u00B1' + \n", - " f\"{round(aux_score[1], 1)} %\"\n", - " )\n", + " scores.append(f\"{tag} - {score_name} = \" + f\"{round(aux_score[0], 1)}\" + \"\\u00b1\" + f\"{round(aux_score[1], 1)} %\")\n", "\n", - "# plot \n", + "# plot\n", "_ = plot_datasets(dataset_list, scores, emb_obj, with_fit=True, share_ax=False)" ] }, @@ -802,14 +749,14 @@ "metadata": {}, "outputs": [], "source": [ - "from pymdma.tabular.measures.input_val import OutlierScore\n", "from pymdma.tabular.embeddings.embed import UMAPEmbedder\n", + "from pymdma.tabular.measures.input_val import OutlierScore\n", "\n", "# umap embedder\n", "emb_obj = UMAPEmbedder(n_components=2)\n", "\n", "# quality\n", - "score_name = 'Outlier Score'\n", + "score_name = \"Outlier Score\"\n", "outl = OutlierScore() # Outliers\n", "\n", "# score list\n", @@ -821,14 +768,9 @@ " aux_score = list(outl.compute(dataset).stats[0].values())\n", "\n", " # append\n", - " scores.append(\n", - " f\"{tag} - {score_name} = \" +\n", - " f\"{round(aux_score[0], 1)}\" +\n", - " u'\\u00B1' + \n", - " f\"{round(aux_score[1], 1)} %\"\n", - " )\n", - " \n", - "# plot \n", + " scores.append(f\"{tag} - {score_name} = \" + f\"{round(aux_score[0], 1)}\" + \"\\u00b1\" + f\"{round(aux_score[1], 1)} %\")\n", + "\n", + "# plot\n", "_ = plot_datasets(dataset_list, scores, emb_obj, with_fit=True, share_ax=False)" ] }, @@ -923,7 +865,7 @@ "metadata": {}, "outputs": [], "source": [ - "from pymdma.tabular.measures.synthesis_val import ImprovedPrecision, ImprovedRecall, Authenticity, Coverage\n", + "from pymdma.tabular.measures.synthesis_val import Authenticity, Coverage, ImprovedPrecision, ImprovedRecall\n", "\n", "ip, ip_name = ImprovedPrecision(k=5), \"P\"\n", "ir, ir_name = ImprovedRecall(k=5), \"R\"\n", @@ -955,7 +897,7 @@ " else:\n", " # append\n", " scores.append(f\"Reference Dataset - {score_s}\")\n", - " \n", + "\n", "\n", "# plot\n", "_ = plot_datasets(dataset_norm_list, scores, emb_obj, with_fit=False, share_ax=True)" @@ -974,12 +916,11 @@ "metadata": {}, "outputs": [], "source": [ - "from pymdma.tabular.measures.synthesis_val import StatisticalSimScore, CoherenceScore\n", - "from pymdma.tabular.measures.synthesis_val import DCRPrivacy\n", + "from pymdma.tabular.measures.synthesis_val import CoherenceScore, DCRPrivacy, StatisticalSimScore\n", "\n", "ssim, ssim_name = StatisticalSimScore(), \"AttSim\"\n", - "coher, coher_name = CoherenceScore(weights=None, corr_type='pearson'), \"CorrCoH\"\n", - "dcr, dcr_name = 
DCRPrivacy(distance_type='euclidean'), \"DCR\"\n", + "coher, coher_name = CoherenceScore(weights=None, corr_type=\"pearson\"), \"CorrCoH\"\n", + "dcr, dcr_name = DCRPrivacy(distance_type=\"euclidean\"), \"DCR\"\n", "\n", "# score list\n", "scores = []\n", @@ -987,9 +928,9 @@ "# compute scores\n", "for dataset, tag in zip(dataset_norm_list, tag_list):\n", " # compute\n", - " ssim_score = ssim.compute(ref, dataset).stats[0].get('mean')\n", + " ssim_score = ssim.compute(ref, dataset).stats[0].get(\"mean\")\n", " coher_score = coher.compute(ref, dataset).value[0]\n", - " dcr_score = dcr.compute(ref, dataset).value[0].get('privacy')\n", + " dcr_score = dcr.compute(ref, dataset).value[0].get(\"privacy\")\n", "\n", " # aggregate all scores\n", " aux_scores = [ssim_score, coher_score, dcr_score]\n", @@ -1005,7 +946,7 @@ " else:\n", " # append\n", " scores.append(f\"Reference Dataset - {score_s}\")\n", - " \n", + "\n", "\n", "# plot\n", "_ = plot_datasets(dataset_norm_list, scores, emb_obj, with_fit=False, share_ax=True)" @@ -1050,11 +991,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": ".venv-dev", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -1065,7 +1001,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.11.11" } }, "nbformat": 4, From d1fa557c3b1d37bd327a1f4ec8bfefe0f43c2dc5 Mon Sep 17 00:00:00 2001 From: Ivo Facoco Date: Fri, 21 Feb 2025 16:46:50 +0000 Subject: [PATCH 6/7] Minor version bump 0.1.8 --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ README.md | 2 +- VERSION | 2 +- pyproject.toml | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3928997..ecd2ad7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,35 @@ All notable changes to this project will be documented in this file. This format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.1.8] - 2025-02-21 +Minor patch release with Time Series standardization and documentation updates. + +### Changed + - Introduced `BaseTSExtractor` torch module for time_series + - Loading TSFEL config on class instanciation + +### Fixed + - Removed input layer logic from time_series example notebook (should not use internal methods or classes) + + +## [0.1.7] - 2025-01-31 +Minor patch release with reduced dependency tree and documentaion updates. + +### Added + - `convert_grayscale` option in `PSNR`metric + - Explicitely listing `SSIM` and `MSSIM` params + - Non deterministic warning for MSID documentation + +### Changed + - Renamed `MSSIM` to `MSSSIM` + - GIQA documentation to indicate QS and DS as in paper + - Removed unnecessary torchmetrics dependency for image + +### Fixed + - Added `np.isclose` verification in hypersphere estimation functions for synthetic evaluation + + ## [0.1.6] - 2024-12-10 Minor patch release with new image features extraction method and documentation updates. 
diff --git a/README.md b/README.md index 8412b02..72ceefa 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Depending on the data modality you want to use, you may need to install addition ```bash pip install "pymdma[image] @ git+https://github.com/fraunhoferportugal/pymdma.git" # image dependencies pip install "pymdma[tabular] @ git+https://github.com/fraunhoferportugal/pymdma.git" # tabular dependencies -pip install "pymdma[tabular] @ git+https://github.com/fraunhoferportugal/pymdma.git" # time series dependencies +pip install "pymdma[time_series] @ git+https://github.com/fraunhoferportugal/pymdma.git" # time series dependencies ``` For a minimal installation, you can install the package without CUDA support by forcing pip to install torch from the CPU index with the `--find-url` command. diff --git a/VERSION b/VERSION index 1180819..699c6c6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.7 +0.1.8 diff --git a/pyproject.toml b/pyproject.toml index 5a31d32..5705ab3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ # https://github.com/microsoft/vscode-python/blob/master/CHANGELOG.md#enhancements-1 [tool.poetry] name = "pymdma" -version = "0.1.7" +version = "0.1.8" description = "Multimodal Data Metrics for Auditing real and synthetic data" authors = ["Fraunhofer AICOS "] maintainers = [ From f4964656ce47ecf274630d91147087469075994b Mon Sep 17 00:00:00 2001 From: Ivo Facoco Date: Fri, 21 Feb 2025 17:29:58 +0000 Subject: [PATCH 7/7] fix: remove repeated cell in image notebook --- notebooks/image_examples.ipynb | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/notebooks/image_examples.ipynb b/notebooks/image_examples.ipynb index 3d36199..f2dcd68 100644 --- a/notebooks/image_examples.ipynb +++ b/notebooks/image_examples.ipynb @@ -250,7 +250,6 @@ "source": [ "from pymdma.image.measures.input_val import MSSSIM\n", "\n", - "\n", "def generate_full_ref_dataset(dataset):\n", " distorted = []\n", " for idx, img in enumerate(dataset):\n", @@ -317,17 +316,15 @@ "\n", "from pymdma.image.models.features import ExtractorFactory\n", "\n", + "N_SAMPLES = 2000\n", "random.seed(10)\n", "\n", "cifake_test_path = cifake_path / \"test\"\n", "test_images_ref = Path(cifake_test_path / \"REAL\") # real images\n", "test_images_synth = Path(cifake_test_path / \"FAKE\") # synthetic images\n", "\n", - "images_ref = [img for img in test_images_ref.iterdir() if img.is_file()]\n", - "images_synth = [img for img in test_images_synth.iterdir() if img.is_file()]\n", - "\n", - "# images_ref = random.sample([img for img in test_images_ref.iterdir() if img.is_file()], 5000)\n", - "# images_synth = random.sample([img for img in test_images_synth.iterdir() if img.is_file()], 5000)\n", + "images_ref = [img for img in test_images_ref.iterdir() if img.is_file()][:N_SAMPLES]\n", + "images_synth = [img for img in test_images_synth.iterdir() if img.is_file()][:N_SAMPLES]\n", "\n", "extractor = ExtractorFactory.model_from_name(name=\"dino_vits8\")\n", "ref_features = extractor.extract_features_from_files(images_ref)\n", @@ -343,7 +340,6 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", "from umap import UMAP\n", "\n", "umap = UMAP(n_components=2, random_state=10, n_jobs=1)\n", @@ -449,36 +445,12 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "\n", "giqa_result.plot(\"GIQA\", bins=50)\n", "plt.xlabel(\"Score\")\n", "plt.ylabel(\"Frequency\")\n", "plt.show()" ] }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_idx = np.argsort(giqa_instance)[::-1][:200]\n", - "best_samples = [np.asarray(Image.open(images_synth[i])) for i in best_idx]\n", - "\n", - "best_fig = plot_instances_grid(best_samples, n_cols=25)\n", - "best_fig.suptitle(\"CIFAKE Best samples\", fontsize=16)\n", - "plt.show()\n", - "\n", - "\n", - "worst_idx = np.argsort(giqa_instance)[:200]\n", - "worst_samples = [np.asarray(Image.open(images_synth[i])) for i in worst_idx]\n", - "\n", - "worst_fig = plot_instances_grid(worst_samples, n_cols=25)\n", - "worst_fig.suptitle(\"CIFAKE Worst samples\", fontsize=16)\n", - "plt.show()" - ] - }, { "cell_type": "code", "execution_count": null,