From 0cebb0ad2965fe115611039911c56900ec095f0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Thu, 14 Sep 2023 16:53:33 +0200 Subject: [PATCH 1/6] :sparkles: Allow turning off input scaling --- src/move/conf/schema.py | 6 +++++- src/move/tasks/encode_data.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/move/conf/schema.py b/src/move/conf/schema.py index c9dee984..d7832830 100644 --- a/src/move/conf/schema.py +++ b/src/move/conf/schema.py @@ -28,6 +28,10 @@ class InputConfig: name: str weight: int = 1 +@dataclass +class ContinuousInputConfig(InputConfig): + scale: bool = True + @dataclass class DataConfig: @@ -36,7 +40,7 @@ class DataConfig: results_path: str = MISSING sample_names: str = MISSING categorical_inputs: list[InputConfig] = MISSING - continuous_inputs: list[InputConfig] = MISSING + continuous_inputs: list[ContinuousInputConfig] = MISSING categorical_names: list[str] = MISSING continuous_names: list[str] = MISSING categorical_weights: list[int] = MISSING diff --git a/src/move/tasks/encode_data.py b/src/move/tasks/encode_data.py index 5092064a..53749390 100644 --- a/src/move/tasks/encode_data.py +++ b/src/move/tasks/encode_data.py @@ -39,12 +39,15 @@ def encode_data(config: DataConfig): if mappings: io.dump_mappings(interim_data_path / "mappings.json", mappings) - for dataset_name in config.continuous_names: - logger.info(f"Encoding '{dataset_name}'") - filepath = raw_data_path / f"{dataset_name}.tsv" + for input_config in config.continuous_inputs: + scale = not hasattr(input_config, "scale") or input_config.scale + action_name = "Encoding" if scale else "Reading" + logger.info(f"{action_name} '{input_config.name}'") + filepath = raw_data_path / f"{input_config.name}.tsv" names, values = io.read_tsv(filepath, sample_names) - values, mask_1d = preprocessing.scale(values) - names = names[mask_1d] - logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}") - 
io.dump_names(interim_data_path / f"{dataset_name}.txt", names) - np.save(interim_data_path / f"{dataset_name}.npy", values) + if scale: + values, mask_1d = preprocessing.scale(values) + names = names[mask_1d] + logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}") + io.dump_names(interim_data_path / f"{input_config.name}.txt", names) + np.save(interim_data_path / f"{input_config.name}.npy", values) From 039f6ce1e96224f63c6166d5370bd97c96102c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Fri, 2 Feb 2024 15:25:15 +0100 Subject: [PATCH 2/6] :bug: Keep dims if NaN row --- src/move/analysis/metrics.py | 12 ++++++------ src/move/tasks/analyze_latent.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/move/analysis/metrics.py b/src/move/analysis/metrics.py index 35f5bc60..180126f1 100644 --- a/src/move/analysis/metrics.py +++ b/src/move/analysis/metrics.py @@ -33,9 +33,9 @@ def calculate_accuracy( y_pred = np.ma.masked_array(reconstruction, mask=is_nan) num_features = np.ma.count(y_true, axis=1) - scores = np.ma.filled(np.sum(y_true == y_pred, axis=1) / num_features, 0) + scores = np.sum(y_true == y_pred, axis=1) / num_features - return scores + return np.ma.filled(scores, 0) def calculate_cosine_similarity( @@ -64,12 +64,12 @@ def calculate_cosine_similarity( # Equivalent to `np.diag(sklearn.metrics.pairwise.cosine_similarity(x, y))` # But can handle masked arrays - scores = np.ma.compressed(np.sum(x * y, axis=1)) / (norm(x) * norm(y)) + scores = np.sum(x * y, axis=1) / (norm(x) * norm(y)) - return scores + return np.ma.filled(scores, 0) -def norm(x: np.ma.MaskedArray, axis: int = 1) -> FloatArray: +def norm(x: np.ma.MaskedArray, axis: int = 1) -> np.ma.MaskedArray: """Return Euclidean norm. This function is equivalent to `np.linalg.norm`, but it can handle masked arrays. 
@@ -80,4 +80,4 @@ def norm(x: np.ma.MaskedArray, axis: int = 1) -> FloatArray: Returns: 1D array with the specified axis removed. """ - return np.ma.compressed(np.sqrt(np.sum(x**2, axis=axis))) + return np.sqrt(np.sum(x**2, axis=axis)) diff --git a/src/move/tasks/analyze_latent.py b/src/move/tasks/analyze_latent.py index 132a371a..788d08d5 100644 --- a/src/move/tasks/analyze_latent.py +++ b/src/move/tasks/analyze_latent.py @@ -220,7 +220,9 @@ def analyze_latent(config: MOVEConfig) -> None: scores.append(cosine_sim) logger.debug("Generating plot: reconstruction metrics") - fig = viz.plot_metrics_boxplot(scores, labels) + + plot_scores = [np.ma.compressed(np.ma.masked_equal(each, 0)) for each in scores] + fig = viz.plot_metrics_boxplot(plot_scores, labels) fig_path = str(output_path / "reconstruction_metrics.png") fig.savefig(fig_path, bbox_inches="tight") fig_df = pd.DataFrame(dict(zip(labels, scores)), index=df_index) From a150c7646a1f9eea7222f857f98708ab281e4b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Fri, 2 Feb 2024 15:51:44 +0100 Subject: [PATCH 3/6] :see_no_evil: Ignore non-default files - Ignore non-default stuff from root folder, tutorial folders, etc --- .gitignore | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index d9b88844..26e3ab4b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,9 @@ __pycache__/ *.py[cod] -# NumPy binary files -data*/*.npy +# NumPy/PyTorch binary files +*.npy +*.pt # Distribution and packaging files build/ @@ -31,10 +32,13 @@ outputs/ *.log # Tutorial files -**/interim_data/ -**/processed_data/ -**/results/ -tutorial/maize/data +tutorial/* +!tutorial/config/*maize*.yaml +!tutorial/config/*random_small*.yaml +!tutorial/data +!tutorial/maize/maize_dataset.py +!tutorial/notebooks/*.ipynb +!tutorial/README.md # Virtual environment venv/ @@ -42,4 +46,15 @@ virtualvenv/ # docs files docs/build/ -docs/source/_templates/ \ No 
newline at end of file +docs/source/_templates/ + +# Root folder +/*.* +!/.gitignore +!/.readthedocs.yaml +!/LICENSE +!/MANIFEST.in +!/README.md +!/pyproject.toml +!/requirements.txt +!/setup.cfg From 4151e709a3c945e351f226875f9e18a78139221a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Fri, 2 Feb 2024 16:33:58 +0100 Subject: [PATCH 4/6] :bookmark: Update version number --- src/move/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/move/__init__.py b/src/move/__init__.py index a4ce9fd3..a4afcdcb 100644 --- a/src/move/__init__.py +++ b/src/move/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations __license__ = "MIT" -__version__ = (1, 4, 9) +__version__ = (1, 4, 10) __all__ = ["conf", "data", "models", "training_loop", "VAE"] HYDRA_VERSION_BASE = "1.2" From 17aeb0197bbd4939908c3e3b49ff91af049c76c9 Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 23 Dec 2022 08:50:32 +0100 Subject: [PATCH 5/6] :memo: packages for running pytorch on GPU not installed --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4f7eee22..e5e7e6c6 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ you do not have powerful GPUs available, it is possible to run using only CPUs. For instance, the tutorial data set consisting of simulated drug, metabolomics and proteomics data for 500 individuals runs fine on a standard macbook. 
+> Note: The pip installation of `move-dl` does not set up your local GPU automatically + # The MOVE pipeline MOVE has five-six steps: From 92bced0148f61b84372388c7df87ef6aff40b491 Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 23 Dec 2022 09:18:53 +0100 Subject: [PATCH 6/6] :memo: small changes and hints --- tutorial/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tutorial/README.md b/tutorial/README.md index a047b0eb..557ebb70 100644 --- a/tutorial/README.md +++ b/tutorial/README.md @@ -4,7 +4,7 @@ We have provided a tutorial. In this first tutorial, we inspect datasets reporting whether 500 fictitious individuals have taken one of 20 imaginary -drugs. We have included a pair of pretend omics datasets, with measurements +drugs. We have included a pair of simulated omics datasets, with measurements for each sample (individual). All these measurements were generated randomly, but we have added 200 associations between different pairs of drugs and omics features. Let us find them with MOVE! @@ -146,10 +146,11 @@ reconstructing our input data and generating an informative latent space. Run: >>> move-dl data=random_small task=random_small__latent ``` -:arrow_up: This command will create four types of plot: +:arrow_up: This command will create four types of plot in the `results/latent_space` folder: -- Loss curve shows the overall loss, KLD term, binary cross-entropy term, and -sum of squared errors term over number of training epochs. +- Loss curve shows the overall loss and each of its three components: + Kullback-Leibler divergence (KLD) term, binary cross-entropy term, + and sum of squared errors term over number of training epochs. - Reconstructions metrics boxplot shows a score (accuracy or cosine similarity for categorical and continuous datasets, respectively) per reconstructed dataset. @@ -171,7 +172,8 @@ and the omics features. 
Run: >>> move-dl data=random_small task=random_small__id_assoc_ttest ``` -:arrow_up: This command will create a `results_sig_assoc.tsv` file, listing +:arrow_up: This command will create a `results_sig_assoc.tsv` +file in `results/identify_associations`, listing each pair of associated features and the corresponding median p-value for such association. There should be ~120 associations found.