From 0cebb0ad2965fe115611039911c56900ec095f0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Thu, 14 Sep 2023 16:53:33 +0200 Subject: [PATCH 1/6] :sparkles: Allow turning off input scaling --- src/move/conf/schema.py | 6 +++++- src/move/tasks/encode_data.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/move/conf/schema.py b/src/move/conf/schema.py index c9dee984..d7832830 100644 --- a/src/move/conf/schema.py +++ b/src/move/conf/schema.py @@ -28,6 +28,10 @@ class InputConfig: name: str weight: int = 1 +@dataclass +class ContinuousInputConfig(InputConfig): + scale: bool = True + @dataclass class DataConfig: @@ -36,7 +40,7 @@ class DataConfig: results_path: str = MISSING sample_names: str = MISSING categorical_inputs: list[InputConfig] = MISSING - continuous_inputs: list[InputConfig] = MISSING + continuous_inputs: list[ContinuousInputConfig] = MISSING categorical_names: list[str] = MISSING continuous_names: list[str] = MISSING categorical_weights: list[int] = MISSING diff --git a/src/move/tasks/encode_data.py b/src/move/tasks/encode_data.py index 5092064a..53749390 100644 --- a/src/move/tasks/encode_data.py +++ b/src/move/tasks/encode_data.py @@ -39,12 +39,15 @@ def encode_data(config: DataConfig): if mappings: io.dump_mappings(interim_data_path / "mappings.json", mappings) - for dataset_name in config.continuous_names: - logger.info(f"Encoding '{dataset_name}'") - filepath = raw_data_path / f"{dataset_name}.tsv" + for input_config in config.continuous_inputs: + scale = not hasattr(input_config, "scale") or input_config.scale + action_name = "Encoding" if scale else "Reading" + logger.info(f"{action_name} '{input_config.name}'") + filepath = raw_data_path / f"{input_config.name}.tsv" names, values = io.read_tsv(filepath, sample_names) - values, mask_1d = preprocessing.scale(values) - names = names[mask_1d] - logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}") - 
io.dump_names(interim_data_path / f"{dataset_name}.txt", names) - np.save(interim_data_path / f"{dataset_name}.npy", values) + if scale: + values, mask_1d = preprocessing.scale(values) + names = names[mask_1d] + logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}") + io.dump_names(interim_data_path / f"{input_config.name}.txt", names) + np.save(interim_data_path / f"{input_config.name}.npy", values) From 039f6ce1e96224f63c6166d5370bd97c96102c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Fri, 2 Feb 2024 15:25:15 +0100 Subject: [PATCH 2/6] :bug: Keep dims if NaN row --- src/move/analysis/metrics.py | 12 ++++++------ src/move/tasks/analyze_latent.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/move/analysis/metrics.py b/src/move/analysis/metrics.py index 35f5bc60..180126f1 100644 --- a/src/move/analysis/metrics.py +++ b/src/move/analysis/metrics.py @@ -33,9 +33,9 @@ def calculate_accuracy( y_pred = np.ma.masked_array(reconstruction, mask=is_nan) num_features = np.ma.count(y_true, axis=1) - scores = np.ma.filled(np.sum(y_true == y_pred, axis=1) / num_features, 0) + scores = np.sum(y_true == y_pred, axis=1) / num_features - return scores + return np.ma.filled(scores, 0) def calculate_cosine_similarity( @@ -64,12 +64,12 @@ def calculate_cosine_similarity( # Equivalent to `np.diag(sklearn.metrics.pairwise.cosine_similarity(x, y))` # But can handle masked arrays - scores = np.ma.compressed(np.sum(x * y, axis=1)) / (norm(x) * norm(y)) + scores = np.sum(x * y, axis=1) / (norm(x) * norm(y)) - return scores + return np.ma.filled(scores, 0) -def norm(x: np.ma.MaskedArray, axis: int = 1) -> FloatArray: +def norm(x: np.ma.MaskedArray, axis: int = 1) -> np.ma.MaskedArray: """Return Euclidean norm. This function is equivalent to `np.linalg.norm`, but it can handle masked arrays. 
@@ -80,4 +80,4 @@ def norm(x: np.ma.MaskedArray, axis: int = 1) -> FloatArray: Returns: 1D array with the specified axis removed. """ - return np.ma.compressed(np.sqrt(np.sum(x**2, axis=axis))) + return np.sqrt(np.sum(x**2, axis=axis)) diff --git a/src/move/tasks/analyze_latent.py b/src/move/tasks/analyze_latent.py index 132a371a..788d08d5 100644 --- a/src/move/tasks/analyze_latent.py +++ b/src/move/tasks/analyze_latent.py @@ -220,7 +220,9 @@ def analyze_latent(config: MOVEConfig) -> None: scores.append(cosine_sim) logger.debug("Generating plot: reconstruction metrics") - fig = viz.plot_metrics_boxplot(scores, labels) + + plot_scores = [np.ma.compressed(np.ma.masked_equal(each, 0)) for each in scores] + fig = viz.plot_metrics_boxplot(plot_scores, labels) fig_path = str(output_path / "reconstruction_metrics.png") fig.savefig(fig_path, bbox_inches="tight") fig_df = pd.DataFrame(dict(zip(labels, scores)), index=df_index) From a150c7646a1f9eea7222f857f98708ab281e4b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Fri, 2 Feb 2024 15:51:44 +0100 Subject: [PATCH 3/6] :see_no_evil: Ignore non-default files - Ignore non-default stuff from root folder, tutorial folders, etc --- .gitignore | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index d9b88844..26e3ab4b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,9 @@ __pycache__/ *.py[cod] -# NumPy binary files -data*/*.npy +# NumPy/PyTorch binary files +*.npy +*.pt # Distribution and packaging files build/ @@ -31,10 +32,13 @@ outputs/ *.log # Tutorial files -**/interim_data/ -**/processed_data/ -**/results/ -tutorial/maize/data +tutorial/* +!tutorial/config/*maize*.yaml +!tutorial/config/*random_small*.yaml +!tutorial/data +!tutorial/maize/maize_dataset.py +!tutorial/notebooks/*.ipynb +!tutorial/README.md # Virtual environment venv/ @@ -42,4 +46,15 @@ virtualvenv/ # docs files docs/build/ -docs/source/_templates/ \ No 
newline at end of file +docs/source/_templates/ + +# Root folder +/*.* +!/.gitignore +!/.readthedocs.yaml +!/LICENSE +!/MANIFEST.in +!/README.md +!/pyproject.toml +!/requirements.txt +!/setup.cfg From 4151e709a3c945e351f226875f9e18a78139221a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Hern=C3=A1ndez=20Medina?= Date: Fri, 2 Feb 2024 16:33:58 +0100 Subject: [PATCH 4/6] :bookmark: Update version number --- src/move/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/move/__init__.py b/src/move/__init__.py index a4ce9fd3..a4afcdcb 100644 --- a/src/move/__init__.py +++ b/src/move/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations __license__ = "MIT" -__version__ = (1, 4, 9) +__version__ = (1, 4, 10) __all__ = ["conf", "data", "models", "training_loop", "VAE"] HYDRA_VERSION_BASE = "1.2" From 17aeb0197bbd4939908c3e3b49ff91af049c76c9 Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 23 Dec 2022 08:50:32 +0100 Subject: [PATCH 5/6] :memo: packages for running pytorch on GPU not installed --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4f7eee22..e5e7e6c6 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ you do not have powerful GPUs available, it is possible to run using only CPUs. For instance, the tutorial data set consisting of simulated drug, metabolomics and proteomics data for 500 individuals runs fine on a standard macbook. 
+> Note: The pip installation of `move-dl` does not set up your local GPU automatically + # The MOVE pipeline MOVE has five-six steps: From 92bced0148f61b84372388c7df87ef6aff40b491 Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 23 Dec 2022 09:18:53 +0100 Subject: [PATCH 6/6] :memo: small changes and hints --- tutorial/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tutorial/README.md b/tutorial/README.md index a047b0eb..557ebb70 100644 --- a/tutorial/README.md +++ b/tutorial/README.md @@ -4,7 +4,7 @@ We have provided a tutorial. In this first tutorial, we inspect datasets reporting whether 500 fictitious individuals have taken one of 20 imaginary -drugs. We have included a pair of pretend omics datasets, with measurements +drugs. We have included a pair of simulated omics datasets, with measurements for each sample (individual). All these measurements were generated randomly, but we have added 200 associations between different pairs of drugs and omics features. Let us find them with MOVE! @@ -146,10 +146,11 @@ reconstructing our input data and generating an informative latent space. Run: >>> move-dl data=random_small task=random_small__latent ``` -:arrow_up: This command will create four types of plot: +:arrow_up: This command will create four types of plot in the `results/latent_space` folder: -- Loss curve shows the overall loss, KLD term, binary cross-entropy term, and -sum of squared errors term over number of training epochs. +- Loss curve shows the overall loss and each of its three components: + Kullback-Leibler divergence (KLD) term, binary cross-entropy term, + and sum of squared errors term over number of training epochs. - Reconstructions metrics boxplot shows a score (accuracy or cosine similarity for categorical and continuous datasets, respectively) per reconstructed dataset. @@ -171,7 +172,8 @@ and the omics features. 
Run: >>> move-dl data=random_small task=random_small__id_assoc_ttest ``` -:arrow_up: This command will create a `results_sig_assoc.tsv` file, listing +:arrow_up: This command will create a `results_sig_assoc.tsv` +file in `results/identify_associations`, listing each pair of associated features and the corresponding median p-value for such association. There should be ~120 associations found.