added support for subset > data size (#40)

Co-authored-by: Jspaezp <[email protected]>
wfondrie · Sep 3, 2021 · 5148da1
1 parent 4f76a56
commit 5148da1
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog for mokapot  
 
+## [0.7.4] - 2021-09-03
+### Changed
+- Improved documentation and added warnings for `--subset_max_train`. Thanks
+  @jspaezp!
+
 ## [0.7.3] - 2021-07-20
 ### Fixed
 - Fixed bug where the `--keep_decoys` did not work with `--aggregate`. Also,

diff --git a/mokapot/config.py b/mokapot/config.py
@@ -216,8 +216,10 @@ def _parser():
         type=int,
         default=None,
         help=(
-            "Use only a random subset of PSMs for training. "
-            "This is useful for very large datasets."
+            "Maximum number of PSMs to use during the training "
+            "of each of the cross validation folds in the model. "
+            "This is useful for very large datasets and will be "
+            "ignored if less PSMS are available."
         ),
     )
 

diff --git a/mokapot/model.py b/mokapot/model.py
@@ -266,12 +266,26 @@ def fit(self, psms):
             )
 
         if self.subset_max_train is not None:
-            subset_idx = np.random.choice(
-                len(psms), self.subset_max_train, replace=False
-            )
-
-            psms = copy.copy(psms)
-            psms._data = psms._data.iloc[subset_idx, :]
+            if self.subset_max_train > len(psms):
+                LOGGER.warning(
+                    "The provided subset value (%i) is larger than the number "
+                    "of psms in the training split (%i), so it will be "
+                    "ignored.",
+                    self.subset_max_train,
+                    len(psms),
+                )
+            else:
+                LOGGER.info(
+                    "Subsetting PSMs (%i) to (%i).",
+                    len(psms),
+                    self.subset_max_train,
+                )
+                subset_idx = np.random.choice(
+                    len(psms), self.subset_max_train, replace=False
+                )
+
+                psms = copy.copy(psms)
+                psms._data = psms._data.iloc[subset_idx, :]
 
         # Choose the initial direction
         start_labels, feat_pass = _get_starting_labels(psms, self)

diff --git a/tests/system_tests/test_cli.py b/tests/system_tests/test_cli.py
@@ -67,6 +67,8 @@ def test_cli_options(tmp_path, scope_files):
         "--max_iter",
         "1",
         "--keep_decoys",
+        "--subset_max_train",
+        "50000",
     ]
 
     subprocess.run(cmd, check=True)

diff --git a/tests/unit_tests/test_model.py b/tests/unit_tests/test_model.py
@@ -72,6 +72,18 @@ def test_model_fit(psms):
     assert model.is_trained
 
 
+def test_model_fit_large_subset(psms):
+    model = mokapot.Model(
+        LogisticRegression(),
+        train_fdr=0.05,
+        max_iter=1,
+        subset_max_train=2_000_000_000,
+    )
+    model.fit(psms)
+
+    assert model.is_trained
+
+
 def test_model_predict(psms):
     """Test predictions"""
     model = mokapot.Model(LogisticRegression(), train_fdr=0.05, max_iter=1)