From 6e84ecf3bbeb80ce61f3bad084e57a5270afea03 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown"
Date: Mon, 10 Jun 2024 10:22:38 -0700
Subject: [PATCH] MRG: fix `remaining_bp` output from sourmash gather (#3195)

Adjusts `remaining_bp` from `sourmash gather` to take into account the
original size of the query sketch, not just the identified ones. Adds a
specific test, and updates documentation to provide a bit more detail,
too.

Fixes https://github.com/sourmash-bio/sourmash/issues/3194

TODO:

- [x] actually fixme :)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 doc/classifying-signatures.md |  4 +-
 src/sourmash/search.py        |  5 ++-
 tests/test_sourmash.py        | 71 +++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md
index 498ecefafd..33c3777cbd 100644
--- a/doc/classifying-signatures.md
+++ b/doc/classifying-signatures.md
@@ -556,7 +556,7 @@ Note that order of columns is not guaranteed and may change between versions.
 | `md5` | string | Full md5sum of the match sketch. |
 | `f_match_orig` | float | The fraction of the match in the full query. Rank independent. |
 | `gather_result_rank` | float | Rank of this match in the results. |
-| `remaining_bp` | integer | How many bp remain in the query after subtracting this match, estimated by multiplying remaining hashes by scaled. |
+| `remaining_bp` | integer | How many bp remain in the query after subtracting this match, estimated by multiplying remaining hashes by scaled. Starts at `query_n_hashes`. Unweighted. See `sum_weighted_found` for the related weighted value. |
 | `query_filename` | string | The filename from which the query was loaded. |
 | `query_name` | string | The query sketch name. |
 | `query_md5` | string | Truncated md5sum of the query sketch. |
@@ -564,7 +564,7 @@ Note that order of columns is not guaranteed and may change between versions.
 | `ksize` | integer | K-mer size for the sketches used in the comparison. |
 | `moltype` | string | Molecule type of the comparison. |
 | `scaled` | integer | Scaled value of the comparison. |
-| `query_n_hashes` | integer | Number of hashes in the query sketch. |
+| `query_n_hashes` | integer | Number of hashes in the query sketch. Unweighted. See `total_weighted_hashes` for weighted value. |
 | `query_abundance` | boolean | True if the query has abundance information; False otherwise. |
 | `query_containment_ani` | float | ANI estimated from the query containment in the match. |
 | `match_containment_ani` | float | ANI estimated from the match containment in the query. |
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index f730d1daf5..3ac9ab6a75 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -476,6 +476,7 @@ class GatherResult(PrefetchResult):
     orig_query_abunds: list = None
     sum_weighted_found: int = None
     total_weighted_hashes: int = None
+    noident_len: int = 0

     gather_write_cols = [
         "intersect_bp",
@@ -589,7 +590,8 @@ def build_gather_result(self):

         # here, need to make sure to use the mh1_cmp (bc was downsampled to cmp_scaled)
         self.remaining_bp = (
-            self.gather_comparison.mh1_cmp.unique_dataset_hashes
+            self.noident_len
+            + self.gather_comparison.mh1_cmp.unique_dataset_hashes
             - self.gather_comparison.total_unique_intersect_hashes
         )

@@ -937,6 +939,7 @@ def __next__(self):
             estimate_ani_ci=self.estimate_ani_ci,
             sum_weighted_found=sum_weighted_found,
             total_weighted_hashes=total_weighted_hashes,
+            noident_len=len(self.noident_mh) * self.noident_mh.scaled,
         )

         self.result_n += 1
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index c3f0858395..dfb4b1eda8 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -4539,6 +4539,77 @@ def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather):
     assert "No matches found for --threshold-bp at 50.0 kbp." in runtmp.last_result.err


+def test_gather_metagenome_3_thermo(runtmp):
+    # test gather matches in more detail.
+    match1 = "gather/GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig"
+    match2 = "gather/GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig"
+    match3 = "gather/GCF_000008545.1_ASM854v1_genomic.fna.gz.sig"
+
+    match1 = utils.get_test_data(match1)
+    match2 = utils.get_test_data(match2)
+    match3 = utils.get_test_data(match3)
+
+    query_sig = utils.get_test_data("gather/combined.sig")
+
+    runtmp.sourmash(
+        "gather",
+        query_sig,
+        match1,
+        match2,
+        match3,
+        "-k",
+        "21",
+        "--threshold-bp=0",
+        "-o",
+        "match3.csv",
+    )
+
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    outfile = runtmp.output("match3.csv")
+    with sourmash_args.FileInputCSV(outfile) as r:
+        rows = list(r)
+
+    assert len(rows) == 3
+
+    # first row
+    row = rows[0]
+    assert row["name"].startswith("NC_000853.1 ")
+    f_match = float(row["f_match"])
+    f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
+    unique_intersect_bp = int(row["unique_intersect_bp"])
+    remaining_bp = int(row["remaining_bp"])
+    assert f_match == 1.0
+    assert f_unique_to_query == round(0.13096862, 5)
+    assert unique_intersect_bp == 1920000
+    assert remaining_bp == 12740000
+
+    # second row
+    row = rows[1]
+    assert row["name"].startswith("NC_011978.1 ")
+    f_match = float(row["f_match"])
+    f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
+    unique_intersect_bp = int(row["unique_intersect_bp"])
+    remaining_bp = int(row["remaining_bp"])
+    assert round(f_match, 5) == round(0.898936170212766, 5)
+    assert f_unique_to_query == round(0.115279, 5)
+    assert unique_intersect_bp == 1690000
+    assert remaining_bp == 11050000
+
+    # third row
+    row = rows[2]
+    assert row["name"].startswith("NC_009486.1 ")
+    f_match = float(row["f_match"])
+    f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
+    unique_intersect_bp = int(row["unique_intersect_bp"])
+    remaining_bp = int(row["remaining_bp"])
+    assert round(f_match, 5) == round(0.4842105, 5)
+    assert f_unique_to_query == round(0.0627557, 5)
+    assert unique_intersect_bp == 920000
+    assert remaining_bp == 10130000
+
+
 def test_gather_metagenome(runtmp):
     testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)
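
As a sanity check on the fixed bookkeeping (this sketch is not part of the patch above): in any gather CSV written with `-o`, such as the `match3.csv` produced by the new test, each row's `remaining_bp` should now drop by exactly that row's `unique_intersect_bp`, since both columns are unweighted hash counts multiplied by `scaled`. A minimal sketch using only the Python standard library:

    import csv

    # "match3.csv" is the example output name taken from the new test above;
    # substitute any CSV written by `sourmash gather ... -o <csv>`.
    with open("match3.csv", newline="") as fp:
        rows = list(csv.DictReader(fp))

    # remaining_bp is unweighted and should decrease by unique_intersect_bp
    # from one rank to the next.
    for prev, cur in zip(rows, rows[1:]):
        drop = int(prev["remaining_bp"]) - int(cur["remaining_bp"])
        assert drop == int(cur["unique_intersect_bp"]), (prev["name"], cur["name"])

For the test values above, that is 12740000 - 11050000 == 1690000 for the second match and 11050000 - 10130000 == 920000 for the third.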