From 6e84ecf3bbeb80ce61f3bad084e57a5270afea03 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown"
Date: Mon, 10 Jun 2024 10:22:38 -0700
Subject: [PATCH] MRG: fix `remaining_bp` output from sourmash gather (#3195)

Adjusts `remaining_bp` from `sourmash gather` to take into account the
original size of the query sketch, not just the identified ones. Adds a
specific test, and updates documentation to provide a bit more detail,
too.

Fixes https://github.com/sourmash-bio/sourmash/issues/3194

TODO:

- [x] actually fixme :)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 doc/classifying-signatures.md |  4 +-
 src/sourmash/search.py        |  5 ++-
 tests/test_sourmash.py        | 71 +++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md
index 498ecefafd..33c3777cbd 100644
--- a/doc/classifying-signatures.md
+++ b/doc/classifying-signatures.md
@@ -556,7 +556,7 @@ Note that order of columns is not guaranteed and may change between versions.
 | `md5` | string | Full md5sum of the match sketch. |
 | `f_match_orig` | float | The fraction of the match in the full query. Rank independent. |
 | `gather_result_rank` | float | Rank of this match in the results. |
-| `remaining_bp` | integer | How many bp remain in the query after subtracting this match, estimated by multiplying remaining hashes by scaled. |
+| `remaining_bp` | integer | How many bp remain in the query after subtracting this match, estimated by multiplying remaining hashes by scaled. Starts at `query_n_hashes`. Unweighted. See `sum_weighted_found` for the related weighted value. |
 | `query_filename` | string | The filename from which the query was loaded. |
 | `query_name` | string | The query sketch name. |
 | `query_md5` | string | Truncated md5sum of the query sketch. |
@@ -564,7 +564,7 @@ Note that order of columns is not guaranteed and may change between versions.
 | `ksize` | integer | K-mer size for the sketches used in the comparison. |
 | `moltype` | string | Molecule type of the comparison. |
 | `scaled` | integer | Scaled value of the comparison. |
-| `query_n_hashes` | integer | Number of hashes in the query sketch. |
+| `query_n_hashes` | integer | Number of hashes in the query sketch. Unweighted. See `total_weighted_hashes` for weighted value. |
 | `query_abundance` | boolean | True if the query has abundance information; False otherwise. |
 | `query_containment_ani` | float | ANI estimated from the query containment in the match. |
 | `match_containment_ani` | float | ANI estimated from the match containment in the query. |
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index f730d1daf5..3ac9ab6a75 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -476,6 +476,7 @@ class GatherResult(PrefetchResult):
     orig_query_abunds: list = None
     sum_weighted_found: int = None
     total_weighted_hashes: int = None
+    noident_len: int = 0

     gather_write_cols = [
         "intersect_bp",
@@ -589,7 +590,8 @@ def build_gather_result(self):

         # here, need to make sure to use the mh1_cmp (bc was downsampled to cmp_scaled)
         self.remaining_bp = (
-            self.gather_comparison.mh1_cmp.unique_dataset_hashes
+            self.noident_len
+            + self.gather_comparison.mh1_cmp.unique_dataset_hashes
             - self.gather_comparison.total_unique_intersect_hashes
         )

@@ -937,6 +939,7 @@ def __next__(self):
             estimate_ani_ci=self.estimate_ani_ci,
             sum_weighted_found=sum_weighted_found,
             total_weighted_hashes=total_weighted_hashes,
+            noident_len=len(self.noident_mh) * self.noident_mh.scaled,
         )

         self.result_n += 1
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index c3f0858395..dfb4b1eda8 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -4539,6 +4539,77 @@ def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather):
     assert "No matches found for --threshold-bp at 50.0 kbp." in runtmp.last_result.err


+def test_gather_metagenome_3_thermo(runtmp):
+    # test gather matches in more detail.
+    match1 = "gather/GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig"
+    match2 = "gather/GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig"
+    match3 = "gather/GCF_000008545.1_ASM854v1_genomic.fna.gz.sig"
+
+    match1 = utils.get_test_data(match1)
+    match2 = utils.get_test_data(match2)
+    match3 = utils.get_test_data(match3)
+
+    query_sig = utils.get_test_data("gather/combined.sig")
+
+    runtmp.sourmash(
+        "gather",
+        query_sig,
+        match1,
+        match2,
+        match3,
+        "-k",
+        "21",
+        "--threshold-bp=0",
+        "-o",
+        "match3.csv",
+    )
+
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    outfile = runtmp.output("match3.csv")
+    with sourmash_args.FileInputCSV(outfile) as r:
+        rows = list(r)
+
+    assert len(rows) == 3
+
+    # first row
+    row = rows[0]
+    assert row["name"].startswith("NC_000853.1 ")
+    f_match = float(row["f_match"])
+    f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
+    unique_intersect_bp = int(row["unique_intersect_bp"])
+    remaining_bp = int(row["remaining_bp"])
+    assert f_match == 1.0
+    assert f_unique_to_query == round(0.13096862, 5)
+    assert unique_intersect_bp == 1920000
+    assert remaining_bp == 12740000
+
+    # second row
+    row = rows[1]
+    assert row["name"].startswith("NC_011978.1 ")
+    f_match = float(row["f_match"])
+    f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
+    unique_intersect_bp = int(row["unique_intersect_bp"])
+    remaining_bp = int(row["remaining_bp"])
+    assert round(f_match, 5) == round(0.898936170212766, 5)
+    assert f_unique_to_query == round(0.115279, 5)
+    assert unique_intersect_bp == 1690000
+    assert remaining_bp == 11050000
+
+    # third row
+    row = rows[2]
+    assert row["name"].startswith("NC_009486.1 ")
+    f_match = float(row["f_match"])
+    f_unique_to_query = round(float(row["f_unique_to_query"]), 5)
+    unique_intersect_bp = int(row["unique_intersect_bp"])
+    remaining_bp = int(row["remaining_bp"])
+    assert round(f_match, 5) == round(0.4842105, 5)
+    assert f_unique_to_query == round(0.0627557, 5)
+    assert unique_intersect_bp == 920000
+    assert remaining_bp == 10130000
+
+
 def test_gather_metagenome(runtmp):
     testdata_glob = utils.get_test_data("gather/GCF*.sig")
     testdata_sigs = glob.glob(testdata_glob)
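
As a sanity check on the fixed bookkeeping (this sketch is not part of the patch above): in any gather CSV written with `-o`, such as the `match3.csv` produced by the new test, each row's `remaining_bp` should now drop by exactly that row's `unique_intersect_bp`, since both columns are unweighted hash counts multiplied by `scaled`. A minimal sketch using only the Python standard library:

    import csv

    # "match3.csv" is the example output name taken from the new test above;
    # substitute any CSV written by `sourmash gather ... -o <csv>`.
    with open("match3.csv", newline="") as fp:
        rows = list(csv.DictReader(fp))

    # remaining_bp is unweighted and should decrease by unique_intersect_bp
    # from one rank to the next.
    for prev, cur in zip(rows, rows[1:]):
        drop = int(prev["remaining_bp"]) - int(cur["remaining_bp"])
        assert drop == int(cur["unique_intersect_bp"]), (prev["name"], cur["name"])

For the test values above, that is 12740000 - 11050000 == 1690000 for the second match and 11050000 - 10130000 == 920000 for the third.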