From 5c904556b7e07060a42a62a6631635e944426c71 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Sep 2023 15:19:27 -0400 Subject: [PATCH 01/47] start wiring in new rgfa interface --- src/algorithms/gfa_to_handle.cpp | 26 +- src/gfa.cpp | 44 ++- src/rgfa.cpp | 654 +++++++++++++++++++++++++++++++ src/rgfa.hpp | 115 ++++++ 4 files changed, 821 insertions(+), 18 deletions(-) create mode 100644 src/rgfa.cpp create mode 100644 src/rgfa.hpp diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index 65f19320676..ea0ae94dfc5 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -1,6 +1,6 @@ #include "gfa_to_handle.hpp" #include "../path.hpp" - +#include "../rgfa.hpp" #include namespace vg { @@ -300,16 +300,24 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* // Start later than 0 subrange = std::pair(offset, PathMetadata::NO_END_POSITION); } + + string rgfa_path_name; + if (path_rank > 0) { + // Special logic for off-reference paths, which get loaded into special rGFA cover paths + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length); + } else { + // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. + rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, + PathMetadata::NO_SAMPLE_NAME, + path_name, + PathMetadata::NO_HAPLOTYPE, + PathMetadata::NO_PHASE_BLOCK, + subrange); + } + path_handle_t path = graph->create_path_handle(rgfa_path_name); - // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. - path_handle_t path = graph->create_path(PathSense::GENERIC, - PathMetadata::NO_SAMPLE_NAME, - path_name, - PathMetadata::NO_HAPLOTYPE, - PathMetadata::NO_PHASE_BLOCK, - subrange); // Then cache it - found = rgfa_cache->emplace_hint(found, path_name, std::make_pair(path, offset)); + found = rgfa_cache->emplace_hint(found, rgfa_path_name, std::make_pair(path, offset)); } // Add the step to the path diff --git a/src/gfa.cpp b/src/gfa.cpp index 7b9453333c4..3f3344d16cd 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -1,6 +1,7 @@ #include "gfa.hpp" #include "utility.hpp" #include "path.hpp" +#include "rgfa.hpp" #include #include @@ -18,7 +19,16 @@ static void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths, bool rgfa_pline, bool use_w_lines) { - + + RGFACover rgfa_cover; + if (!rgfa_paths.empty()) { + // index the rgfa cover in the graph (combination of rank-0 rgfa_paths and rank-1 paths with rgfa sample name) + unordered_set rgfa_path_handles; + for (const string& rgfa_path_name : rgfa_paths) { + rgfa_path_handles.insert(graph->get_path_handle(rgfa_path_name)); + } + rgfa_cover.load(graph, rgfa_path_handles); + } // TODO: Support sorting nodes, paths, and/or edges for canonical output // TODO: Use a NamedNodeBackTranslation (or forward translation?) to properly round-trip GFA that has had to be chopped. @@ -40,12 +50,18 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } out << "\n"; - //Compute the rGFA tags of given paths (todo: support non-zero ranks) + //Compute the rGFA tags of given paths. These paths are the rank-0 reference paths (passed in rgfa_paths set) + //along with paths that have the special rgfa sample name (rank>0) paths. 
unordered_map> node_offsets; - for (const string& path_name : rgfa_paths) { - path_handle_t path_handle = graph->get_path_handle(path_name); - size_t offset = 0; - graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (rgfa_paths.count(path_name) || RGFACover::is_rgfa_path_name(path_name)) { + size_t offset = 0; + subrange_t path_subrange = graph->get_subrange(path_handle); + if (path_subrange != PathMetadata::NO_SUBRANGE) { + offset = path_subrange.first; + } + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { handle_t handle = graph->get_handle_of_step(step_handle); nid_t node_id = graph->get_id(handle); if (graph->get_is_reverse(handle)) { @@ -62,7 +78,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } offset += graph->get_length(handle); }); - } + } + }); //Go through each node in the graph graph->for_each_handle([&](const handle_t& h) { @@ -73,9 +90,18 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& auto it = node_offsets.find(node_id); if (it != node_offsets.end()) { // add rGFA tags - out << "\t" << "SN:Z:" << graph->get_path_name(it->second.first) + string sample_name = graph->get_sample_name(it->second.first); + string locus_name = graph->get_locus_name(it->second.first); + int64_t haplotype = graph->get_haplotype(it->second.first); + if (RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { + std::tie(sample_name, locus_name) = RGFACover::parse_rgfa_locus_name(locus_name); + } + string rgfa_sn = PathMetadata::create_path_name(PathSense::REFERENCE, sample_name, locus_name, + haplotype, PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + out << "\t" << "SN:Z:" << rgfa_sn << "\t" << "SO:i:" << it->second.second - << "\t" << "SR:i:0"; // todo: support non-zero ranks? + << "\t" << "SR:i:" << rgfa_cover.get_rank(node_id); } out << "\n"; // Writing `std::endl` would flush the buffer. return true; diff --git a/src/rgfa.cpp b/src/rgfa.cpp new file mode 100644 index 00000000000..1215dd486f6 --- /dev/null +++ b/src/rgfa.cpp @@ -0,0 +1,654 @@ +#include "rgfa.hpp" +#include +#include + +//#define debug + +namespace vg { + +using namespace std; + +const string RGFACover::rgfa_sample_name = "_rGFA_"; + +string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, int64_t length) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + // we move the sample into the locus + // todo: is there something nicer? + string rgfa_locus; + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + if (path_sample != PathMetadata::NO_SAMPLE_NAME) { + rgfa_locus = "SN:Z:" + path_sample; + } + // the contig name will be behind SC... 
+ rgfa_locus += ":SC:Z:" + path_locus; + + // we apply the subrange offset + path_subrange.first += start; + path_subrange.second = path_subrange.first + length; + + + // and return the final path, with sample/locus/rgfa-rank embedded in locus + // (as it's a reference path, we alsos strip out the phase block) + return PathMetadata::create_path_name(PathSense::REFERENCE, RGFACover::rgfa_sample_name, + rgfa_locus, path_haplotype, + PathMetadata::NO_PHASE_BLOCK, path_subrange); + +} + +bool RGFACover::is_rgfa_path_name(const string& path_name) { + return path_name == RGFACover::rgfa_sample_name; +} + +pair RGFACover::parse_rgfa_locus_name(const string& locus_name) { + pair sample_locus; + vector toks = split_delims(locus_name, ":"); + assert(toks.size() % 3 == 0); + for (int64_t i = 0; i < toks.size(); i+=3) { + if (toks[i] == "SN") { + assert(toks[i+1] == "Z"); + sample_locus.first = toks[i+2]; + } else if (toks[i] == "SC") { + assert(toks[i+1] == "Z"); + sample_locus.second = toks[i+2]; + } else { + assert(false); + } + } + return sample_locus; +} + + +void RGFACover::clear(MutablePathMutableHandleGraph* graph) { + vector rgfa_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + rgfa_paths.push_back(path_handle); + }); + for (path_handle_t path_handle : rgfa_paths) { + graph->destroy_path(path_handle); + } +} + +void RGFACover::compute(const PathHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length) { + + // start from scratch + this->rgfa_intervals.clear(); + this->node_to_interval.clear(); + this->graph = graph; + + // start with the reference paths + for (const path_handle_t& ref_path_handle : reference_paths) { + this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_back(ref_path_handle))); + graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (node_to_interval.count(node_id)) { + cerr << "[rgfa error]: node " << node_id << " covered by two reference paths," + << " including " << graph->get_path_name(ref_path_handle) + << ". 
rGFA support currently requires disjoint acyclic reference paths" << endl;
+                    exit(1);
+                }
+                node_to_interval[node_id] = rgfa_intervals.size() - 1;
+            });
+    }
+
+    // we use the path traversal finder for everything
+    // (even gbwt haplotypes, as we're using the path handle interface)
+    PathTraversalFinder path_trav_finder(*graph, *snarl_manager);
+
+    // we collect the rgfa cover in parallel as a list of path fragments
+    size_t thread_count = get_thread_count();
+    vector<vector<pair<step_handle_t, step_handle_t>>> rgfa_intervals_vector(thread_count);
+    vector<unordered_map<nid_t, int64_t>> node_to_interval_vector(thread_count);
+
+    // we process top-level snarls in parallel
+    snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) {
+        // per-thread output
+        vector<pair<step_handle_t, step_handle_t>>& thread_rgfa_intervals = rgfa_intervals_vector[omp_get_thread_num()];
+        unordered_map<nid_t, int64_t>& thread_node_to_interval = node_to_interval_vector[omp_get_thread_num()];
+
+        vector<const Snarl*> queue = {snarl};
+
+        while(!queue.empty()) {
+            const Snarl* cur_snarl = queue.back();
+            queue.pop_back();
+
+            // get the snarl cover
+            compute_snarl(*cur_snarl, path_trav_finder, minimum_length,
+                          thread_rgfa_intervals,
+                          thread_node_to_interval);
+
+
+            // recurse on the children
+            const vector<const Snarl*>& children = snarl_manager->children_of(cur_snarl);
+            for (const Snarl* child_snarl : children) {
+                queue.push_back(child_snarl);
+            }
+        }
+    });
+
+    // now we need to fold up the thread covers
+    for (int64_t t = 0; t < thread_count; ++t) {
+        int64_t offset = this->rgfa_intervals.size();
+
+        // todo: figure out one-shot stl invocation to move
+        for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) {
+            this->rgfa_intervals.push_back(rgfa_intervals_vector[t][j]);
+        }
+        rgfa_intervals_vector[t].clear();
+
+        for (const auto& node_interval : node_to_interval_vector[t]) {
+            this->node_to_interval[node_interval.first] = node_interval.second + offset;
+        }
+        node_to_interval_vector[t].clear();
+    }
+
+    // todo: we could optionally go through uncovered nodes and try to greedily search
+    // for rgfa intervals at this point, since it seems for some graphs there are
+    // regions that don't get found via the traversal finder
+
+}
+
+void RGFACover::load(const PathHandleGraph* graph,
+                     const unordered_set<path_handle_t>& reference_paths) {
+    // start from scratch
+    this->rgfa_intervals.clear();
+    this->node_to_interval.clear();
+    this->graph = graph;
+
+    // start with the reference paths
+    for (const path_handle_t& ref_path_handle : reference_paths) {
+        this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle),
+                                                 graph->path_back(ref_path_handle)));
+        graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) {
+                node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1;
+            });
+    }
+
+    // then the rgfa cover paths
+    // since we want to keep our structures in terms of original paths, we have to map back
+    // to them here (if we don't have original paths, then we can't find the overlaps and
+    // therefore nesting relationships between them)
+ + // we start by making a little index in two scans, as I'm worried about quadratic path scan below otherwise + // (this does not make guarantees about degenerate fragmentation related cases tho) + unordered_map, vector> sample_locus_to_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + sample_locus_to_paths[parse_rgfa_locus_name(graph->get_path_name(path_handle))] = {}; + }); + graph->for_each_path_handle([&](path_handle_t path_handle) { + pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); + if (sample_locus_to_paths.count(sample_locus)) { + sample_locus_to_paths[sample_locus].push_back(path_handle); + } + }); + + // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval + // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph + // so should probably have a better error message than the asserts below (ie if exact interval match not found) + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + // pase the rgfa locus to get the original sample and locus + pair source_sample_locus = parse_rgfa_locus_name(graph->get_path_name(path_handle)); + // find the sample in our index + const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); + // find the containing path + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + int64_t rgfa_haplotype = graph->get_haplotype(path_handle); + const path_handle_t* source_path = nullptr; + subrange_t source_subrange; + for (const path_handle_t& source_path_candidate : source_paths) { + if (graph->get_haplotype(source_path_candidate) == rgfa_haplotype) { + source_subrange = graph->get_subrange(source_path_candidate); + if (source_subrange == PathMetadata::NO_SUBRANGE) { + source_subrange.first = 0; + source_subrange.second = 0; + graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { + source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); + }); + } + if (rgfa_subrange.first >= source_subrange.first && rgfa_subrange.second <= source_subrange.second) { + source_path = &source_path_candidate; + break; + } + } + } + assert(source_path != nullptr); + // now find the exact interval in the containing path and update our data structure + bool found_start = false; + step_handle_t source_start; + int64_t cur_offset = 0; + graph->for_each_step_in_path(*source_path, [&](step_handle_t cur_step) { + if (cur_offset + source_subrange.first == rgfa_subrange.first) { + source_start = cur_step; + found_start = true; + } + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + return !found_start; + }); + assert(found_start); + + bool found_end; + step_handle_t source_end; + cur_offset = 0; + for (step_handle_t cur_step = source_start; graph->has_next_step(cur_step); cur_step = graph->get_next_step(cur_step)) { + if (cur_offset == rgfa_subrange.second) { + found_end = true; + source_end = cur_step; + break; + } + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + } + assert(found_end); + + // we can finally add our interval + this->rgfa_intervals.push_back(make_pair(source_start, source_end)); + for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { + 
node_to_interval[graph->get_id(graph->get_handle_of_step(cur_step))] = rgfa_intervals.size() - 1;
+        }
+    });
+}
+
+void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) {
+    assert(this->graph == static_cast<const PathHandleGraph*>(mutable_graph));
+
+    // compute the offsets in parallel, assuming we don't have a path position index of all paths
+    // todo: we could be smarter about not traversing the same path multiple times!
+    vector<int64_t> rgfa_offsets(this->rgfa_intervals.size());
+    vector<int64_t> rgfa_lengths(this->rgfa_intervals.size());
+#pragma omp parallel for
+    for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) {
+        path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first);
+        // below check is because we don't write the reference path, which is in the cover but
+        // not using the naming convention
+        if (is_rgfa_path_name(graph->get_path_name(source_path_handle))) {
+            rgfa_offsets[i] = 0;
+            mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) {
+                    if (step_handle == rgfa_intervals[i].first) {
+                        return false;
+                    }
+                    rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle));
+                    return true;
+                });
+            rgfa_lengths[i] = 0;
+            for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second;
+                 step_handle = mutable_graph->get_next_step(step_handle)) {
+                rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle));
+            }
+        }
+    }
+
+    // write the rgfa paths
+    for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) {
+        path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first);
+        string source_path_name = graph->get_path_name(source_path_handle);
+        if (is_rgfa_path_name(source_path_name)) {
+            string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]);
+            path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name);
+            for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second;
+                 step_handle = mutable_graph->get_next_step(step_handle)) {
+                mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle));
+            }
+        }
+    }
+
+    this->forwardize_rgfa_paths(mutable_graph);
+}
+
+int64_t RGFACover::get_rank(nid_t node_id) const {
+    if (!node_to_interval.count(node_id)) {
+        return -1;
+    }
+
+    const pair<step_handle_t, step_handle_t>& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id));
+
+    // since our decomposition is based on snarl traversals, we know that fragments must
+    // overlap their parents on snarl end points (at the very least)
+    // therefore we can find parents by scanning along the rgfa paths.
+    step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first);
+    int64_t left_rank = 0;
+    if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) {
+        left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent)));
+    }
+
+    step_handle_t right_parent = graph->get_next_step(rgfa_interval.second);
+    int64_t right_rank = 0;
+    if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) {
+        right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent)));
+    }
+
+    return min(left_rank, right_rank);
+}
+
+step_handle_t RGFACover::get_step(nid_t node_id) const {
+    assert(node_to_interval.count(node_id));
+
+    const pair<step_handle_t, step_handle_t>& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id));
+    path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first);
+    for (step_handle_t step = rgfa_interval.first; step != rgfa_interval.second; step = graph->get_next_step(step)) {
+        if (graph->get_id(graph->get_handle_of_step(step)) == node_id) {
+            return step;
+        }
+    }
+    assert(false);
+    return step_handle_t();
+}
+
+pair<const pair<step_handle_t, step_handle_t>*,
+     const pair<step_handle_t, step_handle_t>*>
+RGFACover::get_parent_intervals(const pair<step_handle_t, step_handle_t>& interval) const {
+
+    pair<const pair<step_handle_t, step_handle_t>*,
+         const pair<step_handle_t, step_handle_t>*> parents = make_pair(nullptr, nullptr);
+
+    // since our decomposition is based on snarl traversals, we know that fragments must
+    // overlap their parents on snarl end points (at the very least)
+    // therefore we can find parents by scanning along the rgfa paths.
+    step_handle_t left_parent = graph->get_previous_step(interval.first);
+    if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(interval.first))) {
+        int64_t interval_idx = this->node_to_interval.at(graph->get_id(graph->get_handle_of_step(left_parent)));
+        parents.first = &this->rgfa_intervals.at(interval_idx);
+    }
+
+    step_handle_t right_parent = graph->get_next_step(interval.second);
+    if (right_parent != graph->path_end(graph->get_path_handle_of_step(interval.second))) {
+        int64_t interval_idx = node_to_interval.at(graph->get_id(graph->get_handle_of_step(right_parent)));
+        parents.second = &this->rgfa_intervals.at(interval_idx);
+    }
+    return parents;
+}
+
+const vector<pair<step_handle_t, step_handle_t>>& RGFACover::get_intervals() const {
+    return this->rgfa_intervals;
+}
+
+const pair<step_handle_t, step_handle_t>* RGFACover::get_interval(nid_t node_id) const {
+    if (this->node_to_interval.count(node_id)) {
+        return &this->rgfa_intervals.at(node_to_interval.at(node_id));
+    }
+    return nullptr;
+}
+
+
+void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length,
+                              vector<pair<step_handle_t, step_handle_t>>& thread_rgfa_intervals,
+                              unordered_map<nid_t, int64_t>& thread_node_to_interval) {
+
+    // start by finding the path traversals through the snarl
+    vector<vector<step_handle_t>> travs;
+    {
+        pair<vector<SnarlTraversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs = path_trav_finder.find_path_traversals(snarl);
+        travs.reserve(path_travs.first.size());
+
+        // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around
+        for (int64_t i = 0; i < path_travs.first.size(); ++i) {
+            string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first));
+            if (is_rgfa_path_name(trav_path_name)) {
+                // we ignore existing (off-reference) rGFA paths
+                // todo: should there be better error handling?
+ cerr << "Warning : skipping existing rgfa traversal " << trav_path_name << endl; + continue; + } + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed = true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = false; + function visit_next_step = [&](step_handle_t step_handle) { + return reversed ? graph->get_previous_step(step_handle) : graph->get_next_step(step_handle); + }; + for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) { + trav.push_back(step_handle); + if (step_handle == path_travs.second[i].second) { + done = true; + } + } + if (reversed) { + std::reverse(trav.begin(), trav.end()); + } + travs.push_back(trav); + } + } + + + // build an initial ranked list of candidate traversal fragments + vector>>> ranked_trav_fragments; + for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { + // only a reference traversal (or deletion that we don't need to consider) + // will have its first two nodes covered + if (this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][0]))) && + this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][1])))) { + continue; + } + + const vector& trav = travs.at(trav_idx); + vector> uncovered_intervals = get_uncovered_intervals(trav, thread_node_to_interval); + + for (const auto& uncovered_interval : uncovered_intervals) { + unordered_set cycle_check; + bool cyclic = false; + int64_t interval_length = 0; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) { + handle_t handle = graph->get_handle_of_step(trav[i]); + interval_length += graph->get_length(handle); + nid_t node_id = graph->get_id(handle); + if (cycle_check.count(node_id)) { + cyclic = true; + } else { + cycle_check.insert(node_id); + } + } + if (!cyclic && interval_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, uncovered_interval); + ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(trav_idx, uncovered_interval))); + } + } + } + + // todo: typedef! 
+ auto heap_comp = [] (const pair>>& s1, + const pair>>& s2) { + return s1.first < s2.first; + }; + + // put the fragments into a max heap + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + + // now greedily pull out traversal intervals from the ranked list until none are left + while (!ranked_trav_fragments.empty()) { + + // get the best scoring (max) fragment from heap + auto best_stats_fragment = ranked_trav_fragments.front(); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + ranked_trav_fragments.pop_back(); + + const vector& trav = travs.at(best_stats_fragment.second.first); + const pair& uncovered_interval = best_stats_fragment.second.second; + + // our traversal may have been partially covered by a different iteration, if so, we need to break it up + // and continue + vector> chopped_intervals; + int64_t cur_start = -1; + bool chopped = false; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + if (!covered && cur_start == -1) { + cur_start = i; + } else if (covered) { + chopped = true; + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, i)); + cur_start = -1; + } + } + } + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second)); + } + if (chopped) { + for (const pair& chopped_interval : chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, chopped_interval); + ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(best_stats_fragment.second.first, chopped_interval))); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + } + } + continue; + } + + // add the interval to the local (thread safe) structures + step_handle_t step = trav[uncovered_interval.first]; + int64_t interval_length = uncovered_interval.second - uncovered_interval.first; + for (int64_t i = 0; i < interval_length; ++i) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); + step = graph->get_next_step(step); + } + thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second-1])); + } +} + +vector> RGFACover::get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval) { + + vector> intervals; + int64_t start = -1; + unordered_set dupe_check; + for (size_t i = 0; i < trav.size(); ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + // we break at dupes even if uncovered -- never want same id twice in an interval + bool dupe = !covered && dupe_check.count(node_id); + dupe_check.insert(node_id); + if (covered || dupe) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = dupe ? 
i : -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; +} + +int64_t RGFACover::get_coverage(const vector& trav, const pair& uncovered_interval) { + path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); + int64_t coverage = 0; + + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + coverage += length * all_steps.size(); + } + + return coverage; +} + +// copied pretty much verbatem from +// https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 +void RGFACover::forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); + + unordered_map id_map; + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_rgfa_path_name(path_name)) { + size_t fw_count = 0; + size_t total_steps = 0; + mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = mutable_graph->get_handle_of_step(step_handle); + if (mutable_graph->get_is_reverse(handle)) { + handle_t flipped_handle = mutable_graph->create_handle(mutable_graph->get_sequence(handle)); + id_map[mutable_graph->get_id(flipped_handle)] = mutable_graph->get_id(handle); + mutable_graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (mutable_graph->get_id(prev_handle) != mutable_graph->get_id(handle)) { + mutable_graph->create_edge(prev_handle, flipped_handle); + } + }); + mutable_graph->follow_edges(handle, false, [&](handle_t next_handle) { + if (mutable_graph->get_id(handle) != mutable_graph->get_id(next_handle)) { + mutable_graph->create_edge(flipped_handle, next_handle); + } + }); + // self-loop cases we punted on above: + if (mutable_graph->has_edge(handle, handle)) { + mutable_graph->create_edge(flipped_handle, flipped_handle); + } + if (mutable_graph->has_edge(handle, mutable_graph->flip(handle))) { + mutable_graph->create_edge(flipped_handle, mutable_graph->flip(flipped_handle)); + } + if (mutable_graph->has_edge(mutable_graph->flip(handle), handle)) { + mutable_graph->create_edge(mutable_graph->flip(flipped_handle), flipped_handle); + } + vector steps = mutable_graph->steps_of_handle(handle); + size_t ref_count = 0; + for (step_handle_t step : steps) { + if (mutable_graph->get_path_handle_of_step(step) == path_handle) { + ++ref_count; + } + step_handle_t next_step = mutable_graph->get_next_step(step); + handle_t new_handle = mutable_graph->get_is_reverse(mutable_graph->get_handle_of_step(step)) ? flipped_handle : + mutable_graph->flip(flipped_handle); + mutable_graph->rewrite_segment(step, next_step, {new_handle}); + } + if (ref_count > 1) { + cerr << "[rGFA] error: Cycle detected in rGFA path " << path_name << " at node " << mutable_graph->get_id(handle) << endl; + exit(1); + } + ++fw_count; + assert(mutable_graph->steps_of_handle(handle).empty()); + dynamic_cast(mutable_graph)->destroy_handle(handle); + } + ++total_steps; + }); + } + }); + + // rename all the ids back to what they were (so nodes keep their ids, just get flipped around) + mutable_graph->reassign_node_ids([&id_map](nid_t new_id) { + return id_map.count(new_id) ? 
id_map[new_id] : new_id;
+        });
+
+    // do a check just to be sure
+    mutable_graph->for_each_path_handle([&](path_handle_t path_handle) {
+        string path_name = mutable_graph->get_path_name(path_handle);
+        if (is_rgfa_path_name(path_name)) {
+            mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
+                handle_t handle = mutable_graph->get_handle_of_step(step_handle);
+                if (mutable_graph->get_is_reverse(handle)) {
+                    cerr << "[rGFA] error: Failed to forwardize node " << mutable_graph->get_id(handle) << " in path " << path_name << endl;
+                    exit(1);
+                }
+            });
+        }
+    });
+}
+
+}
+
diff --git a/src/rgfa.hpp b/src/rgfa.hpp
new file mode 100644
index 00000000000..cd9841a9d68
--- /dev/null
+++ b/src/rgfa.hpp
@@ -0,0 +1,115 @@
+#ifndef VG_RGFA_HPP_INCLUDED
+#define VG_RGFA_HPP_INCLUDED
+
+/**
+ * \file rgfa.hpp
+ *
+ * Interface for computing and querying rGFA path covers.
+ *
+ * An rGFA cover is a set of path fragments (stored as separate paths) in the graph.
+ * They are always relative to an existing reference sample (e.g. GRCh38).
+ * rGFA path fragments have a special name format, where the sample is the reserved rGFA sample name.
+ *
+ * The data structures used in this class are always relative to the original paths
+ * in the graph. The REFERENCE-SENSE fragments that are used to serialize the
+ * cover (and for surjection) can be created and loaded, but they are not used
+ * beyond that.
+ */
+
+#include "handle.hpp"
+#include "snarls.hpp"
+#include "traversal_finder.hpp"
+
+namespace vg {
+
+using namespace std;
+
+class RGFACover {
+public:
+    // I'm not sure how much we want to allow this to change. For now just define it here
+    static const string rgfa_sample_name;
+
+    // make an rGFA path name from a subrange of a normal path
+    static string make_rgfa_path_name(const string& path_name, int64_t start, int64_t length);
+
+    // test if path name is rGFA
+    static bool is_rgfa_path_name(const string& path_name);
+
+    // break up the rGFA locus name back into original sample and locus
+    static pair<string, string> parse_rgfa_locus_name(const string& locus_name);
+
+public:
+    // clear out the existing rGFA cover from the graph. recommended to run this
+    // before compute()
+    void clear(MutablePathMutableHandleGraph* graph);
+
+    // compute the rgfa cover from the graph, starting with a given set of reference paths
+    void compute(const PathHandleGraph* graph,
+                 SnarlManager* snarl_manager,
+                 const unordered_set<path_handle_t>& reference_paths,
+                 int64_t minimum_length);
+
+    // load the rgfa cover from the graph, assuming it's been computed already and
+    // saved in special rgfa paths
+    // this function assumes that the rGFA cover paths are exactly consistent
+    // with the original paths.
+ void load(const PathHandleGraph* graph, + const unordered_set& reference_paths); + + // apply the rgfa cover to a graph (must have been computed first), adding it + // as a bunch of (reference sense) paths with a special sample name + void apply(MutablePathMutableHandleGraph* mutable_graph); + + // get the rgfa rank (level) of a given node (0 if on a reference path) + int64_t get_rank(nid_t node_id) const; + + // get the rgfa step of a given node + step_handle_t get_step(nid_t node_id) const; + + // get the parent intervals + pair*, + const pair*> get_parent_intervals(const pair& interval) const; + + // get the intervals + const vector>& get_intervals() const; + + // get an interval from a node + // return nullptr if node not in an interval + const pair* get_interval(nid_t node_id) const; + + +protected: + + // compute the cover for the given snarl, by greedily finding the convered rgfa paths through it + // the cover is added to the two "thread_" structures. + void compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval); + + // get intervals in travs that are not covered according to this->node_to_interval or + // the thread_node_to_interval parameter + vector> get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval); + + // get the total coverage of a traversal (sum of step lengths) + int64_t get_coverage(const vector& trav, const pair& uncovered_interval); + + // make sure all nodes in all rGFA paths are in forward orientation + // this is always possible because they are, by definition, disjoint + // this should only be run from inside apply() + void forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph); + + +protected: + + const PathHandleGraph* graph; + + vector> rgfa_intervals; + + unordered_map node_to_interval; +}; + +/// Export the given VG graph to the given GFA file. +} + +#endif From a8dcd2bd38211979ff5fa0684b1e07c4759a6079 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sun, 10 Sep 2023 17:54:38 -0400 Subject: [PATCH 02/47] start wiring in cli interface --- src/gfa.cpp | 6 +- src/rgfa.cpp | 121 +++++++++++++++++++++------------- src/rgfa.hpp | 3 + src/subcommand/paths_main.cpp | 106 ++++++++++++++++++++++------- 4 files changed, 162 insertions(+), 74 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 3f3344d16cd..fcc2e13c220 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -96,8 +96,10 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& if (RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { std::tie(sample_name, locus_name) = RGFACover::parse_rgfa_locus_name(locus_name); } - string rgfa_sn = PathMetadata::create_path_name(PathSense::REFERENCE, sample_name, locus_name, - haplotype, PathMetadata::NO_PHASE_BLOCK, + string rgfa_sn = PathMetadata::create_path_name(sample_name.empty() ? PathSense::GENERIC : PathSense::REFERENCE, + sample_name, locus_name, + sample_name.empty() ? 
PathMetadata::NO_HAPLOTYPE : haplotype, + PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); out << "\t" << "SN:Z:" << rgfa_sn << "\t" << "SO:i:" << it->second.second diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 1215dd486f6..5ef6649d165 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -2,7 +2,7 @@ #include #include -//#define debug +#define debug namespace vg { @@ -20,16 +20,19 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in subrange_t path_subrange; PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); + if (path_subrange == PathMetadata::NO_SUBRANGE) { + path_subrange = make_pair(0, 0); + } // we move the sample into the locus // todo: is there something nicer? string rgfa_locus; assert(path_locus != PathMetadata::NO_LOCUS_NAME); if (path_sample != PathMetadata::NO_SAMPLE_NAME) { - rgfa_locus = "SN:Z:" + path_sample; + rgfa_locus = path_sample + "::"; } // the contig name will be behind SC... - rgfa_locus += ":SC:Z:" + path_locus; + rgfa_locus += path_locus; // we apply the subrange offset path_subrange.first += start; @@ -45,27 +48,22 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in } bool RGFACover::is_rgfa_path_name(const string& path_name) { - return path_name == RGFACover::rgfa_sample_name; + return PathMetadata::parse_sample_name(path_name) == RGFACover::rgfa_sample_name; } pair RGFACover::parse_rgfa_locus_name(const string& locus_name) { - pair sample_locus; - vector toks = split_delims(locus_name, ":"); - assert(toks.size() % 3 == 0); - for (int64_t i = 0; i < toks.size(); i+=3) { - if (toks[i] == "SN") { - assert(toks[i+1] == "Z"); - sample_locus.first = toks[i+2]; - } else if (toks[i] == "SC") { - assert(toks[i+1] == "Z"); - sample_locus.second = toks[i+2]; - } else { - assert(false); - } + pair sample_locus = make_pair(PathMetadata::NO_SAMPLE_NAME, PathMetadata::NO_LOCUS_NAME); + + auto pos = locus_name.find("::"); + if (pos != string::npos) { + sample_locus.first = locus_name.substr(0, pos); + sample_locus.second = locus_name.substr(pos + 2); + } else { + sample_locus.second = locus_name; } - return sample_locus; -} + return sample_locus; +} void RGFACover::clear(MutablePathMutableHandleGraph* graph) { vector rgfa_paths; @@ -102,7 +100,12 @@ void RGFACover::compute(const PathHandleGraph* graph, node_to_interval[node_id] = rgfa_intervals.size() - 1; }); } + this->num_ref_intervals = this->rgfa_intervals.size(); +#ifdef debug + cerr << "[rgfa] Selected " << rgfa_intervals.size() << " rank=0 reference paths" << endl; +#endif + // we use the path traversal finder for everything // (even gbwt haplotypes, as we're using the path handle interface) PathTraversalFinder path_trav_finder(*graph, *snarl_manager); @@ -146,6 +149,9 @@ void RGFACover::compute(const PathHandleGraph* graph, for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) { this->rgfa_intervals.push_back(rgfa_intervals_vector[t][j]); } +#ifdef debug + cerr << "Adding " << rgfa_intervals_vector[t].size() << " rgfa intervals from thread " << t << endl; +#endif rgfa_intervals_vector[t].clear(); for (const auto& node_interval : node_to_interval_vector[t]) { @@ -175,6 +181,7 @@ void RGFACover::load(const PathHandleGraph* graph, node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1; }); } + this->num_ref_intervals = this->rgfa_intervals.size(); // then the rgfa cover paths // since we want to keep our structures in terms of 
original paths, we have to map back @@ -185,7 +192,8 @@ void RGFACover::load(const PathHandleGraph* graph, // (this does not make guarantees about degenerate fragmentation related cases tho) unordered_map, vector> sample_locus_to_paths; graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - sample_locus_to_paths[parse_rgfa_locus_name(graph->get_path_name(path_handle))] = {}; + string locus_name = graph->get_locus_name(path_handle); + sample_locus_to_paths[parse_rgfa_locus_name(locus_name)] = {}; }); graph->for_each_path_handle([&](path_handle_t path_handle) { pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); @@ -199,20 +207,30 @@ void RGFACover::load(const PathHandleGraph* graph, // so should probably have a better error message than the asserts below (ie if exact interval match not found) graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { // pase the rgfa locus to get the original sample and locus - pair source_sample_locus = parse_rgfa_locus_name(graph->get_path_name(path_handle)); + pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); // find the sample in our index const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); // find the containing path subrange_t rgfa_subrange = graph->get_subrange(path_handle); assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); int64_t rgfa_haplotype = graph->get_haplotype(path_handle); + // allow match between 0 and NO_HAPLOTYPE + if (rgfa_haplotype == PathMetadata::NO_HAPLOTYPE) { + rgfa_haplotype = 0; + } const path_handle_t* source_path = nullptr; subrange_t source_subrange; for (const path_handle_t& source_path_candidate : source_paths) { - if (graph->get_haplotype(source_path_candidate) == rgfa_haplotype) { + int64_t source_haplotype = graph->get_haplotype(source_path_candidate); + if (source_haplotype == PathMetadata::NO_HAPLOTYPE) { + source_haplotype = 0; + } + if (source_haplotype == rgfa_haplotype) { source_subrange = graph->get_subrange(source_path_candidate); if (source_subrange == PathMetadata::NO_SUBRANGE) { source_subrange.first = 0; + } + if (source_subrange == PathMetadata::NO_SUBRANGE || source_subrange.second == PathMetadata::NO_END_POSITION) { source_subrange.second = 0; graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); @@ -241,7 +259,6 @@ void RGFACover::load(const PathHandleGraph* graph, bool found_end; step_handle_t source_end; - cur_offset = 0; for (step_handle_t cur_step = source_start; graph->has_next_step(cur_step); cur_step = graph->get_next_step(cur_step)) { if (cur_offset == rgfa_subrange.second) { found_end = true; @@ -268,38 +285,38 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { vector rgfa_offsets(this->rgfa_intervals.size()); vector rgfa_lengths(this->rgfa_intervals.size()); #pragma omp parallel for - for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); - // below check is because we don't write the reference path, which is in the cover but - // not using the naming convention - if (is_rgfa_path_name(graph->get_path_name(source_path_handle))) { - rgfa_offsets[i] = 0; - 
mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) { - if (step_handle == rgfa_intervals[i].first) { - return false; - } - rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - return true; - }); - rgfa_lengths[i] = 0; - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; - step_handle = mutable_graph->get_next_step(step_handle)) { - rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); +#ifdef debug + cerr << "computing offset for application of rgfa path " << graph->get_path_name(source_path_handle) << endl; +#endif + rgfa_offsets[i] = 0; + mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) { + if (step_handle == rgfa_intervals[i].first) { + return false; } + rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle)); + return true; + }); + rgfa_lengths[i] = 0; + step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + step_handle = mutable_graph->get_next_step(step_handle)) { + rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); + } } // write the rgfa paths - for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); string source_path_name = graph->get_path_name(source_path_handle); - if (is_rgfa_path_name(source_path_name)) { - string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]); - path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name); - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; - step_handle = mutable_graph->get_next_step(step_handle)) { - mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle)); - } + string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]); + path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name); + step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + step_handle = mutable_graph->get_next_step(step_handle)) { + mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle)); } } @@ -479,6 +496,13 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav const vector& trav = travs.at(best_stats_fragment.second.first); const pair& uncovered_interval = best_stats_fragment.second.second; +#ifdef debug + cerr << "best trav: "; + for (auto xx : trav) cerr << " " << graph->get_id(graph->get_handle_of_step(xx)); + cerr << endl << "uncovered interval [" << uncovered_interval.first << "," << uncovered_interval.second << "]" <> chopped_intervals; @@ -518,6 +542,9 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav // add the interval to the local (thread safe) structures step_handle_t step = trav[uncovered_interval.first]; int64_t interval_length = uncovered_interval.second - uncovered_interval.first; +#ifdef debug + cerr << "adding interval with length " << interval_length << endl; +#endif for (int64_t i = 0; i < interval_length; ++i) { 
thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); step = graph->get_next_step(step); diff --git a/src/rgfa.hpp b/src/rgfa.hpp index cd9841a9d68..1b0ea598d73 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -106,6 +106,9 @@ class RGFACover { vector> rgfa_intervals; + // so rgfa_intervals[0,num_ref_intervals-1] will be all rank-0 reference intervals + int64_t num_ref_intervals; + unordered_map node_to_interval; }; diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 1fc293df752..19d54070c29 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -16,6 +16,9 @@ #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../rgfa.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include @@ -37,6 +40,10 @@ void help_paths(char** argv) { << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl + << " rGFA cover" << endl + << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl + << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -101,6 +108,8 @@ int main_paths(int argc, char** argv) { bool extract_as_fasta = false; bool drop_paths = false; bool retain_paths = false; + int64_t rgfa_min_len = -1; + string snarl_filename; string graph_file; string gbwt_file; string path_prefix; @@ -130,6 +139,8 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, + {"rgfa-cover", required_argument, 0, 'R'}, + {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -145,16 +156,15 @@ int main_paths(int argc, char** argv) { {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, {"coverage", no_argument, 0, 'c'}, - + {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. - {"threads", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGRHp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGHp:ct:", long_options, &option_index); // Detect the end of the options. 
@@ -183,13 +193,22 @@ int main_paths(int argc, char** argv) { case 'd': drop_paths = true; output_formats++; - break; + break; case 'r': retain_paths = true; output_formats++; break; - + + case 'R': + rgfa_min_len = parse(optarg); + output_formats++; + break; + + case 's': + snarl_filename = optarg; + break; + case 'X': extract_as_gam = true; output_formats++; @@ -265,9 +284,16 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'T': - std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; + } case 'q': std::cerr << "warning: [vg paths] option --threads-by is deprecated; please use --paths-by" << std::endl; @@ -320,7 +346,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C or -c) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -371,9 +397,23 @@ int main_paths(int argc, char** argv) { exit(1); } } - - - + + // Load or compute the snarls + unique_ptr snarl_manager; + if (rgfa_min_len >= 0) { + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "Error [vg paths]: Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } + } + set path_names; if (!path_file.empty()) { ifstream path_stream(path_file); @@ -566,7 +606,7 @@ int main_paths(int argc, char** argv) { }; - if (drop_paths || retain_paths) { + if (drop_paths || retain_paths || rgfa_min_len >= 0) { MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; @@ -578,24 +618,40 @@ int main_paths(int argc, char** argv) { exit(1); } - vector to_destroy; - if (drop_paths) { + if (drop_paths || retain_paths) { + vector to_destroy; + if (drop_paths) { + for_each_selected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } else { + for_each_unselected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } + for (string& path_name : to_destroy) { + mutable_graph->destroy_path(graph->get_path_handle(path_name)); + } + } else { + assert(rgfa_min_len >= 0); + RGFACover rgfa_cover; + // clean out existing cover + rgfa_cover.clear(mutable_graph); + // load up the rank-0 reference path selection + unordered_set reference_paths; for_each_selected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); + reference_paths.insert(path_handle); }); - } else { - for_each_unselected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); - }); - } - for (string& path_name : to_destroy) { - 
mutable_graph->destroy_path(graph->get_path_handle(path_name)); + // compute the new cover + rgfa_cover.compute(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); + // and write it out to the graph + rgfa_cover.apply(mutable_graph); } // output the graph - serializable_graph->serialize(cout); + vg::io::save_handle_graph(graph.get(), std::cout); } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one From a2261b4ae956db2b828e77351ba478239fe71eef Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 19 Sep 2023 17:55:46 -0400 Subject: [PATCH 03/47] add rgfa options to save_graph interface --- src/io/save_handle_graph.hpp | 14 ++++++++++---- src/subcommand/paths_main.cpp | 4 +++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp index bee5b1297c5..58946fd5490 100644 --- a/src/io/save_handle_graph.hpp +++ b/src/io/save_handle_graph.hpp @@ -42,11 +42,14 @@ using namespace std; * Save a handle graph. * Todo: should this be somewhere else (ie in vgio with new types registered?) */ -inline void save_handle_graph(HandleGraph* graph, ostream& os) { +inline void save_handle_graph(HandleGraph* graph, ostream& os, + const set& rgfa_paths = {}, + bool rgfa_pline = false, + bool use_w_lines = true) { if (dynamic_cast(graph) != nullptr) { // We loaded a GFA into a handle graph, want to write back to GFA - graph_to_gfa(dynamic_cast(graph), os); + graph_to_gfa(dynamic_cast(graph), os, rgfa_paths, rgfa_pline, use_w_lines); } else if (dynamic_cast(graph) != nullptr) { // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. dynamic_cast(graph)->serialize(os); @@ -58,14 +61,17 @@ inline void save_handle_graph(HandleGraph* graph, ostream& os) { } } -inline void save_handle_graph(HandleGraph* graph, const string& dest_path) { +inline void save_handle_graph(HandleGraph* graph, const string& dest_path, + const set& rgfa_paths = {}, + bool rgfa_pline = false, + bool use_w_lines = true) { if (dynamic_cast(graph) != nullptr) { // We loaded a GFA into a handle graph, want to write back to GFA ofstream os(dest_path); if (!os) { throw runtime_error("error[save_handle_graph]: Unable to write to: " + dest_path); } - graph_to_gfa(dynamic_cast(graph), os); + graph_to_gfa(dynamic_cast(graph), os, rgfa_paths, rgfa_pline, use_w_lines); } else if (dynamic_cast(graph) != nullptr) { // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. 
dynamic_cast(graph)->serialize(dest_path); diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 19d54070c29..db948397cc2 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -618,6 +618,7 @@ int main_paths(int argc, char** argv) { exit(1); } + set reference_path_names; if (drop_paths || retain_paths) { vector to_destroy; if (drop_paths) { @@ -643,6 +644,7 @@ int main_paths(int argc, char** argv) { unordered_set reference_paths; for_each_selected_path([&](const path_handle_t& path_handle) { reference_paths.insert(path_handle); + reference_path_names.insert(graph->get_path_name(path_handle)); }); // compute the new cover rgfa_cover.compute(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); @@ -651,7 +653,7 @@ int main_paths(int argc, char** argv) { } // output the graph - vg::io::save_handle_graph(graph.get(), std::cout); + vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one From 60f83519b857ea5097c8940bdc751a0f82e7f5e9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 19 Sep 2023 22:28:38 -0400 Subject: [PATCH 04/47] open intervals --- src/rgfa.cpp | 20 +++++++++++--------- src/rgfa.hpp | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5ef6649d165..f0989a041c8 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -88,7 +88,7 @@ void RGFACover::compute(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), - graph->path_back(ref_path_handle))); + graph->path_end(ref_path_handle))); graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); if (node_to_interval.count(node_id)) { @@ -176,7 +176,7 @@ void RGFACover::load(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), - graph->path_back(ref_path_handle))); + graph->path_end(ref_path_handle))); graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1; }); @@ -205,7 +205,7 @@ void RGFACover::load(const PathHandleGraph* graph, // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph // so should probably have a better error message than the asserts below (ie if exact interval match not found) - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { // pase the rgfa locus to get the original sample and locus pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); // find the sample in our index @@ -270,6 +270,7 @@ void RGFACover::load(const PathHandleGraph* graph, assert(found_end); // we can finally add our interval + source_end = graph->get_next_step(source_end); this->rgfa_intervals.push_back(make_pair(source_start, source_end)); for (step_handle_t cur_step = 
source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { node_to_interval[graph->get_id(graph->get_handle_of_step(cur_step))] = rgfa_intervals.size() - 1; @@ -299,8 +300,7 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { return true; }); rgfa_lengths[i] = 0; - step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); @@ -313,8 +313,7 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { string source_path_name = graph->get_path_name(source_path_handle); string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]); path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name); - step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle)); } @@ -325,10 +324,12 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { int64_t RGFACover::get_rank(nid_t node_id) const { if (!node_to_interval.count(node_id)) { + cerr << "rank is -1 because " << node_id << " is not in the node_to_interval structure" << endl; return -1; } const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); + cerr << "get rank" << endl; // since our decomposition is based on snarl tranversals, we know that fragments must // overlap their parents on snarl end points (at the very least) @@ -339,7 +340,8 @@ int64_t RGFACover::get_rank(nid_t node_id) const { left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent))); } - step_handle_t right_parent = graph->get_next_step(rgfa_interval.second); + // don't need to go next, since already one past + step_handle_t right_parent = rgfa_interval.second; int64_t right_rank = 0; if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent))); @@ -549,7 +551,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); step = graph->get_next_step(step); } - thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second-1])); + thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second])); } } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 1b0ea598d73..4619303d356 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -104,6 +104,7 @@ class RGFACover { const PathHandleGraph* graph; + // intervals are end-exclusive (like bed) vector> rgfa_intervals; // so rgfa_intervals[0,num_ref_intervals-1] will be all rank-0 reference intervals From 6dd3856e2e4a81f55c8e3f4a94a04467d8bf6d86 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 20 Sep 2023 10:31:07 -0400 Subject: [PATCH 05/47] clean a few cases 
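The cover intervals used from here on are end-exclusive, like BED: the second
step handle of each pair is one past the last covered step (path_end() when a
whole path is covered).  A minimal sketch of how such an interval is meant to
be walked, assuming only the standard PathHandleGraph interface; the
interval_length helper is illustrative and not part of the patch:

    // Sum the sequence length covered by an end-exclusive [first, second) step interval.
    static int64_t interval_length(const PathHandleGraph& graph,
                                   const pair<step_handle_t, step_handle_t>& interval) {
        int64_t total = 0;
        for (step_handle_t step = interval.first; step != interval.second;
             step = graph.get_next_step(step)) {
            total += graph.get_length(graph.get_handle_of_step(step));
        }
        return total;
    }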
--- src/gfa.cpp | 7 ++++--- src/rgfa.cpp | 4 +--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index fcc2e13c220..93e457fc40d 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -55,7 +55,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& unordered_map> node_offsets; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); - if (rgfa_paths.count(path_name) || RGFACover::is_rgfa_path_name(path_name)) { + if (rgfa_paths.count(path_name) || (!rgfa_paths.empty() && RGFACover::is_rgfa_path_name(path_name))) { size_t offset = 0; subrange_t path_subrange = graph->get_subrange(path_handle); if (path_subrange != PathMetadata::NO_SUBRANGE) { @@ -93,7 +93,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& string sample_name = graph->get_sample_name(it->second.first); string locus_name = graph->get_locus_name(it->second.first); int64_t haplotype = graph->get_haplotype(it->second.first); - if (RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { + if (!rgfa_paths.empty() && RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { std::tie(sample_name, locus_name) = RGFACover::parse_rgfa_locus_name(locus_name); } string rgfa_sn = PathMetadata::create_path_name(sample_name.empty() ? PathSense::GENERIC : PathSense::REFERENCE, @@ -134,7 +134,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& // Paths as P-lines for (const path_handle_t& h : path_handles) { auto path_name = graph->get_path_name(h); - if (rgfa_pline || !rgfa_paths.count(path_name)) { + if (rgfa_pline || rgfa_paths.empty() || + (!rgfa_paths.count(path_name) && !RGFACover::is_rgfa_path_name(path_name))) { if (graph->get_sense(h) != PathSense::REFERENCE && reference_samples.count(graph->get_sample_name(h))) { // We have a mix of reference and non-reference paths on the same sample which GFA can't handle. 
cerr << "warning [gfa]: path " << path_name << " will be interpreted as reference sense " diff --git a/src/rgfa.cpp b/src/rgfa.cpp index f0989a041c8..bc690555c71 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -2,7 +2,7 @@ #include #include -#define debug +//#define debug namespace vg { @@ -324,12 +324,10 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { int64_t RGFACover::get_rank(nid_t node_id) const { if (!node_to_interval.count(node_id)) { - cerr << "rank is -1 because " << node_id << " is not in the node_to_interval structure" << endl; return -1; } const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); - cerr << "get rank" << endl; // since our decomposition is based on snarl tranversals, we know that fragments must // overlap their parents on snarl end points (at the very least) From 860660bacfea395f6a659a07930add2d5ce7d81f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 20 Sep 2023 12:32:30 -0400 Subject: [PATCH 06/47] rgfa tests --- src/algorithms/gfa_to_handle.cpp | 5 ++-- src/rgfa.cpp | 5 ++-- src/rgfa.hpp | 3 +- test/rgfa/rgfa_ins.gfa | 15 ++++++++++ test/rgfa/rgfa_ins2.gfa | 18 ++++++++++++ test/rgfa/rgfa_ins3.gfa | 18 ++++++++++++ test/rgfa/rgfa_tiny.gfa | 38 +++++++++++++++++++++++++ test/t/11_vg_paths.t | 48 +++++++++++++++++++++++++++++++- 8 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 test/rgfa/rgfa_ins.gfa create mode 100644 test/rgfa/rgfa_ins2.gfa create mode 100644 test/rgfa/rgfa_ins3.gfa create mode 100644 test/rgfa/rgfa_tiny.gfa diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index ea0ae94dfc5..abff52c2add 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -291,7 +291,6 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* } if (found == rgfa_cache->end()) { // Need to make a new path, possibly with subrange start info. - std::pair subrange; if (offset == 0) { // Don't send a subrange @@ -304,7 +303,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* string rgfa_path_name; if (path_rank > 0) { // Special logic for off-reference paths, which get loaded into special rGFA cover paths - rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length); + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false); } else { // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. 
rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, @@ -317,7 +316,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* path_handle_t path = graph->create_path_handle(rgfa_path_name); // Then cache it - found = rgfa_cache->emplace_hint(found, rgfa_path_name, std::make_pair(path, offset)); + found = rgfa_cache->emplace_hint(found, path_name, std::make_pair(path, offset)); } // Add the step to the path diff --git a/src/rgfa.cpp b/src/rgfa.cpp index bc690555c71..37619b626d8 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -10,7 +10,8 @@ using namespace std; const string RGFACover::rgfa_sample_name = "_rGFA_"; -string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, int64_t length) { +string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, int64_t length, + bool specify_subrange_end) { PathSense path_sense; string path_sample; @@ -36,7 +37,7 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in // we apply the subrange offset path_subrange.first += start; - path_subrange.second = path_subrange.first + length; + path_subrange.second = specify_subrange_end ? path_subrange.first + length : PathMetadata::NO_END_POSITION; // and return the final path, with sample/locus/rgfa-rank embedded in locus diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 4619303d356..3404b0aa0b6 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -30,7 +30,8 @@ class RGFACover { static const string rgfa_sample_name; // make an rGFA path name from a subrange of a normal path - static string make_rgfa_path_name(const string& path_name, int64_t start, int64_t length); + static string make_rgfa_path_name(const string& path_name, int64_t start, int64_t length, + bool specify_subrange_end = true); // test if path name is rGFA static bool is_rgfa_path_name(const string& path_name); diff --git a/test/rgfa/rgfa_ins.gfa b/test/rgfa/rgfa_ins.gfa new file mode 100644 index 00000000000..9751b52ba47 --- /dev/null +++ b/test/rgfa/rgfa_ins.gfa @@ -0,0 +1,15 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +P x 1+,5+ * +P y 1+,2+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M diff --git a/test/rgfa/rgfa_ins2.gfa b/test/rgfa/rgfa_ins2.gfa new file mode 100644 index 00000000000..ef6f1f3055c --- /dev/null +++ b/test/rgfa/rgfa_ins2.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 1+,2+,6+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_ins3.gfa b/test/rgfa/rgfa_ins3.gfa new file mode 100644 index 00000000000..861ac58d3b5 --- /dev/null +++ b/test/rgfa/rgfa_ins3.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 5-,4-,6-,2-,1- * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_tiny.gfa b/test/rgfa/rgfa_tiny.gfa new file mode 100644 index 00000000000..498d2e0a6dc --- /dev/null +++ b/test/rgfa/rgfa_tiny.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +S 5 C +S 7 A +S 12 ATAT +S 8 G +S 1 CAAATAAG +S 4 T +S 6 TTG +S 15 CCAACTCTCTG +S 2 A +S 10 A +S 9 AAATTTTCTGGAGTTCTAT +S 11 T +S 13 A +S 14 T +S 3 G +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * +P y 1+,2+,4+,6+,8+,9+,10+,12+,13+,15+ * +L 5 + 6 + 0M +L 7 + 9 + 0M 
+L 12 + 13 + 0M +L 12 + 14 + 0M +L 8 + 9 + 0M +L 1 + 2 + 0M +L 1 + 3 + 0M +L 4 + 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 2 + 4 + 0M +L 2 + 5 + 0M +L 10 + 12 + 0M +L 9 + 10 + 0M +L 9 + 11 + 0M +L 11 + 12 + 0M +L 13 + 15 + 0M +L 14 + 15 + 0M +L 3 + 4 + 0M +L 3 + 5 + 0M diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 3bba6231d56..1409bdfe2c2 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 20 +plan tests 25 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -61,4 +61,50 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len +vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa +printf "P _rGFA_#y[33] 10+ * +P _rGFA_#y[38] 13+ * +P _rGFA_#y[8] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +grep ^P rgfa_tiny.rgfa | grep rGFA | sort > rgfa_tiny_fragments.rgfa +diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa +is $? 0 "Found the expected rgfa SNP cover of tiny graph" + +rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa + +vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins.rgfa +printf "P _rGFA_#z[8] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +grep ^P rgfa_ins.rgfa | grep _rGFA_ | sort > rgfa_ins_fragments.rgfa +diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion" + +rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa +printf "P _rGFA_#y[8] 2+,6+,4+ * +P _rGFA_#z[11] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +grep ^P rgfa_ins2.rgfa | grep _rGFA_ | sort > rgfa_ins2_fragments.rgfa +diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa +printf "P _rGFA_#y[8] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa +diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa +is $? 0 "rgfa Minimum fragment length filters out small fragment" + +rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa + +vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa +printf "P x 1+,5+ * +P _rGFA_#y[3] 4+,6+,2+ * +P y 5-,4+,6+,2+,1- * +P _rGFA_#z[11] 3+ * +P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa +grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa +diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa +is $? 
0 "Found the expected rgfa cover for simple nested insertion that has a reversed path that needs forwardization" + +rm -f rgfa_ins3.rgfa rgfa_ins3_expected_fragments.rgfa rgfa_ins3_fragments.rgfa From 37945cbc91b8c7b7f2904f9564d59e332896d814 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Sep 2023 12:19:51 -0400 Subject: [PATCH 07/47] Add tag test --- test/t/11_vg_paths.t | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 1409bdfe2c2..dd00510ccfa 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 25 +plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -88,6 +88,18 @@ is $? 0 "Found the expected rgfa cover for simple nested insertion that requires rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | grep ^S | sort -nk2 > rgfa_ins2.rgfa +printf "S 1 CAAATAAG SN:Z:x SO:i:0 SR:i:0 +S 2 TTT SN:Z:y SO:i:8 SR:i:1 +S 3 TTT SN:Z:z SO:i:11 SR:i:2 +S 4 TTT SN:Z:y SO:i:21 SR:i:1 +S 5 TTT SN:Z:x SO:i:8 SR:i:0 +S 6 TTTTTTTTTT SN:Z:y SO:i:11 SR:i:1\n" > rgfa_ins2_expected_segs.rgfa +diff rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa +is $? 0 "Found the expected rgfa tags for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa + vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa printf "P _rGFA_#y[8] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa From f3642eeae68f2b1b9600f4571038444a81c3e88d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Sep 2023 17:06:05 -0400 Subject: [PATCH 08/47] add rgfa support to deconstruct --- src/deconstructor.cpp | 33 +++++++++++++++++++-------------- src/rgfa.cpp | 23 +++++++++++++++++++++++ src/rgfa.hpp | 5 +++++ test/t/26_deconstruct.t | 9 ++++++++- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 118d9df45a5..d95ee4d72d6 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1,6 +1,7 @@ #include "deconstructor.hpp" #include "traversal_finder.hpp" #include +#include "rgfa.hpp" //#define debug @@ -792,22 +793,24 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx]; + string fixed_trav_name = RGFACover::revert_rgfa_path_name(ref_trav_name, true); + vcflib::Variant v; v.quality = 60; // in VCF we usually just want the contig - string contig_name = PathMetadata::parse_locus_name(ref_trav_name); + string contig_name = PathMetadata::parse_locus_name(fixed_trav_name); if (contig_name == PathMetadata::NO_LOCUS_NAME) { - contig_name = ref_trav_name; + contig_name = fixed_trav_name; } else if (long_ref_contig) { // the sample name isn't unique enough, so put a full ugly name in the vcf - if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { - contig_name = ref_trav_name; + if (PathMetadata::parse_sense(fixed_trav_name) == PathSense::GENERIC) { + contig_name = fixed_trav_name; } else { contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, - PathMetadata::parse_sample_name(ref_trav_name), + PathMetadata::parse_sample_name(fixed_trav_name), contig_name, - 
PathMetadata::parse_haplotype(ref_trav_name), + PathMetadata::parse_haplotype(fixed_trav_name), PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); } @@ -939,13 +942,14 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand graph->for_each_path_handle([&](const path_handle_t& path_handle) { string path_name = graph->get_path_name(path_handle); if (!this->ref_paths.count(path_name)) { - string sample_name = graph->get_sample_name(path_handle); + path_name = RGFACover::revert_rgfa_path_name(path_name, true); + string sample_name = PathMetadata::parse_sample_name(path_name); // for backward compatibility if (sample_name == PathMetadata::NO_SAMPLE_NAME) { sample_name = path_name; } if (!ref_samples.count(sample_name)) { - size_t haplotype = graph->get_haplotype(path_handle); + size_t haplotype = PathMetadata::parse_haplotype(path_name); if (haplotype == PathMetadata::NO_HAPLOTYPE) { haplotype = 0; } @@ -1032,18 +1036,19 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand for (handle_t handle : graph->scan_path(path_handle)) { path_len += graph->get_length(handle); } - string locus_name = graph->get_locus_name(path_handle); + string fixed_refpath = RGFACover::revert_rgfa_path_name(refpath, true); + string locus_name = PathMetadata::parse_locus_name(fixed_refpath); if (locus_name == PathMetadata::NO_LOCUS_NAME) { - locus_name = refpath; + locus_name = fixed_refpath; } else if (long_ref_contig) { // the sample name isn't unique enough, so put a full ugly name in the vcf - if (graph->get_sense(path_handle) == PathSense::GENERIC) { - locus_name = graph->get_path_name(path_handle); + if (PathMetadata::parse_sense(fixed_refpath) == PathSense::GENERIC) { + locus_name = fixed_refpath; } else { locus_name = PathMetadata::create_path_name(PathSense::REFERENCE, - graph->get_sample_name(path_handle), + PathMetadata::parse_sample_name(fixed_refpath), locus_name, - graph->get_haplotype(path_handle), + PathMetadata::parse_haplotype(fixed_refpath), PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); } diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 37619b626d8..5a36ac21b5f 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -66,6 +66,29 @@ pair RGFACover::parse_rgfa_locus_name(const string& locus_name) return sample_locus; } +string RGFACover::revert_rgfa_path_name(const string& rgfa_path_name, bool strip_subrange) { + if (!is_rgfa_path_name(rgfa_path_name)) { + return rgfa_path_name; + } + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(rgfa_path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + std::tie(path_sample, path_locus) = parse_rgfa_locus_name(path_locus); + + if (path_sense == PathSense::REFERENCE && path_sample == PathMetadata::NO_SAMPLE_NAME) { + path_sense = PathSense::GENERIC; + } + return PathMetadata::create_path_name(path_sense, path_sample, + path_locus, path_haplotype, + path_phase_block, strip_subrange ? 
PathMetadata::NO_SUBRANGE : path_subrange); +} + void RGFACover::clear(MutablePathMutableHandleGraph* graph) { vector rgfa_paths; graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 3404b0aa0b6..26473dfcab9 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -39,6 +39,11 @@ class RGFACover { // break up the rGFA locus name back into original sample and locus static pair parse_rgfa_locus_name(const string& locus_name); + // undo make_rgfa_path_name, getting back an original non-rgfa name + // if the input's not a rgfa path name, just return it + static string revert_rgfa_path_name(const string& path_name, + bool strip_subrange = false); + public: // clear out the existing rGFA cover from the graph. recommended to run this // before compute() diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index aa11d17d092..4a60f64faff 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 24 +plan tests 26 vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > tiny.vg vg index tiny.vg -x tiny.xg @@ -157,3 +157,10 @@ is "$?" 0 "gbz deconstruction gives same output as gbwt deconstruction" rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon.vcf x.giraffe.gbz x.min x.dist small.s1.h1.fa small.s1.h2.fa decon.s1.h1.fa decon.s1.h2.fa +vg deconstruct rgfa/rgfa_ins2.gfa -P y -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_y.vcf +vg paths -x rgfa/rgfa_ins2.gfa -R 1 -Q x | vg deconstruct - -P _rGFA_ -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_rgfa.vcf +diff rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf +is "$?" 
0 "deconstruct properly handles rGFA reference" +is $(cat rgfa_ins2_rgfa.vcf | wc -l) 1 "deconstruct properly handles rGFA reference (sanity check)" + +rm -f rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf From f8160722c7c54c4a5e0afbf702dd017b4ff867c9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Sep 2023 09:22:51 -0400 Subject: [PATCH 09/47] fix up some edge cases --- src/deconstructor.cpp | 3 ++- src/rgfa.cpp | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index d95ee4d72d6..cd61c721c7c 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -612,7 +612,8 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { } #endif if (ref_paths.count(path_trav_name) && - (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { + (ref_trav_name.empty() || path_trav_name < ref_trav_name || + (RGFACover::is_rgfa_path_name(ref_trav_name) && !RGFACover::is_rgfa_path_name(path_trav_name)))) { ref_trav_name = path_trav_name; #ifdef debug #pragma omp critical (cerr) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5a36ac21b5f..22b3541b31f 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -327,7 +327,6 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - } } @@ -351,20 +350,21 @@ int64_t RGFACover::get_rank(nid_t node_id) const { return -1; } - const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); + int64_t interval_idx = this->node_to_interval.at(node_id); + const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); // since our decomposition is based on snarl tranversals, we know that fragments must // overlap their parents on snarl end points (at the very least) // therefore we can find parents by scanning along the rgfa paths. step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); - int64_t left_rank = 0; + int64_t left_rank = interval_idx < num_ref_intervals ? 0 : -1; if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent))); } // don't need to go next, since already one past step_handle_t right_parent = rgfa_interval.second; - int64_t right_rank = 0; + int64_t right_rank = interval_idx < num_ref_intervals ? 
0 : -1; if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent))); } @@ -464,6 +464,9 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav travs.push_back(trav); } } +#ifdef debug + cerr << "doing snarl " << pb2json(snarl.start()) << "-" << pb2json(snarl.end()) << " with " << travs.size() << " travs" << endl; +#endif // build an initial ranked list of candidate traversal fragments @@ -573,7 +576,8 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); step = graph->get_next_step(step); } - thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second])); + thread_rgfa_intervals.push_back(make_pair(trav.at(uncovered_interval.first), + graph->get_next_step(trav.at(uncovered_interval.second - 1)))); } } From 7c86b03507aeac95974bb8a8ee21904ace495c18 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Sep 2023 11:15:34 -0400 Subject: [PATCH 10/47] allow multiple -S in call --- src/subcommand/call_main.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index c8dc59128c6..cdf2d4fcda1 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -58,7 +58,7 @@ void help_call(char** argv) { << " -N, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names in output" << endl << " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl << " -p, --ref-path NAME Reference path to call on (multipile allowed. defaults to all paths)" << endl - << " -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p)" << endl + << " -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p; multiple allowed)" << endl << " -o, --ref-offset N Offset in reference path (multiple allowed, 1 per path)" << endl << " -l, --ref-length N Override length of reference in the contig field of output VCF" << endl << " -d, --ploidy N Ploidy of sample. Only 1 and 2 supported. 
(default: 2)" << endl @@ -83,7 +83,7 @@ int main_call(int argc, char** argv) { string ref_fasta_filename; string ins_fasta_filename; vector ref_paths; - string ref_sample; + set ref_sample_set; vector ref_path_offsets; vector ref_path_lengths; string min_support_string; @@ -229,7 +229,7 @@ int main_call(int argc, char** argv) { ref_paths.push_back(optarg); break; case 'S': - ref_sample = optarg; + ref_sample_set.insert(optarg); break; case 'o': ref_path_offsets.push_back(parse(optarg)); @@ -363,7 +363,7 @@ int main_call(int argc, char** argv) { cerr << "error [vg call]: -c/-C no supported with -v, -l or -n" << endl; return 1; } - if (!ref_paths.empty() && !ref_sample.empty()) { + if (!ref_paths.empty() && !ref_sample_set.empty()) { cerr << "error [vg call]: -S cannot be used with -p" << endl; return 1; } @@ -507,7 +507,7 @@ int main_call(int argc, char** argv) { PathSense path_sense = PathMetadata::parse_sense(name); if (!Paths::is_alt(name) && path_sense != PathSense::HAPLOTYPE) { string sample_name = PathMetadata::parse_sample_name(name); - if (ref_sample.empty() || sample_name == ref_sample) { + if (ref_sample_set.empty() || ref_sample_set.count(sample_name)) { ref_paths.push_back(name); // keep track of length best we can using maximum coordinate in event of subpaths subrange_t subrange; From e0e3cc20a69ee5e7f4a328d705740bf959d1e43d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Sep 2023 11:15:44 -0400 Subject: [PATCH 11/47] rgfa call test --- test/t/18_vg_call.t | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 6a457f4eb9f..d4af9ad9ef5 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 19 +plan tests 20 # Toy example of hand-made pileup (and hand inspected truth) to make sure some # obvious (and only obvious) SNPs are detected by vg call @@ -191,6 +191,18 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf +vg paths -x rgfa/rgfa_ins2.gfa -Q x -R 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa +vg sim -x rgfa_ins2.rgfa -n 20 -l 20 -a > rgfa_ins2.gam +vg pack -x rgfa_ins2.rgfa -g rgfa_ins2.gam -o rgfa_ins2.pack +vg call rgfa_ins2.rgfa -k rgfa_ins2.pack -Aa | grep -v "^#" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > rgfa_ins2.vcf + +printf "x 8 >1>5 G GTTTTTTTTTTTTTTTT +y 11 >2>4 TTTTTTTTTTT T,TTTT\n" > rgfa_ins2.truth +diff rgfa_ins2.vcf rgfa_ins2.truth +is $? 
0 "vg call makes expected calls on rgfa references" + +rm -f rgfa_ins2.rgfa rgfa_ins2.gam rgfa_ins2.pack rgfa_ins2.vcf rgfa_ins2.truth + From b6b955f272da6535db2810bb4e59c3ff0dcd1f10 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 23 Sep 2023 14:17:08 -0400 Subject: [PATCH 12/47] speed up rgfa path application --- src/rgfa.cpp | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 22b3541b31f..5149b784e2d 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -305,30 +305,40 @@ void RGFACover::load(const PathHandleGraph* graph, void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { assert(this->graph == static_cast(mutable_graph)); - // compute the offsets in parallel, assuming we don't have a path position index of all paths - // todo: we could be smarter about not traversing the same path multiple times! - vector rgfa_offsets(this->rgfa_intervals.size()); - vector rgfa_lengths(this->rgfa_intervals.size()); -#pragma omp parallel for + // index paths isued in rgfa cover + unordered_map step_to_offset; + unordered_set source_path_set; for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); -#ifdef debug - cerr << "computing offset for application of rgfa path " << graph->get_path_name(source_path_handle) << endl; -#endif - rgfa_offsets[i] = 0; - mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) { - if (step_handle == rgfa_intervals[i].first) { - return false; + step_to_offset[rgfa_intervals[i].first] = -1; + source_path_set.insert(source_path_handle); + } + vector source_paths(source_path_set.begin(), source_path_set.end()); // for old-style omp interface +#pragma omp parallel for + for (int64_t i = 0; i < source_paths.size(); ++i) { + int64_t offset = 0; + graph->for_each_step_in_path(source_paths[i], [&](step_handle_t step_handle) { + if (step_to_offset.count(step_handle)) { + step_to_offset[step_handle] = offset; } - rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - return true; + offset += graph->get_length(graph->get_handle_of_step(step_handle)); }); + } + + // compute the offsets in parallel + vector rgfa_offsets(this->rgfa_intervals.size()); + vector rgfa_lengths(this->rgfa_intervals.size()); + +#pragma omp parallel for + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + rgfa_offsets[i] = step_to_offset.at(rgfa_intervals[i].first); rgfa_lengths[i] = 0; for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - } + } } + step_to_offset.clear(); // write the rgfa paths for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { From e02e8a2ace6205d82fa969e45e8432bd87f91cbf Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 23 Sep 2023 14:37:22 -0400 Subject: [PATCH 13/47] sample option for surject path selection, some rgfa support --- src/hts_alignment_emitter.cpp | 6 ++++-- src/subcommand/call_main.cpp | 2 +- src/subcommand/surject_main.cpp | 14 +++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index 2df5a1fac51..b301e25f97a 100644 --- a/src/hts_alignment_emitter.cpp +++ 
b/src/hts_alignment_emitter.cpp @@ -12,6 +12,7 @@ #include "algorithms/find_translation.hpp" #include #include +#include "rgfa.hpp" #include @@ -179,6 +180,7 @@ pair>, unordered_map> extract_path auto& own_length = get<1>(path_info); auto& base_length = get<2>(path_info); string base_path_name = subpath_support ? get_path_base_name(graph, path) : graph.get_path_name(path); + base_path_name = RGFACover::revert_rgfa_path_name(base_path_name); if (!base_path_set.count(base_path_name)) { path_names_and_lengths.push_back(make_pair(base_path_name, base_length)); base_path_set.insert(base_path_name); @@ -229,7 +231,7 @@ vector> get_sequence_dictionary(const strin if (print_subrange_warnings) { subrange_t subrange; - std::string base_path_name = Paths::strip_subrange(sequence_name, &subrange); + std::string base_path_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(sequence_name), &subrange); if (subrange != PathMetadata::NO_SUBRANGE) { // The user is asking explicitly to surject to a path that is a // subrange of some other logical path, like @@ -693,7 +695,7 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector 1 && ref_sample.empty()) { + if (ref_sample_names.size() > 1 && ref_sample_set.empty()) { cerr << "error [vg call]: Multiple reference samples detected: ["; size_t count = 0; for (const string& n : ref_sample_names) { diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 8121ababf47..6d4d21e1dfa 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -39,6 +39,7 @@ void help_surject(char** argv) { << " -t, --threads N number of threads to use" << endl << " -p, --into-path NAME surject into this path or its subpaths (many allowed, default: reference, then non-alt generic)" << endl << " -F, --into-paths FILE surject into path names listed in HTSlib sequence dictionary or path list FILE" << endl + << " -n, --into-sample NAME surject into pahts from this sample (multiple allowed)" << endl << " -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads" << endl << " -M, --multimap include secondary alignments to all overlapping paths instead of just primary" << endl << " -G, --gaf-input input file is GAF instead of GAM" << endl @@ -84,6 +85,7 @@ int main_surject(int argc, char** argv) { string xg_name; vector path_names; + vector path_sample_names; string path_file; string output_format = "GAM"; string input_format = "GAM"; @@ -113,6 +115,7 @@ int main_surject(int argc, char** argv) { {"threads", required_argument, 0, 't'}, {"into-path", required_argument, 0, 'p'}, {"into-paths", required_argument, 0, 'F'}, + {"into-sample", required_argument, 0, 'n'}, {"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths {"subpath-local", required_argument, 0, 'l'}, {"interleaved", no_argument, 0, 'i'}, @@ -137,7 +140,7 @@ int main_surject(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:", + c = getopt_long (argc, argv, "hx:p:F:n:liGmcbsN:R:f:C:t:SPa:ALMVw:", long_options, &option_index); // Detect the end of the options. 
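// Illustrative sketch, not part of the patch: how the pieces added above are
// assumed to interact.  --into-sample collects every path of the named sample
// as a surjection target, and the sequence-dictionary code reverts rGFA cover
// fragment names (paths of the reserved "_rGFA_" sample, e.g. "_rGFA_#y[8]" in
// the test graphs earlier in the series) so the dictionary is keyed on the
// original path names rather than on the fragments:
//
//     string fragment_name = "_rGFA_#y[8]";  // hypothetical fragment name
//     string source_name = RGFACover::revert_rgfa_path_name(fragment_name, true);
//     // expected to yield "y": subrange stripped, original sample/locus restored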
@@ -158,6 +161,10 @@ int main_surject(int argc, char** argv) { case 'F': path_file = optarg; break; + + case 'n': + path_sample_names.push_back(optarg); + break; case 'l': subpath_global = false; @@ -279,6 +286,11 @@ int main_surject(int argc, char** argv) { // Get the paths to surject into and their length information, either from // the given file, or from the provided list, or from sniffing the graph. + for (const string& sample_name : path_sample_names) { + path_handle_graph->for_each_path_of_sample(sample_name, [&](path_handle_t path_handle) { + path_names.push_back(path_handle_graph->get_path_name(path_handle)); + }); + } vector> sequence_dictionary = get_sequence_dictionary(path_file, path_names, *xgidx); // Clear out path_names so we don't accidentally use it path_names.clear(); From 17f3f572405a483edd4863b4a48de3eddf32a566 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sun, 24 Sep 2023 19:21:02 -0400 Subject: [PATCH 14/47] fix bugs in load() and get_rank() --- src/rgfa.cpp | 101 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5149b784e2d..eefb238a187 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1,6 +1,7 @@ #include "rgfa.hpp" #include #include +#include //#define debug @@ -275,29 +276,43 @@ void RGFACover::load(const PathHandleGraph* graph, if (cur_offset + source_subrange.first == rgfa_subrange.first) { source_start = cur_step; found_start = true; + } else { + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); } - cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); return !found_start; }); assert(found_start); + assert(graph->get_id(graph->get_handle_of_step(source_start)) == + graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))); + assert(graph->get_is_reverse(graph->get_handle_of_step(source_start)) == + graph->get_is_reverse(graph->get_handle_of_step(graph->path_begin(path_handle)))); - bool found_end; + bool found_end = false; step_handle_t source_end; - for (step_handle_t cur_step = source_start; graph->has_next_step(cur_step); cur_step = graph->get_next_step(cur_step)) { - if (cur_offset == rgfa_subrange.second) { + int64_t num_steps = 0; + for (step_handle_t cur_step = source_start;; cur_step = graph->get_next_step(cur_step)) { + ++num_steps; + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + + if (cur_offset + source_subrange.first == rgfa_subrange.second) { found_end = true; source_end = cur_step; break; } - cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + if (!graph->has_next_step(cur_step)) { + break; + } } assert(found_end); + assert(num_steps == graph->get_step_count(path_handle)); // we can finally add our interval source_end = graph->get_next_step(source_end); this->rgfa_intervals.push_back(make_pair(source_start, source_end)); for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { - node_to_interval[graph->get_id(graph->get_handle_of_step(cur_step))] = rgfa_intervals.size() - 1; + int64_t cur_id = graph->get_id(graph->get_handle_of_step(cur_step)); + assert(!node_to_interval.count(cur_id)); + node_to_interval[cur_id] = rgfa_intervals.size() - 1; } }); } @@ -356,46 +371,58 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { } int64_t RGFACover::get_rank(nid_t node_id) const { - if (!node_to_interval.count(node_id)) { - return -1; - } - int64_t interval_idx = 
this->node_to_interval.at(node_id); - const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); + // search back to reference in order to find the rank. + unordered_set visited; + priority_queue> queue; + queue.push(make_pair(0, node_id)); - // since our decomposition is based on snarl tranversals, we know that fragments must - // overlap their parents on snarl end points (at the very least) - // therefore we can find parents by scanning along the rgfa paths. - step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); - int64_t left_rank = interval_idx < num_ref_intervals ? 0 : -1; - if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { - left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent))); - } + nid_t current_id; + int64_t distance = 0; - // don't need to go next, since already one past - step_handle_t right_parent = rgfa_interval.second; - int64_t right_rank = interval_idx < num_ref_intervals ? 0 : -1; - if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { - right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent))); - } + while (!queue.empty()) { + std::tie(distance, current_id) = queue.top(); + queue.pop(); - return min(left_rank, right_rank); -} + if (!visited.count(current_id)) { + + visited.insert(current_id); + + if (this->node_to_interval.count(current_id)) { + int64_t interval_idx = this->node_to_interval.at(current_id); -step_handle_t RGFACover::get_step(nid_t node_id) const { - assert(node_to_interval.count(node_id)); + // we've hit the reference, can stop searching + if (interval_idx < this->num_ref_intervals) { + return distance; + } - const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); - path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); - for (step_handle_t step = rgfa_interval.first; step != rgfa_interval.second; step = graph->get_next_step(step)) { - if (graph->get_id(graph->get_handle_of_step(step)) == node_id) { - return step; + // visit the neighbouring intervals + const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); + step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); + if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { + queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(left_parent)))); + } + const step_handle_t& right_parent = rgfa_interval.second; + if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { + queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(right_parent)))); + } + } else { + // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) + graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + } + } } - assert(false); - return step_handle_t(); -} + // this shouldn't happen? 
+ return -1; +} + pair*, const pair*> RGFACover::get_parent_intervals(const pair& interval) const { From 4d65d74a45fbf0b715d3dfc5d2e6d8ac29c47c77 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 25 Sep 2023 15:21:10 -0400 Subject: [PATCH 15/47] ignore rgfa path fragments in depth and clip coverage calcs --- src/algorithms/coverage_depth.cpp | 26 +++++++++++++++++++------- src/algorithms/coverage_depth.hpp | 10 ++++++---- src/clip.cpp | 21 +++++++++++++++------ src/clip.hpp | 11 +++++++---- src/subcommand/clip_main.cpp | 24 +++++++++++++++++++++--- src/subcommand/depth_main.cpp | 22 +++++++++++++++++++--- 6 files changed, 87 insertions(+), 27 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index c92471348da..d49e1880bf5 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -295,7 +295,8 @@ pair sample_gam_depth(const HandleGraph& graph, const vector& ignore_paths, ostream& out_stream) { assert(graph.has_path(path_name)); path_handle_t path_handle = graph.get_path_handle(path_name); @@ -311,10 +312,14 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m size_t step_count = 0; handle_t handle = graph.get_handle_of_step(step_handle); graph.for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + if (ignore_paths.count(step_path_handle)) { + return true; + } if (count_cycles) { ++step_count; } else { - path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + auto it = path_to_name.find(step_path_handle); if (it == path_to_name.end()) { string step_path_name = graph.get_path_name(step_path_handle); @@ -323,6 +328,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m } path_set.insert(it->second); } + return true; }); size_t coverage = (count_cycles ? step_count : path_set.size()) - 1; size_t node_len = graph.get_length(handle); @@ -339,7 +345,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage, bool count_cycles) { + size_t min_coverage, bool count_cycles, const unordered_set& ignore_paths) { // compute the mean and variance of our base coverage across the bin size_t bin_length = 0; @@ -357,10 +363,13 @@ pair path_depth_of_bin(const PathHandleGraph& graph, unordered_set path_set; size_t step_count = 0; graph.for_each_step_on_handle(cur_handle, [&](step_handle_t step_handle) { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); + if (ignore_paths.count(step_path_handle)) { + return true; + } if (count_cycles) { ++step_count; } else { - path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); auto it = path_to_name.find(step_path_handle); if (it == path_to_name.end()) { string step_path_name = graph.get_path_name(step_path_handle); @@ -368,7 +377,8 @@ pair path_depth_of_bin(const PathHandleGraph& graph, it = path_to_name.insert(make_pair(step_path_handle, Paths::strip_subrange(step_path_name))).first; } path_set.insert(it->second); - } + } + return true; }); size_t coverage = (count_cycles ? 
step_count : path_set.size()) - 1; @@ -386,7 +396,8 @@ vector> binned_path_depth(const PathHandle const string& path_name, size_t bin_size, size_t min_coverage, - bool count_cycles) { + bool count_cycles, + const unordered_set& ignore_paths) { path_handle_t path_handle = graph.get_path_handle(path_name); @@ -414,7 +425,8 @@ vector> binned_path_depth(const PathHandle step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; size_t bin_start = bins[i].first; size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset; - pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles); + pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles, + ignore_paths); binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); } diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 021cc57fa52..9cb7af0a1a6 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -67,14 +67,16 @@ pair sample_mapping_depth(const HandleGraph& graph, const vector /// print path-name offset base-coverage for every base on a path (just like samtools depth) /// ignoring things below min_coverage. offsets are 1-based in output stream /// coverage here is the number of steps from (unique) other paths -void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream); +void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, + const unordered_set& ignore_paths, ostream& out_stream); /// like packed_depth_of_bin (above), but use paths (as in path_depths) for measuring coverage pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage, bool count_cycles); + size_t min_coverage, bool count_cycles, const unordered_set& ignore_paths); -vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size, - size_t min_coverage, bool count_cycles); +vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, + size_t bin_size, size_t min_coverage, bool count_cycles, + const unordered_set& ignore_paths); } } diff --git a/src/clip.cpp b/src/clip.cpp index 0d2c5efd78d..45e0e5367fc 100644 --- a/src/clip.cpp +++ b/src/clip.cpp @@ -481,7 +481,8 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph function)> iterate_handles, function)> iterate_edges, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose) { + int64_t min_fragment_len, const unordered_set& ignore_paths, + bool verbose) { // find all nodes in the snarl that are not on the reference interval (reference path name from containing interval) unordered_set to_delete; @@ -502,6 +503,10 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph bool on_ref = false; size_t depth = 0; graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) { + path_handle_t path_handle = graph->get_path_handle_of_step(step_handle); + if (ignore_paths.count(path_handle)) { + return true; + } ++depth; if (depth > min_depth || on_ref) { return false; @@ -509,7 +514,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph if (!ref_prefixes.empty() || region) { // if we have a region, do exact comparison to it. 
// otherwise, do a prefix check against ref_prefix - string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + string path_name = graph->get_path_name(path_handle); if ((region && region->seq == path_name) || (!region && check_prefixes(path_name))) { on_ref = true; return false; @@ -540,6 +545,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph edge_depths.resize(edge_count + 1); graph->for_each_path_handle([&](path_handle_t path_handle) { + if (!ignore_paths.count(path_handle)) { bool is_ref_path = check_prefixes(graph->get_path_name(path_handle)); handle_t prev_handle; bool first = true; @@ -563,6 +569,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph } prev_handle = handle; }); + } }); unordered_set edges_to_delete; @@ -622,7 +629,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph } void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose) { + int64_t min_fragment_len, const unordered_set& ignore_paths, bool verbose) { function)> iterate_handles = [&] (function visit_handle) { graph->for_each_handle([&](handle_t handle) { @@ -636,11 +643,13 @@ void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_ }); }; - clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, + ignore_paths, verbose); } void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, - SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose) { + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, + const unordered_set& ignore_paths, bool verbose) { function)> iterate_handles = [&] (function visit_handle) { @@ -678,7 +687,7 @@ void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* gra } vector ref_paths_from_regions(ref_path_set.begin(), ref_path_set.end()); - clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, ignore_paths, verbose); } diff --git a/src/clip.hpp b/src/clip.hpp index e57a260b98c..4c1294aad71 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -64,19 +64,22 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph function)> iterate_handles, function)> iterate_edges, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose); + int64_t min_fragment_len, const unordered_set& ignore_paths, + bool verbose); /** * Run above function on graph */ void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose); + int64_t min_fragment_len, const unordered_set& ignore_paths, bool verbose); /** * Or on contained snarls */ -void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, - SnarlManager& snarl_manager, bool 
include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose); +void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, + const vector& regions, SnarlManager& snarl_manager, bool include_endpoints, + int64_t min_depth, int64_t min_fragment_len, + const unordered_set& ignore_paths, bool verbose); /** * clip out deletion edges diff --git a/src/subcommand/clip_main.cpp b/src/subcommand/clip_main.cpp index ddbd5956862..d219b6dd35b 100644 --- a/src/subcommand/clip_main.cpp +++ b/src/subcommand/clip_main.cpp @@ -8,6 +8,7 @@ #include #include "../clip.hpp" #include +#include "../rgfa.hpp" #include #include @@ -44,6 +45,7 @@ void help_clip(char** argv) { << " -m, --min-fragment-len N Don't write novel path fragment if it is less than N bp long" << endl << " -B, --output-bed Write BED-style file of affected intervals instead of clipped graph. " << endl << " Columns 4-9 are: snarl node-count edge-count shallow-node-count shallow-edge-count avg-degree" << endl + << " -i, --ignore-path-prefix Ignore all paths beginning with given prefix, ex when computing depth [_rGFA_]." << endl << " -t, --threads N number of threads to use [default: all available]" << endl << " -v, --verbose Print some logging messages" << endl << endl; @@ -54,6 +56,7 @@ int main_clip(int argc, char** argv) { string bed_path; string snarls_path; vector ref_prefixes; + vector ignore_prefixes = {RGFACover::rgfa_sample_name}; int64_t min_depth = -1; int64_t min_fragment_len = 0; bool verbose = false; @@ -102,13 +105,14 @@ int main_clip(int argc, char** argv) { {"snarls", required_argument, 0, 'r'}, {"min-fragment-len", required_argument, 0, 'm'}, {"output-bed", no_argument, 0, 'B'}, + {"ignore-path-prefixes", required_argument, 0, 'i'}, {"threads", required_argument, 0, 't'}, {"verbose", required_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bt:v", + c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bi:t:v", long_options, &option_index); // Detect the end of the options. 
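// Sketch of what the new -i (--ignore-path-prefixes) option is assumed to
// match; not part of the patch.  The default entry is
// RGFACover::rgfa_sample_name, i.e. "_rGFA_", so cover fragments such as
// "_rGFA_#y[8]" from the test graphs are excluded from the depth used for
// clipping while their source paths still count.  The match is a plain
// name-prefix comparison, equivalent to:
//
//     bool ignored = path_name.compare(0, prefix.length(), prefix) == 0;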
@@ -180,6 +184,9 @@ int main_clip(int argc, char** argv) { case 'B': out_bed = true; break; + case 'i': + ignore_prefixes.push_back(optarg); + break; case 'v': verbose = true; break; @@ -336,13 +343,24 @@ int main_clip(int argc, char** argv) { } if (min_depth >= 0) { + // compute paths to ignore + unordered_set ignore_set; + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + for (const string& ip : ignore_prefixes) { + if (path_name.compare(0, ip.length(), ip) == 0) { + ignore_set.insert(path_handle); + break; + } + } + }); // run the depth clipping if (bed_path.empty()) { // do the whole graph - clip_low_depth_nodes_and_edges(graph.get(), min_depth, ref_prefixes, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges(graph.get(), min_depth, ref_prefixes, min_fragment_len, ignore_set, verbose); } else { // do the contained snarls - clip_contained_low_depth_nodes_and_edges(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_depth, min_fragment_len, verbose); + clip_contained_low_depth_nodes_and_edges(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_depth, min_fragment_len, ignore_set, verbose); } } else if (max_deletion >= 0) { diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp index 54284f67fd5..2fd42cd6ae6 100644 --- a/src/subcommand/depth_main.cpp +++ b/src/subcommand/depth_main.cpp @@ -45,6 +45,7 @@ void help_depth(char** argv) { << " -P, --paths-by STR select the paths with the given name prefix" << endl << " -b, --bin-size N bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl << " -m, --min-coverage N ignore nodes with less than N coverage depth [1]" << endl + << " -i, --ignore-prefix P ignore paths with given prefix when computing path coverage [_rGFA_]" << endl << " -t, --threads N number of threads to use [all available]" << endl; } @@ -69,6 +70,7 @@ int main_depth(int argc, char** argv) { bool count_cycles = false; size_t min_coverage = 1; + vector ignore_path_prefixes; int c; optind = 2; // force optind past command positional argument @@ -87,13 +89,14 @@ int main_depth(int argc, char** argv) { {"min-mapq", required_argument, 0, 'Q'}, {"min-coverage", required_argument, 0, 'm'}, {"count-cycles", no_argument, 0, 'c'}, + {"ignore-prefix", required_argument, 0, 'i'}, {"threads", required_argument, 0, 't'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ct:", + c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ci:t:", long_options, &option_index); // Detect the end of the options. 
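// For orientation, an illustrative note rather than patch content: with the
// coverage_depth.cpp changes earlier in this patch, the per-node path depth
// reported along a reference path becomes
//
//     depth(node) = |{distinct path names stepping on node, ignored paths skipped}| - 1
//
// so when "_rGFA_" is passed with -i, a node covered only by haplotype y and by
// its own cover fragment "_rGFA_#y[8]" no longer looks shared: the fragment's
// step is skipped before the count is taken.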
@@ -138,6 +141,9 @@ int main_depth(int argc, char** argv) { case 'c': count_cycles = true; break; + case 'i': + ignore_path_prefixes.push_back(optarg); + break; case 't': { int num_threads = parse(optarg); @@ -197,6 +203,7 @@ int main_depth(int argc, char** argv) { // we want our paths sorted by the subpath parse so the output is sorted map, string> ref_paths; unordered_set base_path_set; + unordered_set ignore_paths; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); @@ -222,6 +229,14 @@ int main_depth(int argc, char** argv) { assert(!ref_paths.count(coord)); ref_paths[coord] = path_name; } + if (pack_filename.empty()) { + for (const string& ignore_prefix : ignore_path_prefixes) { + if (path_name.compare(0, ignore_prefix.length(), ignore_prefix) == 0) { + ignore_paths.insert(path_handle); + break; + } + } + } }); for (const auto& ref_name : ref_paths_input_set) { @@ -240,7 +255,8 @@ int main_depth(int argc, char** argv) { if (!pack_filename.empty()) { binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels); } else { - binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles); + binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles, + ignore_paths); } for (auto& bin_cov : binned_depth) { // bins can ben nan if min_coverage filters everything out. just skip @@ -253,7 +269,7 @@ int main_depth(int argc, char** argv) { if (!pack_filename.empty()) { algorithms::packed_depths(*packer, ref_path, min_coverage, cout); } else { - algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, cout); + algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, ignore_paths, cout); } } } From b46eaaa115d10f5a3a8752a6f54514378038c2b5 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Sep 2023 14:06:41 -0400 Subject: [PATCH 16/47] use (by default) fragment not original paths in rgfa index --- src/rgfa.cpp | 235 +++++++++++++++++++++++++++------------------------ src/rgfa.hpp | 8 +- 2 files changed, 129 insertions(+), 114 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index eefb238a187..4428463b53d 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -192,7 +192,8 @@ void RGFACover::compute(const PathHandleGraph* graph, } void RGFACover::load(const PathHandleGraph* graph, - const unordered_set& reference_paths) { + const unordered_set& reference_paths, + bool use_original_paths) { // start from scratch this->rgfa_intervals.clear(); this->node_to_interval.clear(); @@ -200,121 +201,132 @@ void RGFACover::load(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { - this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), - graph->path_end(ref_path_handle))); graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { - node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1; + node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size(); }); + this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_end(ref_path_handle))); } this->num_ref_intervals = this->rgfa_intervals.size(); - // then the rgfa cover paths - // since we want to keep our structures in terms of original paths, we have to map back - // to them here (if we don't have original paths, then we can't 
find the overlaps and - // therefore nesting relationships between them. - - // we start by making a little index in two scans, as I'm worried about quadratic path scan below otherwise - // (this does not make guarantees about degenerate fragmentation related cases tho) - unordered_map, vector> sample_locus_to_paths; - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - string locus_name = graph->get_locus_name(path_handle); - sample_locus_to_paths[parse_rgfa_locus_name(locus_name)] = {}; - }); - graph->for_each_path_handle([&](path_handle_t path_handle) { - pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); - if (sample_locus_to_paths.count(sample_locus)) { - sample_locus_to_paths[sample_locus].push_back(path_handle); - } - }); + if (!use_original_paths) { + // just suck up the rgfa fragments right into the data structure + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size(); + }); + this->rgfa_intervals.push_back(make_pair(graph->path_begin(path_handle), + graph->path_end(path_handle))); + }); + } else { + // then the rgfa cover paths + // since we want to keep our structures in terms of original paths, we have to map back + // to them here (if we don't have original paths, then we can't find the overlaps and + // therefore nesting relationships between them. + + // we start by making a little index in two scans, as I'm worried about quadratic path scan below otherwise + // (this does not make guarantees about degenerate fragmentation related cases tho) + unordered_map, vector> sample_locus_to_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + string locus_name = graph->get_locus_name(path_handle); + sample_locus_to_paths[parse_rgfa_locus_name(locus_name)] = {}; + }); + graph->for_each_path_handle([&](path_handle_t path_handle) { + pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); + if (sample_locus_to_paths.count(sample_locus)) { + sample_locus_to_paths[sample_locus].push_back(path_handle); + } + }); - // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval - // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph - // so should probably have a better error message than the asserts below (ie if exact interval match not found) - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - // pase the rgfa locus to get the original sample and locus - pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); - // find the sample in our index - const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); - // find the containing path - subrange_t rgfa_subrange = graph->get_subrange(path_handle); - assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); - int64_t rgfa_haplotype = graph->get_haplotype(path_handle); - // allow match between 0 and NO_HAPLOTYPE - if (rgfa_haplotype == PathMetadata::NO_HAPLOTYPE) { - rgfa_haplotype = 0; - } - const path_handle_t* source_path = nullptr; - subrange_t source_subrange; - for (const path_handle_t& source_path_candidate : source_paths) { - int64_t source_haplotype 
= graph->get_haplotype(source_path_candidate); - if (source_haplotype == PathMetadata::NO_HAPLOTYPE) { - source_haplotype = 0; + // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval + // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph + // so should probably have a better error message than the asserts below (ie if exact interval match not found) + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + // pase the rgfa locus to get the original sample and locus + pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); + // find the sample in our index + const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); + // find the containing path + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + int64_t rgfa_haplotype = graph->get_haplotype(path_handle); + // allow match between 0 and NO_HAPLOTYPE + if (rgfa_haplotype == PathMetadata::NO_HAPLOTYPE) { + rgfa_haplotype = 0; } - if (source_haplotype == rgfa_haplotype) { - source_subrange = graph->get_subrange(source_path_candidate); - if (source_subrange == PathMetadata::NO_SUBRANGE) { - source_subrange.first = 0; + const path_handle_t* source_path = nullptr; + subrange_t source_subrange; + for (const path_handle_t& source_path_candidate : source_paths) { + int64_t source_haplotype = graph->get_haplotype(source_path_candidate); + if (source_haplotype == PathMetadata::NO_HAPLOTYPE) { + source_haplotype = 0; } - if (source_subrange == PathMetadata::NO_SUBRANGE || source_subrange.second == PathMetadata::NO_END_POSITION) { - source_subrange.second = 0; - graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { - source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); - }); - } - if (rgfa_subrange.first >= source_subrange.first && rgfa_subrange.second <= source_subrange.second) { - source_path = &source_path_candidate; - break; + if (source_haplotype == rgfa_haplotype) { + source_subrange = graph->get_subrange(source_path_candidate); + if (source_subrange == PathMetadata::NO_SUBRANGE) { + source_subrange.first = 0; + } + if (source_subrange == PathMetadata::NO_SUBRANGE || source_subrange.second == PathMetadata::NO_END_POSITION) { + source_subrange.second = 0; + graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { + source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); + }); + } + if (rgfa_subrange.first >= source_subrange.first && rgfa_subrange.second <= source_subrange.second) { + source_path = &source_path_candidate; + break; + } } } - } - assert(source_path != nullptr); - // now find the exact interval in the containing path and update our data structure - bool found_start = false; - step_handle_t source_start; - int64_t cur_offset = 0; - graph->for_each_step_in_path(*source_path, [&](step_handle_t cur_step) { - if (cur_offset + source_subrange.first == rgfa_subrange.first) { - source_start = cur_step; - found_start = true; - } else { + assert(source_path != nullptr); + // now find the exact interval in the containing path and update our data structure + bool found_start = false; + step_handle_t source_start; + int64_t cur_offset = 0; + graph->for_each_step_in_path(*source_path, [&](step_handle_t cur_step) { + if (cur_offset + source_subrange.first == rgfa_subrange.first) { + source_start = 
cur_step; + found_start = true; + } else { + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + } + return !found_start; + }); + assert(found_start); + assert(graph->get_id(graph->get_handle_of_step(source_start)) == + graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))); + assert(graph->get_is_reverse(graph->get_handle_of_step(source_start)) == + graph->get_is_reverse(graph->get_handle_of_step(graph->path_begin(path_handle)))); + + bool found_end = false; + step_handle_t source_end; + int64_t num_steps = 0; + for (step_handle_t cur_step = source_start;; cur_step = graph->get_next_step(cur_step)) { + ++num_steps; cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); - } - return !found_start; - }); - assert(found_start); - assert(graph->get_id(graph->get_handle_of_step(source_start)) == - graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))); - assert(graph->get_is_reverse(graph->get_handle_of_step(source_start)) == - graph->get_is_reverse(graph->get_handle_of_step(graph->path_begin(path_handle)))); - - bool found_end = false; - step_handle_t source_end; - int64_t num_steps = 0; - for (step_handle_t cur_step = source_start;; cur_step = graph->get_next_step(cur_step)) { - ++num_steps; - cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); - if (cur_offset + source_subrange.first == rgfa_subrange.second) { - found_end = true; - source_end = cur_step; - break; + if (cur_offset + source_subrange.first == rgfa_subrange.second) { + found_end = true; + source_end = cur_step; + break; + } + if (!graph->has_next_step(cur_step)) { + break; + } } - if (!graph->has_next_step(cur_step)) { - break; + assert(found_end); + assert(num_steps == graph->get_step_count(path_handle)); + + // we can finally add our interval + source_end = graph->get_next_step(source_end); + this->rgfa_intervals.push_back(make_pair(source_start, source_end)); + for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { + int64_t cur_id = graph->get_id(graph->get_handle_of_step(cur_step)); + assert(!node_to_interval.count(cur_id)); + node_to_interval[cur_id] = rgfa_intervals.size() - 1; } - } - assert(found_end); - assert(num_steps == graph->get_step_count(path_handle)); - - // we can finally add our interval - source_end = graph->get_next_step(source_end); - this->rgfa_intervals.push_back(make_pair(source_start, source_end)); - for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { - int64_t cur_id = graph->get_id(graph->get_handle_of_step(cur_step)); - assert(!node_to_interval.count(cur_id)); - node_to_interval[cur_id] = rgfa_intervals.size() - 1; - } - }); + }); + } } void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { @@ -396,16 +408,17 @@ int64_t RGFACover::get_rank(nid_t node_id) const { return distance; } - // visit the neighbouring intervals + // search out of the snarl -- any parent traversals will overlap here const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); - step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); - if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { - queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(left_parent)))); - } - const step_handle_t& right_parent = rgfa_interval.second; - if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { - 
queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(right_parent)))); - } + + graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { + queue.push(make_pair(distance + 1, graph->get_id(prev))); + }); + step_handle_t last_step = graph->get_previous_step(rgfa_interval.second); + graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { + queue.push(make_pair(distance + 1, graph->get_id(next))); + }); + } else { // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 26473dfcab9..6fa426680ef 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -57,10 +57,12 @@ class RGFACover { // load the rgfa cover from the graph, assuming it's been computed already and // saved in special rgfa paths - // this function assumes that the rGFA cover paths are exactly consistent - // with the original paths. + // the use_original_paths flag specifies whether the cover indexes the original + // paths or the rgfa paths fragments (if the former, then they must exist + // and be 100% consistent with the paths used to construct the fragments) void load(const PathHandleGraph* graph, - const unordered_set& reference_paths); + const unordered_set& reference_paths, + bool use_original_paths = false); // apply the rgfa cover to a graph (must have been computed first), adding it // as a bunch of (reference sense) paths with a special sample name From 1942dc6cfc60f866b86747b085de0465426b1bbf Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Sep 2023 15:16:05 -0400 Subject: [PATCH 17/47] break ties with name when choosing cover --- src/rgfa.cpp | 24 ++++++++++-------------- src/rgfa.hpp | 12 ++++++++++++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 4428463b53d..0a205ea660b 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -478,6 +478,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav // start by finding the path traversals through the snarl vector> travs; + vector trav_names; { pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); travs.reserve(path_travs.first.size()); @@ -512,6 +513,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav std::reverse(trav.begin(), trav.end()); } travs.push_back(trav); + trav_names.push_back(trav_path_name); } } #ifdef debug @@ -520,7 +522,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav // build an initial ranked list of candidate traversal fragments - vector>>> ranked_trav_fragments; + vector ranked_trav_fragments; for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { // only a reference traversal (or deletion that we don't need to consider) // will have its first two nodes covered @@ -548,30 +550,24 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } if (!cyclic && interval_length >= minimum_length) { int64_t trav_coverage = get_coverage(trav, uncovered_interval); - ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(trav_idx, uncovered_interval))); + ranked_trav_fragments.push_back({trav_coverage, &trav_names[trav_idx], trav_idx, uncovered_interval}); } } } - // todo: typedef! 
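The ranking change in this commit replaces the bare coverage pair with a struct whose comparator prefers higher coverage and, on ties, the lexicographically smaller traversal name. A standalone sketch of that ordering is shown below; the struct is simplified and the data made up, and the real RankedFragment declared later in rgfa.hpp also carries a traversal index and node interval:

// Standalone sketch of the greedy selection order: std::make_heap with the
// struct's operator< yields a max-heap, so the fragment with the highest
// coverage comes out first, and equal coverage is broken by the
// lexicographically smaller name (the name comparison is deliberately flipped).
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct RankedFragment {
    long coverage;
    std::string name;
    bool operator<(const RankedFragment& other) const {
        return coverage < other.coverage ||
               (coverage == other.coverage && name > other.name);
    }
};

int main() {
    std::vector<RankedFragment> frags = {
        {3, "sample2#1#chr1"},
        {5, "sample1#2#chr1"},
        {5, "sample1#1#chr1"}   // ties with the previous entry on coverage
    };
    std::make_heap(frags.begin(), frags.end());
    while (!frags.empty()) {
        std::pop_heap(frags.begin(), frags.end());
        RankedFragment best = frags.back();
        frags.pop_back();
        std::cout << best.coverage << "\t" << best.name << "\n";
    }
    // Expected order: (5, sample1#1#chr1), (5, sample1#2#chr1), (3, sample2#1#chr1)
    return 0;
}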
- auto heap_comp = [] (const pair>>& s1, - const pair>>& s2) { - return s1.first < s2.first; - }; - // put the fragments into a max heap - std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); // now greedily pull out traversal intervals from the ranked list until none are left while (!ranked_trav_fragments.empty()) { // get the best scoring (max) fragment from heap auto best_stats_fragment = ranked_trav_fragments.front(); - std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); ranked_trav_fragments.pop_back(); - const vector& trav = travs.at(best_stats_fragment.second.first); - const pair& uncovered_interval = best_stats_fragment.second.second; + const vector& trav = travs.at(best_stats_fragment.trav_idx); + const pair& uncovered_interval = best_stats_fragment.fragment; #ifdef debug cerr << "best trav: "; @@ -609,8 +605,8 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } if (chopped_trav_length >= minimum_length) { int64_t trav_coverage = get_coverage(trav, chopped_interval); - ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(best_stats_fragment.second.first, chopped_interval))); - std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + ranked_trav_fragments.push_back({trav_coverage, best_stats_fragment.name, best_stats_fragment.trav_idx, chopped_interval}); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); } } continue; diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 6fa426680ef..2a51cf7aecd 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -119,6 +119,18 @@ class RGFACover { int64_t num_ref_intervals; unordered_map node_to_interval; + + // used when selecting traversals to make the greedy cover + struct RankedFragment { + int64_t coverage; + const string* name; + int64_t trav_idx; + pair fragment; + bool operator<(const RankedFragment& f2) { + // note: name comparison is flipped because we want to select high coverage / low name + return this->coverage < f2.coverage || (this->coverage == f2.coverage && *this->name > *f2.name); + } + }; }; /// Export the given VG graph to the given GFA file. From e53973492b17162962eac3449bd9603040aa8e75 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 29 Sep 2023 15:43:17 -0400 Subject: [PATCH 18/47] work around step iterator issue in gbz --- src/rgfa.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 0a205ea660b..8aeb14419f5 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -414,7 +414,14 @@ int64_t RGFACover::get_rank(nid_t node_id) const { graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { queue.push(make_pair(distance + 1, graph->get_id(prev))); }); - step_handle_t last_step = graph->get_previous_step(rgfa_interval.second); + // hack around gbwtgraph bug (feature?) 
that does not let you decrement path_end + path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); + step_handle_t last_step; + if (rgfa_interval.second == graph->path_end(path_handle)) { + last_step = graph->path_back(path_handle); + } else { + last_step = graph->get_previous_step(rgfa_interval.second); + } graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { queue.push(make_pair(distance + 1, graph->get_id(next))); }); From 2a5ffad837c032d7cc76e1a5c6dd8e4f55b58378 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 29 Sep 2023 15:43:47 -0400 Subject: [PATCH 19/47] get rgfa tests passing; add back end range where possible --- src/algorithms/gfa_to_handle.cpp | 2 +- test/t/11_vg_paths.t | 18 +++++++++--------- test/t/18_vg_call.t | 2 +- test/t/48_vg_convert.t | 7 +++++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index abff52c2add..aae969973c3 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -303,7 +303,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* string rgfa_path_name; if (path_rank > 0) { // Special logic for off-reference paths, which get loaded into special rGFA cover paths - rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false); + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, true); } else { // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index dd00510ccfa..ec41182c368 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -62,9 +62,9 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa -printf "P _rGFA_#y[33] 10+ * -P _rGFA_#y[38] 13+ * -P _rGFA_#y[8] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +printf "P _rGFA_#y[33-34] 10+ * +P _rGFA_#y[38-39] 13+ * +P _rGFA_#y[8-9] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa grep ^P rgfa_tiny.rgfa | grep rGFA | sort > rgfa_tiny_fragments.rgfa diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa is $? 0 "Found the expected rgfa SNP cover of tiny graph" @@ -72,7 +72,7 @@ is $? 0 "Found the expected rgfa SNP cover of tiny graph" rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins.rgfa -printf "P _rGFA_#z[8] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +printf "P _rGFA_#z[8-11] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa grep ^P rgfa_ins.rgfa | grep _rGFA_ | sort > rgfa_ins_fragments.rgfa diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa is $? 0 "Found the expected rgfa cover for simple nested insertion" @@ -80,8 +80,8 @@ is $? 
0 "Found the expected rgfa cover for simple nested insertion" rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa -printf "P _rGFA_#y[8] 2+,6+,4+ * -P _rGFA_#z[11] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +printf "P _rGFA_#y[8-11] 2+,6+,4+ * +P _rGFA_#z[11-14] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa grep ^P rgfa_ins2.rgfa | grep _rGFA_ | sort > rgfa_ins2_fragments.rgfa diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa is $? 0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" @@ -101,7 +101,7 @@ is $? 0 "Found the expected rgfa tags for simple nested insertion that requires rm -f rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa -printf "P _rGFA_#y[8] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +printf "P _rGFA_#y[8-11] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa is $? 0 "rgfa Minimum fragment length filters out small fragment" @@ -110,9 +110,9 @@ rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa printf "P x 1+,5+ * -P _rGFA_#y[3] 4+,6+,2+ * +P _rGFA_#y[3-6] 4+,6+,2+ * P y 5-,4+,6+,2+,1- * -P _rGFA_#z[11] 3+ * +P _rGFA_#z[11-14] 3+ * P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index d4af9ad9ef5..28b1122f11d 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -192,7 +192,7 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf vg paths -x rgfa/rgfa_ins2.gfa -Q x -R 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa -vg sim -x rgfa_ins2.rgfa -n 20 -l 20 -a > rgfa_ins2.gam +vg sim -x rgfa_ins2.rgfa -n 30 -s 99 -l 20 -a > rgfa_ins2.gam vg pack -x rgfa_ins2.rgfa -g rgfa_ins2.gam -o rgfa_ins2.pack vg call rgfa_ins2.rgfa -k rgfa_ins2.pack -Aa | grep -v "^#" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > rgfa_ins2.vcf diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index bed23a12c41..f967c048ec0 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -189,7 +189,7 @@ is "$?" 
0 "converting gfa and equivalent rgfa produces same output" rm -f tiny.gfa.gfa tiny.rgfa.gfa -is "$(vg convert -g tiny/tiny.rgfa -r 1 | vg paths -M --paths-by y -x - | grep -v "^#" | cut -f1)" "y[35]" "rank-1 rgfa subpath found" +is "$(vg convert -g tiny/tiny.rgfa -r 1 | vg paths -M --paths-by _rGFA_ -x - | grep -v "^#" | cut -f1)" "_rGFA_#y[35-36]" "rank-1 rgfa subpath found" vg convert -g tiny/tiny.rgfa -T gfa-id-mapping.tsv > /dev/null grep ^S tiny/tiny.rgfa | awk '{print $2}' | sort > rgfa_nodes @@ -404,12 +404,15 @@ is "${?}" "0" "GFA -> HashGraph -> GFA conversion preserves path metadata" # We can't do rGFA to GBZ directly until the GBWTGraph GFA parser learns to read tags # rGFA to HashGraph to GBZ to GFA -vg paths -M -x graphs/rgfa_with_reference.rgfa | sort >paths.truth.txt +# todo: would be great if gbz could preserve the NO_HAPLOTYPE +# gbz also strips the end of the subrange +vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' -e 's/-2//g' -e 's/-8//g' | sort >paths.truth.txt vg convert -a graphs/rgfa_with_reference.rgfa > rgfa_with_reference.hg vg gbwt -x rgfa_with_reference.hg -E --gbz-format -g rgfa_with_reference.gbz vg paths -M -x rgfa_with_reference.gbz | sort >paths.gbz.txt cmp paths.gbz.txt paths.truth.txt is "${?}" "0" "rGFA -> HashGraph -> GBZ conversion preserves path metadata" +vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' | sort >paths.truth.txt vg convert -f rgfa_with_reference.gbz >extracted.gfa vg paths -M -x extracted.gfa | sort >paths.gfa.txt cmp paths.gfa.txt paths.truth.txt From c4bca6660aeaec128b94083eae5f34ea0d829b57 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 5 Oct 2023 15:57:55 -0400 Subject: [PATCH 20/47] fix option collision from last merge --- src/subcommand/paths_main.cpp | 8 ++++---- test/t/11_vg_paths.t | 12 ++++++------ test/t/18_vg_call.t | 2 +- test/t/26_deconstruct.t | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index db948397cc2..34c36a25978 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -41,7 +41,7 @@ void help_paths(char** argv) { << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl << " rGFA cover" << endl - << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -f, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." 
<< endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " output path data:" << endl @@ -139,7 +139,7 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, - {"rgfa-cover", required_argument, 0, 'R'}, + {"rgfa-min-length", required_argument, 0, 'f'}, {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, @@ -164,7 +164,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGHp:ct:", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drf:R:s:aGHp:ct:", long_options, &option_index); // Detect the end of the options. @@ -200,7 +200,7 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'R': + case 'f': rgfa_min_len = parse(optarg); output_formats++; break; diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index ec41182c368..382e6f740a0 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -61,7 +61,7 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len -vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa +vg paths -v rgfa/rgfa_tiny.gfa --rgfa-min-length 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa printf "P _rGFA_#y[33-34] 10+ * P _rGFA_#y[38-39] 13+ * P _rGFA_#y[8-9] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa @@ -71,7 +71,7 @@ is $? 0 "Found the expected rgfa SNP cover of tiny graph" rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa -vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins.rgfa +vg paths -v rgfa/rgfa_ins.gfa --rgfa-min-length 5 -Q x | vg convert - -fW > rgfa_ins.rgfa printf "P _rGFA_#z[8-11] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa grep ^P rgfa_ins.rgfa | grep _rGFA_ | sort > rgfa_ins_fragments.rgfa diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa @@ -79,7 +79,7 @@ is $? 0 "Found the expected rgfa cover for simple nested insertion" rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa printf "P _rGFA_#y[8-11] 2+,6+,4+ * P _rGFA_#z[11-14] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa grep ^P rgfa_ins2.rgfa | grep _rGFA_ | sort > rgfa_ins2_fragments.rgfa @@ -88,7 +88,7 @@ is $? 0 "Found the expected rgfa cover for simple nested insertion that requires rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | grep ^S | sort -nk2 > rgfa_ins2.rgfa +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 3 -Q x | grep ^S | sort -nk2 > rgfa_ins2.rgfa printf "S 1 CAAATAAG SN:Z:x SO:i:0 SR:i:0 S 2 TTT SN:Z:y SO:i:8 SR:i:1 S 3 TTT SN:Z:z SO:i:11 SR:i:2 @@ -100,7 +100,7 @@ is $? 
0 "Found the expected rgfa tags for simple nested insertion that requires rm -f rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa printf "P _rGFA_#y[8-11] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa @@ -108,7 +108,7 @@ is $? 0 "rgfa Minimum fragment length filters out small fragment" rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa -vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa +vg paths -v rgfa/rgfa_ins3.gfa --rgfa-min-length 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa printf "P x 1+,5+ * P _rGFA_#y[3-6] 4+,6+,2+ * P y 5-,4+,6+,2+,1- * diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 28b1122f11d..64668b15a5c 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -191,7 +191,7 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf -vg paths -x rgfa/rgfa_ins2.gfa -Q x -R 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa +vg paths -x rgfa/rgfa_ins2.gfa -Q x -f 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa vg sim -x rgfa_ins2.rgfa -n 30 -s 99 -l 20 -a > rgfa_ins2.gam vg pack -x rgfa_ins2.rgfa -g rgfa_ins2.gam -o rgfa_ins2.pack vg call rgfa_ins2.rgfa -k rgfa_ins2.pack -Aa | grep -v "^#" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > rgfa_ins2.vcf diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 4a60f64faff..2f955daedd4 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -158,7 +158,7 @@ rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon vg deconstruct rgfa/rgfa_ins2.gfa -P y -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_y.vcf -vg paths -x rgfa/rgfa_ins2.gfa -R 1 -Q x | vg deconstruct - -P _rGFA_ -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_rgfa.vcf +vg paths -x rgfa/rgfa_ins2.gfa -f 1 -Q x | vg deconstruct - -P _rGFA_ -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_rgfa.vcf diff rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf is "$?" 
0 "deconstruct properly handles rGFA reference" is $(cat rgfa_ins2_rgfa.vcf | wc -l) 1 "deconstruct properly handles rGFA reference (sanity check)" From c8419abf477701ab1f5153790d0744d244c8e55e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 5 Oct 2023 16:16:57 -0400 Subject: [PATCH 21/47] rGFA selection option --- src/subcommand/paths_main.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 34c36a25978..999654eff17 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -57,6 +57,7 @@ void help_paths(char** argv) { << " -p, --paths-file FILE select the paths named in a file (one per line)" << endl << " -Q, --paths-by STR select the paths with the given name prefix" << endl << " -S, --sample STR select the haplotypes or reference paths for this sample" << endl + << " --rgfa-paths select all rGFA fragments (must have been added with -R previously)" << endl << " -a, --variant-paths select the variant paths added by 'vg construct -a'" << endl << " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select the reference paths" << endl @@ -94,6 +95,9 @@ unordered_map SENSE_TO_STRING { {PathSense::HAPLOTYPE, "HAPLOTYPE"} }; +// Long options with no corresponding short options. +const int OPT_SELECT_RGFA_FRAGMENTS = 1000; + int main_paths(int argc, char** argv) { if (argc == 2) { @@ -155,7 +159,8 @@ int main_paths(int argc, char** argv) { {"generic-paths", no_argument, 0, 'G'}, {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, - {"coverage", no_argument, 0, 'c'}, + {"coverage", no_argument, 0, 'c'}, + {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. 
{"threads-by", required_argument, 0, 'q'}, @@ -284,6 +289,11 @@ int main_paths(int argc, char** argv) { output_formats++; break; + case OPT_SELECT_RGFA_FRAGMENTS: + sample_name = RGFACover::rgfa_sample_name; + selection_criteria++; + break; + case 't': { int num_threads = parse(optarg); @@ -350,7 +360,7 @@ int main_paths(int argc, char** argv) { std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { - std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p) cannot be used" << std::endl; + std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p, --rgfa-paths,) cannot be used" << std::endl; std::exit(EXIT_FAILURE); } if (select_alt_paths && !gbwt_file.empty()) { @@ -766,7 +776,7 @@ int main_paths(int argc, char** argv) { } for_each_selected_path([&](path_handle_t path_handle) { if (list_names) { - cout << graph->get_path_name(path_handle); + cout << RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle)); if (list_lengths) { size_t path_length = 0; for (handle_t handle : graph->scan_path(path_handle)) { From 22974f7ea03440b2556a43cbbc087ee3cc04925b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 5 Oct 2023 16:17:10 -0400 Subject: [PATCH 22/47] leave subrange in surjected-to rgfa fragments --- src/hts_alignment_emitter.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index b301e25f97a..768f517ff1c 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -695,10 +695,17 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector& dest) { From 98a67512d7d96ed1307ec85998715af1d43f829f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:37:39 -0400 Subject: [PATCH 23/47] keep rgfa subpaths in surject --- src/hts_alignment_emitter.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index 768f517ff1c..c8ae527a13f 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -151,8 +151,9 @@ static bool for_each_subpath_of(const PathPositionHandleGraph& graph, const stri /// Returns the base path name for this path (i.e. the path's name without any subrange). static string get_path_base_name(const PathPositionHandleGraph& graph, const path_handle_t& path) { - if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { - // This is a full path + string path_name = graph.get_path_name(path); + if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE || RGFACover::is_rgfa_path_name(path_name)) { + // This is a full path or an rGFA fragment that we don't want to touch return graph.get_path_name(path); } else { // This is a subpath, so remember what it's a subpath of, and use that. 
@@ -231,7 +232,14 @@ vector> get_sequence_dictionary(const strin if (print_subrange_warnings) { subrange_t subrange; - std::string base_path_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(sequence_name), &subrange); + std::string base_path_name; + if (RGFACover::is_rgfa_path_name(sequence_name)) { + base_path_name = RGFACover::revert_rgfa_path_name(sequence_name, false); + // a white lie, but we want our subranges in rgfa intervals, at least for now + subrange = PathMetadata::NO_SUBRANGE; + } else { + base_path_name = Paths::strip_subrange(sequence_name, &subrange); + } if (subrange != PathMetadata::NO_SUBRANGE) { // The user is asking explicitly to surject to a path that is a // subrange of some other logical path, like From 2ba15832861fde7d466e86902b916d5cbcb5f21c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:38:08 -0400 Subject: [PATCH 24/47] revert rgfa name processing in paths listing -- too confusing for now --- src/subcommand/paths_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 999654eff17..4ebf6ec26a5 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -776,7 +776,7 @@ int main_paths(int argc, char** argv) { } for_each_selected_path([&](path_handle_t path_handle) { if (list_names) { - cout << RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle)); + cout << graph->get_path_name(path_handle); if (list_lengths) { size_t path_length = 0; for (handle_t handle : graph->scan_path(path_handle)) { From 07aa374a1fcba9c920b5a770ac4b26820372a01e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:54:57 -0400 Subject: [PATCH 25/47] clean rgfa names in fasta output --- src/subcommand/paths_main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 4ebf6ec26a5..58b13910f64 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -519,7 +519,7 @@ int main_paths(int argc, char** argv) { // Write as a Path in a VG chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { - write_fasta_sequence(name, path_sequence(*graph, path), cout); + write_fasta_sequence(RGFACover::revert_rgfa_path_name(name, false), path_sequence(*graph, path), cout); } if (list_lengths) { cout << path.name() << "\t" << path_to_length(path) << endl; @@ -831,7 +831,8 @@ int main_paths(int argc, char** argv) { } else if (extract_as_vg) { chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { - write_fasta_sequence(graph->get_path_name(path_handle), path_sequence(*graph, path), cout); + write_fasta_sequence(RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle), false), + path_sequence(*graph, path), cout); } } }); From 65a5d7f763f94d6038b41fa10b278888912de9a7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 11:27:45 -0400 Subject: [PATCH 26/47] fair enough, clang --- src/rgfa.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 2a51cf7aecd..954f3d6d94d 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -126,7 +126,7 @@ class RGFACover { const string* name; int64_t trav_idx; pair fragment; - bool operator<(const RankedFragment& f2) { + bool operator<(const RankedFragment& f2) const { // note: name comparison is flipped because we want to select high coverage / low name return this->coverage < f2.coverage || (this->coverage 
== f2.coverage && *this->name > *f2.name); } From 182cf39cdee9b1e26ba2cc6e59aec1fa7b406112 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 15:10:49 -0400 Subject: [PATCH 27/47] fix rgfa tests post merge --- src/algorithms/gfa_to_handle.cpp | 4 ---- test/t/48_vg_convert.t | 6 ++---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index aae969973c3..c8a483f10ea 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -674,10 +674,6 @@ tuple, GFAParser::chars_t, GFAPar // Grab the haplotype number int64_t haplotype_number = take_number(cursor, end, -1, "parsing haplotype number"); - if (haplotype_number == -1) { - // This field is required - throw GFAFormatError("Missing haplotype number in W line", cursor); - } take_tab(cursor, end, "parsing end of haplotype number"); // Grab the sequence/contig/locus name diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index f967c048ec0..7eb46260af5 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -404,15 +404,13 @@ is "${?}" "0" "GFA -> HashGraph -> GFA conversion preserves path metadata" # We can't do rGFA to GBZ directly until the GBWTGraph GFA parser learns to read tags # rGFA to HashGraph to GBZ to GFA -# todo: would be great if gbz could preserve the NO_HAPLOTYPE -# gbz also strips the end of the subrange -vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' -e 's/-2//g' -e 's/-8//g' | sort >paths.truth.txt +vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/-2//g' -e 's/-8//g' | sort >paths.truth.txt vg convert -a graphs/rgfa_with_reference.rgfa > rgfa_with_reference.hg vg gbwt -x rgfa_with_reference.hg -E --gbz-format -g rgfa_with_reference.gbz vg paths -M -x rgfa_with_reference.gbz | sort >paths.gbz.txt cmp paths.gbz.txt paths.truth.txt is "${?}" "0" "rGFA -> HashGraph -> GBZ conversion preserves path metadata" -vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' | sort >paths.truth.txt +vg paths -M -x graphs/rgfa_with_reference.rgfa | sort >paths.truth.txt vg convert -f rgfa_with_reference.gbz >extracted.gfa vg paths -M -x extracted.gfa | sort >paths.gfa.txt cmp paths.gfa.txt paths.truth.txt From 1d07f4c2a2bccc41d9f519710d40106d2c251d7e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 10 Oct 2023 12:39:39 -0400 Subject: [PATCH 28/47] start rgfa VCF annotator --- src/rgfa.cpp | 255 ++++++++++++++++++++++++++-------- src/rgfa.hpp | 10 +- src/subcommand/paths_main.cpp | 36 ++++- 3 files changed, 239 insertions(+), 62 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 8aeb14419f5..4da96291152 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -202,7 +202,19 @@ void RGFACover::load(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { - node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size(); + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (graph->get_is_reverse(graph->get_handle_of_step(step_handle))) { + cerr << "[rgfa] error: Reversed step " << node_id << " found in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All rGFA path fragments must be forward-only." 
<< endl; + exit(1); + } + if (node_to_interval.count(node_id)) { + cerr << "[rgfa] error: Cycle found on node " << node_id << "in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All rGFA path fragments must be acyclic." << endl; + exit(1); + + } + node_to_interval[node_id] = rgfa_intervals.size(); }); this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), graph->path_end(ref_path_handle))); @@ -385,62 +397,8 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { int64_t RGFACover::get_rank(nid_t node_id) const { // search back to reference in order to find the rank. - unordered_set visited; - priority_queue> queue; - queue.push(make_pair(0, node_id)); - - nid_t current_id; - int64_t distance = 0; - - while (!queue.empty()) { - std::tie(distance, current_id) = queue.top(); - queue.pop(); - - if (!visited.count(current_id)) { - - visited.insert(current_id); - - if (this->node_to_interval.count(current_id)) { - int64_t interval_idx = this->node_to_interval.at(current_id); - - // we've hit the reference, can stop searching - if (interval_idx < this->num_ref_intervals) { - return distance; - } - - // search out of the snarl -- any parent traversals will overlap here - const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); - - graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { - queue.push(make_pair(distance + 1, graph->get_id(prev))); - }); - // hack around gbwtgraph bug (feature?) that does not let you decrement path_end - path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); - step_handle_t last_step; - if (rgfa_interval.second == graph->path_end(path_handle)) { - last_step = graph->path_back(path_handle); - } else { - last_step = graph->get_previous_step(rgfa_interval.second); - } - graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { - queue.push(make_pair(distance + 1, graph->get_id(next))); - }); - - } else { - // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) - graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { - queue.push(make_pair(distance, graph->get_id(next))); - }); - graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { - queue.push(make_pair(distance, graph->get_id(next))); - }); - } - - } - } - - // this shouldn't happen? - return -1; + vector> ref_steps = this->get_reference_nodes(node_id, true); + return ref_steps.at(0).first; } pair*, @@ -758,5 +716,188 @@ void RGFACover::forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_gra }); } +vector> RGFACover::get_reference_nodes(nid_t node_id, bool first) const { + + // search back to reference in order to find the rank. + unordered_set visited; + priority_queue> queue; + queue.push(make_pair(0, node_id)); + + nid_t current_id; + int64_t distance = 0; + + // output reference intervals + vector> output_reference_nodes; + + while (!queue.empty()) { + std::tie(distance, current_id) = queue.top(); + queue.pop(); + + if (!visited.count(current_id)) { + + visited.insert(current_id); + + if (this->node_to_interval.count(current_id)) { + int64_t interval_idx = this->node_to_interval.at(current_id); + + const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); + + // we've hit the reference, fish out its step and stop searching. 
+ if (interval_idx < this->num_ref_intervals) { + output_reference_nodes.push_back(make_pair(distance, current_id)); + if (first) { + break; + } + continue; + } + + // search out of the snarl -- any parent traversals will overlap here + graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { + queue.push(make_pair(distance + 1, graph->get_id(prev))); + }); + // hack around gbwtgraph bug (feature?) that does not let you decrement path_end + path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); + step_handle_t last_step; + if (rgfa_interval.second == graph->path_end(path_handle)) { + last_step = graph->path_back(path_handle); + } else { + last_step = graph->get_previous_step(rgfa_interval.second); + } + graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { + queue.push(make_pair(distance + 1, graph->get_id(next))); + }); + + } else { + // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) + graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + } + + } + } + + assert(output_reference_nodes.size() > 0 && (first == (output_reference_nodes.size() == 1))); + return output_reference_nodes; +} + +void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { + vcflib::Variant var(vcf); + + // want a reference position lookup + unordered_map node_to_ref_pos; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->rgfa_intervals.at(i); + // assumption: ref intervals span entire path + assert(graph->path_begin(graph->get_path_handle_of_step(ref_interval.first)) == ref_interval.first); + int64_t pos = 0; + for (step_handle_t step_handle = ref_interval.first; step_handle != ref_interval.second; + step_handle = graph->get_next_step(step_handle)) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + assert(!graph->get_is_reverse(handle)); + assert(!node_to_ref_pos.count(node_id)); + node_to_ref_pos[node_id] = pos; + pos += graph->get_length(handle); + } + } + + // also want an rGFA path lookup + unordered_map name_to_rgfa_path; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + name_to_rgfa_path[RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle))] = path_handle; + }); + + // remove header lines we're going to add + vcf.removeInfoHeaderLine("R_CHROM"); + vcf.removeInfoHeaderLine("R_START"); + vcf.removeInfoHeaderLine("R_END"); + vcf.removeInfoHeaderLine("RANK"); + + // and add them back, so as not to duplicate them if they are already there + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + + os << vcf.header << endl; + + string prev_sequence_name; + string prev_r_chrom; + string prev_r_start; + string prev_r_end; + string prev_rank; + while (vcf.getNextVariant(var)) { + if (name_to_rgfa_path.count(var.sequenceName)) { + string r_chrom; + string r_start; + string r_end; + string rank; + if (var.sequenceName == prev_sequence_name) { + // just use the previous values, which will be the same + r_chrom = prev_r_chrom; + r_start = prev_r_start; + r_end = prev_r_end; + rank = prev_rank; + } else { + // 
compute from the cover + path_handle_t path_handle = name_to_rgfa_path.at(var.sequenceName); + nid_t first_node = graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle))); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + min_ref_pos = min(min_ref_pos, ref_pos); + max_ref_pos = max(max_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + min_rank = min(min_rank, rank_node.first); + } + + r_start = std::to_string(min_ref_pos); + r_end = std::to_string(max_ref_pos); + rank = std::to_string(min_rank); + } + + if (!var.info.count("R_CHROM")) { + var.format.push_back("R_CHROM"); + } + var.info["R_CHROM"] = {r_chrom}; + if (!var.info.count("R_START")) { + var.format.push_back("R_START"); + } + var.info["R_START"] = {r_start}; + if (!var.info.count("R_END")) { + var.format.push_back("R_END"); + } + var.info["R_END"] = {r_end}; + if (!var.info.count("RANK")) { + var.format.push_back("RANK"); + } + var.info["RANK"] = {rank}; + + prev_sequence_name = var.sequenceName; + prev_r_chrom = r_chrom; + prev_r_start = r_start; + prev_r_end = r_end; + prev_rank = rank; + + } else { + //cerr << "rGFA [warning]: VCF reference " << var.sequenceName << " not found in graph" << endl; + } + os << var << endl; + } +} + } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 954f3d6d94d..fe7559f67c6 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -85,6 +85,8 @@ class RGFACover { // return nullptr if node not in an interval const pair* get_interval(nid_t node_id) const; + // add R_CHROM, R_START, R_END, F_LEN tags to a VCF using the cover + void annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os); protected: @@ -106,7 +108,11 @@ class RGFACover { // this is always possible because they are, by definition, disjoint // this should only be run from inside apply() void forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph); - + + // search back to the reference and return when its found + // (here distance is the number of intervals crossed, aka rank) + // "first" toggles returning the first interfval found vs all of them + vector> get_reference_nodes(nid_t node_id, bool first) const; protected: @@ -133,6 +139,8 @@ class RGFACover { }; }; + + /// Export the given VG graph to the given GFA file. } diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 58b13910f64..ea7cc56096b 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -44,6 +44,7 @@ void help_paths(char** argv) { << " -f, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." 
<< endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl + << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -97,6 +98,7 @@ unordered_map SENSE_TO_STRING { // Long options with no corresponding short options. const int OPT_SELECT_RGFA_FRAGMENTS = 1000; +const int OPT_ANNOTATE_RGFA_VCF = 1001; int main_paths(int argc, char** argv) { @@ -113,6 +115,7 @@ int main_paths(int argc, char** argv) { bool drop_paths = false; bool retain_paths = false; int64_t rgfa_min_len = -1; + string rgfa_vcf_filename; string snarl_filename; string graph_file; string gbwt_file; @@ -161,6 +164,7 @@ int main_paths(int argc, char** argv) { {"haplotype-paths", no_argument, 0, 'H'}, {"coverage", no_argument, 0, 'c'}, {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, + {"rgfa-vcf", required_argument, 0, OPT_ANNOTATE_RGFA_VCF}, {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. {"threads-by", required_argument, 0, 'q'}, @@ -293,6 +297,11 @@ int main_paths(int argc, char** argv) { sample_name = RGFACover::rgfa_sample_name; selection_criteria++; break; + + case OPT_ANNOTATE_RGFA_VCF: + rgfa_vcf_filename = optarg; + output_formats++; + break; case 't': { @@ -356,11 +365,11 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { - std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p, --rgfa-paths,) cannot be used" << std::endl; + std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p, --rgfa-paths) cannot be used" << std::endl; std::exit(EXIT_FAILURE); } if (select_alt_paths && !gbwt_file.empty()) { @@ -664,8 +673,27 @@ int main_paths(int argc, char** argv) { // output the graph vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); - } - else if (coverage) { + } else if (!rgfa_vcf_filename.empty()) { + RGFACover rgfa_cover; + // load up the rank-0 reference path selection + unordered_set reference_paths; + for_each_selected_path([&](const path_handle_t& path_handle) { + reference_paths.insert(path_handle); + }); + // load up the cover (which must have been previously computed and serialized with -f) + rgfa_cover.load(graph.get(), reference_paths); + if (rgfa_cover.get_intervals().size() == 0) { + cerr << "error[vg paths]: no rGFA cover found in graph. Please compute one with -f before annotating a VCF" << endl; + exit(1); + } + vcflib::VariantCallFile variant_file; + variant_file.open(rgfa_vcf_filename); + if (!variant_file.is_open()) { + cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; + exit(1); + } + rgfa_cover.annotate_vcf(variant_file, cout); + } else if (coverage) { // for every node, count the number of unique paths. 
then add the coverage count to each one // (we're doing the whole graph here, which could be inefficient in the case the user is selecting // a small path) From b8cc93756aac1e17a78caab314e240546e69f412 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Oct 2023 10:53:18 -0400 Subject: [PATCH 29/47] Fix assertion bug --- src/rgfa.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 4da96291152..d1d8fd289e6 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -780,7 +780,7 @@ vector> RGFACover::get_reference_nodes(nid_t node_id, bool } } - assert(output_reference_nodes.size() > 0 && (first == (output_reference_nodes.size() == 1))); + assert(output_reference_nodes.size() > 0 && (!first || (output_reference_nodes.size() == 1))); return output_reference_nodes; } @@ -859,8 +859,11 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { assert(r_chrom.empty() || r_chrom == name); r_chrom = name; int64_t ref_pos = node_to_ref_pos.at(rank_node.second); - min_ref_pos = min(min_ref_pos, ref_pos); - max_ref_pos = max(max_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); min_rank = min(min_rank, rank_node.first); } From 220d33d003c2e08d4a47beafd02974c970cbe88d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 19 Oct 2023 15:05:03 -0400 Subject: [PATCH 30/47] better rgfa support in call --- src/graph_caller.cpp | 124 +++++++++++++++++++++++++++++++++++-------- src/graph_caller.hpp | 22 ++++++-- 2 files changed, 118 insertions(+), 28 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index c9a518718e0..18db98d1ce0 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -1,6 +1,7 @@ #include "graph_caller.hpp" #include "algorithms/expand_context.hpp" #include "annotation.hpp" +#include "rgfa.hpp" //#define debug @@ -18,34 +19,44 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re // Used to recurse on children of parents that can't be called size_t thread_count = get_thread_count(); - vector> snarl_queue(thread_count); + vector>> snarl_queue(thread_count); // Run the snarl caller on a snarl, and queue up the children if it fails - auto process_snarl = [&](const Snarl* snarl) { + auto process_snarl = [&](const Snarl* snarl, int ploidy_override) { if (!snarl_manager.is_trivial(snarl, graph)) { #ifdef debug cerr << "GraphCaller running call_snarl on " << pb2json(*snarl) << endl; #endif - - bool was_called = call_snarl(*snarl); - if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { - const vector& children = snarl_manager.children_of(snarl); - vector& thread_queue = snarl_queue[omp_get_thread_num()]; - thread_queue.insert(thread_queue.end(), children.begin(), children.end()); + const vector& children = snarl_manager.children_of(snarl); + vector child_ploidies(children.size(), -1); + bool was_called = call_snarl(*snarl, ploidy_override, &child_ploidies); + if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { + vector>& thread_queue = snarl_queue[omp_get_thread_num()]; + for (int64_t i = 0; i < children.size(); ++i) { + 
thread_queue.push_back(make_pair(children[i], child_ploidies[i])); + } } } }; // Start with the top level snarls - snarl_manager.for_each_top_level_snarl_parallel(process_snarl); + // Queue them up since process_snarl is no longer a valid callback for the iterator snarl_manager.for_each_top_level_snarl() + vector top_level_snarls; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + top_level_snarls.push_back(snarl); + }); +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < top_level_snarls.size(); ++i) { + process_snarl(top_level_snarls[i], -1); + } // Then recurse on any children the snarl caller failed to handle while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), - [](const vector& snarl_vec) {return snarl_vec.empty();})) { - vector cur_queue; - for (vector& thread_queue : snarl_queue) { + [](const vector>& snarl_vec) {return snarl_vec.empty();})) { + vector> cur_queue; + for (vector>& thread_queue : snarl_queue) { cur_queue.reserve(cur_queue.size() + thread_queue.size()); std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); @@ -53,7 +64,7 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re #pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < cur_queue.size(); ++i) { - process_snarl(cur_queue[i]); + process_snarl(cur_queue[i].first, cur_queue[i].second); } } @@ -175,8 +186,48 @@ vector GraphCaller::break_chain(const HandleGraph& graph, const Chain& ch return chain_frags; } + + +void GraphCaller::resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector& travs, + const vector& genotype, vector& child_ploidies) { + const vector& children = snarl_manager.children_of(snarl); + + child_ploidies.resize(children.size()); -VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false) + if (!children.empty()) { + // index the nodes in the traversals + vector> trav_indexes(genotype.size()); + for (int64_t i = 0; i < genotype.size(); ++i) { + const SnarlTraversal& trav = travs[genotype[i]]; + unordered_set& trav_idx = trav_indexes[i]; + if (i > 0 && genotype[i] == genotype[i-1]) { + trav_idx = trav_indexes[i-1]; + } else { + for (int j = 0; j < trav.visit_size(); ++j) { + trav_idx.insert(graph.get_handle(trav.visit(j).node_id(), trav.visit(j).backward())); + } + } + } + + // for every child snarl, count the number of traversals that spans it -- this will be its ploidy + for (int64_t i = 0; i < children.size(); ++i) { + const Snarl* child_snarl = children[i]; + child_ploidies[i] = 0; + handle_t child_start = graph.get_handle(child_snarl->start().node_id(), child_snarl->start().backward()); + handle_t child_end = graph.get_handle(child_snarl->end().node_id(), child_snarl->end().backward()); + for (int64_t j = 0; j < trav_indexes.size(); ++j) { + unordered_set& trav_idx = trav_indexes[j]; + if ((trav_idx.count(child_start) && trav_idx.count(child_end)) || + (trav_idx.count(graph.flip(child_start)) && trav_idx.count(graph.flip(child_end)))) { + ++child_ploidies[i]; + } + } + } + } +} + + +VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false), write_full_names(false) { output_variants.resize(get_thread_count()); } @@ -491,11 +542,11 @@ void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa // resolve subpath naming subrange_t subrange; - string basepath_name = 
Paths::strip_subrange(ref_path_name, &subrange); + string basepath_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(ref_path_name), &subrange); size_t basepath_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; // in VCF we usually just want a contig string contig_name = PathMetadata::parse_locus_name(basepath_name); - if (contig_name != PathMetadata::NO_LOCUS_NAME) { + if (contig_name != PathMetadata::NO_LOCUS_NAME && !this->write_full_names) { basepath_name = contig_name; } // fill out the rest of the variant @@ -766,6 +817,20 @@ void VCFOutputCaller::scan_snarl(const string& allele_string, function& ref_paths) { + set samples; + set haplotypes; + for (const string& path_name : ref_paths) { + samples.insert(PathMetadata::parse_sample_name(path_name)); + haplotypes.insert(PathMetadata::parse_haplotype(path_name)); + if (samples.size() > 1 || haplotypes.size() > 1) { + this->write_full_names = true; + return; + } + } + this->write_full_names = false; +} + GAFOutputCaller::GAFOutputCaller(AlignmentEmitter* emitter, const string& sample_name, const vector& ref_paths, size_t trav_padding) : emitter(emitter), @@ -1054,7 +1119,7 @@ VCFGenotyper::~VCFGenotyper() { } -bool VCFGenotyper::call_snarl(const Snarl& snarl) { +bool VCFGenotyper::call_snarl(const Snarl& snarl, int ploidy_override, vector* out_child_ploidies) { // could be that our graph is a subgraph of the graph the snarls were computed from // so bypass snarls we can't process @@ -1296,6 +1361,8 @@ LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph, // our graph is not in vg format. we will make graphs for each site as needed and work with those traversal_finder = nullptr; } + + this->toggle_full_names_from_paths(ref_paths); } LegacyCaller::~LegacyCaller() { @@ -1305,7 +1372,7 @@ LegacyCaller::~LegacyCaller() { } } -bool LegacyCaller::call_snarl(const Snarl& snarl) { +bool LegacyCaller::call_snarl(const Snarl& snarl, int ploidy_override, vector* out_child_ploidies) { // if we can't handle the snarl, then the GraphCaller framework will recurse on its children if (!is_traversable(snarl)) { @@ -1630,13 +1697,14 @@ FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; } + this->toggle_full_names_from_paths(ref_paths); } FlowCaller::~FlowCaller() { } -bool FlowCaller::call_snarl(const Snarl& managed_snarl) { +bool FlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector* out_child_ploidies) { // todo: In order to experiment with merging consecutive snarls to make longer traversals, // I am experimenting with sending "fake" snarls through this code. So make a local @@ -1644,6 +1712,11 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // wants a pointer will crash. Snarl snarl = managed_snarl; + if (ploidy_override == 0) { + // returning true is a bit ironic, but we do *not* want to recurse so it's important we do + return true; + } + if (snarl.start().node_id() == snarl.end().node_id() || !graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { // can't call one-node or out-of graph snarls. @@ -1803,10 +1876,14 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // use our support caller to choose our genotype vector trav_genotype; unique_ptr trav_call_info; - int ploidy = ref_ploidies[ref_path_name]; + int ploidy = ploidy_override == -1 ? 
ref_ploidies[ref_path_name] : ploidy_override; std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); - + if (out_child_ploidies != nullptr) { + // derive ploidies for child snarls from the genotype traversals and save them + resolve_child_ploidies(graph, &managed_snarl, travs, trav_genotype, *out_child_ploidies); + } + assert(trav_genotype.empty() || trav_genotype.size() == ploidy); if (!gaf_output) { @@ -1866,14 +1943,15 @@ NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, ref_path_set.insert(ref_paths[i]); ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; } - + + this->toggle_full_names_from_paths(ref_paths); } NestedFlowCaller::~NestedFlowCaller() { } -bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl) { +bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector* out_child_ploidies) { // remember the calls for each child snarl in this table CallTable call_table; diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 25d719aeb3d..c753bb315ff 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -49,12 +49,16 @@ class GraphCaller { RecurseType recurise_type = RecurseOnFail); /// Call a given snarl, and print the output to out_stream - virtual bool call_snarl(const Snarl& snarl) = 0; + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr) = 0; protected: /// Break up a chain into bits that we want to call using size heuristics vector break_chain(const HandleGraph& graph, const Chain& chain, size_t max_edges, size_t max_trivial); + + /// Compute the child ploidies baseds on the called genotype + void resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector& travs, + const vector& genotype, vector& child_ploidies); protected: @@ -135,6 +139,11 @@ class VCFOutputCaller { // update the PS and LV tags in the output buffer (called in write_variants if include_nested is true) void update_nesting_info_tags(const SnarlManager* snarl_manager); + + // toggle write_full_names automatically depending on ref_paths + // (keeps backwards compatibility when it was always false whenever possible, + // but flips to full names when necessary) + void toggle_full_names_from_paths(const vector& ref_paths); /// output vcf mutable vcflib::VariantCallFile output_vcf; @@ -154,6 +163,9 @@ class VCFOutputCaller { // need to write LV/PS info tags bool include_nested; + + // toggle writing full name or just contig name + bool write_full_names; }; /** @@ -221,7 +233,7 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutpu virtual ~VCFGenotyper(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -274,7 +286,7 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { virtual ~LegacyCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -375,7 +387,7 @@ class FlowCaller : public GraphCaller, 
public VCFOutputCaller, public GAFOutputC virtual ~FlowCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -449,7 +461,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO virtual ~NestedFlowCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; From f23e65a8908419bbe108e2bb15dd031e275193b9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Nov 2023 11:42:10 -0500 Subject: [PATCH 31/47] fix merge re -S option --- src/subcommand/call_main.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index a8e76c8e478..1d59ac45b71 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -584,8 +584,18 @@ int main_call(int argc, char** argv) { // make sure we have some ref paths if (ref_paths.empty()) { - if (!ref_sample.empty()) { - cerr << "error [vg call]: No paths with selected reference sample \"" << ref_sample << "\" found. " + if (!ref_sample_set.empty()) { + string sample_string; + if (ref_sample_set.size() == 1) { + sample_string = "sample \"" + *ref_sample_set.begin() + "\""; + } else { + sample_string = "samples \""; + for (const string& rsn : ref_sample_set) { + sample_string += (sample_string.length() > 9 ? " " : "") + rsn; + } + sample_string += "\""; + } + cerr << "error [vg call]: No paths with selected reference " << sample_string << " found. 
" << "Try using vg paths -M to see which samples are in your graph" << endl; return 1; } From 6ebd95f1645bc796d0f9a4260803fa2fe0f75b78 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 27 Mar 2024 12:51:02 -0400 Subject: [PATCH 32/47] merge contiguous rgfa fragments --- src/rgfa.cpp | 70 +++++++++++++++++++++++++++++++++++++++++++--------- src/rgfa.hpp | 7 ++++++ 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d1d8fd289e6..237c7cb8f81 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -146,7 +146,9 @@ void RGFACover::compute(const PathHandleGraph* graph, vector>& thread_rgfa_intervals = rgfa_intervals_vector[omp_get_thread_num()]; unordered_map& thread_node_to_interval = node_to_interval_vector[omp_get_thread_num()]; - vector queue = {snarl}; + vector queue = {snarl}; + + cerr << "top level snarl " << pb2json(*snarl) << endl; while(!queue.empty()) { const Snarl* cur_snarl = queue.back(); @@ -576,19 +578,13 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } continue; } - - // add the interval to the local (thread safe) structures - step_handle_t step = trav[uncovered_interval.first]; - int64_t interval_length = uncovered_interval.second - uncovered_interval.first; + pair new_interval = make_pair(trav.at(uncovered_interval.first), + graph->get_next_step(trav.at(uncovered_interval.second - 1))); #ifdef debug + int64_t interval_length = uncovered_interval.second - uncovered_interval.first; cerr << "adding interval with length " << interval_length << endl; #endif - for (int64_t i = 0; i < interval_length; ++i) { - thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); - step = graph->get_next_step(step); - } - thread_rgfa_intervals.push_back(make_pair(trav.at(uncovered_interval.first), - graph->get_next_step(trav.at(uncovered_interval.second - 1)))); + add_interval(thread_rgfa_intervals, thread_node_to_interval, new_interval); } } @@ -621,6 +617,54 @@ vector> RGFACover::get_uncovered_intervals(const vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval) { + + bool merged = false; + path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); + + // check the before-first step. if it's in an interval then it must be immediately + // preceeding so we merge the new interval to the end of the found interval + step_handle_t before_first_step = graph->get_previous_step(new_interval.first); + if (before_first_step != graph->path_front_end(graph->get_path_handle_of_step(before_first_step))) { + nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); + if (thread_node_to_interval.count(prev_node_id)) { + pair& prev_interval = thread_rgfa_intervals[thread_node_to_interval[prev_node_id]]; + if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { + assert(prev_interval.second == new_interval.first); + prev_interval.second = new_interval.second; + merged = true; + } + } + } + + // check the end step. 
if it's in an interval then it must be immediately + // following we merge the new interval to the front of the found interval + if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { + nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); + if (thread_node_to_interval.count(next_node_id)) { + pair& next_interval = thread_rgfa_intervals[thread_node_to_interval[next_node_id]]; + path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); + if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { + assert(next_interval.first == new_interval.second); + next_interval.first = new_interval.first; + merged = true; + } + } + } + + // add the interval to the local (thread safe) structures + if (!merged) { + for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); + } + thread_rgfa_intervals.push_back(new_interval); + } + + return !merged; +} + int64_t RGFACover::get_coverage(const vector& trav, const pair& uncovered_interval) { path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); int64_t coverage = 0; @@ -635,7 +679,9 @@ int64_t RGFACover::get_coverage(const vector& trav, const pair> get_uncovered_intervals(const vector& trav, const unordered_map& thread_node_to_interval); + // add a new interval into the rgfa_intervals veector and update the node_to_interval map + // if the interval can be merged into an existing, contiguous interval, do that instead + // returns true if a new interval was added, false if an existing interval was updated + bool add_interval(vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval); + // get the total coverage of a traversal (sum of step lengths) int64_t get_coverage(const vector& trav, const pair& uncovered_interval); From 9680149881b32d989520dc103da80d5907cedf8c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 28 Mar 2024 16:11:30 -0400 Subject: [PATCH 33/47] disable rank check --- src/algorithms/gfa_to_handle.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index c8a483f10ea..d58479cdc72 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -303,7 +303,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* string rgfa_path_name; if (path_rank > 0) { // Special logic for off-reference paths, which get loaded into special rGFA cover paths - rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, true); + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false); } else { // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, @@ -1044,7 +1044,13 @@ void GFAParser::parse(istream& in) { } else { // This path existed already. 
Make sure we aren't showing a conflicting rank if (rgfa_path_rank != get<0>(found->second)) { - throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second))); + // vg paths can now write different fragments of the same sample/hap with different ranks + // (unlike minigraph where all fragments of teh same assembly share a rank) + // by consequence, this check needs to be disabled. + // TODO: we still want to check that each individual fragment has consisten ranks + // so the check should be moved downstream + // + // throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second))); } } auto& visit_queue = get<2>(found->second); From 89af9b6c89ad058234544cf8ec6468a94407d688 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 5 Apr 2024 09:57:04 -0400 Subject: [PATCH 34/47] auto-add rgfa prefix during rgfa conversion --- src/subcommand/convert_main.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index 7474f5a521e..d8981e1f02d 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -6,6 +6,7 @@ #include "../algorithms/find_gbwtgraph.hpp" #include "../io/save_handle_graph.hpp" #include "../gfa.hpp" +#include "../rgfa.hpp" #include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include @@ -236,6 +237,13 @@ int main_convert(int argc, char** argv) { cerr << "[vg convert] warning: vg-protobuf output (-v / --vg-out) is deprecated. please use -p instead." << endl; } + // we interpret the user path selections as reference path selections (consistent with vg paths) + // so we need to add the rgfa keyword to pick up the nonref paths + if ((!rgfa_paths.empty() || !rgfa_prefixes.empty()) && + std::find(rgfa_prefixes.begin(), rgfa_prefixes.end(), RGFACover::rgfa_sample_name) == rgfa_prefixes.end()) { + rgfa_prefixes.push_back(RGFACover::rgfa_sample_name); + } + // with -F or -G we convert an alignment and not a graph if (input == input_gam || input == input_gaf) { From 7ce91ebf7b75f96dcf463f94ff0f063155a4bf29 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 5 Apr 2024 16:45:37 -0400 Subject: [PATCH 35/47] merge adjacent intervals from different threads --- src/rgfa.cpp | 76 ++++++++++++++++++++++++++++++++++++---------------- src/rgfa.hpp | 3 ++- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 237c7cb8f81..243899d2425 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -128,6 +128,7 @@ void RGFACover::compute(const PathHandleGraph* graph, this->num_ref_intervals = this->rgfa_intervals.size(); #ifdef debug +#pragma omp critical(cerr) cerr << "[rgfa] Selected " << rgfa_intervals.size() << " rank=0 reference paths" << endl; #endif @@ -148,8 +149,6 @@ void RGFACover::compute(const PathHandleGraph* graph, vector queue = {snarl}; - cerr << "top level snarl " << pb2json(*snarl) << endl; - while(!queue.empty()) { const Snarl* cur_snarl = queue.back(); queue.pop_back(); @@ -168,24 +167,23 @@ void RGFACover::compute(const PathHandleGraph* graph, } }); - // now we need to fold up the thread covers + // now we need to fold up the thread covers for (int64_t t = 0; t < thread_count; ++t) { - int64_t offset = this->rgfa_intervals.size(); - - // todo: figure out one-shot stl invocation to move - for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); 
++j) { - this->rgfa_intervals.push_back(rgfa_intervals_vector[t][j]); - } #ifdef debug +#pragma omp critical(cerr) cerr << "Adding " << rgfa_intervals_vector[t].size() << " rgfa intervals from thread " << t << endl; #endif - rgfa_intervals_vector[t].clear(); - - for (const auto& node_interval : node_to_interval_vector[t]) { - this->node_to_interval[node_interval.first] = node_interval.second + offset; + // important to go through function rather than do a raw copy since + // inter-top-level snarl merging may need to happen + for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) { + // the true flag at the end disables the overlap check. since they were computed + // in separate threads, snarls can overlap by a single node + add_interval(this->rgfa_intervals, this->node_to_interval, rgfa_intervals_vector[t][j], true); } + rgfa_intervals_vector[t].clear(); node_to_interval_vector[t].clear(); } + // todo: we could optionally go through uncovered nodes and try to greedily search // for rgfa intervals at this point, since it seems for some graphs there are @@ -345,7 +343,10 @@ void RGFACover::load(const PathHandleGraph* graph, void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { assert(this->graph == static_cast(mutable_graph)); - +#ifdef debug + cerr << "applying rGFA cover with " << this->num_ref_intervals << " ref intervals " + << " and " << this->rgfa_intervals.size() << " total intervals" << endl; +#endif // index paths isued in rgfa cover unordered_map step_to_offset; unordered_set source_path_set; @@ -484,6 +485,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } } #ifdef debug +#pragma omp critical(cerr) cerr << "doing snarl " << pb2json(snarl.start()) << "-" << pb2json(snarl.end()) << " with " << travs.size() << " travs" << endl; #endif @@ -537,9 +539,12 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav const pair& uncovered_interval = best_stats_fragment.fragment; #ifdef debug +#pragma omp critical(cerr) + { cerr << "best trav: "; for (auto xx : trav) cerr << " " << graph->get_id(graph->get_handle_of_step(xx)); cerr << endl << "uncovered interval [" << uncovered_interval.first << "," << uncovered_interval.second << "]" <get_next_step(trav.at(uncovered_interval.second - 1))); #ifdef debug int64_t interval_length = uncovered_interval.second - uncovered_interval.first; +#pragma omp critical(cerr) cerr << "adding interval with length " << interval_length << endl; #endif add_interval(thread_rgfa_intervals, thread_node_to_interval, new_interval); @@ -619,9 +625,17 @@ vector> RGFACover::get_uncovered_intervals(const vector>& thread_rgfa_intervals, unordered_map& thread_node_to_interval, - const pair& new_interval) { + const pair& new_interval, + bool global) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; +#endif bool merged = false; + int64_t merged_interval_idx = -1; path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); // check the before-first step. 
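// [Editor's note, illustrative only; ids invented.] The global=true mode used in the fold-up
// above exists because two worker threads that processed adjacent top-level snarls both
// include the shared boundary node, so their intervals on the same fragment path may overlap
// by exactly one step. For example, thread 0 might contribute [step(10), step(13)) (nodes
// 10-12) and thread 1 [step(12), step(15)) (nodes 12-14); with global=true the one-node
// overlap is accepted and the two are stitched into [step(10), step(15)), whereas the
// per-thread calls (global=false) insist that merged intervals abut exactly.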
if it's in an interval then it must be immediately @@ -630,11 +644,19 @@ bool RGFACover::add_interval(vector>& thread_ if (before_first_step != graph->path_front_end(graph->get_path_handle_of_step(before_first_step))) { nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); if (thread_node_to_interval.count(prev_node_id)) { - pair& prev_interval = thread_rgfa_intervals[thread_node_to_interval[prev_node_id]]; + int64_t prev_idx = thread_node_to_interval[prev_node_id]; + pair& prev_interval = thread_rgfa_intervals[prev_idx]; if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { - assert(prev_interval.second == new_interval.first); +#ifdef debug +#pragma omp critical(cerr) + cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(prev_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; +#endif + assert(global || prev_interval.second == new_interval.first); prev_interval.second = new_interval.second; merged = true; + merged_interval_idx = prev_idx; } } } @@ -644,24 +666,32 @@ bool RGFACover::add_interval(vector>& thread_ if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); if (thread_node_to_interval.count(next_node_id)) { - pair& next_interval = thread_rgfa_intervals[thread_node_to_interval[next_node_id]]; + int64_t next_idx = thread_node_to_interval[next_node_id]; + pair& next_interval = thread_rgfa_intervals[next_idx]; path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { - assert(next_interval.first == new_interval.second); +#ifdef debug +#pragma omp critical(cerr) + cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; +#endif + assert(global || next_interval.first == new_interval.second); next_interval.first = new_interval.first; merged = true; + merged_interval_idx = next_idx; } } } // add the interval to the local (thread safe) structures if (!merged) { - for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { - thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); - } + merged_interval_idx = thread_rgfa_intervals.size(); thread_rgfa_intervals.push_back(new_interval); } - + for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } return !merged; } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 74311da333f..6204621b991 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -106,7 +106,8 @@ class RGFACover { // returns true if a new interval was added, false if an existing interval was updated bool add_interval(vector>& thread_rgfa_intervals, unordered_map& thread_node_to_interval, - const pair& new_interval); + const pair& new_interval, + bool global = false); // get the total coverage of a traversal (sum of step lengths) int64_t get_coverage(const vector& trav, const pair& uncovered_interval); From 
0fa7487a13f70cf36077a534844bc0b844e41901 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 10 Apr 2024 13:08:13 -0400 Subject: [PATCH 36/47] fix merge --- src/graph_caller.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index b0987eafedb..fd09a799224 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -66,7 +66,15 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re }; // Start with the top level snarls - snarl_manager.for_each_top_level_snarl_parallel(process_snarl); + // Queue them up since process_snarl is no longer a valid callback for the iterator snarl_manager.for_each_top_level_snarl() + vector top_level_snarls; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + top_level_snarls.push_back(snarl); + }); +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < top_level_snarls.size(); ++i) { + process_snarl(top_level_snarls[i], -1); + } if (show_progress) cerr << "[vg call]: Finished processing " << top_snarl_count << " top-level snarls" << endl; top_level = false; From 1f5e9280ec2bcf75c59deb1c3f8777d06d3330ac Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 11 Apr 2024 14:30:05 -0400 Subject: [PATCH 37/47] fix bug where cyclic paths mishandled during interval merging --- src/rgfa.cpp | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 243899d2425..2068555b272 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -631,8 +631,10 @@ bool RGFACover::add_interval(vector>& thread_ #ifdef debug #pragma omp critical(cerr) cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) + << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.first)) ? "<" : ">") << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; #endif bool merged = false; int64_t merged_interval_idx = -1; @@ -645,18 +647,22 @@ bool RGFACover::add_interval(vector>& thread_ nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); if (thread_node_to_interval.count(prev_node_id)) { int64_t prev_idx = thread_node_to_interval[prev_node_id]; - pair& prev_interval = thread_rgfa_intervals[prev_idx]; + pair& prev_interval = thread_rgfa_intervals[prev_idx]; if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { + if (prev_interval.second == new_interval.first || + (global && graph->get_previous_step(prev_interval.second) == new_interval.first)) { #ifdef debug #pragma omp critical(cerr) - cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) - << ":" << graph->get_id(graph->get_handle_of_step(prev_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; -#endif - assert(global || prev_interval.second == new_interval.first); - prev_interval.second = new_interval.second; - merged = true; - merged_interval_idx = prev_idx; + cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) + << ":" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.first)) ? 
"<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.first)) + << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; +#endif + prev_interval.second = new_interval.second; + merged = true; + merged_interval_idx = prev_idx; + } } } } @@ -670,16 +676,18 @@ bool RGFACover::add_interval(vector>& thread_ pair& next_interval = thread_rgfa_intervals[next_idx]; path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { + if (next_interval.first == new_interval.second || + (global && next_interval.first == graph->get_previous_step(new_interval.second))) { #ifdef debug #pragma omp critical(cerr) - cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) - << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; -#endif - assert(global || next_interval.first == new_interval.second); - next_interval.first = new_interval.first; - merged = true; - merged_interval_idx = next_idx; + cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; +#endif + next_interval.first = new_interval.first; + merged = true; + merged_interval_idx = next_idx; + } } } } From 56dc5c212d7858ef8b5cffacd74831d940ce8637 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 12 Apr 2024 12:46:23 -0400 Subject: [PATCH 38/47] Revert "auto-add rgfa prefix during rgfa conversion" This reverts commit 89af9b6c89ad058234544cf8ec6468a94407d688. --- src/subcommand/convert_main.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index d8981e1f02d..7474f5a521e 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -6,7 +6,6 @@ #include "../algorithms/find_gbwtgraph.hpp" #include "../io/save_handle_graph.hpp" #include "../gfa.hpp" -#include "../rgfa.hpp" #include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include @@ -237,13 +236,6 @@ int main_convert(int argc, char** argv) { cerr << "[vg convert] warning: vg-protobuf output (-v / --vg-out) is deprecated. please use -p instead." 
<< endl; } - // we interpret the user path selections as reference path selections (consistent with vg paths) - // so we need to add the rgfa keyword to pick up the nonref paths - if ((!rgfa_paths.empty() || !rgfa_prefixes.empty()) && - std::find(rgfa_prefixes.begin(), rgfa_prefixes.end(), RGFACover::rgfa_sample_name) == rgfa_prefixes.end()) { - rgfa_prefixes.push_back(RGFACover::rgfa_sample_name); - } - // with -F or -G we convert an alignment and not a graph if (input == input_gam || input == input_gaf) { From 54712e1a88e16bd563ad39859478fb4788e0458c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 12 Apr 2024 12:50:25 -0400 Subject: [PATCH 39/47] fix debug message --- src/rgfa.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 2068555b272..5dae86dacd8 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -632,9 +632,13 @@ bool RGFACover::add_interval(vector>& thread_ #pragma omp critical(cerr) cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.first)) ? "<" : ">") - << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)) - << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)); + if (new_interval.second == graph->path_end(graph->get_path_handle_of_step(new_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + } #endif bool merged = false; int64_t merged_interval_idx = -1; @@ -655,9 +659,13 @@ bool RGFACover::add_interval(vector>& thread_ #pragma omp critical(cerr) cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) << ":" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.first)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(prev_interval.first)) - << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; + << graph->get_id(graph->get_handle_of_step(prev_interval.first)); + if (prev_interval.second == graph->path_end(graph->get_path_handle_of_step(prev_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? 
"<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; + } #endif prev_interval.second = new_interval.second; merged = true; From 69fc397e004b64ed67103fac643e653bbdcaafff Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 12 Apr 2024 12:59:56 -0400 Subject: [PATCH 40/47] fix debug msg --- src/rgfa.cpp | 8 ++++++-- src/subcommand/paths_main.cpp | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5dae86dacd8..b4acf7b21c9 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -689,8 +689,12 @@ bool RGFACover::add_interval(vector>& thread_ #ifdef debug #pragma omp critical(cerr) cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) - << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)); + if (next_interval.second == graph->path_end(graph->get_path_handle_of_step(next_interval.second))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; + } #endif next_interval.first = new_interval.first; merged = true; diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index ea7cc56096b..b045247463c 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -42,7 +42,7 @@ void help_paths(char** argv) { << " -r, --retain-paths output a graph with only the selected paths retained" << endl << " rGFA cover" << endl << " -f, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl - << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover." << endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl << " output path data:" << endl From 3093eff4a9df7e989e46f2e57afe99d26ecc798a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 13 Apr 2024 14:58:19 -0400 Subject: [PATCH 41/47] fix bug where double-ended merge would lead to invalid overlap --- src/rgfa.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++---- src/rgfa.hpp | 5 +++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index b4acf7b21c9..12b0bf56e6b 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -183,7 +183,9 @@ void RGFACover::compute(const PathHandleGraph* graph, rgfa_intervals_vector[t].clear(); node_to_interval_vector[t].clear(); } - + + // remove any intervals that were made redundant by add_interval + defragment_intervals(); // todo: we could optionally go through uncovered nodes and try to greedily search // for rgfa intervals at this point, since it seems for some graphs there are @@ -677,6 +679,7 @@ bool RGFACover::add_interval(vector>& thread_ // check the end step. 
if it's in an interval then it must be immediately // following we merge the new interval to the front of the found interval + int64_t deleted_idx = -1; if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); if (thread_node_to_interval.count(next_node_id)) { @@ -696,9 +699,19 @@ bool RGFACover::add_interval(vector>& thread_ cerr << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; } #endif - next_interval.first = new_interval.first; - merged = true; - merged_interval_idx = next_idx; + if (merged == true) { + // decomission next_interval + next_interval.first = graph->path_end(next_path); + next_interval.second = graph->path_front_end(next_path); + deleted_idx = next_idx; + // extend the previous interval right + thread_rgfa_intervals[merged_interval_idx].second = new_interval.second; + } else { + // extend next_interval left + next_interval.first = new_interval.first; + merged = true; + merged_interval_idx = next_idx; + } } } } @@ -712,9 +725,35 @@ bool RGFACover::add_interval(vector>& thread_ for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; } + if (deleted_idx >= 0) { + // move the links to the deleted interval to the new interval + const pair& deleted_interval = thread_rgfa_intervals[deleted_idx]; + for (step_handle_t step = deleted_interval.first; step != deleted_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + } return !merged; } +void RGFACover::defragment_intervals() { + vector> new_intervals; + this->node_to_interval.clear(); + for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + if (interval.first != graph->path_end(path_handle)) { + new_intervals.push_back(interval); + } + } + this->rgfa_intervals = std::move(new_intervals); + for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + this->node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = i; + } + } +} + int64_t RGFACover::get_coverage(const vector& trav, const pair& uncovered_interval) { path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); int64_t coverage = 0; diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 6204621b991..a5f44a74137 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -109,6 +109,11 @@ class RGFACover { const pair& new_interval, bool global = false); + // add_interval() can delete an existing interval. ex if ABC are contiguous intervals and B is added to A_C, + // then A gets extended out to the whole range and C is left dangling. Since we're using a vector, + // this requires a full update of everything at the end. 
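// [Editor's walk-through of the comment above; slot numbers are hypothetical.] With intervals
// stored at slot 0 = A and slot 1 = C, adding the bridging interval B first extends A over B,
// then finds C starting at B's end: slot 0 is widened to span A+B+C, slot 1 is "decommissioned"
// by collapsing it to the empty range (path_end, path_front_end), and the node_to_interval
// entries that pointed at slot 1 are redirected to slot 0. The dead slot is why compute()
// calls defragment_intervals() after the fold-up: it copies the non-empty intervals into a
// fresh vector and rebuilds node_to_interval so interval indices are dense again.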
+ void defragment_intervals(); + // get the total coverage of a traversal (sum of step lengths) int64_t get_coverage(const vector& trav, const pair& uncovered_interval); From 286ec9fe1049745abf4009904ea5cfdbfff57cf2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 13 Apr 2024 16:39:12 -0400 Subject: [PATCH 42/47] fix bug in last commit --- src/rgfa.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 12b0bf56e6b..8c8478ff452 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -178,7 +178,10 @@ void RGFACover::compute(const PathHandleGraph* graph, for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) { // the true flag at the end disables the overlap check. since they were computed // in separate threads, snarls can overlap by a single node - add_interval(this->rgfa_intervals, this->node_to_interval, rgfa_intervals_vector[t][j], true); + const pair& interval = rgfa_intervals_vector[t][j]; + if (interval.first != graph->path_end(graph->get_path_handle_of_step(interval.first))) { + add_interval(this->rgfa_intervals, this->node_to_interval, rgfa_intervals_vector[t][j], true); + } } rgfa_intervals_vector[t].clear(); node_to_interval_vector[t].clear(); From 16e3d6186c3c008cbef83164c0cda9358d9b5658 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sun, 21 Apr 2024 15:41:28 -0400 Subject: [PATCH 43/47] treat phase block as subrange to hand converted gbz --- src/rgfa.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 8c8478ff452..d7f6b71df5e 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -23,7 +23,15 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); if (path_subrange == PathMetadata::NO_SUBRANGE) { - path_subrange = make_pair(0, 0); + if (path_phase_block != PathMetadata::NO_PHASE_BLOCK && path_phase_block > 0) { + // gbz puts subranges into phase blocks + // so we interpret them as subranges here, which allows + // things like vg convert graph.gbz | vg paths -x - -f ... to make cover + path_subrange.first = path_phase_block; + path_subrange.second = PathMetadata::NO_END_POSITION; + } else { + path_subrange = make_pair(0, 0); + } } // we move the sample into the locus From f5f211bc886a6c447f3ace43bbdf44297234e209 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 10:27:56 -0400 Subject: [PATCH 44/47] opt to print rgfa stats table --- src/rgfa.cpp | 243 +++++++++++++++++++++++----------- src/rgfa.hpp | 6 + src/subcommand/paths_main.cpp | 29 ++-- 3 files changed, 194 insertions(+), 84 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d7f6b71df5e..a9b409a10aa 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include //#define debug @@ -126,7 +128,8 @@ void RGFACover::compute(const PathHandleGraph* graph, nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); if (node_to_interval.count(node_id)) { cerr << "[rgfa error]: node " << node_id << " covered by two reference paths," - << " including " << graph->get_path_name(ref_path_handle) + << " including " << graph->get_path_name(ref_path_handle) << " and " + << graph->get_path_name(graph->get_path_handle_of_step(rgfa_intervals.at(node_to_interval.at(node_id)).first)) << ". 
rGFA support current requires disjoint acyclic reference paths" << endl; exit(1); } @@ -930,6 +933,24 @@ vector> RGFACover::get_reference_nodes(nid_t node_id, bool return output_reference_nodes; } +pair RGFACover::parse_variant_id(const string& variant_id) const { + + if (variant_id[0] == '>' || variant_id[0] == '<') { + size_t p = variant_id.find_first_of("<>", 2); + if (p != string::npos) { + int64_t id1 = parse(variant_id.substr(1, p-1)); + int64_t id2 = parse(variant_id.substr(p+1)); + if (!graph->has_node(id1) || !graph->has_node(id2)) { + throw runtime_error("Unable to parse " + variant_id + ": node(s) not found in graph"); + } + return make_pair(graph->get_handle(id1, variant_id[0] == '<'), + graph->get_handle(id2, variant_id[p] == '<')); + } + } + throw runtime_error("Failed to parse " + variant_id); + return make_pair(handle_t(), handle_t()); +} + void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { vcflib::Variant var(vcf); @@ -951,12 +972,6 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { } } - // also want an rGFA path lookup - unordered_map name_to_rgfa_path; - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - name_to_rgfa_path[RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle))] = path_handle; - }); - // remove header lines we're going to add vcf.removeInfoHeaderLine("R_CHROM"); vcf.removeInfoHeaderLine("R_START"); @@ -971,80 +986,156 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { os << vcf.header << endl; - string prev_sequence_name; - string prev_r_chrom; - string prev_r_start; - string prev_r_end; - string prev_rank; while (vcf.getNextVariant(var)) { - if (name_to_rgfa_path.count(var.sequenceName)) { - string r_chrom; - string r_start; - string r_end; - string rank; - if (var.sequenceName == prev_sequence_name) { - // just use the previous values, which will be the same - r_chrom = prev_r_chrom; - r_start = prev_r_start; - r_end = prev_r_end; - rank = prev_rank; - } else { - // compute from the cover - path_handle_t path_handle = name_to_rgfa_path.at(var.sequenceName); - nid_t first_node = graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle))); - vector> ref_nodes = this->get_reference_nodes(first_node, false); - - int64_t min_ref_pos = numeric_limits::max(); - int64_t max_ref_pos = -1; - int64_t min_rank = numeric_limits::max(); - for (const pair& rank_node : ref_nodes) { - step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; - path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); - string name = graph->get_path_name(ref_path); - // we assume one reference contig (which is built into the whole structure) - assert(r_chrom.empty() || r_chrom == name); - r_chrom = name; - int64_t ref_pos = node_to_ref_pos.at(rank_node.second); - // assume snarl is forward on both reference nodes - // todo: this won't be exact for some inversion cases, I don't think -- - // need to test these and either add check / or move to oriented search - min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); - max_ref_pos = max(max_ref_pos, ref_pos); - min_rank = min(min_rank, rank_node.first); - } + pair id_handles = parse_variant_id(var.id); + string r_chrom; + string r_start; + string r_end; + string rank; + + // compute from the cover + nid_t first_node = graph->get_id(id_handles.first); + vector> ref_nodes = 
this->get_reference_nodes(first_node, false); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); + min_rank = min(min_rank, rank_node.first); + } - r_start = std::to_string(min_ref_pos); - r_end = std::to_string(max_ref_pos); - rank = std::to_string(min_rank); - } + r_start = std::to_string(min_ref_pos); + r_end = std::to_string(max_ref_pos); + rank = std::to_string(min_rank); + + var.info["R_CHROM"] = {r_chrom}; + var.info["R_START"] = {r_start}; + var.info["R_END"] = {r_end}; + var.info["RANK"] = {rank}; + + os << var << endl; + } +} - if (!var.info.count("R_CHROM")) { - var.format.push_back("R_CHROM"); - } - var.info["R_CHROM"] = {r_chrom}; - if (!var.info.count("R_START")) { - var.format.push_back("R_START"); - } - var.info["R_START"] = {r_start}; - if (!var.info.count("R_END")) { - var.format.push_back("R_END"); - } - var.info["R_END"] = {r_end}; - if (!var.info.count("RANK")) { - var.format.push_back("RANK"); +void RGFACover::print_stats(ostream& os) { + + // todo: mostly copied from annotate function above. 
should probably refactor common + // logic into its own, shared thing.y + unordered_map node_to_ref_pos; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->rgfa_intervals.at(i); + // assumption: ref intervals span entire path + assert(graph->path_begin(graph->get_path_handle_of_step(ref_interval.first)) == ref_interval.first); + int64_t pos = 0; + for (step_handle_t step_handle = ref_interval.first; step_handle != ref_interval.second; + step_handle = graph->get_next_step(step_handle)) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + assert(!graph->get_is_reverse(handle)); + assert(!node_to_ref_pos.count(node_id)); + node_to_ref_pos[node_id] = pos; + pos += graph->get_length(handle); + } + } + + // the header + os << "#Sample" << "\t" + << "Haplotype" << "\t" + << "Locus" << "\t" + << "Start" << "\t" + << "End" << "\t" + << "NodeStart" << "\t" + << "NodeEnd" << "\t" + << "Rank" << "\t" + << "AvgDepth" << "\t" + << "AvgSampleDepth" << "\t" + << "RefSequence" << "\t" + << "RefStart" << "\t" + << "RefEnd" << endl; + + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + + int64_t path_length = 0; + int64_t tot_depth = 0; + int64_t tot_sample_depth = 0; + int64_t tot_steps = 0; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + path_length += graph->get_length(graph->get_handle_of_step(step)); + set sample_set; + vector steps = graph->steps_of_handle(graph->get_handle_of_step(step)); + for (step_handle_t& other_step : steps) { + path_handle_t other_path = graph->get_path_handle_of_step(other_step); + sample_set.insert(graph->get_sample_name(other_path)); } - var.info["RANK"] = {rank}; - - prev_sequence_name = var.sequenceName; - prev_r_chrom = r_chrom; - prev_r_start = r_start; - prev_r_end = r_end; - prev_rank = rank; - - } else { - //cerr << "rGFA [warning]: VCF reference " << var.sequenceName << " not found in graph" << endl; + tot_depth += steps.size(); + tot_sample_depth += sample_set.size(); + // we don't want to count the rgfa cover + // todo (can remove this when move away from path-based scheme) + assert(sample_set.count(RGFACover::rgfa_sample_name)); + --tot_depth; + --tot_sample_depth; + ++tot_steps; } - os << var << endl; + pair sample_locus = RGFACover::parse_rgfa_locus_name(graph->get_locus_name(path_handle)); + size_t haplotype = graph->get_haplotype(path_handle); + + nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + // interval is open ended, so we go back to last node + handle_t last_handle = graph->get_handle_of_step(graph->get_previous_step(interval.second)); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + string r_chrom; + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || 
r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); + min_rank = min(min_rank, rank_node.first); + } + + + os << sample_locus.first << "\t" + << sample_locus.second << "\t" + << (haplotype != PathMetadata::NO_HAPLOTYPE ? (int64_t)haplotype : (int64_t)-1) << "\t" + << rgfa_subrange.first << "\t" + << (rgfa_subrange.first + path_length) << "\t" + << (graph->get_is_reverse(graph->get_handle_of_step(interval.first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(interval.first)) << "\t" + << (graph->get_is_reverse(last_handle) ? "<" : ">") + << graph->get_id(last_handle) << "\t" + << min_rank << "\t" + << std::fixed << std::setprecision(2) << (tot_depth / tot_steps) << "\t" + << std::fixed << std::setprecision(2) << (tot_sample_depth / tot_steps) << "\t" + << Paths::strip_subrange(r_chrom) << "\t" + << min_ref_pos << "\t" + << max_ref_pos << "\n"; } } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index a5f44a74137..db5abdfe850 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -85,8 +85,14 @@ class RGFACover { // return nullptr if node not in an interval const pair* get_interval(nid_t node_id) const; + // parse an id of the form >244>2334 and return the pair of handles + pair parse_variant_id(const string& variant_id) const; + // add R_CHROM, R_START, R_END, F_LEN tags to a VCF using the cover void annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os); + + // print out a table of statistics + void print_stats(ostream& os); protected: diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index b045247463c..e5621f0c287 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -45,6 +45,7 @@ void help_paths(char** argv) { << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover." << endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl + << " --rgfa-intervals print out table of rgfa intervals and their ranks and positions" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -99,6 +100,7 @@ unordered_map SENSE_TO_STRING { // Long options with no corresponding short options. 
const int OPT_SELECT_RGFA_FRAGMENTS = 1000; const int OPT_ANNOTATE_RGFA_VCF = 1001; +const int OPT_RGFA_INTERVALS = 1002; int main_paths(int argc, char** argv) { @@ -116,6 +118,7 @@ int main_paths(int argc, char** argv) { bool retain_paths = false; int64_t rgfa_min_len = -1; string rgfa_vcf_filename; + bool rgfa_print_intervals = false; string snarl_filename; string graph_file; string gbwt_file; @@ -165,6 +168,7 @@ int main_paths(int argc, char** argv) { {"coverage", no_argument, 0, 'c'}, {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, {"rgfa-vcf", required_argument, 0, OPT_ANNOTATE_RGFA_VCF}, + {"rgfa-intervals", no_argument, 0, OPT_RGFA_INTERVALS}, {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. {"threads-by", required_argument, 0, 'q'}, @@ -302,6 +306,11 @@ int main_paths(int argc, char** argv) { rgfa_vcf_filename = optarg; output_formats++; break; + + case OPT_RGFA_INTERVALS: + rgfa_print_intervals = true; + output_formats++; + break; case 't': { @@ -365,7 +374,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf, --rgfa-intervals) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -673,7 +682,7 @@ int main_paths(int argc, char** argv) { // output the graph vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); - } else if (!rgfa_vcf_filename.empty()) { + } else if (!rgfa_vcf_filename.empty() || rgfa_print_intervals) { RGFACover rgfa_cover; // load up the rank-0 reference path selection unordered_set reference_paths; @@ -686,13 +695,17 @@ int main_paths(int argc, char** argv) { cerr << "error[vg paths]: no rGFA cover found in graph. Please compute one with -f before annotating a VCF" << endl; exit(1); } - vcflib::VariantCallFile variant_file; - variant_file.open(rgfa_vcf_filename); - if (!variant_file.is_open()) { - cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; - exit(1); + if (!rgfa_vcf_filename.empty()) { + vcflib::VariantCallFile variant_file; + variant_file.open(rgfa_vcf_filename); + if (!variant_file.is_open()) { + cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; + exit(1); + } + rgfa_cover.annotate_vcf(variant_file, cout); + } else if (rgfa_print_intervals) { + rgfa_cover.print_stats(cout); } - rgfa_cover.annotate_vcf(variant_file, cout); } else if (coverage) { // for every node, count the number of unique paths. 
then add the coverage count to each one // (we're doing the whole graph here, which could be inefficient in the case the user is selecting From 5fa4fbc441c6f341df9f4e96d449424362c934a9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 13:20:38 -0400 Subject: [PATCH 45/47] can use previous step from path end on gbz --- src/rgfa.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index a9b409a10aa..d0b47cd4309 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1098,7 +1098,14 @@ void RGFACover::print_stats(ostream& os) { nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); vector> ref_nodes = this->get_reference_nodes(first_node, false); // interval is open ended, so we go back to last node - handle_t last_handle = graph->get_handle_of_step(graph->get_previous_step(interval.second)); + step_handle_t last_step; + if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.second))) { + // can't get previous step of end in gbz, so we treat as special case here + last_step = graph->path_back(graph->get_path_handle_of_step(interval.second)); + } else { + last_step = graph->get_previous_step(interval.second); + } + handle_t last_handle = graph->get_handle_of_step(last_step); int64_t min_ref_pos = numeric_limits::max(); int64_t max_ref_pos = -1; From dd55ec7fc392d9119a475ca816e4213118252f4a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 14:12:01 -0400 Subject: [PATCH 46/47] dont crash with multiple references --- src/rgfa.cpp | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d0b47cd4309..d6a388afbf4 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1099,9 +1099,9 @@ void RGFACover::print_stats(ostream& os) { vector> ref_nodes = this->get_reference_nodes(first_node, false); // interval is open ended, so we go back to last node step_handle_t last_step; - if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.second))) { + if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.first))) { // can't get previous step of end in gbz, so we treat as special case here - last_step = graph->path_back(graph->get_path_handle_of_step(interval.second)); + last_step = graph->path_back(graph->get_path_handle_of_step(interval.first)); } else { last_step = graph->get_previous_step(interval.second); } @@ -1110,23 +1110,32 @@ void RGFACover::print_stats(ostream& os) { int64_t min_ref_pos = numeric_limits::max(); int64_t max_ref_pos = -1; int64_t min_rank = numeric_limits::max(); - string r_chrom; + string r_chrom; + map> chrom_pos; for (const pair& rank_node : ref_nodes) { step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); string name = graph->get_path_name(ref_path); // we assume one reference contig (which is built into the whole structure) assert(r_chrom.empty() || r_chrom == name); - r_chrom = name; int64_t ref_pos = node_to_ref_pos.at(rank_node.second); - // assume snarl is forward on both reference nodes - // todo: this won't be exact for some inversion cases, I don't think -- - // need to test these and either add check / or move to oriented search - min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); - max_ref_pos = max(max_ref_pos, ref_pos); - min_rank = min(min_rank, rank_node.first); 
- } - + int64_t last_len = (int64_t)graph->get_length(graph->get_handle(rank_node.second)); + if (chrom_pos.count(name)) { + tuple& pos = chrom_pos[name]; + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + pos = make_tuple(min(get<0>(pos), ref_pos + last_len), max(get<1>(pos), ref_pos), min(get<2>(pos), rank_node.first)); + } else { + chrom_pos[name] = make_tuple(ref_pos + last_len, ref_pos, rank_node.first); + } + if (rank_node.first < min_rank) { + // todo: is there a better heuristic to use when choosing a reference? + // also: need to fix vcf annotate maybe to just list them all... + min_rank = rank_node.first; + r_chrom = name; + } + } os << sample_locus.first << "\t" << sample_locus.second << "\t" @@ -1141,8 +1150,8 @@ void RGFACover::print_stats(ostream& os) { << std::fixed << std::setprecision(2) << (tot_depth / tot_steps) << "\t" << std::fixed << std::setprecision(2) << (tot_sample_depth / tot_steps) << "\t" << Paths::strip_subrange(r_chrom) << "\t" - << min_ref_pos << "\t" - << max_ref_pos << "\n"; + << get<0>(chrom_pos[r_chrom]) << "\t" + << get<1>(chrom_pos[r_chrom]) << "\n"; } } From 42bfaa1f0252042a440d2e083087bc332b5d1497 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 14:18:46 -0400 Subject: [PATCH 47/47] dont crash with multiple references --- src/rgfa.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d6a388afbf4..4af858a9d02 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1117,7 +1117,6 @@ void RGFACover::print_stats(ostream& os) { path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); string name = graph->get_path_name(ref_path); // we assume one reference contig (which is built into the whole structure) - assert(r_chrom.empty() || r_chrom == name); int64_t ref_pos = node_to_ref_pos.at(rank_node.second); int64_t last_len = (int64_t)graph->get_length(graph->get_handle(rank_node.second)); if (chrom_pos.count(name)) {
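
The last two patches replace the single r_chrom/min_ref_pos/max_ref_pos variables with a per-contig map so that snarls anchored to more than one reference no longer trip the old single-contig assert. The following is a minimal standalone sketch of that aggregation idea, written independently of vg's handlegraph API; the names RefObs and summarize_ref_span are illustrative only and do not exist in the codebase. Per contig it keeps the smallest start (offset just past the left boundary node), the largest end offset, and the smallest rank, then reports the contig with the lowest rank, mirroring the chrom_pos map above.

// Standalone sketch (assumed names, not vg's API) of the per-reference
// aggregation used for the rGFA stats table when multiple references exist.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <vector>

struct RefObs {
    std::string chrom;   // reference path (contig) the boundary node lies on
    int64_t rank;        // rGFA rank of the interval relative to this reference
    int64_t ref_pos;     // offset of the boundary node on the reference
    int64_t node_len;    // length of the boundary node
};

// returns (chrom, start, end, rank) for the lowest-rank reference contig seen
static std::tuple<std::string, int64_t, int64_t, int64_t>
summarize_ref_span(const std::vector<RefObs>& observations) {
    assert(!observations.empty());
    // per-contig (min start, max end, min rank), as in the chrom_pos map
    std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> chrom_pos;
    for (const RefObs& obs : observations) {
        // the covered span begins just after the left boundary node
        int64_t start = obs.ref_pos + obs.node_len;
        auto it = chrom_pos.find(obs.chrom);
        if (it == chrom_pos.end()) {
            chrom_pos[obs.chrom] = std::make_tuple(start, obs.ref_pos, obs.rank);
        } else {
            auto& pos = it->second;
            pos = std::make_tuple(std::min(std::get<0>(pos), start),
                                  std::max(std::get<1>(pos), obs.ref_pos),
                                  std::min(std::get<2>(pos), obs.rank));
        }
    }
    // pick the contig with the lowest rank as the reported reference
    auto best = chrom_pos.begin();
    for (auto it = chrom_pos.begin(); it != chrom_pos.end(); ++it) {
        if (std::get<2>(it->second) < std::get<2>(best->second)) {
            best = it;
        }
    }
    return std::make_tuple(best->first,
                           std::get<0>(best->second),
                           std::get<1>(best->second),
                           std::get<2>(best->second));
}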