From 5c904556b7e07060a42a62a6631635e944426c71 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 8 Sep 2023 15:19:27 -0400 Subject: [PATCH 01/47] start wiring in new rgfa interface --- src/algorithms/gfa_to_handle.cpp | 26 +- src/gfa.cpp | 44 ++- src/rgfa.cpp | 654 +++++++++++++++++++++++++++++++ src/rgfa.hpp | 115 ++++++ 4 files changed, 821 insertions(+), 18 deletions(-) create mode 100644 src/rgfa.cpp create mode 100644 src/rgfa.hpp diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index 65f19320676..ea0ae94dfc5 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -1,6 +1,6 @@ #include "gfa_to_handle.hpp" #include "../path.hpp" - +#include "../rgfa.hpp" #include namespace vg { @@ -300,16 +300,24 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* // Start later than 0 subrange = std::pair(offset, PathMetadata::NO_END_POSITION); } + + string rgfa_path_name; + if (path_rank > 0) { + // Special logic for off-reference paths, which get loaded into special rGFA cover paths + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length); + } else { + // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. + rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, + PathMetadata::NO_SAMPLE_NAME, + path_name, + PathMetadata::NO_HAPLOTYPE, + PathMetadata::NO_PHASE_BLOCK, + subrange); + } + path_handle_t path = graph->create_path_handle(rgfa_path_name); - // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. - path_handle_t path = graph->create_path(PathSense::GENERIC, - PathMetadata::NO_SAMPLE_NAME, - path_name, - PathMetadata::NO_HAPLOTYPE, - PathMetadata::NO_PHASE_BLOCK, - subrange); // Then cache it - found = rgfa_cache->emplace_hint(found, path_name, std::make_pair(path, offset)); + found = rgfa_cache->emplace_hint(found, rgfa_path_name, std::make_pair(path, offset)); } // Add the step to the path diff --git a/src/gfa.cpp b/src/gfa.cpp index 7b9453333c4..3f3344d16cd 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -1,6 +1,7 @@ #include "gfa.hpp" #include "utility.hpp" #include "path.hpp" +#include "rgfa.hpp" #include #include @@ -18,7 +19,16 @@ static void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths, bool rgfa_pline, bool use_w_lines) { - + + RGFACover rgfa_cover; + if (!rgfa_paths.empty()) { + // index the rgfa cover in the graph (combination of rank-0 rgfa_paths and rank-1 paths with rgfa sample name) + unordered_set rgfa_path_handles; + for (const string& rgfa_path_name : rgfa_paths) { + rgfa_path_handles.insert(graph->get_path_handle(rgfa_path_name)); + } + rgfa_cover.load(graph, rgfa_path_handles); + } // TODO: Support sorting nodes, paths, and/or edges for canonical output // TODO: Use a NamedNodeBackTranslation (or forward translation?) to properly round-trip GFA that has had to be chopped. @@ -40,12 +50,18 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } out << "\n"; - //Compute the rGFA tags of given paths (todo: support non-zero ranks) + //Compute the rGFA tags of given paths. These paths are the rank-0 reference paths (passed in rgfa_paths set) + //along with paths that have the special rgfa sample name (rank>0) paths. 
unordered_map> node_offsets; - for (const string& path_name : rgfa_paths) { - path_handle_t path_handle = graph->get_path_handle(path_name); - size_t offset = 0; - graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (rgfa_paths.count(path_name) || RGFACover::is_rgfa_path_name(path_name)) { + size_t offset = 0; + subrange_t path_subrange = graph->get_subrange(path_handle); + if (path_subrange != PathMetadata::NO_SUBRANGE) { + offset = path_subrange.first; + } + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { handle_t handle = graph->get_handle_of_step(step_handle); nid_t node_id = graph->get_id(handle); if (graph->get_is_reverse(handle)) { @@ -62,7 +78,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } offset += graph->get_length(handle); }); - } + } + }); //Go through each node in the graph graph->for_each_handle([&](const handle_t& h) { @@ -73,9 +90,18 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& auto it = node_offsets.find(node_id); if (it != node_offsets.end()) { // add rGFA tags - out << "\t" << "SN:Z:" << graph->get_path_name(it->second.first) + string sample_name = graph->get_sample_name(it->second.first); + string locus_name = graph->get_locus_name(it->second.first); + int64_t haplotype = graph->get_haplotype(it->second.first); + if (RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { + std::tie(sample_name, locus_name) = RGFACover::parse_rgfa_locus_name(locus_name); + } + string rgfa_sn = PathMetadata::create_path_name(PathSense::REFERENCE, sample_name, locus_name, + haplotype, PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + out << "\t" << "SN:Z:" << rgfa_sn << "\t" << "SO:i:" << it->second.second - << "\t" << "SR:i:0"; // todo: support non-zero ranks? + << "\t" << "SR:i:" << rgfa_cover.get_rank(node_id); } out << "\n"; // Writing `std::endl` would flush the buffer. return true; diff --git a/src/rgfa.cpp b/src/rgfa.cpp new file mode 100644 index 00000000000..1215dd486f6 --- /dev/null +++ b/src/rgfa.cpp @@ -0,0 +1,654 @@ +#include "rgfa.hpp" +#include +#include + +//#define debug + +namespace vg { + +using namespace std; + +const string RGFACover::rgfa_sample_name = "_rGFA_"; + +string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, int64_t length) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + // we move the sample into the locus + // todo: is there something nicer? + string rgfa_locus; + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + if (path_sample != PathMetadata::NO_SAMPLE_NAME) { + rgfa_locus = "SN:Z:" + path_sample; + } + // the contig name will be behind SC... 
+ rgfa_locus += ":SC:Z:" + path_locus; + + // we apply the subrange offset + path_subrange.first += start; + path_subrange.second = path_subrange.first + length; + + + // and return the final path, with sample/locus/rgfa-rank embedded in locus + // (as it's a reference path, we alsos strip out the phase block) + return PathMetadata::create_path_name(PathSense::REFERENCE, RGFACover::rgfa_sample_name, + rgfa_locus, path_haplotype, + PathMetadata::NO_PHASE_BLOCK, path_subrange); + +} + +bool RGFACover::is_rgfa_path_name(const string& path_name) { + return path_name == RGFACover::rgfa_sample_name; +} + +pair RGFACover::parse_rgfa_locus_name(const string& locus_name) { + pair sample_locus; + vector toks = split_delims(locus_name, ":"); + assert(toks.size() % 3 == 0); + for (int64_t i = 0; i < toks.size(); i+=3) { + if (toks[i] == "SN") { + assert(toks[i+1] == "Z"); + sample_locus.first = toks[i+2]; + } else if (toks[i] == "SC") { + assert(toks[i+1] == "Z"); + sample_locus.second = toks[i+2]; + } else { + assert(false); + } + } + return sample_locus; +} + + +void RGFACover::clear(MutablePathMutableHandleGraph* graph) { + vector rgfa_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + rgfa_paths.push_back(path_handle); + }); + for (path_handle_t path_handle : rgfa_paths) { + graph->destroy_path(path_handle); + } +} + +void RGFACover::compute(const PathHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length) { + + // start from scratch + this->rgfa_intervals.clear(); + this->node_to_interval.clear(); + this->graph = graph; + + // start with the reference paths + for (const path_handle_t& ref_path_handle : reference_paths) { + this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_back(ref_path_handle))); + graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (node_to_interval.count(node_id)) { + cerr << "[rgfa error]: node " << node_id << " covered by two reference paths," + << " including " << graph->get_path_name(ref_path_handle) + << ". 
rGFA support currently requires disjoint acyclic reference paths" << endl;
+                    exit(1);
+                }
+                node_to_interval[node_id] = rgfa_intervals.size() - 1;
+            });
+    }
+
+    // we use the path traversal finder for everything
+    // (even gbwt haplotypes, as we're using the path handle interface)
+    PathTraversalFinder path_trav_finder(*graph, *snarl_manager);
+
+    // we collect the rgfa cover in parallel as a list of path fragments
+    size_t thread_count = get_thread_count();
+    vector<vector<pair<step_handle_t, step_handle_t>>> rgfa_intervals_vector(thread_count);
+    vector<unordered_map<nid_t, int64_t>> node_to_interval_vector(thread_count);
+
+    // we process top-level snarls in parallel
+    snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) {
+        // per-thread output
+        vector<pair<step_handle_t, step_handle_t>>& thread_rgfa_intervals = rgfa_intervals_vector[omp_get_thread_num()];
+        unordered_map<nid_t, int64_t>& thread_node_to_interval = node_to_interval_vector[omp_get_thread_num()];
+
+        vector<const Snarl*> queue = {snarl};
+
+        while(!queue.empty()) {
+            const Snarl* cur_snarl = queue.back();
+            queue.pop_back();
+
+            // get the snarl cover
+            compute_snarl(*cur_snarl, path_trav_finder, minimum_length,
+                          thread_rgfa_intervals,
+                          thread_node_to_interval);
+
+
+            // recurse on the children
+            const vector<const Snarl*>& children = snarl_manager->children_of(cur_snarl);
+            for (const Snarl* child_snarl : children) {
+                queue.push_back(child_snarl);
+            }
+        }
+    });
+
+    // now we need to fold up the thread covers
+    for (int64_t t = 0; t < thread_count; ++t) {
+        int64_t offset = this->rgfa_intervals.size();
+
+        // todo: figure out one-shot stl invocation to move
+        for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) {
+            this->rgfa_intervals.push_back(rgfa_intervals_vector[t][j]);
+        }
+        rgfa_intervals_vector[t].clear();
+
+        for (const auto& node_interval : node_to_interval_vector[t]) {
+            this->node_to_interval[node_interval.first] = node_interval.second + offset;
+        }
+        node_to_interval_vector[t].clear();
+    }
+
+    // todo: we could optionally go through uncovered nodes and try to greedily search
+    // for rgfa intervals at this point, since it seems for some graphs there are
+    // regions that don't get found via the traversal finder
+
+}
+
+void RGFACover::load(const PathHandleGraph* graph,
+                     const unordered_set<path_handle_t>& reference_paths) {
+    // start from scratch
+    this->rgfa_intervals.clear();
+    this->node_to_interval.clear();
+    this->graph = graph;
+
+    // start with the reference paths
+    for (const path_handle_t& ref_path_handle : reference_paths) {
+        this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle),
+                                                 graph->path_back(ref_path_handle)));
+        graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) {
+                node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1;
+            });
+    }
+
+    // then the rgfa cover paths
+    // since we want to keep our structures in terms of original paths, we have to map back
+    // to them here (if we don't have original paths, then we can't find the overlaps and
+    // therefore nesting relationships between them)
+ + // we start by making a little index in two scans, as I'm worried about quadratic path scan below otherwise + // (this does not make guarantees about degenerate fragmentation related cases tho) + unordered_map, vector> sample_locus_to_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + sample_locus_to_paths[parse_rgfa_locus_name(graph->get_path_name(path_handle))] = {}; + }); + graph->for_each_path_handle([&](path_handle_t path_handle) { + pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); + if (sample_locus_to_paths.count(sample_locus)) { + sample_locus_to_paths[sample_locus].push_back(path_handle); + } + }); + + // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval + // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph + // so should probably have a better error message than the asserts below (ie if exact interval match not found) + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + // pase the rgfa locus to get the original sample and locus + pair source_sample_locus = parse_rgfa_locus_name(graph->get_path_name(path_handle)); + // find the sample in our index + const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); + // find the containing path + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + int64_t rgfa_haplotype = graph->get_haplotype(path_handle); + const path_handle_t* source_path = nullptr; + subrange_t source_subrange; + for (const path_handle_t& source_path_candidate : source_paths) { + if (graph->get_haplotype(source_path_candidate) == rgfa_haplotype) { + source_subrange = graph->get_subrange(source_path_candidate); + if (source_subrange == PathMetadata::NO_SUBRANGE) { + source_subrange.first = 0; + source_subrange.second = 0; + graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { + source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); + }); + } + if (rgfa_subrange.first >= source_subrange.first && rgfa_subrange.second <= source_subrange.second) { + source_path = &source_path_candidate; + break; + } + } + } + assert(source_path != nullptr); + // now find the exact interval in the containing path and update our data structure + bool found_start = false; + step_handle_t source_start; + int64_t cur_offset = 0; + graph->for_each_step_in_path(*source_path, [&](step_handle_t cur_step) { + if (cur_offset + source_subrange.first == rgfa_subrange.first) { + source_start = cur_step; + found_start = true; + } + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + return !found_start; + }); + assert(found_start); + + bool found_end; + step_handle_t source_end; + cur_offset = 0; + for (step_handle_t cur_step = source_start; graph->has_next_step(cur_step); cur_step = graph->get_next_step(cur_step)) { + if (cur_offset == rgfa_subrange.second) { + found_end = true; + source_end = cur_step; + break; + } + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + } + assert(found_end); + + // we can finally add our interval + this->rgfa_intervals.push_back(make_pair(source_start, source_end)); + for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { + 
node_to_interval[graph->get_id(graph->get_handle_of_step(cur_step))] = rgfa_intervals.size() - 1;
+        }
+    });
+}
+
+void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) {
+    assert(this->graph == static_cast<const PathHandleGraph*>(mutable_graph));
+
+    // compute the offsets in parallel, assuming we don't have a path position index of all paths
+    // todo: we could be smarter about not traversing the same path multiple times!
+    vector<int64_t> rgfa_offsets(this->rgfa_intervals.size());
+    vector<int64_t> rgfa_lengths(this->rgfa_intervals.size());
+#pragma omp parallel for
+    for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) {
+        path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first);
+        // below check is because we don't write the reference path, which is in the cover but
+        // not using the naming convention
+        if (is_rgfa_path_name(graph->get_path_name(source_path_handle))) {
+            rgfa_offsets[i] = 0;
+            mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) {
+                    if (step_handle == rgfa_intervals[i].first) {
+                        return false;
+                    }
+                    rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle));
+                    return true;
+                });
+            rgfa_lengths[i] = 0;
+            for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second;
+                 step_handle = mutable_graph->get_next_step(step_handle)) {
+                rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle));
+            }
+        }
+    }
+
+    // write the rgfa paths
+    for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) {
+        path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first);
+        string source_path_name = graph->get_path_name(source_path_handle);
+        if (is_rgfa_path_name(source_path_name)) {
+            string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]);
+            path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name);
+            for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second;
+                 step_handle = mutable_graph->get_next_step(step_handle)) {
+                mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle));
+            }
+        }
+    }
+
+    this->forwardize_rgfa_paths(mutable_graph);
+}
+
+int64_t RGFACover::get_rank(nid_t node_id) const {
+    if (!node_to_interval.count(node_id)) {
+        return -1;
+    }
+
+    const pair<step_handle_t, step_handle_t>& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id));
+
+    // since our decomposition is based on snarl traversals, we know that fragments must
+    // overlap their parents on snarl end points (at the very least)
+    // therefore we can find parents by scanning along the rgfa paths.
+    step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first);
+    int64_t left_rank = 0;
+    if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) {
+        left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent)));
+    }
+
+    step_handle_t right_parent = graph->get_next_step(rgfa_interval.second);
+    int64_t right_rank = 0;
+    if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) {
+        right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent)));
+    }
+
+    return min(left_rank, right_rank);
+}
+
+step_handle_t RGFACover::get_step(nid_t node_id) const {
+    assert(node_to_interval.count(node_id));
+
+    const pair<step_handle_t, step_handle_t>& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id));
+    path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first);
+    for (step_handle_t step = rgfa_interval.first; step != rgfa_interval.second; step = graph->get_next_step(step)) {
+        if (graph->get_id(graph->get_handle_of_step(step)) == node_id) {
+            return step;
+        }
+    }
+    assert(false);
+    return step_handle_t();
+}
+
+pair<const pair<step_handle_t, step_handle_t>*,
+     const pair<step_handle_t, step_handle_t>*>
+RGFACover::get_parent_intervals(const pair<step_handle_t, step_handle_t>& interval) const {
+
+    pair<const pair<step_handle_t, step_handle_t>*,
+         const pair<step_handle_t, step_handle_t>*> parents = make_pair(nullptr, nullptr);
+
+    // since our decomposition is based on snarl traversals, we know that fragments must
+    // overlap their parents on snarl end points (at the very least)
+    // therefore we can find parents by scanning along the rgfa paths.
+    step_handle_t left_parent = graph->get_previous_step(interval.first);
+    if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(interval.first))) {
+        int64_t interval_idx = this->node_to_interval.at(graph->get_id(graph->get_handle_of_step(left_parent)));
+        parents.first = &this->rgfa_intervals.at(interval_idx);
+    }
+
+    step_handle_t right_parent = graph->get_next_step(interval.second);
+    if (right_parent != graph->path_end(graph->get_path_handle_of_step(interval.second))) {
+        int64_t interval_idx = node_to_interval.at(graph->get_id(graph->get_handle_of_step(right_parent)));
+        parents.second = &this->rgfa_intervals.at(interval_idx);
+    }
+    return parents;
+}
+
+const vector<pair<step_handle_t, step_handle_t>>& RGFACover::get_intervals() const {
+    return this->rgfa_intervals;
+}
+
+const pair<step_handle_t, step_handle_t>* RGFACover::get_interval(nid_t node_id) const {
+    if (this->node_to_interval.count(node_id)) {
+        return &this->rgfa_intervals.at(node_to_interval.at(node_id));
+    }
+    return nullptr;
+}
+
+
+void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length,
+                              vector<pair<step_handle_t, step_handle_t>>& thread_rgfa_intervals,
+                              unordered_map<nid_t, int64_t>& thread_node_to_interval) {
+
+    // start by finding the path traversals through the snarl
+    vector<vector<step_handle_t>> travs;
+    {
+        pair<vector<SnarlTraversal>, vector<pair<step_handle_t, step_handle_t> > > path_travs = path_trav_finder.find_path_traversals(snarl);
+        travs.reserve(path_travs.first.size());
+
+        // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around
+        for (int64_t i = 0; i < path_travs.first.size(); ++i) {
+            string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first));
+            if (is_rgfa_path_name(trav_path_name)) {
+                // we ignore existing (off-reference) rGFA paths
+                // todo: should there be better error handling?
+ cerr << "Warning : skipping existing rgfa traversal " << trav_path_name << endl; + continue; + } + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed = true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = false; + function visit_next_step = [&](step_handle_t step_handle) { + return reversed ? graph->get_previous_step(step_handle) : graph->get_next_step(step_handle); + }; + for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) { + trav.push_back(step_handle); + if (step_handle == path_travs.second[i].second) { + done = true; + } + } + if (reversed) { + std::reverse(trav.begin(), trav.end()); + } + travs.push_back(trav); + } + } + + + // build an initial ranked list of candidate traversal fragments + vector>>> ranked_trav_fragments; + for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { + // only a reference traversal (or deletion that we don't need to consider) + // will have its first two nodes covered + if (this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][0]))) && + this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][1])))) { + continue; + } + + const vector& trav = travs.at(trav_idx); + vector> uncovered_intervals = get_uncovered_intervals(trav, thread_node_to_interval); + + for (const auto& uncovered_interval : uncovered_intervals) { + unordered_set cycle_check; + bool cyclic = false; + int64_t interval_length = 0; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) { + handle_t handle = graph->get_handle_of_step(trav[i]); + interval_length += graph->get_length(handle); + nid_t node_id = graph->get_id(handle); + if (cycle_check.count(node_id)) { + cyclic = true; + } else { + cycle_check.insert(node_id); + } + } + if (!cyclic && interval_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, uncovered_interval); + ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(trav_idx, uncovered_interval))); + } + } + } + + // todo: typedef! 
+ auto heap_comp = [] (const pair>>& s1, + const pair>>& s2) { + return s1.first < s2.first; + }; + + // put the fragments into a max heap + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + + // now greedily pull out traversal intervals from the ranked list until none are left + while (!ranked_trav_fragments.empty()) { + + // get the best scoring (max) fragment from heap + auto best_stats_fragment = ranked_trav_fragments.front(); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + ranked_trav_fragments.pop_back(); + + const vector& trav = travs.at(best_stats_fragment.second.first); + const pair& uncovered_interval = best_stats_fragment.second.second; + + // our traversal may have been partially covered by a different iteration, if so, we need to break it up + // and continue + vector> chopped_intervals; + int64_t cur_start = -1; + bool chopped = false; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + if (!covered && cur_start == -1) { + cur_start = i; + } else if (covered) { + chopped = true; + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, i)); + cur_start = -1; + } + } + } + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second)); + } + if (chopped) { + for (const pair& chopped_interval : chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, chopped_interval); + ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(best_stats_fragment.second.first, chopped_interval))); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + } + } + continue; + } + + // add the interval to the local (thread safe) structures + step_handle_t step = trav[uncovered_interval.first]; + int64_t interval_length = uncovered_interval.second - uncovered_interval.first; + for (int64_t i = 0; i < interval_length; ++i) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); + step = graph->get_next_step(step); + } + thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second-1])); + } +} + +vector> RGFACover::get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval) { + + vector> intervals; + int64_t start = -1; + unordered_set dupe_check; + for (size_t i = 0; i < trav.size(); ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + // we break at dupes even if uncovered -- never want same id twice in an interval + bool dupe = !covered && dupe_check.count(node_id); + dupe_check.insert(node_id); + if (covered || dupe) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = dupe ? 
i : -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; +} + +int64_t RGFACover::get_coverage(const vector& trav, const pair& uncovered_interval) { + path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); + int64_t coverage = 0; + + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + coverage += length * all_steps.size(); + } + + return coverage; +} + +// copied pretty much verbatem from +// https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 +void RGFACover::forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); + + unordered_map id_map; + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_rgfa_path_name(path_name)) { + size_t fw_count = 0; + size_t total_steps = 0; + mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = mutable_graph->get_handle_of_step(step_handle); + if (mutable_graph->get_is_reverse(handle)) { + handle_t flipped_handle = mutable_graph->create_handle(mutable_graph->get_sequence(handle)); + id_map[mutable_graph->get_id(flipped_handle)] = mutable_graph->get_id(handle); + mutable_graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (mutable_graph->get_id(prev_handle) != mutable_graph->get_id(handle)) { + mutable_graph->create_edge(prev_handle, flipped_handle); + } + }); + mutable_graph->follow_edges(handle, false, [&](handle_t next_handle) { + if (mutable_graph->get_id(handle) != mutable_graph->get_id(next_handle)) { + mutable_graph->create_edge(flipped_handle, next_handle); + } + }); + // self-loop cases we punted on above: + if (mutable_graph->has_edge(handle, handle)) { + mutable_graph->create_edge(flipped_handle, flipped_handle); + } + if (mutable_graph->has_edge(handle, mutable_graph->flip(handle))) { + mutable_graph->create_edge(flipped_handle, mutable_graph->flip(flipped_handle)); + } + if (mutable_graph->has_edge(mutable_graph->flip(handle), handle)) { + mutable_graph->create_edge(mutable_graph->flip(flipped_handle), flipped_handle); + } + vector steps = mutable_graph->steps_of_handle(handle); + size_t ref_count = 0; + for (step_handle_t step : steps) { + if (mutable_graph->get_path_handle_of_step(step) == path_handle) { + ++ref_count; + } + step_handle_t next_step = mutable_graph->get_next_step(step); + handle_t new_handle = mutable_graph->get_is_reverse(mutable_graph->get_handle_of_step(step)) ? flipped_handle : + mutable_graph->flip(flipped_handle); + mutable_graph->rewrite_segment(step, next_step, {new_handle}); + } + if (ref_count > 1) { + cerr << "[rGFA] error: Cycle detected in rGFA path " << path_name << " at node " << mutable_graph->get_id(handle) << endl; + exit(1); + } + ++fw_count; + assert(mutable_graph->steps_of_handle(handle).empty()); + dynamic_cast(mutable_graph)->destroy_handle(handle); + } + ++total_steps; + }); + } + }); + + // rename all the ids back to what they were (so nodes keep their ids, just get flipped around) + mutable_graph->reassign_node_ids([&id_map](nid_t new_id) { + return id_map.count(new_id) ? 
id_map[new_id] : new_id;
+        });
+
+    // do a check just to be sure
+    mutable_graph->for_each_path_handle([&](path_handle_t path_handle) {
+        string path_name = mutable_graph->get_path_name(path_handle);
+        if (is_rgfa_path_name(path_name)) {
+            mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
+                handle_t handle = mutable_graph->get_handle_of_step(step_handle);
+                if (mutable_graph->get_is_reverse(handle)) {
+                    cerr << "[rGFA] error: Failed to forwardize node " << mutable_graph->get_id(handle) << " in path " << path_name << endl;
+                    exit(1);
+                }
+            });
+        }
+    });
+}
+
+}
+
diff --git a/src/rgfa.hpp b/src/rgfa.hpp
new file mode 100644
index 00000000000..cd9841a9d68
--- /dev/null
+++ b/src/rgfa.hpp
@@ -0,0 +1,115 @@
+#ifndef VG_RGFA_HPP_INCLUDED
+#define VG_RGFA_HPP_INCLUDED
+
+/**
+ * \file rgfa.hpp
+ *
+ * Interface for computing and querying rGFA path covers.
+ *
+ * An rGFA cover is a set of path fragments (stored as separate paths) in the graph.
+ * They are always relative to an existing reference sample (e.g. GRCh38).
+ * rGFA path fragments have a special name format, where the sample is the reserved rGFA sample name.
+ *
+ * The data structures used in this class are always relative to the original paths
+ * in the graph. The REFERENCE-SENSE fragments that are used to serialize the
+ * cover (and for surjection) can be created and loaded, but they are not used
+ * beyond that.
+ */
+
+#include "handle.hpp"
+#include "snarls.hpp"
+#include "traversal_finder.hpp"
+
+namespace vg {
+
+using namespace std;
+
+class RGFACover {
+public:
+    // I'm not sure how much we want to allow this to change. For now just define it here
+    static const string rgfa_sample_name;
+
+    // make an rGFA path name from a subrange of a normal path
+    static string make_rgfa_path_name(const string& path_name, int64_t start, int64_t length);
+
+    // test if path name is rGFA
+    static bool is_rgfa_path_name(const string& path_name);
+
+    // break up the rGFA locus name back into original sample and locus
+    static pair<string, string> parse_rgfa_locus_name(const string& locus_name);
+
+public:
+    // clear out the existing rGFA cover from the graph. recommended to run this
+    // before compute()
+    void clear(MutablePathMutableHandleGraph* graph);
+
+    // compute the rgfa cover from the graph, starting with a given set of reference paths
+    void compute(const PathHandleGraph* graph,
+                 SnarlManager* snarl_manager,
+                 const unordered_set<path_handle_t>& reference_paths,
+                 int64_t minimum_length);
+
+    // load the rgfa cover from the graph, assuming it's been computed already and
+    // saved in special rgfa paths
+    // this function assumes that the rGFA cover paths are exactly consistent
+    // with the original paths.
+ void load(const PathHandleGraph* graph, + const unordered_set& reference_paths); + + // apply the rgfa cover to a graph (must have been computed first), adding it + // as a bunch of (reference sense) paths with a special sample name + void apply(MutablePathMutableHandleGraph* mutable_graph); + + // get the rgfa rank (level) of a given node (0 if on a reference path) + int64_t get_rank(nid_t node_id) const; + + // get the rgfa step of a given node + step_handle_t get_step(nid_t node_id) const; + + // get the parent intervals + pair*, + const pair*> get_parent_intervals(const pair& interval) const; + + // get the intervals + const vector>& get_intervals() const; + + // get an interval from a node + // return nullptr if node not in an interval + const pair* get_interval(nid_t node_id) const; + + +protected: + + // compute the cover for the given snarl, by greedily finding the convered rgfa paths through it + // the cover is added to the two "thread_" structures. + void compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval); + + // get intervals in travs that are not covered according to this->node_to_interval or + // the thread_node_to_interval parameter + vector> get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval); + + // get the total coverage of a traversal (sum of step lengths) + int64_t get_coverage(const vector& trav, const pair& uncovered_interval); + + // make sure all nodes in all rGFA paths are in forward orientation + // this is always possible because they are, by definition, disjoint + // this should only be run from inside apply() + void forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph); + + +protected: + + const PathHandleGraph* graph; + + vector> rgfa_intervals; + + unordered_map node_to_interval; +}; + +/// Export the given VG graph to the given GFA file. +} + +#endif From a8dcd2bd38211979ff5fa0684b1e07c4759a6079 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sun, 10 Sep 2023 17:54:38 -0400 Subject: [PATCH 02/47] start wiring in cli interface --- src/gfa.cpp | 6 +- src/rgfa.cpp | 121 +++++++++++++++++++++------------- src/rgfa.hpp | 3 + src/subcommand/paths_main.cpp | 106 ++++++++++++++++++++++------- 4 files changed, 162 insertions(+), 74 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 3f3344d16cd..fcc2e13c220 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -96,8 +96,10 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& if (RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { std::tie(sample_name, locus_name) = RGFACover::parse_rgfa_locus_name(locus_name); } - string rgfa_sn = PathMetadata::create_path_name(PathSense::REFERENCE, sample_name, locus_name, - haplotype, PathMetadata::NO_PHASE_BLOCK, + string rgfa_sn = PathMetadata::create_path_name(sample_name.empty() ? PathSense::GENERIC : PathSense::REFERENCE, + sample_name, locus_name, + sample_name.empty() ? 
PathMetadata::NO_HAPLOTYPE : haplotype, + PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); out << "\t" << "SN:Z:" << rgfa_sn << "\t" << "SO:i:" << it->second.second diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 1215dd486f6..5ef6649d165 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -2,7 +2,7 @@ #include #include -//#define debug +#define debug namespace vg { @@ -20,16 +20,19 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in subrange_t path_subrange; PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); + if (path_subrange == PathMetadata::NO_SUBRANGE) { + path_subrange = make_pair(0, 0); + } // we move the sample into the locus // todo: is there something nicer? string rgfa_locus; assert(path_locus != PathMetadata::NO_LOCUS_NAME); if (path_sample != PathMetadata::NO_SAMPLE_NAME) { - rgfa_locus = "SN:Z:" + path_sample; + rgfa_locus = path_sample + "::"; } // the contig name will be behind SC... - rgfa_locus += ":SC:Z:" + path_locus; + rgfa_locus += path_locus; // we apply the subrange offset path_subrange.first += start; @@ -45,27 +48,22 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in } bool RGFACover::is_rgfa_path_name(const string& path_name) { - return path_name == RGFACover::rgfa_sample_name; + return PathMetadata::parse_sample_name(path_name) == RGFACover::rgfa_sample_name; } pair RGFACover::parse_rgfa_locus_name(const string& locus_name) { - pair sample_locus; - vector toks = split_delims(locus_name, ":"); - assert(toks.size() % 3 == 0); - for (int64_t i = 0; i < toks.size(); i+=3) { - if (toks[i] == "SN") { - assert(toks[i+1] == "Z"); - sample_locus.first = toks[i+2]; - } else if (toks[i] == "SC") { - assert(toks[i+1] == "Z"); - sample_locus.second = toks[i+2]; - } else { - assert(false); - } + pair sample_locus = make_pair(PathMetadata::NO_SAMPLE_NAME, PathMetadata::NO_LOCUS_NAME); + + auto pos = locus_name.find("::"); + if (pos != string::npos) { + sample_locus.first = locus_name.substr(0, pos); + sample_locus.second = locus_name.substr(pos + 2); + } else { + sample_locus.second = locus_name; } - return sample_locus; -} + return sample_locus; +} void RGFACover::clear(MutablePathMutableHandleGraph* graph) { vector rgfa_paths; @@ -102,7 +100,12 @@ void RGFACover::compute(const PathHandleGraph* graph, node_to_interval[node_id] = rgfa_intervals.size() - 1; }); } + this->num_ref_intervals = this->rgfa_intervals.size(); +#ifdef debug + cerr << "[rgfa] Selected " << rgfa_intervals.size() << " rank=0 reference paths" << endl; +#endif + // we use the path traversal finder for everything // (even gbwt haplotypes, as we're using the path handle interface) PathTraversalFinder path_trav_finder(*graph, *snarl_manager); @@ -146,6 +149,9 @@ void RGFACover::compute(const PathHandleGraph* graph, for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) { this->rgfa_intervals.push_back(rgfa_intervals_vector[t][j]); } +#ifdef debug + cerr << "Adding " << rgfa_intervals_vector[t].size() << " rgfa intervals from thread " << t << endl; +#endif rgfa_intervals_vector[t].clear(); for (const auto& node_interval : node_to_interval_vector[t]) { @@ -175,6 +181,7 @@ void RGFACover::load(const PathHandleGraph* graph, node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1; }); } + this->num_ref_intervals = this->rgfa_intervals.size(); // then the rgfa cover paths // since we want to keep our structures in terms of 
original paths, we have to map back @@ -185,7 +192,8 @@ void RGFACover::load(const PathHandleGraph* graph, // (this does not make guarantees about degenerate fragmentation related cases tho) unordered_map, vector> sample_locus_to_paths; graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - sample_locus_to_paths[parse_rgfa_locus_name(graph->get_path_name(path_handle))] = {}; + string locus_name = graph->get_locus_name(path_handle); + sample_locus_to_paths[parse_rgfa_locus_name(locus_name)] = {}; }); graph->for_each_path_handle([&](path_handle_t path_handle) { pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); @@ -199,20 +207,30 @@ void RGFACover::load(const PathHandleGraph* graph, // so should probably have a better error message than the asserts below (ie if exact interval match not found) graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { // pase the rgfa locus to get the original sample and locus - pair source_sample_locus = parse_rgfa_locus_name(graph->get_path_name(path_handle)); + pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); // find the sample in our index const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); // find the containing path subrange_t rgfa_subrange = graph->get_subrange(path_handle); assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); int64_t rgfa_haplotype = graph->get_haplotype(path_handle); + // allow match between 0 and NO_HAPLOTYPE + if (rgfa_haplotype == PathMetadata::NO_HAPLOTYPE) { + rgfa_haplotype = 0; + } const path_handle_t* source_path = nullptr; subrange_t source_subrange; for (const path_handle_t& source_path_candidate : source_paths) { - if (graph->get_haplotype(source_path_candidate) == rgfa_haplotype) { + int64_t source_haplotype = graph->get_haplotype(source_path_candidate); + if (source_haplotype == PathMetadata::NO_HAPLOTYPE) { + source_haplotype = 0; + } + if (source_haplotype == rgfa_haplotype) { source_subrange = graph->get_subrange(source_path_candidate); if (source_subrange == PathMetadata::NO_SUBRANGE) { source_subrange.first = 0; + } + if (source_subrange == PathMetadata::NO_SUBRANGE || source_subrange.second == PathMetadata::NO_END_POSITION) { source_subrange.second = 0; graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); @@ -241,7 +259,6 @@ void RGFACover::load(const PathHandleGraph* graph, bool found_end; step_handle_t source_end; - cur_offset = 0; for (step_handle_t cur_step = source_start; graph->has_next_step(cur_step); cur_step = graph->get_next_step(cur_step)) { if (cur_offset == rgfa_subrange.second) { found_end = true; @@ -268,38 +285,38 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { vector rgfa_offsets(this->rgfa_intervals.size()); vector rgfa_lengths(this->rgfa_intervals.size()); #pragma omp parallel for - for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); - // below check is because we don't write the reference path, which is in the cover but - // not using the naming convention - if (is_rgfa_path_name(graph->get_path_name(source_path_handle))) { - rgfa_offsets[i] = 0; - 
mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) { - if (step_handle == rgfa_intervals[i].first) { - return false; - } - rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - return true; - }); - rgfa_lengths[i] = 0; - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; - step_handle = mutable_graph->get_next_step(step_handle)) { - rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); +#ifdef debug + cerr << "computing offset for application of rgfa path " << graph->get_path_name(source_path_handle) << endl; +#endif + rgfa_offsets[i] = 0; + mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) { + if (step_handle == rgfa_intervals[i].first) { + return false; } + rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle)); + return true; + }); + rgfa_lengths[i] = 0; + step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + step_handle = mutable_graph->get_next_step(step_handle)) { + rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); + } } // write the rgfa paths - for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); string source_path_name = graph->get_path_name(source_path_handle); - if (is_rgfa_path_name(source_path_name)) { - string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]); - path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name); - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; - step_handle = mutable_graph->get_next_step(step_handle)) { - mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle)); - } + string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]); + path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name); + step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + step_handle = mutable_graph->get_next_step(step_handle)) { + mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle)); } } @@ -479,6 +496,13 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav const vector& trav = travs.at(best_stats_fragment.second.first); const pair& uncovered_interval = best_stats_fragment.second.second; +#ifdef debug + cerr << "best trav: "; + for (auto xx : trav) cerr << " " << graph->get_id(graph->get_handle_of_step(xx)); + cerr << endl << "uncovered interval [" << uncovered_interval.first << "," << uncovered_interval.second << "]" <> chopped_intervals; @@ -518,6 +542,9 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav // add the interval to the local (thread safe) structures step_handle_t step = trav[uncovered_interval.first]; int64_t interval_length = uncovered_interval.second - uncovered_interval.first; +#ifdef debug + cerr << "adding interval with length " << interval_length << endl; +#endif for (int64_t i = 0; i < interval_length; ++i) { 
thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); step = graph->get_next_step(step); diff --git a/src/rgfa.hpp b/src/rgfa.hpp index cd9841a9d68..1b0ea598d73 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -106,6 +106,9 @@ class RGFACover { vector> rgfa_intervals; + // so rgfa_intervals[0,num_ref_intervals-1] will be all rank-0 reference intervals + int64_t num_ref_intervals; + unordered_map node_to_interval; }; diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 1fc293df752..19d54070c29 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -16,6 +16,9 @@ #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../rgfa.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include @@ -37,6 +40,10 @@ void help_paths(char** argv) { << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl + << " rGFA cover" << endl + << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl + << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -101,6 +108,8 @@ int main_paths(int argc, char** argv) { bool extract_as_fasta = false; bool drop_paths = false; bool retain_paths = false; + int64_t rgfa_min_len = -1; + string snarl_filename; string graph_file; string gbwt_file; string path_prefix; @@ -130,6 +139,8 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, + {"rgfa-cover", required_argument, 0, 'R'}, + {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -145,16 +156,15 @@ int main_paths(int argc, char** argv) { {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, {"coverage", no_argument, 0, 'c'}, - + {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. - {"threads", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGRHp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGHp:ct:", long_options, &option_index); // Detect the end of the options. 
@@ -183,13 +193,22 @@ int main_paths(int argc, char** argv) { case 'd': drop_paths = true; output_formats++; - break; + break; case 'r': retain_paths = true; output_formats++; break; - + + case 'R': + rgfa_min_len = parse(optarg); + output_formats++; + break; + + case 's': + snarl_filename = optarg; + break; + case 'X': extract_as_gam = true; output_formats++; @@ -265,9 +284,16 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'T': - std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; + } case 'q': std::cerr << "warning: [vg paths] option --threads-by is deprecated; please use --paths-by" << std::endl; @@ -320,7 +346,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C or -c) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -371,9 +397,23 @@ int main_paths(int argc, char** argv) { exit(1); } } - - - + + // Load or compute the snarls + unique_ptr snarl_manager; + if (rgfa_min_len >= 0) { + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "Error [vg paths]: Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } + } + set path_names; if (!path_file.empty()) { ifstream path_stream(path_file); @@ -566,7 +606,7 @@ int main_paths(int argc, char** argv) { }; - if (drop_paths || retain_paths) { + if (drop_paths || retain_paths || rgfa_min_len >= 0) { MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; @@ -578,24 +618,40 @@ int main_paths(int argc, char** argv) { exit(1); } - vector to_destroy; - if (drop_paths) { + if (drop_paths || retain_paths) { + vector to_destroy; + if (drop_paths) { + for_each_selected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } else { + for_each_unselected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } + for (string& path_name : to_destroy) { + mutable_graph->destroy_path(graph->get_path_handle(path_name)); + } + } else { + assert(rgfa_min_len >= 0); + RGFACover rgfa_cover; + // clean out existing cover + rgfa_cover.clear(mutable_graph); + // load up the rank-0 reference path selection + unordered_set reference_paths; for_each_selected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); + reference_paths.insert(path_handle); }); - } else { - for_each_unselected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); - }); - } - for (string& path_name : to_destroy) { - 
mutable_graph->destroy_path(graph->get_path_handle(path_name)); + // compute the new cover + rgfa_cover.compute(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); + // and write it out to the graph + rgfa_cover.apply(mutable_graph); } // output the graph - serializable_graph->serialize(cout); + vg::io::save_handle_graph(graph.get(), std::cout); } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one From a2261b4ae956db2b828e77351ba478239fe71eef Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 19 Sep 2023 17:55:46 -0400 Subject: [PATCH 03/47] add rgfa options to save_graph interface --- src/io/save_handle_graph.hpp | 14 ++++++++++---- src/subcommand/paths_main.cpp | 4 +++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp index bee5b1297c5..58946fd5490 100644 --- a/src/io/save_handle_graph.hpp +++ b/src/io/save_handle_graph.hpp @@ -42,11 +42,14 @@ using namespace std; * Save a handle graph. * Todo: should this be somewhere else (ie in vgio with new types registered?) */ -inline void save_handle_graph(HandleGraph* graph, ostream& os) { +inline void save_handle_graph(HandleGraph* graph, ostream& os, + const set& rgfa_paths = {}, + bool rgfa_pline = false, + bool use_w_lines = true) { if (dynamic_cast(graph) != nullptr) { // We loaded a GFA into a handle graph, want to write back to GFA - graph_to_gfa(dynamic_cast(graph), os); + graph_to_gfa(dynamic_cast(graph), os, rgfa_paths, rgfa_pline, use_w_lines); } else if (dynamic_cast(graph) != nullptr) { // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. dynamic_cast(graph)->serialize(os); @@ -58,14 +61,17 @@ inline void save_handle_graph(HandleGraph* graph, ostream& os) { } } -inline void save_handle_graph(HandleGraph* graph, const string& dest_path) { +inline void save_handle_graph(HandleGraph* graph, const string& dest_path, + const set& rgfa_paths = {}, + bool rgfa_pline = false, + bool use_w_lines = true) { if (dynamic_cast(graph) != nullptr) { // We loaded a GFA into a handle graph, want to write back to GFA ofstream os(dest_path); if (!os) { throw runtime_error("error[save_handle_graph]: Unable to write to: " + dest_path); } - graph_to_gfa(dynamic_cast(graph), os); + graph_to_gfa(dynamic_cast(graph), os, rgfa_paths, rgfa_pline, use_w_lines); } else if (dynamic_cast(graph) != nullptr) { // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. 
dynamic_cast(graph)->serialize(dest_path); diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 19d54070c29..db948397cc2 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -618,6 +618,7 @@ int main_paths(int argc, char** argv) { exit(1); } + set reference_path_names; if (drop_paths || retain_paths) { vector to_destroy; if (drop_paths) { @@ -643,6 +644,7 @@ int main_paths(int argc, char** argv) { unordered_set reference_paths; for_each_selected_path([&](const path_handle_t& path_handle) { reference_paths.insert(path_handle); + reference_path_names.insert(graph->get_path_name(path_handle)); }); // compute the new cover rgfa_cover.compute(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); @@ -651,7 +653,7 @@ int main_paths(int argc, char** argv) { } // output the graph - vg::io::save_handle_graph(graph.get(), std::cout); + vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one From 60f83519b857ea5097c8940bdc751a0f82e7f5e9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 19 Sep 2023 22:28:38 -0400 Subject: [PATCH 04/47] open intervals --- src/rgfa.cpp | 20 +++++++++++--------- src/rgfa.hpp | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5ef6649d165..f0989a041c8 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -88,7 +88,7 @@ void RGFACover::compute(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), - graph->path_back(ref_path_handle))); + graph->path_end(ref_path_handle))); graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); if (node_to_interval.count(node_id)) { @@ -176,7 +176,7 @@ void RGFACover::load(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), - graph->path_back(ref_path_handle))); + graph->path_end(ref_path_handle))); graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1; }); @@ -205,7 +205,7 @@ void RGFACover::load(const PathHandleGraph* graph, // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph // so should probably have a better error message than the asserts below (ie if exact interval match not found) - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { // pase the rgfa locus to get the original sample and locus pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); // find the sample in our index @@ -270,6 +270,7 @@ void RGFACover::load(const PathHandleGraph* graph, assert(found_end); // we can finally add our interval + source_end = graph->get_next_step(source_end); this->rgfa_intervals.push_back(make_pair(source_start, source_end)); for (step_handle_t cur_step = 
source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { node_to_interval[graph->get_id(graph->get_handle_of_step(cur_step))] = rgfa_intervals.size() - 1; @@ -299,8 +300,7 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { return true; }); rgfa_lengths[i] = 0; - step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); @@ -313,8 +313,7 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { string source_path_name = graph->get_path_name(source_path_handle); string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]); path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name); - step_handle_t last_step = mutable_graph->get_next_step(rgfa_intervals[i].second); - for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != last_step; + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle)); } @@ -325,10 +324,12 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { int64_t RGFACover::get_rank(nid_t node_id) const { if (!node_to_interval.count(node_id)) { + cerr << "rank is -1 because " << node_id << " is not in the node_to_interval structure" << endl; return -1; } const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); + cerr << "get rank" << endl; // since our decomposition is based on snarl tranversals, we know that fragments must // overlap their parents on snarl end points (at the very least) @@ -339,7 +340,8 @@ int64_t RGFACover::get_rank(nid_t node_id) const { left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent))); } - step_handle_t right_parent = graph->get_next_step(rgfa_interval.second); + // don't need to go next, since already one past + step_handle_t right_parent = rgfa_interval.second; int64_t right_rank = 0; if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent))); @@ -549,7 +551,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); step = graph->get_next_step(step); } - thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second-1])); + thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second])); } } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 1b0ea598d73..4619303d356 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -104,6 +104,7 @@ class RGFACover { const PathHandleGraph* graph; + // intervals are end-exclusive (like bed) vector> rgfa_intervals; // so rgfa_intervals[0,num_ref_intervals-1] will be all rank-0 reference intervals From 6dd3856e2e4a81f55c8e3f4a94a04467d8bf6d86 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 20 Sep 2023 10:31:07 -0400 Subject: [PATCH 05/47] clean a few cases 
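The cover intervals used from here on are end-exclusive, like BED: the second
step handle of each pair is one past the last covered step (path_end() when a
whole path is covered).  A minimal sketch of how such an interval is meant to
be walked, assuming only the standard PathHandleGraph interface; the
interval_length helper is illustrative and not part of the patch:

    // Sum the sequence length covered by an end-exclusive [first, second) step interval.
    static int64_t interval_length(const PathHandleGraph& graph,
                                   const pair<step_handle_t, step_handle_t>& interval) {
        int64_t total = 0;
        for (step_handle_t step = interval.first; step != interval.second;
             step = graph.get_next_step(step)) {
            total += graph.get_length(graph.get_handle_of_step(step));
        }
        return total;
    }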
--- src/gfa.cpp | 7 ++++--- src/rgfa.cpp | 4 +--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index fcc2e13c220..93e457fc40d 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -55,7 +55,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& unordered_map> node_offsets; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); - if (rgfa_paths.count(path_name) || RGFACover::is_rgfa_path_name(path_name)) { + if (rgfa_paths.count(path_name) || (!rgfa_paths.empty() && RGFACover::is_rgfa_path_name(path_name))) { size_t offset = 0; subrange_t path_subrange = graph->get_subrange(path_handle); if (path_subrange != PathMetadata::NO_SUBRANGE) { @@ -93,7 +93,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& string sample_name = graph->get_sample_name(it->second.first); string locus_name = graph->get_locus_name(it->second.first); int64_t haplotype = graph->get_haplotype(it->second.first); - if (RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { + if (!rgfa_paths.empty() && RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { std::tie(sample_name, locus_name) = RGFACover::parse_rgfa_locus_name(locus_name); } string rgfa_sn = PathMetadata::create_path_name(sample_name.empty() ? PathSense::GENERIC : PathSense::REFERENCE, @@ -134,7 +134,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& // Paths as P-lines for (const path_handle_t& h : path_handles) { auto path_name = graph->get_path_name(h); - if (rgfa_pline || !rgfa_paths.count(path_name)) { + if (rgfa_pline || rgfa_paths.empty() || + (!rgfa_paths.count(path_name) && !RGFACover::is_rgfa_path_name(path_name))) { if (graph->get_sense(h) != PathSense::REFERENCE && reference_samples.count(graph->get_sample_name(h))) { // We have a mix of reference and non-reference paths on the same sample which GFA can't handle. 
cerr << "warning [gfa]: path " << path_name << " will be interpreted as reference sense " diff --git a/src/rgfa.cpp b/src/rgfa.cpp index f0989a041c8..bc690555c71 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -2,7 +2,7 @@ #include #include -#define debug +//#define debug namespace vg { @@ -324,12 +324,10 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { int64_t RGFACover::get_rank(nid_t node_id) const { if (!node_to_interval.count(node_id)) { - cerr << "rank is -1 because " << node_id << " is not in the node_to_interval structure" << endl; return -1; } const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); - cerr << "get rank" << endl; // since our decomposition is based on snarl tranversals, we know that fragments must // overlap their parents on snarl end points (at the very least) From 860660bacfea395f6a659a07930add2d5ce7d81f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 20 Sep 2023 12:32:30 -0400 Subject: [PATCH 06/47] rgfa tests --- src/algorithms/gfa_to_handle.cpp | 5 ++-- src/rgfa.cpp | 5 ++-- src/rgfa.hpp | 3 +- test/rgfa/rgfa_ins.gfa | 15 ++++++++++ test/rgfa/rgfa_ins2.gfa | 18 ++++++++++++ test/rgfa/rgfa_ins3.gfa | 18 ++++++++++++ test/rgfa/rgfa_tiny.gfa | 38 +++++++++++++++++++++++++ test/t/11_vg_paths.t | 48 +++++++++++++++++++++++++++++++- 8 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 test/rgfa/rgfa_ins.gfa create mode 100644 test/rgfa/rgfa_ins2.gfa create mode 100644 test/rgfa/rgfa_ins3.gfa create mode 100644 test/rgfa/rgfa_tiny.gfa diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index ea0ae94dfc5..abff52c2add 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -291,7 +291,6 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* } if (found == rgfa_cache->end()) { // Need to make a new path, possibly with subrange start info. - std::pair subrange; if (offset == 0) { // Don't send a subrange @@ -304,7 +303,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* string rgfa_path_name; if (path_rank > 0) { // Special logic for off-reference paths, which get loaded into special rGFA cover paths - rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length); + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false); } else { // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. 
rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, @@ -317,7 +316,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* path_handle_t path = graph->create_path_handle(rgfa_path_name); // Then cache it - found = rgfa_cache->emplace_hint(found, rgfa_path_name, std::make_pair(path, offset)); + found = rgfa_cache->emplace_hint(found, path_name, std::make_pair(path, offset)); } // Add the step to the path diff --git a/src/rgfa.cpp b/src/rgfa.cpp index bc690555c71..37619b626d8 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -10,7 +10,8 @@ using namespace std; const string RGFACover::rgfa_sample_name = "_rGFA_"; -string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, int64_t length) { +string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, int64_t length, + bool specify_subrange_end) { PathSense path_sense; string path_sample; @@ -36,7 +37,7 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in // we apply the subrange offset path_subrange.first += start; - path_subrange.second = path_subrange.first + length; + path_subrange.second = specify_subrange_end ? path_subrange.first + length : PathMetadata::NO_END_POSITION; // and return the final path, with sample/locus/rgfa-rank embedded in locus diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 4619303d356..3404b0aa0b6 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -30,7 +30,8 @@ class RGFACover { static const string rgfa_sample_name; // make an rGFA path name from a subrange of a normal path - static string make_rgfa_path_name(const string& path_name, int64_t start, int64_t length); + static string make_rgfa_path_name(const string& path_name, int64_t start, int64_t length, + bool specify_subrange_end = true); // test if path name is rGFA static bool is_rgfa_path_name(const string& path_name); diff --git a/test/rgfa/rgfa_ins.gfa b/test/rgfa/rgfa_ins.gfa new file mode 100644 index 00000000000..9751b52ba47 --- /dev/null +++ b/test/rgfa/rgfa_ins.gfa @@ -0,0 +1,15 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +P x 1+,5+ * +P y 1+,2+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M diff --git a/test/rgfa/rgfa_ins2.gfa b/test/rgfa/rgfa_ins2.gfa new file mode 100644 index 00000000000..ef6f1f3055c --- /dev/null +++ b/test/rgfa/rgfa_ins2.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 1+,2+,6+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_ins3.gfa b/test/rgfa/rgfa_ins3.gfa new file mode 100644 index 00000000000..861ac58d3b5 --- /dev/null +++ b/test/rgfa/rgfa_ins3.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 5-,4-,6-,2-,1- * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_tiny.gfa b/test/rgfa/rgfa_tiny.gfa new file mode 100644 index 00000000000..498d2e0a6dc --- /dev/null +++ b/test/rgfa/rgfa_tiny.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +S 5 C +S 7 A +S 12 ATAT +S 8 G +S 1 CAAATAAG +S 4 T +S 6 TTG +S 15 CCAACTCTCTG +S 2 A +S 10 A +S 9 AAATTTTCTGGAGTTCTAT +S 11 T +S 13 A +S 14 T +S 3 G +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * +P y 1+,2+,4+,6+,8+,9+,10+,12+,13+,15+ * +L 5 + 6 + 0M +L 7 + 9 + 0M 
+L 12 + 13 + 0M +L 12 + 14 + 0M +L 8 + 9 + 0M +L 1 + 2 + 0M +L 1 + 3 + 0M +L 4 + 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 2 + 4 + 0M +L 2 + 5 + 0M +L 10 + 12 + 0M +L 9 + 10 + 0M +L 9 + 11 + 0M +L 11 + 12 + 0M +L 13 + 15 + 0M +L 14 + 15 + 0M +L 3 + 4 + 0M +L 3 + 5 + 0M diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 3bba6231d56..1409bdfe2c2 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 20 +plan tests 25 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -61,4 +61,50 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len +vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa +printf "P _rGFA_#y[33] 10+ * +P _rGFA_#y[38] 13+ * +P _rGFA_#y[8] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +grep ^P rgfa_tiny.rgfa | grep rGFA | sort > rgfa_tiny_fragments.rgfa +diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa +is $? 0 "Found the expected rgfa SNP cover of tiny graph" + +rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa + +vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins.rgfa +printf "P _rGFA_#z[8] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +grep ^P rgfa_ins.rgfa | grep _rGFA_ | sort > rgfa_ins_fragments.rgfa +diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion" + +rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa +printf "P _rGFA_#y[8] 2+,6+,4+ * +P _rGFA_#z[11] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +grep ^P rgfa_ins2.rgfa | grep _rGFA_ | sort > rgfa_ins2_fragments.rgfa +diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa +printf "P _rGFA_#y[8] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa +diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa +is $? 0 "rgfa Minimum fragment length filters out small fragment" + +rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa + +vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa +printf "P x 1+,5+ * +P _rGFA_#y[3] 4+,6+,2+ * +P y 5-,4+,6+,2+,1- * +P _rGFA_#z[11] 3+ * +P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa +grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa +diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa +is $? 
0 "Found the expected rgfa cover for simple nested insertion that has a reversed path that needs forwardization" + +rm -f rgfa_ins3.rgfa rgfa_ins3_expected_fragments.rgfa rgfa_ins3_fragments.rgfa From 37945cbc91b8c7b7f2904f9564d59e332896d814 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Sep 2023 12:19:51 -0400 Subject: [PATCH 07/47] Add tag test --- test/t/11_vg_paths.t | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 1409bdfe2c2..dd00510ccfa 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 25 +plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -88,6 +88,18 @@ is $? 0 "Found the expected rgfa cover for simple nested insertion that requires rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | grep ^S | sort -nk2 > rgfa_ins2.rgfa +printf "S 1 CAAATAAG SN:Z:x SO:i:0 SR:i:0 +S 2 TTT SN:Z:y SO:i:8 SR:i:1 +S 3 TTT SN:Z:z SO:i:11 SR:i:2 +S 4 TTT SN:Z:y SO:i:21 SR:i:1 +S 5 TTT SN:Z:x SO:i:8 SR:i:0 +S 6 TTTTTTTTTT SN:Z:y SO:i:11 SR:i:1\n" > rgfa_ins2_expected_segs.rgfa +diff rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa +is $? 0 "Found the expected rgfa tags for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa + vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa printf "P _rGFA_#y[8] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa From f3642eeae68f2b1b9600f4571038444a81c3e88d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 21 Sep 2023 17:06:05 -0400 Subject: [PATCH 08/47] add rgfa support to deconstruct --- src/deconstructor.cpp | 33 +++++++++++++++++++-------------- src/rgfa.cpp | 23 +++++++++++++++++++++++ src/rgfa.hpp | 5 +++++ test/t/26_deconstruct.t | 9 ++++++++- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 118d9df45a5..d95ee4d72d6 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1,6 +1,7 @@ #include "deconstructor.hpp" #include "traversal_finder.hpp" #include +#include "rgfa.hpp" //#define debug @@ -792,22 +793,24 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx]; + string fixed_trav_name = RGFACover::revert_rgfa_path_name(ref_trav_name, true); + vcflib::Variant v; v.quality = 60; // in VCF we usually just want the contig - string contig_name = PathMetadata::parse_locus_name(ref_trav_name); + string contig_name = PathMetadata::parse_locus_name(fixed_trav_name); if (contig_name == PathMetadata::NO_LOCUS_NAME) { - contig_name = ref_trav_name; + contig_name = fixed_trav_name; } else if (long_ref_contig) { // the sample name isn't unique enough, so put a full ugly name in the vcf - if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { - contig_name = ref_trav_name; + if (PathMetadata::parse_sense(fixed_trav_name) == PathSense::GENERIC) { + contig_name = fixed_trav_name; } else { contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, - PathMetadata::parse_sample_name(ref_trav_name), + PathMetadata::parse_sample_name(fixed_trav_name), contig_name, - 
PathMetadata::parse_haplotype(ref_trav_name), + PathMetadata::parse_haplotype(fixed_trav_name), PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); } @@ -939,13 +942,14 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand graph->for_each_path_handle([&](const path_handle_t& path_handle) { string path_name = graph->get_path_name(path_handle); if (!this->ref_paths.count(path_name)) { - string sample_name = graph->get_sample_name(path_handle); + path_name = RGFACover::revert_rgfa_path_name(path_name, true); + string sample_name = PathMetadata::parse_sample_name(path_name); // for backward compatibility if (sample_name == PathMetadata::NO_SAMPLE_NAME) { sample_name = path_name; } if (!ref_samples.count(sample_name)) { - size_t haplotype = graph->get_haplotype(path_handle); + size_t haplotype = PathMetadata::parse_haplotype(path_name); if (haplotype == PathMetadata::NO_HAPLOTYPE) { haplotype = 0; } @@ -1032,18 +1036,19 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand for (handle_t handle : graph->scan_path(path_handle)) { path_len += graph->get_length(handle); } - string locus_name = graph->get_locus_name(path_handle); + string fixed_refpath = RGFACover::revert_rgfa_path_name(refpath, true); + string locus_name = PathMetadata::parse_locus_name(fixed_refpath); if (locus_name == PathMetadata::NO_LOCUS_NAME) { - locus_name = refpath; + locus_name = fixed_refpath; } else if (long_ref_contig) { // the sample name isn't unique enough, so put a full ugly name in the vcf - if (graph->get_sense(path_handle) == PathSense::GENERIC) { - locus_name = graph->get_path_name(path_handle); + if (PathMetadata::parse_sense(fixed_refpath) == PathSense::GENERIC) { + locus_name = fixed_refpath; } else { locus_name = PathMetadata::create_path_name(PathSense::REFERENCE, - graph->get_sample_name(path_handle), + PathMetadata::parse_sample_name(fixed_refpath), locus_name, - graph->get_haplotype(path_handle), + PathMetadata::parse_haplotype(fixed_refpath), PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); } diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 37619b626d8..5a36ac21b5f 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -66,6 +66,29 @@ pair RGFACover::parse_rgfa_locus_name(const string& locus_name) return sample_locus; } +string RGFACover::revert_rgfa_path_name(const string& rgfa_path_name, bool strip_subrange) { + if (!is_rgfa_path_name(rgfa_path_name)) { + return rgfa_path_name; + } + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(rgfa_path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + std::tie(path_sample, path_locus) = parse_rgfa_locus_name(path_locus); + + if (path_sense == PathSense::REFERENCE && path_sample == PathMetadata::NO_SAMPLE_NAME) { + path_sense = PathSense::GENERIC; + } + return PathMetadata::create_path_name(path_sense, path_sample, + path_locus, path_haplotype, + path_phase_block, strip_subrange ? 
PathMetadata::NO_SUBRANGE : path_subrange); +} + void RGFACover::clear(MutablePathMutableHandleGraph* graph) { vector rgfa_paths; graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 3404b0aa0b6..26473dfcab9 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -39,6 +39,11 @@ class RGFACover { // break up the rGFA locus name back into original sample and locus static pair parse_rgfa_locus_name(const string& locus_name); + // undo make_rgfa_path_name, getting back an original non-rgfa name + // if the input's not a rgfa path name, just return it + static string revert_rgfa_path_name(const string& path_name, + bool strip_subrange = false); + public: // clear out the existing rGFA cover from the graph. recommended to run this // before compute() diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index aa11d17d092..4a60f64faff 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 24 +plan tests 26 vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > tiny.vg vg index tiny.vg -x tiny.xg @@ -157,3 +157,10 @@ is "$?" 0 "gbz deconstruction gives same output as gbwt deconstruction" rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon.vcf x.giraffe.gbz x.min x.dist small.s1.h1.fa small.s1.h2.fa decon.s1.h1.fa decon.s1.h2.fa +vg deconstruct rgfa/rgfa_ins2.gfa -P y -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_y.vcf +vg paths -x rgfa/rgfa_ins2.gfa -R 1 -Q x | vg deconstruct - -P _rGFA_ -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_rgfa.vcf +diff rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf +is "$?" 
0 "deconstruct properly handles rGFA reference" +is $(cat rgfa_ins2_rgfa.vcf | wc -l) 1 "deconstruct properly handles rGFA reference (sanity check)" + +rm -f rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf From f8160722c7c54c4a5e0afbf702dd017b4ff867c9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Sep 2023 09:22:51 -0400 Subject: [PATCH 09/47] fix up some edge cases --- src/deconstructor.cpp | 3 ++- src/rgfa.cpp | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index d95ee4d72d6..cd61c721c7c 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -612,7 +612,8 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { } #endif if (ref_paths.count(path_trav_name) && - (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { + (ref_trav_name.empty() || path_trav_name < ref_trav_name || + (RGFACover::is_rgfa_path_name(ref_trav_name) && !RGFACover::is_rgfa_path_name(path_trav_name)))) { ref_trav_name = path_trav_name; #ifdef debug #pragma omp critical (cerr) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5a36ac21b5f..22b3541b31f 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -327,7 +327,6 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - } } @@ -351,20 +350,21 @@ int64_t RGFACover::get_rank(nid_t node_id) const { return -1; } - const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); + int64_t interval_idx = this->node_to_interval.at(node_id); + const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); // since our decomposition is based on snarl tranversals, we know that fragments must // overlap their parents on snarl end points (at the very least) // therefore we can find parents by scanning along the rgfa paths. step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); - int64_t left_rank = 0; + int64_t left_rank = interval_idx < num_ref_intervals ? 0 : -1; if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent))); } // don't need to go next, since already one past step_handle_t right_parent = rgfa_interval.second; - int64_t right_rank = 0; + int64_t right_rank = interval_idx < num_ref_intervals ? 
0 : -1; if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent))); } @@ -464,6 +464,9 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav travs.push_back(trav); } } +#ifdef debug + cerr << "doing snarl " << pb2json(snarl.start()) << "-" << pb2json(snarl.end()) << " with " << travs.size() << " travs" << endl; +#endif // build an initial ranked list of candidate traversal fragments @@ -573,7 +576,8 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); step = graph->get_next_step(step); } - thread_rgfa_intervals.push_back(make_pair(trav[uncovered_interval.first], trav[uncovered_interval.second])); + thread_rgfa_intervals.push_back(make_pair(trav.at(uncovered_interval.first), + graph->get_next_step(trav.at(uncovered_interval.second - 1)))); } } From 7c86b03507aeac95974bb8a8ee21904ace495c18 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Sep 2023 11:15:34 -0400 Subject: [PATCH 10/47] allow multiple -S in call --- src/subcommand/call_main.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index c8dc59128c6..cdf2d4fcda1 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -58,7 +58,7 @@ void help_call(char** argv) { << " -N, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names in output" << endl << " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl << " -p, --ref-path NAME Reference path to call on (multipile allowed. defaults to all paths)" << endl - << " -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p)" << endl + << " -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p; multiple allowed)" << endl << " -o, --ref-offset N Offset in reference path (multiple allowed, 1 per path)" << endl << " -l, --ref-length N Override length of reference in the contig field of output VCF" << endl << " -d, --ploidy N Ploidy of sample. Only 1 and 2 supported. 
(default: 2)" << endl @@ -83,7 +83,7 @@ int main_call(int argc, char** argv) { string ref_fasta_filename; string ins_fasta_filename; vector ref_paths; - string ref_sample; + set ref_sample_set; vector ref_path_offsets; vector ref_path_lengths; string min_support_string; @@ -229,7 +229,7 @@ int main_call(int argc, char** argv) { ref_paths.push_back(optarg); break; case 'S': - ref_sample = optarg; + ref_sample_set.insert(optarg); break; case 'o': ref_path_offsets.push_back(parse(optarg)); @@ -363,7 +363,7 @@ int main_call(int argc, char** argv) { cerr << "error [vg call]: -c/-C no supported with -v, -l or -n" << endl; return 1; } - if (!ref_paths.empty() && !ref_sample.empty()) { + if (!ref_paths.empty() && !ref_sample_set.empty()) { cerr << "error [vg call]: -S cannot be used with -p" << endl; return 1; } @@ -507,7 +507,7 @@ int main_call(int argc, char** argv) { PathSense path_sense = PathMetadata::parse_sense(name); if (!Paths::is_alt(name) && path_sense != PathSense::HAPLOTYPE) { string sample_name = PathMetadata::parse_sample_name(name); - if (ref_sample.empty() || sample_name == ref_sample) { + if (ref_sample_set.empty() || ref_sample_set.count(sample_name)) { ref_paths.push_back(name); // keep track of length best we can using maximum coordinate in event of subpaths subrange_t subrange; From e0e3cc20a69ee5e7f4a328d705740bf959d1e43d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 22 Sep 2023 11:15:44 -0400 Subject: [PATCH 11/47] rgfa call test --- test/t/18_vg_call.t | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 6a457f4eb9f..d4af9ad9ef5 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 19 +plan tests 20 # Toy example of hand-made pileup (and hand inspected truth) to make sure some # obvious (and only obvious) SNPs are detected by vg call @@ -191,6 +191,18 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf +vg paths -x rgfa/rgfa_ins2.gfa -Q x -R 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa +vg sim -x rgfa_ins2.rgfa -n 20 -l 20 -a > rgfa_ins2.gam +vg pack -x rgfa_ins2.rgfa -g rgfa_ins2.gam -o rgfa_ins2.pack +vg call rgfa_ins2.rgfa -k rgfa_ins2.pack -Aa | grep -v "^#" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > rgfa_ins2.vcf + +printf "x 8 >1>5 G GTTTTTTTTTTTTTTTT +y 11 >2>4 TTTTTTTTTTT T,TTTT\n" > rgfa_ins2.truth +diff rgfa_ins2.vcf rgfa_ins2.truth +is $? 
0 "vg call makes expected calls on rgfa references" + +rm -f rgfa_ins2.rgfa rgfa_ins2.gam rgfa_ins2.pack rgfa_ins2.vcf rgfa_ins2.truth + From b6b955f272da6535db2810bb4e59c3ff0dcd1f10 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 23 Sep 2023 14:17:08 -0400 Subject: [PATCH 12/47] speed up rgfa path application --- src/rgfa.cpp | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 22b3541b31f..5149b784e2d 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -305,30 +305,40 @@ void RGFACover::load(const PathHandleGraph* graph, void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { assert(this->graph == static_cast(mutable_graph)); - // compute the offsets in parallel, assuming we don't have a path position index of all paths - // todo: we could be smarter about not traversing the same path multiple times! - vector rgfa_offsets(this->rgfa_intervals.size()); - vector rgfa_lengths(this->rgfa_intervals.size()); -#pragma omp parallel for + // index paths isued in rgfa cover + unordered_map step_to_offset; + unordered_set source_path_set; for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); -#ifdef debug - cerr << "computing offset for application of rgfa path " << graph->get_path_name(source_path_handle) << endl; -#endif - rgfa_offsets[i] = 0; - mutable_graph->for_each_step_in_path(source_path_handle, [&](step_handle_t step_handle) { - if (step_handle == rgfa_intervals[i].first) { - return false; + step_to_offset[rgfa_intervals[i].first] = -1; + source_path_set.insert(source_path_handle); + } + vector source_paths(source_path_set.begin(), source_path_set.end()); // for old-style omp interface +#pragma omp parallel for + for (int64_t i = 0; i < source_paths.size(); ++i) { + int64_t offset = 0; + graph->for_each_step_in_path(source_paths[i], [&](step_handle_t step_handle) { + if (step_to_offset.count(step_handle)) { + step_to_offset[step_handle] = offset; } - rgfa_offsets[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - return true; + offset += graph->get_length(graph->get_handle_of_step(step_handle)); }); + } + + // compute the offsets in parallel + vector rgfa_offsets(this->rgfa_intervals.size()); + vector rgfa_lengths(this->rgfa_intervals.size()); + +#pragma omp parallel for + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + rgfa_offsets[i] = step_to_offset.at(rgfa_intervals[i].first); rgfa_lengths[i] = 0; for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; step_handle = mutable_graph->get_next_step(step_handle)) { rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); - } + } } + step_to_offset.clear(); // write the rgfa paths for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { From e02e8a2ace6205d82fa969e45e8432bd87f91cbf Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 23 Sep 2023 14:37:22 -0400 Subject: [PATCH 13/47] sample option for surject path selection, some rgfa support --- src/hts_alignment_emitter.cpp | 6 ++++-- src/subcommand/call_main.cpp | 2 +- src/subcommand/surject_main.cpp | 14 +++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index 2df5a1fac51..b301e25f97a 100644 --- a/src/hts_alignment_emitter.cpp +++ 
b/src/hts_alignment_emitter.cpp @@ -12,6 +12,7 @@ #include "algorithms/find_translation.hpp" #include #include +#include "rgfa.hpp" #include @@ -179,6 +180,7 @@ pair>, unordered_map> extract_path auto& own_length = get<1>(path_info); auto& base_length = get<2>(path_info); string base_path_name = subpath_support ? get_path_base_name(graph, path) : graph.get_path_name(path); + base_path_name = RGFACover::revert_rgfa_path_name(base_path_name); if (!base_path_set.count(base_path_name)) { path_names_and_lengths.push_back(make_pair(base_path_name, base_length)); base_path_set.insert(base_path_name); @@ -229,7 +231,7 @@ vector> get_sequence_dictionary(const strin if (print_subrange_warnings) { subrange_t subrange; - std::string base_path_name = Paths::strip_subrange(sequence_name, &subrange); + std::string base_path_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(sequence_name), &subrange); if (subrange != PathMetadata::NO_SUBRANGE) { // The user is asking explicitly to surject to a path that is a // subrange of some other logical path, like @@ -693,7 +695,7 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector 1 && ref_sample.empty()) { + if (ref_sample_names.size() > 1 && ref_sample_set.empty()) { cerr << "error [vg call]: Multiple reference samples detected: ["; size_t count = 0; for (const string& n : ref_sample_names) { diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 8121ababf47..6d4d21e1dfa 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -39,6 +39,7 @@ void help_surject(char** argv) { << " -t, --threads N number of threads to use" << endl << " -p, --into-path NAME surject into this path or its subpaths (many allowed, default: reference, then non-alt generic)" << endl << " -F, --into-paths FILE surject into path names listed in HTSlib sequence dictionary or path list FILE" << endl + << " -n, --into-sample NAME surject into pahts from this sample (multiple allowed)" << endl << " -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads" << endl << " -M, --multimap include secondary alignments to all overlapping paths instead of just primary" << endl << " -G, --gaf-input input file is GAF instead of GAM" << endl @@ -84,6 +85,7 @@ int main_surject(int argc, char** argv) { string xg_name; vector path_names; + vector path_sample_names; string path_file; string output_format = "GAM"; string input_format = "GAM"; @@ -113,6 +115,7 @@ int main_surject(int argc, char** argv) { {"threads", required_argument, 0, 't'}, {"into-path", required_argument, 0, 'p'}, {"into-paths", required_argument, 0, 'F'}, + {"into-sample", required_argument, 0, 'n'}, {"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths {"subpath-local", required_argument, 0, 'l'}, {"interleaved", no_argument, 0, 'i'}, @@ -137,7 +140,7 @@ int main_surject(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:", + c = getopt_long (argc, argv, "hx:p:F:n:liGmcbsN:R:f:C:t:SPa:ALMVw:", long_options, &option_index); // Detect the end of the options. 
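// Illustrative sketch, not part of the patch: how the pieces added above are
// assumed to interact.  --into-sample collects every path of the named sample
// as a surjection target, and the sequence-dictionary code reverts rGFA cover
// fragment names (paths of the reserved "_rGFA_" sample, e.g. "_rGFA_#y[8]" in
// the test graphs earlier in the series) so the dictionary is keyed on the
// original path names rather than on the fragments:
//
//     string fragment_name = "_rGFA_#y[8]";  // hypothetical fragment name
//     string source_name = RGFACover::revert_rgfa_path_name(fragment_name, true);
//     // expected to yield "y": subrange stripped, original sample/locus restored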
@@ -158,6 +161,10 @@ int main_surject(int argc, char** argv) { case 'F': path_file = optarg; break; + + case 'n': + path_sample_names.push_back(optarg); + break; case 'l': subpath_global = false; @@ -279,6 +286,11 @@ int main_surject(int argc, char** argv) { // Get the paths to surject into and their length information, either from // the given file, or from the provided list, or from sniffing the graph. + for (const string& sample_name : path_sample_names) { + path_handle_graph->for_each_path_of_sample(sample_name, [&](path_handle_t path_handle) { + path_names.push_back(path_handle_graph->get_path_name(path_handle)); + }); + } vector> sequence_dictionary = get_sequence_dictionary(path_file, path_names, *xgidx); // Clear out path_names so we don't accidentally use it path_names.clear(); From 17f3f572405a483edd4863b4a48de3eddf32a566 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sun, 24 Sep 2023 19:21:02 -0400 Subject: [PATCH 14/47] fix bugs in load() and get_rank() --- src/rgfa.cpp | 101 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5149b784e2d..eefb238a187 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1,6 +1,7 @@ #include "rgfa.hpp" #include #include +#include //#define debug @@ -275,29 +276,43 @@ void RGFACover::load(const PathHandleGraph* graph, if (cur_offset + source_subrange.first == rgfa_subrange.first) { source_start = cur_step; found_start = true; + } else { + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); } - cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); return !found_start; }); assert(found_start); + assert(graph->get_id(graph->get_handle_of_step(source_start)) == + graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))); + assert(graph->get_is_reverse(graph->get_handle_of_step(source_start)) == + graph->get_is_reverse(graph->get_handle_of_step(graph->path_begin(path_handle)))); - bool found_end; + bool found_end = false; step_handle_t source_end; - for (step_handle_t cur_step = source_start; graph->has_next_step(cur_step); cur_step = graph->get_next_step(cur_step)) { - if (cur_offset == rgfa_subrange.second) { + int64_t num_steps = 0; + for (step_handle_t cur_step = source_start;; cur_step = graph->get_next_step(cur_step)) { + ++num_steps; + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + + if (cur_offset + source_subrange.first == rgfa_subrange.second) { found_end = true; source_end = cur_step; break; } - cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + if (!graph->has_next_step(cur_step)) { + break; + } } assert(found_end); + assert(num_steps == graph->get_step_count(path_handle)); // we can finally add our interval source_end = graph->get_next_step(source_end); this->rgfa_intervals.push_back(make_pair(source_start, source_end)); for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { - node_to_interval[graph->get_id(graph->get_handle_of_step(cur_step))] = rgfa_intervals.size() - 1; + int64_t cur_id = graph->get_id(graph->get_handle_of_step(cur_step)); + assert(!node_to_interval.count(cur_id)); + node_to_interval[cur_id] = rgfa_intervals.size() - 1; } }); } @@ -356,46 +371,58 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { } int64_t RGFACover::get_rank(nid_t node_id) const { - if (!node_to_interval.count(node_id)) { - return -1; - } - int64_t interval_idx = 
this->node_to_interval.at(node_id); - const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); + // search back to reference in order to find the rank. + unordered_set visited; + priority_queue> queue; + queue.push(make_pair(0, node_id)); - // since our decomposition is based on snarl tranversals, we know that fragments must - // overlap their parents on snarl end points (at the very least) - // therefore we can find parents by scanning along the rgfa paths. - step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); - int64_t left_rank = interval_idx < num_ref_intervals ? 0 : -1; - if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { - left_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(left_parent))); - } + nid_t current_id; + int64_t distance = 0; - // don't need to go next, since already one past - step_handle_t right_parent = rgfa_interval.second; - int64_t right_rank = interval_idx < num_ref_intervals ? 0 : -1; - if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { - right_rank = 1 + get_rank(graph->get_id(graph->get_handle_of_step(right_parent))); - } + while (!queue.empty()) { + std::tie(distance, current_id) = queue.top(); + queue.pop(); - return min(left_rank, right_rank); -} + if (!visited.count(current_id)) { + + visited.insert(current_id); + + if (this->node_to_interval.count(current_id)) { + int64_t interval_idx = this->node_to_interval.at(current_id); -step_handle_t RGFACover::get_step(nid_t node_id) const { - assert(node_to_interval.count(node_id)); + // we've hit the reference, can stop searching + if (interval_idx < this->num_ref_intervals) { + return distance; + } - const pair& rgfa_interval = this->rgfa_intervals.at(this->node_to_interval.at(node_id)); - path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); - for (step_handle_t step = rgfa_interval.first; step != rgfa_interval.second; step = graph->get_next_step(step)) { - if (graph->get_id(graph->get_handle_of_step(step)) == node_id) { - return step; + // visit the neighbouring intervals + const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); + step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); + if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { + queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(left_parent)))); + } + const step_handle_t& right_parent = rgfa_interval.second; + if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { + queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(right_parent)))); + } + } else { + // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) + graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + } + } } - assert(false); - return step_handle_t(); -} + // this shouldn't happen? 
+ return -1; +} + pair*, const pair*> RGFACover::get_parent_intervals(const pair& interval) const { From 4d65d74a45fbf0b715d3dfc5d2e6d8ac29c47c77 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 25 Sep 2023 15:21:10 -0400 Subject: [PATCH 15/47] ignore rgfa path fragments in depth and clip coverage calcs --- src/algorithms/coverage_depth.cpp | 26 +++++++++++++++++++------- src/algorithms/coverage_depth.hpp | 10 ++++++---- src/clip.cpp | 21 +++++++++++++++------ src/clip.hpp | 11 +++++++---- src/subcommand/clip_main.cpp | 24 +++++++++++++++++++++--- src/subcommand/depth_main.cpp | 22 +++++++++++++++++++--- 6 files changed, 87 insertions(+), 27 deletions(-) diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index c92471348da..d49e1880bf5 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -295,7 +295,8 @@ pair sample_gam_depth(const HandleGraph& graph, const vector& ignore_paths, ostream& out_stream) { assert(graph.has_path(path_name)); path_handle_t path_handle = graph.get_path_handle(path_name); @@ -311,10 +312,14 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m size_t step_count = 0; handle_t handle = graph.get_handle_of_step(step_handle); graph.for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + if (ignore_paths.count(step_path_handle)) { + return true; + } if (count_cycles) { ++step_count; } else { - path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + auto it = path_to_name.find(step_path_handle); if (it == path_to_name.end()) { string step_path_name = graph.get_path_name(step_path_handle); @@ -323,6 +328,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m } path_set.insert(it->second); } + return true; }); size_t coverage = (count_cycles ? step_count : path_set.size()) - 1; size_t node_len = graph.get_length(handle); @@ -339,7 +345,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage, bool count_cycles) { + size_t min_coverage, bool count_cycles, const unordered_set& ignore_paths) { // compute the mean and variance of our base coverage across the bin size_t bin_length = 0; @@ -357,10 +363,13 @@ pair path_depth_of_bin(const PathHandleGraph& graph, unordered_set path_set; size_t step_count = 0; graph.for_each_step_on_handle(cur_handle, [&](step_handle_t step_handle) { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); + if (ignore_paths.count(step_path_handle)) { + return true; + } if (count_cycles) { ++step_count; } else { - path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); auto it = path_to_name.find(step_path_handle); if (it == path_to_name.end()) { string step_path_name = graph.get_path_name(step_path_handle); @@ -368,7 +377,8 @@ pair path_depth_of_bin(const PathHandleGraph& graph, it = path_to_name.insert(make_pair(step_path_handle, Paths::strip_subrange(step_path_name))).first; } path_set.insert(it->second); - } + } + return true; }); size_t coverage = (count_cycles ? 
step_count : path_set.size()) - 1; @@ -386,7 +396,8 @@ vector> binned_path_depth(const PathHandle const string& path_name, size_t bin_size, size_t min_coverage, - bool count_cycles) { + bool count_cycles, + const unordered_set& ignore_paths) { path_handle_t path_handle = graph.get_path_handle(path_name); @@ -414,7 +425,8 @@ vector> binned_path_depth(const PathHandle step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; size_t bin_start = bins[i].first; size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset; - pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles); + pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles, + ignore_paths); binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); } diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 021cc57fa52..9cb7af0a1a6 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -67,14 +67,16 @@ pair sample_mapping_depth(const HandleGraph& graph, const vector /// print path-name offset base-coverage for every base on a path (just like samtools depth) /// ignoring things below min_coverage. offsets are 1-based in output stream /// coverage here is the number of steps from (unique) other paths -void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream); +void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, + const unordered_set& ignore_paths, ostream& out_stream); /// like packed_depth_of_bin (above), but use paths (as in path_depths) for measuring coverage pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage, bool count_cycles); + size_t min_coverage, bool count_cycles, const unordered_set& ignore_paths); -vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size, - size_t min_coverage, bool count_cycles); +vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, + size_t bin_size, size_t min_coverage, bool count_cycles, + const unordered_set& ignore_paths); } } diff --git a/src/clip.cpp b/src/clip.cpp index 0d2c5efd78d..45e0e5367fc 100644 --- a/src/clip.cpp +++ b/src/clip.cpp @@ -481,7 +481,8 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph function)> iterate_handles, function)> iterate_edges, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose) { + int64_t min_fragment_len, const unordered_set& ignore_paths, + bool verbose) { // find all nodes in the snarl that are not on the reference interval (reference path name from containing interval) unordered_set to_delete; @@ -502,6 +503,10 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph bool on_ref = false; size_t depth = 0; graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) { + path_handle_t path_handle = graph->get_path_handle_of_step(step_handle); + if (ignore_paths.count(path_handle)) { + return true; + } ++depth; if (depth > min_depth || on_ref) { return false; @@ -509,7 +514,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph if (!ref_prefixes.empty() || region) { // if we have a region, do exact comparison to it. 
// otherwise, do a prefix check against ref_prefix - string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + string path_name = graph->get_path_name(path_handle); if ((region && region->seq == path_name) || (!region && check_prefixes(path_name))) { on_ref = true; return false; @@ -540,6 +545,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph edge_depths.resize(edge_count + 1); graph->for_each_path_handle([&](path_handle_t path_handle) { + if (!ignore_paths.count(path_handle)) { bool is_ref_path = check_prefixes(graph->get_path_name(path_handle)); handle_t prev_handle; bool first = true; @@ -563,6 +569,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph } prev_handle = handle; }); + } }); unordered_set edges_to_delete; @@ -622,7 +629,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph } void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose) { + int64_t min_fragment_len, const unordered_set& ignore_paths, bool verbose) { function)> iterate_handles = [&] (function visit_handle) { graph->for_each_handle([&](handle_t handle) { @@ -636,11 +643,13 @@ void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_ }); }; - clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, + ignore_paths, verbose); } void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, - SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose) { + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, + const unordered_set& ignore_paths, bool verbose) { function)> iterate_handles = [&] (function visit_handle) { @@ -678,7 +687,7 @@ void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* gra } vector ref_paths_from_regions(ref_path_set.begin(), ref_path_set.end()); - clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, ignore_paths, verbose); } diff --git a/src/clip.hpp b/src/clip.hpp index e57a260b98c..4c1294aad71 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -64,19 +64,22 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph function)> iterate_handles, function)> iterate_edges, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose); + int64_t min_fragment_len, const unordered_set& ignore_paths, + bool verbose); /** * Run above function on graph */ void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose); + int64_t min_fragment_len, const unordered_set& ignore_paths, bool verbose); /** * Or on contained snarls */ -void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, - SnarlManager& snarl_manager, bool 
include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose); +void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, + const vector& regions, SnarlManager& snarl_manager, bool include_endpoints, + int64_t min_depth, int64_t min_fragment_len, + const unordered_set& ignore_paths, bool verbose); /** * clip out deletion edges diff --git a/src/subcommand/clip_main.cpp b/src/subcommand/clip_main.cpp index ddbd5956862..d219b6dd35b 100644 --- a/src/subcommand/clip_main.cpp +++ b/src/subcommand/clip_main.cpp @@ -8,6 +8,7 @@ #include #include "../clip.hpp" #include +#include "../rgfa.hpp" #include #include @@ -44,6 +45,7 @@ void help_clip(char** argv) { << " -m, --min-fragment-len N Don't write novel path fragment if it is less than N bp long" << endl << " -B, --output-bed Write BED-style file of affected intervals instead of clipped graph. " << endl << " Columns 4-9 are: snarl node-count edge-count shallow-node-count shallow-edge-count avg-degree" << endl + << " -i, --ignore-path-prefix Ignore all paths beginning with given prefix, ex when computing depth [_rGFA_]." << endl << " -t, --threads N number of threads to use [default: all available]" << endl << " -v, --verbose Print some logging messages" << endl << endl; @@ -54,6 +56,7 @@ int main_clip(int argc, char** argv) { string bed_path; string snarls_path; vector ref_prefixes; + vector ignore_prefixes = {RGFACover::rgfa_sample_name}; int64_t min_depth = -1; int64_t min_fragment_len = 0; bool verbose = false; @@ -102,13 +105,14 @@ int main_clip(int argc, char** argv) { {"snarls", required_argument, 0, 'r'}, {"min-fragment-len", required_argument, 0, 'm'}, {"output-bed", no_argument, 0, 'B'}, + {"ignore-path-prefixes", required_argument, 0, 'i'}, {"threads", required_argument, 0, 't'}, {"verbose", required_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bt:v", + c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bi:t:v", long_options, &option_index); // Detect the end of the options. 
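// Sketch of what the new -i (--ignore-path-prefixes) option is assumed to
// match; not part of the patch.  The default entry is
// RGFACover::rgfa_sample_name, i.e. "_rGFA_", so cover fragments such as
// "_rGFA_#y[8]" from the test graphs are excluded from the depth used for
// clipping while their source paths still count.  The match is a plain
// name-prefix comparison, equivalent to:
//
//     bool ignored = path_name.compare(0, prefix.length(), prefix) == 0;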
@@ -180,6 +184,9 @@ int main_clip(int argc, char** argv) { case 'B': out_bed = true; break; + case 'i': + ignore_prefixes.push_back(optarg); + break; case 'v': verbose = true; break; @@ -336,13 +343,24 @@ int main_clip(int argc, char** argv) { } if (min_depth >= 0) { + // compute paths to ignore + unordered_set ignore_set; + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + for (const string& ip : ignore_prefixes) { + if (path_name.compare(0, ip.length(), ip) == 0) { + ignore_set.insert(path_handle); + break; + } + } + }); // run the depth clipping if (bed_path.empty()) { // do the whole graph - clip_low_depth_nodes_and_edges(graph.get(), min_depth, ref_prefixes, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges(graph.get(), min_depth, ref_prefixes, min_fragment_len, ignore_set, verbose); } else { // do the contained snarls - clip_contained_low_depth_nodes_and_edges(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_depth, min_fragment_len, verbose); + clip_contained_low_depth_nodes_and_edges(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_depth, min_fragment_len, ignore_set, verbose); } } else if (max_deletion >= 0) { diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp index 54284f67fd5..2fd42cd6ae6 100644 --- a/src/subcommand/depth_main.cpp +++ b/src/subcommand/depth_main.cpp @@ -45,6 +45,7 @@ void help_depth(char** argv) { << " -P, --paths-by STR select the paths with the given name prefix" << endl << " -b, --bin-size N bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl << " -m, --min-coverage N ignore nodes with less than N coverage depth [1]" << endl + << " -i, --ignore-prefix P ignore paths with given prefix when computing path coverage [_rGFA_]" << endl << " -t, --threads N number of threads to use [all available]" << endl; } @@ -69,6 +70,7 @@ int main_depth(int argc, char** argv) { bool count_cycles = false; size_t min_coverage = 1; + vector ignore_path_prefixes; int c; optind = 2; // force optind past command positional argument @@ -87,13 +89,14 @@ int main_depth(int argc, char** argv) { {"min-mapq", required_argument, 0, 'Q'}, {"min-coverage", required_argument, 0, 'm'}, {"count-cycles", no_argument, 0, 'c'}, + {"ignore-prefix", required_argument, 0, 'i'}, {"threads", required_argument, 0, 't'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ct:", + c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ci:t:", long_options, &option_index); // Detect the end of the options. 
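// For orientation, an illustrative note rather than patch content: with the
// coverage_depth.cpp changes earlier in this patch, the per-node path depth
// reported along a reference path becomes
//
//     depth(node) = |{distinct path names stepping on node, ignored paths skipped}| - 1
//
// so when "_rGFA_" is passed with -i, a node covered only by haplotype y and by
// its own cover fragment "_rGFA_#y[8]" no longer looks shared: the fragment's
// step is skipped before the count is taken.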
@@ -138,6 +141,9 @@ int main_depth(int argc, char** argv) { case 'c': count_cycles = true; break; + case 'i': + ignore_path_prefixes.push_back(optarg); + break; case 't': { int num_threads = parse(optarg); @@ -197,6 +203,7 @@ int main_depth(int argc, char** argv) { // we want our paths sorted by the subpath parse so the output is sorted map, string> ref_paths; unordered_set base_path_set; + unordered_set ignore_paths; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); @@ -222,6 +229,14 @@ int main_depth(int argc, char** argv) { assert(!ref_paths.count(coord)); ref_paths[coord] = path_name; } + if (pack_filename.empty()) { + for (const string& ignore_prefix : ignore_path_prefixes) { + if (path_name.compare(0, ignore_prefix.length(), ignore_prefix) == 0) { + ignore_paths.insert(path_handle); + break; + } + } + } }); for (const auto& ref_name : ref_paths_input_set) { @@ -240,7 +255,8 @@ int main_depth(int argc, char** argv) { if (!pack_filename.empty()) { binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels); } else { - binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles); + binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles, + ignore_paths); } for (auto& bin_cov : binned_depth) { // bins can ben nan if min_coverage filters everything out. just skip @@ -253,7 +269,7 @@ int main_depth(int argc, char** argv) { if (!pack_filename.empty()) { algorithms::packed_depths(*packer, ref_path, min_coverage, cout); } else { - algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, cout); + algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, ignore_paths, cout); } } } From b46eaaa115d10f5a3a8752a6f54514378038c2b5 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Sep 2023 14:06:41 -0400 Subject: [PATCH 16/47] use (by default) fragment not original paths in rgfa index --- src/rgfa.cpp | 235 +++++++++++++++++++++++++++------------------------ src/rgfa.hpp | 8 +- 2 files changed, 129 insertions(+), 114 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index eefb238a187..4428463b53d 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -192,7 +192,8 @@ void RGFACover::compute(const PathHandleGraph* graph, } void RGFACover::load(const PathHandleGraph* graph, - const unordered_set& reference_paths) { + const unordered_set& reference_paths, + bool use_original_paths) { // start from scratch this->rgfa_intervals.clear(); this->node_to_interval.clear(); @@ -200,121 +201,132 @@ void RGFACover::load(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { - this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), - graph->path_end(ref_path_handle))); graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { - node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size() - 1; + node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size(); }); + this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_end(ref_path_handle))); } this->num_ref_intervals = this->rgfa_intervals.size(); - // then the rgfa cover paths - // since we want to keep our structures in terms of original paths, we have to map back - // to them here (if we don't have original paths, then we can't 
find the overlaps and - // therefore nesting relationships between them. - - // we start by making a little index in two scans, as I'm worried about quadratic path scan below otherwise - // (this does not make guarantees about degenerate fragmentation related cases tho) - unordered_map, vector> sample_locus_to_paths; - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - string locus_name = graph->get_locus_name(path_handle); - sample_locus_to_paths[parse_rgfa_locus_name(locus_name)] = {}; - }); - graph->for_each_path_handle([&](path_handle_t path_handle) { - pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); - if (sample_locus_to_paths.count(sample_locus)) { - sample_locus_to_paths[sample_locus].push_back(path_handle); - } - }); + if (!use_original_paths) { + // just suck up the rgfa fragments right into the data structure + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size(); + }); + this->rgfa_intervals.push_back(make_pair(graph->path_begin(path_handle), + graph->path_end(path_handle))); + }); + } else { + // then the rgfa cover paths + // since we want to keep our structures in terms of original paths, we have to map back + // to them here (if we don't have original paths, then we can't find the overlaps and + // therefore nesting relationships between them. + + // we start by making a little index in two scans, as I'm worried about quadratic path scan below otherwise + // (this does not make guarantees about degenerate fragmentation related cases tho) + unordered_map, vector> sample_locus_to_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + string locus_name = graph->get_locus_name(path_handle); + sample_locus_to_paths[parse_rgfa_locus_name(locus_name)] = {}; + }); + graph->for_each_path_handle([&](path_handle_t path_handle) { + pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); + if (sample_locus_to_paths.count(sample_locus)) { + sample_locus_to_paths[sample_locus].push_back(path_handle); + } + }); - // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval - // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph - // so should probably have a better error message than the asserts below (ie if exact interval match not found) - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - // pase the rgfa locus to get the original sample and locus - pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); - // find the sample in our index - const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); - // find the containing path - subrange_t rgfa_subrange = graph->get_subrange(path_handle); - assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); - int64_t rgfa_haplotype = graph->get_haplotype(path_handle); - // allow match between 0 and NO_HAPLOTYPE - if (rgfa_haplotype == PathMetadata::NO_HAPLOTYPE) { - rgfa_haplotype = 0; - } - const path_handle_t* source_path = nullptr; - subrange_t source_subrange; - for (const path_handle_t& source_path_candidate : source_paths) { - int64_t source_haplotype 
= graph->get_haplotype(source_path_candidate); - if (source_haplotype == PathMetadata::NO_HAPLOTYPE) { - source_haplotype = 0; + // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval + // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph + // so should probably have a better error message than the asserts below (ie if exact interval match not found) + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + // pase the rgfa locus to get the original sample and locus + pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); + // find the sample in our index + const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); + // find the containing path + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + int64_t rgfa_haplotype = graph->get_haplotype(path_handle); + // allow match between 0 and NO_HAPLOTYPE + if (rgfa_haplotype == PathMetadata::NO_HAPLOTYPE) { + rgfa_haplotype = 0; } - if (source_haplotype == rgfa_haplotype) { - source_subrange = graph->get_subrange(source_path_candidate); - if (source_subrange == PathMetadata::NO_SUBRANGE) { - source_subrange.first = 0; + const path_handle_t* source_path = nullptr; + subrange_t source_subrange; + for (const path_handle_t& source_path_candidate : source_paths) { + int64_t source_haplotype = graph->get_haplotype(source_path_candidate); + if (source_haplotype == PathMetadata::NO_HAPLOTYPE) { + source_haplotype = 0; } - if (source_subrange == PathMetadata::NO_SUBRANGE || source_subrange.second == PathMetadata::NO_END_POSITION) { - source_subrange.second = 0; - graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { - source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); - }); - } - if (rgfa_subrange.first >= source_subrange.first && rgfa_subrange.second <= source_subrange.second) { - source_path = &source_path_candidate; - break; + if (source_haplotype == rgfa_haplotype) { + source_subrange = graph->get_subrange(source_path_candidate); + if (source_subrange == PathMetadata::NO_SUBRANGE) { + source_subrange.first = 0; + } + if (source_subrange == PathMetadata::NO_SUBRANGE || source_subrange.second == PathMetadata::NO_END_POSITION) { + source_subrange.second = 0; + graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { + source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); + }); + } + if (rgfa_subrange.first >= source_subrange.first && rgfa_subrange.second <= source_subrange.second) { + source_path = &source_path_candidate; + break; + } } } - } - assert(source_path != nullptr); - // now find the exact interval in the containing path and update our data structure - bool found_start = false; - step_handle_t source_start; - int64_t cur_offset = 0; - graph->for_each_step_in_path(*source_path, [&](step_handle_t cur_step) { - if (cur_offset + source_subrange.first == rgfa_subrange.first) { - source_start = cur_step; - found_start = true; - } else { + assert(source_path != nullptr); + // now find the exact interval in the containing path and update our data structure + bool found_start = false; + step_handle_t source_start; + int64_t cur_offset = 0; + graph->for_each_step_in_path(*source_path, [&](step_handle_t cur_step) { + if (cur_offset + source_subrange.first == rgfa_subrange.first) { + source_start = 
cur_step; + found_start = true; + } else { + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + } + return !found_start; + }); + assert(found_start); + assert(graph->get_id(graph->get_handle_of_step(source_start)) == + graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))); + assert(graph->get_is_reverse(graph->get_handle_of_step(source_start)) == + graph->get_is_reverse(graph->get_handle_of_step(graph->path_begin(path_handle)))); + + bool found_end = false; + step_handle_t source_end; + int64_t num_steps = 0; + for (step_handle_t cur_step = source_start;; cur_step = graph->get_next_step(cur_step)) { + ++num_steps; cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); - } - return !found_start; - }); - assert(found_start); - assert(graph->get_id(graph->get_handle_of_step(source_start)) == - graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))); - assert(graph->get_is_reverse(graph->get_handle_of_step(source_start)) == - graph->get_is_reverse(graph->get_handle_of_step(graph->path_begin(path_handle)))); - - bool found_end = false; - step_handle_t source_end; - int64_t num_steps = 0; - for (step_handle_t cur_step = source_start;; cur_step = graph->get_next_step(cur_step)) { - ++num_steps; - cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); - if (cur_offset + source_subrange.first == rgfa_subrange.second) { - found_end = true; - source_end = cur_step; - break; + if (cur_offset + source_subrange.first == rgfa_subrange.second) { + found_end = true; + source_end = cur_step; + break; + } + if (!graph->has_next_step(cur_step)) { + break; + } } - if (!graph->has_next_step(cur_step)) { - break; + assert(found_end); + assert(num_steps == graph->get_step_count(path_handle)); + + // we can finally add our interval + source_end = graph->get_next_step(source_end); + this->rgfa_intervals.push_back(make_pair(source_start, source_end)); + for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { + int64_t cur_id = graph->get_id(graph->get_handle_of_step(cur_step)); + assert(!node_to_interval.count(cur_id)); + node_to_interval[cur_id] = rgfa_intervals.size() - 1; } - } - assert(found_end); - assert(num_steps == graph->get_step_count(path_handle)); - - // we can finally add our interval - source_end = graph->get_next_step(source_end); - this->rgfa_intervals.push_back(make_pair(source_start, source_end)); - for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { - int64_t cur_id = graph->get_id(graph->get_handle_of_step(cur_step)); - assert(!node_to_interval.count(cur_id)); - node_to_interval[cur_id] = rgfa_intervals.size() - 1; - } - }); + }); + } } void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { @@ -396,16 +408,17 @@ int64_t RGFACover::get_rank(nid_t node_id) const { return distance; } - // visit the neighbouring intervals + // search out of the snarl -- any parent traversals will overlap here const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); - step_handle_t left_parent = graph->get_previous_step(rgfa_interval.first); - if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(rgfa_interval.first))) { - queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(left_parent)))); - } - const step_handle_t& right_parent = rgfa_interval.second; - if (right_parent != graph->path_end(graph->get_path_handle_of_step(rgfa_interval.second))) { - 
queue.push(make_pair(distance + 1, graph->get_id(graph->get_handle_of_step(right_parent)))); - } + + graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { + queue.push(make_pair(distance + 1, graph->get_id(prev))); + }); + step_handle_t last_step = graph->get_previous_step(rgfa_interval.second); + graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { + queue.push(make_pair(distance + 1, graph->get_id(next))); + }); + } else { // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 26473dfcab9..6fa426680ef 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -57,10 +57,12 @@ class RGFACover { // load the rgfa cover from the graph, assuming it's been computed already and // saved in special rgfa paths - // this function assumes that the rGFA cover paths are exactly consistent - // with the original paths. + // the use_original_paths flag specifies whether the cover indexes the original + // paths or the rgfa paths fragments (if the former, then they must exist + // and be 100% consistent with the paths used to construct the fragments) void load(const PathHandleGraph* graph, - const unordered_set& reference_paths); + const unordered_set& reference_paths, + bool use_original_paths = false); // apply the rgfa cover to a graph (must have been computed first), adding it // as a bunch of (reference sense) paths with a special sample name From 1942dc6cfc60f866b86747b085de0465426b1bbf Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 26 Sep 2023 15:16:05 -0400 Subject: [PATCH 17/47] break ties with name when choosing cover --- src/rgfa.cpp | 24 ++++++++++-------------- src/rgfa.hpp | 12 ++++++++++++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 4428463b53d..0a205ea660b 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -478,6 +478,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav // start by finding the path traversals through the snarl vector> travs; + vector trav_names; { pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); travs.reserve(path_travs.first.size()); @@ -512,6 +513,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav std::reverse(trav.begin(), trav.end()); } travs.push_back(trav); + trav_names.push_back(trav_path_name); } } #ifdef debug @@ -520,7 +522,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav // build an initial ranked list of candidate traversal fragments - vector>>> ranked_trav_fragments; + vector ranked_trav_fragments; for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { // only a reference traversal (or deletion that we don't need to consider) // will have its first two nodes covered @@ -548,30 +550,24 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } if (!cyclic && interval_length >= minimum_length) { int64_t trav_coverage = get_coverage(trav, uncovered_interval); - ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(trav_idx, uncovered_interval))); + ranked_trav_fragments.push_back({trav_coverage, &trav_names[trav_idx], trav_idx, uncovered_interval}); } } } - // todo: typedef! 
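The ranking change in this commit replaces the bare coverage pair with a struct whose comparator prefers higher coverage and, on ties, the lexicographically smaller traversal name. A standalone sketch of that ordering is shown below; the struct is simplified and the data made up, and the real RankedFragment declared later in rgfa.hpp also carries a traversal index and node interval:

// Standalone sketch of the greedy selection order: std::make_heap with the
// struct's operator< yields a max-heap, so the fragment with the highest
// coverage comes out first, and equal coverage is broken by the
// lexicographically smaller name (the name comparison is deliberately flipped).
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct RankedFragment {
    long coverage;
    std::string name;
    bool operator<(const RankedFragment& other) const {
        return coverage < other.coverage ||
               (coverage == other.coverage && name > other.name);
    }
};

int main() {
    std::vector<RankedFragment> frags = {
        {3, "sample2#1#chr1"},
        {5, "sample1#2#chr1"},
        {5, "sample1#1#chr1"}   // ties with the previous entry on coverage
    };
    std::make_heap(frags.begin(), frags.end());
    while (!frags.empty()) {
        std::pop_heap(frags.begin(), frags.end());
        RankedFragment best = frags.back();
        frags.pop_back();
        std::cout << best.coverage << "\t" << best.name << "\n";
    }
    // Expected order: (5, sample1#1#chr1), (5, sample1#2#chr1), (3, sample2#1#chr1)
    return 0;
}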
- auto heap_comp = [] (const pair>>& s1, - const pair>>& s2) { - return s1.first < s2.first; - }; - // put the fragments into a max heap - std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); // now greedily pull out traversal intervals from the ranked list until none are left while (!ranked_trav_fragments.empty()) { // get the best scoring (max) fragment from heap auto best_stats_fragment = ranked_trav_fragments.front(); - std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); ranked_trav_fragments.pop_back(); - const vector& trav = travs.at(best_stats_fragment.second.first); - const pair& uncovered_interval = best_stats_fragment.second.second; + const vector& trav = travs.at(best_stats_fragment.trav_idx); + const pair& uncovered_interval = best_stats_fragment.fragment; #ifdef debug cerr << "best trav: "; @@ -609,8 +605,8 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } if (chopped_trav_length >= minimum_length) { int64_t trav_coverage = get_coverage(trav, chopped_interval); - ranked_trav_fragments.push_back(make_pair(trav_coverage, make_pair(best_stats_fragment.second.first, chopped_interval))); - std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + ranked_trav_fragments.push_back({trav_coverage, best_stats_fragment.name, best_stats_fragment.trav_idx, chopped_interval}); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); } } continue; diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 6fa426680ef..2a51cf7aecd 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -119,6 +119,18 @@ class RGFACover { int64_t num_ref_intervals; unordered_map node_to_interval; + + // used when selecting traversals to make the greedy cover + struct RankedFragment { + int64_t coverage; + const string* name; + int64_t trav_idx; + pair fragment; + bool operator<(const RankedFragment& f2) { + // note: name comparison is flipped because we want to select high coverage / low name + return this->coverage < f2.coverage || (this->coverage == f2.coverage && *this->name > *f2.name); + } + }; }; /// Export the given VG graph to the given GFA file. From e53973492b17162962eac3449bd9603040aa8e75 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 29 Sep 2023 15:43:17 -0400 Subject: [PATCH 18/47] work around step iterator issue in gbz --- src/rgfa.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 0a205ea660b..8aeb14419f5 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -414,7 +414,14 @@ int64_t RGFACover::get_rank(nid_t node_id) const { graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { queue.push(make_pair(distance + 1, graph->get_id(prev))); }); - step_handle_t last_step = graph->get_previous_step(rgfa_interval.second); + // hack around gbwtgraph bug (feature?) 
that does not let you decrement path_end + path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); + step_handle_t last_step; + if (rgfa_interval.second == graph->path_end(path_handle)) { + last_step = graph->path_back(path_handle); + } else { + last_step = graph->get_previous_step(rgfa_interval.second); + } graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { queue.push(make_pair(distance + 1, graph->get_id(next))); }); From 2a5ffad837c032d7cc76e1a5c6dd8e4f55b58378 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 29 Sep 2023 15:43:47 -0400 Subject: [PATCH 19/47] get rgfa tests passing; add back end range where possible --- src/algorithms/gfa_to_handle.cpp | 2 +- test/t/11_vg_paths.t | 18 +++++++++--------- test/t/18_vg_call.t | 2 +- test/t/48_vg_convert.t | 7 +++++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index abff52c2add..aae969973c3 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -303,7 +303,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* string rgfa_path_name; if (path_rank > 0) { // Special logic for off-reference paths, which get loaded into special rGFA cover paths - rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false); + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, true); } else { // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index dd00510ccfa..ec41182c368 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -62,9 +62,9 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa -printf "P _rGFA_#y[33] 10+ * -P _rGFA_#y[38] 13+ * -P _rGFA_#y[8] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +printf "P _rGFA_#y[33-34] 10+ * +P _rGFA_#y[38-39] 13+ * +P _rGFA_#y[8-9] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa grep ^P rgfa_tiny.rgfa | grep rGFA | sort > rgfa_tiny_fragments.rgfa diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa is $? 0 "Found the expected rgfa SNP cover of tiny graph" @@ -72,7 +72,7 @@ is $? 0 "Found the expected rgfa SNP cover of tiny graph" rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins.rgfa -printf "P _rGFA_#z[8] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +printf "P _rGFA_#z[8-11] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa grep ^P rgfa_ins.rgfa | grep _rGFA_ | sort > rgfa_ins_fragments.rgfa diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa is $? 0 "Found the expected rgfa cover for simple nested insertion" @@ -80,8 +80,8 @@ is $? 
0 "Found the expected rgfa cover for simple nested insertion" rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa -printf "P _rGFA_#y[8] 2+,6+,4+ * -P _rGFA_#z[11] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +printf "P _rGFA_#y[8-11] 2+,6+,4+ * +P _rGFA_#z[11-14] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa grep ^P rgfa_ins2.rgfa | grep _rGFA_ | sort > rgfa_ins2_fragments.rgfa diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa is $? 0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" @@ -101,7 +101,7 @@ is $? 0 "Found the expected rgfa tags for simple nested insertion that requires rm -f rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa -printf "P _rGFA_#y[8] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +printf "P _rGFA_#y[8-11] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa is $? 0 "rgfa Minimum fragment length filters out small fragment" @@ -110,9 +110,9 @@ rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa printf "P x 1+,5+ * -P _rGFA_#y[3] 4+,6+,2+ * +P _rGFA_#y[3-6] 4+,6+,2+ * P y 5-,4+,6+,2+,1- * -P _rGFA_#z[11] 3+ * +P _rGFA_#z[11-14] 3+ * P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index d4af9ad9ef5..28b1122f11d 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -192,7 +192,7 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf vg paths -x rgfa/rgfa_ins2.gfa -Q x -R 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa -vg sim -x rgfa_ins2.rgfa -n 20 -l 20 -a > rgfa_ins2.gam +vg sim -x rgfa_ins2.rgfa -n 30 -s 99 -l 20 -a > rgfa_ins2.gam vg pack -x rgfa_ins2.rgfa -g rgfa_ins2.gam -o rgfa_ins2.pack vg call rgfa_ins2.rgfa -k rgfa_ins2.pack -Aa | grep -v "^#" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > rgfa_ins2.vcf diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index bed23a12c41..f967c048ec0 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -189,7 +189,7 @@ is "$?" 
0 "converting gfa and equivalent rgfa produces same output" rm -f tiny.gfa.gfa tiny.rgfa.gfa -is "$(vg convert -g tiny/tiny.rgfa -r 1 | vg paths -M --paths-by y -x - | grep -v "^#" | cut -f1)" "y[35]" "rank-1 rgfa subpath found" +is "$(vg convert -g tiny/tiny.rgfa -r 1 | vg paths -M --paths-by _rGFA_ -x - | grep -v "^#" | cut -f1)" "_rGFA_#y[35-36]" "rank-1 rgfa subpath found" vg convert -g tiny/tiny.rgfa -T gfa-id-mapping.tsv > /dev/null grep ^S tiny/tiny.rgfa | awk '{print $2}' | sort > rgfa_nodes @@ -404,12 +404,15 @@ is "${?}" "0" "GFA -> HashGraph -> GFA conversion preserves path metadata" # We can't do rGFA to GBZ directly until the GBWTGraph GFA parser learns to read tags # rGFA to HashGraph to GBZ to GFA -vg paths -M -x graphs/rgfa_with_reference.rgfa | sort >paths.truth.txt +# todo: would be great if gbz could preserve the NO_HAPLOTYPE +# gbz also strips the end of the subrange +vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' -e 's/-2//g' -e 's/-8//g' | sort >paths.truth.txt vg convert -a graphs/rgfa_with_reference.rgfa > rgfa_with_reference.hg vg gbwt -x rgfa_with_reference.hg -E --gbz-format -g rgfa_with_reference.gbz vg paths -M -x rgfa_with_reference.gbz | sort >paths.gbz.txt cmp paths.gbz.txt paths.truth.txt is "${?}" "0" "rGFA -> HashGraph -> GBZ conversion preserves path metadata" +vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' | sort >paths.truth.txt vg convert -f rgfa_with_reference.gbz >extracted.gfa vg paths -M -x extracted.gfa | sort >paths.gfa.txt cmp paths.gfa.txt paths.truth.txt From c4bca6660aeaec128b94083eae5f34ea0d829b57 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 5 Oct 2023 15:57:55 -0400 Subject: [PATCH 20/47] fix option collision from last merge --- src/subcommand/paths_main.cpp | 8 ++++---- test/t/11_vg_paths.t | 12 ++++++------ test/t/18_vg_call.t | 2 +- test/t/26_deconstruct.t | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index db948397cc2..34c36a25978 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -41,7 +41,7 @@ void help_paths(char** argv) { << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl << " rGFA cover" << endl - << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -f, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." 
<< endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " output path data:" << endl @@ -139,7 +139,7 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, - {"rgfa-cover", required_argument, 0, 'R'}, + {"rgfa-min-length", required_argument, 0, 'f'}, {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, @@ -164,7 +164,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGHp:ct:", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drf:R:s:aGHp:ct:", long_options, &option_index); // Detect the end of the options. @@ -200,7 +200,7 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'R': + case 'f': rgfa_min_len = parse(optarg); output_formats++; break; diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index ec41182c368..382e6f740a0 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -61,7 +61,7 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len -vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa +vg paths -v rgfa/rgfa_tiny.gfa --rgfa-min-length 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa printf "P _rGFA_#y[33-34] 10+ * P _rGFA_#y[38-39] 13+ * P _rGFA_#y[8-9] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa @@ -71,7 +71,7 @@ is $? 0 "Found the expected rgfa SNP cover of tiny graph" rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa -vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins.rgfa +vg paths -v rgfa/rgfa_ins.gfa --rgfa-min-length 5 -Q x | vg convert - -fW > rgfa_ins.rgfa printf "P _rGFA_#z[8-11] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa grep ^P rgfa_ins.rgfa | grep _rGFA_ | sort > rgfa_ins_fragments.rgfa diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa @@ -79,7 +79,7 @@ is $? 0 "Found the expected rgfa cover for simple nested insertion" rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa printf "P _rGFA_#y[8-11] 2+,6+,4+ * P _rGFA_#z[11-14] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa grep ^P rgfa_ins2.rgfa | grep _rGFA_ | sort > rgfa_ins2_fragments.rgfa @@ -88,7 +88,7 @@ is $? 0 "Found the expected rgfa cover for simple nested insertion that requires rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | grep ^S | sort -nk2 > rgfa_ins2.rgfa +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 3 -Q x | grep ^S | sort -nk2 > rgfa_ins2.rgfa printf "S 1 CAAATAAG SN:Z:x SO:i:0 SR:i:0 S 2 TTT SN:Z:y SO:i:8 SR:i:1 S 3 TTT SN:Z:z SO:i:11 SR:i:2 @@ -100,7 +100,7 @@ is $? 
0 "Found the expected rgfa tags for simple nested insertion that requires rm -f rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa printf "P _rGFA_#y[8-11] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa @@ -108,7 +108,7 @@ is $? 0 "rgfa Minimum fragment length filters out small fragment" rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa -vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa +vg paths -v rgfa/rgfa_ins3.gfa --rgfa-min-length 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa printf "P x 1+,5+ * P _rGFA_#y[3-6] 4+,6+,2+ * P y 5-,4+,6+,2+,1- * diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 28b1122f11d..64668b15a5c 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -191,7 +191,7 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf -vg paths -x rgfa/rgfa_ins2.gfa -Q x -R 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa +vg paths -x rgfa/rgfa_ins2.gfa -Q x -f 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa vg sim -x rgfa_ins2.rgfa -n 30 -s 99 -l 20 -a > rgfa_ins2.gam vg pack -x rgfa_ins2.rgfa -g rgfa_ins2.gam -o rgfa_ins2.pack vg call rgfa_ins2.rgfa -k rgfa_ins2.pack -Aa | grep -v "^#" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > rgfa_ins2.vcf diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 4a60f64faff..2f955daedd4 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -158,7 +158,7 @@ rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon vg deconstruct rgfa/rgfa_ins2.gfa -P y -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_y.vcf -vg paths -x rgfa/rgfa_ins2.gfa -R 1 -Q x | vg deconstruct - -P _rGFA_ -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_rgfa.vcf +vg paths -x rgfa/rgfa_ins2.gfa -f 1 -Q x | vg deconstruct - -P _rGFA_ -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_rgfa.vcf diff rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf is "$?" 
0 "deconstruct properly handles rGFA reference" is $(cat rgfa_ins2_rgfa.vcf | wc -l) 1 "deconstruct properly handles rGFA reference (sanity check)" From c8419abf477701ab1f5153790d0744d244c8e55e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 5 Oct 2023 16:16:57 -0400 Subject: [PATCH 21/47] rGFA selection option --- src/subcommand/paths_main.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 34c36a25978..999654eff17 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -57,6 +57,7 @@ void help_paths(char** argv) { << " -p, --paths-file FILE select the paths named in a file (one per line)" << endl << " -Q, --paths-by STR select the paths with the given name prefix" << endl << " -S, --sample STR select the haplotypes or reference paths for this sample" << endl + << " --rgfa-paths select all rGFA fragments (must have been added with -R previously)" << endl << " -a, --variant-paths select the variant paths added by 'vg construct -a'" << endl << " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select the reference paths" << endl @@ -94,6 +95,9 @@ unordered_map SENSE_TO_STRING { {PathSense::HAPLOTYPE, "HAPLOTYPE"} }; +// Long options with no corresponding short options. +const int OPT_SELECT_RGFA_FRAGMENTS = 1000; + int main_paths(int argc, char** argv) { if (argc == 2) { @@ -155,7 +159,8 @@ int main_paths(int argc, char** argv) { {"generic-paths", no_argument, 0, 'G'}, {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, - {"coverage", no_argument, 0, 'c'}, + {"coverage", no_argument, 0, 'c'}, + {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. 
{"threads-by", required_argument, 0, 'q'}, @@ -284,6 +289,11 @@ int main_paths(int argc, char** argv) { output_formats++; break; + case OPT_SELECT_RGFA_FRAGMENTS: + sample_name = RGFACover::rgfa_sample_name; + selection_criteria++; + break; + case 't': { int num_threads = parse(optarg); @@ -350,7 +360,7 @@ int main_paths(int argc, char** argv) { std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { - std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p) cannot be used" << std::endl; + std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p, --rgfa-paths,) cannot be used" << std::endl; std::exit(EXIT_FAILURE); } if (select_alt_paths && !gbwt_file.empty()) { @@ -766,7 +776,7 @@ int main_paths(int argc, char** argv) { } for_each_selected_path([&](path_handle_t path_handle) { if (list_names) { - cout << graph->get_path_name(path_handle); + cout << RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle)); if (list_lengths) { size_t path_length = 0; for (handle_t handle : graph->scan_path(path_handle)) { From 22974f7ea03440b2556a43cbbc087ee3cc04925b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 5 Oct 2023 16:17:10 -0400 Subject: [PATCH 22/47] leave subrange in surjected-to rgfa fragments --- src/hts_alignment_emitter.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index b301e25f97a..768f517ff1c 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -695,10 +695,17 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector& dest) { From 98a67512d7d96ed1307ec85998715af1d43f829f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:37:39 -0400 Subject: [PATCH 23/47] keep rgfa subpaths in surject --- src/hts_alignment_emitter.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index 768f517ff1c..c8ae527a13f 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -151,8 +151,9 @@ static bool for_each_subpath_of(const PathPositionHandleGraph& graph, const stri /// Returns the base path name for this path (i.e. the path's name without any subrange). static string get_path_base_name(const PathPositionHandleGraph& graph, const path_handle_t& path) { - if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { - // This is a full path + string path_name = graph.get_path_name(path); + if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE || RGFACover::is_rgfa_path_name(path_name)) { + // This is a full path or an rGFA fragment that we don't want to touch return graph.get_path_name(path); } else { // This is a subpath, so remember what it's a subpath of, and use that. 
@@ -231,7 +232,14 @@ vector> get_sequence_dictionary(const strin if (print_subrange_warnings) { subrange_t subrange; - std::string base_path_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(sequence_name), &subrange); + std::string base_path_name; + if (RGFACover::is_rgfa_path_name(sequence_name)) { + base_path_name = RGFACover::revert_rgfa_path_name(sequence_name, false); + // a white lie, but we want our subranges in rgfa intervals, at least for now + subrange = PathMetadata::NO_SUBRANGE; + } else { + base_path_name = Paths::strip_subrange(sequence_name, &subrange); + } if (subrange != PathMetadata::NO_SUBRANGE) { // The user is asking explicitly to surject to a path that is a // subrange of some other logical path, like From 2ba15832861fde7d466e86902b916d5cbcb5f21c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:38:08 -0400 Subject: [PATCH 24/47] revert rgfa name processing in paths listing -- too confusing for now --- src/subcommand/paths_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 999654eff17..4ebf6ec26a5 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -776,7 +776,7 @@ int main_paths(int argc, char** argv) { } for_each_selected_path([&](path_handle_t path_handle) { if (list_names) { - cout << RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle)); + cout << graph->get_path_name(path_handle); if (list_lengths) { size_t path_length = 0; for (handle_t handle : graph->scan_path(path_handle)) { From 07aa374a1fcba9c920b5a770ac4b26820372a01e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 10:54:57 -0400 Subject: [PATCH 25/47] clean rgfa names in fasta output --- src/subcommand/paths_main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 4ebf6ec26a5..58b13910f64 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -519,7 +519,7 @@ int main_paths(int argc, char** argv) { // Write as a Path in a VG chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { - write_fasta_sequence(name, path_sequence(*graph, path), cout); + write_fasta_sequence(RGFACover::revert_rgfa_path_name(name, false), path_sequence(*graph, path), cout); } if (list_lengths) { cout << path.name() << "\t" << path_to_length(path) << endl; @@ -831,7 +831,8 @@ int main_paths(int argc, char** argv) { } else if (extract_as_vg) { chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { - write_fasta_sequence(graph->get_path_name(path_handle), path_sequence(*graph, path), cout); + write_fasta_sequence(RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle), false), + path_sequence(*graph, path), cout); } } }); From 65a5d7f763f94d6038b41fa10b278888912de9a7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 11:27:45 -0400 Subject: [PATCH 26/47] fair enough, clang --- src/rgfa.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 2a51cf7aecd..954f3d6d94d 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -126,7 +126,7 @@ class RGFACover { const string* name; int64_t trav_idx; pair fragment; - bool operator<(const RankedFragment& f2) { + bool operator<(const RankedFragment& f2) const { // note: name comparison is flipped because we want to select high coverage / low name return this->coverage < f2.coverage || (this->coverage 
== f2.coverage && *this->name > *f2.name); } From 182cf39cdee9b1e26ba2cc6e59aec1fa7b406112 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 6 Oct 2023 15:10:49 -0400 Subject: [PATCH 27/47] fix rgfa tests post merge --- src/algorithms/gfa_to_handle.cpp | 4 ---- test/t/48_vg_convert.t | 6 ++---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index aae969973c3..c8a483f10ea 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -674,10 +674,6 @@ tuple, GFAParser::chars_t, GFAPar // Grab the haplotype number int64_t haplotype_number = take_number(cursor, end, -1, "parsing haplotype number"); - if (haplotype_number == -1) { - // This field is required - throw GFAFormatError("Missing haplotype number in W line", cursor); - } take_tab(cursor, end, "parsing end of haplotype number"); // Grab the sequence/contig/locus name diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index f967c048ec0..7eb46260af5 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -404,15 +404,13 @@ is "${?}" "0" "GFA -> HashGraph -> GFA conversion preserves path metadata" # We can't do rGFA to GBZ directly until the GBWTGraph GFA parser learns to read tags # rGFA to HashGraph to GBZ to GFA -# todo: would be great if gbz could preserve the NO_HAPLOTYPE -# gbz also strips the end of the subrange -vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' -e 's/-2//g' -e 's/-8//g' | sort >paths.truth.txt +vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/-2//g' -e 's/-8//g' | sort >paths.truth.txt vg convert -a graphs/rgfa_with_reference.rgfa > rgfa_with_reference.hg vg gbwt -x rgfa_with_reference.hg -E --gbz-format -g rgfa_with_reference.gbz vg paths -M -x rgfa_with_reference.gbz | sort >paths.gbz.txt cmp paths.gbz.txt paths.truth.txt is "${?}" "0" "rGFA -> HashGraph -> GBZ conversion preserves path metadata" -vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/NO_HAPLOTYPE/0/g' -e 's/_rGFA_#/_rGFA_#0#/g' | sort >paths.truth.txt +vg paths -M -x graphs/rgfa_with_reference.rgfa | sort >paths.truth.txt vg convert -f rgfa_with_reference.gbz >extracted.gfa vg paths -M -x extracted.gfa | sort >paths.gfa.txt cmp paths.gfa.txt paths.truth.txt From 1d07f4c2a2bccc41d9f519710d40106d2c251d7e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 10 Oct 2023 12:39:39 -0400 Subject: [PATCH 28/47] start rgfa VCF annotator --- src/rgfa.cpp | 255 ++++++++++++++++++++++++++-------- src/rgfa.hpp | 10 +- src/subcommand/paths_main.cpp | 36 ++++- 3 files changed, 239 insertions(+), 62 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 8aeb14419f5..4da96291152 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -202,7 +202,19 @@ void RGFACover::load(const PathHandleGraph* graph, // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { - node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size(); + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (graph->get_is_reverse(graph->get_handle_of_step(step_handle))) { + cerr << "[rgfa] error: Reversed step " << node_id << " found in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All rGFA path fragments must be forward-only." 
<< endl; + exit(1); + } + if (node_to_interval.count(node_id)) { + cerr << "[rgfa] error: Cycle found on node " << node_id << "in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All rGFA path fragments must be acyclic." << endl; + exit(1); + + } + node_to_interval[node_id] = rgfa_intervals.size(); }); this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), graph->path_end(ref_path_handle))); @@ -385,62 +397,8 @@ void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { int64_t RGFACover::get_rank(nid_t node_id) const { // search back to reference in order to find the rank. - unordered_set visited; - priority_queue> queue; - queue.push(make_pair(0, node_id)); - - nid_t current_id; - int64_t distance = 0; - - while (!queue.empty()) { - std::tie(distance, current_id) = queue.top(); - queue.pop(); - - if (!visited.count(current_id)) { - - visited.insert(current_id); - - if (this->node_to_interval.count(current_id)) { - int64_t interval_idx = this->node_to_interval.at(current_id); - - // we've hit the reference, can stop searching - if (interval_idx < this->num_ref_intervals) { - return distance; - } - - // search out of the snarl -- any parent traversals will overlap here - const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); - - graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { - queue.push(make_pair(distance + 1, graph->get_id(prev))); - }); - // hack around gbwtgraph bug (feature?) that does not let you decrement path_end - path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); - step_handle_t last_step; - if (rgfa_interval.second == graph->path_end(path_handle)) { - last_step = graph->path_back(path_handle); - } else { - last_step = graph->get_previous_step(rgfa_interval.second); - } - graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { - queue.push(make_pair(distance + 1, graph->get_id(next))); - }); - - } else { - // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) - graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { - queue.push(make_pair(distance, graph->get_id(next))); - }); - graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { - queue.push(make_pair(distance, graph->get_id(next))); - }); - } - - } - } - - // this shouldn't happen? - return -1; + vector> ref_steps = this->get_reference_nodes(node_id, true); + return ref_steps.at(0).first; } pair*, @@ -758,5 +716,188 @@ void RGFACover::forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_gra }); } +vector> RGFACover::get_reference_nodes(nid_t node_id, bool first) const { + + // search back to reference in order to find the rank. + unordered_set visited; + priority_queue> queue; + queue.push(make_pair(0, node_id)); + + nid_t current_id; + int64_t distance = 0; + + // output reference intervals + vector> output_reference_nodes; + + while (!queue.empty()) { + std::tie(distance, current_id) = queue.top(); + queue.pop(); + + if (!visited.count(current_id)) { + + visited.insert(current_id); + + if (this->node_to_interval.count(current_id)) { + int64_t interval_idx = this->node_to_interval.at(current_id); + + const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); + + // we've hit the reference, fish out its step and stop searching. 
+ if (interval_idx < this->num_ref_intervals) { + output_reference_nodes.push_back(make_pair(distance, current_id)); + if (first) { + break; + } + continue; + } + + // search out of the snarl -- any parent traversals will overlap here + graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { + queue.push(make_pair(distance + 1, graph->get_id(prev))); + }); + // hack around gbwtgraph bug (feature?) that does not let you decrement path_end + path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); + step_handle_t last_step; + if (rgfa_interval.second == graph->path_end(path_handle)) { + last_step = graph->path_back(path_handle); + } else { + last_step = graph->get_previous_step(rgfa_interval.second); + } + graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { + queue.push(make_pair(distance + 1, graph->get_id(next))); + }); + + } else { + // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) + graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + } + + } + } + + assert(output_reference_nodes.size() > 0 && (first == (output_reference_nodes.size() == 1))); + return output_reference_nodes; +} + +void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { + vcflib::Variant var(vcf); + + // want a reference position lookup + unordered_map node_to_ref_pos; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->rgfa_intervals.at(i); + // assumption: ref intervals span entire path + assert(graph->path_begin(graph->get_path_handle_of_step(ref_interval.first)) == ref_interval.first); + int64_t pos = 0; + for (step_handle_t step_handle = ref_interval.first; step_handle != ref_interval.second; + step_handle = graph->get_next_step(step_handle)) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + assert(!graph->get_is_reverse(handle)); + assert(!node_to_ref_pos.count(node_id)); + node_to_ref_pos[node_id] = pos; + pos += graph->get_length(handle); + } + } + + // also want an rGFA path lookup + unordered_map name_to_rgfa_path; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + name_to_rgfa_path[RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle))] = path_handle; + }); + + // remove header lines we're going to add + vcf.removeInfoHeaderLine("R_CHROM"); + vcf.removeInfoHeaderLine("R_START"); + vcf.removeInfoHeaderLine("R_END"); + vcf.removeInfoHeaderLine("RANK"); + + // and add them back, so as not to duplicate them if they are already there + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + + os << vcf.header << endl; + + string prev_sequence_name; + string prev_r_chrom; + string prev_r_start; + string prev_r_end; + string prev_rank; + while (vcf.getNextVariant(var)) { + if (name_to_rgfa_path.count(var.sequenceName)) { + string r_chrom; + string r_start; + string r_end; + string rank; + if (var.sequenceName == prev_sequence_name) { + // just use the previous values, which will be the same + r_chrom = prev_r_chrom; + r_start = prev_r_start; + r_end = prev_r_end; + rank = prev_rank; + } else { + // 
compute from the cover + path_handle_t path_handle = name_to_rgfa_path.at(var.sequenceName); + nid_t first_node = graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle))); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + min_ref_pos = min(min_ref_pos, ref_pos); + max_ref_pos = max(max_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + min_rank = min(min_rank, rank_node.first); + } + + r_start = std::to_string(min_ref_pos); + r_end = std::to_string(max_ref_pos); + rank = std::to_string(min_rank); + } + + if (!var.info.count("R_CHROM")) { + var.format.push_back("R_CHROM"); + } + var.info["R_CHROM"] = {r_chrom}; + if (!var.info.count("R_START")) { + var.format.push_back("R_START"); + } + var.info["R_START"] = {r_start}; + if (!var.info.count("R_END")) { + var.format.push_back("R_END"); + } + var.info["R_END"] = {r_end}; + if (!var.info.count("RANK")) { + var.format.push_back("RANK"); + } + var.info["RANK"] = {rank}; + + prev_sequence_name = var.sequenceName; + prev_r_chrom = r_chrom; + prev_r_start = r_start; + prev_r_end = r_end; + prev_rank = rank; + + } else { + //cerr << "rGFA [warning]: VCF reference " << var.sequenceName << " not found in graph" << endl; + } + os << var << endl; + } +} + } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 954f3d6d94d..fe7559f67c6 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -85,6 +85,8 @@ class RGFACover { // return nullptr if node not in an interval const pair* get_interval(nid_t node_id) const; + // add R_CHROM, R_START, R_END, F_LEN tags to a VCF using the cover + void annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os); protected: @@ -106,7 +108,11 @@ class RGFACover { // this is always possible because they are, by definition, disjoint // this should only be run from inside apply() void forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph); - + + // search back to the reference and return when its found + // (here distance is the number of intervals crossed, aka rank) + // "first" toggles returning the first interfval found vs all of them + vector> get_reference_nodes(nid_t node_id, bool first) const; protected: @@ -133,6 +139,8 @@ class RGFACover { }; }; + + /// Export the given VG graph to the given GFA file. } diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 58b13910f64..ea7cc56096b 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -44,6 +44,7 @@ void help_paths(char** argv) { << " -f, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." 
<< endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl + << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -97,6 +98,7 @@ unordered_map SENSE_TO_STRING { // Long options with no corresponding short options. const int OPT_SELECT_RGFA_FRAGMENTS = 1000; +const int OPT_ANNOTATE_RGFA_VCF = 1001; int main_paths(int argc, char** argv) { @@ -113,6 +115,7 @@ int main_paths(int argc, char** argv) { bool drop_paths = false; bool retain_paths = false; int64_t rgfa_min_len = -1; + string rgfa_vcf_filename; string snarl_filename; string graph_file; string gbwt_file; @@ -161,6 +164,7 @@ int main_paths(int argc, char** argv) { {"haplotype-paths", no_argument, 0, 'H'}, {"coverage", no_argument, 0, 'c'}, {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, + {"rgfa-vcf", required_argument, 0, OPT_ANNOTATE_RGFA_VCF}, {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. {"threads-by", required_argument, 0, 'q'}, @@ -293,6 +297,11 @@ int main_paths(int argc, char** argv) { sample_name = RGFACover::rgfa_sample_name; selection_criteria++; break; + + case OPT_ANNOTATE_RGFA_VCF: + rgfa_vcf_filename = optarg; + output_formats++; + break; case 't': { @@ -356,11 +365,11 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { - std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p, --rgfa-paths,) cannot be used" << std::endl; + std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p, --rgfa-paths) cannot be used" << std::endl; std::exit(EXIT_FAILURE); } if (select_alt_paths && !gbwt_file.empty()) { @@ -664,8 +673,27 @@ int main_paths(int argc, char** argv) { // output the graph vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); - } - else if (coverage) { + } else if (!rgfa_vcf_filename.empty()) { + RGFACover rgfa_cover; + // load up the rank-0 reference path selection + unordered_set reference_paths; + for_each_selected_path([&](const path_handle_t& path_handle) { + reference_paths.insert(path_handle); + }); + // load up the cover (which must have been previously computed and serialized with -f) + rgfa_cover.load(graph.get(), reference_paths); + if (rgfa_cover.get_intervals().size() == 0) { + cerr << "error[vg paths]: no rGFA cover found in graph. Please compute one with -f before annotating a VCF" << endl; + exit(1); + } + vcflib::VariantCallFile variant_file; + variant_file.open(rgfa_vcf_filename); + if (!variant_file.is_open()) { + cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; + exit(1); + } + rgfa_cover.annotate_vcf(variant_file, cout); + } else if (coverage) { // for every node, count the number of unique paths. 
then add the coverage count to each one // (we're doing the whole graph here, which could be inefficient in the case the user is selecting // a small path) From b8cc93756aac1e17a78caab314e240546e69f412 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Oct 2023 10:53:18 -0400 Subject: [PATCH 29/47] Fix assertion bug --- src/rgfa.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 4da96291152..d1d8fd289e6 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -780,7 +780,7 @@ vector> RGFACover::get_reference_nodes(nid_t node_id, bool } } - assert(output_reference_nodes.size() > 0 && (first == (output_reference_nodes.size() == 1))); + assert(output_reference_nodes.size() > 0 && (!first || (output_reference_nodes.size() == 1))); return output_reference_nodes; } @@ -859,8 +859,11 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { assert(r_chrom.empty() || r_chrom == name); r_chrom = name; int64_t ref_pos = node_to_ref_pos.at(rank_node.second); - min_ref_pos = min(min_ref_pos, ref_pos); - max_ref_pos = max(max_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); min_rank = min(min_rank, rank_node.first); } From 220d33d003c2e08d4a47beafd02974c970cbe88d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 19 Oct 2023 15:05:03 -0400 Subject: [PATCH 30/47] better rgfa support in call --- src/graph_caller.cpp | 124 +++++++++++++++++++++++++++++++++++-------- src/graph_caller.hpp | 22 ++++++-- 2 files changed, 118 insertions(+), 28 deletions(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index c9a518718e0..18db98d1ce0 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -1,6 +1,7 @@ #include "graph_caller.hpp" #include "algorithms/expand_context.hpp" #include "annotation.hpp" +#include "rgfa.hpp" //#define debug @@ -18,34 +19,44 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re // Used to recurse on children of parents that can't be called size_t thread_count = get_thread_count(); - vector> snarl_queue(thread_count); + vector>> snarl_queue(thread_count); // Run the snarl caller on a snarl, and queue up the children if it fails - auto process_snarl = [&](const Snarl* snarl) { + auto process_snarl = [&](const Snarl* snarl, int ploidy_override) { if (!snarl_manager.is_trivial(snarl, graph)) { #ifdef debug cerr << "GraphCaller running call_snarl on " << pb2json(*snarl) << endl; #endif - - bool was_called = call_snarl(*snarl); - if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { - const vector& children = snarl_manager.children_of(snarl); - vector& thread_queue = snarl_queue[omp_get_thread_num()]; - thread_queue.insert(thread_queue.end(), children.begin(), children.end()); + const vector& children = snarl_manager.children_of(snarl); + vector child_ploidies(children.size(), -1); + bool was_called = call_snarl(*snarl, ploidy_override, &child_ploidies); + if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { + vector>& thread_queue = snarl_queue[omp_get_thread_num()]; + for (int64_t i = 0; i < children.size(); ++i) { + 
thread_queue.push_back(make_pair(children[i], child_ploidies[i])); + } } } }; // Start with the top level snarls - snarl_manager.for_each_top_level_snarl_parallel(process_snarl); + // Queue them up since process_snarl is no longer a valid callback for the iterator snarl_manager.for_each_top_level_snarl() + vector top_level_snarls; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + top_level_snarls.push_back(snarl); + }); +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < top_level_snarls.size(); ++i) { + process_snarl(top_level_snarls[i], -1); + } // Then recurse on any children the snarl caller failed to handle while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), - [](const vector& snarl_vec) {return snarl_vec.empty();})) { - vector cur_queue; - for (vector& thread_queue : snarl_queue) { + [](const vector>& snarl_vec) {return snarl_vec.empty();})) { + vector> cur_queue; + for (vector>& thread_queue : snarl_queue) { cur_queue.reserve(cur_queue.size() + thread_queue.size()); std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); @@ -53,7 +64,7 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re #pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < cur_queue.size(); ++i) { - process_snarl(cur_queue[i]); + process_snarl(cur_queue[i].first, cur_queue[i].second); } } @@ -175,8 +186,48 @@ vector GraphCaller::break_chain(const HandleGraph& graph, const Chain& ch return chain_frags; } + + +void GraphCaller::resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector& travs, + const vector& genotype, vector& child_ploidies) { + const vector& children = snarl_manager.children_of(snarl); + + child_ploidies.resize(children.size()); -VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false) + if (!children.empty()) { + // index the nodes in the traversals + vector> trav_indexes(genotype.size()); + for (int64_t i = 0; i < genotype.size(); ++i) { + const SnarlTraversal& trav = travs[genotype[i]]; + unordered_set& trav_idx = trav_indexes[i]; + if (i > 0 && genotype[i] == genotype[i-1]) { + trav_idx = trav_indexes[i-1]; + } else { + for (int j = 0; j < trav.visit_size(); ++j) { + trav_idx.insert(graph.get_handle(trav.visit(j).node_id(), trav.visit(j).backward())); + } + } + } + + // for every child snarl, count the number of traversals that spans it -- this will be its ploidy + for (int64_t i = 0; i < children.size(); ++i) { + const Snarl* child_snarl = children[i]; + child_ploidies[i] = 0; + handle_t child_start = graph.get_handle(child_snarl->start().node_id(), child_snarl->start().backward()); + handle_t child_end = graph.get_handle(child_snarl->end().node_id(), child_snarl->end().backward()); + for (int64_t j = 0; j < trav_indexes.size(); ++j) { + unordered_set& trav_idx = trav_indexes[j]; + if ((trav_idx.count(child_start) && trav_idx.count(child_end)) || + (trav_idx.count(graph.flip(child_start)) && trav_idx.count(graph.flip(child_end)))) { + ++child_ploidies[i]; + } + } + } + } +} + + +VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false), write_full_names(false) { output_variants.resize(get_thread_count()); } @@ -491,11 +542,11 @@ void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa // resolve subpath naming subrange_t subrange; - string basepath_name = 
Paths::strip_subrange(ref_path_name, &subrange); + string basepath_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(ref_path_name), &subrange); size_t basepath_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; // in VCF we usually just want a contig string contig_name = PathMetadata::parse_locus_name(basepath_name); - if (contig_name != PathMetadata::NO_LOCUS_NAME) { + if (contig_name != PathMetadata::NO_LOCUS_NAME && !this->write_full_names) { basepath_name = contig_name; } // fill out the rest of the variant @@ -766,6 +817,20 @@ void VCFOutputCaller::scan_snarl(const string& allele_string, function& ref_paths) { + set samples; + set haplotypes; + for (const string& path_name : ref_paths) { + samples.insert(PathMetadata::parse_sample_name(path_name)); + haplotypes.insert(PathMetadata::parse_haplotype(path_name)); + if (samples.size() > 1 || haplotypes.size() > 1) { + this->write_full_names = true; + return; + } + } + this->write_full_names = false; +} + GAFOutputCaller::GAFOutputCaller(AlignmentEmitter* emitter, const string& sample_name, const vector& ref_paths, size_t trav_padding) : emitter(emitter), @@ -1054,7 +1119,7 @@ VCFGenotyper::~VCFGenotyper() { } -bool VCFGenotyper::call_snarl(const Snarl& snarl) { +bool VCFGenotyper::call_snarl(const Snarl& snarl, int ploidy_override, vector* out_child_ploidies) { // could be that our graph is a subgraph of the graph the snarls were computed from // so bypass snarls we can't process @@ -1296,6 +1361,8 @@ LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph, // our graph is not in vg format. we will make graphs for each site as needed and work with those traversal_finder = nullptr; } + + this->toggle_full_names_from_paths(ref_paths); } LegacyCaller::~LegacyCaller() { @@ -1305,7 +1372,7 @@ LegacyCaller::~LegacyCaller() { } } -bool LegacyCaller::call_snarl(const Snarl& snarl) { +bool LegacyCaller::call_snarl(const Snarl& snarl, int ploidy_override, vector* out_child_ploidies) { // if we can't handle the snarl, then the GraphCaller framework will recurse on its children if (!is_traversable(snarl)) { @@ -1630,13 +1697,14 @@ FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; } + this->toggle_full_names_from_paths(ref_paths); } FlowCaller::~FlowCaller() { } -bool FlowCaller::call_snarl(const Snarl& managed_snarl) { +bool FlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector* out_child_ploidies) { // todo: In order to experiment with merging consecutive snarls to make longer traversals, // I am experimenting with sending "fake" snarls through this code. So make a local @@ -1644,6 +1712,11 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // wants a pointer will crash. Snarl snarl = managed_snarl; + if (ploidy_override == 0) { + // returning true is a bit ironic, but we do *not* want to recurse so it's important we do + return true; + } + if (snarl.start().node_id() == snarl.end().node_id() || !graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { // can't call one-node or out-of graph snarls. @@ -1803,10 +1876,14 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // use our support caller to choose our genotype vector trav_genotype; unique_ptr trav_call_info; - int ploidy = ref_ploidies[ref_path_name]; + int ploidy = ploidy_override == -1 ? 
ref_ploidies[ref_path_name] : ploidy_override; std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); - + if (out_child_ploidies != nullptr) { + // derive ploidies for child snarls from the genotype traversals and save them + resolve_child_ploidies(graph, &managed_snarl, travs, trav_genotype, *out_child_ploidies); + } + assert(trav_genotype.empty() || trav_genotype.size() == ploidy); if (!gaf_output) { @@ -1866,14 +1943,15 @@ NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, ref_path_set.insert(ref_paths[i]); ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; } - + + this->toggle_full_names_from_paths(ref_paths); } NestedFlowCaller::~NestedFlowCaller() { } -bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl) { +bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector* out_child_ploidies) { // remember the calls for each child snarl in this table CallTable call_table; diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 25d719aeb3d..c753bb315ff 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -49,12 +49,16 @@ class GraphCaller { RecurseType recurise_type = RecurseOnFail); /// Call a given snarl, and print the output to out_stream - virtual bool call_snarl(const Snarl& snarl) = 0; + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr) = 0; protected: /// Break up a chain into bits that we want to call using size heuristics vector break_chain(const HandleGraph& graph, const Chain& chain, size_t max_edges, size_t max_trivial); + + /// Compute the child ploidies baseds on the called genotype + void resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector& travs, + const vector& genotype, vector& child_ploidies); protected: @@ -135,6 +139,11 @@ class VCFOutputCaller { // update the PS and LV tags in the output buffer (called in write_variants if include_nested is true) void update_nesting_info_tags(const SnarlManager* snarl_manager); + + // toggle write_full_names automatically depending on ref_paths + // (keeps backwards compatibility when it was always false whenever possible, + // but flips to full names when necessary) + void toggle_full_names_from_paths(const vector& ref_paths); /// output vcf mutable vcflib::VariantCallFile output_vcf; @@ -154,6 +163,9 @@ class VCFOutputCaller { // need to write LV/PS info tags bool include_nested; + + // toggle writing full name or just contig name + bool write_full_names; }; /** @@ -221,7 +233,7 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutpu virtual ~VCFGenotyper(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -274,7 +286,7 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { virtual ~LegacyCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -375,7 +387,7 @@ class FlowCaller : public GraphCaller, 
public VCFOutputCaller, public GAFOutputC virtual ~FlowCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -449,7 +461,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO virtual ~NestedFlowCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; From f23e65a8908419bbe108e2bb15dd031e275193b9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Nov 2023 11:42:10 -0500 Subject: [PATCH 31/47] fix merge re -S option --- src/subcommand/call_main.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index a8e76c8e478..1d59ac45b71 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -584,8 +584,18 @@ int main_call(int argc, char** argv) { // make sure we have some ref paths if (ref_paths.empty()) { - if (!ref_sample.empty()) { - cerr << "error [vg call]: No paths with selected reference sample \"" << ref_sample << "\" found. " + if (!ref_sample_set.empty()) { + string sample_string; + if (ref_sample_set.size() == 1) { + sample_string = "sample \"" + *ref_sample_set.begin() + "\""; + } else { + sample_string = "samples \""; + for (const string& rsn : ref_sample_set) { + sample_string += (sample_string.length() > 9 ? " " : "") + rsn; + } + sample_string += "\""; + } + cerr << "error [vg call]: No paths with selected reference " << sample_string << " found. 
" << "Try using vg paths -M to see which samples are in your graph" << endl; return 1; } From 6ebd95f1645bc796d0f9a4260803fa2fe0f75b78 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 27 Mar 2024 12:51:02 -0400 Subject: [PATCH 32/47] merge contiguous rgfa fragments --- src/rgfa.cpp | 70 +++++++++++++++++++++++++++++++++++++++++++--------- src/rgfa.hpp | 7 ++++++ 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d1d8fd289e6..237c7cb8f81 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -146,7 +146,9 @@ void RGFACover::compute(const PathHandleGraph* graph, vector>& thread_rgfa_intervals = rgfa_intervals_vector[omp_get_thread_num()]; unordered_map& thread_node_to_interval = node_to_interval_vector[omp_get_thread_num()]; - vector queue = {snarl}; + vector queue = {snarl}; + + cerr << "top level snarl " << pb2json(*snarl) << endl; while(!queue.empty()) { const Snarl* cur_snarl = queue.back(); @@ -576,19 +578,13 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } continue; } - - // add the interval to the local (thread safe) structures - step_handle_t step = trav[uncovered_interval.first]; - int64_t interval_length = uncovered_interval.second - uncovered_interval.first; + pair new_interval = make_pair(trav.at(uncovered_interval.first), + graph->get_next_step(trav.at(uncovered_interval.second - 1))); #ifdef debug + int64_t interval_length = uncovered_interval.second - uncovered_interval.first; cerr << "adding interval with length " << interval_length << endl; #endif - for (int64_t i = 0; i < interval_length; ++i) { - thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); - step = graph->get_next_step(step); - } - thread_rgfa_intervals.push_back(make_pair(trav.at(uncovered_interval.first), - graph->get_next_step(trav.at(uncovered_interval.second - 1)))); + add_interval(thread_rgfa_intervals, thread_node_to_interval, new_interval); } } @@ -621,6 +617,54 @@ vector> RGFACover::get_uncovered_intervals(const vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval) { + + bool merged = false; + path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); + + // check the before-first step. if it's in an interval then it must be immediately + // preceeding so we merge the new interval to the end of the found interval + step_handle_t before_first_step = graph->get_previous_step(new_interval.first); + if (before_first_step != graph->path_front_end(graph->get_path_handle_of_step(before_first_step))) { + nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); + if (thread_node_to_interval.count(prev_node_id)) { + pair& prev_interval = thread_rgfa_intervals[thread_node_to_interval[prev_node_id]]; + if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { + assert(prev_interval.second == new_interval.first); + prev_interval.second = new_interval.second; + merged = true; + } + } + } + + // check the end step. 
if it's in an interval then it must be immediately + // following we merge the new interval to the front of the found interval + if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { + nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); + if (thread_node_to_interval.count(next_node_id)) { + pair& next_interval = thread_rgfa_intervals[thread_node_to_interval[next_node_id]]; + path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); + if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { + assert(next_interval.first == new_interval.second); + next_interval.first = new_interval.first; + merged = true; + } + } + } + + // add the interval to the local (thread safe) structures + if (!merged) { + for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); + } + thread_rgfa_intervals.push_back(new_interval); + } + + return !merged; +} + int64_t RGFACover::get_coverage(const vector& trav, const pair& uncovered_interval) { path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); int64_t coverage = 0; @@ -635,7 +679,9 @@ int64_t RGFACover::get_coverage(const vector& trav, const pair> get_uncovered_intervals(const vector& trav, const unordered_map& thread_node_to_interval); + // add a new interval into the rgfa_intervals veector and update the node_to_interval map + // if the interval can be merged into an existing, contiguous interval, do that instead + // returns true if a new interval was added, false if an existing interval was updated + bool add_interval(vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval); + // get the total coverage of a traversal (sum of step lengths) int64_t get_coverage(const vector& trav, const pair& uncovered_interval); From 9680149881b32d989520dc103da80d5907cedf8c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 28 Mar 2024 16:11:30 -0400 Subject: [PATCH 33/47] disable rank check --- src/algorithms/gfa_to_handle.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index c8a483f10ea..d58479cdc72 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -303,7 +303,7 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* string rgfa_path_name; if (path_rank > 0) { // Special logic for off-reference paths, which get loaded into special rGFA cover paths - rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, true); + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false); } else { // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, @@ -1044,7 +1044,13 @@ void GFAParser::parse(istream& in) { } else { // This path existed already. 
Make sure we aren't showing a conflicting rank if (rgfa_path_rank != get<0>(found->second)) { - throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second))); + // vg paths can now write different fragments of the same sample/hap with different ranks + // (unlike minigraph where all fragments of teh same assembly share a rank) + // by consequence, this check needs to be disabled. + // TODO: we still want to check that each individual fragment has consisten ranks + // so the check should be moved downstream + // + // throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second))); } } auto& visit_queue = get<2>(found->second); From 89af9b6c89ad058234544cf8ec6468a94407d688 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 5 Apr 2024 09:57:04 -0400 Subject: [PATCH 34/47] auto-add rgfa prefix during rgfa conversion --- src/subcommand/convert_main.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index 7474f5a521e..d8981e1f02d 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -6,6 +6,7 @@ #include "../algorithms/find_gbwtgraph.hpp" #include "../io/save_handle_graph.hpp" #include "../gfa.hpp" +#include "../rgfa.hpp" #include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include @@ -236,6 +237,13 @@ int main_convert(int argc, char** argv) { cerr << "[vg convert] warning: vg-protobuf output (-v / --vg-out) is deprecated. please use -p instead." << endl; } + // we interpret the user path selections as reference path selections (consistent with vg paths) + // so we need to add the rgfa keyword to pick up the nonref paths + if ((!rgfa_paths.empty() || !rgfa_prefixes.empty()) && + std::find(rgfa_prefixes.begin(), rgfa_prefixes.end(), RGFACover::rgfa_sample_name) == rgfa_prefixes.end()) { + rgfa_prefixes.push_back(RGFACover::rgfa_sample_name); + } + // with -F or -G we convert an alignment and not a graph if (input == input_gam || input == input_gaf) { From 7ce91ebf7b75f96dcf463f94ff0f063155a4bf29 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 5 Apr 2024 16:45:37 -0400 Subject: [PATCH 35/47] merge adjacent intervals from different threads --- src/rgfa.cpp | 76 ++++++++++++++++++++++++++++++++++++---------------- src/rgfa.hpp | 3 ++- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 237c7cb8f81..243899d2425 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -128,6 +128,7 @@ void RGFACover::compute(const PathHandleGraph* graph, this->num_ref_intervals = this->rgfa_intervals.size(); #ifdef debug +#pragma omp critical(cerr) cerr << "[rgfa] Selected " << rgfa_intervals.size() << " rank=0 reference paths" << endl; #endif @@ -148,8 +149,6 @@ void RGFACover::compute(const PathHandleGraph* graph, vector queue = {snarl}; - cerr << "top level snarl " << pb2json(*snarl) << endl; - while(!queue.empty()) { const Snarl* cur_snarl = queue.back(); queue.pop_back(); @@ -168,24 +167,23 @@ void RGFACover::compute(const PathHandleGraph* graph, } }); - // now we need to fold up the thread covers + // now we need to fold up the thread covers for (int64_t t = 0; t < thread_count; ++t) { - int64_t offset = this->rgfa_intervals.size(); - - // todo: figure out one-shot stl invocation to move - for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); 
++j) { - this->rgfa_intervals.push_back(rgfa_intervals_vector[t][j]); - } #ifdef debug +#pragma omp critical(cerr) cerr << "Adding " << rgfa_intervals_vector[t].size() << " rgfa intervals from thread " << t << endl; #endif - rgfa_intervals_vector[t].clear(); - - for (const auto& node_interval : node_to_interval_vector[t]) { - this->node_to_interval[node_interval.first] = node_interval.second + offset; + // important to go through function rather than do a raw copy since + // inter-top-level snarl merging may need to happen + for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) { + // the true flag at the end disables the overlap check. since they were computed + // in separate threads, snarls can overlap by a single node + add_interval(this->rgfa_intervals, this->node_to_interval, rgfa_intervals_vector[t][j], true); } + rgfa_intervals_vector[t].clear(); node_to_interval_vector[t].clear(); } + // todo: we could optionally go through uncovered nodes and try to greedily search // for rgfa intervals at this point, since it seems for some graphs there are @@ -345,7 +343,10 @@ void RGFACover::load(const PathHandleGraph* graph, void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { assert(this->graph == static_cast(mutable_graph)); - +#ifdef debug + cerr << "applying rGFA cover with " << this->num_ref_intervals << " ref intervals " + << " and " << this->rgfa_intervals.size() << " total intervals" << endl; +#endif // index paths isued in rgfa cover unordered_map step_to_offset; unordered_set source_path_set; @@ -484,6 +485,7 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav } } #ifdef debug +#pragma omp critical(cerr) cerr << "doing snarl " << pb2json(snarl.start()) << "-" << pb2json(snarl.end()) << " with " << travs.size() << " travs" << endl; #endif @@ -537,9 +539,12 @@ void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav const pair& uncovered_interval = best_stats_fragment.fragment; #ifdef debug +#pragma omp critical(cerr) + { cerr << "best trav: "; for (auto xx : trav) cerr << " " << graph->get_id(graph->get_handle_of_step(xx)); cerr << endl << "uncovered interval [" << uncovered_interval.first << "," << uncovered_interval.second << "]" <get_next_step(trav.at(uncovered_interval.second - 1))); #ifdef debug int64_t interval_length = uncovered_interval.second - uncovered_interval.first; +#pragma omp critical(cerr) cerr << "adding interval with length " << interval_length << endl; #endif add_interval(thread_rgfa_intervals, thread_node_to_interval, new_interval); @@ -619,9 +625,17 @@ vector> RGFACover::get_uncovered_intervals(const vector>& thread_rgfa_intervals, unordered_map& thread_node_to_interval, - const pair& new_interval) { + const pair& new_interval, + bool global) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; +#endif bool merged = false; + int64_t merged_interval_idx = -1; path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); // check the before-first step. 
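// [Editor's note, illustrative only; ids invented.] The global=true mode used in the fold-up
// above exists because two worker threads that processed adjacent top-level snarls both
// include the shared boundary node, so their intervals on the same fragment path may overlap
// by exactly one step. For example, thread 0 might contribute [step(10), step(13)) (nodes
// 10-12) and thread 1 [step(12), step(15)) (nodes 12-14); with global=true the one-node
// overlap is accepted and the two are stitched into [step(10), step(15)), whereas the
// per-thread calls (global=false) insist that merged intervals abut exactly.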
if it's in an interval then it must be immediately @@ -630,11 +644,19 @@ bool RGFACover::add_interval(vector>& thread_ if (before_first_step != graph->path_front_end(graph->get_path_handle_of_step(before_first_step))) { nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); if (thread_node_to_interval.count(prev_node_id)) { - pair& prev_interval = thread_rgfa_intervals[thread_node_to_interval[prev_node_id]]; + int64_t prev_idx = thread_node_to_interval[prev_node_id]; + pair& prev_interval = thread_rgfa_intervals[prev_idx]; if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { - assert(prev_interval.second == new_interval.first); +#ifdef debug +#pragma omp critical(cerr) + cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(prev_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; +#endif + assert(global || prev_interval.second == new_interval.first); prev_interval.second = new_interval.second; merged = true; + merged_interval_idx = prev_idx; } } } @@ -644,24 +666,32 @@ bool RGFACover::add_interval(vector>& thread_ if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); if (thread_node_to_interval.count(next_node_id)) { - pair& next_interval = thread_rgfa_intervals[thread_node_to_interval[next_node_id]]; + int64_t next_idx = thread_node_to_interval[next_node_id]; + pair& next_interval = thread_rgfa_intervals[next_idx]; path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { - assert(next_interval.first == new_interval.second); +#ifdef debug +#pragma omp critical(cerr) + cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; +#endif + assert(global || next_interval.first == new_interval.second); next_interval.first = new_interval.first; merged = true; + merged_interval_idx = next_idx; } } } // add the interval to the local (thread safe) structures if (!merged) { - for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { - thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = thread_rgfa_intervals.size(); - } + merged_interval_idx = thread_rgfa_intervals.size(); thread_rgfa_intervals.push_back(new_interval); } - + for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } return !merged; } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 74311da333f..6204621b991 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -106,7 +106,8 @@ class RGFACover { // returns true if a new interval was added, false if an existing interval was updated bool add_interval(vector>& thread_rgfa_intervals, unordered_map& thread_node_to_interval, - const pair& new_interval); + const pair& new_interval, + bool global = false); // get the total coverage of a traversal (sum of step lengths) int64_t get_coverage(const vector& trav, const pair& uncovered_interval); From 
0fa7487a13f70cf36077a534844bc0b844e41901 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 10 Apr 2024 13:08:13 -0400 Subject: [PATCH 36/47] fix merge --- src/graph_caller.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index b0987eafedb..fd09a799224 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -66,7 +66,15 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re }; // Start with the top level snarls - snarl_manager.for_each_top_level_snarl_parallel(process_snarl); + // Queue them up since process_snarl is no longer a valid callback for the iterator snarl_manager.for_each_top_level_snarl() + vector top_level_snarls; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + top_level_snarls.push_back(snarl); + }); +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < top_level_snarls.size(); ++i) { + process_snarl(top_level_snarls[i], -1); + } if (show_progress) cerr << "[vg call]: Finished processing " << top_snarl_count << " top-level snarls" << endl; top_level = false; From 1f5e9280ec2bcf75c59deb1c3f8777d06d3330ac Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 11 Apr 2024 14:30:05 -0400 Subject: [PATCH 37/47] fix bug where cyclic paths mishandled during interval merging --- src/rgfa.cpp | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 243899d2425..2068555b272 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -631,8 +631,10 @@ bool RGFACover::add_interval(vector>& thread_ #ifdef debug #pragma omp critical(cerr) cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) + << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.first)) ? "<" : ">") << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; #endif bool merged = false; int64_t merged_interval_idx = -1; @@ -645,18 +647,22 @@ bool RGFACover::add_interval(vector>& thread_ nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); if (thread_node_to_interval.count(prev_node_id)) { int64_t prev_idx = thread_node_to_interval[prev_node_id]; - pair& prev_interval = thread_rgfa_intervals[prev_idx]; + pair& prev_interval = thread_rgfa_intervals[prev_idx]; if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { + if (prev_interval.second == new_interval.first || + (global && graph->get_previous_step(prev_interval.second) == new_interval.first)) { #ifdef debug #pragma omp critical(cerr) - cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) - << ":" << graph->get_id(graph->get_handle_of_step(prev_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; -#endif - assert(global || prev_interval.second == new_interval.first); - prev_interval.second = new_interval.second; - merged = true; - merged_interval_idx = prev_idx; + cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) + << ":" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.first)) ? 
"<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.first)) + << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; +#endif + prev_interval.second = new_interval.second; + merged = true; + merged_interval_idx = prev_idx; + } } } } @@ -670,16 +676,18 @@ bool RGFACover::add_interval(vector>& thread_ pair& next_interval = thread_rgfa_intervals[next_idx]; path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { + if (next_interval.first == new_interval.second || + (global && next_interval.first == graph->get_previous_step(new_interval.second))) { #ifdef debug #pragma omp critical(cerr) - cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) - << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; -#endif - assert(global || next_interval.first == new_interval.second); - next_interval.first = new_interval.first; - merged = true; - merged_interval_idx = next_idx; + cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) + << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; +#endif + next_interval.first = new_interval.first; + merged = true; + merged_interval_idx = next_idx; + } } } } From 56dc5c212d7858ef8b5cffacd74831d940ce8637 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 12 Apr 2024 12:46:23 -0400 Subject: [PATCH 38/47] Revert "auto-add rgfa prefix during rgfa conversion" This reverts commit 89af9b6c89ad058234544cf8ec6468a94407d688. --- src/subcommand/convert_main.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index d8981e1f02d..7474f5a521e 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -6,7 +6,6 @@ #include "../algorithms/find_gbwtgraph.hpp" #include "../io/save_handle_graph.hpp" #include "../gfa.hpp" -#include "../rgfa.hpp" #include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include @@ -237,13 +236,6 @@ int main_convert(int argc, char** argv) { cerr << "[vg convert] warning: vg-protobuf output (-v / --vg-out) is deprecated. please use -p instead." 
<< endl; } - // we interpret the user path selections as reference path selections (consistent with vg paths) - // so we need to add the rgfa keyword to pick up the nonref paths - if ((!rgfa_paths.empty() || !rgfa_prefixes.empty()) && - std::find(rgfa_prefixes.begin(), rgfa_prefixes.end(), RGFACover::rgfa_sample_name) == rgfa_prefixes.end()) { - rgfa_prefixes.push_back(RGFACover::rgfa_sample_name); - } - // with -F or -G we convert an alignment and not a graph if (input == input_gam || input == input_gaf) { From 54712e1a88e16bd563ad39859478fb4788e0458c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 12 Apr 2024 12:50:25 -0400 Subject: [PATCH 39/47] fix debug message --- src/rgfa.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 2068555b272..5dae86dacd8 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -632,9 +632,13 @@ bool RGFACover::add_interval(vector>& thread_ #pragma omp critical(cerr) cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.first)) ? "<" : ">") - << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)) - << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)); + if (new_interval.second == graph->path_end(graph->get_path_handle_of_step(new_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + } #endif bool merged = false; int64_t merged_interval_idx = -1; @@ -655,9 +659,13 @@ bool RGFACover::add_interval(vector>& thread_ #pragma omp critical(cerr) cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) << ":" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.first)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(prev_interval.first)) - << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; + << graph->get_id(graph->get_handle_of_step(prev_interval.first)); + if (prev_interval.second == graph->path_end(graph->get_path_handle_of_step(prev_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? 
"<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; + } #endif prev_interval.second = new_interval.second; merged = true; From 69fc397e004b64ed67103fac643e653bbdcaafff Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 12 Apr 2024 12:59:56 -0400 Subject: [PATCH 40/47] fix debug msg --- src/rgfa.cpp | 8 ++++++-- src/subcommand/paths_main.cpp | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 5dae86dacd8..b4acf7b21c9 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -689,8 +689,12 @@ bool RGFACover::add_interval(vector>& thread_ #ifdef debug #pragma omp critical(cerr) cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) - << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)) - << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)); + if (next_interval.second == graph->path_end(graph->get_path_handle_of_step(next_interval.second))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; + } #endif next_interval.first = new_interval.first; merged = true; diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index ea7cc56096b..b045247463c 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -42,7 +42,7 @@ void help_paths(char** argv) { << " -r, --retain-paths output a graph with only the selected paths retained" << endl << " rGFA cover" << endl << " -f, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl - << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover." << endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl << " output path data:" << endl From 3093eff4a9df7e989e46f2e57afe99d26ecc798a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 13 Apr 2024 14:58:19 -0400 Subject: [PATCH 41/47] fix bug where double-ended merge would lead to invalid overlap --- src/rgfa.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++---- src/rgfa.hpp | 5 +++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index b4acf7b21c9..12b0bf56e6b 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -183,7 +183,9 @@ void RGFACover::compute(const PathHandleGraph* graph, rgfa_intervals_vector[t].clear(); node_to_interval_vector[t].clear(); } - + + // remove any intervals that were made redundant by add_interval + defragment_intervals(); // todo: we could optionally go through uncovered nodes and try to greedily search // for rgfa intervals at this point, since it seems for some graphs there are @@ -677,6 +679,7 @@ bool RGFACover::add_interval(vector>& thread_ // check the end step. 
if it's in an interval then it must be immediately // following we merge the new interval to the front of the found interval + int64_t deleted_idx = -1; if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); if (thread_node_to_interval.count(next_node_id)) { @@ -696,9 +699,19 @@ bool RGFACover::add_interval(vector>& thread_ cerr << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; } #endif - next_interval.first = new_interval.first; - merged = true; - merged_interval_idx = next_idx; + if (merged == true) { + // decomission next_interval + next_interval.first = graph->path_end(next_path); + next_interval.second = graph->path_front_end(next_path); + deleted_idx = next_idx; + // extend the previous interval right + thread_rgfa_intervals[merged_interval_idx].second = new_interval.second; + } else { + // extend next_interval left + next_interval.first = new_interval.first; + merged = true; + merged_interval_idx = next_idx; + } } } } @@ -712,9 +725,35 @@ bool RGFACover::add_interval(vector>& thread_ for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; } + if (deleted_idx >= 0) { + // move the links to the deleted interval to the new interval + const pair& deleted_interval = thread_rgfa_intervals[deleted_idx]; + for (step_handle_t step = deleted_interval.first; step != deleted_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + } return !merged; } +void RGFACover::defragment_intervals() { + vector> new_intervals; + this->node_to_interval.clear(); + for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + if (interval.first != graph->path_end(path_handle)) { + new_intervals.push_back(interval); + } + } + this->rgfa_intervals = std::move(new_intervals); + for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + this->node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = i; + } + } +} + int64_t RGFACover::get_coverage(const vector& trav, const pair& uncovered_interval) { path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); int64_t coverage = 0; diff --git a/src/rgfa.hpp b/src/rgfa.hpp index 6204621b991..a5f44a74137 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -109,6 +109,11 @@ class RGFACover { const pair& new_interval, bool global = false); + // add_interval() can delete an existing interval. ex if ABC are contiguous intervals and B is added to A_C, + // then A gets extended out to the whole range and C is left dangling. Since we're using a vector, + // this requires a full update of everything at the end. 
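// [Editor's walk-through of the comment above; slot numbers are hypothetical.] With intervals
// stored at slot 0 = A and slot 1 = C, adding the bridging interval B first extends A over B,
// then finds C starting at B's end: slot 0 is widened to span A+B+C, slot 1 is "decommissioned"
// by collapsing it to the empty range (path_end, path_front_end), and the node_to_interval
// entries that pointed at slot 1 are redirected to slot 0. The dead slot is why compute()
// calls defragment_intervals() after the fold-up: it copies the non-empty intervals into a
// fresh vector and rebuilds node_to_interval so interval indices are dense again.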
+ void defragment_intervals(); + // get the total coverage of a traversal (sum of step lengths) int64_t get_coverage(const vector& trav, const pair& uncovered_interval); From 286ec9fe1049745abf4009904ea5cfdbfff57cf2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 13 Apr 2024 16:39:12 -0400 Subject: [PATCH 42/47] fix bug in last commit --- src/rgfa.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 12b0bf56e6b..8c8478ff452 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -178,7 +178,10 @@ void RGFACover::compute(const PathHandleGraph* graph, for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) { // the true flag at the end disables the overlap check. since they were computed // in separate threads, snarls can overlap by a single node - add_interval(this->rgfa_intervals, this->node_to_interval, rgfa_intervals_vector[t][j], true); + const pair& interval = rgfa_intervals_vector[t][j]; + if (interval.first != graph->path_end(graph->get_path_handle_of_step(interval.first))) { + add_interval(this->rgfa_intervals, this->node_to_interval, rgfa_intervals_vector[t][j], true); + } } rgfa_intervals_vector[t].clear(); node_to_interval_vector[t].clear(); From 16e3d6186c3c008cbef83164c0cda9358d9b5658 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sun, 21 Apr 2024 15:41:28 -0400 Subject: [PATCH 43/47] treat phase block as subrange to hand converted gbz --- src/rgfa.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index 8c8478ff452..d7f6b71df5e 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -23,7 +23,15 @@ string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, in PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); if (path_subrange == PathMetadata::NO_SUBRANGE) { - path_subrange = make_pair(0, 0); + if (path_phase_block != PathMetadata::NO_PHASE_BLOCK && path_phase_block > 0) { + // gbz puts subranges into phase blocks + // so we interpret them as subranges here, which allows + // things like vg convert graph.gbz | vg paths -x - -f ... to make cover + path_subrange.first = path_phase_block; + path_subrange.second = PathMetadata::NO_END_POSITION; + } else { + path_subrange = make_pair(0, 0); + } } // we move the sample into the locus From f5f211bc886a6c447f3ace43bbdf44297234e209 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 10:27:56 -0400 Subject: [PATCH 44/47] opt to print rgfa stats table --- src/rgfa.cpp | 243 +++++++++++++++++++++++----------- src/rgfa.hpp | 6 + src/subcommand/paths_main.cpp | 29 ++-- 3 files changed, 194 insertions(+), 84 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d7f6b71df5e..a9b409a10aa 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include //#define debug @@ -126,7 +128,8 @@ void RGFACover::compute(const PathHandleGraph* graph, nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); if (node_to_interval.count(node_id)) { cerr << "[rgfa error]: node " << node_id << " covered by two reference paths," - << " including " << graph->get_path_name(ref_path_handle) + << " including " << graph->get_path_name(ref_path_handle) << " and " + << graph->get_path_name(graph->get_path_handle_of_step(rgfa_intervals.at(node_to_interval.at(node_id)).first)) << ". 
rGFA support current requires disjoint acyclic reference paths" << endl; exit(1); } @@ -930,6 +933,24 @@ vector> RGFACover::get_reference_nodes(nid_t node_id, bool return output_reference_nodes; } +pair RGFACover::parse_variant_id(const string& variant_id) const { + + if (variant_id[0] == '>' || variant_id[0] == '<') { + size_t p = variant_id.find_first_of("<>", 2); + if (p != string::npos) { + int64_t id1 = parse(variant_id.substr(1, p-1)); + int64_t id2 = parse(variant_id.substr(p+1)); + if (!graph->has_node(id1) || !graph->has_node(id2)) { + throw runtime_error("Unable to parse " + variant_id + ": node(s) not found in graph"); + } + return make_pair(graph->get_handle(id1, variant_id[0] == '<'), + graph->get_handle(id2, variant_id[p] == '<')); + } + } + throw runtime_error("Failed to parse " + variant_id); + return make_pair(handle_t(), handle_t()); +} + void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { vcflib::Variant var(vcf); @@ -951,12 +972,6 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { } } - // also want an rGFA path lookup - unordered_map name_to_rgfa_path; - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - name_to_rgfa_path[RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle))] = path_handle; - }); - // remove header lines we're going to add vcf.removeInfoHeaderLine("R_CHROM"); vcf.removeInfoHeaderLine("R_START"); @@ -971,80 +986,156 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { os << vcf.header << endl; - string prev_sequence_name; - string prev_r_chrom; - string prev_r_start; - string prev_r_end; - string prev_rank; while (vcf.getNextVariant(var)) { - if (name_to_rgfa_path.count(var.sequenceName)) { - string r_chrom; - string r_start; - string r_end; - string rank; - if (var.sequenceName == prev_sequence_name) { - // just use the previous values, which will be the same - r_chrom = prev_r_chrom; - r_start = prev_r_start; - r_end = prev_r_end; - rank = prev_rank; - } else { - // compute from the cover - path_handle_t path_handle = name_to_rgfa_path.at(var.sequenceName); - nid_t first_node = graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle))); - vector> ref_nodes = this->get_reference_nodes(first_node, false); - - int64_t min_ref_pos = numeric_limits::max(); - int64_t max_ref_pos = -1; - int64_t min_rank = numeric_limits::max(); - for (const pair& rank_node : ref_nodes) { - step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; - path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); - string name = graph->get_path_name(ref_path); - // we assume one reference contig (which is built into the whole structure) - assert(r_chrom.empty() || r_chrom == name); - r_chrom = name; - int64_t ref_pos = node_to_ref_pos.at(rank_node.second); - // assume snarl is forward on both reference nodes - // todo: this won't be exact for some inversion cases, I don't think -- - // need to test these and either add check / or move to oriented search - min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); - max_ref_pos = max(max_ref_pos, ref_pos); - min_rank = min(min_rank, rank_node.first); - } + pair id_handles = parse_variant_id(var.id); + string r_chrom; + string r_start; + string r_end; + string rank; + + // compute from the cover + nid_t first_node = graph->get_id(id_handles.first); + vector> ref_nodes = 
this->get_reference_nodes(first_node, false); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); + min_rank = min(min_rank, rank_node.first); + } - r_start = std::to_string(min_ref_pos); - r_end = std::to_string(max_ref_pos); - rank = std::to_string(min_rank); - } + r_start = std::to_string(min_ref_pos); + r_end = std::to_string(max_ref_pos); + rank = std::to_string(min_rank); + + var.info["R_CHROM"] = {r_chrom}; + var.info["R_START"] = {r_start}; + var.info["R_END"] = {r_end}; + var.info["RANK"] = {rank}; + + os << var << endl; + } +} - if (!var.info.count("R_CHROM")) { - var.format.push_back("R_CHROM"); - } - var.info["R_CHROM"] = {r_chrom}; - if (!var.info.count("R_START")) { - var.format.push_back("R_START"); - } - var.info["R_START"] = {r_start}; - if (!var.info.count("R_END")) { - var.format.push_back("R_END"); - } - var.info["R_END"] = {r_end}; - if (!var.info.count("RANK")) { - var.format.push_back("RANK"); +void RGFACover::print_stats(ostream& os) { + + // todo: mostly copied from annotate function above. 
should probably refactor common + // logic into its own, shared thing.y + unordered_map node_to_ref_pos; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->rgfa_intervals.at(i); + // assumption: ref intervals span entire path + assert(graph->path_begin(graph->get_path_handle_of_step(ref_interval.first)) == ref_interval.first); + int64_t pos = 0; + for (step_handle_t step_handle = ref_interval.first; step_handle != ref_interval.second; + step_handle = graph->get_next_step(step_handle)) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + assert(!graph->get_is_reverse(handle)); + assert(!node_to_ref_pos.count(node_id)); + node_to_ref_pos[node_id] = pos; + pos += graph->get_length(handle); + } + } + + // the header + os << "#Sample" << "\t" + << "Haplotype" << "\t" + << "Locus" << "\t" + << "Start" << "\t" + << "End" << "\t" + << "NodeStart" << "\t" + << "NodeEnd" << "\t" + << "Rank" << "\t" + << "AvgDepth" << "\t" + << "AvgSampleDepth" << "\t" + << "RefSequence" << "\t" + << "RefStart" << "\t" + << "RefEnd" << endl; + + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + + int64_t path_length = 0; + int64_t tot_depth = 0; + int64_t tot_sample_depth = 0; + int64_t tot_steps = 0; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + path_length += graph->get_length(graph->get_handle_of_step(step)); + set sample_set; + vector steps = graph->steps_of_handle(graph->get_handle_of_step(step)); + for (step_handle_t& other_step : steps) { + path_handle_t other_path = graph->get_path_handle_of_step(other_step); + sample_set.insert(graph->get_sample_name(other_path)); } - var.info["RANK"] = {rank}; - - prev_sequence_name = var.sequenceName; - prev_r_chrom = r_chrom; - prev_r_start = r_start; - prev_r_end = r_end; - prev_rank = rank; - - } else { - //cerr << "rGFA [warning]: VCF reference " << var.sequenceName << " not found in graph" << endl; + tot_depth += steps.size(); + tot_sample_depth += sample_set.size(); + // we don't want to count the rgfa cover + // todo (can remove this when move away from path-based scheme) + assert(sample_set.count(RGFACover::rgfa_sample_name)); + --tot_depth; + --tot_sample_depth; + ++tot_steps; } - os << var << endl; + pair sample_locus = RGFACover::parse_rgfa_locus_name(graph->get_locus_name(path_handle)); + size_t haplotype = graph->get_haplotype(path_handle); + + nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + // interval is open ended, so we go back to last node + handle_t last_handle = graph->get_handle_of_step(graph->get_previous_step(interval.second)); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + string r_chrom; + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || 
r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); + min_rank = min(min_rank, rank_node.first); + } + + + os << sample_locus.first << "\t" + << sample_locus.second << "\t" + << (haplotype != PathMetadata::NO_HAPLOTYPE ? (int64_t)haplotype : (int64_t)-1) << "\t" + << rgfa_subrange.first << "\t" + << (rgfa_subrange.first + path_length) << "\t" + << (graph->get_is_reverse(graph->get_handle_of_step(interval.first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(interval.first)) << "\t" + << (graph->get_is_reverse(last_handle) ? "<" : ">") + << graph->get_id(last_handle) << "\t" + << min_rank << "\t" + << std::fixed << std::setprecision(2) << (tot_depth / tot_steps) << "\t" + << std::fixed << std::setprecision(2) << (tot_sample_depth / tot_steps) << "\t" + << Paths::strip_subrange(r_chrom) << "\t" + << min_ref_pos << "\t" + << max_ref_pos << "\n"; } } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index a5f44a74137..db5abdfe850 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -85,8 +85,14 @@ class RGFACover { // return nullptr if node not in an interval const pair* get_interval(nid_t node_id) const; + // parse an id of the form >244>2334 and return the pair of handles + pair parse_variant_id(const string& variant_id) const; + // add R_CHROM, R_START, R_END, F_LEN tags to a VCF using the cover void annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os); + + // print out a table of statistics + void print_stats(ostream& os); protected: diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index b045247463c..e5621f0c287 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -45,6 +45,7 @@ void help_paths(char** argv) { << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover." << endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl + << " --rgfa-intervals print out table of rgfa intervals and their ranks and positions" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -99,6 +100,7 @@ unordered_map SENSE_TO_STRING { // Long options with no corresponding short options. 
const int OPT_SELECT_RGFA_FRAGMENTS = 1000; const int OPT_ANNOTATE_RGFA_VCF = 1001; +const int OPT_RGFA_INTERVALS = 1002; int main_paths(int argc, char** argv) { @@ -116,6 +118,7 @@ int main_paths(int argc, char** argv) { bool retain_paths = false; int64_t rgfa_min_len = -1; string rgfa_vcf_filename; + bool rgfa_print_intervals = false; string snarl_filename; string graph_file; string gbwt_file; @@ -165,6 +168,7 @@ int main_paths(int argc, char** argv) { {"coverage", no_argument, 0, 'c'}, {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, {"rgfa-vcf", required_argument, 0, OPT_ANNOTATE_RGFA_VCF}, + {"rgfa-intervals", no_argument, 0, OPT_RGFA_INTERVALS}, {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. {"threads-by", required_argument, 0, 'q'}, @@ -302,6 +306,11 @@ int main_paths(int argc, char** argv) { rgfa_vcf_filename = optarg; output_formats++; break; + + case OPT_RGFA_INTERVALS: + rgfa_print_intervals = true; + output_formats++; + break; case 't': { @@ -365,7 +374,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf, --rgfa-intervals) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -673,7 +682,7 @@ int main_paths(int argc, char** argv) { // output the graph vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); - } else if (!rgfa_vcf_filename.empty()) { + } else if (!rgfa_vcf_filename.empty() || rgfa_print_intervals) { RGFACover rgfa_cover; // load up the rank-0 reference path selection unordered_set reference_paths; @@ -686,13 +695,17 @@ int main_paths(int argc, char** argv) { cerr << "error[vg paths]: no rGFA cover found in graph. Please compute one with -f before annotating a VCF" << endl; exit(1); } - vcflib::VariantCallFile variant_file; - variant_file.open(rgfa_vcf_filename); - if (!variant_file.is_open()) { - cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; - exit(1); + if (!rgfa_vcf_filename.empty()) { + vcflib::VariantCallFile variant_file; + variant_file.open(rgfa_vcf_filename); + if (!variant_file.is_open()) { + cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; + exit(1); + } + rgfa_cover.annotate_vcf(variant_file, cout); + } else if (rgfa_print_intervals) { + rgfa_cover.print_stats(cout); } - rgfa_cover.annotate_vcf(variant_file, cout); } else if (coverage) { // for every node, count the number of unique paths. 
then add the coverage count to each one // (we're doing the whole graph here, which could be inefficient in the case the user is selecting From 5fa4fbc441c6f341df9f4e96d449424362c934a9 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 13:20:38 -0400 Subject: [PATCH 45/47] can use previous step from path end on gbz --- src/rgfa.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index a9b409a10aa..d0b47cd4309 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1098,7 +1098,14 @@ void RGFACover::print_stats(ostream& os) { nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); vector> ref_nodes = this->get_reference_nodes(first_node, false); // interval is open ended, so we go back to last node - handle_t last_handle = graph->get_handle_of_step(graph->get_previous_step(interval.second)); + step_handle_t last_step; + if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.second))) { + // can't get previous step of end in gbz, so we treat as special case here + last_step = graph->path_back(graph->get_path_handle_of_step(interval.second)); + } else { + last_step = graph->get_previous_step(interval.second); + } + handle_t last_handle = graph->get_handle_of_step(last_step); int64_t min_ref_pos = numeric_limits::max(); int64_t max_ref_pos = -1; From dd55ec7fc392d9119a475ca816e4213118252f4a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 14:12:01 -0400 Subject: [PATCH 46/47] dont crash with multiple references --- src/rgfa.cpp | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d0b47cd4309..d6a388afbf4 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1099,9 +1099,9 @@ void RGFACover::print_stats(ostream& os) { vector> ref_nodes = this->get_reference_nodes(first_node, false); // interval is open ended, so we go back to last node step_handle_t last_step; - if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.second))) { + if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.first))) { // can't get previous step of end in gbz, so we treat as special case here - last_step = graph->path_back(graph->get_path_handle_of_step(interval.second)); + last_step = graph->path_back(graph->get_path_handle_of_step(interval.first)); } else { last_step = graph->get_previous_step(interval.second); } @@ -1110,23 +1110,32 @@ void RGFACover::print_stats(ostream& os) { int64_t min_ref_pos = numeric_limits::max(); int64_t max_ref_pos = -1; int64_t min_rank = numeric_limits::max(); - string r_chrom; + string r_chrom; + map> chrom_pos; for (const pair& rank_node : ref_nodes) { step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); string name = graph->get_path_name(ref_path); // we assume one reference contig (which is built into the whole structure) assert(r_chrom.empty() || r_chrom == name); - r_chrom = name; int64_t ref_pos = node_to_ref_pos.at(rank_node.second); - // assume snarl is forward on both reference nodes - // todo: this won't be exact for some inversion cases, I don't think -- - // need to test these and either add check / or move to oriented search - min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); - max_ref_pos = max(max_ref_pos, ref_pos); - min_rank = min(min_rank, rank_node.first); 
- } - + int64_t last_len = (int64_t)graph->get_length(graph->get_handle(rank_node.second)); + if (chrom_pos.count(name)) { + tuple& pos = chrom_pos[name]; + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + pos = make_tuple(min(get<0>(pos), ref_pos + last_len), max(get<1>(pos), ref_pos), min(get<2>(pos), rank_node.first)); + } else { + chrom_pos[name] = make_tuple(ref_pos + last_len, ref_pos, rank_node.first); + } + if (rank_node.first < min_rank) { + // todo: is there a better heuristic to use when choosing a reference? + // also: need to fix vcf annotate maybe to just list them all... + min_rank = rank_node.first; + r_chrom = name; + } + } os << sample_locus.first << "\t" << sample_locus.second << "\t" @@ -1141,8 +1150,8 @@ void RGFACover::print_stats(ostream& os) { << std::fixed << std::setprecision(2) << (tot_depth / tot_steps) << "\t" << std::fixed << std::setprecision(2) << (tot_sample_depth / tot_steps) << "\t" << Paths::strip_subrange(r_chrom) << "\t" - << min_ref_pos << "\t" - << max_ref_pos << "\n"; + << get<0>(chrom_pos[r_chrom]) << "\t" + << get<1>(chrom_pos[r_chrom]) << "\n"; } } From 42bfaa1f0252042a440d2e083087bc332b5d1497 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 14:18:46 -0400 Subject: [PATCH 47/47] dont crash with multiple references --- src/rgfa.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d6a388afbf4..4af858a9d02 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -1117,7 +1117,6 @@ void RGFACover::print_stats(ostream& os) { path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); string name = graph->get_path_name(ref_path); // we assume one reference contig (which is built into the whole structure) - assert(r_chrom.empty() || r_chrom == name); int64_t ref_pos = node_to_ref_pos.at(rank_node.second); int64_t last_len = (int64_t)graph->get_length(graph->get_handle(rank_node.second)); if (chrom_pos.count(name)) {
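
The last two patches replace the single r_chrom/min_ref_pos/max_ref_pos variables with a per-contig map so that snarls anchored to more than one reference no longer trip the old single-contig assert. The following is a minimal standalone sketch of that aggregation idea, written independently of vg's handlegraph API; the names RefObs and summarize_ref_span are illustrative only and do not exist in the codebase. Per contig it keeps the smallest start (offset just past the left boundary node), the largest end offset, and the smallest rank, then reports the contig with the lowest rank, mirroring the chrom_pos map above.

// Standalone sketch (assumed names, not vg's API) of the per-reference
// aggregation used for the rGFA stats table when multiple references exist.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <vector>

struct RefObs {
    std::string chrom;   // reference path (contig) the boundary node lies on
    int64_t rank;        // rGFA rank of the interval relative to this reference
    int64_t ref_pos;     // offset of the boundary node on the reference
    int64_t node_len;    // length of the boundary node
};

// returns (chrom, start, end, rank) for the lowest-rank reference contig seen
static std::tuple<std::string, int64_t, int64_t, int64_t>
summarize_ref_span(const std::vector<RefObs>& observations) {
    assert(!observations.empty());
    // per-contig (min start, max end, min rank), as in the chrom_pos map
    std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> chrom_pos;
    for (const RefObs& obs : observations) {
        // the covered span begins just after the left boundary node
        int64_t start = obs.ref_pos + obs.node_len;
        auto it = chrom_pos.find(obs.chrom);
        if (it == chrom_pos.end()) {
            chrom_pos[obs.chrom] = std::make_tuple(start, obs.ref_pos, obs.rank);
        } else {
            auto& pos = it->second;
            pos = std::make_tuple(std::min(std::get<0>(pos), start),
                                  std::max(std::get<1>(pos), obs.ref_pos),
                                  std::min(std::get<2>(pos), obs.rank));
        }
    }
    // pick the contig with the lowest rank as the reported reference
    auto best = chrom_pos.begin();
    for (auto it = chrom_pos.begin(); it != chrom_pos.end(); ++it) {
        if (std::get<2>(it->second) < std::get<2>(best->second)) {
            best = it;
        }
    }
    return std::make_tuple(best->first,
                           std::get<0>(best->second),
                           std::get<1>(best->second),
                           std::get<2>(best->second));
}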