Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Improve off-reference coordinate and rGFA support #3891

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,039 changes: 1,031 additions & 8 deletions src/gfa.cpp

Large diffs are not rendered by default.

112 changes: 112 additions & 0 deletions src/gfa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
*/

#include "handle.hpp"
#include "snarls.hpp"
#include "traversal_finder.hpp"

namespace vg {

Expand All @@ -26,6 +28,116 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out,
bool rgfa_pline = false,
bool use_w_lines = true);


/// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped
/// or adapted into libhandlegraph at some point, ideally.

/// It works by using a special sample name (default=_rGFA_) for rGFA contigs.
/// Any real sample name gets pushed into the locus field behind its rGFA tag SN:Z:<name>
/// The rGFA rank also goes in the locus field behind SR:i:<rank>

/// In GFA, these paths live in rGFA tags on S elements
/// In the graph, they are reference paths with SN/SR fields in their locus names.
/// As it stands, they will come out as W-lines in GFA with vg view or vg convert (without -Q/-P)

/// Note that rank-0 rGFA fragments (aka normal reference paths) do *not* get the rGFA
/// sample, and are treated as normal reference paths all the way through (but can get rGFA tags)
/// when specified with -Q/-P in convert -f.

/// Returns the RGFA rank (SR) of a path. This will be 0 for the reference
/// backbone, and higher the further number of (nested) bubbles away it is.
/// If the path is not an RGFA path, then return -1
int get_rgfa_rank(const string& path_name);

/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank
string strip_rgfa_path_name(const string& path_name);

/// Clamp a path to given subrange (taking into account an existing subrange)
void clamp_path_subrange(string& path_name, int64_t start, int64_t end);

/// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and
/// moving its old sample into the locus field
string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange,
const string& rgfa_sample,
const string& start_parent_path_name = "",
int64_t start_parent_offset = -1,
int64_t start_parent_node_id = -1,
bool start_parent_node_reversed = false,
const string& end_parent_path_name = "",
int64_t end_parent_offset = -1,
int64_t end_parent_node_id = -1,
bool end_parent_node_reversed = false);


/// Parse (and remove) all rgfa-specific information, copying it into the various input paramters
/// The rgfa-information-free vg path name is returned (ie the path_name input to carete_rgfa_path_name)
string parse_rgfa_path_name(const string& path_name, int* rgfa_rank = nullptr,
string* rgfa_sample = nullptr,
string* start_parent_rgfa_path_name = nullptr,
int64_t* start_parent_offset = nullptr,
int64_t* start_parent_node_id = nullptr,
bool* start_parent_node_reversed = nullptr,
string* end_parent_rgfa_path_name = nullptr,
int64_t* end_parent_offset = nullptr,
int64_t* end_parent_node_id = nullptr,
bool* end_parent_node_reversed = nullptr);

/// Like above, but put the rgfa stuff into tags
string parse_rgfa_name_into_tags(const string& path_name, string& rgfa_tags);


/// Compute the rGFA path cover
/// graph: the graph
/// snarl_manager: the snarls (todo: should use distance index)
/// reference_paths: rank-0 paths
/// minimum_length: the minimum length of a path to create (alleles shorter than this can be uncovered)
/// preferred_intervals: set of ranges (ex from minigraph) to use as possible for rGFA paths
void rgfa_graph_cover(MutablePathMutableHandleGraph* graph,
SnarlManager* snarl_manager,
const unordered_set<path_handle_t>& reference_paths,
int64_t minimum_length,
const string& rgfa_sample_name,
const unordered_map<string, vector<pair<int64_t, int64_t>>>& preferred_intervals = {});

/// Some information about an RGFA fragment we pass around and index below
struct RGFAFragment {
int64_t rank;
vector<step_handle_t> steps;
int64_t length;
int64_t start_parent_idx;
int64_t end_parent_idx;
step_handle_t start_parent_step;
step_handle_t end_parent_step;
};

void rgfa_snarl_cover(const PathHandleGraph* graph,
const Snarl& snarl,
PathTraversalFinder& path_trav_finder,
const unordered_set<path_handle_t>& reference_paths,
int64_t minimum_length,
vector<RGFAFragment>& cover_fragments,
unordered_map<nid_t, int64_t>& cover_node_to_fragment,
const unordered_map<string, vector<pair<int64_t, int64_t>>>& preferred_intervals);

/// Get some statistics from a traversal fragment that we use for ranking in the greedy algorithm
/// 1. Coverage : total step length across the traversal for all paths
/// 2. Switches : the number of nodes that would need to be flipped to forwardize the traversal
/// 3. Duplicated bases : the number of duplicated bases in the traversal path
tuple<int64_t, int64_t, int64_t> rgfa_traversal_stats(const PathHandleGraph* graph,
const vector<step_handle_t>& trav,
const pair<int64_t, int64_t>& trav_fragment);

/// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals
bool rgfa_traversal_stats_less(const tuple<int64_t, int64_t, int64_t>& s1, const tuple<int64_t, int64_t, int64_t>& s2);

/// Make sure all rgfa paths are forwardized
void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph,
const unordered_set<path_handle_t>& reference_paths);

/// Extract rGFA tags from minigraph GFA in order to pass in as hints above
unordered_map<const string&, vector<pair<int64_t, int64_t>>> extract_rgfa_intervals(const string& rgfa_path);


}

#endif
27 changes: 19 additions & 8 deletions src/hts_alignment_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "algorithms/find_translation.hpp"
#include <vg/io/hfile_cppstream.hpp>
#include <vg/io/stream.hpp>
#include "gfa.hpp"

#include <sstream>

Expand Down Expand Up @@ -178,7 +179,10 @@ pair<vector<pair<string, int64_t>>, unordered_map<string, int64_t>> extract_path
auto& path = get<0>(path_info);
auto& own_length = get<1>(path_info);
auto& base_length = get<2>(path_info);
string base_path_name = subpath_support ? get_path_base_name(graph, path) : graph.get_path_name(path);
string base_path_name = graph.get_path_name(path);
if (subpath_support && get_rgfa_rank(base_path_name) < 0) {
base_path_name = get_path_base_name(graph, path);
}
if (!base_path_set.count(base_path_name)) {
path_names_and_lengths.push_back(make_pair(base_path_name, base_length));
base_path_set.insert(base_path_name);
Expand Down Expand Up @@ -355,9 +359,14 @@ vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const strin

// When we find a path or subpath we want, we will keep it.
auto keep_path_or_subpath = [&](const path_handle_t& path) {
string base_name = get_path_base_name(graph, path);
string base_name = graph.get_path_name(path);
bool is_rgfa = get_rgfa_rank(base_name) >= 0;
if (!is_rgfa) {
// only process subpaths if not rgfa
base_name = get_path_base_name(graph, path);
}
int64_t base_length = -1;
if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) {
if (is_rgfa || graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) {
// This is a full path so we can determine base length now.
base_length = graph.get_path_length(path);
}
Expand Down Expand Up @@ -691,11 +700,13 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector<pair<in
pos_rev = aln.refpos(0).is_reverse();
cigar = cigar_against_path(aln, pos_rev, pos, path_len, 0);

// Resolve subpath naming / offset
subrange_t subrange;
path_name = Paths::strip_subrange(path_name, &subrange);
if (subrange != PathMetadata::NO_SUBRANGE) {
pos += subrange.first;
if (get_rgfa_rank(path_name) < 0) {
// Resolve subpath naming / offset (if not rGFA)
subrange_t subrange;
path_name = Paths::strip_subrange(path_name, &subrange);
if (subrange != PathMetadata::NO_SUBRANGE) {
pos += subrange.first;
}
}
}

Expand Down
8 changes: 7 additions & 1 deletion src/subcommand/convert_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,13 @@ int main_convert(int argc, char** argv) {
continue;
}
}
// Tagged (rank>0) rgfa paths get written if a base path specified
if (get_rgfa_rank(path_name) > 0) {
assert(!rgfa_paths.count(path_name));
rgfa_paths.insert(path_name);
}
});

}
graph_to_gfa(graph_to_write, std::cout, rgfa_paths, rgfa_pline, wline);
} else {
Expand Down Expand Up @@ -468,7 +474,7 @@ void help_convert(char** argv) {
<< "gfa output options (use with -f):" << endl
<< " -P, --rgfa-path STR write given path as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl
<< " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl
<< " -B, --rgfa-pline paths written as rGFA tags also written as lines" << endl
<< " -B, --rgfa-pline paths written as rank 0 rGFA tags also written as lines" << endl
<< " -W, --no-wline write all paths as GFA P-lines instead of W-lines. Allows handling multiple phase blocks and subranges used together." << endl
<< " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. Not compatible with other GBWT output options or non-GBWT graphs." << endl
<< " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, but can't preserve original GFA coordinates." << endl
Expand Down
Loading