Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/rgfa' into rgfa
Browse files Browse the repository at this point in the history
  • Loading branch information
glennhickey committed May 10, 2023
2 parents 2b38d5a + 86df147 commit c9553e0
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 0 deletions.
7 changes: 7 additions & 0 deletions src/gfa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,10 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph,
SnarlManager* snarl_manager,
const unordered_set<path_handle_t>& reference_paths,
int64_t minimum_length,
<<<<<<< HEAD
const string& rgfa_sample_name,
=======
>>>>>>> origin/rgfa
const unordered_map<string, vector<pair<int64_t, int64_t>>>& preferred_intervals){

// for sanity's sake, we don't want to ever support multiple rgfa covers, so start by
Expand Down Expand Up @@ -499,7 +502,11 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph,
subrange_t rgfa_frag_subrange = graph->get_subrange(path_handle);
rgfa_frag_subrange.first = rgfa_frag_pos + (rgfa_frag_subrange != PathMetadata::NO_SUBRANGE ? rgfa_frag_subrange.first : 0);
rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length;
<<<<<<< HEAD
string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange, rgfa_sample_name);
=======
string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange);
>>>>>>> origin/rgfa

#ifdef debug
#pragma omp critical(cerr)
Expand Down
69 changes: 69 additions & 0 deletions src/gfa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,75 @@ void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph,
unordered_map<const string&, vector<pair<int64_t, int64_t>>> extract_rgfa_intervals(const string& rgfa_path);



/// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped
/// or adapted into libhandlegraph at some point, ideally.

/// It works by using a special sample name (default=_rGFA_) for rGFA contigs.
/// Any real sample name gets pushed into the locus field behind its rGFA tag SN:Z:<name>
/// The rGFA rank also goes in the locus field behind SR:i:<rank>

/// In GFA, these paths live in rGFA tags on S elements
/// In the graph, they are reference paths with SN/SR fields in their locus names.
/// As it stands, they will come out as W-lines in GFA with vg view or vg convert (without -Q/-P)

/// Note that rank-0 rGFA fragments (aka normal reference paths) do *not* get the rGFA
/// sample, and are treated as normal reference paths all the way through (but can get rGFA tags)
/// when specified with -Q/-P in convert -f.

/// Returns the RGFA rank (SR) of a path. This will be 0 for the reference
/// backbone, and higher the further number of (nested) bubbles away it is.
/// If the path is not an RGFA path, then return -1
int get_rgfa_rank(const string& path_name, const string& rgfa_sample="_rGFA_");

/// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and
/// moving its old sample into the locus field
string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange,
const string& rgfa_sample="_rGFA_");

/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank
string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample="_rGFA_");

/// Compute the rGFA path cover
/// graph: the graph
/// snarl_manager: the snarls (todo: should use distance index)
/// reference_paths: rank-0 paths
/// minimum_length: the minimum length of a path to create (alleles shorter than this can be uncovered)
/// preferred_intervals: set of ranges (ex from minigraph) to use as possible for rGFA paths
void rgfa_graph_cover(MutablePathMutableHandleGraph* graph,
SnarlManager* snarl_manager,
const unordered_set<path_handle_t>& reference_paths,
int64_t minimum_length,
const unordered_map<string, vector<pair<int64_t, int64_t>>>& preferred_intervals = {});

void rgfa_snarl_cover(const PathHandleGraph* graph,
const Snarl& snarl,
PathTraversalFinder& path_trav_finder,
const unordered_set<path_handle_t>& reference_paths,
int64_t minimum_length,
vector<pair<int64_t, vector<step_handle_t>>>& cover_fragments,
unordered_map<nid_t, int64_t>& cover_node_to_fragment,
const unordered_map<string, vector<pair<int64_t, int64_t>>>& preferred_intervals);

/// Get some statistics from a traversal fragment that we use for ranking in the greedy algorithm
/// 1. Coverage : total step length across the traversal for all paths
/// 2. Switches : the number of nodes that would need to be flipped to forwardize the traversal
/// 3. Duplicated bases : the number of duplicated bases in the traversal path
tuple<int64_t, int64_t, int64_t> rgfa_traversal_stats(const PathHandleGraph* graph,
const vector<step_handle_t>& trav,
const pair<int64_t, int64_t>& trav_fragment);

/// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals
bool rgfa_traversal_stats_less(const tuple<int64_t, int64_t, int64_t>& s1, const tuple<int64_t, int64_t, int64_t>& s2);

/// Make sure all rgfa paths are forwardized
void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph,
const unordered_set<path_handle_t>& reference_paths);

/// Extract rGFA tags from minigraph GFA in order to pass in as hints above
unordered_map<const string&, vector<pair<int64_t, int64_t>>> extract_rgfa_intervals(const string& rgfa_path);


}

#endif
18 changes: 18 additions & 0 deletions src/subcommand/paths_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,14 @@ void help_paths(char** argv) {
<< " -r, --retain-paths output a graph with only the selected paths retained" << endl
<< " rGFA cover" << endl
<< " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl
<<<<<<< HEAD
<< " -N, --rgfa-sample STR give all rGFA cover fragments sample name (path prefix) STR (default: _rGFA_)." << endl
<< " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl
<< " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl
=======
<< " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl
<< " -t, --threads N use up to N threads when computing rGFA covoer (default: all available)" << endl
>>>>>>> origin/rgfa
<< " output path data:" << endl
<< " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl
<< " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl
Expand Down Expand Up @@ -108,7 +113,10 @@ int main_paths(int argc, char** argv) {
bool drop_paths = false;
bool retain_paths = false;
int64_t rgfa_min_len = -1;
<<<<<<< HEAD
string rgfa_sample_name = "_rGFA_";
=======
>>>>>>> origin/rgfa
string snarl_filename;
string graph_file;
string gbwt_file;
Expand Down Expand Up @@ -143,7 +151,10 @@ int main_paths(int argc, char** argv) {
{"drop-paths", no_argument, 0, 'd'},
{"retain-paths", no_argument, 0, 'r'},
{"rgfa-cover", required_argument, 0, 'R'},
<<<<<<< HEAD
{"rgfa-sample", required_argument, 0, 'N'},
=======
>>>>>>> origin/rgfa
{"snarls", required_argument, 0, 's'},
{"extract-gam", no_argument, 0, 'X'},
{"extract-gaf", no_argument, 0, 'A'},
Expand All @@ -166,7 +177,11 @@ int main_paths(int argc, char** argv) {
};

int option_index = 0;
<<<<<<< HEAD
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:N:s:aGp:ct:",
=======
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGp:ct:",
>>>>>>> origin/rgfa
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -207,10 +222,13 @@ int main_paths(int argc, char** argv) {
output_formats++;
break;

<<<<<<< HEAD
case 'N':
rgfa_sample_name = optarg;
break;

=======
>>>>>>> origin/rgfa
case 's':
snarl_filename = optarg;
break;
Expand Down

0 comments on commit c9553e0

Please sign in to comment.