More complete rGFA support (experimental) #4113

Open
Wants to merge 48 commits into base: master

Commits (48)
5c90455
start wiring in new rgfa interface
glennhickey Sep 8, 2023
a8dcd2b
start wiring in cli interface
glennhickey Sep 10, 2023
a2261b4
add rgfa options to save_graph interface
glennhickey Sep 19, 2023
60f8351
open intervals
glennhickey Sep 20, 2023
6dd3856
clean a few cases
glennhickey Sep 20, 2023
860660b
rgfa tests
glennhickey Sep 20, 2023
37945cb
Add tag test
glennhickey Sep 21, 2023
f3642ee
add rgfa support to deconstruct
glennhickey Sep 21, 2023
f816072
fix up some edge cases
glennhickey Sep 22, 2023
7c86b03
allow multiple -S in call
glennhickey Sep 22, 2023
e0e3cc2
rgfa call test
glennhickey Sep 22, 2023
b6b955f
speed up rgfa path application
glennhickey Sep 23, 2023
e02e8a2
sample option for surject path selection, some rgfa support
glennhickey Sep 23, 2023
17f3f57
fix bugs in load() and get_rank()
glennhickey Sep 24, 2023
4d65d74
ignore rgfa path fragments in depth and clip coverage calcs
glennhickey Sep 25, 2023
b46eaaa
use (by default) fragment not original paths in rgfa index
glennhickey Sep 26, 2023
1942dc6
break ties with name when choosing cover
glennhickey Sep 26, 2023
e539734
work around step iterator issue in gbz
glennhickey Sep 29, 2023
2a5ffad
get rgfa tests passing; add back end range where possible
glennhickey Sep 29, 2023
c4bca66
fix option collision from last merge
glennhickey Oct 5, 2023
c8419ab
rGFA selection option
glennhickey Oct 5, 2023
22974f7
leave subrange in surjected-to rgfa fragments
glennhickey Oct 5, 2023
98a6751
keep rgfa subpaths in surject
glennhickey Oct 6, 2023
2ba1583
revert rgfa name processing in paths listing -- too confusing for now
glennhickey Oct 6, 2023
07aa374
clean rgfa names in fasta output
glennhickey Oct 6, 2023
65a5d7f
fair enough, clang
glennhickey Oct 6, 2023
182cf39
fix rgfa tests post merge
glennhickey Oct 6, 2023
1d07f4c
start rgfa VCF annotator
glennhickey Oct 10, 2023
b8cc937
Fix assertion bug
glennhickey Oct 11, 2023
220d33d
better rgfa support in call
glennhickey Oct 19, 2023
f23e65a
fix merge re -S option
glennhickey Nov 21, 2023
6ebd95f
merge contiguous rgfa fragments
glennhickey Mar 27, 2024
9680149
disable rank check
glennhickey Mar 28, 2024
89af9b6
auto-add rgfa prefix during rgfa conversion
glennhickey Apr 5, 2024
7ce91eb
merge adjacent intervals from different threads
glennhickey Apr 5, 2024
ff71cd8
Merge remote-tracking branch 'origin/master' into rgfa2
glennhickey Apr 10, 2024
0fa7487
fix merge
glennhickey Apr 10, 2024
1f5e928
fix bug where cyclic paths mishandled during interval merging
glennhickey Apr 11, 2024
56dc5c2
Revert "auto-add rgfa prefix during rgfa conversion"
glennhickey Apr 12, 2024
54712e1
fix debug message
glennhickey Apr 12, 2024
69fc397
fix debug msg
glennhickey Apr 12, 2024
3093eff
fix bug where double-ended merge would lead to invalid overlap
glennhickey Apr 13, 2024
286ec9f
fix bug in last commit
glennhickey Apr 13, 2024
16e3d61
treat phase block as subrange to hand converted gbz
glennhickey Apr 21, 2024
f5f211b
opt to print rgfa stats table
glennhickey May 1, 2024
5fa4fbc
can use previous step from path end on gbz
glennhickey May 1, 2024
dd55ec7
dont crash with multiple references
glennhickey May 1, 2024
42bfaa1
dont crash with multiple references
glennhickey May 1, 2024
26 changes: 19 additions & 7 deletions src/algorithms/coverage_depth.cpp
@@ -295,7 +295,8 @@ pair<double, double> sample_gam_depth(const HandleGraph& graph, const vector<Ali
return combine_and_average_node_coverages(graph, node_coverages, min_coverage);
}

void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream) {
void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles,
const unordered_set<path_handle_t>& ignore_paths, ostream& out_stream) {
assert(graph.has_path(path_name));

path_handle_t path_handle = graph.get_path_handle(path_name);
@@ -311,10 +312,14 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m
size_t step_count = 0;
handle_t handle = graph.get_handle_of_step(step_handle);
graph.for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) {
path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2);
if (ignore_paths.count(step_path_handle)) {
return true;
}
if (count_cycles) {
++step_count;
} else {
path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2);

auto it = path_to_name.find(step_path_handle);
if (it == path_to_name.end()) {
string step_path_name = graph.get_path_name(step_path_handle);
@@ -323,6 +328,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m
}
path_set.insert(it->second);
}
return true;
});
size_t coverage = (count_cycles ? step_count : path_set.size()) - 1;
size_t node_len = graph.get_length(handle);
@@ -339,7 +345,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m

pair<double, double> path_depth_of_bin(const PathHandleGraph& graph,
step_handle_t start_step, step_handle_t end_plus_one_step,
size_t min_coverage, bool count_cycles) {
size_t min_coverage, bool count_cycles, const unordered_set<path_handle_t>& ignore_paths) {

// compute the mean and variance of our base coverage across the bin
size_t bin_length = 0;
@@ -357,18 +363,22 @@ pair<double, double> path_depth_of_bin(const PathHandleGraph& graph,
unordered_set<string> path_set;
size_t step_count = 0;
graph.for_each_step_on_handle(cur_handle, [&](step_handle_t step_handle) {
path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle);
if (ignore_paths.count(step_path_handle)) {
return true;
}
if (count_cycles) {
++step_count;
} else {
path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle);
auto it = path_to_name.find(step_path_handle);
if (it == path_to_name.end()) {
string step_path_name = graph.get_path_name(step_path_handle);
// disregard subpath tags when counting
it = path_to_name.insert(make_pair(step_path_handle, Paths::strip_subrange(step_path_name))).first;
}
path_set.insert(it->second);
}
}
return true;
});
size_t coverage = (count_cycles ? step_count : path_set.size()) - 1;

@@ -386,7 +396,8 @@ vector<tuple<size_t, size_t, double, double>> binned_path_depth(const PathHandle
const string& path_name,
size_t bin_size,
size_t min_coverage,
bool count_cycles) {
bool count_cycles,
const unordered_set<path_handle_t>& ignore_paths) {

path_handle_t path_handle = graph.get_path_handle(path_name);

@@ -414,7 +425,8 @@ vector<tuple<size_t, size_t, double, double>> binned_path_depth(const PathHandle
step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step;
size_t bin_start = bins[i].first;
size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset;
pair<double, double> coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles);
pair<double, double> coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles,
ignore_paths);
binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second);
}

10 changes: 6 additions & 4 deletions src/algorithms/coverage_depth.hpp
@@ -67,14 +67,16 @@ pair<double, double> sample_mapping_depth(const HandleGraph& graph, const vector
/// print path-name offset base-coverage for every base on a path (just like samtools depth)
/// ignoring things below min_coverage. offsets are 1-based in output stream
/// coverage here is the number of steps from (unique) other paths
void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream);
void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles,
const unordered_set<path_handle_t>& ignore_paths, ostream& out_stream);

/// like packed_depth_of_bin (above), but use paths (as in path_depths) for measuring coverage
pair<double, double> path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step,
size_t min_coverage, bool count_cycles);
size_t min_coverage, bool count_cycles, const unordered_set<path_handle_t>& ignore_paths);

vector<tuple<size_t, size_t, double, double>> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size,
size_t min_coverage, bool count_cycles);
vector<tuple<size_t, size_t, double, double>> binned_path_depth(const PathHandleGraph& graph, const string& path_name,
size_t bin_size, size_t min_coverage, bool count_cycles,
const unordered_set<path_handle_t>& ignore_paths);

}
}
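Below is a minimal caller sketch for the new path_depths() signature declared above. It is not part of this diff: the function name report_depth_ignoring_rgfa, the is_rgfa_fragment predicate, and the include path are placeholders, and the vg::algorithms namespace is assumed from the header's layout.

    // Hypothetical usage of the updated path_depths(): build an ignore set of
    // rGFA fragment paths, then print per-base depth along a reference path.
    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include "algorithms/coverage_depth.hpp"

    void report_depth_ignoring_rgfa(const vg::PathHandleGraph& graph,
                                    const std::string& ref_path,
                                    const std::function<bool(const std::string&)>& is_rgfa_fragment) {
        std::unordered_set<vg::path_handle_t> ignore_paths;
        graph.for_each_path_handle([&](const vg::path_handle_t& p) {
            // Placeholder test: decide from the path name whether this is an
            // rGFA cover fragment that should not count toward depth.
            if (is_rgfa_fragment(graph.get_path_name(p))) {
                ignore_paths.insert(p);
            }
        });
        // New parameter from this PR: steps on paths in ignore_paths are skipped.
        vg::algorithms::path_depths(graph, ref_path, /*min_coverage=*/1,
                                    /*count_cycles=*/false, ignore_paths, std::cout);
    }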
37 changes: 23 additions & 14 deletions src/algorithms/gfa_to_handle.cpp
@@ -1,6 +1,6 @@
#include "gfa_to_handle.hpp"
#include "../path.hpp"

#include "../rgfa.hpp"
#include <gbwtgraph/utils.h>

namespace vg {
@@ -291,7 +291,6 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph*
}
if (found == rgfa_cache->end()) {
// Need to make a new path, possibly with subrange start info.

std::pair<int64_t, int64_t> subrange;
if (offset == 0) {
// Don't send a subrange
Expand All @@ -300,14 +299,22 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph*
// Start later than 0
subrange = std::pair<int64_t, int64_t>(offset, PathMetadata::NO_END_POSITION);
}

string rgfa_path_name;
if (path_rank > 0) {
// Special logic for off-reference paths, which get loaded into special rGFA cover paths
rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false);
} else {
// TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense.
rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC,
PathMetadata::NO_SAMPLE_NAME,
path_name,
PathMetadata::NO_HAPLOTYPE,
PathMetadata::NO_PHASE_BLOCK,
subrange);
}
path_handle_t path = graph->create_path_handle(rgfa_path_name);

// TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense.
path_handle_t path = graph->create_path(PathSense::GENERIC,
PathMetadata::NO_SAMPLE_NAME,
path_name,
PathMetadata::NO_HAPLOTYPE,
PathMetadata::NO_PHASE_BLOCK,
subrange);
// Then cache it
found = rgfa_cache->emplace_hint(found, path_name, std::make_pair(path, offset));
}
@@ -667,10 +674,6 @@ tuple<string, size_t, string, pair<int64_t, int64_t>, GFAParser::chars_t, GFAPar

// Grab the haplotype number
int64_t haplotype_number = take_number(cursor, end, -1, "parsing haplotype number");
if (haplotype_number == -1) {
// This field is required
throw GFAFormatError("Missing haplotype number in W line", cursor);
}
take_tab(cursor, end, "parsing end of haplotype number");

// Grab the sequence/contig/locus name
@@ -1041,7 +1044,13 @@ void GFAParser::parse(istream& in) {
} else {
// This path existed already. Make sure we aren't showing a conflicting rank
if (rgfa_path_rank != get<0>(found->second)) {
throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second)));
// vg paths can now write different fragments of the same sample/hap with different ranks
// (unlike minigraph where all fragments of the same assembly share a rank)
// as a consequence, this check needs to be disabled.
// TODO: we still want to check that each individual fragment has consistent ranks
// so the check should be moved downstream
//
// throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second)));
}
}
auto& visit_queue = get<2>(found->second);
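A possible shape for the downstream per-fragment rank check mentioned in the TODO above; this is a sketch only, not code from the PR, and ranks_by_fragment is assumed to be collected while S-lines are parsed.

    // Sketch of a post-parse check: every individual rGFA fragment should still
    // have been annotated with a single SR rank, even though different fragments
    // of the same sample/haplotype may now carry different ranks.
    #include <cstdint>
    #include <map>
    #include <set>
    #include <stdexcept>
    #include <string>

    void check_fragment_ranks(const std::map<std::string, std::set<int64_t>>& ranks_by_fragment) {
        for (const auto& entry : ranks_by_fragment) {
            if (entry.second.size() > 1) {
                // Plays the role of the GFAFormatError that the inline check used to throw.
                throw std::runtime_error("rGFA fragment " + entry.first + " has conflicting SR ranks");
            }
        }
    }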
21 changes: 15 additions & 6 deletions src/clip.cpp
@@ -481,7 +481,8 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph
function<void(function<void(handle_t, const Region*)>)> iterate_handles,
function<void(function<void(edge_t, const Region*)>)> iterate_edges,
int64_t min_depth, const vector<string>& ref_prefixes,
int64_t min_fragment_len, bool verbose) {
int64_t min_fragment_len, const unordered_set<path_handle_t>& ignore_paths,
bool verbose) {

// find all nodes in the snarl that are not on the reference interval (reference path name from containing interval)
unordered_set<nid_t> to_delete;
@@ -502,14 +503,18 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph
bool on_ref = false;
size_t depth = 0;
graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) {
path_handle_t path_handle = graph->get_path_handle_of_step(step_handle);
if (ignore_paths.count(path_handle)) {
return true;
}
++depth;
if (depth > min_depth || on_ref) {
return false;
}
if (!ref_prefixes.empty() || region) {
// if we have a region, do exact comparison to it.
// otherwise, do a prefix check against ref_prefix
string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle));
string path_name = graph->get_path_name(path_handle);
if ((region && region->seq == path_name) || (!region && check_prefixes(path_name))) {
on_ref = true;
return false;
@@ -540,6 +545,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph
edge_depths.resize(edge_count + 1);

graph->for_each_path_handle([&](path_handle_t path_handle) {
if (!ignore_paths.count(path_handle)) {
bool is_ref_path = check_prefixes(graph->get_path_name(path_handle));
handle_t prev_handle;
bool first = true;
Expand All @@ -563,6 +569,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph
}
prev_handle = handle;
});
}
});

unordered_set<edge_t> edges_to_delete;
@@ -622,7 +629,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph
}

void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector<string>& ref_prefixes,
int64_t min_fragment_len, bool verbose) {
int64_t min_fragment_len, const unordered_set<path_handle_t>& ignore_paths, bool verbose) {

function<void(function<void(handle_t, const Region*)>)> iterate_handles = [&] (function<void(handle_t, const Region*)> visit_handle) {
graph->for_each_handle([&](handle_t handle) {
@@ -636,11 +643,13 @@ void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_
});
};

clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, verbose);
clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len,
ignore_paths, verbose);
}

void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector<Region>& regions,
SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose) {
SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len,
const unordered_set<path_handle_t>& ignore_paths, bool verbose) {

function<void(function<void(handle_t, const Region*)>)> iterate_handles = [&] (function<void(handle_t, const Region*)> visit_handle) {

@@ -678,7 +687,7 @@ void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* gra
}
vector<string> ref_paths_from_regions(ref_path_set.begin(), ref_path_set.end());

clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, verbose);
clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, ignore_paths, verbose);

}

11 changes: 7 additions & 4 deletions src/clip.hpp
@@ -64,19 +64,22 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph
function<void(function<void(handle_t, const Region*)>)> iterate_handles,
function<void(function<void(edge_t, const Region*)>)> iterate_edges,
int64_t min_depth, const vector<string>& ref_prefixes,
int64_t min_fragment_len, bool verbose);
int64_t min_fragment_len, const unordered_set<path_handle_t>& ignore_paths,
bool verbose);

/**
* Run above function on graph
*/
void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector<string>& ref_prefixes,
int64_t min_fragment_len, bool verbose);
int64_t min_fragment_len, const unordered_set<path_handle_t>& ignore_paths, bool verbose);

/**
* Or on contained snarls
*/
void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector<Region>& regions,
SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose);
void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph,
const vector<Region>& regions, SnarlManager& snarl_manager, bool include_endpoints,
int64_t min_depth, int64_t min_fragment_len,
const unordered_set<path_handle_t>& ignore_paths, bool verbose);

/**
* clip out deletion edges
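
For completeness, a hedged caller sketch for the extended clipping interface; nothing below is in this diff, the clip functions are assumed to live in the vg namespace, and the rgfa_fragments set is assumed to be selected the same way as in the depth sketch earlier.

    // Hypothetical usage of clip_low_depth_nodes_and_edges() with the new
    // ignore_paths argument: rGFA fragment paths contribute neither node nor
    // edge depth when deciding what to clip.
    #include <string>
    #include <unordered_set>
    #include <vector>
    #include "clip.hpp"

    void clip_ignoring_rgfa(vg::MutablePathMutableHandleGraph* graph,
                            const std::unordered_set<vg::path_handle_t>& rgfa_fragments,
                            const std::vector<std::string>& ref_prefixes) {
        vg::clip_low_depth_nodes_and_edges(graph, /*min_depth=*/1, ref_prefixes,
                                           /*min_fragment_len=*/0, rgfa_fragments,
                                           /*verbose=*/false);
    }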