From f5f211bc886a6c447f3ace43bbdf44297234e209 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 1 May 2024 10:27:56 -0400 Subject: [PATCH] opt to print rgfa stats table --- src/rgfa.cpp | 243 +++++++++++++++++++++++----------- src/rgfa.hpp | 6 + src/subcommand/paths_main.cpp | 29 ++-- 3 files changed, 194 insertions(+), 84 deletions(-) diff --git a/src/rgfa.cpp b/src/rgfa.cpp index d7f6b71df5e..a9b409a10aa 100644 --- a/src/rgfa.cpp +++ b/src/rgfa.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include //#define debug @@ -126,7 +128,8 @@ void RGFACover::compute(const PathHandleGraph* graph, nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); if (node_to_interval.count(node_id)) { cerr << "[rgfa error]: node " << node_id << " covered by two reference paths," - << " including " << graph->get_path_name(ref_path_handle) + << " including " << graph->get_path_name(ref_path_handle) << " and " + << graph->get_path_name(graph->get_path_handle_of_step(rgfa_intervals.at(node_to_interval.at(node_id)).first)) << ". rGFA support current requires disjoint acyclic reference paths" << endl; exit(1); } @@ -930,6 +933,24 @@ vector> RGFACover::get_reference_nodes(nid_t node_id, bool return output_reference_nodes; } +pair RGFACover::parse_variant_id(const string& variant_id) const { + + if (variant_id[0] == '>' || variant_id[0] == '<') { + size_t p = variant_id.find_first_of("<>", 2); + if (p != string::npos) { + int64_t id1 = parse(variant_id.substr(1, p-1)); + int64_t id2 = parse(variant_id.substr(p+1)); + if (!graph->has_node(id1) || !graph->has_node(id2)) { + throw runtime_error("Unable to parse " + variant_id + ": node(s) not found in graph"); + } + return make_pair(graph->get_handle(id1, variant_id[0] == '<'), + graph->get_handle(id2, variant_id[p] == '<')); + } + } + throw runtime_error("Failed to parse " + variant_id); + return make_pair(handle_t(), handle_t()); +} + void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { vcflib::Variant var(vcf); @@ -951,12 +972,6 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { } } - // also want an rGFA path lookup - unordered_map name_to_rgfa_path; - graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { - name_to_rgfa_path[RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle))] = path_handle; - }); - // remove header lines we're going to add vcf.removeInfoHeaderLine("R_CHROM"); vcf.removeInfoHeaderLine("R_START"); @@ -971,80 +986,156 @@ void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { os << vcf.header << endl; - string prev_sequence_name; - string prev_r_chrom; - string prev_r_start; - string prev_r_end; - string prev_rank; while (vcf.getNextVariant(var)) { - if (name_to_rgfa_path.count(var.sequenceName)) { - string r_chrom; - string r_start; - string r_end; - string rank; - if (var.sequenceName == prev_sequence_name) { - // just use the previous values, which will be the same - r_chrom = prev_r_chrom; - r_start = prev_r_start; - r_end = prev_r_end; - rank = prev_rank; - } else { - // compute from the cover - path_handle_t path_handle = name_to_rgfa_path.at(var.sequenceName); - nid_t first_node = graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle))); - vector> ref_nodes = this->get_reference_nodes(first_node, false); - - int64_t min_ref_pos = numeric_limits::max(); - int64_t max_ref_pos = -1; - int64_t min_rank = numeric_limits::max(); - for (const pair& rank_node : ref_nodes) { - step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; - path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); - string name = graph->get_path_name(ref_path); - // we assume one reference contig (which is built into the whole structure) - assert(r_chrom.empty() || r_chrom == name); - r_chrom = name; - int64_t ref_pos = node_to_ref_pos.at(rank_node.second); - // assume snarl is forward on both reference nodes - // todo: this won't be exact for some inversion cases, I don't think -- - // need to test these and either add check / or move to oriented search - min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); - max_ref_pos = max(max_ref_pos, ref_pos); - min_rank = min(min_rank, rank_node.first); - } + pair id_handles = parse_variant_id(var.id); + string r_chrom; + string r_start; + string r_end; + string rank; + + // compute from the cover + nid_t first_node = graph->get_id(id_handles.first); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); + min_rank = min(min_rank, rank_node.first); + } - r_start = std::to_string(min_ref_pos); - r_end = std::to_string(max_ref_pos); - rank = std::to_string(min_rank); - } + r_start = std::to_string(min_ref_pos); + r_end = std::to_string(max_ref_pos); + rank = std::to_string(min_rank); + + var.info["R_CHROM"] = {r_chrom}; + var.info["R_START"] = {r_start}; + var.info["R_END"] = {r_end}; + var.info["RANK"] = {rank}; + + os << var << endl; + } +} - if (!var.info.count("R_CHROM")) { - var.format.push_back("R_CHROM"); - } - var.info["R_CHROM"] = {r_chrom}; - if (!var.info.count("R_START")) { - var.format.push_back("R_START"); - } - var.info["R_START"] = {r_start}; - if (!var.info.count("R_END")) { - var.format.push_back("R_END"); - } - var.info["R_END"] = {r_end}; - if (!var.info.count("RANK")) { - var.format.push_back("RANK"); +void RGFACover::print_stats(ostream& os) { + + // todo: mostly copied from annotate function above. should probably refactor common + // logic into its own, shared thing.y + unordered_map node_to_ref_pos; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->rgfa_intervals.at(i); + // assumption: ref intervals span entire path + assert(graph->path_begin(graph->get_path_handle_of_step(ref_interval.first)) == ref_interval.first); + int64_t pos = 0; + for (step_handle_t step_handle = ref_interval.first; step_handle != ref_interval.second; + step_handle = graph->get_next_step(step_handle)) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + assert(!graph->get_is_reverse(handle)); + assert(!node_to_ref_pos.count(node_id)); + node_to_ref_pos[node_id] = pos; + pos += graph->get_length(handle); + } + } + + // the header + os << "#Sample" << "\t" + << "Haplotype" << "\t" + << "Locus" << "\t" + << "Start" << "\t" + << "End" << "\t" + << "NodeStart" << "\t" + << "NodeEnd" << "\t" + << "Rank" << "\t" + << "AvgDepth" << "\t" + << "AvgSampleDepth" << "\t" + << "RefSequence" << "\t" + << "RefStart" << "\t" + << "RefEnd" << endl; + + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + + int64_t path_length = 0; + int64_t tot_depth = 0; + int64_t tot_sample_depth = 0; + int64_t tot_steps = 0; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + path_length += graph->get_length(graph->get_handle_of_step(step)); + set sample_set; + vector steps = graph->steps_of_handle(graph->get_handle_of_step(step)); + for (step_handle_t& other_step : steps) { + path_handle_t other_path = graph->get_path_handle_of_step(other_step); + sample_set.insert(graph->get_sample_name(other_path)); } - var.info["RANK"] = {rank}; - - prev_sequence_name = var.sequenceName; - prev_r_chrom = r_chrom; - prev_r_start = r_start; - prev_r_end = r_end; - prev_rank = rank; - - } else { - //cerr << "rGFA [warning]: VCF reference " << var.sequenceName << " not found in graph" << endl; + tot_depth += steps.size(); + tot_sample_depth += sample_set.size(); + // we don't want to count the rgfa cover + // todo (can remove this when move away from path-based scheme) + assert(sample_set.count(RGFACover::rgfa_sample_name)); + --tot_depth; + --tot_sample_depth; + ++tot_steps; } - os << var << endl; + pair sample_locus = RGFACover::parse_rgfa_locus_name(graph->get_locus_name(path_handle)); + size_t haplotype = graph->get_haplotype(path_handle); + + nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + // interval is open ended, so we go back to last node + handle_t last_handle = graph->get_handle_of_step(graph->get_previous_step(interval.second)); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + string r_chrom; + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); + min_rank = min(min_rank, rank_node.first); + } + + + os << sample_locus.first << "\t" + << sample_locus.second << "\t" + << (haplotype != PathMetadata::NO_HAPLOTYPE ? (int64_t)haplotype : (int64_t)-1) << "\t" + << rgfa_subrange.first << "\t" + << (rgfa_subrange.first + path_length) << "\t" + << (graph->get_is_reverse(graph->get_handle_of_step(interval.first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(interval.first)) << "\t" + << (graph->get_is_reverse(last_handle) ? "<" : ">") + << graph->get_id(last_handle) << "\t" + << min_rank << "\t" + << std::fixed << std::setprecision(2) << (tot_depth / tot_steps) << "\t" + << std::fixed << std::setprecision(2) << (tot_sample_depth / tot_steps) << "\t" + << Paths::strip_subrange(r_chrom) << "\t" + << min_ref_pos << "\t" + << max_ref_pos << "\n"; } } diff --git a/src/rgfa.hpp b/src/rgfa.hpp index a5f44a74137..db5abdfe850 100644 --- a/src/rgfa.hpp +++ b/src/rgfa.hpp @@ -85,8 +85,14 @@ class RGFACover { // return nullptr if node not in an interval const pair* get_interval(nid_t node_id) const; + // parse an id of the form >244>2334 and return the pair of handles + pair parse_variant_id(const string& variant_id) const; + // add R_CHROM, R_START, R_END, F_LEN tags to a VCF using the cover void annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os); + + // print out a table of statistics + void print_stats(ostream& os); protected: diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index b045247463c..e5621f0c287 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -45,6 +45,7 @@ void help_paths(char** argv) { << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover." << endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl + << " --rgfa-intervals print out table of rgfa intervals and their ranks and positions" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -99,6 +100,7 @@ unordered_map SENSE_TO_STRING { // Long options with no corresponding short options. const int OPT_SELECT_RGFA_FRAGMENTS = 1000; const int OPT_ANNOTATE_RGFA_VCF = 1001; +const int OPT_RGFA_INTERVALS = 1002; int main_paths(int argc, char** argv) { @@ -116,6 +118,7 @@ int main_paths(int argc, char** argv) { bool retain_paths = false; int64_t rgfa_min_len = -1; string rgfa_vcf_filename; + bool rgfa_print_intervals = false; string snarl_filename; string graph_file; string gbwt_file; @@ -165,6 +168,7 @@ int main_paths(int argc, char** argv) { {"coverage", no_argument, 0, 'c'}, {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, {"rgfa-vcf", required_argument, 0, OPT_ANNOTATE_RGFA_VCF}, + {"rgfa-intervals", no_argument, 0, OPT_RGFA_INTERVALS}, {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. {"threads-by", required_argument, 0, 'q'}, @@ -302,6 +306,11 @@ int main_paths(int argc, char** argv) { rgfa_vcf_filename = optarg; output_formats++; break; + + case OPT_RGFA_INTERVALS: + rgfa_print_intervals = true; + output_formats++; + break; case 't': { @@ -365,7 +374,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf, --rgfa-intervals) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -673,7 +682,7 @@ int main_paths(int argc, char** argv) { // output the graph vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); - } else if (!rgfa_vcf_filename.empty()) { + } else if (!rgfa_vcf_filename.empty() || rgfa_print_intervals) { RGFACover rgfa_cover; // load up the rank-0 reference path selection unordered_set reference_paths; @@ -686,13 +695,17 @@ int main_paths(int argc, char** argv) { cerr << "error[vg paths]: no rGFA cover found in graph. Please compute one with -f before annotating a VCF" << endl; exit(1); } - vcflib::VariantCallFile variant_file; - variant_file.open(rgfa_vcf_filename); - if (!variant_file.is_open()) { - cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; - exit(1); + if (!rgfa_vcf_filename.empty()) { + vcflib::VariantCallFile variant_file; + variant_file.open(rgfa_vcf_filename); + if (!variant_file.is_open()) { + cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; + exit(1); + } + rgfa_cover.annotate_vcf(variant_file, cout); + } else if (rgfa_print_intervals) { + rgfa_cover.print_stats(cout); } - rgfa_cover.annotate_vcf(variant_file, cout); } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one // (we're doing the whole graph here, which could be inefficient in the case the user is selecting