From b619a0fcaee819c4c5a09821372e69072743029c Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 22 Feb 2019 14:43:43 -0800 Subject: [PATCH 01/63] partially implemented command-line interface --- src/subcommand/mod_main.cpp | 32 ++++++++++++++++++++++++++++++-- src/subgraph.hpp | 9 +++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index a4ecadfee2d..6e44ea2c210 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -17,6 +17,9 @@ #include "../algorithms/topological_sort.hpp" #include "../algorithms/remove_high_degree.hpp" +#include "../algorithms/0_demo_extract_subgraph.cpp" +#include "../algorithms/0_demo_snarl_to_strings.cpp" + using namespace std; using namespace vg; using namespace vg::subcommand; @@ -79,7 +82,8 @@ void help_mod(char** argv) { << " -a, --cactus convert to cactus graph representation" << endl << " -v, --sample-vcf FILE for a graph with allele paths, compute the sample graph from the given VCF" << endl << " -G, --sample-graph FILE subset an augmented graph to a sample graph using a Locus file" << endl - << " -t, --threads N for tasks that can be done in parallel, use this many threads" << endl; + << " -t, --threads N for tasks that can be done in parallel, use this many threads" << endl + << " -F, --demo_0 N TO BE REMOVED: add six adenosines to the front of start of a given node N" << endl; } int main_mod(int argc, char** argv) { @@ -131,6 +135,7 @@ int main_mod(int argc, char** argv) { string vcf_filename; string loci_filename; int max_degree = 0; + string demo_0 = ""; int c; optind = 2; // force optind past command positional argument @@ -182,11 +187,12 @@ int main_mod(int argc, char** argv) { {"sample-vcf", required_argument, 0, 'v'}, {"sample-graph", required_argument, 0, 'G'}, {"max-degree", required_argument, 0, 'M'}, + {"demo_0", required_argument, 0, 'F'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, 
"hk:oi:q:Q:cpl:e:mt:SX:KPsunzNAf:CDr:Ig:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:", + c = getopt_long (argc, argv, "hk:oi:q:Q:cpl:e:mt:SX:KPsunzNAf:CDr:Ig:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:F:", long_options, &option_index); @@ -375,6 +381,10 @@ int main_mod(int argc, char** argv) { max_degree = parse(optarg); break; + case 'F': + demo_0 = optarg; + break; + case 'h': case '?': help_mod(argv); @@ -851,6 +861,24 @@ int main_mod(int argc, char** argv) { graph->paths = Paths(); } + if ( demo_0.length() > 0 ) { + vector snarl_nodes = split_at_space(demo_0); + vg::id_t start_id = 220; + vg::id_t end_id = 218; + // tag_all_snarl_nodes(*graph, snarl_nodes[0], snarl_nodes[1]); + // vg::SubHandleGraph snarl = extract_subgraph(*graph, start_id, end_id); + vector walks = get_walks(*graph, start_id, end_id); + cout << endl << endl << endl; + + for (string walk : walks){ + cout << walk << endl; + } + + cout << endl << endl << endl; + // list walks get_walks(snarl); + + } + graph->serialize_to_ostream(std::cout); delete graph; diff --git a/src/subgraph.hpp b/src/subgraph.hpp index c1922866061..a0795a8619a 100644 --- a/src/subgraph.hpp +++ b/src/subgraph.hpp @@ -44,6 +44,9 @@ using namespace std; /// Look up the handle for the node with the given ID in the given orientation virtual handle_t get_handle(const id_t& node_id, bool is_reverse = false) const; + // Copy over the visit version which would otherwise be shadowed. + using HandleGraph::get_handle; + /// Get the ID from a handle virtual id_t get_id(const handle_t& handle) const; @@ -65,12 +68,18 @@ using namespace std; /// continue. Returns true if we finished and false if we stopped early. virtual bool follow_edges(const handle_t& handle, bool go_left, const function& iteratee) const; + // Copy over the template for nice calls + using HandleGraph::follow_edges; + /// Loop over all the nodes in the graph in their local forward /// orientations, in their internal stored order. Stop if the iteratee /// returns false. 
Can be told to run in parallel, in which case stopping /// after a false return value is on a best-effort basis and iteration /// order is not defined. virtual void for_each_handle(const function& iteratee, bool parallel = false) const; + + // Copy over the template for nice calls + using HandleGraph::for_each_handle; /// Return the number of nodes in the graph /// TODO: can't be node_count because XG has a field named node_count. From d12ad0223074529030180806edfa9cbf4501fa45 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 5 Mar 2019 13:21:52 -0800 Subject: [PATCH 02/63] updating src/subcommand/mod_main.cpp --- src/subcommand/mod_main.cpp | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index e4496cb4c90..cacd46cd422 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -17,8 +17,7 @@ #include "../algorithms/topological_sort.hpp" #include "../algorithms/remove_high_degree.hpp" -#include "../algorithms/0_demo_extract_subgraph.cpp" -#include "../algorithms/0_demo_snarl_to_strings.cpp" +#include "../algorithms/0_demo_final_0.hpp" using namespace std; using namespace vg; @@ -83,7 +82,8 @@ void help_mod(char** argv) { << " -v, --sample-vcf FILE for a graph with allele paths, compute the sample graph from the given VCF" << endl << " -G, --sample-graph FILE subset an augmented graph to a sample graph using a Locus file" << endl << " -t, --threads N for tasks that can be done in parallel, use this many threads" << endl - << " -F, --demo_0 N TO BE REMOVED: add six adenosines to the front of start of a given node N" << endl; + << " -F, --demo_0 FILE Given a .snarls file (from command vg snarls) and the corresponding graph," << endl + << " simplifies redundancy in graph's snarls." 
<< endl; } int main_mod(int argc, char** argv) { @@ -135,7 +135,7 @@ int main_mod(int argc, char** argv) { string vcf_filename; string loci_filename; int max_degree = 0; - string demo_0 = ""; + string demo_0; int c; optind = 2; // force optind past command positional argument @@ -862,28 +862,22 @@ int main_mod(int argc, char** argv) { graph->paths = Paths(); } - if ( demo_0.length() > 0 ) { - vector snarl_nodes = split_at_space(demo_0); - vg::id_t start_id = 220; - vg::id_t end_id = 218; - // tag_all_snarl_nodes(*graph, snarl_nodes[0], snarl_nodes[1]); - // vg::SubHandleGraph snarl = extract_subgraph(*graph, start_id, end_id); - vector walks = get_walks(*graph, start_id, end_id); - cout << endl << endl << endl; - - for (string walk : walks){ - cout << walk << endl; + if ( !demo_0.empty() ) { + + std::ifstream snarl_stream; + snarl_stream.open(demo_0); + + if (!snarl_stream) { + cerr << "error:[vg mod] Cannot open Snarls file " << demo_0 << endl; + exit(1); } - cout << endl << endl << endl; - // list walks get_walks(snarl); - + clean_all_snarls(*graph, snarl_stream); } - graph->serialize_to_ostream(std::cout); - delete graph; + return 0; } From 3dca93e94bb988926b616d075c6a039d02242b1b Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 5 Mar 2019 15:37:10 -0800 Subject: [PATCH 03/63] update msa_converter to have seqan format --- src/msa_converter.cpp | 90 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/src/msa_converter.cpp b/src/msa_converter.cpp index 85057a9f877..b5dfa7fc1e4 100644 --- a/src/msa_converter.cpp +++ b/src/msa_converter.cpp @@ -8,7 +8,7 @@ #include "msa_converter.hpp" -//#define debug_msa_converter +// #define debug_msa_converter namespace vg { @@ -67,6 +67,92 @@ using namespace std; alignment[tokens[1]] = tokens[6]; } } + + else if (format == "seqan") { + // conservation line starts with int. 
+ unordered_set conservation_chars{'0','1','2','3','4','5','6','7','8','9'}; + + auto is_conservation_line = [&](string& line) { + bool conservation_line = false; + for (char c : line) { + if (!isspace(c)) { + if (conservation_chars.count(c)) { + conservation_line = true; + } + else { + conservation_line = false; + } + break; + } + } + return conservation_line; + }; + + auto is_blank = [](const string& str) { + return all_of(str.begin(), str.end(), [](char c){return isspace(c);}); + }; + + auto is_draw_line = [&](string& line) { + bool draw_line = false; + for (char c : line) { + if (!isspace(c)) { + if (c == '|') { + draw_line = true; + } + else { + draw_line = false; + } + break; + } + } + return draw_line; + }; + + // removes leading whitespace of line. + auto get_next_line = [&](istream& in) { + string line; + getline(in, line); + + for (auto it = line.begin(); it != line.end(); it++){ + if (!isspace(*it)) { + line.erase(line.begin(), it); + break; + } + } + return line; + }; + + // make an alignment block + alignments.emplace_back(); + auto& alignment = alignments.back(); + + int seq_count = 0; + string line; + line = get_next_line(in); + while (!in.eof()) { + if (is_conservation_line(line) || is_blank(line)){ + seq_count = 0; + } + else if (is_draw_line(line)) { + seq_count++; + } + else{ + auto iter = alignment.find(to_string(seq_count)); + if (iter != alignment.end()) { + iter->second.append(line); + } + else { + alignment[to_string(seq_count)] = line; + } + } + line = get_next_line(in); + + } + + + + } + else if (format == "clustal") { unordered_set conservation_chars{'.', ':', '*'}; @@ -168,7 +254,7 @@ using namespace std; cerr << "alignments:" << endl; for (const auto& aln : alignments) { for (const auto& seq : aln) { - cerr << seq.first << "\t" << seq.second << endl; + cerr << "seq.first " << seq.first << "\t" << "seq.second " << seq.second << endl << endl; } cerr << endl; } From e8207f79febedc1586dd8b174656447321556e3a Mon Sep 17 00:00:00 2001 From: 
Robin-Rounthwaite Date: Tue, 5 Mar 2019 16:38:44 -0800 Subject: [PATCH 04/63] Changes to be committed: modified: src/algorithms/0_demo_final_0.cpp --- src/algorithms/0_demo_final_0.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/algorithms/0_demo_final_0.cpp b/src/algorithms/0_demo_final_0.cpp index 20277e5227f..303658d4883 100644 --- a/src/algorithms/0_demo_final_0.cpp +++ b/src/algorithms/0_demo_final_0.cpp @@ -19,6 +19,7 @@ void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_st for (auto roots : snarl_roots){ clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); } + delete snarl_manager; } From 344bcf68bde14efc03d0bff976e3e9dc31d0fed4 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Wed, 6 Mar 2019 16:04:49 -0800 Subject: [PATCH 05/63] fixed bug in msa converter --- src/algorithms/0_demo_final_0.cpp | 4 ++-- src/msa_converter.cpp | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/algorithms/0_demo_final_0.cpp b/src/algorithms/0_demo_final_0.cpp index 20277e5227f..fb627746da2 100644 --- a/src/algorithms/0_demo_final_0.cpp +++ b/src/algorithms/0_demo_final_0.cpp @@ -29,7 +29,7 @@ void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, c //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings //representing all possible walks through the snarl: vector walks = graph_to_strings(graph, start_id, end_id); - + //Make a new snarl from walks: VG new_snarl = strings_to_graph(walks); @@ -170,7 +170,7 @@ VG strings_to_graph(const vector& walks){ globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); - // std::cout << align << "\n"; + cerr << align << "\n"; stringstream ss; ss << align; diff --git a/src/msa_converter.cpp b/src/msa_converter.cpp index b5dfa7fc1e4..87aa8f8ed3a 100644 --- a/src/msa_converter.cpp +++ b/src/msa_converter.cpp @@ -130,13 +130,10 @@ using namespace std; string line; line = get_next_line(in); while 
(!in.eof()) { - if (is_conservation_line(line) || is_blank(line)){ + if (is_conservation_line(line)){ seq_count = 0; } - else if (is_draw_line(line)) { - seq_count++; - } - else{ + else if (!is_draw_line(line) && !is_blank(line)) { auto iter = alignment.find(to_string(seq_count)); if (iter != alignment.end()) { iter->second.append(line); @@ -144,6 +141,7 @@ using namespace std; else { alignment[to_string(seq_count)] = line; } + seq_count++; } line = get_next_line(in); From 77365c106fe23dfdb769c56cc3af7d2d0a719d8d Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Wed, 3 Apr 2019 14:37:51 -0700 Subject: [PATCH 06/63] Changes to be committed: modified: src/subcommand/mod_main.cpp --- src/subcommand/mod_main.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index cacd46cd422..ff8d0cca666 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -863,19 +863,23 @@ int main_mod(int argc, char** argv) { } if ( !demo_0.empty() ) { - - std::ifstream snarl_stream; - snarl_stream.open(demo_0); - - if (!snarl_stream) { - cerr << "error:[vg mod] Cannot open Snarls file " << demo_0 << endl; - exit(1); + + ///Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. 
+ test_gbwt(*graph); + + // std::ifstream snarl_stream; + // snarl_stream.open(demo_0); + + // if (!snarl_stream) { + // cerr << "error:[vg mod] Cannot open Snarls file " << demo_0 << endl; + // exit(1); + // } + + // clean_all_snarls(*graph, snarl_stream); } + // graph->serialize_to_ostream(std::cout); + // delete graph; - clean_all_snarls(*graph, snarl_stream); - } - graph->serialize_to_ostream(std::cout); - delete graph; return 0; From 938ad38f283887437c3c306cc4fb946d143cc2c6 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Wed, 3 Apr 2019 14:38:59 -0700 Subject: [PATCH 07/63] testing gbwt helper Changes to be committed: modified: src/algorithms/0_demo_final_0.cpp modified: src/algorithms/0_demo_final_0.hpp --- src/algorithms/0_demo_final_0.cpp | 40 ++++++++++++++++++++++++++++++- src/algorithms/0_demo_final_0.hpp | 6 +++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/algorithms/0_demo_final_0.cpp b/src/algorithms/0_demo_final_0.cpp index 2af628f6e53..e64dea9636a 100644 --- a/src/algorithms/0_demo_final_0.cpp +++ b/src/algorithms/0_demo_final_0.cpp @@ -10,15 +10,54 @@ #include #include "../msa_converter.hpp" #include "../snarls.hpp" +#include "../gbwt_helper.hpp" +#include "../stream/vpkg.hpp" namespace vg { +void print_kmer(const std::vector>&, const std::string& string){ + cout << string << endl; +} + +void test_gbwt(MutablePathDeletableHandleGraph& graph){ + ifstream gbwt_stream; + string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; + gbwt_stream.open(gbwt_name); + + unique_ptr gbwt; + // Load the GBWT from its container + gbwt = stream::VPKG::load_one(gbwt_stream); + + size_t k = 20; + for_each_kmer(graph, *gbwt, k, print_kmer, false); + + +} + + void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ SnarlManager* snarl_manager = new SnarlManager(snarl_stream); + +/* Use this code to count number of snarls in graph. 
+* int top_count = 0; +* for (const Snarl* snarl : snarl_manager->top_level_snarls()){ +* top_count++; +* } +* cerr << "number of top_level snarls in graph: " << top_count << endl; +* +* int general_count = 0; +* snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ +* general_count++; +* }); +* cerr << "number of total snarls in graph: " << general_count << endl; +*/ + + vector snarl_roots = snarl_manager->top_level_snarls(); for (auto roots : snarl_roots){ clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); } + delete snarl_manager; @@ -171,7 +210,6 @@ VG strings_to_graph(const vector& walks){ globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); - cerr << align << "\n"; stringstream ss; ss << align; diff --git a/src/algorithms/0_demo_final_0.hpp b/src/algorithms/0_demo_final_0.hpp index dbd5e61c854..32265621a58 100644 --- a/src/algorithms/0_demo_final_0.hpp +++ b/src/algorithms/0_demo_final_0.hpp @@ -1,3 +1,7 @@ +/* +Robin Rounthwaite +Find function call in ./subcommand/main.cpp +*/ #include #include "../vg.hpp" #include "../handle.hpp" @@ -5,6 +9,8 @@ #include "count_walks.hpp" namespace vg { + void test_gbwt(MutablePathDeletableHandleGraph& graph); + void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream); void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); From 6a96a1db91c2d4304f5e4666552e342389549326 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 8 Apr 2019 10:01:20 -0700 Subject: [PATCH 08/63] updating haplotypes_to_strings --- src/algorithms/0_demo_final_0.cpp | 45 ++++++++++++++++++++++++------- src/algorithms/0_demo_final_0.hpp | 2 +- src/subcommand/mod_main.cpp | 12 +++++++-- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/algorithms/0_demo_final_0.cpp b/src/algorithms/0_demo_final_0.cpp index e64dea9636a..480ea3db4f0 100644 --- a/src/algorithms/0_demo_final_0.cpp +++ 
b/src/algorithms/0_demo_final_0.cpp @@ -15,11 +15,13 @@ namespace vg { -void print_kmer(const std::vector>&, const std::string& string){ - cout << string << endl; -} +// void print_kmer(const std::vector>&, const std::string& string){ +// cout << string << endl; +// } + +vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ -void test_gbwt(MutablePathDeletableHandleGraph& graph){ + ///stuff that will go in mod_main: ifstream gbwt_stream; string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; gbwt_stream.open(gbwt_name); @@ -27,11 +29,38 @@ void test_gbwt(MutablePathDeletableHandleGraph& graph){ unique_ptr gbwt; // Load the GBWT from its container gbwt = stream::VPKG::load_one(gbwt_stream); +// ----------------------------------------------------------------- + /// make subgraph for the snarl: + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + + GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); +//TODO:identify source and sinks for troubleshooting! + unordered_map> sequences; // will contain all haplotype walks through snarl + handle_t source_handle = haploGraph.get_handle(source_id); + sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); + + for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { + + vector seqs_here = sequences[handle]; + gbwt::SearchState cur_state = haploGraph.get_state(handle); + + haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { + handle_t next_handle = haploGraph.get_handle(next_search.node); + string next_seq = haploGraph.get_sequence(next_handle); + // transfer the sequences for the preceding handle to next_handle's sequences, + // plus the new handle's sequence. 
+ for (string seq : seqs_here){ + sequences[next_handle].push_back(seq + next_seq); + } - size_t k = 20; - for_each_kmer(graph, *gbwt, k, print_kmer, false); + + }); + } + // all the sequences at the sinks will be all the sequences in the snarl. + handle_t sink_handle = haploGraph.get_handle(sink_id); + return sequences[sink_handle]; } @@ -227,8 +256,6 @@ VG strings_to_graph(const vector& walks){ vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ - // id_t start_id = 220; - // id_t end_id = 218; SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); unordered_map> sequences; @@ -237,7 +264,7 @@ vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t sta count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl - // identify sources and sinks + // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
snarl.for_each_handle([&](const handle_t& handle) { bool is_source = true, is_sink = true; snarl.follow_edges(handle, true, [&](const handle_t& prev) { diff --git a/src/algorithms/0_demo_final_0.hpp b/src/algorithms/0_demo_final_0.hpp index 32265621a58..67f0210f7ee 100644 --- a/src/algorithms/0_demo_final_0.hpp +++ b/src/algorithms/0_demo_final_0.hpp @@ -9,7 +9,7 @@ Find function call in ./subcommand/main.cpp #include "count_walks.hpp" namespace vg { - void test_gbwt(MutablePathDeletableHandleGraph& graph); + vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id); void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream); diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index ff8d0cca666..521e1fa2dc9 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -865,7 +865,15 @@ int main_mod(int argc, char** argv) { if ( !demo_0.empty() ) { ///Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. - test_gbwt(*graph); + vg::id_t source = 1; + vg::id_t sink = 8; + + vector haplotypes = haplotypes_to_strings(*graph, source, sink); + cout << "here goes!" 
<< endl; + for(string haplotype : haplotypes) { + + cout << haplotype << endl; + } // std::ifstream snarl_stream; // snarl_stream.open(demo_0); @@ -878,7 +886,7 @@ int main_mod(int argc, char** argv) { // clean_all_snarls(*graph, snarl_stream); } // graph->serialize_to_ostream(std::cout); - // delete graph; + delete graph; From 1bafef627b0e00c903d30dca5fa7383a6cd92ef3 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 8 Apr 2019 10:29:30 -0700 Subject: [PATCH 09/63] update haplotype_to_strings --- src/algorithms/0_demo_final_0.cpp | 45 ++++++++++++++++++++++++------- src/algorithms/0_demo_final_0.hpp | 2 +- src/subcommand/mod_main.cpp | 12 +++++++-- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/algorithms/0_demo_final_0.cpp b/src/algorithms/0_demo_final_0.cpp index e64dea9636a..480ea3db4f0 100644 --- a/src/algorithms/0_demo_final_0.cpp +++ b/src/algorithms/0_demo_final_0.cpp @@ -15,11 +15,13 @@ namespace vg { -void print_kmer(const std::vector>&, const std::string& string){ - cout << string << endl; -} +// void print_kmer(const std::vector>&, const std::string& string){ +// cout << string << endl; +// } + +vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ -void test_gbwt(MutablePathDeletableHandleGraph& graph){ + ///stuff that will go in mod_main: ifstream gbwt_stream; string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; gbwt_stream.open(gbwt_name); @@ -27,11 +29,38 @@ void test_gbwt(MutablePathDeletableHandleGraph& graph){ unique_ptr gbwt; // Load the GBWT from its container gbwt = stream::VPKG::load_one(gbwt_stream); +// ----------------------------------------------------------------- + /// make subgraph for the snarl: + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + + GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); +//TODO:identify source and sinks for troubleshooting! 
+ unordered_map> sequences; // will contain all haplotype walks through snarl + handle_t source_handle = haploGraph.get_handle(source_id); + sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); + + for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { + + vector seqs_here = sequences[handle]; + gbwt::SearchState cur_state = haploGraph.get_state(handle); + + haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { + handle_t next_handle = haploGraph.get_handle(next_search.node); + string next_seq = haploGraph.get_sequence(next_handle); + // transfer the sequences for the preceding handle to next_handle's sequences, + // plus the new handle's sequence. + for (string seq : seqs_here){ + sequences[next_handle].push_back(seq + next_seq); + } - size_t k = 20; - for_each_kmer(graph, *gbwt, k, print_kmer, false); + + }); + } + // all the sequences at the sinks will be all the sequences in the snarl. + handle_t sink_handle = haploGraph.get_handle(sink_id); + return sequences[sink_handle]; } @@ -227,8 +256,6 @@ VG strings_to_graph(const vector& walks){ vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ - // id_t start_id = 220; - // id_t end_id = 218; SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); unordered_map> sequences; @@ -237,7 +264,7 @@ vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t sta count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl - // identify sources and sinks + // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
snarl.for_each_handle([&](const handle_t& handle) { bool is_source = true, is_sink = true; snarl.follow_edges(handle, true, [&](const handle_t& prev) { diff --git a/src/algorithms/0_demo_final_0.hpp b/src/algorithms/0_demo_final_0.hpp index 32265621a58..67f0210f7ee 100644 --- a/src/algorithms/0_demo_final_0.hpp +++ b/src/algorithms/0_demo_final_0.hpp @@ -9,7 +9,7 @@ Find function call in ./subcommand/main.cpp #include "count_walks.hpp" namespace vg { - void test_gbwt(MutablePathDeletableHandleGraph& graph); + vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id); void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream); diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index ff8d0cca666..521e1fa2dc9 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -865,7 +865,15 @@ int main_mod(int argc, char** argv) { if ( !demo_0.empty() ) { ///Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. - test_gbwt(*graph); + vg::id_t source = 1; + vg::id_t sink = 8; + + vector haplotypes = haplotypes_to_strings(*graph, source, sink); + cout << "here goes!" << endl; + for(string haplotype : haplotypes) { + + cout << haplotype << endl; + } // std::ifstream snarl_stream; // snarl_stream.open(demo_0); @@ -878,7 +886,7 @@ int main_mod(int argc, char** argv) { // clean_all_snarls(*graph, snarl_stream); } // graph->serialize_to_ostream(std::cout); - // delete graph; + delete graph; From 4917a3b1a79c28301218326bf6b5e966d6065158 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 13 Jun 2019 11:54:37 -0700 Subject: [PATCH 10/63] new haplotype path oriented approach to aligning graphs, with old edge-oriented approach tidied away. 
On branch normalize_snarls Changes to be committed: new file: src/algorithms/0_draft_haplotype_realignment.cpp new file: src/algorithms/0_draft_haplotype_realignment.hpp new file: src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp new file: src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp new file: src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp new file: src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp modified: src/subcommand/mod_main.cpp --- .../0_draft_haplotype_realignment.cpp | 760 ++++++++++++ .../0_draft_haplotype_realignment.hpp | 28 + ...d_includes_non_path_oriented_approach).cpp | 1018 +++++++++++++++++ ...old_0-diff_extension_for_not_at_source.cpp | 964 ++++++++++++++++ .../0_demo_final_old_0-only_source_paths.cpp | 741 ++++++++++++ .../0_demo_final_old_0-only_source_paths.hpp | 37 + src/subcommand/mod_main.cpp | 102 +- 7 files changed, 3632 insertions(+), 18 deletions(-) create mode 100644 src/algorithms/0_draft_haplotype_realignment.cpp create mode 100644 src/algorithms/0_draft_haplotype_realignment.hpp create mode 100644 src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp create mode 100644 src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp create mode 100644 src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp create mode 100644 src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp new file mode 100644 index 00000000000..09f192bb532 --- /dev/null +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -0,0 +1,760 @@ +#pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? 
+#include "0_draft_haplotype_realignment.hpp" + +#include +#include + +#include +#include +#include + +#include "../vg.hpp" +#include "../gbwt_helper.hpp" +#include "../stream/vpkg.hpp" +#include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? +#include +#include +#include +#include "../msa_converter.hpp" + +//TODO: Tomorrow's goal: edit haplotypes using Jordan's technique to re-integrate your snarl. + +namespace vg { + +/// Given the haplotypes extracted from the graph in extract_haplotypes, +// creates a new subgraph made from the realignment of the extracted +// haplotypes. +void align_haplotypes(const GBWTGraph& haploGraph, const pair< vector< vector >, vector< vector > >& haplotypes){ + vector< string > haplotypes_from_source_to_sink = format_handle_haplotypes_to_strings(haploGraph, haplotypes.first); + vector< string > other_haplotypes = format_handle_haplotypes_to_strings(haploGraph, haplotypes.second); + //TODO: Debug: disamiguate beign/ending regions of nodes by adding leading/trailing AAA seq (essential a special character). + for (string& hap : haplotypes_from_source_to_sink){ + hap = "AAAAAAAA" + hap + "AAAAAAAA"; + } + + + VG new_snarl = align_haplotypes(haplotypes_from_source_to_sink); + //TODO: Debug workaround to avoid hassle of overwriting inputGraph. + new_snarl.serialize_to_ostream(cout); + vector walks = debug_graph_to_strings(new_snarl, 2, 12); + + //TODO: Debug print statements + // cerr << "source_to_sink haplotypes" << endl; + // for (string hap : haplotypes_from_source_to_sink){ + // cerr << hap << endl << endl; + // } + // cerr << "source_to_sink_walks" << endl; + // for (string walk : walks){ + // cerr << walk << endl << endl; + // } + // cerr << "are there any walks that aren't haplotypes?" 
<< endl; + // for (string walk : walks){ + // if (find(haplotypes_from_source_to_sink.begin(), haplotypes_from_source_to_sink.end(), walk) != haplotypes_from_source_to_sink.end()){ + // cerr << "good" << endl; + // } else { + // cerr << "bad walk" << endl; + // cerr << walk << endl; + // } + // } + cerr << "are there any haps that aren't walks?" << endl; + for (string hap : haplotypes_from_source_to_sink){ + if (find(walks.begin(), walks.end(), hap) != walks.end()){ + cerr << "good" << endl; + } else { + cerr << "bad hap" << endl; + cerr << hap << endl; + } + } + + // cerr << "other haplotypes" << endl; + // for (string hap : other_haplotypes){ + // cerr << hap << endl << endl; + // } + // vector actually_source_to_sink; + // vector to_print_other_haps; + // cerr << "other haplotypes sorted" << endl; + // for (string hap : other_haplotypes){ + // if (find(haplotypes_from_source_to_sink.begin(), haplotypes_from_source_to_sink.end(), hap) != haplotypes_from_source_to_sink.end()){ + // actually_source_to_sink.emplace_back(hap); + // } else { + // to_print_other_haps.emplace_back(hap); + // } + + // } + // sort(actually_source_to_sink.begin(), actually_source_to_sink.end()); + // cerr << "actually source to sink" << actually_source_to_sink.size() << endl; + // for (string hap : actually_source_to_sink){ + // cerr << hap << endl << endl; + // } + // cerr << endl << endl << "to_print_other_haps" << to_print_other_haps.size() << endl; + // sort(to_print_other_haps.begin(), to_print_other_haps.end()); + // for (string hap : to_print_other_haps){ + // cerr << hap << endl << endl; + // } + + + +} + + +//Returns: a pair containting two sets of paths (each represented by a vector). The first +// in the pair represents all paths reaching from source to sink in the snarl, and the +// second representing all other paths in the snarl (e.g. any that don't reach both +// source and sink in the graph.) 
+pair< vector< vector >, vector< vector > > extract_haplotypes(const GBWTGraph& haploGraph, + const id_t& source_id, + const id_t& sink_id){ + cerr << "depth first begins!" << endl; + //touched_handles contains all handles that have been touched by the depth_first_search, + //for later use in other_haplotypes_to_strings, which identifies paths that didn't stretch + //from source to sink in the snarl. + unordered_set touched_handles; + + //haplotype_queue contains all started exon_haplotypes not completed yet. + //Every time we encounter a branch in the paths, the next node down the path + //Is stored here, along with the vector of handles that represents the path up + //to the SearchState. + vector< pair< vector, gbwt::SearchState> > haplotype_queue; + + // source and sink handle for haploGraph: + handle_t source_handle = haploGraph.get_handle(source_id); + handle_t sink_handle = haploGraph.get_handle(sink_id); + + //place source in haplotype_queue. + vector source_handle_vec(1, source_handle); + gbwt::SearchState source_state = haploGraph.get_state(source_handle); + haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); + touched_handles.emplace(source_handle); + + //haplotypes contains all "finished" haplotypes - those that were either walked + //to their conclusion, or until they reached the sink. + vector< vector > haplotypes_from_source_to_sink; + vector< vector > other_haplotypes; + + // for every partly-extracted thread, extend the thread until it either reaches + // the sink of the snarl or the end of the thread. + while (!haplotype_queue.empty()) { + + // get a haplotype out of haplotype_queue to extend - + // a tuple of (handles_traversed_so_far, last_touched_SearchState) + pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. 
+ vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState next_search) -> bool { + // cerr << "this node immediately follows cur_haplotypes current search_state." << haploGraph.get_sequence(haploGraph.node_to_handle(next_search.node)) << haploGraph.get_id(haploGraph.node_to_handle(next_search.node)) << endl; + next_searches.push_back(next_search); + return true; + }); + + // if next_searches > 1, then we need to make multiple new haplotypes to be recorded in haplotype_queue + // or one of the finished haplotype_handle_vectors. + if (next_searches.size() > 1){ + + // for every next_search in next_searches, either create a new, extended cur_haplotype to push into haplotype queue, + // or place in the haplotypes_from_source_to_sink if haplotype extends to sink, + // or place in the other_haplotypes if haplotype ends before reaching sink. + for (gbwt::SearchState next_search : next_searches){ + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + + // add the new handle to the vec: + next_handle_vec.push_back(next_handle); + + // if new_handle is the sink, put in haplotypes_from_source_to_sink + if (haploGraph.get_id(next_handle) == sink_id){ + haplotypes_from_source_to_sink.push_back(next_handle_vec); + + } else { // keep extending the haplotype! + + pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); + haplotype_queue.push_back(next_haplotype); + + } + + //next_handle will be touched. + touched_handles.emplace(next_handle); + } + + } // if next_searches is empty, the path has ended but not reached sink. + else if ( next_searches.empty() ) { + //TODO: debug + // cerr << "next_searches is empty" << endl; + + // We have reached the end of the path, but it doesn't reach the sink. + // we need to add cur_haplotype to other_haplotypes. 
+ other_haplotypes.push_back(cur_haplotype.first); + + } // if new_handle is the sink, put in haplotypes_from_source_to_sink + else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { + // TODO: debug: + // cerr << "next_searches is sink" << endl; + + // Then we need to add cur_haplotype + next_search to haplotypes_from_source_to_sink. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + cur_haplotype.first.push_back(next_handle); + haplotypes_from_source_to_sink.push_back(cur_haplotype.first); + + //touched next_search's handle + touched_handles.emplace(next_handle); + + } //else, there is just one next_search, and it's not the end of the path. + //just extend the search by adding (cur_haplotype + next_search to haplotype_queue. + else { + + // get the next_handle from the one next_search. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + // TODO: debug: + // cerr << "normal extend" << endl; + // cerr << "this is next_handle" << haploGraph.get_id(next_handle) << endl; + + + // modify cur_haplotype with next_handle and next_search. + cur_haplotype.first.push_back(next_handle); + cur_haplotype.second = next_searches.back(); // there's only one next_search in next_searches. + + // put cur_haplotype back in haplotype_queue. + haplotype_queue.push_back(cur_haplotype); + touched_handles.emplace(next_handle); + + } + + } + + //Find any haplotypes starting from handles not starting at the source, but which + //still start somewhere inside the snarl. 
+ vector> haplotypes_not_starting_at_source = find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); + + // move haplotypes_not_starting_at_source into other_haplotypes: + other_haplotypes.reserve(other_haplotypes.size() + haplotypes_not_starting_at_source.size()); + move(haplotypes_not_starting_at_source.begin(), haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); + + return make_pair(haplotypes_from_source_to_sink, other_haplotypes); +} + +vector< string > format_handle_haplotypes_to_strings(const GBWTGraph& haploGraph, const vector< vector< handle_t > >& haplotype_handle_vectors){ + vector< string > haplotype_strings; + for (vector haplotype_handles : haplotype_handle_vectors){ + string hap; + for (handle_t& handle : haplotype_handles){ + hap += haploGraph.get_sequence(handle); + } + haplotype_strings.push_back(hap); + } + return haplotype_strings; +} + +vector> find_haplotypes_not_at_source(const GBWTGraph& haploGraph, unordered_set& touched_handles, const id_t& sink_id){ + //TODO: debug: source handle size? + // cerr << '\n\n\n\n' << endl; + // for (id_t node_id = 23493; node_id <= 23505; node_id ++){ + // handle_t trial_handle = haploGraph.get_handle(node_id); + // gbwt::SearchState normal_search = haploGraph.get_state(trial_handle); + // cerr << "is normal searchstate at handle " << haploGraph.get_id(trial_handle) << " empty? " << normal_search.empty() << " size: " << normal_search.size() << endl; + // gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(trial_handle)); + // cerr << "is the prefix searchstate empty? " << new_search.empty() << " size: " << new_search.size() << endl; + // } + + + + + + cerr << "finding haplotypes not at source!" << endl; + /// Search every handle in touched handles for haplotypes starting at that point. + // Any new haplotypes will be added to haplotype_queue. 
+ vector, gbwt::SearchState>> haplotype_queue; + + // Fully extended haplotypes (or haplotypes extended to the snarl's sink) + // will be added to finished_haplotypes. + vector> finished_haplotypes; + + // In addition, we need to put the new handle into to_search, because a path may have + // started on the new handle (which means we need to start a searchstate there.) + unordered_set to_search; + + // We don't need to ever check the sink handle, since paths from the sink handle + // extend beyond snarl. + handle_t sink_handle = haploGraph.get_handle(sink_id); + touched_handles.erase(sink_handle); + + // Create nested function for making a new_search: + auto make_new_search = [&](handle_t handle) { + cerr << "lambda" << endl; + + // Are there any new threads starting at this handle? + gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(handle)); + // if (new_search != gbwt::SearchState()){ + if (!new_search.empty()){ + //TODO: Debug code: are searchstates empty? + cerr << "apparently new thread starts at node: " << haploGraph.get_id(handle) << endl; + cerr << "is the searchstate empty? " << new_search.empty() << " size: " << new_search.size() << endl; + // Then add them to haplotype_queue. + haploGraph.follow_paths(new_search, [&](const gbwt::SearchState& next_search) -> bool { + + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + /// check to make sure that the thread isn't already finished: + // if next_handle is the sink, or if this thread is only one handle long, + // then there isn't any useful string to extract from this. + if (next_handle != sink_handle || next_search == gbwt::SearchState()){ + // establish a new thread to walk along. + vector new_path; + new_path.push_back(handle); + new_path.push_back(next_handle); + + pair, gbwt::SearchState > mypair = make_pair(new_path, next_search); + + + // add the new path to haplotype_queue to be extended. 
+ haplotype_queue.push_back(make_pair(new_path, next_search)); + + // if next_handle hasn't been checked for starting threads, add to to_search. + if (touched_handles.find(next_handle) == touched_handles.end()){ + to_search.emplace(next_handle); + } + } + return true; + }); + } + }; + + // TODO: Debug code: Search every handle in touched handles for haplotypes starting at that point. + // for (handle_t handle : touched_handles){ + // cerr << "isn't a source handle: " << haploGraph.get_sequence(handle) << endl; + // make_new_search(handle); + // } + + /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. + /// Then, check to see if there are any new threads on handles in to_search. + /// Extend those threads, and add any newly found handles to to_search, + /// then search for threads again in to_search again... repeat until to_search remains + /// emptied of new handles. + + // for tracking whether the haplotype thread is still extending: + bool still_extending; + + // TODO: Debug code: did we find any haplotypes that need extending? + // cerr << "haps need extending below:" << endl; + // for (auto handle : to_search){ + // cerr << "hap needs extending: " << haploGraph.get_id(handle) << " " << haploGraph.get_sequence(handle) << endl; + // } + // cerr << "haps queue:" << endl; + // for (auto hap : haplotype_queue){ + // handle_t handle = haploGraph.node_to_handle(hap.second.node); + // cerr << "need to search hap: " << haploGraph.get_id(handle) << " " << haploGraph.get_sequence(handle) << endl; + // } + // extend haplotypes on any nodes found to act as a starting thread. 
+ while(!to_search.empty() || !haplotype_queue.empty()){ + while (!haplotype_queue.empty()){ + cerr << "extend haplotype_queue" << endl; + + // get a haplotype to extend out of haplotype_queue - a tuple of (handles_traversed_so_far, last_touched_SearchState) + pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. + vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + for (gbwt::SearchState next_search: next_searches){ + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + // if next_search is empty, then we've fallen off the thread, + // and cur_haplotype can be placed in finished_haplotypes as is for this thread. + if (next_search == gbwt::SearchState()){ + + finished_haplotypes.push_back(cur_haplotype.first); + + } + // if next_search is on the sink_handle, + // then cur_haplotype.first + next_search goes to finished_haplotypes. + else if (haploGraph.get_id(next_handle) == sink_id){ + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + //add next_handle + next_handle_vec.push_back(next_handle); + //place in finished_haplotypes + finished_haplotypes.push_back(next_handle_vec); + + // also, if next_handle hasn't been checked for new threads, add to to_search. + if (touched_handles.find(next_handle) != touched_handles.end()){ + to_search.emplace(next_handle); + } + + } + // otherwise, just place an extended cur_haplotype in haplotype_queue. 
+ else { + + // copy over cur_haplotype: + pair< vector, gbwt::SearchState> cur_haplotype_copy = cur_haplotype; + //modify with next_handle/search + cur_haplotype_copy.first.push_back(next_handle); + cur_haplotype_copy.second = next_search; + // place back in haplotype_queue for further extension. + haplotype_queue.push_back(cur_haplotype_copy); + + // also, if next_handle hasn't been checked for new threads, add to to_search. + if (touched_handles.find(next_handle) != touched_handles.end()){ + to_search.emplace(next_handle); + } + + } + } + + + + } + // Then, make more new_searches from the handles in to_search. + for (handle_t handle : to_search){ + make_new_search(handle); // will add to haplotype_queue if there's any new_searches to be had. + } + to_search.clear(); + + } + return finished_haplotypes; +} + + +//TODO: make return a vector> instead, then convert using separate fxn. +// Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. +// Only walks along embedded paths. Returns a map with string keys and values of vectors of handles, +// where each vector of handles represents one path from source to sink. +// alternative function return: +//unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ +vector get_embedded_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ + unordered_map > paths; + unordered_map multiple_occurrences; + + // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. + graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { + // Each occurrence represents an embedded path + // (note - in the case of a looped path, there will be multiple occurrences for one path.) + // For each path represented by an occurrence, we need to walk along the path until we reach + // the sink node. 
That series of handles represents the the sequence of the path. + + string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); + if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) + + //record this in multiple_occurrences, and get the number of times we've seen this occurrence. + int occ_num; + if (multiple_occurrences.find(path) == multiple_occurrences.end()){ + occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. + multiple_occurrences[path] = occ_num; + } else { + occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. + } + + //record the other occurrences with an added identifier to differentiate between paths. + paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); + } + else{ // this is the first time we've encountered this occurrence. + paths[path].emplace_back(occurrence); + } + }); + + //Now, for every occurrence, walk along the path until we reach the sink. + for (pair > path : paths){ + // cerr << "my name" << path.first << endl; + // cerr << "my occurences:" << endl; + // for (auto occ : path.second) { + // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; + // } + // cerr << "testing get_next_occurrence:" << endl; + // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); + // cerr << cur_id; + + // cur_occurence is the current handle while walking along the path + occurrence_handle_t cur_occurrence = path.second.back(); + id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); + // store the path in paths, in the occurrence_handle_t vector. 
+ while (cur_id != graph.get_id(sink_handle)){ + paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); + // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); + cur_occurrence = paths[path.first].back(); + // cur_occurrence = path.second.back(); + cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); + cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; + } + path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); + cerr << path.second.size() << endl; + for (auto handle : path.second) { + cerr << graph.get_sequence(graph.get_occurrence(handle)); + } + } + cerr << "havin' issues here?" << endl; + for (auto path : paths) { + for (auto handle : path.second) { + cerr << graph.get_sequence(graph.get_occurrence(handle)); + } + } + // Resolve multiple_occurrences by identifying which entry in paths + // (of those part of the same path) is longest - that will + // represent the full breadth of the path through the snarl. 
+ for (pair element : multiple_occurrences){ + // A vector of all the path entries in paths: + vector same_path_names = {element.first}; + + int max_len = paths[element.first].size(); + string max_path = element.first; + + for (int occ_num : range_vector(element.second)){ + occ_num++; // we actually need range_vector[1, ..., end()] + string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; + int cur_len = paths[cur_path].size(); + same_path_names.push_back(cur_path); + + if (cur_len > max_len){ + max_len = cur_len; + max_path = cur_path; + } + } + + // get rid of the smaller fragments of path: + for (string name : same_path_names) { + if (name != max_path){ + paths.erase(name); + } + } + } + vector path_strings; + // get just the strings from the unordered_map > paths object: + for (auto path : paths) { + string path_string; + for (auto handle : path.second) { + path_string += graph.get_sequence(graph.get_occurrence(handle)); + } + path_strings.push_back(path_string); + } + return path_strings; +} + +VG align_haplotypes(const vector& source_to_sink_haplotypes){ + seqan::Align align; // create multiple_sequence_alignment object + + seqan::resize(rows(align), source_to_sink_haplotypes.size()); + for (int i = 0; i < source_to_sink_haplotypes.size(); ++i){ + assignSource(row(align, i), source_to_sink_haplotypes[i].c_str()); + } + + globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); + + stringstream ss; + ss << align; + MSAConverter myMSAConverter = MSAConverter(); + myMSAConverter.load_alignments(ss, "seqan"); + VG snarl = myMSAConverter.make_graph(); + snarl.clear_paths(); + + + // snarl.serialize_to_ostream(cerr); + return snarl; +} + +// ------------------------------ DEBUG CODE BELOW: ------------------------------------------ + +vector debug_graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ + SubHandleGraph snarl = debug_extract_subgraph(graph, start_id, end_id); + + unordered_map> sequences; + 
vector sinks; + unordered_map count; + count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl + sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl + + // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. + snarl.for_each_handle([&](const handle_t& handle) { + bool is_source = true, is_sink = true; + snarl.follow_edges(handle, true, [&](const handle_t& prev) { + is_source = false; + return false; + }); + snarl.follow_edges(handle, false, [&](const handle_t& next) { + is_sink = false; + return false; + }); + + // base case for dynamic programming + if (is_source) { + count[handle] = 1; + sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? + } + if (is_sink) { + sinks.emplace_back(handle); + } + }); + + + // count walks by dynamic programming + bool overflowed = false; + for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { + size_t count_here = count[handle]; + vector seqs_here = sequences[handle]; + + snarl.follow_edges(handle, false, [&](const handle_t& next) { + + size_t& count_next = count[next]; + string seq_next = snarl.get_sequence(next); + + if (numeric_limits::max() - count_here < count_next) { + overflowed = true; + } + + else { + count_next += count_here; + // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ + for (string seq : seqs_here){ + sequences[next].push_back(seq + seq_next); + } + // cerr << "next_seqs: "; + // for (string seq : sequences[next]){ + // cerr << seq << endl; + // } + } + }); + ///TODO: figure out how to deal with overflow. 
+ // if (overflowed) { + // return numeric_limits::max(); + // } + } + + // total up the walks at the sinks + size_t total_count = 0; + for (handle_t& sink : sinks) { + total_count += count[sink]; + } + + // all the sequences at the sinks will be all the sequences in the snarl. + vector walks; + for (handle_t& sink : sinks) { + for (string seq : sequences[sink]){ + walks.push_back(seq); + } + } + + return walks; +} + +// given a start and end node id, construct an extract subgraph between the two nodes (inclusive). +// TODO: change the arguments to handles, which contain orientation within themselves. +// That way, iteration to extract the subgraph will have direction contained within themselves. +// This may actually end up looking like simply parsing an input text file with the handles +// described from the find_snarl output. +SubHandleGraph debug_extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ + /// make a subgraph containing only nodes of interest. (e.g. a snarl) + // make empty subgraph + SubHandleGraph subgraph = SubHandleGraph(&graph); + + unordered_set visited; // to avoid counting the same node twice. + unordered_set to_visit; // nodes found that belong in the subgraph. + + // TODO: how to ensure that "to the right" of start_handle is the correct direction? 
+ // initialize with start_handle (because we move only to the right of start_handle): + handle_t start_handle = graph.get_handle(start_id); + subgraph.add_handle(start_handle); + visited.insert(graph.get_id(start_handle)); + + // look only to the right of start_handle + graph.follow_edges(start_handle, false, [&](const handle_t& handle){ + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + + /// explore the rest of the snarl: + while (to_visit.size() != 0) { + // remove cur_handle from to_visit + unordered_set::iterator cur_index = to_visit.begin(); + handle_t cur_handle = graph.get_handle(*cur_index); + + to_visit.erase(cur_index); + + /// visit cur_handle + visited.insert(graph.get_id(cur_handle)); + + subgraph.add_handle(cur_handle); + + if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! + // look for all nodes connected to cur_handle that need to be added + // looking to the left, + graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + // looking to the right, + graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + } + } + return subgraph; +} + + +} + + + + +/* +Misc. //todo's: + do I need to fix the fact that find_haplotypes_not_at_source runs forever when given + a non-snarl? + + TODO: make it so that gbwt file is customized by user rather than hardcoded. + + TODO: make the demo_0 argument into a better name. + + TODO: make it so that you pass the gbwt file directory to a one-liner function that + TODO: generates gbwt graph, extracts haps, aligns haps, and reintegrates haps. 
+ TODO: (eventually will do it for every snarl in the given graph). +*/ + + + + + + +/// JUNK: +//TODO: fix the clean_snarl_from_haplotypes fxn to properly combine partial and full alignments. +//TODO: make sure that I'm inserting all reference haplotypes in the spot that I wantd +//TODO: (Now that I've converted depth_first fxn return value to a pair.) +// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, +// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. +// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ +// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings +// //representing all possible walks through the snarl: +// vg::handle_t source_handle = graph.get_handle(source_id); +// vg::handle_t sink_handle = graph.get_handle(sink_id); + +// vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); +// cerr << "finished depth_first, now on to reference." << endl; +// vector reference = get_paths(graph, source_handle, sink_handle); + +// haplotypes.insert(end(haplotypes), begin(reference), end(reference)); + +// //Make a new snarl from walks: +// VG new_snarl = strings_to_graph(haplotypes); + +// integrate_snarl(graph, new_snarl, source_id, sink_id); + +// } + + +//Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. +//However, is modified to include all haplotypes inside the source/sink handles, +//even ones that don't include the source or sink handles. 
+ + diff --git a/src/algorithms/0_draft_haplotype_realignment.hpp b/src/algorithms/0_draft_haplotype_realignment.hpp new file mode 100644 index 00000000000..8db1514f97c --- /dev/null +++ b/src/algorithms/0_draft_haplotype_realignment.hpp @@ -0,0 +1,28 @@ +/* +Robin Rounthwaite +Find function call in ./subcommand/main.cpp +*/ +#include +#include "../vg.hpp" +#include "../handle.hpp" +#include "../subgraph.hpp" +#include "count_walks.hpp" +#include "../gbwt_helper.hpp" + +namespace vg { + void align_haplotypes(const GBWTGraph& haploGraph, const pair< vector< vector >, vector< vector > >& haplotypes); + + pair< vector< vector >, vector< vector > > extract_haplotypes(const GBWTGraph& graph, const id_t& source_id, const id_t& sink_id); + + vector> find_haplotypes_not_at_source(const GBWTGraph& haploGraph, unordered_set& touched_handles, const id_t& sink_id); + + vector< string > format_handle_haplotypes_to_strings(const GBWTGraph& haploGraph, const vector< vector< handle_t > >& haplotype_handle_vectors); + + vector get_embedded_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle); + + VG align_haplotypes(const vector& walks); + + vector debug_graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id); + + SubHandleGraph debug_extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); +} diff --git a/src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp b/src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp new file mode 100644 index 00000000000..12893191566 --- /dev/null +++ b/src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp @@ -0,0 +1,1018 @@ +/* +Misc. //todo's: + do I need to fix the fact that find_haplotypes_not_at_source runs forever when given + a non-snarl? 
+*/ + +#pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? +#include "0_demo_final_0.hpp" +#include +#include "../vg.hpp" +#include "../handle.hpp" +#include "../subgraph.hpp" +#include "count_walks.hpp" +#include +#include +#include +#include "../msa_converter.hpp" +#include "../snarls.hpp" +#include "../gbwt_helper.hpp" +#include "../stream/vpkg.hpp" +#include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? + +namespace vg { + +//TODO: fix the clean_snarl_from_haplotypes fxn to properly combine partial and full alignments. +//TODO: make sure that I'm inserting all reference haplotypes in the spot that I wantd +//TODO: (Now that I've converted depth_first fxn return value to a pair.) +// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, +// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. +// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ +// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings +// //representing all possible walks through the snarl: +// vg::handle_t source_handle = graph.get_handle(source_id); +// vg::handle_t sink_handle = graph.get_handle(sink_id); + +// vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); +// cerr << "finished depth_first, now on to reference." << endl; +// vector reference = get_paths(graph, source_handle, sink_handle); + +// haplotypes.insert(end(haplotypes), begin(reference), end(reference)); + +// //Make a new snarl from walks: +// VG new_snarl = strings_to_graph(haplotypes); + +// integrate_snarl(graph, new_snarl, source_id, sink_id); + +// } + +// Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. +// Only walks along embedded paths. 
Returns a map with string keys and values of vectors of handles, +// where each vector of handles represents one path from source to sink. +// alternative function return: +//unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ +vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ + unordered_map > paths; + unordered_map multiple_occurrences; + + // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. + graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { + // Each occurrence represents an embedded path + // (note - in the case of a looped path, there will be multiple occurrences for one path.) + // For each path represented by an occurrence, we need to walk along the path until we reach + // the sink node. That series of handles represents the the sequence of the path. + + string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); + if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) + + //record this in multiple_occurrences, and get the number of times we've seen this occurrence. + int occ_num; + if (multiple_occurrences.find(path) == multiple_occurrences.end()){ + occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. + multiple_occurrences[path] = occ_num; + } else { + occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. + } + + //record the other occurrences with an added identifier to differentiate between paths. + paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); + } + else{ // this is the first time we've encountered this occurrence. 
+ paths[path].emplace_back(occurrence); + } + }); + + //Now, for every occurrence, walk along the path until we reach the sink. + for (pair > path : paths){ + // cerr << "my name" << path.first << endl; + // cerr << "my occurences:" << endl; + // for (auto occ : path.second) { + // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; + // } + // cerr << "testing get_next_occurrence:" << endl; + // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); + // cerr << cur_id; + + // cur_occurence is the current handle while walking along the path + occurrence_handle_t cur_occurrence = path.second.back(); + id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); + // store the path in paths, in the occurrence_handle_t vector. + while (cur_id != graph.get_id(sink_handle)){ + paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); + // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); + cur_occurrence = paths[path.first].back(); + // cur_occurrence = path.second.back(); + cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); + cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; + } + path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); + cerr << path.second.size() << endl; + for (auto handle : path.second) { + cerr << graph.get_sequence(graph.get_occurrence(handle)); + } + } + cerr << "havin' issues here?" << endl; + for (auto path : paths) { + for (auto handle : path.second) { + cerr << graph.get_sequence(graph.get_occurrence(handle)); + } + } + // Resolve multiple_occurrences by identifying which entry in paths + // (of those part of the same path) is longest - that will + // represent the full breadth of the path through the snarl. 
+ for (pair element : multiple_occurrences){ + // A vector of all the path entries in paths: + vector same_path_names = {element.first}; + + int max_len = paths[element.first].size(); + string max_path = element.first; + + for (int occ_num : range_vector(element.second)){ + occ_num++; // we actually need range_vector[1, ..., end()] + string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; + int cur_len = paths[cur_path].size(); + same_path_names.push_back(cur_path); + + if (cur_len > max_len){ + max_len = cur_len; + max_path = cur_path; + } + } + + // get rid of the smaller fragments of path: + for (string name : same_path_names) { + if (name != max_path){ + paths.erase(name); + } + } + } + vector path_strings; + // get just the strings from the unordered_map > paths object: + for (auto path : paths) { + string path_string; + for (auto handle : path.second) { + path_string += graph.get_sequence(graph.get_occurrence(handle)); + } + path_strings.push_back(path_string); + } + return path_strings; +} + +vector> find_haplotypes_not_at_source(const GBWTGraph haploGraph, unordered_set touched_handles, const id_t& sink_id){ + cerr << "finding haplotypes not at source!" << endl; + /// Search every handle in touched handles for haplotypes starting at that point. + // Any new haplotypes will be added to haplotype_queue. + vector, gbwt::SearchState>> haplotype_queue; + + // Fully extended haplotypes (or haplotypes extended to the snarl's sink) + // will be added to finished_haplotypes. + vector> finished_haplotypes; + + // In addition, we need to put the new handle into to_search, because a path may have + // started on the new handle (which means we need to start a searchstate there.) + unordered_set to_search; + + // We don't need to ever check the sink handle, since paths from the sink handle + // extend beyond snarl. 
+ handle_t sink_handle = haploGraph.get_handle(sink_id); + touched_handles.erase(sink_handle); + + // Create nested function for making a new_search: + auto make_new_search = [&](handle_t handle) { + cerr << "lambda" << endl; + + // Are there any new threads starting at this handle? + gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(handle)); + if (new_search != gbwt::SearchState()){ //TODO: this is the "null" version of SearchState, right? + + // Then add them to haplotype_queue. + haploGraph.follow_paths(new_search, [&](const gbwt::SearchState& next_search) -> bool { + + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + /// check to make sure that the thread isn't already finished: + // if next_handle is the sink, or if this thread is only one handle long, + // then there isn't any useful string to extract from this. + if (next_handle != sink_handle || next_search == gbwt::SearchState()){ + // establish a new thread to walk along. + vector new_path; + new_path.push_back(handle); + new_path.push_back(next_handle); + + pair, gbwt::SearchState > mypair = make_pair(new_path, next_search); + + + // add the new path to haplotype_queue to be extended. + haplotype_queue.push_back(make_pair(new_path, next_search)); + + // if next_handle hasn't been checked for starting threads, add to to_search. + if (touched_handles.find(next_handle) == touched_handles.end()){ + to_search.emplace(next_handle); + } + } + return true; + }); + } + }; + + cerr << "1" << endl; + // Search every handle in touched handles for haplotypes starting at that point. + for (handle_t handle : touched_handles){ + cerr << "in not_at_source: " << haploGraph.get_sequence(handle) << endl; + make_new_search(handle); + } + + /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. + /// Then, check to see if there are any new threads on handles in to_search. 
+ /// Extend those threads, and add any newly found handles to to_search, + /// then search for threads again in to_search again... repeat until to_search remains + /// emptied of new handles. + + // for tracking whether the haplotype thread is still extending: + bool still_extending; + cerr << "2" << endl; + while(!to_search.empty() && !haplotype_queue.empty()){ + while (!haplotype_queue.empty()){ + cerr << "extend haplotype_queue" << endl; + + // get a haplotype to extend out of haplotype_queue - a tuple of (handles_traversed_so_far, last_touched_SearchState) + pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. + vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + for (gbwt::SearchState next_search: next_searches){ + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + // if next_search is empty, then we've fallen off the thread, + // and cur_haplotype can be placed in finished_haplotypes as is for this thread. + if (next_search == gbwt::SearchState()){ + + finished_haplotypes.push_back(cur_haplotype.first); + + } + // if next_search is on the sink_handle, + // then cur_haplotype.first + next_search goes to finished_haplotypes. + else if (haploGraph.get_id(next_handle) == sink_id){ + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + //add next_handle + next_handle_vec.push_back(next_handle); + //place in finished_haplotypes + finished_haplotypes.push_back(next_handle_vec); + + // also, if next_handle hasn't been checked for new threads, add to to_search. 
+ if (touched_handles.find(next_handle) != touched_handles.end()){ + to_search.emplace(next_handle); + } + + } + // otherwise, just place an extended cur_haplotype in haplotype_queue. + else { + + // copy over cur_haplotype: + pair< vector, gbwt::SearchState> cur_haplotype_copy = cur_haplotype; + //modify with next_handle/search + cur_haplotype_copy.first.push_back(next_handle); + cur_haplotype_copy.second = next_search; + // place back in haplotype_queue for further extension. + haplotype_queue.push_back(cur_haplotype_copy); + + // also, if next_handle hasn't been checked for new threads, add to to_search. + if (touched_handles.find(next_handle) != touched_handles.end()){ + to_search.emplace(next_handle); + } + + } + } + + + + } + // Then, make more new_searches from the handles in to_search. + for (handle_t handle : to_search){ + make_new_search(handle); // will add to haplotype_queue if there's any new_searches to be had. + } + to_search.clear(); + + } + return finished_haplotypes; +} + + + + + + +//TODO: does GBWTgraphs have names associated with haplotypes? +//TODO: If so, I should change return value to an unordered map with key haplotype name +//TODO: and value vector of all handles in haplotype (also, rename fxn). + +//Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. +//However, is modified to include all haplotypes inside the source/sink handles, +//even ones that don't include the source or sink handles. +//Returns: a vector of strings representing all paths reaching from source to sink in the snarl, +// and a vector of strings representing all other paths in the snarl (e.g. any that don't +// reach both source and sink in the graph.) +pair, vector> depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id){ + cerr << "depth first begins!" 
<< endl; + + + ///GBWT graph construction stuff that belongs in mod_main: + ifstream gbwt_stream; + //TODO: make it so that gbwt file is customized by user rather than hardcoded. + // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; + string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new_2.gbwt"; + gbwt_stream.open(gbwt_name); + + unique_ptr gbwt; + // Load the GBWT from its container + gbwt = stream::VPKG::load_one(gbwt_stream); + GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); +// ----------------------------------------------------------------------------------------- + /// Perform depth first search, where whenever the search reaches sink_handle, convert + /// vector of handles to string (should be equivalent to haplotype). + //TODO: somehow note/account for how recording of haplotype will be terminated the first time it touches the sink_handle - + //TODO: this function currently doesn't account for if it loops back. + + + + + + + + + // cerr << endl << "initial test" << endl << endl; + // handle_t test_handle = haploGraph.get_handle(23495); + + // cerr << "got the test handle?" << haploGraph.get_id(test_handle) << haploGraph.get_sequence(test_handle) << endl; + // cerr << "\n follow edges!" << endl; + // haploGraph.follow_edges(test_handle, true, [&](const handle_t next_handle) { + // cerr << "next handles: " << haploGraph.get_sequence(next_handle) << haploGraph.get_id(next_handle) << endl; + // }); + // cerr << "end follow edges!\n " << endl; + + // cerr << "\n follow paths!" 
<< endl; + // haploGraph.follow_paths(haploGraph.get_state(test_handle), [&](const gbwt::SearchState next_state) -> bool { + // handle_t next_handle = haploGraph.node_to_handle(next_state.node); + // cerr << "next handles: " << haploGraph.get_sequence(next_handle) << haploGraph.get_id(next_handle) << endl; + // return true; + // }); + // cerr << "end follow paths!\n " << endl; + + // cerr << "\n how many paths overlap test_state?" << endl; + // gbwt::SearchState test_state = haploGraph.get_state(test_handle); + // cerr << "state.size 23495" << test_state.size() << endl; + // handle_t test_handle_2 = haploGraph.get_handle(23494); + // gbwt::SearchState test_state_2 = haploGraph.get_state(test_handle_2); + // cerr << "state.size 23494" << test_state.size() << endl; + + // handle_t test_handle_3 = haploGraph.get_handle(23493); + // gbwt::SearchState test_state_3 = haploGraph.get_state(test_handle_3); + // cerr << "state.size 23493" << test_state.size() << endl; + + // cerr << "rightmost node. (not just 'node to the right'?" << haploGraph.get_sequence(haploGraph.node_to_handle(test_state.node)) << endl; + // cerr << "end test battery" << endl; + + + + + + //touched_handles contains all handles that have been touched by the depth_first_search, + //for later use in other_haplotypes_to_strings, which identifies paths that didn't stretch + //from source to sink in the snarl. + unordered_set touched_handles; + + //haplotype_queue contains all started exon_haplotypes not completed yet. + //Every time we encounter a branch in the paths, the next node down the path + //Is stored here, along with the vector of handles that represents the path up + //to the SearchState. + vector< pair< vector, gbwt::SearchState> > haplotype_queue; + + // source and sink handle for haploGraph: + handle_t source_handle = haploGraph.get_handle(source_id); + handle_t sink_handle = haploGraph.get_handle(sink_id); + + //place source in haplotype_queue. 
+ vector source_handle_vec(1, source_handle); + gbwt::SearchState source_state = haploGraph.get_state(source_handle); + haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); + touched_handles.emplace(source_handle); + + //haplotypes contains all "finished" haplotypes - those that were either walked + //to their conclusion, or until they reached the sink. + vector< vector > source_to_sink_haplotype_handle_vecs; + vector< vector > source_without_sink_haplotype_handle_vecs; + + // for every partly-extracted thread, extend the thread until it either reaches + // the sink of the snarl or the end of the thread. + while (!haplotype_queue.empty()) { + + + + + + + + + + // cerr << "iteration! with haplotype_queue:" << endl; + + // for(auto hap : haplotype_queue) { + // cerr << "here's a hap" << endl; + // for(auto handle : hap.first){ + // cerr << haploGraph.get_sequence(handle) << " " << haploGraph.get_id(handle) << " "; + // } + // cerr << endl; + // } + + + + + + + + + + // get a haplotype out of haplotype_queue to extend - + // a tuple of (handles_traversed_so_far, last_touched_SearchState) + pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + haplotype_queue.pop_back(); + + + + + + // cerr << "\n\n follow edges!" << endl; + // haploGraph.follow_edges(cur_haplotype.first.back(), false, [&](const handle_t next_handle) { + // cerr << "next handles: " << haploGraph.get_sequence(next_handle) << haploGraph.get_id(next_handle) << endl; + // }); + // cerr << "end follow edges!\n\n " << endl; + + + + // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. + vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState next_search) -> bool { + // cerr << "this node immediately follows cur_haplotypes current search_state." 
<< haploGraph.get_sequence(haploGraph.node_to_handle(next_search.node)) << haploGraph.get_id(haploGraph.node_to_handle(next_search.node)) << endl; + next_searches.push_back(next_search); + return true; + }); + + // if next_searches > 1, then we need to make multiple new haplotypes to be recorded in haplotype_queue + // or one of the finished haplotype_handle_vecs. + if (next_searches.size() > 1){ + + // for every next_search in next_searches, either create a new, extended cur_haplotype to push into haplotype queue, + // or place in the source_to_sink_haplotype_handle_vecs if haplotype extends to sink, + // or place in the source_without_sink_haplotype_handle_vecs if haplotype ends before reaching sink. + for (gbwt::SearchState next_search : next_searches){ + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + + // add the new handle to the vec: + next_handle_vec.push_back(next_handle); + + // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs + if (haploGraph.get_id(next_handle) == sink_id){ + cerr << "anpparently next handle of node " << haploGraph.get_id(cur_haplotype.first.back()) << " is sink" << endl; + source_to_sink_haplotype_handle_vecs.push_back(next_handle_vec); + + } else { // keep extending the haplotype! + + pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); + haplotype_queue.push_back(next_haplotype); + + } + + //next_handle will be touched. + touched_handles.emplace(next_handle); + } + + } // if next_searches is empty, the path has ended but not reached sink. + else if ( next_searches.empty() ) { + + cerr << "next_searches is empty" << endl; + + // We have reached the end of the path, but it doesn't reach the sink. + // we need to add cur_haplotype to source_without_sink_haplotype_handle_vecs. 
+ source_without_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); + + } // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs + else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { + cerr << "next_searches is sink" << endl; + + // Then we need to add cur_haplotype + next_search to source_to_sink_haplotype_handle_vecs. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + cur_haplotype.first.push_back(next_handle); + source_to_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); + + //touched next_search's handle + touched_handles.emplace(next_handle); + + } //else, there is just one next_search, and it's not the end of the path. + //just extend the search by adding (cur_haplotype + next_search to haplotype_queue. + else { + cerr << "normal extend" << endl; + + // get the next_handle from the one next_search. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + + cerr << "this is next_handle" << haploGraph.get_id(next_handle) << endl; + + // modify cur_haplotype with next_handle and next_search. + cur_haplotype.first.push_back(next_handle); + cur_haplotype.second = next_searches.back(); // there's only one next_search in next_searches. + + // put cur_haplotype back in haplotype_queue. + haplotype_queue.push_back(cur_haplotype); + touched_handles.emplace(next_handle); + + } + + } + + for (auto handle : touched_handles){ + cerr << "\n\n\nin starting at source " << haploGraph.get_sequence(handle) << endl; + } + + //TODO: make following code into a separate function - the convert_handle_vec_to_strings, or something. 
+ //Now, transform the each vector of handles in source_to_sink_haplotype_handle_vecs + //into a string, and return as a vector of strings + vector source_to_sink_haplotype_strings; + // for (vector vector_hap : source_to_sink_haplotype_handle_vecs){ + // string hap; + // for (handle_t& handle : vector_hap){ + // hap += haploGraph.get_sequence(handle); + // } + // source_to_sink_haplotype_strings.push_back(hap); + // } + + //Find any haplotypes starting from handles not starting at the source, but which + //still start somewhere inside the snarl. + vector> haplotypes_not_starting_at_source = find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); + + //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. + vector other_haplotype_strings; + for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ + string hap; + for (handle_t& handle : vector_hap){ + hap += haploGraph.get_sequence(handle); + } + other_haplotype_strings.push_back(hap); + } + + //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. + for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ + string hap; + for (handle_t& handle : vector_hap){ + hap += haploGraph.get_sequence(handle); + } + other_haplotype_strings.push_back(hap); + } + + return make_pair(source_to_sink_haplotype_strings, other_haplotype_strings); +} + + + + + + + + + + + + + + + + +//TODO: delete this function once I've decided I don't want it anymore. Should be replaced with (renamed) depth_first_haplotypes_to_strings. +// Pull out each haplotype passing through a snarl (defined by source_id and sink_id) as a string. 
+vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ + + ///stuff that will go in mod_main: + ifstream gbwt_stream; + string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; + gbwt_stream.open(gbwt_name); + + unique_ptr gbwt; + // Load the GBWT from its container + gbwt = stream::VPKG::load_one(gbwt_stream); + +// ----------------------------------------------------------------- + /// make subgraph for the snarl: + + // graph.for_each_handle([&] (const handle_t& handle)-> bool{ + // cerr << "test for graph "; + // cerr << graph.get_id(handle) << endl; + // return true; + // }); + + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + + // snarl.for_each_handle_impl([&] (const handle_t& handle)-> bool{ + // cerr << "test for snarl "; + // cerr << snarl.get_id(handle) << endl; + // return true; + // }); + // cerr << "before 1 \n"; + + // GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); //TODO: figure out how to prevent error msg here. + GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); + // cerr << "after 1 \n"; + + // cerr << "before \n"; + // haploGraph.for_each_handle([&] (const handle_t& handle)-> bool{ + // cerr << "test for haploGraph "; + // cerr << haploGraph.get_id(handle) << endl; + // return true; + // }); + // cerr << "after \n"; + + + //TODO:identify source and sinks for troubleshooting! 
+ unordered_map> sequences; // will contain all haplotype walks through snarl + handle_t source_handle = haploGraph.get_handle(source_id); + sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); + + for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { + + vector seqs_here = sequences[handle]; + gbwt::SearchState cur_state = haploGraph.get_state(handle); + + // id_t cur_id = haploGraph.get_id(handle); + // cerr << "cur_id" << cur_id << endl; + + haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { + handle_t next_handle = GBWTGraph::node_to_handle(next_search.node); + + id_t next_id = haploGraph.get_id(next_handle); + cerr << "next_id" << next_id << endl; + + string next_seq = haploGraph.get_sequence(next_handle); + // transfer the sequences for the preceding handle to next_handle's sequences, + // plus the new handle's sequence. + for (string seq : seqs_here){ + sequences[next_handle].push_back(seq + next_seq); + } + return true; + + + }); + } + + // all the sequences at the sinks will be all the sequences in the snarl. + handle_t sink_handle = haploGraph.get_handle(sink_id); + return sequences[sink_handle]; + // vector testVec; + // return testVec; +} + +//Iterate over all snarls in a graph, and run clean_snarl on it. +void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ + SnarlManager* snarl_manager = new SnarlManager(snarl_stream); + +/* Use this code to count number of snarls in graph. 
+* int top_count = 0; +* for (const Snarl* snarl : snarl_manager->top_level_snarls()){ +* top_count++; +* } +* cerr << "number of top_level snarls in graph: " << top_count << endl; +* +* int general_count = 0; +* snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ +* general_count++; +* }); +* cerr << "number of total snarls in graph: " << general_count << endl; +*/ + + + vector snarl_roots = snarl_manager->top_level_snarls(); + for (auto roots : snarl_roots){ + clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); + } + + delete snarl_manager; + + +} + +// Given a graph and a start_id and end_id representing the beginning and end of the snarl, +// replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. +void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ + //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings + //representing all possible walks through the snarl: + vector walks = graph_to_strings(graph, start_id, end_id); + + //Make a new snarl from walks: + VG new_snarl = strings_to_graph(walks); + + integrate_snarl(graph, new_snarl, start_id, end_id); + +} + +// Given a larger graph and a (usually cleaned snarl) subgraph, integrate new_snarl into the graph at start_id and end_id. +void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id){ + //Get old graph snarl + SubHandleGraph graph_snarl = extract_subgraph(graph, start_id, end_id); + + //Identify old and new snarl start and sink + pair, vector> graph_snarl_defining_handles = get_sources_and_sinks(graph_snarl); + pair, vector> new_snarl_defining_handles = get_sources_and_sinks(new_snarl); + + //Check to make sure that newly made snarl has only one start and end. 
+ if(new_snarl_defining_handles.first.size() > 1 || new_snarl_defining_handles.second.size() > 1){ + cerr << "newly made snarl with more than one start or end. # of starts: " << new_snarl_defining_handles.first.size() << " # of ends: " << new_snarl_defining_handles.second.size() << endl; + return; + } + //extract old and new snarl start and sink: + handle_t new_snarl_start = new_snarl_defining_handles.first[0]; + handle_t new_snarl_end = new_snarl_defining_handles.second[0]; + + handle_t graph_snarl_start = graph_snarl_defining_handles.first[0]; + handle_t graph_snarl_end = graph_snarl_defining_handles.second[0]; + + ///Replace start and end handles of old graph snarl with new_snarl start and end, and delete + ///rest of old graph snarl. + + //Get everything needed to replace graph start and sink. + string new_start_seq = new_snarl.get_sequence(new_snarl_start); + string new_end_seq = new_snarl.get_sequence(new_snarl_end); + id_t new_start_id = graph.get_id(graph_snarl_start); + id_t new_end_id = graph.get_id(graph_snarl_end); + vector left_of_start; + graph.follow_edges(graph_snarl_start, true, [&](const handle_t& handle){ + left_of_start.emplace_back(handle); + }); + vector right_of_end; + graph.follow_edges(graph_snarl_end, false, [&](const handle_t& handle){ + right_of_end.emplace_back(handle); + }); + + //Delete all handles in graph_snarl + graph_snarl.for_each_handle([&](const handle_t& handle){ + graph.destroy_handle(handle); + }, false); + + //Make start and end handles for snarl in graph: + handle_t new_start_handle = graph.create_handle(new_start_seq, new_start_id); + handle_t new_end_handle = graph.create_handle(new_end_seq, new_end_id); + + //Insert start and end handles: + for (handle_t handle : left_of_start) { + graph.create_edge(handle, new_start_handle); + } + for (handle_t handle : right_of_end) { + graph.create_edge(new_end_handle, handle); + } + + ///Reintegrate rest of new_snarl. + //topologically ordered new_snarl. 
As I progress through each node in topo_order, + //I can add all the nodes to the right of the snarl. The final node will be the + //end node, which, instead of adding as a new node to graph, I'll re-connect + //to the modified end_node, above. + vector new_snarl_topo_order = algorithms::lazier_topological_order(&new_snarl); + + //Construct a parallel graph_snarl_topo_order to identify + //paralogous nodes between new_snarl and graph. + vector graph_snarl_topo_order = {new_start_handle}; + + for (auto it = ++new_snarl_topo_order.begin(); it != --new_snarl_topo_order.end(); it++){ + //For every handle in new_snarl, make an (unconnected) handle in graph. + string handle_seq = new_snarl.get_sequence(*it); + handle_t graph_handle = graph.create_handle(handle_seq); + graph_snarl_topo_order.push_back(graph_handle); + } + + graph_snarl_topo_order.push_back(new_end_handle); + + //Connect the rest of the nodes: + for (int i = 0; i < new_snarl_topo_order.size(); i++){ + // cerr << new_snarl.get_id(new_snarl_topo_order[i]) << endl; + + new_snarl.follow_edges(new_snarl_topo_order[i], false, [&](const handle_t& snarl_handle){ + //get topo_index of nodes to be connected to graph start handle + auto it = find(new_snarl_topo_order.begin(), new_snarl_topo_order.end(), snarl_handle); + int topo_index = it - new_snarl_topo_order.begin(); + // cerr << "topo_index" << topo_index << endl; + // cerr << "i" << i << endl; + + //connect graph start handle + graph.create_edge(graph_snarl_topo_order[i], graph_snarl_topo_order[topo_index]); + }); + } + +} + +//Returns tuple of two handles, first being start and second being sink. 
+pair, vector> get_sources_and_sinks(HandleGraph& graph){ + vector sink; + vector source; + + // identify sources and sinks + graph.for_each_handle([&](const handle_t& handle) { + bool is_source = true, is_sink = true; + graph.follow_edges(handle, true, [&](const handle_t& prev) { + is_source = false; + return false; + }); + graph.follow_edges(handle, false, [&](const handle_t& next) { + is_sink = false; + return false; + }); + + // base case for dynamic programming + if (is_source) { + source.push_back(handle); + } + if (is_sink) { + sink.emplace_back(handle); + } + }); + + return pair, vector>(source, sink); + +} + + +VG strings_to_graph(const vector& walks){ + seqan::Align align; // create multiple_sequence_alignment object + + seqan::resize(rows(align), walks.size()); + for (int i = 0; i < walks.size(); ++i){ + assignSource(row(align, i), walks[i].c_str()); + } + + + globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); + + stringstream ss; + ss << align; + MSAConverter myMSAConverter = MSAConverter(); + myMSAConverter.load_alignments(ss, "seqan"); + VG snarl = myMSAConverter.make_graph(); + snarl.clear_paths(); + + + // snarl.serialize_to_ostream(cerr); + return snarl; +} + + + + +vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ + SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); + + unordered_map> sequences; + vector sinks; + unordered_map count; + count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl + sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl + + // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
+ snarl.for_each_handle([&](const handle_t& handle) { + bool is_source = true, is_sink = true; + snarl.follow_edges(handle, true, [&](const handle_t& prev) { + is_source = false; + return false; + }); + snarl.follow_edges(handle, false, [&](const handle_t& next) { + is_sink = false; + return false; + }); + + // base case for dynamic programming + if (is_source) { + count[handle] = 1; + sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? + } + if (is_sink) { + sinks.emplace_back(handle); + } + }); + + + // count walks by dynamic programming + bool overflowed = false; + for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { + size_t count_here = count[handle]; + vector seqs_here = sequences[handle]; + + snarl.follow_edges(handle, false, [&](const handle_t& next) { + + size_t& count_next = count[next]; + string seq_next = snarl.get_sequence(next); + + if (numeric_limits::max() - count_here < count_next) { + overflowed = true; + } + + else { + count_next += count_here; + // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ + for (string seq : seqs_here){ + sequences[next].push_back(seq + seq_next); + } + // cerr << "next_seqs: "; + // for (string seq : sequences[next]){ + // cerr << seq << endl; + // } + } + }); + ///TODO: figure out how to deal with overflow. + // if (overflowed) { + // return numeric_limits::max(); + // } + } + + // total up the walks at the sinks + size_t total_count = 0; + for (handle_t& sink : sinks) { + total_count += count[sink]; + } + + // all the sequences at the sinks will be all the sequences in the snarl. + vector walks; + for (handle_t& sink : sinks) { + for (string seq : sequences[sink]){ + walks.push_back(seq); + } + } + + return walks; +} + + +// given a start and end node id, construct an extract subgraph between the two nodes (inclusive). 
+// TODO: change the arguments to handles, which contain orientation within themselves. +// That way, iteration to extract the subgraph will have direction contained within themselves. +// This may actually end up looking like simply parsing an input text file with the handles +// described from the find_snarl output. +SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ + /// make a subgraph containing only nodes of interest. (e.g. a snarl) + // make empty subgraph + SubHandleGraph subgraph = SubHandleGraph(&graph); + + unordered_set visited; // to avoid counting the same node twice. + unordered_set to_visit; // nodes found that belong in the subgraph. + + // TODO: how to ensure that "to the right" of start_handle is the correct direction? + // initialize with start_handle (because we move only to the right of start_handle): + handle_t start_handle = graph.get_handle(start_id); + subgraph.add_handle(start_handle); + visited.insert(graph.get_id(start_handle)); + + // look only to the right of start_handle + graph.follow_edges(start_handle, false, [&](const handle_t& handle){ + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + + /// explore the rest of the snarl: + while (to_visit.size() != 0) { + // remove cur_handle from to_visit + unordered_set::iterator cur_index = to_visit.begin(); + handle_t cur_handle = graph.get_handle(*cur_index); + + to_visit.erase(cur_index); + + /// visit cur_handle + visited.insert(graph.get_id(cur_handle)); + + subgraph.add_handle(cur_handle); + + if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! 
+ // look for all nodes connected to cur_handle that need to be added + // looking to the left, + graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + // looking to the right, + graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + } + } + return subgraph; +} +} \ No newline at end of file diff --git a/src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp b/src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp new file mode 100644 index 00000000000..4a313e0993a --- /dev/null +++ b/src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp @@ -0,0 +1,964 @@ +// /* +// Misc. //todo's: +// do I need to fix the fact that find_haplotypes_not_at_source runs forever when given +// a non-snarl? +// */ + +// #pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? +// #include "0_demo_final_0.hpp" +// #include +// #include "../vg.hpp" +// #include "../handle.hpp" +// #include "../subgraph.hpp" +// #include "count_walks.hpp" +// #include +// #include +// #include +// #include "../msa_converter.hpp" +// #include "../snarls.hpp" +// #include "../gbwt_helper.hpp" +// #include "../stream/vpkg.hpp" +// #include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? 
+ +// namespace vg { + +// // void print_kmer(const std::vector>&, const std::string& string){ +// // cerr << string << endl; +// // } + +// // vector get_path_strings(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle) { +// // unordered_map > handle_paths get_paths(graph, source_handle, sink_handle); +// // for (auto path : handle_paths) { +// // for (occuhandle : +// // } +// // } + + +// //TODO: fix the clean_snarl_from_haplotypes fxn to properly combine partial and full alignments. +// //TODO: make sure that I'm inserting all reference haplotypes in the spot that I wantd +// //TODO: (Now that I've converted depth_first fxn return value to a pair.) +// // // Given a graph and a start_id and end_id representing the beginning and end of the snarl, +// // // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. +// // void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ +// // //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings +// // //representing all possible walks through the snarl: +// // vg::handle_t source_handle = graph.get_handle(source_id); +// // vg::handle_t sink_handle = graph.get_handle(sink_id); + +// // vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); +// // cerr << "finished depth_first, now on to reference." << endl; +// // vector reference = get_paths(graph, source_handle, sink_handle); + +// // haplotypes.insert(end(haplotypes), begin(reference), end(reference)); + +// // //Make a new snarl from walks: +// // VG new_snarl = strings_to_graph(haplotypes); + +// // integrate_snarl(graph, new_snarl, source_id, sink_id); + +// // } + +// // TODO: test/debug this! +// // Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. +// // Only walks along embedded paths. 
Returns a map with string keys and values of vectors of handles, +// // where each vector of handles represents one path from source to sink. +// // alternative function return: +// //unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ +// vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ +// unordered_map > paths; +// unordered_map multiple_occurrences; + +// // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. +// graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { +// // Each occurrence represents an embedded path +// // (note - in the case of a looped path, there will be multiple occurrences for one path.) +// // For each path represented by an occurrence, we need to walk along the path until we reach +// // the sink node. That series of handles represents the the sequence of the path. + +// string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); +// if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) + +// //record this in multiple_occurrences, and get the number of times we've seen this occurrence. +// int occ_num; +// if (multiple_occurrences.find(path) == multiple_occurrences.end()){ +// occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. +// multiple_occurrences[path] = occ_num; +// } else { +// occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. +// } + +// //record the other occurrences with an added identifier to differentiate between paths. +// paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); +// } +// else{ // this is the first time we've encountered this occurrence. 
+// paths[path].emplace_back(occurrence); +// } +// }); + +// //Now, for every occurrence, walk along the path until we reach the sink. +// for (pair > path : paths){ +// // cerr << "my name" << path.first << endl; +// // cerr << "my occurences:" << endl; +// // for (auto occ : path.second) { +// // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; +// // } +// // cerr << "testing get_next_occurrence:" << endl; +// // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); +// // cerr << cur_id; + +// // cur_occurence is the current handle while walking along the path +// occurrence_handle_t cur_occurrence = path.second.back(); +// id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); +// // store the path in paths, in the occurrence_handle_t vector. +// while (cur_id != graph.get_id(sink_handle)){ +// paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); +// // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); +// cur_occurrence = paths[path.first].back(); +// // cur_occurrence = path.second.back(); +// cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); +// cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; +// } +// path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); +// cerr << path.second.size() << endl; +// for (auto handle : path.second) { +// cerr << graph.get_sequence(graph.get_occurrence(handle)); +// } +// } +// cerr << "havin' issues here?" << endl; +// for (auto path : paths) { +// for (auto handle : path.second) { +// cerr << graph.get_sequence(graph.get_occurrence(handle)); +// } +// } +// // Resolve multiple_occurrences by identifying which entry in paths +// // (of those part of the same path) is longest - that will +// // represent the full breadth of the path through the snarl. 
+// for (pair element : multiple_occurrences){ +// // A vector of all the path entries in paths: +// vector same_path_names = {element.first}; + +// int max_len = paths[element.first].size(); +// string max_path = element.first; + +// for (int occ_num : range_vector(element.second)){ +// occ_num++; // we actually need range_vector[1, ..., end()] +// string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; +// int cur_len = paths[cur_path].size(); +// same_path_names.push_back(cur_path); + +// if (cur_len > max_len){ +// max_len = cur_len; +// max_path = cur_path; +// } +// } + +// // get rid of the smaller fragments of path: +// for (string name : same_path_names) { +// if (name != max_path){ +// paths.erase(name); +// } +// } +// } +// vector path_strings; +// // get just the strings from the unordered_map > paths object: +// for (auto path : paths) { +// string path_string; +// for (auto handle : path.second) { +// path_string += graph.get_sequence(graph.get_occurrence(handle)); +// } +// path_strings.push_back(path_string); +// } +// return path_strings; +// } + +// vector> find_haplotypes_not_at_source(const GBWTGraph haploGraph, unordered_set touched_handles, const id_t& sink_id){ +// cerr << "finding haplotypes not at source!" << endl; +// /// Search every handle in touched handles for haplotypes starting at that point. +// // Any new haplotypes will be added to new_searches. +// vector, gbwt::SearchState>> new_searches; + +// // Fully extended haplotypes (or haplotypes extended to the snarl's sink) +// // will be added to finished_searches. +// vector> finished_searches; + +// // In addition, we need to put the new handle into to_search, because a path may have +// // started on the new handle (which means we need to start a searchstate there.) +// unordered_set to_search; + +// // We don't need to ever check the sink handle, since paths from the sink handle +// // extend beyond snarl. //TODO: true? 
+// handle_t sink_handle = haploGraph.get_handle(sink_id); +// touched_handles.erase(sink_handle); + +// // Create nested function for making a new_search: +// auto make_new_search = [&](handle_t handle) { +// cerr << "lambda" << endl; + +// // Are there any new threads starting at this handle? +// gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(handle)); +// if (new_search != gbwt::SearchState()){ //TODO: this is the "null" version of SearchState, right? +// // Then add them to new_searches. + +// haploGraph.follow_paths(new_search, [&](const gbwt::SearchState& next_search) -> bool { + +// handle_t next_handle = haploGraph.node_to_handle(next_search.node); + +// /// check to make sure that the thread isn't already finished: +// // if next_handle is the sink, or if this thread is only one handle long, +// // then there isn't any useful string to extract from this. +// if (next_handle != sink_handle || next_search == gbwt::SearchState()){ +// // establish a new thread to walk along. +// vector new_path; +// new_path.push_back(handle); +// new_path.push_back(next_handle); + +// pair, gbwt::SearchState > mypair = make_pair(new_path, next_search); + + +// // add the new path to new_searches to be extended. +// new_searches.push_back(make_pair(new_path, next_search)); + +// // if next_handle hasn't been checked for starting threads, add to to_search. +// if (touched_handles.find(next_handle) == touched_handles.end()){ +// to_search.emplace(next_handle); +// } +// } +// return true; +// }); +// } +// }; + +// cerr << "1" << endl; +// // Search every handle in touched handles for haplotypes starting at that point. +// for (handle_t handle : touched_handles){ +// make_new_search(handle); +// } + +// /// Extend any paths in new_searches, and add any newly found handles to to_search. +// /// Then, check to see if there are any new threads on handles in to_search. 
+// /// Extend those threads, and add any newly found handles to to_search, +// /// then search for threads again in to_search again... repeat until to_search remains +// /// emptied of new handles. + +// // for tracking whether the haplotype thread is still extending: +// bool still_extending; +// cerr << "2" << endl; + +// while (!new_searches.empty()){ +// cerr << "extend new_searches" << endl; + +// for (auto search : new_searches){ +// for (auto handle : search.first){ +// cerr << haploGraph.get_sequence(handle) << " " << haploGraph.get_id(handle) << endl; +// } +// } + +// // First, extend new_searches, adding any newly found handles to to_search. +// for (pair, gbwt::SearchState> new_search : new_searches){ + +// // while the thread is still being extended: +// still_extending = true; +// while (still_extending){ +// cerr << "still_extending: " << endl; +// for (auto handle : new_search.first){ +// cerr << haploGraph.get_sequence( handle ) << " " << haploGraph.get_id(handle) << " "; + +// } +// cerr << endl; + +// // first, count the number of alternate, distinct paths we could walk along at the current searchState. +// int path_splits; +// haploGraph.follow_paths(new_search.second, [&](const gbwt::SearchState& next_search) -> bool { +// path_splits++; +// }); + +// /// if there's more than one path_split, then continue to fully extend the first path the haplotype takes. +// /// The rest of the alternate paths must be passed to the end of new_searches. +// /// No matter what, we extend the path: +// // extend along paths until we reach an end condition. +// bool first_path = true; + +// gbwt::SearchState first_next_search; +// still_extending = haploGraph.follow_paths(new_search.second, [&](const gbwt::SearchState& next_search) -> bool { +// if (!first_path) +// handle_t next_handle = haploGraph.node_to_handle(next_search.node); + +// // we only want to fully extend one path at a time. 
So, if haploGraph.follow_paths gives us +// if (first_path) { +// // if next_handle is the sink, add it to new_search and end extension. +// if( haploGraph.get_id(next_handle) == sink_id ){ +// // if next_handle hasn't been checked for starting threads, add to to_search. +// if (touched_handles.find(next_handle)!= touched_handles.end()){ +// to_search.emplace(next_handle); +// } + +// // if this is the first +// new_search.first.push_back(next_handle); +// return false; // done extending +// } + +// // if next_search falls off the end of the path, then we've already finished extending +// // the path. +// if (next_search == gbwt::SearchState()){ +// return false; +// } + +// // otherwise, continue extending the thread to walk along. +// cerr << new_search.first.size() << endl; +// new_search.first.push_back(next_handle); +// new_search.second = next_search; +// cerr << new_search.first.size() << endl; + +// // if next_handle hasn't been checked for starting threads, add to to_search. +// if (touched_handles.find(next_handle)!= touched_handles.end()){ +// to_search.emplace(next_handle); +// } + +// bool first_path = false; +// } + + +// return true; +// }); +// } +// // new_search is finished extending. Place in finished_searches. +// finished_searches.push_back(new_search.first); +// } +// // All new_searches have been fully extended and added to finished_searches. +// new_searches.clear(); + +// // Then, make more new_searches from the handles in to_search. +// for (handle_t handle : to_search){ +// make_new_search(handle); +// } +// to_search.clear(); + +// } + +// return finished_searches; +// } + + + + + + +// //TODO: does GBWTgraphs have names associated with haplotypes? +// //TODO: If so, I should change return value to an unordered map with key haplotype name +// //TODO: and value vector of all handles in haplotype (also, rename fxn). + +// //Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. 
+// //However, is modified to include all haplotypes inside the source/sink handles, +// //even ones that don't include the source or sink handles. +// //Returns: a vector of strings representing all paths reaching from source to sink in the snarl, +// // and a vector of strings representing all other paths in the snarl (e.g. any that don't +// // reach both source and sink in the graph.) +// pair, vector> depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id){ +// cerr << "depth first begins!" << endl; + + +// ///GBWT graph construction stuff that belongs in mod_main: +// ifstream gbwt_stream; +// //TODO: make it so that gbwt file is customized by user rather than hardcoded. +// // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; +// string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.gbwt"; +// gbwt_stream.open(gbwt_name); + +// unique_ptr gbwt; +// // Load the GBWT from its container +// gbwt = stream::VPKG::load_one(gbwt_stream); +// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); +// // ----------------------------------------------------------------------------------------- +// /// Perform depth first search, where whenever the search reaches sink_handle, convert +// /// vector of handles to string (should be equivalent to haplotype). +// //TODO: somehow note/account for how recording of haplotype will be terminated the first time it touches the sink_handle - +// //TODO: this function currently doesn't account for if it loops back. + +// //touched_handles contains all handles that have been touched by the depth_first_search, +// //for later use in other_haplotypes_to_strings, which identifies paths that didn't stretch +// //from source to sink in the snarl. +// unordered_set touched_handles; + +// //haplotype_queue contains all started exon_haplotypes not completed yet. 
+// //Every time we encounter a branch in the paths, the next node down the path +// //Is stored here, along with the vector of handles that represents the path up +// //to the SearchState. +// vector< pair< vector, gbwt::SearchState> > haplotype_queue; + +// // source and sink handle for haploGraph: +// handle_t source_handle = haploGraph.get_handle(source_id); +// handle_t sink_handle = haploGraph.get_handle(sink_id); + +// //place source in haplotype_queue. +// vector source_handle_vec(1, source_handle); +// gbwt::SearchState source_state = haploGraph.get_state(source_handle); +// haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); +// touched_handles.emplace(source_handle); + +// //haplotypes contains all "finished" haplotypes - those that were either walked +// //to their conclusion, or until they reached the sink. +// vector< vector > source_to_sink_haplotype_handle_vecs; +// vector< vector > source_without_sink_haplotype_handle_vecs; + +// // cerr << "hap size before pop" << haplotype_queue.size() << endl; +// // haplotype_queue.pop_back(); + +// // cerr << "hap size after pop" << haplotype_queue.size() << endl; + +// while (!haplotype_queue.empty()) { +// cerr << "iteration! 
with haplotype_queue:" << endl; + +// for(auto hap : haplotype_queue) { +// cerr << "here's a hap" << endl; +// for(auto handle : hap.first){ +// cerr << haploGraph.get_sequence(handle) << " " << haploGraph.get_id(handle) << " "; +// } +// cerr << endl; +// } + +// pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); // Tuple of (handles_traversed_so_far, last_touched_SearchState) + +// haplotype_queue.pop_back(); + +// vector next_searches; + +// haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { +// next_searches.push_back(next_search); +// return true; +// }); + +// // if next_searches > 1, then we need to make multiple new haplotypes to be recorded in haplotype_queue +// // or one of the haplotype_handle_vecs. +// if (next_searches.size()>1){ + +// // for every next_search in next_searches, either create a new, extended cur_haplotype to push into haplotype queue, +// // or place in the source_to_sink_haplotype_handle_vecs if haplotype extends to sink, +// // or place in the source_without_sink_haplotype_handle_vecs if haplotype ends before reaching sink. +// for (gbwt::SearchState next_search : next_searches){ +// handle_t next_handle = haploGraph.node_to_handle(next_search.node); + +// // copy over the vector of cur_haplotype: +// vector next_handle_vec(cur_haplotype.first); + +// // add the new handle to the vec: +// next_handle_vec.push_back(next_handle); + +// // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs +// if (haploGraph.get_id(next_handle) == sink_id){ + +// source_to_sink_haplotype_handle_vecs.push_back(next_handle_vec); + +// } else { // keep extending the haplotype! + +// pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); +// haplotype_queue.push_back(next_haplotype); + +// } + +// //next_handle will be touched. 
+// touched_handles.emplace(next_handle); +// } + +// } else if ( next_searches.empty() ) { // if next_searches is empty, the path has ended but not reached sink. + +// // We have reached the end of the path, but it doesn't reach the sink. +// // we need to add cur_haplotype to source_without_sink_haplotype_handle_vecs. +// source_without_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); + +// // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs +// } else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { + +// // Then we need to add cur_haplotype + next_search to source_to_sink_haplotype_handle_vecs. +// handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); +// cur_haplotype.first.push_back(next_handle); +// source_to_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); + +// //touched next_search's handle +// touched_handles.emplace(next_handle); + +// // else, just extend the one search. +// } else { + +// // Then there is just one next_search, and it's not the end of the path. 
+// // add (cur_haplotype + next_search to haplotype_queue +// handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); +// cur_haplotype.first.push_back(next_handle); +// cur_haplotype.second = next_searches.back(); +// haplotype_queue.push_back(cur_haplotype); +// touched_handles.emplace(next_handle); + +// } + +// } + +// //Now, transform the each vector of handles in source_to_sink_haplotype_handle_vecs +// //into a string, and return as a vector of strings +// vector source_to_sink_haplotype_strings; +// // for (vector vector_hap : source_to_sink_haplotype_handle_vecs){ +// // string hap; +// // for (handle_t& handle : vector_hap){ +// // hap += haploGraph.get_sequence(handle); +// // } +// // source_to_sink_haplotype_strings.push_back(hap); +// // } + +// //Find any haplotypes starting from handles not starting at the source, but which +// //still start somewhere inside the snarl. +// vector> haplotypes_not_starting_at_source = find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); + +// //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. +// vector other_haplotype_strings; +// for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ +// string hap; +// for (handle_t& handle : vector_hap){ +// hap += haploGraph.get_sequence(handle); +// } +// other_haplotype_strings.push_back(hap); +// } + +// //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. +// for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ +// string hap; +// for (handle_t& handle : vector_hap){ +// hap += haploGraph.get_sequence(handle); +// } +// other_haplotype_strings.push_back(hap); +// } + +// return make_pair(source_to_sink_haplotype_strings, other_haplotype_strings); +// } + + + + + + + + + + + + + + + + +// //TODO: delete this function once I've decided I don't want it anymore. Should be replaced with (renamed) depth_first_haplotypes_to_strings. 
+// // Pull out each haplotype passing through a snarl (defined by source_id and sink_id) as a string. +// vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ + +// ///stuff that will go in mod_main: +// ifstream gbwt_stream; +// string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; +// gbwt_stream.open(gbwt_name); + +// unique_ptr gbwt; +// // Load the GBWT from its container +// gbwt = stream::VPKG::load_one(gbwt_stream); + +// // ----------------------------------------------------------------- +// /// make subgraph for the snarl: + +// // graph.for_each_handle([&] (const handle_t& handle)-> bool{ +// // cerr << "test for graph "; +// // cerr << graph.get_id(handle) << endl; +// // return true; +// // }); + +// SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + +// // snarl.for_each_handle_impl([&] (const handle_t& handle)-> bool{ +// // cerr << "test for snarl "; +// // cerr << snarl.get_id(handle) << endl; +// // return true; +// // }); +// // cerr << "before 1 \n"; + +// // GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); //TODO: figure out how to prevent error msg here. +// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); +// // cerr << "after 1 \n"; + +// // cerr << "before \n"; +// // haploGraph.for_each_handle([&] (const handle_t& handle)-> bool{ +// // cerr << "test for haploGraph "; +// // cerr << haploGraph.get_id(handle) << endl; +// // return true; +// // }); +// // cerr << "after \n"; + + +// //TODO:identify source and sinks for troubleshooting! 
+// unordered_map> sequences; // will contain all haplotype walks through snarl +// handle_t source_handle = haploGraph.get_handle(source_id); +// sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); + +// for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { + +// vector seqs_here = sequences[handle]; +// gbwt::SearchState cur_state = haploGraph.get_state(handle); + +// // id_t cur_id = haploGraph.get_id(handle); +// // cerr << "cur_id" << cur_id << endl; + +// haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { +// handle_t next_handle = GBWTGraph::node_to_handle(next_search.node); + +// id_t next_id = haploGraph.get_id(next_handle); +// cerr << "next_id" << next_id << endl; + +// string next_seq = haploGraph.get_sequence(next_handle); +// // transfer the sequences for the preceding handle to next_handle's sequences, +// // plus the new handle's sequence. +// for (string seq : seqs_here){ +// sequences[next_handle].push_back(seq + next_seq); +// } +// return true; + + +// }); +// } + +// // all the sequences at the sinks will be all the sequences in the snarl. +// handle_t sink_handle = haploGraph.get_handle(sink_id); +// return sequences[sink_handle]; +// // vector testVec; +// // return testVec; +// } + +// //Iterate over all snarls in a graph, and run clean_snarl on it. +// void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ +// SnarlManager* snarl_manager = new SnarlManager(snarl_stream); + +// /* Use this code to count number of snarls in graph. 
+// * int top_count = 0; +// * for (const Snarl* snarl : snarl_manager->top_level_snarls()){ +// * top_count++; +// * } +// * cerr << "number of top_level snarls in graph: " << top_count << endl; +// * +// * int general_count = 0; +// * snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ +// * general_count++; +// * }); +// * cerr << "number of total snarls in graph: " << general_count << endl; +// */ + + +// vector snarl_roots = snarl_manager->top_level_snarls(); +// for (auto roots : snarl_roots){ +// clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); +// } + +// delete snarl_manager; + + +// } + +// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, +// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. +// void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ +// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings +// //representing all possible walks through the snarl: +// vector walks = graph_to_strings(graph, start_id, end_id); + +// //Make a new snarl from walks: +// VG new_snarl = strings_to_graph(walks); + +// integrate_snarl(graph, new_snarl, start_id, end_id); + +// } + +// // Given a larger graph and a (usually cleaned snarl) subgraph, integrate new_snarl into the graph at start_id and end_id. +// void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id){ +// //Get old graph snarl +// SubHandleGraph graph_snarl = extract_subgraph(graph, start_id, end_id); + +// //Identify old and new snarl start and sink +// pair, vector> graph_snarl_defining_handles = get_sources_and_sinks(graph_snarl); +// pair, vector> new_snarl_defining_handles = get_sources_and_sinks(new_snarl); + +// //Check to make sure that newly made snarl has only one start and end. 
+// if(new_snarl_defining_handles.first.size() > 1 || new_snarl_defining_handles.second.size() > 1){ +// cerr << "newly made snarl with more than one start or end. # of starts: " << new_snarl_defining_handles.first.size() << " # of ends: " << new_snarl_defining_handles.second.size() << endl; +// return; +// } +// //extract old and new snarl start and sink: +// handle_t new_snarl_start = new_snarl_defining_handles.first[0]; +// handle_t new_snarl_end = new_snarl_defining_handles.second[0]; + +// handle_t graph_snarl_start = graph_snarl_defining_handles.first[0]; +// handle_t graph_snarl_end = graph_snarl_defining_handles.second[0]; + +// ///Replace start and end handles of old graph snarl with new_snarl start and end, and delete +// ///rest of old graph snarl. + +// //Get everything needed to replace graph start and sink. +// string new_start_seq = new_snarl.get_sequence(new_snarl_start); +// string new_end_seq = new_snarl.get_sequence(new_snarl_end); +// id_t new_start_id = graph.get_id(graph_snarl_start); +// id_t new_end_id = graph.get_id(graph_snarl_end); +// vector left_of_start; +// graph.follow_edges(graph_snarl_start, true, [&](const handle_t& handle){ +// left_of_start.emplace_back(handle); +// }); +// vector right_of_end; +// graph.follow_edges(graph_snarl_end, false, [&](const handle_t& handle){ +// right_of_end.emplace_back(handle); +// }); + +// //Delete all handles in graph_snarl +// graph_snarl.for_each_handle([&](const handle_t& handle){ +// graph.destroy_handle(handle); +// }, false); + +// //Make start and end handles for snarl in graph: +// handle_t new_start_handle = graph.create_handle(new_start_seq, new_start_id); +// handle_t new_end_handle = graph.create_handle(new_end_seq, new_end_id); + +// //Insert start and end handles: +// for (handle_t handle : left_of_start) { +// graph.create_edge(handle, new_start_handle); +// } +// for (handle_t handle : right_of_end) { +// graph.create_edge(new_end_handle, handle); +// } + +// ///Reintegrate rest 
of new_snarl. +// //topologically ordered new_snarl. As I progress through each node in topo_order, +// //I can add all the nodes to the right of the snarl. The final node will be the +// //end node, which, instead of adding as a new node to graph, I'll re-connect +// //to the modified end_node, above. +// vector new_snarl_topo_order = algorithms::lazier_topological_order(&new_snarl); + +// //Construct a parallel graph_snarl_topo_order to identify +// //paralogous nodes between new_snarl and graph. +// vector graph_snarl_topo_order = {new_start_handle}; + +// for (auto it = ++new_snarl_topo_order.begin(); it != --new_snarl_topo_order.end(); it++){ +// //For every handle in new_snarl, make an (unconnected) handle in graph. +// string handle_seq = new_snarl.get_sequence(*it); +// handle_t graph_handle = graph.create_handle(handle_seq); +// graph_snarl_topo_order.push_back(graph_handle); +// } + +// graph_snarl_topo_order.push_back(new_end_handle); + +// //Connect the rest of the nodes: +// for (int i = 0; i < new_snarl_topo_order.size(); i++){ +// // cerr << new_snarl.get_id(new_snarl_topo_order[i]) << endl; + +// new_snarl.follow_edges(new_snarl_topo_order[i], false, [&](const handle_t& snarl_handle){ +// //get topo_index of nodes to be connected to graph start handle +// auto it = find(new_snarl_topo_order.begin(), new_snarl_topo_order.end(), snarl_handle); +// int topo_index = it - new_snarl_topo_order.begin(); +// // cerr << "topo_index" << topo_index << endl; +// // cerr << "i" << i << endl; + +// //connect graph start handle +// graph.create_edge(graph_snarl_topo_order[i], graph_snarl_topo_order[topo_index]); +// }); +// } + +// } + +// //Returns tuple of two handles, first being start and second being sink. 
+// pair, vector> get_sources_and_sinks(HandleGraph& graph){ +// vector sink; +// vector source; + +// // identify sources and sinks +// graph.for_each_handle([&](const handle_t& handle) { +// bool is_source = true, is_sink = true; +// graph.follow_edges(handle, true, [&](const handle_t& prev) { +// is_source = false; +// return false; +// }); +// graph.follow_edges(handle, false, [&](const handle_t& next) { +// is_sink = false; +// return false; +// }); + +// // base case for dynamic programming +// if (is_source) { +// source.push_back(handle); +// } +// if (is_sink) { +// sink.emplace_back(handle); +// } +// }); + +// return pair, vector>(source, sink); + +// } + + +// VG strings_to_graph(const vector& walks){ +// seqan::Align align; // create multiple_sequence_alignment object + +// seqan::resize(rows(align), walks.size()); +// for (int i = 0; i < walks.size(); ++i){ +// assignSource(row(align, i), walks[i].c_str()); +// } + + +// globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); + +// stringstream ss; +// ss << align; +// MSAConverter myMSAConverter = MSAConverter(); +// myMSAConverter.load_alignments(ss, "seqan"); +// VG snarl = myMSAConverter.make_graph(); +// snarl.clear_paths(); + + +// // snarl.serialize_to_ostream(cerr); +// return snarl; +// } + + + + +// vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ +// SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); + +// unordered_map> sequences; +// vector sinks; +// unordered_map count; +// count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl +// sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl + +// // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
+// snarl.for_each_handle([&](const handle_t& handle) { +// bool is_source = true, is_sink = true; +// snarl.follow_edges(handle, true, [&](const handle_t& prev) { +// is_source = false; +// return false; +// }); +// snarl.follow_edges(handle, false, [&](const handle_t& next) { +// is_sink = false; +// return false; +// }); + +// // base case for dynamic programming +// if (is_source) { +// count[handle] = 1; +// sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? +// } +// if (is_sink) { +// sinks.emplace_back(handle); +// } +// }); + + +// // count walks by dynamic programming +// bool overflowed = false; +// for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { +// size_t count_here = count[handle]; +// vector seqs_here = sequences[handle]; + +// snarl.follow_edges(handle, false, [&](const handle_t& next) { + +// size_t& count_next = count[next]; +// string seq_next = snarl.get_sequence(next); + +// if (numeric_limits::max() - count_here < count_next) { +// overflowed = true; +// } + +// else { +// count_next += count_here; +// // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ +// for (string seq : seqs_here){ +// sequences[next].push_back(seq + seq_next); +// } +// // cerr << "next_seqs: "; +// // for (string seq : sequences[next]){ +// // cerr << seq << endl; +// // } +// } +// }); +// ///TODO: figure out how to deal with overflow. +// // if (overflowed) { +// // return numeric_limits::max(); +// // } +// } + +// // total up the walks at the sinks +// size_t total_count = 0; +// for (handle_t& sink : sinks) { +// total_count += count[sink]; +// } + +// // all the sequences at the sinks will be all the sequences in the snarl. 
+// vector walks; +// for (handle_t& sink : sinks) { +// for (string seq : sequences[sink]){ +// walks.push_back(seq); +// } +// } + +// return walks; +// } + + +// // given a start and end node id, construct an extract subgraph between the two nodes (inclusive). +// // TODO: change the arguments to handles, which contain orientation within themselves. +// // That way, iteration to extract the subgraph will have direction contained within themselves. +// // This may actually end up looking like simply parsing an input text file with the handles +// // described from the find_snarl output. +// SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ +// /// make a subgraph containing only nodes of interest. (e.g. a snarl) +// // make empty subgraph +// SubHandleGraph subgraph = SubHandleGraph(&graph); + +// unordered_set visited; // to avoid counting the same node twice. +// unordered_set to_visit; // nodes found that belong in the subgraph. + +// // TODO: how to ensure that "to the right" of start_handle is the correct direction? 
+// // initialize with start_handle (because we move only to the right of start_handle): +// handle_t start_handle = graph.get_handle(start_id); +// subgraph.add_handle(start_handle); +// visited.insert(graph.get_id(start_handle)); + +// // look only to the right of start_handle +// graph.follow_edges(start_handle, false, [&](const handle_t& handle){ +// // mark the nodes to come as to_visit +// if (visited.find(graph.get_id(handle)) == visited.end()) { +// to_visit.insert(graph.get_id(handle)); +// } +// }); + +// /// explore the rest of the snarl: +// while (to_visit.size() != 0) { +// // remove cur_handle from to_visit +// unordered_set::iterator cur_index = to_visit.begin(); +// handle_t cur_handle = graph.get_handle(*cur_index); + +// to_visit.erase(cur_index); + +// /// visit cur_handle +// visited.insert(graph.get_id(cur_handle)); + +// subgraph.add_handle(cur_handle); + +// if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! +// // look for all nodes connected to cur_handle that need to be added +// // looking to the left, +// graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ +// // mark the nodes to come as to_visit +// if (visited.find(graph.get_id(handle)) == visited.end()) { +// to_visit.insert(graph.get_id(handle)); +// } +// }); +// // looking to the right, +// graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ +// // mark the nodes to come as to_visit +// if (visited.find(graph.get_id(handle)) == visited.end()) { +// to_visit.insert(graph.get_id(handle)); +// } +// }); +// } +// } +// return subgraph; +// } +// } \ No newline at end of file diff --git a/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp b/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp new file mode 100644 index 00000000000..8898bc5ef2e --- /dev/null +++ b/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp @@ -0,0 +1,741 @@ +// /* +// In this phase of the code, I've only 
extracted paths in the depth_first_search +// that are overlapping the source node. Next step is to integrate paths that start +// in the middle of the snarl. +// */ + +// #pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? +// #include "0_demo_final_0.hpp" +// #include +// #include "../vg.hpp" +// #include "../handle.hpp" +// #include "../subgraph.hpp" +// #include "count_walks.hpp" +// #include +// #include +// #include +// #include "../msa_converter.hpp" +// #include "../snarls.hpp" +// #include "../gbwt_helper.hpp" +// #include "../stream/vpkg.hpp" +// #include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? + +// namespace vg { + +// // void print_kmer(const std::vector>&, const std::string& string){ +// // cerr << string << endl; +// // } + +// // vector get_path_strings(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle) { +// // unordered_map > handle_paths get_paths(graph, source_handle, sink_handle); +// // for (auto path : handle_paths) { +// // for (occuhandle : +// // } +// // } + +// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, +// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. +// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ +// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings +// //representing all possible walks through the snarl: +// vg::handle_t source_handle = graph.get_handle(source_id); +// vg::handle_t sink_handle = graph.get_handle(sink_id); + +// vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); +// cerr << "finished depth_first, now on to reference." 
<< endl; +// vector reference = get_paths(graph, source_handle, sink_handle); + +// haplotypes.insert(end(haplotypes), begin(reference), end(reference)); + +// //Make a new snarl from walks: +// VG new_snarl = strings_to_graph(haplotypes); + +// integrate_snarl(graph, new_snarl, source_id, sink_id); + +// } + +// // TODO: test/debug this! +// // Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. +// // Only walks along embedded paths. Returns a map with string keys and values of vectors of handles, +// // where each vector of handles represents one path from source to sink. +// // alternative function return: +// //unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ +// vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ +// unordered_map > paths; +// unordered_map multiple_occurrences; + +// // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. +// graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { +// // Each occurrence represents an embedded path +// // (note - in the case of a looped path, there will be multiple occurrences for one path.) +// // For each path represented by an occurrence, we need to walk along the path until we reach +// // the sink node. That series of handles represents the the sequence of the path. + +// string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); +// if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) + +// //record this in multiple_occurrences, and get the number of times we've seen this occurrence. 
+// int occ_num; +// if (multiple_occurrences.find(path) == multiple_occurrences.end()){ +// occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. +// multiple_occurrences[path] = occ_num; +// } else { +// occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. +// } + +// //record the other occurrences with an added identifier to differentiate between paths. +// paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); +// } +// else{ // this is the first time we've encountered this occurrence. +// paths[path].emplace_back(occurrence); +// } +// }); + +// //Now, for every occurrence, walk along the path until we reach the sink. +// for (pair > path : paths){ +// // cerr << "my name" << path.first << endl; +// // cerr << "my occurences:" << endl; +// // for (auto occ : path.second) { +// // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; +// // } +// // cerr << "testing get_next_occurrence:" << endl; +// // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); +// // cerr << cur_id; + +// // cur_occurence is the current handle while walking along the path +// occurrence_handle_t cur_occurrence = path.second.back(); +// id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); +// // store the path in paths, in the occurrence_handle_t vector. 
+// while (cur_id != graph.get_id(sink_handle)){ +// paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); +// // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); +// cur_occurrence = paths[path.first].back(); +// // cur_occurrence = path.second.back(); +// cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); +// cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; +// } +// path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); +// cerr << path.second.size() << endl; +// for (auto handle : path.second) { +// cerr << graph.get_sequence(graph.get_occurrence(handle)); +// } +// } +// cerr << "havin' issues here?" << endl; +// for (auto path : paths) { +// for (auto handle : path.second) { +// cerr << graph.get_sequence(graph.get_occurrence(handle)); +// } +// } +// // Resolve multiple_occurrences by identifying which entry in paths +// // (of those part of the same path) is longest - that will +// // represent the full breadth of the path through the snarl. 
+// for (pair element : multiple_occurrences){ +// // A vector of all the path entries in paths: +// vector same_path_names = {element.first}; + +// int max_len = paths[element.first].size(); +// string max_path = element.first; + +// for (int occ_num : range_vector(element.second)){ +// occ_num++; // we actually need range_vector[1, ..., end()] +// string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; +// int cur_len = paths[cur_path].size(); +// same_path_names.push_back(cur_path); + +// if (cur_len > max_len){ +// max_len = cur_len; +// max_path = cur_path; +// } +// } + +// // get rid of the smaller fragments of path: +// for (string name : same_path_names) { +// if (name != max_path){ +// paths.erase(name); +// } +// } +// } +// vector path_strings; +// // get just the strings from the unordered_map > paths object: +// for (auto path : paths) { +// string path_string; +// for (auto handle : path.second) { +// path_string += graph.get_sequence(graph.get_occurrence(handle)); +// } +// path_strings.push_back(path_string); +// } +// return path_strings; +// } + +// //TODO: does GBWTgraphs have names associated with haplotypes? +// //TODO: If so, I should change return value to an unordered map with key haplotype name +// //TODO: and value vector of all handles in haplotype (also, rename fxn). + +// //Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. +// //However, is modified to include all haplotypes inside the source/sink handles, +// //even ones that don't include the source or sink handles. 
+// vector depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id){ +// ///GBWT graph construction stuff that belongs in mod_main: +// ifstream gbwt_stream; +// string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; +// gbwt_stream.open(gbwt_name); + +// unique_ptr gbwt; +// // Load the GBWT from its container +// gbwt = stream::VPKG::load_one(gbwt_stream); +// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); +// // ----------------------------------------------------------------------------------------- +// /// Perform depth first search, where whenever the search reaches sink_handle, convert +// /// vector of handles to string (should be equivalent to haplotype). +// //TODO: somehow note/account for how recording of haplotype will be terminated the first time it touches the sink_handle - +// //TODO: this function currently doesn't account for if it loops back. + +// //haplotype_queue contains all started exon_haplotypes not completed yet. +// //Every time we encounter a branch in the paths, the next node down the path +// //Is stored here, along with the vector of handles that represents the path up +// //to the SearchState. +// vector< pair< vector, gbwt::SearchState> > haplotype_queue; + +// // source and sink handle for haploGraph: +// handle_t source_handle = haploGraph.get_handle(source_id); +// handle_t sink_handle = haploGraph.get_handle(sink_id); + +// //place source in haplotype_queue. 
+// vector source_handle_vec(1, source_handle); +// gbwt::SearchState source_state = haploGraph.get_state(source_handle); +// // pair< vector, gbwt::SearchState> source = make_pair(source_handle_vec, source_state); +// haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); +// /* +// cerr << "node id from original graph " << source_id << endl; +// cerr << "node id from handle (immediately after construction) " << haploGraph.get_id(haploGraph.get_handle(source_id)) << endl; +// cerr << "node id from handle " << haploGraph.get_id(haplotype_queue.back().first.back()) << endl; +// cerr << "node id from search state " << haplotype_queue.back().second.node << endl; + + +// cerr << "here's the code I want to run" << endl; +// cerr << "here's the handle I care about " << haploGraph.get_sequence(haplotype_queue.back().first.back()) << endl; +// haploGraph.node_to_handle(haplotype_queue.back().second.node); +// cerr<<" made a handle "<< endl; +// cerr << haploGraph.get_sequence(haploGraph.node_to_handle(haplotype_queue.back().second.node)) << endl; +// cerr << "finished running code. " << endl; +// */ +// //haplotypes contains all "finished" haplotypes - those that were either walked +// //to their conclusion, or until they reached the sink. +// vector< vector > final_haplotypes; + +// /* +// What if I made a list of start_search_states, which would keep track of places I'd like my backtrace to go +// (maybe make this a tuple to ensure that end_search_states find the start_search_state I intended for them.) +// Actually, right now I'm only worrying about paths that start at the source_node. The new paths I encounter +// partway through the traversal will be dealt with a seperate function call that will make use of bidirected +// search to ensure I find all the relevant components of the path. + +// Okay. Keep track of the start_search_state, which is the source_node. 
+// Then, have a vector of all bidirected SearchStates (variable expanding_search_states) +// that are at "the border" of my current search of the snarl. + +// On each iteration of the while loop, get and pop the cur_search_state +// out of the back of expanding_search_states. Using haploGraph.follow_paths, find every search_state that +// proceeds cur_search_state, and add them to either expanding_search_states or end_search_states. +// - add cur_search_state to end_search_states if next_search_state.empty() = true; or +// - add next_search_state to end_search_states if haploGraph.get_handle(next_search_state.node) == sink_handle. +// - + +// (as a bidirectional) if there are no +// */ + + + +// while (haplotype_queue.size() != 0) { +// cerr << "iteration! with haplotype_queue:" << endl; +// // for (auto hap : haplotype_queue){ +// // cerr << hap.first.size(); +// // } + +// pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); // Tuple of (handles_traversed_so_far, last_touched_SearchState) + +// haplotype_queue.pop_back(); + +// vector next_searches; + +// haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { +// next_searches.push_back(next_search); +// return true; +// }); + +// if (next_searches.size()>1){ +// for (gbwt::SearchState next_search : next_searches){ +// // copy over the vector of cur_haplotype: +// vector next_handle_vec(cur_haplotype.first); +// // add the new handle to the vec: +// next_handle_vec.push_back(haploGraph.node_to_handle(next_search.node)); //TODO: next_search.node is of type node_type, not node_id. Is that okay? 
+// cerr << haploGraph.get_sequence(haploGraph.node_to_handle(next_search.node)) << endl; +// pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); +// haplotype_queue.push_back(next_haplotype); +// } +// } else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { //TODO: once again, is SearchState.node acceptable here? +// // Then we need to add cur_haplotype + next_search to final_haplotypes. +// cur_haplotype.first.push_back(haploGraph.node_to_handle(next_searches.back().node)); +// final_haplotypes.push_back(cur_haplotype.first); + +// } else if ( next_searches.back().empty()) { +// // Then we have reached the end of the path, and need to add cur_haplotype to final_haplotypes. +// final_haplotypes.push_back(cur_haplotype.first); +// } else { +// // Then there is just one next_search, and it's not the end of the path. +// // add (cur_haplotype + next_search to haplotype_queue +// cur_haplotype.first.push_back(haploGraph.node_to_handle(next_searches.back().node)); +// cur_haplotype.second = next_searches.back(); +// haplotype_queue.push_back(cur_haplotype); +// } + +// } + +// //Now, transform the each vector of handles in final_haplotypes into a string, and return as a vector of strings + +// vector string_haplotypes; +// for (vector vector_hap : final_haplotypes){ +// string hap; +// for (handle_t& handle : vector_hap){ +// hap += haploGraph.get_sequence(handle); +// } +// string_haplotypes.push_back(hap); +// } + +// return string_haplotypes; +// } + + + + + + + + + + + + + + + + +// //TODO: delete this function once I've decided I don't want it anymore. Should be replaced with (renamed) depth_first_haplotypes_to_strings. +// // Pull out each haplotype passing through a snarl (defined by source_id and sink_id) as a string. 
+// vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ + +// ///stuff that will go in mod_main: +// ifstream gbwt_stream; +// string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; +// gbwt_stream.open(gbwt_name); + +// unique_ptr gbwt; +// // Load the GBWT from its container +// gbwt = stream::VPKG::load_one(gbwt_stream); + +// // ----------------------------------------------------------------- +// /// make subgraph for the snarl: + +// // graph.for_each_handle([&] (const handle_t& handle)-> bool{ +// // cerr << "test for graph "; +// // cerr << graph.get_id(handle) << endl; +// // return true; +// // }); + +// SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + +// // snarl.for_each_handle_impl([&] (const handle_t& handle)-> bool{ +// // cerr << "test for snarl "; +// // cerr << snarl.get_id(handle) << endl; +// // return true; +// // }); +// // cerr << "before 1 \n"; + +// // GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); //TODO: figure out how to prevent error msg here. +// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); +// // cerr << "after 1 \n"; + +// // cerr << "before \n"; +// // haploGraph.for_each_handle([&] (const handle_t& handle)-> bool{ +// // cerr << "test for haploGraph "; +// // cerr << haploGraph.get_id(handle) << endl; +// // return true; +// // }); +// // cerr << "after \n"; + + +// //TODO:identify source and sinks for troubleshooting! 
+// unordered_map> sequences; // will contain all haplotype walks through snarl +// handle_t source_handle = haploGraph.get_handle(source_id); +// sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); + +// for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { + +// vector seqs_here = sequences[handle]; +// gbwt::SearchState cur_state = haploGraph.get_state(handle); + +// // id_t cur_id = haploGraph.get_id(handle); +// // cerr << "cur_id" << cur_id << endl; + +// haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { +// handle_t next_handle = GBWTGraph::node_to_handle(next_search.node); + +// id_t next_id = haploGraph.get_id(next_handle); +// cerr << "next_id" << next_id << endl; + +// string next_seq = haploGraph.get_sequence(next_handle); +// // transfer the sequences for the preceding handle to next_handle's sequences, +// // plus the new handle's sequence. +// for (string seq : seqs_here){ +// sequences[next_handle].push_back(seq + next_seq); +// } +// return true; + + +// }); +// } + +// // all the sequences at the sinks will be all the sequences in the snarl. +// handle_t sink_handle = haploGraph.get_handle(sink_id); +// return sequences[sink_handle]; +// // vector testVec; +// // return testVec; +// } + +// //Iterate over all snarls in a graph, and run clean_snarl on it. +// void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ +// SnarlManager* snarl_manager = new SnarlManager(snarl_stream); + +// /* Use this code to count number of snarls in graph. 
+// * int top_count = 0; +// * for (const Snarl* snarl : snarl_manager->top_level_snarls()){ +// * top_count++; +// * } +// * cerr << "number of top_level snarls in graph: " << top_count << endl; +// * +// * int general_count = 0; +// * snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ +// * general_count++; +// * }); +// * cerr << "number of total snarls in graph: " << general_count << endl; +// */ + + +// vector snarl_roots = snarl_manager->top_level_snarls(); +// for (auto roots : snarl_roots){ +// clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); +// } + +// delete snarl_manager; + + +// } + +// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, +// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. +// void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ +// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings +// //representing all possible walks through the snarl: +// vector walks = graph_to_strings(graph, start_id, end_id); + +// //Make a new snarl from walks: +// VG new_snarl = strings_to_graph(walks); + +// integrate_snarl(graph, new_snarl, start_id, end_id); + +// } + +// // Given a larger graph and a (usually cleaned snarl) subgraph, integrate new_snarl into the graph at start_id and end_id. +// void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id){ +// //Get old graph snarl +// SubHandleGraph graph_snarl = extract_subgraph(graph, start_id, end_id); + +// //Identify old and new snarl start and sink +// pair, vector> graph_snarl_defining_handles = get_sources_and_sinks(graph_snarl); +// pair, vector> new_snarl_defining_handles = get_sources_and_sinks(new_snarl); + +// //Check to make sure that newly made snarl has only one start and end. 
+// if(new_snarl_defining_handles.first.size() > 1 || new_snarl_defining_handles.second.size() > 1){ +// cerr << "newly made snarl with more than one start or end. # of starts: " << new_snarl_defining_handles.first.size() << " # of ends: " << new_snarl_defining_handles.second.size() << endl; +// return; +// } +// //extract old and new snarl start and sink: +// handle_t new_snarl_start = new_snarl_defining_handles.first[0]; +// handle_t new_snarl_end = new_snarl_defining_handles.second[0]; + +// handle_t graph_snarl_start = graph_snarl_defining_handles.first[0]; +// handle_t graph_snarl_end = graph_snarl_defining_handles.second[0]; + +// ///Replace start and end handles of old graph snarl with new_snarl start and end, and delete +// ///rest of old graph snarl. + +// //Get everything needed to replace graph start and sink. +// string new_start_seq = new_snarl.get_sequence(new_snarl_start); +// string new_end_seq = new_snarl.get_sequence(new_snarl_end); +// id_t new_start_id = graph.get_id(graph_snarl_start); +// id_t new_end_id = graph.get_id(graph_snarl_end); +// vector left_of_start; +// graph.follow_edges(graph_snarl_start, true, [&](const handle_t& handle){ +// left_of_start.emplace_back(handle); +// }); +// vector right_of_end; +// graph.follow_edges(graph_snarl_end, false, [&](const handle_t& handle){ +// right_of_end.emplace_back(handle); +// }); + +// //Delete all handles in graph_snarl +// graph_snarl.for_each_handle([&](const handle_t& handle){ +// graph.destroy_handle(handle); +// }, false); + +// //Make start and end handles for snarl in graph: +// handle_t new_start_handle = graph.create_handle(new_start_seq, new_start_id); +// handle_t new_end_handle = graph.create_handle(new_end_seq, new_end_id); + +// //Insert start and end handles: +// for (handle_t handle : left_of_start) { +// graph.create_edge(handle, new_start_handle); +// } +// for (handle_t handle : right_of_end) { +// graph.create_edge(new_end_handle, handle); +// } + +// ///Reintegrate rest 
of new_snarl. +// //topologically ordered new_snarl. As I progress through each node in topo_order, +// //I can add all the nodes to the right of the snarl. The final node will be the +// //end node, which, instead of adding as a new node to graph, I'll re-connect +// //to the modified end_node, above. +// vector new_snarl_topo_order = algorithms::lazier_topological_order(&new_snarl); + +// //Construct a parallel graph_snarl_topo_order to identify +// //paralogous nodes between new_snarl and graph. +// vector graph_snarl_topo_order = {new_start_handle}; + +// for (auto it = ++new_snarl_topo_order.begin(); it != --new_snarl_topo_order.end(); it++){ +// //For every handle in new_snarl, make an (unconnected) handle in graph. +// string handle_seq = new_snarl.get_sequence(*it); +// handle_t graph_handle = graph.create_handle(handle_seq); +// graph_snarl_topo_order.push_back(graph_handle); +// } + +// graph_snarl_topo_order.push_back(new_end_handle); + +// //Connect the rest of the nodes: +// for (int i = 0; i < new_snarl_topo_order.size(); i++){ +// // cerr << new_snarl.get_id(new_snarl_topo_order[i]) << endl; + +// new_snarl.follow_edges(new_snarl_topo_order[i], false, [&](const handle_t& snarl_handle){ +// //get topo_index of nodes to be connected to graph start handle +// auto it = find(new_snarl_topo_order.begin(), new_snarl_topo_order.end(), snarl_handle); +// int topo_index = it - new_snarl_topo_order.begin(); +// // cerr << "topo_index" << topo_index << endl; +// // cerr << "i" << i << endl; + +// //connect graph start handle +// graph.create_edge(graph_snarl_topo_order[i], graph_snarl_topo_order[topo_index]); +// }); +// } + +// } + +// //Returns tuple of two handles, first being start and second being sink. 
+// pair, vector> get_sources_and_sinks(HandleGraph& graph){ +// vector sink; +// vector source; + +// // identify sources and sinks +// graph.for_each_handle([&](const handle_t& handle) { +// bool is_source = true, is_sink = true; +// graph.follow_edges(handle, true, [&](const handle_t& prev) { +// is_source = false; +// return false; +// }); +// graph.follow_edges(handle, false, [&](const handle_t& next) { +// is_sink = false; +// return false; +// }); + +// // base case for dynamic programming +// if (is_source) { +// source.push_back(handle); +// } +// if (is_sink) { +// sink.emplace_back(handle); +// } +// }); + +// return pair, vector>(source, sink); + +// } + + +// VG strings_to_graph(const vector& walks){ +// seqan::Align align; // create multiple_sequence_alignment object + +// seqan::resize(rows(align), walks.size()); +// for (int i = 0; i < walks.size(); ++i){ +// assignSource(row(align, i), walks[i].c_str()); +// } + + +// globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); + +// stringstream ss; +// ss << align; +// MSAConverter myMSAConverter = MSAConverter(); +// myMSAConverter.load_alignments(ss, "seqan"); +// VG snarl = myMSAConverter.make_graph(); +// snarl.clear_paths(); + + +// // snarl.serialize_to_ostream(cerr); +// return snarl; +// } + + + + +// vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ +// SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); + +// unordered_map> sequences; +// vector sinks; +// unordered_map count; +// count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl +// sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl + +// // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
+// snarl.for_each_handle([&](const handle_t& handle) { +// bool is_source = true, is_sink = true; +// snarl.follow_edges(handle, true, [&](const handle_t& prev) { +// is_source = false; +// return false; +// }); +// snarl.follow_edges(handle, false, [&](const handle_t& next) { +// is_sink = false; +// return false; +// }); + +// // base case for dynamic programming +// if (is_source) { +// count[handle] = 1; +// sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? +// } +// if (is_sink) { +// sinks.emplace_back(handle); +// } +// }); + + +// // count walks by dynamic programming +// bool overflowed = false; +// for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { +// size_t count_here = count[handle]; +// vector seqs_here = sequences[handle]; + +// snarl.follow_edges(handle, false, [&](const handle_t& next) { + +// size_t& count_next = count[next]; +// string seq_next = snarl.get_sequence(next); + +// if (numeric_limits::max() - count_here < count_next) { +// overflowed = true; +// } + +// else { +// count_next += count_here; +// // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ +// for (string seq : seqs_here){ +// sequences[next].push_back(seq + seq_next); +// } +// // cerr << "next_seqs: "; +// // for (string seq : sequences[next]){ +// // cerr << seq << endl; +// // } +// } +// }); +// ///TODO: figure out how to deal with overflow. +// // if (overflowed) { +// // return numeric_limits::max(); +// // } +// } + +// // total up the walks at the sinks +// size_t total_count = 0; +// for (handle_t& sink : sinks) { +// total_count += count[sink]; +// } + +// // all the sequences at the sinks will be all the sequences in the snarl. 
+// vector walks; +// for (handle_t& sink : sinks) { +// for (string seq : sequences[sink]){ +// walks.push_back(seq); +// } +// } + +// return walks; +// } + + +// // given a start and end node id, construct an extract subgraph between the two nodes (inclusive). +// // TODO: change the arguments to handles, which contain orientation within themselves. +// // That way, iteration to extract the subgraph will have direction contained within themselves. +// // This may actually end up looking like simply parsing an input text file with the handles +// // described from the find_snarl output. +// SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ +// /// make a subgraph containing only nodes of interest. (e.g. a snarl) +// // make empty subgraph +// SubHandleGraph subgraph = SubHandleGraph(&graph); + +// unordered_set visited; // to avoid counting the same node twice. +// unordered_set to_visit; // nodes found that belong in the subgraph. + +// // TODO: how to ensure that "to the right" of start_handle is the correct direction? 
+// // initialize with start_handle (because we move only to the right of start_handle): +// handle_t start_handle = graph.get_handle(start_id); +// subgraph.add_handle(start_handle); +// visited.insert(graph.get_id(start_handle)); + +// // look only to the right of start_handle +// graph.follow_edges(start_handle, false, [&](const handle_t& handle){ +// // mark the nodes to come as to_visit +// if (visited.find(graph.get_id(handle)) == visited.end()) { +// to_visit.insert(graph.get_id(handle)); +// } +// }); + +// /// explore the rest of the snarl: +// while (to_visit.size() != 0) { +// // remove cur_handle from to_visit +// unordered_set::iterator cur_index = to_visit.begin(); +// handle_t cur_handle = graph.get_handle(*cur_index); + +// to_visit.erase(cur_index); + +// /// visit cur_handle +// visited.insert(graph.get_id(cur_handle)); + +// subgraph.add_handle(cur_handle); + +// if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! +// // look for all nodes connected to cur_handle that need to be added +// // looking to the left, +// graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ +// // mark the nodes to come as to_visit +// if (visited.find(graph.get_id(handle)) == visited.end()) { +// to_visit.insert(graph.get_id(handle)); +// } +// }); +// // looking to the right, +// graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ +// // mark the nodes to come as to_visit +// if (visited.find(graph.get_id(handle)) == visited.end()) { +// to_visit.insert(graph.get_id(handle)); +// } +// }); +// } +// } +// return subgraph; +// } +// } \ No newline at end of file diff --git a/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp b/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp new file mode 100644 index 00000000000..5154b47101f --- /dev/null +++ b/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp @@ -0,0 +1,37 @@ +// /* +// Robin Rounthwaite +// Find function call 
in ./subcommand/main.cpp +// */ +// #include +// #include "../vg.hpp" +// #include "../handle.hpp" +// #include "../subgraph.hpp" +// #include "count_walks.hpp" + +// namespace vg { + +// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id); + +// // vector get_path_strings(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle); + +// // unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle); +// vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle); + +// vector depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id); + +// vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id); + +// void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream); + +// void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); + +// SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); + +// vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id); + +// VG strings_to_graph(const vector& walks); + +// void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id); + +// pair, vector> get_sources_and_sinks(HandleGraph& graph); +// } diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index 521e1fa2dc9..430ae8d2a81 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -17,7 +17,10 @@ #include "../algorithms/topological_sort.hpp" #include "../algorithms/remove_high_degree.hpp" -#include "../algorithms/0_demo_final_0.hpp" +#include "../algorithms/0_draft_haplotype_realignment.hpp" +#include "../gbwt_helper.hpp" +#include "../stream/vpkg.hpp" + using 
namespace std; using namespace vg; @@ -863,17 +866,88 @@ int main_mod(int argc, char** argv) { } if ( !demo_0.empty() ) { + /// Build the gbwt: + ifstream gbwt_stream; + string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new_2.gbwt"; //Nodes 23493 to 23505 + gbwt_stream.open(gbwt_name); + + // Load the GBWT from its container + unique_ptr gbwt; + gbwt = stream::VPKG::load_one(gbwt_stream); + GBWTGraph haploGraph = vg::GBWTGraph(*gbwt, *graph); + + /// Run test code: + vg::id_t source = 23493; vg::id_t sink = 23505; + pair< vector< vector >, vector< vector > > haplotypes = extract_haplotypes(haploGraph, source, sink); + align_haplotypes(haploGraph, haplotypes); + + } + + // graph->serialize_to_ostream(std::cout); + delete graph; + + return 0; +} + +// Register subcommand +static Subcommand vg_mod("mod", "filter, transform, and edit the graph", TOOLKIT, main_mod); + + + + + + + + + + + + + + + + - ///Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. - vg::id_t source = 1; - vg::id_t sink = 8; - vector haplotypes = haplotypes_to_strings(*graph, source, sink); - cout << "here goes!" << endl; - for(string haplotype : haplotypes) { +//TODO: Remove JUNK: + + // vg::id_t source = 23251;//for robin_haplotypes/simple + // vg::id_t sink = 23257;//for robin_haplotypes/simple + // /Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. + // vg::id_t source = 23251;//for robin_haplotypes/simple + // vg::id_t sink = 23257;//for robin_haplotypes/simple + // clean_snarl_from_haplotypes(*graph, source, sink); + // cerr << "done!" 
<< endl; + // vg::handle_t source_handle = graph->get_handle(source); + // vg::handle_t sink_handle = graph->get_handle(sink); + + // vector haplotypes = depth_first_haplotypes_to_strings(*graph, source, sink); + // cerr << "finished depth_first, now on to reference." << endl; + // vector reference = get_paths(*graph, source_handle, sink_handle); + + // haplotypes.insert(end(haplotypes), begin(reference), end(reference)); + + // cerr << "here goes!" << endl; + // for(string haplotype : haplotypes) { - cout << haplotype << endl; - } + // cerr << haplotype << endl; + // } + // cerr << "done" << endl; + + + + + + + + + + + + + + + // std::ifstream snarl_stream; // snarl_stream.open(demo_0); @@ -884,14 +958,6 @@ int main_mod(int argc, char** argv) { // } // clean_all_snarls(*graph, snarl_stream); - } - // graph->serialize_to_ostream(std::cout); - delete graph; + // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; - - return 0; -} - -// Register subcommand -static Subcommand vg_mod("mod", "filter, transform, and edit the graph", TOOLKIT, main_mod); From 14f52d5261cb46f31ed57cc5e48341cb7bad6460 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 13 Jun 2019 12:04:24 -0700 Subject: [PATCH 11/63] Fixing makefile merge issue --- Makefile | 5 + src/algorithms/0_demo_final_0.cpp | 401 ------------------------------ src/algorithms/0_demo_final_0.hpp | 27 -- src/msa_converter.cpp | 4 +- 4 files changed, 7 insertions(+), 430 deletions(-) delete mode 100644 src/algorithms/0_demo_final_0.cpp delete mode 100644 src/algorithms/0_demo_final_0.hpp diff --git a/Makefile b/Makefile index 5cc724877a3..606c00e59cf 100644 --- a/Makefile +++ b/Makefile @@ -249,11 +249,16 @@ DEPS += $(INC_DIR)/backward.hpp DEPS += $(INC_DIR)/dozeu/dozeu.h ifneq ($(shell uname -s),Darwin) +<<<<<<< Updated upstream # Use tcmalloc only DEPS += $(LIB_DIR)/libtcmalloc_minimal.a LD_LIB_FLAGS += -ltcmalloc_minimal CONFIGURATION_OBJ += 
$(OBJ_DIR)/tcmalloc_configuration.o +======= + # DEPS += $(LIB_DIR)/libtcmalloc_minimal.a + # LD_LIB_FLAGS += -ltcmalloc_minimal +>>>>>>> Stashed changes endif diff --git a/src/algorithms/0_demo_final_0.cpp b/src/algorithms/0_demo_final_0.cpp deleted file mode 100644 index 480ea3db4f0..00000000000 --- a/src/algorithms/0_demo_final_0.cpp +++ /dev/null @@ -1,401 +0,0 @@ -#pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? -#include "0_demo_final_0.hpp" -#include -#include "../vg.hpp" -#include "../handle.hpp" -#include "../subgraph.hpp" -#include "count_walks.hpp" -#include -#include -#include -#include "../msa_converter.hpp" -#include "../snarls.hpp" -#include "../gbwt_helper.hpp" -#include "../stream/vpkg.hpp" - -namespace vg { - -// void print_kmer(const std::vector>&, const std::string& string){ -// cout << string << endl; -// } - -vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ - - ///stuff that will go in mod_main: - ifstream gbwt_stream; - string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; - gbwt_stream.open(gbwt_name); - - unique_ptr gbwt; - // Load the GBWT from its container - gbwt = stream::VPKG::load_one(gbwt_stream); -// ----------------------------------------------------------------- - /// make subgraph for the snarl: - SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); - - GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); -//TODO:identify source and sinks for troubleshooting! 
- unordered_map> sequences; // will contain all haplotype walks through snarl - handle_t source_handle = haploGraph.get_handle(source_id); - sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); - - for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { - - vector seqs_here = sequences[handle]; - gbwt::SearchState cur_state = haploGraph.get_state(handle); - - haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { - handle_t next_handle = haploGraph.get_handle(next_search.node); - string next_seq = haploGraph.get_sequence(next_handle); - // transfer the sequences for the preceding handle to next_handle's sequences, - // plus the new handle's sequence. - for (string seq : seqs_here){ - sequences[next_handle].push_back(seq + next_seq); - } - - - - }); - } - - // all the sequences at the sinks will be all the sequences in the snarl. - handle_t sink_handle = haploGraph.get_handle(sink_id); - return sequences[sink_handle]; -} - - -void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ - SnarlManager* snarl_manager = new SnarlManager(snarl_stream); - -/* Use this code to count number of snarls in graph. 
-* int top_count = 0; -* for (const Snarl* snarl : snarl_manager->top_level_snarls()){ -* top_count++; -* } -* cerr << "number of top_level snarls in graph: " << top_count << endl; -* -* int general_count = 0; -* snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ -* general_count++; -* }); -* cerr << "number of total snarls in graph: " << general_count << endl; -*/ - - - vector snarl_roots = snarl_manager->top_level_snarls(); - for (auto roots : snarl_roots){ - clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); - } - - delete snarl_manager; - - -} - -// Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. -void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ - //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings - //representing all possible walks through the snarl: - vector walks = graph_to_strings(graph, start_id, end_id); - - //Make a new snarl from walks: - VG new_snarl = strings_to_graph(walks); - - integrate_snarl(graph, new_snarl, start_id, end_id); - -} - -void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id){ - //Get old graph snarl - SubHandleGraph graph_snarl = extract_subgraph(graph, start_id, end_id); - - //Identify old and new snarl start and sink - pair, vector> graph_snarl_defining_handles = get_sources_and_sinks(graph_snarl); - pair, vector> new_snarl_defining_handles = get_sources_and_sinks(new_snarl); - - //Check to make sure that newly made snarl has only one start and end. - if(new_snarl_defining_handles.first.size() > 1 || new_snarl_defining_handles.second.size() > 1){ - cerr << "newly made snarl with more than one start or end. 
# of starts: " << new_snarl_defining_handles.first.size() << " # of ends: " << new_snarl_defining_handles.second.size() << endl; - return; - } - //extract old and new snarl start and sink: - handle_t new_snarl_start = new_snarl_defining_handles.first[0]; - handle_t new_snarl_end = new_snarl_defining_handles.second[0]; - - handle_t graph_snarl_start = graph_snarl_defining_handles.first[0]; - handle_t graph_snarl_end = graph_snarl_defining_handles.second[0]; - - ///Replace start and end handles of old graph snarl with new_snarl start and end, and delete - ///rest of old graph snarl. - - //Get everything needed to replace graph start and sink. - string new_start_seq = new_snarl.get_sequence(new_snarl_start); - string new_end_seq = new_snarl.get_sequence(new_snarl_end); - id_t new_start_id = graph.get_id(graph_snarl_start); - id_t new_end_id = graph.get_id(graph_snarl_end); - vector left_of_start; - graph.follow_edges(graph_snarl_start, true, [&](const handle_t& handle){ - left_of_start.emplace_back(handle); - }); - vector right_of_end; - graph.follow_edges(graph_snarl_end, false, [&](const handle_t& handle){ - right_of_end.emplace_back(handle); - }); - - //Delete all handles in graph_snarl - graph_snarl.for_each_handle([&](const handle_t& handle){ - graph.destroy_handle(handle); - }, false); - - //Make start and end handles for snarl in graph: - handle_t new_start_handle = graph.create_handle(new_start_seq, new_start_id); - handle_t new_end_handle = graph.create_handle(new_end_seq, new_end_id); - - //Insert start and end handles: - for (handle_t handle : left_of_start) { - graph.create_edge(handle, new_start_handle); - } - for (handle_t handle : right_of_end) { - graph.create_edge(new_end_handle, handle); - } - - ///Reintegrate rest of new_snarl. - //topologically ordered new_snarl. As I progress through each node in topo_order, - //I can add all the nodes to the right of the snarl. 
The final node will be the - //end node, which, instead of adding as a new node to graph, I'll re-connect - //to the modified end_node, above. - vector new_snarl_topo_order = algorithms::lazier_topological_order(&new_snarl); - - //Construct a parallel graph_snarl_topo_order to identify - //paralogous nodes between new_snarl and graph. - vector graph_snarl_topo_order = {new_start_handle}; - - for (auto it = ++new_snarl_topo_order.begin(); it != --new_snarl_topo_order.end(); it++){ - //For every handle in new_snarl, make an (unconnected) handle in graph. - string handle_seq = new_snarl.get_sequence(*it); - handle_t graph_handle = graph.create_handle(handle_seq); - graph_snarl_topo_order.push_back(graph_handle); - } - - graph_snarl_topo_order.push_back(new_end_handle); - - //Connect the rest of the nodes: - for (int i = 0; i < new_snarl_topo_order.size(); i++){ - // cerr << new_snarl.get_id(new_snarl_topo_order[i]) << endl; - - new_snarl.follow_edges(new_snarl_topo_order[i], false, [&](const handle_t& snarl_handle){ - //get topo_index of nodes to be connected to graph start handle - auto it = find(new_snarl_topo_order.begin(), new_snarl_topo_order.end(), snarl_handle); - int topo_index = it - new_snarl_topo_order.begin(); - // cerr << "topo_index" << topo_index << endl; - // cerr << "i" << i << endl; - - //connect graph start handle - graph.create_edge(graph_snarl_topo_order[i], graph_snarl_topo_order[topo_index]); - }); - } - -} - -//Returns tuple of two handles, first being start and second being sink. 
-pair, vector> get_sources_and_sinks(HandleGraph& graph){ - vector sink; - vector source; - - // identify sources and sinks - graph.for_each_handle([&](const handle_t& handle) { - bool is_source = true, is_sink = true; - graph.follow_edges(handle, true, [&](const handle_t& prev) { - is_source = false; - return false; - }); - graph.follow_edges(handle, false, [&](const handle_t& next) { - is_sink = false; - return false; - }); - - // base case for dynamic programming - if (is_source) { - source.push_back(handle); - } - if (is_sink) { - sink.emplace_back(handle); - } - }); - - return pair, vector>(source, sink); - -} - - -VG strings_to_graph(const vector& walks){ - seqan::Align align; // create multiple_sequence_alignment object - - seqan::resize(rows(align), walks.size()); - for (int i = 0; i < walks.size(); ++i){ - assignSource(row(align, i), walks[i].c_str()); - } - - - globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); - - stringstream ss; - ss << align; - MSAConverter myMSAConverter = MSAConverter(); - myMSAConverter.load_alignments(ss, "seqan"); - VG snarl = myMSAConverter.make_graph(); - snarl.clear_paths(); - - - // snarl.serialize_to_ostream(cout); - return snarl; -} - - - - -vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ - SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); - - unordered_map> sequences; - vector sinks; - unordered_map count; - count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl - sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl - - // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
- snarl.for_each_handle([&](const handle_t& handle) { - bool is_source = true, is_sink = true; - snarl.follow_edges(handle, true, [&](const handle_t& prev) { - is_source = false; - return false; - }); - snarl.follow_edges(handle, false, [&](const handle_t& next) { - is_sink = false; - return false; - }); - - // base case for dynamic programming - if (is_source) { - count[handle] = 1; - sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? - } - if (is_sink) { - sinks.emplace_back(handle); - } - }); - - - // count walks by dynamic programming - bool overflowed = false; - for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { - size_t count_here = count[handle]; - vector seqs_here = sequences[handle]; - - snarl.follow_edges(handle, false, [&](const handle_t& next) { - - size_t& count_next = count[next]; - string seq_next = snarl.get_sequence(next); - - if (numeric_limits::max() - count_here < count_next) { - overflowed = true; - } - - else { - count_next += count_here; - // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ - for (string seq : seqs_here){ - sequences[next].push_back(seq + seq_next); - } - // cout << "next_seqs: "; - // for (string seq : sequences[next]){ - // cout << seq << endl; - // } - } - }); - ///TODO: figure out how to deal with overflow. - // if (overflowed) { - // return numeric_limits::max(); - // } - } - - // total up the walks at the sinks - size_t total_count = 0; - for (handle_t& sink : sinks) { - total_count += count[sink]; - } - - // all the sequences at the sinks will be all the sequences in the snarl. - vector walks; - for (handle_t& sink : sinks) { - for (string seq : sequences[sink]){ - walks.push_back(seq); - } - } - - return walks; -} - - -// given a start and end node id, construct an extract subgraph between the two nodes (inclusive). 
-// TODO: change the arguments to handles, which contain orientation within themselves. -// That way, iteration to extract the subgraph will have direction contained within themselves. -// This may actually end up looking like simply parsing an input text file with the handles -// described from the find_snarl output. -SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ - /// make a subgraph containing only nodes of interest. (e.g. a snarl) - // make empty subgraph - SubHandleGraph subgraph = SubHandleGraph(&graph); - - unordered_set visited; // to avoid counting the same node twice. - unordered_set to_visit; // nodes found that belong in the subgraph. - - // TODO: how to ensure that "to the right" of start_handle is the correct direction? - // initialize with start_handle (because we move only to the right of start_handle): - handle_t start_handle = graph.get_handle(start_id); - subgraph.add_handle(start_handle); - visited.insert(graph.get_id(start_handle)); - - // look only to the right of start_handle - graph.follow_edges(start_handle, false, [&](const handle_t& handle){ - // mark the nodes to come as to_visit - if (visited.find(graph.get_id(handle)) == visited.end()) { - to_visit.insert(graph.get_id(handle)); - } - }); - - /// explore the rest of the snarl: - while (to_visit.size() != 0) { - // remove cur_handle from to_visit - unordered_set::iterator cur_index = to_visit.begin(); - handle_t cur_handle = graph.get_handle(*cur_index); - - to_visit.erase(cur_index); - - /// visit cur_handle - visited.insert(graph.get_id(cur_handle)); - - subgraph.add_handle(cur_handle); - - if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! 
- // look for all nodes connected to cur_handle that need to be added - // looking to the left, - graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ - // mark the nodes to come as to_visit - if (visited.find(graph.get_id(handle)) == visited.end()) { - to_visit.insert(graph.get_id(handle)); - } - }); - // looking to the right, - graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ - // mark the nodes to come as to_visit - if (visited.find(graph.get_id(handle)) == visited.end()) { - to_visit.insert(graph.get_id(handle)); - } - }); - } - } - return subgraph; -} -} \ No newline at end of file diff --git a/src/algorithms/0_demo_final_0.hpp b/src/algorithms/0_demo_final_0.hpp deleted file mode 100644 index 67f0210f7ee..00000000000 --- a/src/algorithms/0_demo_final_0.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* -Robin Rounthwaite -Find function call in ./subcommand/main.cpp -*/ -#include -#include "../vg.hpp" -#include "../handle.hpp" -#include "../subgraph.hpp" -#include "count_walks.hpp" - -namespace vg { - vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id); - - void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream); - - void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); - - SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); - - vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id); - - VG strings_to_graph(const vector& walks); - - void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id); - - pair, vector> get_sources_and_sinks(HandleGraph& graph); -} diff --git a/src/msa_converter.cpp b/src/msa_converter.cpp index 5d62a225b63..fa75a993a9d 100644 --- a/src/msa_converter.cpp +++ b/src/msa_converter.cpp @@ -7,8 +7,8 @@ #include "vg.hpp" #include 
"msa_converter.hpp" - -// #define debug_msa_converter +//TODO: remove definition +#define debug_msa_converter namespace vg { From f4506bd224b7d28a8265eb5e4a593ca4895eb6e9 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 13 Jun 2019 12:27:18 -0700 Subject: [PATCH 12/63] Makefile edits --- Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Makefile b/Makefile index 606c00e59cf..5cc724877a3 100644 --- a/Makefile +++ b/Makefile @@ -249,16 +249,11 @@ DEPS += $(INC_DIR)/backward.hpp DEPS += $(INC_DIR)/dozeu/dozeu.h ifneq ($(shell uname -s),Darwin) -<<<<<<< Updated upstream # Use tcmalloc only DEPS += $(LIB_DIR)/libtcmalloc_minimal.a LD_LIB_FLAGS += -ltcmalloc_minimal CONFIGURATION_OBJ += $(OBJ_DIR)/tcmalloc_configuration.o -======= - # DEPS += $(LIB_DIR)/libtcmalloc_minimal.a - # LD_LIB_FLAGS += -ltcmalloc_minimal ->>>>>>> Stashed changes endif From 435438d9f7ed2395552df1b9f90d34e17f2a7335 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 1 Jul 2019 12:29:18 -0700 Subject: [PATCH 13/63] Added normalize snarl argument `vg normalize`. 
On branch normalize_snarls Changes to be committed: modified: src/algorithms/0_draft_haplotype_realignment.cpp modified: src/algorithms/0_draft_haplotype_realignment.hpp deleted: src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp deleted: src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp deleted: src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp deleted: src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp new file: src/subcommand/0_normalize_main.cpp --- .../0_draft_haplotype_realignment.cpp | 1447 +++++++++++------ .../0_draft_haplotype_realignment.hpp | 147 +- ...d_includes_non_path_oriented_approach).cpp | 1018 ------------ ...old_0-diff_extension_for_not_at_source.cpp | 964 ----------- .../0_demo_final_old_0-only_source_paths.cpp | 741 --------- .../0_demo_final_old_0-only_source_paths.hpp | 37 - src/subcommand/0_normalize_main.cpp | 176 ++ 7 files changed, 1248 insertions(+), 3282 deletions(-) delete mode 100644 src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp delete mode 100644 src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp delete mode 100644 src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp delete mode 100644 src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp create mode 100644 src/subcommand/0_normalize_main.cpp diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index 09f192bb532..ba57ad05bb8 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -1,161 +1,209 @@ -#pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? +#pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? 
#include "0_draft_haplotype_realignment.hpp" -#include #include +#include #include -#include #include +#include -#include "../vg.hpp" #include "../gbwt_helper.hpp" -#include "../stream/vpkg.hpp" -#include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? +#include "../handle.hpp" +#include "../msa_converter.hpp" +#include "../snarls.hpp" +#include "../vg.hpp" #include -#include #include -#include "../msa_converter.hpp" - -//TODO: Tomorrow's goal: edit haplotypes using Jordan's technique to re-integrate your snarl. +#include +// #include "../../deps/libhandlegraph/src/include/handlegraph/path_handle_graph.hpp" namespace vg { -/// Given the haplotypes extracted from the graph in extract_haplotypes, -// creates a new subgraph made from the realignment of the extracted -// haplotypes. -void align_haplotypes(const GBWTGraph& haploGraph, const pair< vector< vector >, vector< vector > >& haplotypes){ - vector< string > haplotypes_from_source_to_sink = format_handle_haplotypes_to_strings(haploGraph, haplotypes.first); - vector< string > other_haplotypes = format_handle_haplotypes_to_strings(haploGraph, haplotypes.second); - //TODO: Debug: disamiguate beign/ending regions of nodes by adding leading/trailing AAA seq (essential a special character). - for (string& hap : haplotypes_from_source_to_sink){ - hap = "AAAAAAAA" + hap + "AAAAAAAA"; - } - - - VG new_snarl = align_haplotypes(haplotypes_from_source_to_sink); - //TODO: Debug workaround to avoid hassle of overwriting inputGraph. - new_snarl.serialize_to_ostream(cout); - vector walks = debug_graph_to_strings(new_snarl, 2, 12); - - //TODO: Debug print statements - // cerr << "source_to_sink haplotypes" << endl; - // for (string hap : haplotypes_from_source_to_sink){ - // cerr << hap << endl << endl; - // } - // cerr << "source_to_sink_walks" << endl; - // for (string walk : walks){ - // cerr << walk << endl << endl; - // } - // cerr << "are there any walks that aren't haplotypes?" 
<< endl; - // for (string walk : walks){ - // if (find(haplotypes_from_source_to_sink.begin(), haplotypes_from_source_to_sink.end(), walk) != haplotypes_from_source_to_sink.end()){ - // cerr << "good" << endl; - // } else { - // cerr << "bad walk" << endl; - // cerr << walk << endl; - // } - // } - cerr << "are there any haps that aren't walks?" << endl; - for (string hap : haplotypes_from_source_to_sink){ - if (find(walks.begin(), walks.end(), hap) != walks.end()){ - cerr << "good" << endl; - } else { - cerr << "bad hap" << endl; - cerr << hap << endl; +// TODO: allow for snarls that have haplotypes that begin or end in the middle of the +// snarl +// Runs disambiguate_snarl on every top-level snarl in the graph, so long as the +// snarl only contains haplotype threads that extend fully from source to sink. +// Arguments: +// graph: the full-sized handlegraph that will undergo edits in a snarl. +// haploGraph: the corresponding GBWTGraph of graph. +// snarl_stream: the file stream from .snarl file corresponding to graph. +void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, ifstream &snarl_stream) { + cerr << "disambiguate_top_level_snarls" << endl; + SnarlManager *snarl_manager = new SnarlManager(snarl_stream); + + /** Use this code to count number of snarls in graph. + * int top_count = 0; + * for (const Snarl* snarl : snarl_manager->top_level_snarls()){ + * top_count++; + * } + * cerr << "number of top_level snarls in graph: " << top_count << endl; + * + * int general_count = 0; + * snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ + * general_count++; + * }); + * cerr << "number of total snarls in graph: " << general_count << endl; + */ + + int i = 0; + vector snarl_roots = snarl_manager->top_level_snarls(); + for (auto roots : snarl_roots) { + // TODO: debug_code: + cerr << "return to root node ids, disambiguate snarl with.. 
" << endl; + cerr << "root node ids: " << roots->start().node_id() << " " + << roots->end().node_id() << endl; + disambiguate_snarl(graph, haploGraph, roots->start().node_id(), + roots->end().node_id()); + i += 1; + cerr << endl << endl << "normalized " << i << " snarl(s)." << endl; + if (i == 2) { + break; } } - // cerr << "other haplotypes" << endl; - // for (string hap : other_haplotypes){ - // cerr << hap << endl << endl; - // } - // vector actually_source_to_sink; - // vector to_print_other_haps; - // cerr << "other haplotypes sorted" << endl; - // for (string hap : other_haplotypes){ - // if (find(haplotypes_from_source_to_sink.begin(), haplotypes_from_source_to_sink.end(), hap) != haplotypes_from_source_to_sink.end()){ - // actually_source_to_sink.emplace_back(hap); - // } else { - // to_print_other_haps.emplace_back(hap); - // } + delete snarl_manager; +} - // } - // sort(actually_source_to_sink.begin(), actually_source_to_sink.end()); - // cerr << "actually source to sink" << actually_source_to_sink.size() << endl; - // for (string hap : actually_source_to_sink){ - // cerr << hap << endl << endl; - // } - // cerr << endl << endl << "to_print_other_haps" << to_print_other_haps.size() << endl; - // sort(to_print_other_haps.begin(), to_print_other_haps.end()); - // for (string hap : to_print_other_haps){ - // cerr << hap << endl << endl; - // } +// For a snarl in the given graph, with every edge covered by at least one haplotype +// thread in the GBWTGraph, +// extract all sequences in the snarl corresponding to the haplotype threads and +// re-align them with MSAConverter/seqan to form a new snarl. Embedded paths are +// preserved; GBWT haplotypes in the snarl are not conserved. +// Arguments: +// graph: the full-sized handlegraph that will undergo edits in a snarl. +// haploGraph: the corresponding GBWTGraph of graph. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: none. 
+// TODO: allow for snarls that have haplotypes that begin or end in the middle of the +// snarl. +void disambiguate_snarl(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, const id_t &source_id, + const id_t &sink_id) { + cerr << "disambiguate_snarl" << endl; + + // First, find all haplotypes encoded by the GBWT, in order to create the new snarl. + // Return value is pair< haplotypes_that_stretch_from_source_to_sink, + // haplotypes_that_end/start_prematurely > + pair>, vector>> haplotypes = + extract_gbwt_haplotypes(haploGraph, source_id, sink_id); + + // TODO: this if statement removes snarls where a haplotype begins/ends in the middle + // TODO: of the snarl. Get rid of this once alignment issue is addressed! + if (haplotypes.second.empty()) { + // Convert the haplotypes from vector format to string format. + vector haplotypes_from_source_to_sink = + format_handle_haplotypes_to_strings(haploGraph, haplotypes.first); + // vector< string > other_haplotypes = + // format_handle_haplotypes_to_strings(haploGraph, haplotypes.second); + + // Align the new snarl: + // TODO: find better way to improve disamiguation of beginning/ending regions of + // nodes + // TODO: than by adding leading/trailing AAA seq (essentially a special + // character). + for (string &hap : haplotypes_from_source_to_sink) { + hap = "AAAAAAAA" + hap + "AAAAAAAA"; + } + VG new_snarl = align_source_to_sink_haplotypes(haplotypes_from_source_to_sink); + + // Get the embedded paths in the snarl out of the graph, for the purposes of + // moving them into the new snarl. 
+ vector> embedded_paths = + extract_embedded_paths_in_snarl(graph, source_id, sink_id); + + cerr << "paths: " << endl; + for (auto path : embedded_paths){ + cerr << " path " << graph.get_path_name(graph.get_path_handle_of_step(path.first)) << endl; + for (auto step : {path.first, graph.get_previous_step(path.second)}){ + cerr << "\t" << graph.get_id(graph.get_handle_of_step(step)) << " "; + } + cerr << endl; + } - + // integrate the new_snarl into the graph, removing the old snarl as you go. + integrate_snarl(graph, new_snarl, embedded_paths, source_id, sink_id); + cerr << endl; + } else { + cerr << "found a snarl with haplotypes in the middle. Start: " << source_id + << " sink is " << sink_id << endl; + } } - -//Returns: a pair containting two sets of paths (each represented by a vector). The first -// in the pair represents all paths reaching from source to sink in the snarl, and the -// second representing all other paths in the snarl (e.g. any that don't reach both -// source and sink in the graph.) -pair< vector< vector >, vector< vector > > extract_haplotypes(const GBWTGraph& haploGraph, - const id_t& source_id, - const id_t& sink_id){ - cerr << "depth first begins!" << endl; - //touched_handles contains all handles that have been touched by the depth_first_search, - //for later use in other_haplotypes_to_strings, which identifies paths that didn't stretch - //from source to sink in the snarl. +// TODO: test that it successfully extracts any haplotypes that start/end in the middle of +// TODO: the snarl. +// For a snarl in a given GBWTGraph, extract all the haplotypes in the snarl. Haplotypes +// are represented +// by vectors of handles, representing the chain of handles in a thread. +// Arguments: +// haploGraph: the GBWTGraph containing the snarl. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a pair containting two sets of paths (each represented by a vector). 
The +// first in the pair represents all paths reaching from source to sink in the snarl, +// and the second representing all other paths in the snarl (e.g. any that don't +// reach both source and sink in the graph.) +pair>, vector>> +extract_gbwt_haplotypes(const GBWTGraph &haploGraph, const id_t &source_id, + const id_t &sink_id) { + cerr << "extract_gbwt_haplotypes" << endl; + + // touched_handles contains all handles that have been touched by the + // depth_first_search, for later use in other_haplotypes_to_strings, which identifies + // paths that didn't stretch from source to sink in the snarl. unordered_set touched_handles; - //haplotype_queue contains all started exon_haplotypes not completed yet. - //Every time we encounter a branch in the paths, the next node down the path - //Is stored here, along with the vector of handles that represents the path up - //to the SearchState. - vector< pair< vector, gbwt::SearchState> > haplotype_queue; + // haplotype_queue contains all started exon_haplotypes not completed yet. + // Every time we encounter a branch in the paths, the next node down the path + // Is stored here, along with the vector of handles that represents the path up + // to the SearchState. + vector, gbwt::SearchState>> haplotype_queue; // source and sink handle for haploGraph: handle_t source_handle = haploGraph.get_handle(source_id); handle_t sink_handle = haploGraph.get_handle(sink_id); - //place source in haplotype_queue. + // place source in haplotype_queue. vector source_handle_vec(1, source_handle); gbwt::SearchState source_state = haploGraph.get_state(source_handle); - haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); + haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); touched_handles.emplace(source_handle); - //haplotypes contains all "finished" haplotypes - those that were either walked - //to their conclusion, or until they reached the sink. 
- vector< vector > haplotypes_from_source_to_sink; - vector< vector > other_haplotypes; + // haplotypes contains all "finished" haplotypes - those that were either walked + // to their conclusion, or until they reached the sink. + vector> haplotypes_from_source_to_sink; + vector> other_haplotypes; // for every partly-extracted thread, extend the thread until it either reaches // the sink of the snarl or the end of the thread. while (!haplotype_queue.empty()) { - // get a haplotype out of haplotype_queue to extend - + // get a haplotype out of haplotype_queue to extend - // a tuple of (handles_traversed_so_far, last_touched_SearchState) - pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + pair, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); haplotype_queue.pop_back(); - // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. + // get all the subsequent search_states that immediately follow the searchstate + // from cur_haplotype. vector next_searches; - haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState next_search) -> bool { - // cerr << "this node immediately follows cur_haplotypes current search_state." << haploGraph.get_sequence(haploGraph.node_to_handle(next_search.node)) << haploGraph.get_id(haploGraph.node_to_handle(next_search.node)) << endl; - next_searches.push_back(next_search); - return true; - }); - - // if next_searches > 1, then we need to make multiple new haplotypes to be recorded in haplotype_queue - // or one of the finished haplotype_handle_vectors. - if (next_searches.size() > 1){ - - // for every next_search in next_searches, either create a new, extended cur_haplotype to push into haplotype queue, - // or place in the haplotypes_from_source_to_sink if haplotype extends to sink, - // or place in the other_haplotypes if haplotype ends before reaching sink. 
- for (gbwt::SearchState next_search : next_searches){ + haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + // if next_searches > 1, then we need to make multiple new haplotypes to be + // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. + if (next_searches.size() > 1) { + // for every next_search in next_searches, either create a new, extended + // cur_haplotype to push into haplotype queue, or place in the + // haplotypes_from_source_to_sink if haplotype extends to sink, or place in + // the other_haplotypes if haplotype ends before reaching sink. + for (gbwt::SearchState next_search : next_searches) { handle_t next_handle = haploGraph.node_to_handle(next_search.node); // copy over the vector of cur_haplotype: @@ -165,106 +213,87 @@ pair< vector< vector >, vector< vector > > extract_haplotype next_handle_vec.push_back(next_handle); // if new_handle is the sink, put in haplotypes_from_source_to_sink - if (haploGraph.get_id(next_handle) == sink_id){ + if (haploGraph.get_id(next_handle) == sink_id) { haplotypes_from_source_to_sink.push_back(next_handle_vec); - - } else { // keep extending the haplotype! - - pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); + } else // keep extending the haplotype! + { + pair, gbwt::SearchState> next_haplotype = + make_pair(next_handle_vec, next_search); haplotype_queue.push_back(next_haplotype); - } - - //next_handle will be touched. + // next_handle will be touched. touched_handles.emplace(next_handle); } - - } // if next_searches is empty, the path has ended but not reached sink. - else if ( next_searches.empty() ) { - //TODO: debug - // cerr << "next_searches is empty" << endl; - + } + // if next_searches is empty, the path has ended but not reached sink. 
+ else if (next_searches.empty()) { // We have reached the end of the path, but it doesn't reach the sink. // we need to add cur_haplotype to other_haplotypes. other_haplotypes.push_back(cur_haplotype.first); - - } // if new_handle is the sink, put in haplotypes_from_source_to_sink - else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { - // TODO: debug: - // cerr << "next_searches is sink" << endl; - - // Then we need to add cur_haplotype + next_search to haplotypes_from_source_to_sink. + } + // if new_handle is the sink, put in haplotypes_from_source_to_sink + else if (haploGraph.get_id( + haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { + // Then we need to add cur_haplotype + next_search to + // haplotypes_from_source_to_sink. handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); cur_haplotype.first.push_back(next_handle); haplotypes_from_source_to_sink.push_back(cur_haplotype.first); - //touched next_search's handle + // touched next_search's handle touched_handles.emplace(next_handle); - - } //else, there is just one next_search, and it's not the end of the path. - //just extend the search by adding (cur_haplotype + next_search to haplotype_queue. + } + // else, there is just one next_search, and it's not the end of the path. + // just extend the search by adding (cur_haplotype + next_search to + // haplotype_queue. else { - // get the next_handle from the one next_search. handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); - // TODO: debug: - // cerr << "normal extend" << endl; - // cerr << "this is next_handle" << haploGraph.get_id(next_handle) << endl; - - // modify cur_haplotype with next_handle and next_search. + // modify cur_haplotype with next_handle and next_search. cur_haplotype.first.push_back(next_handle); - cur_haplotype.second = next_searches.back(); // there's only one next_search in next_searches. 
- + cur_haplotype.second = + next_searches.back(); // there's only one next_search in next_searches. + // put cur_haplotype back in haplotype_queue. haplotype_queue.push_back(cur_haplotype); touched_handles.emplace(next_handle); - } - } - //Find any haplotypes starting from handles not starting at the source, but which - //still start somewhere inside the snarl. - vector> haplotypes_not_starting_at_source = find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); + // Find any haplotypes starting from handles not starting at the source, but which + // still start somewhere inside the snarl. + vector> haplotypes_not_starting_at_source = + find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); // move haplotypes_not_starting_at_source into other_haplotypes: - other_haplotypes.reserve(other_haplotypes.size() + haplotypes_not_starting_at_source.size()); - move(haplotypes_not_starting_at_source.begin(), haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); + other_haplotypes.reserve(other_haplotypes.size() + + haplotypes_not_starting_at_source.size()); + move(haplotypes_not_starting_at_source.begin(), + haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); return make_pair(haplotypes_from_source_to_sink, other_haplotypes); } -vector< string > format_handle_haplotypes_to_strings(const GBWTGraph& haploGraph, const vector< vector< handle_t > >& haplotype_handle_vectors){ - vector< string > haplotype_strings; - for (vector haplotype_handles : haplotype_handle_vectors){ - string hap; - for (handle_t& handle : haplotype_handles){ - hap += haploGraph.get_sequence(handle); - } - haplotype_strings.push_back(hap); - } - return haplotype_strings; -} - -vector> find_haplotypes_not_at_source(const GBWTGraph& haploGraph, unordered_set& touched_handles, const id_t& sink_id){ - //TODO: debug: source handle size? 
- // cerr << '\n\n\n\n' << endl; - // for (id_t node_id = 23493; node_id <= 23505; node_id ++){ - // handle_t trial_handle = haploGraph.get_handle(node_id); - // gbwt::SearchState normal_search = haploGraph.get_state(trial_handle); - // cerr << "is normal searchstate at handle " << haploGraph.get_id(trial_handle) << " empty? " << normal_search.empty() << " size: " << normal_search.size() << endl; - // gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(trial_handle)); - // cerr << "is the prefix searchstate empty? " << new_search.empty() << " size: " << new_search.size() << endl; - // } - - - +// Used to complete the traversal of a snarl along its haplotype threads, when there are +// handles connected to the snarl by +// threads that start after the source handle. (Threads that merely end before the +// sink handle are addressed in extract_gbwt_haplotypes). +// Arguments: +// haploGraph: the GBWTgraph containing the haplotype threads. +// touched_handles: any handles found in the snarl so far. +// sink_id: the id of the final handle in the snarl. +// Returns: +// a vector of haplotypes in vector format that start in the middle of the +// snarl. +vector> +find_haplotypes_not_at_source(const GBWTGraph &haploGraph, + unordered_set &touched_handles, + const id_t &sink_id) { + cerr << "find_haplotypes_not_at_source" << endl; - - cerr << "finding haplotypes not at source!" << endl; /// Search every handle in touched handles for haplotypes starting at that point. - // Any new haplotypes will be added to haplotype_queue. + // Any new haplotypes will be added to haplotype_queue. 
vector, gbwt::SearchState>> haplotype_queue; // Fully extended haplotypes (or haplotypes extended to the snarl's sink) @@ -280,266 +309,172 @@ vector> find_haplotypes_not_at_source(const GBWTGraph& haploGra handle_t sink_handle = haploGraph.get_handle(sink_id); touched_handles.erase(sink_handle); - // Create nested function for making a new_search: - auto make_new_search = [&](handle_t handle) { - cerr << "lambda" << endl; - + // Nested function for making a new_search. Identifies threads starting at a given + // handle and + // either adds them as a full haplotype (if the haplotype is one handle long) or + // makes a new entry to haplotype_queue. + auto make_new_search = [&](handle_t handle) { // Are there any new threads starting at this handle? - gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(handle)); - // if (new_search != gbwt::SearchState()){ - if (!new_search.empty()){ - //TODO: Debug code: are searchstates empty? - cerr << "apparently new thread starts at node: " << haploGraph.get_id(handle) << endl; - cerr << "is the searchstate empty? " << new_search.empty() << " size: " << new_search.size() << endl; + gbwt::SearchState new_search = + haploGraph.index.prefix(haploGraph.handle_to_node(handle)); + if (!new_search.empty()) { + // TODO: test_code code: are searchstates empty? + cerr << "apparently new thread starts at node: " << haploGraph.get_id(handle) + << endl; + cerr << "is the searchstate empty? " << new_search.empty() + << " size: " << new_search.size() << endl; // Then add them to haplotype_queue. - haploGraph.follow_paths(new_search, [&](const gbwt::SearchState& next_search) -> bool { - - handle_t next_handle = haploGraph.node_to_handle(next_search.node); - - /// check to make sure that the thread isn't already finished: - // if next_handle is the sink, or if this thread is only one handle long, - // then there isn't any useful string to extract from this. 
- if (next_handle != sink_handle || next_search == gbwt::SearchState()){ - // establish a new thread to walk along. - vector new_path; - new_path.push_back(handle); - new_path.push_back(next_handle); - - pair, gbwt::SearchState > mypair = make_pair(new_path, next_search); - - - // add the new path to haplotype_queue to be extended. - haplotype_queue.push_back(make_pair(new_path, next_search)); - - // if next_handle hasn't been checked for starting threads, add to to_search. - if (touched_handles.find(next_handle) == touched_handles.end()){ - to_search.emplace(next_handle); + haploGraph.follow_paths( + new_search, [&](const gbwt::SearchState &next_search) -> bool { + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + /// check to make sure that the thread isn't already finished: + // if next_handle is the sink, or if this thread is only one handle + // long, then there isn't any useful string to extract from this. + if (next_handle != sink_handle || + next_search == gbwt::SearchState()) { + // establish a new thread to walk along. + vector new_path; + new_path.push_back(handle); + new_path.push_back(next_handle); + + pair, gbwt::SearchState> mypair = + make_pair(new_path, next_search); + + // add the new path to haplotype_queue to be extended. + haplotype_queue.push_back(make_pair(new_path, next_search)); + + // if next_handle hasn't been checked for starting threads, add to + // to_search. + if (touched_handles.find(next_handle) == touched_handles.end()) { + to_search.emplace(next_handle); + } } - } - return true; - }); + return true; + }); } }; - // TODO: Debug code: Search every handle in touched handles for haplotypes starting at that point. - // for (handle_t handle : touched_handles){ - // cerr << "isn't a source handle: " << haploGraph.get_sequence(handle) << endl; - // make_new_search(handle); - // } - /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. 
/// Then, check to see if there are any new threads on handles in to_search. - /// Extend those threads, and add any newly found handles to to_search, + /// Extend those threads, and add any newly found handles to to_search, /// then search for threads again in to_search again... repeat until to_search remains /// emptied of new handles. - // for tracking whether the haplotype thread is still extending: + // for tracking whether the haplotype thread is still extending: bool still_extending; - - // TODO: Debug code: did we find any haplotypes that need extending? - // cerr << "haps need extending below:" << endl; - // for (auto handle : to_search){ - // cerr << "hap needs extending: " << haploGraph.get_id(handle) << " " << haploGraph.get_sequence(handle) << endl; - // } - // cerr << "haps queue:" << endl; - // for (auto hap : haplotype_queue){ - // handle_t handle = haploGraph.node_to_handle(hap.second.node); - // cerr << "need to search hap: " << haploGraph.get_id(handle) << " " << haploGraph.get_sequence(handle) << endl; - // } - // extend haplotypes on any nodes found to act as a starting thread. - while(!to_search.empty() || !haplotype_queue.empty()){ - while (!haplotype_queue.empty()){ - cerr << "extend haplotype_queue" << endl; - - // get a haplotype to extend out of haplotype_queue - a tuple of (handles_traversed_so_far, last_touched_SearchState) - pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + while (!to_search.empty() || !haplotype_queue.empty()) { + while (!haplotype_queue.empty()) { + // get a haplotype to extend out of haplotype_queue - a tuple of + // (handles_traversed_so_far, last_touched_SearchState) + pair, gbwt::SearchState> cur_haplotype = + haplotype_queue.back(); haplotype_queue.pop_back(); - // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. + // get all the subsequent search_states that immediately follow the + // searchstate from cur_haplotype. 
vector next_searches; - haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { - next_searches.push_back(next_search); - return true; - }); + haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState &next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); - for (gbwt::SearchState next_search: next_searches){ + for (gbwt::SearchState next_search : next_searches) { handle_t next_handle = haploGraph.node_to_handle(next_search.node); // if next_search is empty, then we've fallen off the thread, - // and cur_haplotype can be placed in finished_haplotypes as is for this thread. - if (next_search == gbwt::SearchState()){ - + // and cur_haplotype can be placed in finished_haplotypes as is for this + // thread. + if (next_search == gbwt::SearchState()) { finished_haplotypes.push_back(cur_haplotype.first); + } - } - // if next_search is on the sink_handle, + // if next_search is on the sink_handle, // then cur_haplotype.first + next_search goes to finished_haplotypes. - else if (haploGraph.get_id(next_handle) == sink_id){ + else if (haploGraph.get_id(next_handle) == sink_id) { // copy over the vector of cur_haplotype: vector next_handle_vec(cur_haplotype.first); - //add next_handle + // add next_handle next_handle_vec.push_back(next_handle); - //place in finished_haplotypes + // place in finished_haplotypes finished_haplotypes.push_back(next_handle_vec); - // also, if next_handle hasn't been checked for new threads, add to to_search. - if (touched_handles.find(next_handle) != touched_handles.end()){ + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { to_search.emplace(next_handle); } - - } - // otherwise, just place an extended cur_haplotype in haplotype_queue. - else { + } + // otherwise, just place an extended cur_haplotype in haplotype_queue. 
+ else { // copy over cur_haplotype: - pair< vector, gbwt::SearchState> cur_haplotype_copy = cur_haplotype; - //modify with next_handle/search + pair, gbwt::SearchState> cur_haplotype_copy = + cur_haplotype; + // modify with next_handle/search cur_haplotype_copy.first.push_back(next_handle); cur_haplotype_copy.second = next_search; // place back in haplotype_queue for further extension. haplotype_queue.push_back(cur_haplotype_copy); - // also, if next_handle hasn't been checked for new threads, add to to_search. - if (touched_handles.find(next_handle) != touched_handles.end()){ + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { to_search.emplace(next_handle); } - } - } - - - + } } - // Then, make more new_searches from the handles in to_search. - for (handle_t handle : to_search){ - make_new_search(handle); // will add to haplotype_queue if there's any new_searches to be had. + // Then, make more new_searches from the handles in to_search. + for (handle_t handle : to_search) { + make_new_search(handle); // will add to haplotype_queue if there's any + // new_searches to be had. } to_search.clear(); - } return finished_haplotypes; } - -//TODO: make return a vector> instead, then convert using separate fxn. -// Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. -// Only walks along embedded paths. Returns a map with string keys and values of vectors of handles, -// where each vector of handles represents one path from source to sink. 
-// alternative function return: -//unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ -vector get_embedded_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ - unordered_map > paths; - unordered_map multiple_occurrences; - - // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. - graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { - // Each occurrence represents an embedded path - // (note - in the case of a looped path, there will be multiple occurrences for one path.) - // For each path represented by an occurrence, we need to walk along the path until we reach - // the sink node. That series of handles represents the the sequence of the path. - - string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); - if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) - - //record this in multiple_occurrences, and get the number of times we've seen this occurrence. - int occ_num; - if (multiple_occurrences.find(path) == multiple_occurrences.end()){ - occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. - multiple_occurrences[path] = occ_num; - } else { - occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. - } - - //record the other occurrences with an added identifier to differentiate between paths. - paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); - } - else{ // this is the first time we've encountered this occurrence. - paths[path].emplace_back(occurrence); - } - }); - - //Now, for every occurrence, walk along the path until we reach the sink. 
- for (pair > path : paths){ - // cerr << "my name" << path.first << endl; - // cerr << "my occurences:" << endl; - // for (auto occ : path.second) { - // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; - // } - // cerr << "testing get_next_occurrence:" << endl; - // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); - // cerr << cur_id; - - // cur_occurence is the current handle while walking along the path - occurrence_handle_t cur_occurrence = path.second.back(); - id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); - // store the path in paths, in the occurrence_handle_t vector. - while (cur_id != graph.get_id(sink_handle)){ - paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); - // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); - cur_occurrence = paths[path.first].back(); - // cur_occurrence = path.second.back(); - cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); - cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; - } - path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); - cerr << path.second.size() << endl; - for (auto handle : path.second) { - cerr << graph.get_sequence(graph.get_occurrence(handle)); - } - } - cerr << "havin' issues here?" << endl; - for (auto path : paths) { - for (auto handle : path.second) { - cerr << graph.get_sequence(graph.get_occurrence(handle)); - } - } - // Resolve multiple_occurrences by identifying which entry in paths - // (of those part of the same path) is longest - that will - // represent the full breadth of the path through the snarl. 
- for (pair element : multiple_occurrences){ - // A vector of all the path entries in paths: - vector same_path_names = {element.first}; - - int max_len = paths[element.first].size(); - string max_path = element.first; - - for (int occ_num : range_vector(element.second)){ - occ_num++; // we actually need range_vector[1, ..., end()] - string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; - int cur_len = paths[cur_path].size(); - same_path_names.push_back(cur_path); - - if (cur_len > max_len){ - max_len = cur_len; - max_path = cur_path; - } - } - - // get rid of the smaller fragments of path: - for (string name : same_path_names) { - if (name != max_path){ - paths.erase(name); - } +// Given a vector of haplotypes of format vector< handle_t >, returns a vector of +// haplotypes of +// format string (which is the concatenated sequences in the handles). +// Arguments: +// haploGraph: a GBWTGraph which contains the handles in vector< handle_t > +// haplotypes. haplotypte_handle_vectors: a vector of haplotypes in vector< handle_t +// > format. +// Returns: a vector of haplotypes of format string (which is the concatenated sequences +// in the handles). 
+vector format_handle_haplotypes_to_strings( + const GBWTGraph &haploGraph, + const vector> &haplotype_handle_vectors) { + cerr << "format_handle_haplotypes_to_strings" << endl; + vector haplotype_strings; + for (vector haplotype_handles : haplotype_handle_vectors) { + string hap; + for (handle_t &handle : haplotype_handles) { + hap += haploGraph.get_sequence(handle); } + haplotype_strings.push_back(hap); } - vector path_strings; - // get just the strings from the unordered_map > paths object: - for (auto path : paths) { - string path_string; - for (auto handle : path.second) { - path_string += graph.get_sequence(graph.get_occurrence(handle)); - } - path_strings.push_back(path_string); - } - return path_strings; + return haplotype_strings; } -VG align_haplotypes(const vector& source_to_sink_haplotypes){ +// TODO: eventually change to deal with haplotypes that start/end in middle of snarl. +// Aligns haplotypes to create a new graph using MSAConverter's seqan converter. +// Assumes that each haplotype stretches from source to sink. +// Arguments: +// source_to_sink_haplotypes: a vector of haplotypes in string format (concat of +// handle sequences). +// Returns: +// VG object representing the newly realigned snarl. 
+VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotypes) { + cerr << "align_source_to_sink_haplotypes" << endl; seqan::Align align; // create multiple_sequence_alignment object - + seqan::resize(rows(align), source_to_sink_haplotypes.size()); - for (int i = 0; i < source_to_sink_haplotypes.size(); ++i){ + for (int i = 0; i < source_to_sink_haplotypes.size(); ++i) { assignSource(row(align, i), source_to_sink_haplotypes[i].c_str()); } @@ -552,106 +487,144 @@ VG align_haplotypes(const vector& source_to_sink_haplotypes){ VG snarl = myMSAConverter.make_graph(); snarl.clear_paths(); + // TODO: find better way to improve disamiguation of beginning/ending regions of nodes + // TODO: than by adding leading/trailing AAA seq (essentially a special + // character). + pair, vector> source_and_sink = + debug_get_sources_and_sinks(snarl); + + // Replace source with a handle that has the leading AAA seq removed. + handle_t source = source_and_sink.first.back(); + string source_seq = snarl.get_sequence(source); + id_t source_id = snarl.get_id(source); + handle_t new_source = snarl.create_handle(source_seq.substr(8, source_seq.size())); + snarl.follow_edges(source, false, [&](const handle_t &handle) { + snarl.create_edge(new_source, handle); + }); + snarl.destroy_handle(source); + + handle_t sink = source_and_sink.second.back(); + string sink_seq = snarl.get_sequence(sink); + id_t sink_id = snarl.get_id(sink); + handle_t new_sink = snarl.create_handle(sink_seq.substr(0, (sink_seq.size() - 8))); + snarl.follow_edges( + sink, true, [&](const handle_t &handle) { snarl.create_edge(handle, new_sink); }); + snarl.destroy_handle(sink); // snarl.serialize_to_ostream(cerr); return snarl; } -// ------------------------------ DEBUG CODE BELOW: ------------------------------------------ - -vector debug_graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ - SubHandleGraph snarl = debug_extract_subgraph(graph, start_id, end_id); - - 
unordered_map> sequences; - vector sinks; - unordered_map count; - count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl - sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl - - // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. - snarl.for_each_handle([&](const handle_t& handle) { - bool is_source = true, is_sink = true; - snarl.follow_edges(handle, true, [&](const handle_t& prev) { - is_source = false; - return false; - }); - snarl.follow_edges(handle, false, [&](const handle_t& next) { - is_sink = false; - return false; - }); - - // base case for dynamic programming - if (is_source) { - count[handle] = 1; - sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? - } - if (is_sink) { - sinks.emplace_back(handle); +// Finds all embedded paths that either start or end in a snarl (or both) defined by +// source_id, sink_id. +// returns a vector of the embedded paths, where each entry in the vector is defined +// by the pair of step_handles closest to the beginning and end of the path. If the +// path is fully contained within the snarl, these step_handles will the be the +// leftmost and rightmost handles in the path. +// Arguments: +// graph: a pathhandlegraph containing the snarl with embedded paths. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a vector containing all the embedded paths in the snarl, in pair< step_handle_t, +// step_handle_t > > format. Pair.first is the first step in the path's range of +// interest, and pair.second is the step *after* the last step in the path's range of +// interest (can be the null step at end of path). 
+vector> +extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, + const id_t &sink_id) { + cerr << "extract_embedded_paths_in_snarl" << endl; + + // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't + // extend the path to a point beyond the source or sink. + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + // key is path_handle, value is a step in that path from which to extend. + unordered_map paths_found; + + // look for handles with paths we haven't touched yet. + snarl.for_each_handle([&](const handle_t &handle) { + vector steps = graph.steps_of_handle(handle); + // do any of these steps belong to a path not in paths_found? + for (step_handle_t &step : steps) { + path_handle_t path = graph.get_path_handle_of_step(step); + // If it's a step along a new path, save the first step to that path we find. + // In addtion, if there are multiple steps found in the path, (The avoidance + // of source and sink here is to ensure that we can properly check to see if + // we've reached the end of an embedded path walking in any arbitrary + // direction (i.e. source towards sink or sink towards source). + if (paths_found.find(path) == paths_found.end() || + graph.get_id(graph.get_handle_of_step(paths_found[path])) == source_id || + graph.get_id(graph.get_handle_of_step(paths_found[path])) == sink_id) { + // then we need to mark it as found and save the step. + paths_found[path] = step; + } } }); - - // count walks by dynamic programming - bool overflowed = false; - for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { - size_t count_here = count[handle]; - vector seqs_here = sequences[handle]; + /// for each step_handle_t corresponding to a unique path, we want to get the steps + /// closest to both the end and beginning step that still remains in the snarl. + // TODO: Note copy paste of code here. 
In python I'd do "for fxn in [fxn1, fxn2]:", + // TODO so that I could iterate over the fxn. That sounds template-messy in C++ + // tho'. Should I? + vector> paths_in_snarl; + for (auto &it : paths_found) { + step_handle_t step = it.second; + // path_in_snarl describes the start and end steps in the path, + // as constrained by the snarl. + pair path_in_snarl; + + // Look for the step closest to the beginning of the path, as constrained by the + // snarl. + step_handle_t begin_in_snarl_step = step; + id_t begin_in_snarl_id = + graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + + while ((begin_in_snarl_id != source_id) && (begin_in_snarl_id != sink_id) && + graph.has_previous_step(begin_in_snarl_step)) { + begin_in_snarl_step = graph.get_previous_step(begin_in_snarl_step); + begin_in_snarl_id = + graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + } + path_in_snarl.first = begin_in_snarl_step; - snarl.follow_edges(handle, false, [&](const handle_t& next) { - - size_t& count_next = count[next]; - string seq_next = snarl.get_sequence(next); - - if (numeric_limits::max() - count_here < count_next) { - overflowed = true; - } + // Look for the step closest to the end of the path, as constrained by the snarl. + step_handle_t end_in_snarl_step = step; + id_t end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); - else { - count_next += count_here; - // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ - for (string seq : seqs_here){ - sequences[next].push_back(seq + seq_next); - } - // cerr << "next_seqs: "; - // for (string seq : sequences[next]){ - // cerr << seq << endl; - // } - } - }); - ///TODO: figure out how to deal with overflow. - // if (overflowed) { - // return numeric_limits::max(); - // } - } - - // total up the walks at the sinks - size_t total_count = 0; - for (handle_t& sink : sinks) { - total_count += count[sink]; - } - - // all the sequences at the sinks will be all the sequences in the snarl. 
- vector walks; - for (handle_t& sink : sinks) { - for (string seq : sequences[sink]){ - walks.push_back(seq); + while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and + graph.has_next_step(end_in_snarl_step)) { + end_in_snarl_step = graph.get_next_step(end_in_snarl_step); + end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); } + // Note: when adding the end step, path notation convention requires that we add + // the null step at the end of the path (or the next arbitrary step, in the case + // of a path that extends beyond our snarl.) + // TODO: do we want the next arbitrary step in that latter case? + path_in_snarl.second = graph.get_next_step(end_in_snarl_step); + + paths_in_snarl.push_back(path_in_snarl); } - return walks; + return paths_in_snarl; } -// given a start and end node id, construct an extract subgraph between the two nodes (inclusive). -// TODO: change the arguments to handles, which contain orientation within themselves. -// That way, iteration to extract the subgraph will have direction contained within themselves. -// This may actually end up looking like simply parsing an input text file with the handles -// described from the find_snarl output. -SubHandleGraph debug_extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ +// TODO: change the arguments to handles, which contain orientation within themselves. +// Given a start and end node id, construct an extract subgraph between the two nodes +// (inclusive). Arguments: +// graph: a pathhandlegraph containing the snarl with embedded paths. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a SubHandleGraph containing only the handles in graph that are between start_id +// and sink_id. 
+SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, + const id_t &sink_id) { + cerr << "extract_subgraph" << endl; /// make a subgraph containing only nodes of interest. (e.g. a snarl) // make empty subgraph SubHandleGraph subgraph = SubHandleGraph(&graph); - unordered_set visited; // to avoid counting the same node twice. + unordered_set visited; // to avoid counting the same node twice. unordered_set to_visit; // nodes found that belong in the subgraph. // TODO: how to ensure that "to the right" of start_handle is the correct direction? @@ -661,7 +634,7 @@ SubHandleGraph debug_extract_subgraph(MutablePathDeletableHandleGraph& graph, co visited.insert(graph.get_id(start_handle)); // look only to the right of start_handle - graph.follow_edges(start_handle, false, [&](const handle_t& handle){ + graph.follow_edges(start_handle, false, [&](const handle_t &handle) { // mark the nodes to come as to_visit if (visited.find(graph.get_id(handle)) == visited.end()) { to_visit.insert(graph.get_id(handle)); @@ -681,17 +654,17 @@ SubHandleGraph debug_extract_subgraph(MutablePathDeletableHandleGraph& graph, co subgraph.add_handle(cur_handle); - if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! + if (graph.get_id(cur_handle) != sink_id) { // don't iterate past end node! 
// look for all nodes connected to cur_handle that need to be added // looking to the left, - graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ + graph.follow_edges(cur_handle, true, [&](const handle_t &handle) { // mark the nodes to come as to_visit if (visited.find(graph.get_id(handle)) == visited.end()) { to_visit.insert(graph.get_id(handle)); } }); // looking to the right, - graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ + graph.follow_edges(cur_handle, false, [&](const handle_t &handle) { // mark the nodes to come as to_visit if (visited.find(graph.get_id(handle)) == visited.end()) { to_visit.insert(graph.get_id(handle)); @@ -702,59 +675,511 @@ SubHandleGraph debug_extract_subgraph(MutablePathDeletableHandleGraph& graph, co return subgraph; } +// Integrates the snarl into the graph, replacing the snarl occupying the space between +// source_id and sink_id. +// In the process, transfers any embedded paths traversing the old snarl into the new +// snarl. +// Arguments: +// graph: the graph in which we want to insert the snarl. +// to_insert_snarl: a *separate* handle_graph from graph, often generated from +// MSAconverter. embedded_paths: a vector of paths, where each is a pair. +// pair.first is the first step_handle of interest in the +// old_embedded_path, and pair.second is the step_handle *after* +// the last step_handle of interest in the old_embedded_path (can +// be the null step at the end of the path.) +// source_id: the source of the old (to be replaced) snarl in graph +// sink_id: the sink of the old (to be replaced) snarl in graph. +// Return: None. +// TODO: Note: How to ensure that step_handle_t's walk along the snarl in the same +// TODO: orientation as we expect? i.e. that they don't move backward? I think +// TODO: we want match_orientation to be = true, but this may cause problems +// TODO: in some cases given the way we currently construct handles (fixed when we +// TODO: create snarl-scanning interface). 
+// TODO: It may also be that we *don't want match_orientation to be true, +// TODO: if we're tracking a path that loops backward in the snarl. Hmm... Will think +// about this. +void integrate_snarl(MutablePathDeletableHandleGraph &graph, + const HandleGraph &to_insert_snarl, + const vector> embedded_paths, + const id_t &source_id, const id_t &sink_id) { + cerr << "integrate_snarl" << endl; + // Get old graph snarl + SubHandleGraph old_snarl = extract_subgraph(graph, source_id, sink_id); + + // TODO: test_code: Check to make sure that newly made snarl has only one start and + // end. + // TODO: (shouldn't be necessary once we've implemented alignment with + // leading/trailing special chars.) Identify old and new snarl start and sink + pair, vector> to_insert_snarl_defining_handles = + debug_get_sources_and_sinks(to_insert_snarl); + + if (to_insert_snarl_defining_handles.first.size() > 1 || + to_insert_snarl_defining_handles.second.size() > 1) { + cerr << "ERROR: newly made snarl with more than one start or end. # of starts: " + << to_insert_snarl_defining_handles.first.size() + << " # of ends: " << to_insert_snarl_defining_handles.second.size() << endl; + return; + } + + /// Replace start and end handles of old graph snarl with to_insert_snarl start and + /// end, and delete rest of old graph snarl: + + // add to_insert_snarl into graph without directly attaching the snarl to the graph + // (yet). + vector to_insert_snarl_topo_order = + algorithms::lazier_topological_order(&to_insert_snarl); + + // Construct a parallel new_snarl_topo_order to identify + // paralogous nodes between to_insert_snarl and the new snarl inserted in graph. + vector new_snarl_topo_order; + // integrate the handles from to_insert_snarl into the graph, and keep track of their + // identities by adding them to new_snarl_topo_order. 
+ for (handle_t to_insert_snarl_handle : to_insert_snarl_topo_order) { + handle_t graph_handle = + graph.create_handle(to_insert_snarl.get_sequence(to_insert_snarl_handle)); + new_snarl_topo_order.push_back(graph_handle); + } + + // Connect the newly made handles in the graph together the way they were connected in + // to_insert_snarl: + for (int i = 0; i < to_insert_snarl_topo_order.size(); i++) { + to_insert_snarl.follow_edges( + to_insert_snarl_topo_order[i], false, [&](const handle_t &snarl_handle) { + // get topo_index of nodes to be connected to graph start handle + auto it = find(to_insert_snarl_topo_order.begin(), + to_insert_snarl_topo_order.end(), snarl_handle); + int topo_index = it - to_insert_snarl_topo_order.begin(); + + // connect graph start handle + graph.create_edge(new_snarl_topo_order[i], + new_snarl_topo_order[topo_index]); + }); + } + + // Add the neighbors of the source and sink of the original snarl to the new_snarl's + // source and sink. + // source integration: + for (bool id : {source_id, sink_id}) { + graph.follow_edges(graph.get_handle(source_id), true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, new_snarl_topo_order[0]); + }); + graph.follow_edges( + graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { + graph.create_edge(new_snarl_topo_order.back(), next_handle); + }); + } + + // For each path of interest, move it onto the new_snarl. + for (auto path : embedded_paths) { + move_path_to_snarl(graph, path, new_snarl_topo_order); + } + + // Destroy the old snarl. + old_snarl.for_each_handle( + [&](const handle_t &handle) { graph.destroy_handle(handle); }); + + // Replace the source and sink handles with ones that have the original source/sink id + // (for compatibility with future iterations on neighboring top-level snarls using the + // same snarl manager. Couldn't replace it before b/c we needed the old handles to + // move the paths. 
+ handle_t new_source_handle = + graph.create_handle(graph.get_sequence(new_snarl_topo_order.front()), source_id); + handle_t new_sink_handle = + graph.create_handle(graph.get_sequence(new_snarl_topo_order.back()), sink_id); + + // move the source edges: + // TODO: note the copy/paste. Ask if there's a better way to do this (I totally could + // in Python!) + graph.follow_edges(new_snarl_topo_order.front(), true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, new_source_handle); + }); + graph.follow_edges(new_snarl_topo_order.front(), false, + [&](const handle_t &next_handle) { + graph.create_edge(new_source_handle, next_handle); + }); + + // move the sink edges: + graph.follow_edges(new_snarl_topo_order.back(), true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, new_sink_handle); + }); + graph.follow_edges(new_snarl_topo_order.back(), false, + [&](const handle_t &next_handle) { + graph.create_edge(new_sink_handle, next_handle); + }); + + // move the paths: + graph.for_each_step_on_handle(new_snarl_topo_order.front(), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_source_handle}); + }); + graph.for_each_step_on_handle(new_snarl_topo_order.back(), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_sink_handle}); + }); + + // delete the previously created source and sink: + for (handle_t handle : {new_snarl_topo_order.front(), new_snarl_topo_order.back()}) { + graph.destroy_handle(handle); + } } +// Moves a path from its original location in the graph to a new snarl, +// defined by a vector of interconnected handles. 
+// Arguments: graph: the graph containing the old_embedded_path and the handles in +// new_snarl_handles +// old_embedded_path: a pair, where +// pair.first is the first step_handle of interest in the +// old_embedded_path, and pair.second is the step_handle *after* +// the last step_handle of interest in the old_embedded_path (can +// be the null step at the end of the path.) +// new_snarl_handles: all the handles in the new snarl, inside the graph. +// Return: None. +void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, + const pair &old_embedded_path, + const vector &new_snarl_handles) { + cerr << endl << "move_path_to_snarl" << endl; + // cerr << "new_snarl_handles: " << endl; + // for (handle_t handle: new_snarl_handles){ + // cerr << graph.get_id(handle) << endl; + // } + cerr << "for path " + << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + << endl; + // get the sequence associated with the path + string path_seq; + step_handle_t cur_step = old_embedded_path.first; + cerr << " old_embedded path looks like: " + << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) << " " + << graph.get_id(graph.get_handle_of_step( + graph.get_previous_step(old_embedded_path.second))) + << endl; + cerr << "this is the original path handle ids: "; + while (cur_step != old_embedded_path.second) { + cerr << graph.get_id(graph.get_handle_of_step(cur_step)); + path_seq += graph.get_sequence(graph.get_handle_of_step(cur_step)); + cur_step = graph.get_next_step(cur_step); + } + cerr << endl; + cerr << "pathseq " << path_seq << endl; + + // for the given path, find every good possible starting handle in the new_snarl + // format of pair is < possible_path_handle_vec, + // starting_index_in_the_first_handle, current_index_in_path_seq> + vector, int, int>> possible_paths; + for (handle_t handle : new_snarl_handles) { + string handle_seq = graph.get_sequence(handle); + // starting index is where the path would begin in the handle, + 
// since it could begin in the middle of the handle. + vector starting_indices = + check_handle_as_start_of_path_seq(handle_seq, path_seq); + // if there is a starting index, + if (starting_indices.size() != 0) { + // if the starting_indices implies that the starting handle entirely contains + // the path_seq of interest: + if ((handle_seq.size() - starting_indices.back()) >= path_seq.size()) { + // then we've already found the full mapping location of the path! Move + // the path, end the method. + // TODO: move the path to the new vector of handles, splitting start and + // end handles if need be. + vector new_path{handle}; + graph.rewrite_segment(old_embedded_path.first, old_embedded_path.second, + new_path); + return; + } else { + cerr << "adding possible path at node " << graph.get_id(handle) << endl; + // add it as a possible_path. + vector possible_path_handle_vec{handle}; + for (auto starting_index : starting_indices) { + possible_paths.push_back( + make_tuple(possible_path_handle_vec, starting_index, + handle_seq.size() - starting_index)); + } + } + } + } + // for every possible path, extend it to determine if it really is the path we're + // looking for: + while (!possible_paths.empty()) { + // take a path off of possible_paths: + tuple, int, int> possible_path = possible_paths.back(); + possible_paths.pop_back(); + + // extend the path through all right-extending edges to see if any subsequent + // paths still + // satisfy the requirements for bein a possible_path: + bool no_path = graph.follow_edges( + get<0>(possible_path).back(), false, [&](const handle_t &next) { + string next_seq = graph.get_sequence(next); + int &cur_index_in_path = get<2>(possible_path); + + // if the next handle would be the ending handle for the path, + if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { + // check to see if the sequence in the handle is suitable for ending + // the path: + int compare_length = path_seq.size() - cur_index_in_path; + if 
(next_seq.compare(0, compare_length, path_seq, cur_index_in_path, + compare_length) == 0) { + // we've found the new path! Move path to the new sequence, and + // end the function. + get<0>(possible_path).push_back(next); + // TODO: move the path to the new vector of handles, splitting + // start and end handles if need be. + graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, + get<0>(possible_path)); + + // TODO: test_code: show when we find a path: + cerr << "found a full path named " + << graph.get_path_name(graph.get_path_handle_of_step( + old_embedded_path.first)) + << "! Here is the sequence of handles:" << endl; + for (handle_t handle : get<0>(possible_path)) { + cerr << graph.get_id(handle) << ": " + << graph.get_sequence(handle) << " " << endl; + } + return false; + } + } + // see if the next handle would be the continuation of the path, but not + // the end, + else { + // check to see if the sequence in the handle is suitable for + // extending the path: + int compare_length = next_seq.size(); + if (next_seq.compare(0, compare_length, path_seq, cur_index_in_path, + compare_length) == 0) { + // extend the path + get<0>(possible_path).push_back(next); + + // update the current index in path_seq. + get<2>(possible_path) += next_seq.size(); + + // place back into possible_paths + possible_paths.push_back(possible_path); + } + } + // continue to iterate through follow_edges. + return true; + }); + // if we've found a complete path in the above follow_edges, then we've already + // moved the path, and we're done. + if (!no_path) { + return; + } + } + // if we failed to find a path, show an error message. + // TODO: make this better! Throw an exception? + cerr << "Warning! Didn't find a corresponding path of name " + << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + << " from the old snarl in the newly aligned snarl." 
<< endl + << endl; + cerr << "Here's the sequence of the path: " << path_seq << endl + << "Here's the start and end node ids of the path: " + << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) << " " + << graph.get_id(graph.get_handle_of_step(old_embedded_path.second)) << endl + << endl; +} -/* -Misc. //todo's: - do I need to fix the fact that find_haplotypes_not_at_source runs forever when given - a non-snarl? +// Determines whether some subsequence in a handle satisfies the condition of being the +// beginning of a path. +// If the path_seq is longer than the handle_seq, only checks subsequences that reach +// from the beginning/middle of the handle_seq to the end. If path_seq is shorter +// than handle_seq, checks for any substring of length path_seq within the +// handle_seq, as well as substrings smaller than length path_seq that extend beyond +// the current handle. +// Arguments: +// handle_seq: the sequence in the handle we're trying to identify as a +// start_of_path_seq. path_seq: the sequence in the path we're trying to find +// starting points for in handle_seq +// Return: a vector of all potential starting index of the subsequence in the handle_seq. +vector check_handle_as_start_of_path_seq(const string &handle_seq, + const string &path_seq) { + vector possible_start_indices; + cerr << "check_handle_as_start_of_path_seq" << endl; + // If the handle_seq.size <= path_seq.size, look for subsequences reaching from + // beginning/middle of handle_seq to the end - where path_seq may run off the end + // of this handle to the next in the snarl. + if (handle_seq.size() <= path_seq.size()) { + // iterate through all possible starting positions in the handle_seq. 
+ for (int handle_start_i = 0; handle_start_i < handle_seq.size(); + handle_start_i++) { + int subseq_size = handle_seq.size() - handle_start_i; + // The path_seq subsequence of interest is from 0 to subseq_size; + // The handle_seq subsequence of interest starts at handle_start_i + // and ends at the end of the handle_seq (len subseq_size). + // if compare returns 0, the substring matches. + if (path_seq.compare(0, subseq_size, handle_seq, handle_start_i, + subseq_size) == 0) { + possible_start_indices.push_back(handle_start_i); + } + } + } + // if handle_seq.size > path_seq.size, look for any subsequence within handle_seq of + // path_seq.size, as well as any subsequence smaller than path_seq reaching from + // middle of handle_seq to the end of handle_seq. + else { + // first, search through all handle_seq for any comparable subsequence of + // path_seq.size. Note: only differences between this for loop and above for loop + // is that handle_start_i stops at (<= path_seq.size() - handle_seq.size()), and + // subseq.size() = path_seq.size() + for (int handle_start_i = 0; + handle_start_i < (handle_seq.size() - path_seq.size()); handle_start_i++) { + int subseq_size = path_seq.size(); + // The path_seq subsequence of interest is from 0 to subseq_size; + // The handle_seq subsequence of interest starts at handle_start_i + // and ends at the end of the handle_seq (len subseq_size). + // if compare returns 0, the substring matches. + if (path_seq.compare(0, subseq_size, handle_seq, handle_start_i, + subseq_size) == 0) { + possible_start_indices.push_back(handle_start_i); + } + } + // second, search through the last few bases of handle_seq for the beginning of + // path_seq. 
Note: nearly identical for loop to the one in "if (handle_seq.size() + // <= path_seq.size())" + for (int handle_start_i = (handle_seq.size() - path_seq.size() + 1); + handle_start_i < handle_seq.size(); handle_start_i++) { + int subseq_size = handle_seq.size() - handle_start_i; + // The path_seq subsequence of interest is from 0 to subseq_size; + // The handle_seq subsequence of interest starts at handle_start_i + // and ends at the end of the handle_seq (len subseq_size). + // if compare returns 0, the substring matches. + if (path_seq.compare(0, subseq_size, handle_seq, handle_start_i, + subseq_size) == 0) { + possible_start_indices.push_back(handle_start_i); + } + } + } + // Note: if we passed through the above check without returning anything, then there + // isn't any satisfactory subsequence. + return possible_start_indices; +} - TODO: make it so that gbwt file is customized by user rather than hardcoded. +// ------------------------------ DEBUG CODE BELOW: +// ------------------------------------------ - TODO: make the demo_0 argument into a better name. +// Returns pair where pair.first is a vector of all sources of the given graph and +// path.second is all the sinks of the given graph. If graph is a subhandlegraph of a +// snarl, there should only be one source and sink each. +pair, vector> +debug_get_sources_and_sinks(const HandleGraph &graph) { + cerr << "debug_get_source_and_sinks" << endl; + vector sink; + vector source; - TODO: make it so that you pass the gbwt file directory to a one-liner function that - TODO: generates gbwt graph, extracts haps, aligns haps, and reintegrates haps. - TODO: (eventually will do it for every snarl in the given graph). 
-*/ + // identify sources and sinks + graph.for_each_handle([&](const handle_t &handle) { + bool is_source = true, is_sink = true; + graph.follow_edges(handle, true, [&](const handle_t &prev) { + is_source = false; + return false; + }); + graph.follow_edges(handle, false, [&](const handle_t &next) { + is_sink = false; + return false; + }); + // base case for dynamic programming + if (is_source) { + source.push_back(handle); + } + if (is_sink) { + sink.emplace_back(handle); + } + }); + return pair, vector>(source, sink); +} +// Runs through the whole snarl and generates all possible strings representing walks from +// source to sink. Generates a combinatorial number of possible paths with splits in the +// snarl. +vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, + id_t start_id, id_t sink_id) { + cerr << "debug_graph_to_strings" << endl; + SubHandleGraph snarl = extract_subgraph(graph, start_id, sink_id); + unordered_map> sequences; + vector sinks; + unordered_map count; + count.reserve(snarl.get_node_count()); // resize count to contain enough buckets for + // size of snarl + sequences.reserve(snarl.get_node_count()); // resize sequences to contain enough + // buckets for size of snarl + + // identify sources and sinks //TODO: once we've established that this fxn works, we + // can just use start_id and sink_id. + snarl.for_each_handle([&](const handle_t &handle) { + bool is_source = true, is_sink = true; + snarl.follow_edges(handle, true, [&](const handle_t &prev) { + is_source = false; + return false; + }); + snarl.follow_edges(handle, false, [&](const handle_t &next) { + is_sink = false; + return false; + }); + // base case for dynamic programming + if (is_source) { + count[handle] = 1; + sequences[handle].push_back( + snarl.get_sequence(handle)); // TODO: presented in the handle's local + // forward orientation. An issue? 
+ } + if (is_sink) { + sinks.emplace_back(handle); + } + }); -/// JUNK: -//TODO: fix the clean_snarl_from_haplotypes fxn to properly combine partial and full alignments. -//TODO: make sure that I'm inserting all reference haplotypes in the spot that I wantd -//TODO: (Now that I've converted depth_first fxn return value to a pair.) -// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. -// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ -// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings -// //representing all possible walks through the snarl: -// vg::handle_t source_handle = graph.get_handle(source_id); -// vg::handle_t sink_handle = graph.get_handle(sink_id); + // count walks by dynamic programming + bool overflowed = false; + for (const handle_t &handle : algorithms::lazier_topological_order(&snarl)) { + size_t count_here = count[handle]; + vector seqs_here = sequences[handle]; -// vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); -// cerr << "finished depth_first, now on to reference." << endl; -// vector reference = get_paths(graph, source_handle, sink_handle); + snarl.follow_edges(handle, false, [&](const handle_t &next) { + size_t &count_next = count[next]; + string seq_next = snarl.get_sequence(next); -// haplotypes.insert(end(haplotypes), begin(reference), end(reference)); - -// //Make a new snarl from walks: -// VG new_snarl = strings_to_graph(haplotypes); + if (numeric_limits::max() - count_here < count_next) { + overflowed = true; + } -// integrate_snarl(graph, new_snarl, source_id, sink_id); - -// } + else { + count_next += count_here; + for (string seq : seqs_here) { + sequences[next].push_back(seq + seq_next); + } + } + }); + /// TODO: figure out how to deal with overflow. 
+ // if (overflowed) { + // return numeric_limits::max(); + // } + } + // total up the walks at the sinks + size_t total_count = 0; + for (handle_t &sink : sinks) { + total_count += count[sink]; + } -//Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. -//However, is modified to include all haplotypes inside the source/sink handles, -//even ones that don't include the source or sink handles. + // all the sequences at the sinks will be all the sequences in the snarl. + vector walks; + for (handle_t &sink : sinks) { + for (string seq : sequences[sink]) { + walks.push_back(seq); + } + } + return walks; +} +} // namespace vg diff --git a/src/algorithms/0_draft_haplotype_realignment.hpp b/src/algorithms/0_draft_haplotype_realignment.hpp index 8db1514f97c..932d663bcf9 100644 --- a/src/algorithms/0_draft_haplotype_realignment.hpp +++ b/src/algorithms/0_draft_haplotype_realignment.hpp @@ -2,27 +2,152 @@ Robin Rounthwaite Find function call in ./subcommand/main.cpp */ -#include -#include "../vg.hpp" +#include "../gbwt_helper.hpp" #include "../handle.hpp" #include "../subgraph.hpp" +#include "../vg.hpp" #include "count_walks.hpp" -#include "../gbwt_helper.hpp" +#include + +/* TODO for improving haplotype_realignment. +Tomorrow: +* scale code upwards so that you can run code on every snarl in given graph. +* also add requirement that haps entering snarl = haps exiting snarl. +TODO: align haplotypes_not_at_source once we have a solution for alignments that insert +TODO: the haplotype in a specified location +TODO: (use more unique marker signals to identify where in other strings the +TODO: middle-haplotype should align?) + +TODO: consider splitting handles where embedded paths begin/end in the middle of a handle. +TODO: (Note: would need to dynamically change other paths containing that handle. :-/) +TODO: Or simply split the handles of interest and then realign the paths - expensive. 
+TODO: Or insert *yet another* marker char to id where embedded paths begin/end, so its +TODO: easily find where to split the handles afterwards. AND! it makes moving the +TODO: paths less expensive. +TODO: (fewer spots to check alignment in the snarl). If we have unique markers for +TODO: each path, then +TODO: it becomes O(N) time, instead of ~O(N*M*n) (N: number of bases in snarl; M: +TODO: number of bases in path; +TODO: n: number of potential starting places in the snarl (note: slightly less +TODO: expensive since n is +TODO: divided up among the M paths).) +TODO: this would also addres the possibility of an embedded path being moved to an +TODO: alternative location +TODO: when it overlaps a repetitive sequence. (previous thought, tho' above one is +TODO: better): do I Need +TODO: to account for this with a sense of "bases distant from source"? +TODO: make it so that gbwt file is customized by user rather than hardcoded. + +TODO: make it so that you pass the gbwt file directory to a one-liner function +TODO: (ran in normalize_main) that generates gbwt graph, extracts haps, +TODO: aligns haps, and reintegrates haps. (eventually will do it for every +TODO: snarl in the given graph). 
+ +*/ namespace vg { - void align_haplotypes(const GBWTGraph& haploGraph, const pair< vector< vector >, vector< vector > >& haplotypes); +void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, ifstream &snarl_stream); + +void disambiguate_snarl(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, const id_t &source_id, + const id_t &sink_id); + +pair>, vector>> +extract_gbwt_haplotypes(const GBWTGraph &graph, const id_t &source_id, + const id_t &sink_id); + +vector> +find_haplotypes_not_at_source(const GBWTGraph &haploGraph, + unordered_set &touched_handles, + const id_t &sink_id); + +vector format_handle_haplotypes_to_strings( + const GBWTGraph &haploGraph, + const vector> &haplotype_handle_vectors); - pair< vector< vector >, vector< vector > > extract_haplotypes(const GBWTGraph& graph, const id_t& source_id, const id_t& sink_id); +VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotypes); - vector> find_haplotypes_not_at_source(const GBWTGraph& haploGraph, unordered_set& touched_handles, const id_t& sink_id); +vector> +extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, + const id_t &sink_id); - vector< string > format_handle_haplotypes_to_strings(const GBWTGraph& haploGraph, const vector< vector< handle_t > >& haplotype_handle_vectors); +SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, + const id_t &end_id); - vector get_embedded_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle); +void integrate_snarl(MutablePathDeletableHandleGraph &graph, const HandleGraph &new_snarl, + const vector> embedded_paths, + const id_t &source_id, const id_t &sink_id); - VG align_haplotypes(const vector& walks); +void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, + const pair &old_embedded_path, + const vector &new_snarl_handles); - vector 
debug_graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id); +vector check_handle_as_start_of_path_seq(const string &handle_seq, + const string &path_seq); - SubHandleGraph debug_extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); +// -------------------------------- DEBUG CODE BELOW: ------------------------------------ + +pair, vector> +debug_get_sources_and_sinks(const HandleGraph &graph); + +vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, + id_t start_id, id_t end_id); + +vector debug_get_embedded_paths_from_source_to_sink(const PathHandleGraph &graph, + const handle_t &source_handle, + const handle_t &sink_handle); +} // namespace vg + +/* +Deleted stuff: + +void jordan_bug(MutablePathDeletableHandleGraph& graph){ + + // example with one node: + handle_t example = graph.get_handle(23448); + handle_t replacement = graph.create_handle("GATTACA", 1); + + // move the source edges: + //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could in Python!) + graph.follow_edges(example, true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, replacement); + }); + graph.follow_edges(example, false, + [&](const handle_t &next_handle) { + graph.create_edge(replacement, next_handle); + }); + + // move the paths: + graph.for_each_step_on_handle(example, [&](step_handle_t step) + { + graph.rewrite_segment(step, graph.get_next_step(step), vector{replacement}); + }); + + // example with two nodes: + handle_t example_1 = graph.get_handle(23450); + handle_t replacement_1 = graph.create_handle("GATTACA", 2); + handle_t replacement_2 = graph.create_handle("GATTACA", 3); + graph.create_edge(replacement_1, replacement_2); + + // move the source edges: + //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could in Python!) 
+ graph.follow_edges(example_1, true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, replacement_1); + }); + graph.follow_edges(example_1, false, + [&](const handle_t &next_handle) { + graph.create_edge(replacement_2, next_handle); + }); + + // move the paths: + graph.for_each_step_on_handle(example_1, [&](step_handle_t step) + { + graph.rewrite_segment(step, step, vector{replacement_1, replacement_2}); + }); } + */ + + diff --git a/src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp b/src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp deleted file mode 100644 index 12893191566..00000000000 --- a/src/algorithms/0_old_drafts/0_demo_final_0_(before_code_clean_and_includes_non_path_oriented_approach).cpp +++ /dev/null @@ -1,1018 +0,0 @@ -/* -Misc. //todo's: - do I need to fix the fact that find_haplotypes_not_at_source runs forever when given - a non-snarl? -*/ - -#pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? -#include "0_demo_final_0.hpp" -#include -#include "../vg.hpp" -#include "../handle.hpp" -#include "../subgraph.hpp" -#include "count_walks.hpp" -#include -#include -#include -#include "../msa_converter.hpp" -#include "../snarls.hpp" -#include "../gbwt_helper.hpp" -#include "../stream/vpkg.hpp" -#include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? - -namespace vg { - -//TODO: fix the clean_snarl_from_haplotypes fxn to properly combine partial and full alignments. -//TODO: make sure that I'm inserting all reference haplotypes in the spot that I wantd -//TODO: (Now that I've converted depth_first fxn return value to a pair.) -// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. 
-// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ -// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings -// //representing all possible walks through the snarl: -// vg::handle_t source_handle = graph.get_handle(source_id); -// vg::handle_t sink_handle = graph.get_handle(sink_id); - -// vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); -// cerr << "finished depth_first, now on to reference." << endl; -// vector reference = get_paths(graph, source_handle, sink_handle); - -// haplotypes.insert(end(haplotypes), begin(reference), end(reference)); - -// //Make a new snarl from walks: -// VG new_snarl = strings_to_graph(haplotypes); - -// integrate_snarl(graph, new_snarl, source_id, sink_id); - -// } - -// Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. -// Only walks along embedded paths. Returns a map with string keys and values of vectors of handles, -// where each vector of handles represents one path from source to sink. -// alternative function return: -//unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ -vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ - unordered_map > paths; - unordered_map multiple_occurrences; - - // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. - graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { - // Each occurrence represents an embedded path - // (note - in the case of a looped path, there will be multiple occurrences for one path.) - // For each path represented by an occurrence, we need to walk along the path until we reach - // the sink node. That series of handles represents the the sequence of the path. 
- - string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); - if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) - - //record this in multiple_occurrences, and get the number of times we've seen this occurrence. - int occ_num; - if (multiple_occurrences.find(path) == multiple_occurrences.end()){ - occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. - multiple_occurrences[path] = occ_num; - } else { - occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. - } - - //record the other occurrences with an added identifier to differentiate between paths. - paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); - } - else{ // this is the first time we've encountered this occurrence. - paths[path].emplace_back(occurrence); - } - }); - - //Now, for every occurrence, walk along the path until we reach the sink. - for (pair > path : paths){ - // cerr << "my name" << path.first << endl; - // cerr << "my occurences:" << endl; - // for (auto occ : path.second) { - // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; - // } - // cerr << "testing get_next_occurrence:" << endl; - // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); - // cerr << cur_id; - - // cur_occurence is the current handle while walking along the path - occurrence_handle_t cur_occurrence = path.second.back(); - id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); - // store the path in paths, in the occurrence_handle_t vector. 
- while (cur_id != graph.get_id(sink_handle)){ - paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); - // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); - cur_occurrence = paths[path.first].back(); - // cur_occurrence = path.second.back(); - cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); - cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; - } - path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); - cerr << path.second.size() << endl; - for (auto handle : path.second) { - cerr << graph.get_sequence(graph.get_occurrence(handle)); - } - } - cerr << "havin' issues here?" << endl; - for (auto path : paths) { - for (auto handle : path.second) { - cerr << graph.get_sequence(graph.get_occurrence(handle)); - } - } - // Resolve multiple_occurrences by identifying which entry in paths - // (of those part of the same path) is longest - that will - // represent the full breadth of the path through the snarl. 
- for (pair element : multiple_occurrences){ - // A vector of all the path entries in paths: - vector same_path_names = {element.first}; - - int max_len = paths[element.first].size(); - string max_path = element.first; - - for (int occ_num : range_vector(element.second)){ - occ_num++; // we actually need range_vector[1, ..., end()] - string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; - int cur_len = paths[cur_path].size(); - same_path_names.push_back(cur_path); - - if (cur_len > max_len){ - max_len = cur_len; - max_path = cur_path; - } - } - - // get rid of the smaller fragments of path: - for (string name : same_path_names) { - if (name != max_path){ - paths.erase(name); - } - } - } - vector path_strings; - // get just the strings from the unordered_map > paths object: - for (auto path : paths) { - string path_string; - for (auto handle : path.second) { - path_string += graph.get_sequence(graph.get_occurrence(handle)); - } - path_strings.push_back(path_string); - } - return path_strings; -} - -vector> find_haplotypes_not_at_source(const GBWTGraph haploGraph, unordered_set touched_handles, const id_t& sink_id){ - cerr << "finding haplotypes not at source!" << endl; - /// Search every handle in touched handles for haplotypes starting at that point. - // Any new haplotypes will be added to haplotype_queue. - vector, gbwt::SearchState>> haplotype_queue; - - // Fully extended haplotypes (or haplotypes extended to the snarl's sink) - // will be added to finished_haplotypes. - vector> finished_haplotypes; - - // In addition, we need to put the new handle into to_search, because a path may have - // started on the new handle (which means we need to start a searchstate there.) - unordered_set to_search; - - // We don't need to ever check the sink handle, since paths from the sink handle - // extend beyond snarl. 
- handle_t sink_handle = haploGraph.get_handle(sink_id); - touched_handles.erase(sink_handle); - - // Create nested function for making a new_search: - auto make_new_search = [&](handle_t handle) { - cerr << "lambda" << endl; - - // Are there any new threads starting at this handle? - gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(handle)); - if (new_search != gbwt::SearchState()){ //TODO: this is the "null" version of SearchState, right? - - // Then add them to haplotype_queue. - haploGraph.follow_paths(new_search, [&](const gbwt::SearchState& next_search) -> bool { - - handle_t next_handle = haploGraph.node_to_handle(next_search.node); - - /// check to make sure that the thread isn't already finished: - // if next_handle is the sink, or if this thread is only one handle long, - // then there isn't any useful string to extract from this. - if (next_handle != sink_handle || next_search == gbwt::SearchState()){ - // establish a new thread to walk along. - vector new_path; - new_path.push_back(handle); - new_path.push_back(next_handle); - - pair, gbwt::SearchState > mypair = make_pair(new_path, next_search); - - - // add the new path to haplotype_queue to be extended. - haplotype_queue.push_back(make_pair(new_path, next_search)); - - // if next_handle hasn't been checked for starting threads, add to to_search. - if (touched_handles.find(next_handle) == touched_handles.end()){ - to_search.emplace(next_handle); - } - } - return true; - }); - } - }; - - cerr << "1" << endl; - // Search every handle in touched handles for haplotypes starting at that point. - for (handle_t handle : touched_handles){ - cerr << "in not_at_source: " << haploGraph.get_sequence(handle) << endl; - make_new_search(handle); - } - - /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. - /// Then, check to see if there are any new threads on handles in to_search. 
- /// Extend those threads, and add any newly found handles to to_search, - /// then search for threads again in to_search again... repeat until to_search remains - /// emptied of new handles. - - // for tracking whether the haplotype thread is still extending: - bool still_extending; - cerr << "2" << endl; - while(!to_search.empty() && !haplotype_queue.empty()){ - while (!haplotype_queue.empty()){ - cerr << "extend haplotype_queue" << endl; - - // get a haplotype to extend out of haplotype_queue - a tuple of (handles_traversed_so_far, last_touched_SearchState) - pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); - haplotype_queue.pop_back(); - - // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. - vector next_searches; - haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { - next_searches.push_back(next_search); - return true; - }); - - for (gbwt::SearchState next_search: next_searches){ - handle_t next_handle = haploGraph.node_to_handle(next_search.node); - - // if next_search is empty, then we've fallen off the thread, - // and cur_haplotype can be placed in finished_haplotypes as is for this thread. - if (next_search == gbwt::SearchState()){ - - finished_haplotypes.push_back(cur_haplotype.first); - - } - // if next_search is on the sink_handle, - // then cur_haplotype.first + next_search goes to finished_haplotypes. - else if (haploGraph.get_id(next_handle) == sink_id){ - - // copy over the vector of cur_haplotype: - vector next_handle_vec(cur_haplotype.first); - //add next_handle - next_handle_vec.push_back(next_handle); - //place in finished_haplotypes - finished_haplotypes.push_back(next_handle_vec); - - // also, if next_handle hasn't been checked for new threads, add to to_search. 
- if (touched_handles.find(next_handle) != touched_handles.end()){ - to_search.emplace(next_handle); - } - - } - // otherwise, just place an extended cur_haplotype in haplotype_queue. - else { - - // copy over cur_haplotype: - pair< vector, gbwt::SearchState> cur_haplotype_copy = cur_haplotype; - //modify with next_handle/search - cur_haplotype_copy.first.push_back(next_handle); - cur_haplotype_copy.second = next_search; - // place back in haplotype_queue for further extension. - haplotype_queue.push_back(cur_haplotype_copy); - - // also, if next_handle hasn't been checked for new threads, add to to_search. - if (touched_handles.find(next_handle) != touched_handles.end()){ - to_search.emplace(next_handle); - } - - } - } - - - - } - // Then, make more new_searches from the handles in to_search. - for (handle_t handle : to_search){ - make_new_search(handle); // will add to haplotype_queue if there's any new_searches to be had. - } - to_search.clear(); - - } - return finished_haplotypes; -} - - - - - - -//TODO: does GBWTgraphs have names associated with haplotypes? -//TODO: If so, I should change return value to an unordered map with key haplotype name -//TODO: and value vector of all handles in haplotype (also, rename fxn). - -//Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. -//However, is modified to include all haplotypes inside the source/sink handles, -//even ones that don't include the source or sink handles. -//Returns: a vector of strings representing all paths reaching from source to sink in the snarl, -// and a vector of strings representing all other paths in the snarl (e.g. any that don't -// reach both source and sink in the graph.) -pair, vector> depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id){ - cerr << "depth first begins!" 
<< endl; - - - ///GBWT graph construction stuff that belongs in mod_main: - ifstream gbwt_stream; - //TODO: make it so that gbwt file is customized by user rather than hardcoded. - // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; - string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new_2.gbwt"; - gbwt_stream.open(gbwt_name); - - unique_ptr gbwt; - // Load the GBWT from its container - gbwt = stream::VPKG::load_one(gbwt_stream); - GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); -// ----------------------------------------------------------------------------------------- - /// Perform depth first search, where whenever the search reaches sink_handle, convert - /// vector of handles to string (should be equivalent to haplotype). - //TODO: somehow note/account for how recording of haplotype will be terminated the first time it touches the sink_handle - - //TODO: this function currently doesn't account for if it loops back. - - - - - - - - - // cerr << endl << "initial test" << endl << endl; - // handle_t test_handle = haploGraph.get_handle(23495); - - // cerr << "got the test handle?" << haploGraph.get_id(test_handle) << haploGraph.get_sequence(test_handle) << endl; - // cerr << "\n follow edges!" << endl; - // haploGraph.follow_edges(test_handle, true, [&](const handle_t next_handle) { - // cerr << "next handles: " << haploGraph.get_sequence(next_handle) << haploGraph.get_id(next_handle) << endl; - // }); - // cerr << "end follow edges!\n " << endl; - - // cerr << "\n follow paths!" 
<< endl; - // haploGraph.follow_paths(haploGraph.get_state(test_handle), [&](const gbwt::SearchState next_state) -> bool { - // handle_t next_handle = haploGraph.node_to_handle(next_state.node); - // cerr << "next handles: " << haploGraph.get_sequence(next_handle) << haploGraph.get_id(next_handle) << endl; - // return true; - // }); - // cerr << "end follow paths!\n " << endl; - - // cerr << "\n how many paths overlap test_state?" << endl; - // gbwt::SearchState test_state = haploGraph.get_state(test_handle); - // cerr << "state.size 23495" << test_state.size() << endl; - // handle_t test_handle_2 = haploGraph.get_handle(23494); - // gbwt::SearchState test_state_2 = haploGraph.get_state(test_handle_2); - // cerr << "state.size 23494" << test_state.size() << endl; - - // handle_t test_handle_3 = haploGraph.get_handle(23493); - // gbwt::SearchState test_state_3 = haploGraph.get_state(test_handle_3); - // cerr << "state.size 23493" << test_state.size() << endl; - - // cerr << "rightmost node. (not just 'node to the right'?" << haploGraph.get_sequence(haploGraph.node_to_handle(test_state.node)) << endl; - // cerr << "end test battery" << endl; - - - - - - //touched_handles contains all handles that have been touched by the depth_first_search, - //for later use in other_haplotypes_to_strings, which identifies paths that didn't stretch - //from source to sink in the snarl. - unordered_set touched_handles; - - //haplotype_queue contains all started exon_haplotypes not completed yet. - //Every time we encounter a branch in the paths, the next node down the path - //Is stored here, along with the vector of handles that represents the path up - //to the SearchState. - vector< pair< vector, gbwt::SearchState> > haplotype_queue; - - // source and sink handle for haploGraph: - handle_t source_handle = haploGraph.get_handle(source_id); - handle_t sink_handle = haploGraph.get_handle(sink_id); - - //place source in haplotype_queue. 
- vector source_handle_vec(1, source_handle); - gbwt::SearchState source_state = haploGraph.get_state(source_handle); - haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); - touched_handles.emplace(source_handle); - - //haplotypes contains all "finished" haplotypes - those that were either walked - //to their conclusion, or until they reached the sink. - vector< vector > source_to_sink_haplotype_handle_vecs; - vector< vector > source_without_sink_haplotype_handle_vecs; - - // for every partly-extracted thread, extend the thread until it either reaches - // the sink of the snarl or the end of the thread. - while (!haplotype_queue.empty()) { - - - - - - - - - - // cerr << "iteration! with haplotype_queue:" << endl; - - // for(auto hap : haplotype_queue) { - // cerr << "here's a hap" << endl; - // for(auto handle : hap.first){ - // cerr << haploGraph.get_sequence(handle) << " " << haploGraph.get_id(handle) << " "; - // } - // cerr << endl; - // } - - - - - - - - - - // get a haplotype out of haplotype_queue to extend - - // a tuple of (handles_traversed_so_far, last_touched_SearchState) - pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); - haplotype_queue.pop_back(); - - - - - - // cerr << "\n\n follow edges!" << endl; - // haploGraph.follow_edges(cur_haplotype.first.back(), false, [&](const handle_t next_handle) { - // cerr << "next handles: " << haploGraph.get_sequence(next_handle) << haploGraph.get_id(next_handle) << endl; - // }); - // cerr << "end follow edges!\n\n " << endl; - - - - // get all the subsequent search_states that immediately follow the searchstate from cur_haplotype. - vector next_searches; - haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState next_search) -> bool { - // cerr << "this node immediately follows cur_haplotypes current search_state." 
<< haploGraph.get_sequence(haploGraph.node_to_handle(next_search.node)) << haploGraph.get_id(haploGraph.node_to_handle(next_search.node)) << endl; - next_searches.push_back(next_search); - return true; - }); - - // if next_searches > 1, then we need to make multiple new haplotypes to be recorded in haplotype_queue - // or one of the finished haplotype_handle_vecs. - if (next_searches.size() > 1){ - - // for every next_search in next_searches, either create a new, extended cur_haplotype to push into haplotype queue, - // or place in the source_to_sink_haplotype_handle_vecs if haplotype extends to sink, - // or place in the source_without_sink_haplotype_handle_vecs if haplotype ends before reaching sink. - for (gbwt::SearchState next_search : next_searches){ - handle_t next_handle = haploGraph.node_to_handle(next_search.node); - - // copy over the vector of cur_haplotype: - vector next_handle_vec(cur_haplotype.first); - - // add the new handle to the vec: - next_handle_vec.push_back(next_handle); - - // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs - if (haploGraph.get_id(next_handle) == sink_id){ - cerr << "anpparently next handle of node " << haploGraph.get_id(cur_haplotype.first.back()) << " is sink" << endl; - source_to_sink_haplotype_handle_vecs.push_back(next_handle_vec); - - } else { // keep extending the haplotype! - - pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); - haplotype_queue.push_back(next_haplotype); - - } - - //next_handle will be touched. - touched_handles.emplace(next_handle); - } - - } // if next_searches is empty, the path has ended but not reached sink. - else if ( next_searches.empty() ) { - - cerr << "next_searches is empty" << endl; - - // We have reached the end of the path, but it doesn't reach the sink. - // we need to add cur_haplotype to source_without_sink_haplotype_handle_vecs. 
- source_without_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); - - } // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs - else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { - cerr << "next_searches is sink" << endl; - - // Then we need to add cur_haplotype + next_search to source_to_sink_haplotype_handle_vecs. - handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); - cur_haplotype.first.push_back(next_handle); - source_to_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); - - //touched next_search's handle - touched_handles.emplace(next_handle); - - } //else, there is just one next_search, and it's not the end of the path. - //just extend the search by adding (cur_haplotype + next_search to haplotype_queue. - else { - cerr << "normal extend" << endl; - - // get the next_handle from the one next_search. - handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); - - cerr << "this is next_handle" << haploGraph.get_id(next_handle) << endl; - - // modify cur_haplotype with next_handle and next_search. - cur_haplotype.first.push_back(next_handle); - cur_haplotype.second = next_searches.back(); // there's only one next_search in next_searches. - - // put cur_haplotype back in haplotype_queue. - haplotype_queue.push_back(cur_haplotype); - touched_handles.emplace(next_handle); - - } - - } - - for (auto handle : touched_handles){ - cerr << "\n\n\nin starting at source " << haploGraph.get_sequence(handle) << endl; - } - - //TODO: make following code into a separate function - the convert_handle_vec_to_strings, or something. 
- //Now, transform the each vector of handles in source_to_sink_haplotype_handle_vecs - //into a string, and return as a vector of strings - vector source_to_sink_haplotype_strings; - // for (vector vector_hap : source_to_sink_haplotype_handle_vecs){ - // string hap; - // for (handle_t& handle : vector_hap){ - // hap += haploGraph.get_sequence(handle); - // } - // source_to_sink_haplotype_strings.push_back(hap); - // } - - //Find any haplotypes starting from handles not starting at the source, but which - //still start somewhere inside the snarl. - vector> haplotypes_not_starting_at_source = find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); - - //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. - vector other_haplotype_strings; - for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ - string hap; - for (handle_t& handle : vector_hap){ - hap += haploGraph.get_sequence(handle); - } - other_haplotype_strings.push_back(hap); - } - - //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. - for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ - string hap; - for (handle_t& handle : vector_hap){ - hap += haploGraph.get_sequence(handle); - } - other_haplotype_strings.push_back(hap); - } - - return make_pair(source_to_sink_haplotype_strings, other_haplotype_strings); -} - - - - - - - - - - - - - - - - -//TODO: delete this function once I've decided I don't want it anymore. Should be replaced with (renamed) depth_first_haplotypes_to_strings. -// Pull out each haplotype passing through a snarl (defined by source_id and sink_id) as a string. 
-vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ - - ///stuff that will go in mod_main: - ifstream gbwt_stream; - string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; - gbwt_stream.open(gbwt_name); - - unique_ptr gbwt; - // Load the GBWT from its container - gbwt = stream::VPKG::load_one(gbwt_stream); - -// ----------------------------------------------------------------- - /// make subgraph for the snarl: - - // graph.for_each_handle([&] (const handle_t& handle)-> bool{ - // cerr << "test for graph "; - // cerr << graph.get_id(handle) << endl; - // return true; - // }); - - SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); - - // snarl.for_each_handle_impl([&] (const handle_t& handle)-> bool{ - // cerr << "test for snarl "; - // cerr << snarl.get_id(handle) << endl; - // return true; - // }); - // cerr << "before 1 \n"; - - // GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); //TODO: figure out how to prevent error msg here. - GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); - // cerr << "after 1 \n"; - - // cerr << "before \n"; - // haploGraph.for_each_handle([&] (const handle_t& handle)-> bool{ - // cerr << "test for haploGraph "; - // cerr << haploGraph.get_id(handle) << endl; - // return true; - // }); - // cerr << "after \n"; - - - //TODO:identify source and sinks for troubleshooting! 
- unordered_map> sequences; // will contain all haplotype walks through snarl - handle_t source_handle = haploGraph.get_handle(source_id); - sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); - - for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { - - vector seqs_here = sequences[handle]; - gbwt::SearchState cur_state = haploGraph.get_state(handle); - - // id_t cur_id = haploGraph.get_id(handle); - // cerr << "cur_id" << cur_id << endl; - - haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { - handle_t next_handle = GBWTGraph::node_to_handle(next_search.node); - - id_t next_id = haploGraph.get_id(next_handle); - cerr << "next_id" << next_id << endl; - - string next_seq = haploGraph.get_sequence(next_handle); - // transfer the sequences for the preceding handle to next_handle's sequences, - // plus the new handle's sequence. - for (string seq : seqs_here){ - sequences[next_handle].push_back(seq + next_seq); - } - return true; - - - }); - } - - // all the sequences at the sinks will be all the sequences in the snarl. - handle_t sink_handle = haploGraph.get_handle(sink_id); - return sequences[sink_handle]; - // vector testVec; - // return testVec; -} - -//Iterate over all snarls in a graph, and run clean_snarl on it. -void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ - SnarlManager* snarl_manager = new SnarlManager(snarl_stream); - -/* Use this code to count number of snarls in graph. 
-* int top_count = 0; -* for (const Snarl* snarl : snarl_manager->top_level_snarls()){ -* top_count++; -* } -* cerr << "number of top_level snarls in graph: " << top_count << endl; -* -* int general_count = 0; -* snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ -* general_count++; -* }); -* cerr << "number of total snarls in graph: " << general_count << endl; -*/ - - - vector snarl_roots = snarl_manager->top_level_snarls(); - for (auto roots : snarl_roots){ - clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); - } - - delete snarl_manager; - - -} - -// Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. -void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ - //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings - //representing all possible walks through the snarl: - vector walks = graph_to_strings(graph, start_id, end_id); - - //Make a new snarl from walks: - VG new_snarl = strings_to_graph(walks); - - integrate_snarl(graph, new_snarl, start_id, end_id); - -} - -// Given a larger graph and a (usually cleaned snarl) subgraph, integrate new_snarl into the graph at start_id and end_id. -void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id){ - //Get old graph snarl - SubHandleGraph graph_snarl = extract_subgraph(graph, start_id, end_id); - - //Identify old and new snarl start and sink - pair, vector> graph_snarl_defining_handles = get_sources_and_sinks(graph_snarl); - pair, vector> new_snarl_defining_handles = get_sources_and_sinks(new_snarl); - - //Check to make sure that newly made snarl has only one start and end. 
- if(new_snarl_defining_handles.first.size() > 1 || new_snarl_defining_handles.second.size() > 1){ - cerr << "newly made snarl with more than one start or end. # of starts: " << new_snarl_defining_handles.first.size() << " # of ends: " << new_snarl_defining_handles.second.size() << endl; - return; - } - //extract old and new snarl start and sink: - handle_t new_snarl_start = new_snarl_defining_handles.first[0]; - handle_t new_snarl_end = new_snarl_defining_handles.second[0]; - - handle_t graph_snarl_start = graph_snarl_defining_handles.first[0]; - handle_t graph_snarl_end = graph_snarl_defining_handles.second[0]; - - ///Replace start and end handles of old graph snarl with new_snarl start and end, and delete - ///rest of old graph snarl. - - //Get everything needed to replace graph start and sink. - string new_start_seq = new_snarl.get_sequence(new_snarl_start); - string new_end_seq = new_snarl.get_sequence(new_snarl_end); - id_t new_start_id = graph.get_id(graph_snarl_start); - id_t new_end_id = graph.get_id(graph_snarl_end); - vector left_of_start; - graph.follow_edges(graph_snarl_start, true, [&](const handle_t& handle){ - left_of_start.emplace_back(handle); - }); - vector right_of_end; - graph.follow_edges(graph_snarl_end, false, [&](const handle_t& handle){ - right_of_end.emplace_back(handle); - }); - - //Delete all handles in graph_snarl - graph_snarl.for_each_handle([&](const handle_t& handle){ - graph.destroy_handle(handle); - }, false); - - //Make start and end handles for snarl in graph: - handle_t new_start_handle = graph.create_handle(new_start_seq, new_start_id); - handle_t new_end_handle = graph.create_handle(new_end_seq, new_end_id); - - //Insert start and end handles: - for (handle_t handle : left_of_start) { - graph.create_edge(handle, new_start_handle); - } - for (handle_t handle : right_of_end) { - graph.create_edge(new_end_handle, handle); - } - - ///Reintegrate rest of new_snarl. - //topologically ordered new_snarl. 
As I progress through each node in topo_order, - //I can add all the nodes to the right of the snarl. The final node will be the - //end node, which, instead of adding as a new node to graph, I'll re-connect - //to the modified end_node, above. - vector new_snarl_topo_order = algorithms::lazier_topological_order(&new_snarl); - - //Construct a parallel graph_snarl_topo_order to identify - //paralogous nodes between new_snarl and graph. - vector graph_snarl_topo_order = {new_start_handle}; - - for (auto it = ++new_snarl_topo_order.begin(); it != --new_snarl_topo_order.end(); it++){ - //For every handle in new_snarl, make an (unconnected) handle in graph. - string handle_seq = new_snarl.get_sequence(*it); - handle_t graph_handle = graph.create_handle(handle_seq); - graph_snarl_topo_order.push_back(graph_handle); - } - - graph_snarl_topo_order.push_back(new_end_handle); - - //Connect the rest of the nodes: - for (int i = 0; i < new_snarl_topo_order.size(); i++){ - // cerr << new_snarl.get_id(new_snarl_topo_order[i]) << endl; - - new_snarl.follow_edges(new_snarl_topo_order[i], false, [&](const handle_t& snarl_handle){ - //get topo_index of nodes to be connected to graph start handle - auto it = find(new_snarl_topo_order.begin(), new_snarl_topo_order.end(), snarl_handle); - int topo_index = it - new_snarl_topo_order.begin(); - // cerr << "topo_index" << topo_index << endl; - // cerr << "i" << i << endl; - - //connect graph start handle - graph.create_edge(graph_snarl_topo_order[i], graph_snarl_topo_order[topo_index]); - }); - } - -} - -//Returns tuple of two handles, first being start and second being sink. 
-pair, vector> get_sources_and_sinks(HandleGraph& graph){ - vector sink; - vector source; - - // identify sources and sinks - graph.for_each_handle([&](const handle_t& handle) { - bool is_source = true, is_sink = true; - graph.follow_edges(handle, true, [&](const handle_t& prev) { - is_source = false; - return false; - }); - graph.follow_edges(handle, false, [&](const handle_t& next) { - is_sink = false; - return false; - }); - - // base case for dynamic programming - if (is_source) { - source.push_back(handle); - } - if (is_sink) { - sink.emplace_back(handle); - } - }); - - return pair, vector>(source, sink); - -} - - -VG strings_to_graph(const vector& walks){ - seqan::Align align; // create multiple_sequence_alignment object - - seqan::resize(rows(align), walks.size()); - for (int i = 0; i < walks.size(); ++i){ - assignSource(row(align, i), walks[i].c_str()); - } - - - globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); - - stringstream ss; - ss << align; - MSAConverter myMSAConverter = MSAConverter(); - myMSAConverter.load_alignments(ss, "seqan"); - VG snarl = myMSAConverter.make_graph(); - snarl.clear_paths(); - - - // snarl.serialize_to_ostream(cerr); - return snarl; -} - - - - -vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ - SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); - - unordered_map> sequences; - vector sinks; - unordered_map count; - count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl - sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl - - // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
- snarl.for_each_handle([&](const handle_t& handle) { - bool is_source = true, is_sink = true; - snarl.follow_edges(handle, true, [&](const handle_t& prev) { - is_source = false; - return false; - }); - snarl.follow_edges(handle, false, [&](const handle_t& next) { - is_sink = false; - return false; - }); - - // base case for dynamic programming - if (is_source) { - count[handle] = 1; - sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? - } - if (is_sink) { - sinks.emplace_back(handle); - } - }); - - - // count walks by dynamic programming - bool overflowed = false; - for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { - size_t count_here = count[handle]; - vector seqs_here = sequences[handle]; - - snarl.follow_edges(handle, false, [&](const handle_t& next) { - - size_t& count_next = count[next]; - string seq_next = snarl.get_sequence(next); - - if (numeric_limits::max() - count_here < count_next) { - overflowed = true; - } - - else { - count_next += count_here; - // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ - for (string seq : seqs_here){ - sequences[next].push_back(seq + seq_next); - } - // cerr << "next_seqs: "; - // for (string seq : sequences[next]){ - // cerr << seq << endl; - // } - } - }); - ///TODO: figure out how to deal with overflow. - // if (overflowed) { - // return numeric_limits::max(); - // } - } - - // total up the walks at the sinks - size_t total_count = 0; - for (handle_t& sink : sinks) { - total_count += count[sink]; - } - - // all the sequences at the sinks will be all the sequences in the snarl. - vector walks; - for (handle_t& sink : sinks) { - for (string seq : sequences[sink]){ - walks.push_back(seq); - } - } - - return walks; -} - - -// given a start and end node id, construct an extract subgraph between the two nodes (inclusive). 
-// TODO: change the arguments to handles, which contain orientation within themselves. -// That way, iteration to extract the subgraph will have direction contained within themselves. -// This may actually end up looking like simply parsing an input text file with the handles -// described from the find_snarl output. -SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ - /// make a subgraph containing only nodes of interest. (e.g. a snarl) - // make empty subgraph - SubHandleGraph subgraph = SubHandleGraph(&graph); - - unordered_set visited; // to avoid counting the same node twice. - unordered_set to_visit; // nodes found that belong in the subgraph. - - // TODO: how to ensure that "to the right" of start_handle is the correct direction? - // initialize with start_handle (because we move only to the right of start_handle): - handle_t start_handle = graph.get_handle(start_id); - subgraph.add_handle(start_handle); - visited.insert(graph.get_id(start_handle)); - - // look only to the right of start_handle - graph.follow_edges(start_handle, false, [&](const handle_t& handle){ - // mark the nodes to come as to_visit - if (visited.find(graph.get_id(handle)) == visited.end()) { - to_visit.insert(graph.get_id(handle)); - } - }); - - /// explore the rest of the snarl: - while (to_visit.size() != 0) { - // remove cur_handle from to_visit - unordered_set::iterator cur_index = to_visit.begin(); - handle_t cur_handle = graph.get_handle(*cur_index); - - to_visit.erase(cur_index); - - /// visit cur_handle - visited.insert(graph.get_id(cur_handle)); - - subgraph.add_handle(cur_handle); - - if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! 
- // look for all nodes connected to cur_handle that need to be added - // looking to the left, - graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ - // mark the nodes to come as to_visit - if (visited.find(graph.get_id(handle)) == visited.end()) { - to_visit.insert(graph.get_id(handle)); - } - }); - // looking to the right, - graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ - // mark the nodes to come as to_visit - if (visited.find(graph.get_id(handle)) == visited.end()) { - to_visit.insert(graph.get_id(handle)); - } - }); - } - } - return subgraph; -} -} \ No newline at end of file diff --git a/src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp b/src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp deleted file mode 100644 index 4a313e0993a..00000000000 --- a/src/algorithms/0_old_drafts/0_demo_final_old_0-diff_extension_for_not_at_source.cpp +++ /dev/null @@ -1,964 +0,0 @@ -// /* -// Misc. //todo's: -// do I need to fix the fact that find_haplotypes_not_at_source runs forever when given -// a non-snarl? -// */ - -// #pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? -// #include "0_demo_final_0.hpp" -// #include -// #include "../vg.hpp" -// #include "../handle.hpp" -// #include "../subgraph.hpp" -// #include "count_walks.hpp" -// #include -// #include -// #include -// #include "../msa_converter.hpp" -// #include "../snarls.hpp" -// #include "../gbwt_helper.hpp" -// #include "../stream/vpkg.hpp" -// #include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? 
- -// namespace vg { - -// // void print_kmer(const std::vector>&, const std::string& string){ -// // cerr << string << endl; -// // } - -// // vector get_path_strings(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle) { -// // unordered_map > handle_paths get_paths(graph, source_handle, sink_handle); -// // for (auto path : handle_paths) { -// // for (occuhandle : -// // } -// // } - - -// //TODO: fix the clean_snarl_from_haplotypes fxn to properly combine partial and full alignments. -// //TODO: make sure that I'm inserting all reference haplotypes in the spot that I wantd -// //TODO: (Now that I've converted depth_first fxn return value to a pair.) -// // // Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// // // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. -// // void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ -// // //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings -// // //representing all possible walks through the snarl: -// // vg::handle_t source_handle = graph.get_handle(source_id); -// // vg::handle_t sink_handle = graph.get_handle(sink_id); - -// // vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); -// // cerr << "finished depth_first, now on to reference." << endl; -// // vector reference = get_paths(graph, source_handle, sink_handle); - -// // haplotypes.insert(end(haplotypes), begin(reference), end(reference)); - -// // //Make a new snarl from walks: -// // VG new_snarl = strings_to_graph(haplotypes); - -// // integrate_snarl(graph, new_snarl, source_id, sink_id); - -// // } - -// // TODO: test/debug this! -// // Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. -// // Only walks along embedded paths. 
Returns a map with string keys and values of vectors of handles, -// // where each vector of handles represents one path from source to sink. -// // alternative function return: -// //unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ -// vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ -// unordered_map > paths; -// unordered_map multiple_occurrences; - -// // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. -// graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { -// // Each occurrence represents an embedded path -// // (note - in the case of a looped path, there will be multiple occurrences for one path.) -// // For each path represented by an occurrence, we need to walk along the path until we reach -// // the sink node. That series of handles represents the the sequence of the path. - -// string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); -// if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) - -// //record this in multiple_occurrences, and get the number of times we've seen this occurrence. -// int occ_num; -// if (multiple_occurrences.find(path) == multiple_occurrences.end()){ -// occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. -// multiple_occurrences[path] = occ_num; -// } else { -// occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. -// } - -// //record the other occurrences with an added identifier to differentiate between paths. -// paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); -// } -// else{ // this is the first time we've encountered this occurrence. 
-// paths[path].emplace_back(occurrence); -// } -// }); - -// //Now, for every occurrence, walk along the path until we reach the sink. -// for (pair > path : paths){ -// // cerr << "my name" << path.first << endl; -// // cerr << "my occurences:" << endl; -// // for (auto occ : path.second) { -// // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; -// // } -// // cerr << "testing get_next_occurrence:" << endl; -// // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); -// // cerr << cur_id; - -// // cur_occurence is the current handle while walking along the path -// occurrence_handle_t cur_occurrence = path.second.back(); -// id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); -// // store the path in paths, in the occurrence_handle_t vector. -// while (cur_id != graph.get_id(sink_handle)){ -// paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); -// // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); -// cur_occurrence = paths[path.first].back(); -// // cur_occurrence = path.second.back(); -// cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); -// cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; -// } -// path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); -// cerr << path.second.size() << endl; -// for (auto handle : path.second) { -// cerr << graph.get_sequence(graph.get_occurrence(handle)); -// } -// } -// cerr << "havin' issues here?" << endl; -// for (auto path : paths) { -// for (auto handle : path.second) { -// cerr << graph.get_sequence(graph.get_occurrence(handle)); -// } -// } -// // Resolve multiple_occurrences by identifying which entry in paths -// // (of those part of the same path) is longest - that will -// // represent the full breadth of the path through the snarl. 
-// for (pair element : multiple_occurrences){ -// // A vector of all the path entries in paths: -// vector same_path_names = {element.first}; - -// int max_len = paths[element.first].size(); -// string max_path = element.first; - -// for (int occ_num : range_vector(element.second)){ -// occ_num++; // we actually need range_vector[1, ..., end()] -// string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; -// int cur_len = paths[cur_path].size(); -// same_path_names.push_back(cur_path); - -// if (cur_len > max_len){ -// max_len = cur_len; -// max_path = cur_path; -// } -// } - -// // get rid of the smaller fragments of path: -// for (string name : same_path_names) { -// if (name != max_path){ -// paths.erase(name); -// } -// } -// } -// vector path_strings; -// // get just the strings from the unordered_map > paths object: -// for (auto path : paths) { -// string path_string; -// for (auto handle : path.second) { -// path_string += graph.get_sequence(graph.get_occurrence(handle)); -// } -// path_strings.push_back(path_string); -// } -// return path_strings; -// } - -// vector> find_haplotypes_not_at_source(const GBWTGraph haploGraph, unordered_set touched_handles, const id_t& sink_id){ -// cerr << "finding haplotypes not at source!" << endl; -// /// Search every handle in touched handles for haplotypes starting at that point. -// // Any new haplotypes will be added to new_searches. -// vector, gbwt::SearchState>> new_searches; - -// // Fully extended haplotypes (or haplotypes extended to the snarl's sink) -// // will be added to finished_searches. -// vector> finished_searches; - -// // In addition, we need to put the new handle into to_search, because a path may have -// // started on the new handle (which means we need to start a searchstate there.) -// unordered_set to_search; - -// // We don't need to ever check the sink handle, since paths from the sink handle -// // extend beyond snarl. //TODO: true? 
-// handle_t sink_handle = haploGraph.get_handle(sink_id); -// touched_handles.erase(sink_handle); - -// // Create nested function for making a new_search: -// auto make_new_search = [&](handle_t handle) { -// cerr << "lambda" << endl; - -// // Are there any new threads starting at this handle? -// gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(handle)); -// if (new_search != gbwt::SearchState()){ //TODO: this is the "null" version of SearchState, right? -// // Then add them to new_searches. - -// haploGraph.follow_paths(new_search, [&](const gbwt::SearchState& next_search) -> bool { - -// handle_t next_handle = haploGraph.node_to_handle(next_search.node); - -// /// check to make sure that the thread isn't already finished: -// // if next_handle is the sink, or if this thread is only one handle long, -// // then there isn't any useful string to extract from this. -// if (next_handle != sink_handle || next_search == gbwt::SearchState()){ -// // establish a new thread to walk along. -// vector new_path; -// new_path.push_back(handle); -// new_path.push_back(next_handle); - -// pair, gbwt::SearchState > mypair = make_pair(new_path, next_search); - - -// // add the new path to new_searches to be extended. -// new_searches.push_back(make_pair(new_path, next_search)); - -// // if next_handle hasn't been checked for starting threads, add to to_search. -// if (touched_handles.find(next_handle) == touched_handles.end()){ -// to_search.emplace(next_handle); -// } -// } -// return true; -// }); -// } -// }; - -// cerr << "1" << endl; -// // Search every handle in touched handles for haplotypes starting at that point. -// for (handle_t handle : touched_handles){ -// make_new_search(handle); -// } - -// /// Extend any paths in new_searches, and add any newly found handles to to_search. -// /// Then, check to see if there are any new threads on handles in to_search. 
-// /// Extend those threads, and add any newly found handles to to_search, -// /// then search for threads again in to_search again... repeat until to_search remains -// /// emptied of new handles. - -// // for tracking whether the haplotype thread is still extending: -// bool still_extending; -// cerr << "2" << endl; - -// while (!new_searches.empty()){ -// cerr << "extend new_searches" << endl; - -// for (auto search : new_searches){ -// for (auto handle : search.first){ -// cerr << haploGraph.get_sequence(handle) << " " << haploGraph.get_id(handle) << endl; -// } -// } - -// // First, extend new_searches, adding any newly found handles to to_search. -// for (pair, gbwt::SearchState> new_search : new_searches){ - -// // while the thread is still being extended: -// still_extending = true; -// while (still_extending){ -// cerr << "still_extending: " << endl; -// for (auto handle : new_search.first){ -// cerr << haploGraph.get_sequence( handle ) << " " << haploGraph.get_id(handle) << " "; - -// } -// cerr << endl; - -// // first, count the number of alternate, distinct paths we could walk along at the current searchState. -// int path_splits; -// haploGraph.follow_paths(new_search.second, [&](const gbwt::SearchState& next_search) -> bool { -// path_splits++; -// }); - -// /// if there's more than one path_split, then continue to fully extend the first path the haplotype takes. -// /// The rest of the alternate paths must be passed to the end of new_searches. -// /// No matter what, we extend the path: -// // extend along paths until we reach an end condition. -// bool first_path = true; - -// gbwt::SearchState first_next_search; -// still_extending = haploGraph.follow_paths(new_search.second, [&](const gbwt::SearchState& next_search) -> bool { -// if (!first_path) -// handle_t next_handle = haploGraph.node_to_handle(next_search.node); - -// // we only want to fully extend one path at a time. 
So, if haploGraph.follow_paths gives us -// if (first_path) { -// // if next_handle is the sink, add it to new_search and end extension. -// if( haploGraph.get_id(next_handle) == sink_id ){ -// // if next_handle hasn't been checked for starting threads, add to to_search. -// if (touched_handles.find(next_handle)!= touched_handles.end()){ -// to_search.emplace(next_handle); -// } - -// // if this is the first -// new_search.first.push_back(next_handle); -// return false; // done extending -// } - -// // if next_search falls off the end of the path, then we've already finished extending -// // the path. -// if (next_search == gbwt::SearchState()){ -// return false; -// } - -// // otherwise, continue extending the thread to walk along. -// cerr << new_search.first.size() << endl; -// new_search.first.push_back(next_handle); -// new_search.second = next_search; -// cerr << new_search.first.size() << endl; - -// // if next_handle hasn't been checked for starting threads, add to to_search. -// if (touched_handles.find(next_handle)!= touched_handles.end()){ -// to_search.emplace(next_handle); -// } - -// bool first_path = false; -// } - - -// return true; -// }); -// } -// // new_search is finished extending. Place in finished_searches. -// finished_searches.push_back(new_search.first); -// } -// // All new_searches have been fully extended and added to finished_searches. -// new_searches.clear(); - -// // Then, make more new_searches from the handles in to_search. -// for (handle_t handle : to_search){ -// make_new_search(handle); -// } -// to_search.clear(); - -// } - -// return finished_searches; -// } - - - - - - -// //TODO: does GBWTgraphs have names associated with haplotypes? -// //TODO: If so, I should change return value to an unordered map with key haplotype name -// //TODO: and value vector of all handles in haplotype (also, rename fxn). - -// //Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. 
-// //However, is modified to include all haplotypes inside the source/sink handles, -// //even ones that don't include the source or sink handles. -// //Returns: a vector of strings representing all paths reaching from source to sink in the snarl, -// // and a vector of strings representing all other paths in the snarl (e.g. any that don't -// // reach both source and sink in the graph.) -// pair, vector> depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id){ -// cerr << "depth first begins!" << endl; - - -// ///GBWT graph construction stuff that belongs in mod_main: -// ifstream gbwt_stream; -// //TODO: make it so that gbwt file is customized by user rather than hardcoded. -// // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; -// string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.gbwt"; -// gbwt_stream.open(gbwt_name); - -// unique_ptr gbwt; -// // Load the GBWT from its container -// gbwt = stream::VPKG::load_one(gbwt_stream); -// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); -// // ----------------------------------------------------------------------------------------- -// /// Perform depth first search, where whenever the search reaches sink_handle, convert -// /// vector of handles to string (should be equivalent to haplotype). -// //TODO: somehow note/account for how recording of haplotype will be terminated the first time it touches the sink_handle - -// //TODO: this function currently doesn't account for if it loops back. - -// //touched_handles contains all handles that have been touched by the depth_first_search, -// //for later use in other_haplotypes_to_strings, which identifies paths that didn't stretch -// //from source to sink in the snarl. -// unordered_set touched_handles; - -// //haplotype_queue contains all started exon_haplotypes not completed yet. 
-// //Every time we encounter a branch in the paths, the next node down the path -// //Is stored here, along with the vector of handles that represents the path up -// //to the SearchState. -// vector< pair< vector, gbwt::SearchState> > haplotype_queue; - -// // source and sink handle for haploGraph: -// handle_t source_handle = haploGraph.get_handle(source_id); -// handle_t sink_handle = haploGraph.get_handle(sink_id); - -// //place source in haplotype_queue. -// vector source_handle_vec(1, source_handle); -// gbwt::SearchState source_state = haploGraph.get_state(source_handle); -// haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); -// touched_handles.emplace(source_handle); - -// //haplotypes contains all "finished" haplotypes - those that were either walked -// //to their conclusion, or until they reached the sink. -// vector< vector > source_to_sink_haplotype_handle_vecs; -// vector< vector > source_without_sink_haplotype_handle_vecs; - -// // cerr << "hap size before pop" << haplotype_queue.size() << endl; -// // haplotype_queue.pop_back(); - -// // cerr << "hap size after pop" << haplotype_queue.size() << endl; - -// while (!haplotype_queue.empty()) { -// cerr << "iteration! 
with haplotype_queue:" << endl; - -// for(auto hap : haplotype_queue) { -// cerr << "here's a hap" << endl; -// for(auto handle : hap.first){ -// cerr << haploGraph.get_sequence(handle) << " " << haploGraph.get_id(handle) << " "; -// } -// cerr << endl; -// } - -// pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); // Tuple of (handles_traversed_so_far, last_touched_SearchState) - -// haplotype_queue.pop_back(); - -// vector next_searches; - -// haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { -// next_searches.push_back(next_search); -// return true; -// }); - -// // if next_searches > 1, then we need to make multiple new haplotypes to be recorded in haplotype_queue -// // or one of the haplotype_handle_vecs. -// if (next_searches.size()>1){ - -// // for every next_search in next_searches, either create a new, extended cur_haplotype to push into haplotype queue, -// // or place in the source_to_sink_haplotype_handle_vecs if haplotype extends to sink, -// // or place in the source_without_sink_haplotype_handle_vecs if haplotype ends before reaching sink. -// for (gbwt::SearchState next_search : next_searches){ -// handle_t next_handle = haploGraph.node_to_handle(next_search.node); - -// // copy over the vector of cur_haplotype: -// vector next_handle_vec(cur_haplotype.first); - -// // add the new handle to the vec: -// next_handle_vec.push_back(next_handle); - -// // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs -// if (haploGraph.get_id(next_handle) == sink_id){ - -// source_to_sink_haplotype_handle_vecs.push_back(next_handle_vec); - -// } else { // keep extending the haplotype! - -// pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); -// haplotype_queue.push_back(next_haplotype); - -// } - -// //next_handle will be touched. 
-// touched_handles.emplace(next_handle); -// } - -// } else if ( next_searches.empty() ) { // if next_searches is empty, the path has ended but not reached sink. - -// // We have reached the end of the path, but it doesn't reach the sink. -// // we need to add cur_haplotype to source_without_sink_haplotype_handle_vecs. -// source_without_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); - -// // if new_handle is the sink, put in source_to_sink_haplotype_handle_vecs -// } else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { - -// // Then we need to add cur_haplotype + next_search to source_to_sink_haplotype_handle_vecs. -// handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); -// cur_haplotype.first.push_back(next_handle); -// source_to_sink_haplotype_handle_vecs.push_back(cur_haplotype.first); - -// //touched next_search's handle -// touched_handles.emplace(next_handle); - -// // else, just extend the one search. -// } else { - -// // Then there is just one next_search, and it's not the end of the path. 
-// // add (cur_haplotype + next_search to haplotype_queue -// handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); -// cur_haplotype.first.push_back(next_handle); -// cur_haplotype.second = next_searches.back(); -// haplotype_queue.push_back(cur_haplotype); -// touched_handles.emplace(next_handle); - -// } - -// } - -// //Now, transform the each vector of handles in source_to_sink_haplotype_handle_vecs -// //into a string, and return as a vector of strings -// vector source_to_sink_haplotype_strings; -// // for (vector vector_hap : source_to_sink_haplotype_handle_vecs){ -// // string hap; -// // for (handle_t& handle : vector_hap){ -// // hap += haploGraph.get_sequence(handle); -// // } -// // source_to_sink_haplotype_strings.push_back(hap); -// // } - -// //Find any haplotypes starting from handles not starting at the source, but which -// //still start somewhere inside the snarl. -// vector> haplotypes_not_starting_at_source = find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); - -// //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. -// vector other_haplotype_strings; -// for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ -// string hap; -// for (handle_t& handle : vector_hap){ -// hap += haploGraph.get_sequence(handle); -// } -// other_haplotype_strings.push_back(hap); -// } - -// //Convert handle_t in source_without_sink_haplotype_handle_vecs to strings. -// for (vector vector_hap : source_without_sink_haplotype_handle_vecs){ -// string hap; -// for (handle_t& handle : vector_hap){ -// hap += haploGraph.get_sequence(handle); -// } -// other_haplotype_strings.push_back(hap); -// } - -// return make_pair(source_to_sink_haplotype_strings, other_haplotype_strings); -// } - - - - - - - - - - - - - - - - -// //TODO: delete this function once I've decided I don't want it anymore. Should be replaced with (renamed) depth_first_haplotypes_to_strings. 
-// // Pull out each haplotype passing through a snarl (defined by source_id and sink_id) as a string. -// vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ - -// ///stuff that will go in mod_main: -// ifstream gbwt_stream; -// string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; -// gbwt_stream.open(gbwt_name); - -// unique_ptr gbwt; -// // Load the GBWT from its container -// gbwt = stream::VPKG::load_one(gbwt_stream); - -// // ----------------------------------------------------------------- -// /// make subgraph for the snarl: - -// // graph.for_each_handle([&] (const handle_t& handle)-> bool{ -// // cerr << "test for graph "; -// // cerr << graph.get_id(handle) << endl; -// // return true; -// // }); - -// SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); - -// // snarl.for_each_handle_impl([&] (const handle_t& handle)-> bool{ -// // cerr << "test for snarl "; -// // cerr << snarl.get_id(handle) << endl; -// // return true; -// // }); -// // cerr << "before 1 \n"; - -// // GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); //TODO: figure out how to prevent error msg here. -// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); -// // cerr << "after 1 \n"; - -// // cerr << "before \n"; -// // haploGraph.for_each_handle([&] (const handle_t& handle)-> bool{ -// // cerr << "test for haploGraph "; -// // cerr << haploGraph.get_id(handle) << endl; -// // return true; -// // }); -// // cerr << "after \n"; - - -// //TODO:identify source and sinks for troubleshooting! 
-// unordered_map> sequences; // will contain all haplotype walks through snarl -// handle_t source_handle = haploGraph.get_handle(source_id); -// sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); - -// for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { - -// vector seqs_here = sequences[handle]; -// gbwt::SearchState cur_state = haploGraph.get_state(handle); - -// // id_t cur_id = haploGraph.get_id(handle); -// // cerr << "cur_id" << cur_id << endl; - -// haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { -// handle_t next_handle = GBWTGraph::node_to_handle(next_search.node); - -// id_t next_id = haploGraph.get_id(next_handle); -// cerr << "next_id" << next_id << endl; - -// string next_seq = haploGraph.get_sequence(next_handle); -// // transfer the sequences for the preceding handle to next_handle's sequences, -// // plus the new handle's sequence. -// for (string seq : seqs_here){ -// sequences[next_handle].push_back(seq + next_seq); -// } -// return true; - - -// }); -// } - -// // all the sequences at the sinks will be all the sequences in the snarl. -// handle_t sink_handle = haploGraph.get_handle(sink_id); -// return sequences[sink_handle]; -// // vector testVec; -// // return testVec; -// } - -// //Iterate over all snarls in a graph, and run clean_snarl on it. -// void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ -// SnarlManager* snarl_manager = new SnarlManager(snarl_stream); - -// /* Use this code to count number of snarls in graph. 
-// * int top_count = 0; -// * for (const Snarl* snarl : snarl_manager->top_level_snarls()){ -// * top_count++; -// * } -// * cerr << "number of top_level snarls in graph: " << top_count << endl; -// * -// * int general_count = 0; -// * snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ -// * general_count++; -// * }); -// * cerr << "number of total snarls in graph: " << general_count << endl; -// */ - - -// vector snarl_roots = snarl_manager->top_level_snarls(); -// for (auto roots : snarl_roots){ -// clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); -// } - -// delete snarl_manager; - - -// } - -// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. -// void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ -// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings -// //representing all possible walks through the snarl: -// vector walks = graph_to_strings(graph, start_id, end_id); - -// //Make a new snarl from walks: -// VG new_snarl = strings_to_graph(walks); - -// integrate_snarl(graph, new_snarl, start_id, end_id); - -// } - -// // Given a larger graph and a (usually cleaned snarl) subgraph, integrate new_snarl into the graph at start_id and end_id. -// void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id){ -// //Get old graph snarl -// SubHandleGraph graph_snarl = extract_subgraph(graph, start_id, end_id); - -// //Identify old and new snarl start and sink -// pair, vector> graph_snarl_defining_handles = get_sources_and_sinks(graph_snarl); -// pair, vector> new_snarl_defining_handles = get_sources_and_sinks(new_snarl); - -// //Check to make sure that newly made snarl has only one start and end. 
-// if(new_snarl_defining_handles.first.size() > 1 || new_snarl_defining_handles.second.size() > 1){ -// cerr << "newly made snarl with more than one start or end. # of starts: " << new_snarl_defining_handles.first.size() << " # of ends: " << new_snarl_defining_handles.second.size() << endl; -// return; -// } -// //extract old and new snarl start and sink: -// handle_t new_snarl_start = new_snarl_defining_handles.first[0]; -// handle_t new_snarl_end = new_snarl_defining_handles.second[0]; - -// handle_t graph_snarl_start = graph_snarl_defining_handles.first[0]; -// handle_t graph_snarl_end = graph_snarl_defining_handles.second[0]; - -// ///Replace start and end handles of old graph snarl with new_snarl start and end, and delete -// ///rest of old graph snarl. - -// //Get everything needed to replace graph start and sink. -// string new_start_seq = new_snarl.get_sequence(new_snarl_start); -// string new_end_seq = new_snarl.get_sequence(new_snarl_end); -// id_t new_start_id = graph.get_id(graph_snarl_start); -// id_t new_end_id = graph.get_id(graph_snarl_end); -// vector left_of_start; -// graph.follow_edges(graph_snarl_start, true, [&](const handle_t& handle){ -// left_of_start.emplace_back(handle); -// }); -// vector right_of_end; -// graph.follow_edges(graph_snarl_end, false, [&](const handle_t& handle){ -// right_of_end.emplace_back(handle); -// }); - -// //Delete all handles in graph_snarl -// graph_snarl.for_each_handle([&](const handle_t& handle){ -// graph.destroy_handle(handle); -// }, false); - -// //Make start and end handles for snarl in graph: -// handle_t new_start_handle = graph.create_handle(new_start_seq, new_start_id); -// handle_t new_end_handle = graph.create_handle(new_end_seq, new_end_id); - -// //Insert start and end handles: -// for (handle_t handle : left_of_start) { -// graph.create_edge(handle, new_start_handle); -// } -// for (handle_t handle : right_of_end) { -// graph.create_edge(new_end_handle, handle); -// } - -// ///Reintegrate rest 
of new_snarl. -// //topologically ordered new_snarl. As I progress through each node in topo_order, -// //I can add all the nodes to the right of the snarl. The final node will be the -// //end node, which, instead of adding as a new node to graph, I'll re-connect -// //to the modified end_node, above. -// vector new_snarl_topo_order = algorithms::lazier_topological_order(&new_snarl); - -// //Construct a parallel graph_snarl_topo_order to identify -// //paralogous nodes between new_snarl and graph. -// vector graph_snarl_topo_order = {new_start_handle}; - -// for (auto it = ++new_snarl_topo_order.begin(); it != --new_snarl_topo_order.end(); it++){ -// //For every handle in new_snarl, make an (unconnected) handle in graph. -// string handle_seq = new_snarl.get_sequence(*it); -// handle_t graph_handle = graph.create_handle(handle_seq); -// graph_snarl_topo_order.push_back(graph_handle); -// } - -// graph_snarl_topo_order.push_back(new_end_handle); - -// //Connect the rest of the nodes: -// for (int i = 0; i < new_snarl_topo_order.size(); i++){ -// // cerr << new_snarl.get_id(new_snarl_topo_order[i]) << endl; - -// new_snarl.follow_edges(new_snarl_topo_order[i], false, [&](const handle_t& snarl_handle){ -// //get topo_index of nodes to be connected to graph start handle -// auto it = find(new_snarl_topo_order.begin(), new_snarl_topo_order.end(), snarl_handle); -// int topo_index = it - new_snarl_topo_order.begin(); -// // cerr << "topo_index" << topo_index << endl; -// // cerr << "i" << i << endl; - -// //connect graph start handle -// graph.create_edge(graph_snarl_topo_order[i], graph_snarl_topo_order[topo_index]); -// }); -// } - -// } - -// //Returns tuple of two handles, first being start and second being sink. 
-// pair, vector> get_sources_and_sinks(HandleGraph& graph){ -// vector sink; -// vector source; - -// // identify sources and sinks -// graph.for_each_handle([&](const handle_t& handle) { -// bool is_source = true, is_sink = true; -// graph.follow_edges(handle, true, [&](const handle_t& prev) { -// is_source = false; -// return false; -// }); -// graph.follow_edges(handle, false, [&](const handle_t& next) { -// is_sink = false; -// return false; -// }); - -// // base case for dynamic programming -// if (is_source) { -// source.push_back(handle); -// } -// if (is_sink) { -// sink.emplace_back(handle); -// } -// }); - -// return pair, vector>(source, sink); - -// } - - -// VG strings_to_graph(const vector& walks){ -// seqan::Align align; // create multiple_sequence_alignment object - -// seqan::resize(rows(align), walks.size()); -// for (int i = 0; i < walks.size(); ++i){ -// assignSource(row(align, i), walks[i].c_str()); -// } - - -// globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); - -// stringstream ss; -// ss << align; -// MSAConverter myMSAConverter = MSAConverter(); -// myMSAConverter.load_alignments(ss, "seqan"); -// VG snarl = myMSAConverter.make_graph(); -// snarl.clear_paths(); - - -// // snarl.serialize_to_ostream(cerr); -// return snarl; -// } - - - - -// vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ -// SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); - -// unordered_map> sequences; -// vector sinks; -// unordered_map count; -// count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl -// sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl - -// // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
-// snarl.for_each_handle([&](const handle_t& handle) { -// bool is_source = true, is_sink = true; -// snarl.follow_edges(handle, true, [&](const handle_t& prev) { -// is_source = false; -// return false; -// }); -// snarl.follow_edges(handle, false, [&](const handle_t& next) { -// is_sink = false; -// return false; -// }); - -// // base case for dynamic programming -// if (is_source) { -// count[handle] = 1; -// sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? -// } -// if (is_sink) { -// sinks.emplace_back(handle); -// } -// }); - - -// // count walks by dynamic programming -// bool overflowed = false; -// for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { -// size_t count_here = count[handle]; -// vector seqs_here = sequences[handle]; - -// snarl.follow_edges(handle, false, [&](const handle_t& next) { - -// size_t& count_next = count[next]; -// string seq_next = snarl.get_sequence(next); - -// if (numeric_limits::max() - count_here < count_next) { -// overflowed = true; -// } - -// else { -// count_next += count_here; -// // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ -// for (string seq : seqs_here){ -// sequences[next].push_back(seq + seq_next); -// } -// // cerr << "next_seqs: "; -// // for (string seq : sequences[next]){ -// // cerr << seq << endl; -// // } -// } -// }); -// ///TODO: figure out how to deal with overflow. -// // if (overflowed) { -// // return numeric_limits::max(); -// // } -// } - -// // total up the walks at the sinks -// size_t total_count = 0; -// for (handle_t& sink : sinks) { -// total_count += count[sink]; -// } - -// // all the sequences at the sinks will be all the sequences in the snarl. 
-// vector walks; -// for (handle_t& sink : sinks) { -// for (string seq : sequences[sink]){ -// walks.push_back(seq); -// } -// } - -// return walks; -// } - - -// // given a start and end node id, construct an extract subgraph between the two nodes (inclusive). -// // TODO: change the arguments to handles, which contain orientation within themselves. -// // That way, iteration to extract the subgraph will have direction contained within themselves. -// // This may actually end up looking like simply parsing an input text file with the handles -// // described from the find_snarl output. -// SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ -// /// make a subgraph containing only nodes of interest. (e.g. a snarl) -// // make empty subgraph -// SubHandleGraph subgraph = SubHandleGraph(&graph); - -// unordered_set visited; // to avoid counting the same node twice. -// unordered_set to_visit; // nodes found that belong in the subgraph. - -// // TODO: how to ensure that "to the right" of start_handle is the correct direction? 
-// // initialize with start_handle (because we move only to the right of start_handle): -// handle_t start_handle = graph.get_handle(start_id); -// subgraph.add_handle(start_handle); -// visited.insert(graph.get_id(start_handle)); - -// // look only to the right of start_handle -// graph.follow_edges(start_handle, false, [&](const handle_t& handle){ -// // mark the nodes to come as to_visit -// if (visited.find(graph.get_id(handle)) == visited.end()) { -// to_visit.insert(graph.get_id(handle)); -// } -// }); - -// /// explore the rest of the snarl: -// while (to_visit.size() != 0) { -// // remove cur_handle from to_visit -// unordered_set::iterator cur_index = to_visit.begin(); -// handle_t cur_handle = graph.get_handle(*cur_index); - -// to_visit.erase(cur_index); - -// /// visit cur_handle -// visited.insert(graph.get_id(cur_handle)); - -// subgraph.add_handle(cur_handle); - -// if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! -// // look for all nodes connected to cur_handle that need to be added -// // looking to the left, -// graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ -// // mark the nodes to come as to_visit -// if (visited.find(graph.get_id(handle)) == visited.end()) { -// to_visit.insert(graph.get_id(handle)); -// } -// }); -// // looking to the right, -// graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ -// // mark the nodes to come as to_visit -// if (visited.find(graph.get_id(handle)) == visited.end()) { -// to_visit.insert(graph.get_id(handle)); -// } -// }); -// } -// } -// return subgraph; -// } -// } \ No newline at end of file diff --git a/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp b/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp deleted file mode 100644 index 8898bc5ef2e..00000000000 --- a/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.cpp +++ /dev/null @@ -1,741 +0,0 @@ -// /* -// In this phase of the code, I've only 
extracted paths in the depth_first_search -// that are overlapping the source node. Next step is to integrate paths that start -// in the middle of the snarl. -// */ - -// #pragma once //TODO: remove this, to avoid warnings + maybe bad coding practice? -// #include "0_demo_final_0.hpp" -// #include -// #include "../vg.hpp" -// #include "../handle.hpp" -// #include "../subgraph.hpp" -// #include "count_walks.hpp" -// #include -// #include -// #include -// #include "../msa_converter.hpp" -// #include "../snarls.hpp" -// #include "../gbwt_helper.hpp" -// #include "../stream/vpkg.hpp" -// #include "../../include/handlegraph/path_handle_graph.hpp" //TODO: Do I need this? - -// namespace vg { - -// // void print_kmer(const std::vector>&, const std::string& string){ -// // cerr << string << endl; -// // } - -// // vector get_path_strings(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle) { -// // unordered_map > handle_paths get_paths(graph, source_handle, sink_handle); -// // for (auto path : handle_paths) { -// // for (occuhandle : -// // } -// // } - -// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. -// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id){ -// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings -// //representing all possible walks through the snarl: -// vg::handle_t source_handle = graph.get_handle(source_id); -// vg::handle_t sink_handle = graph.get_handle(sink_id); - -// vector haplotypes = depth_first_haplotypes_to_strings(graph, source_id, sink_id); -// cerr << "finished depth_first, now on to reference." 
<< endl; -// vector reference = get_paths(graph, source_handle, sink_handle); - -// haplotypes.insert(end(haplotypes), begin(reference), end(reference)); - -// //Make a new snarl from walks: -// VG new_snarl = strings_to_graph(haplotypes); - -// integrate_snarl(graph, new_snarl, source_id, sink_id); - -// } - -// // TODO: test/debug this! -// // Given a snarl in graph defined by source_handle and sink_handle, return all walks associated with an embedded path. -// // Only walks along embedded paths. Returns a map with string keys and values of vectors of handles, -// // where each vector of handles represents one path from source to sink. -// // alternative function return: -// //unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle){ -// vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle){ -// unordered_map > paths; -// unordered_map multiple_occurrences; - -// // TODO: figure out how to ensure that the occurrence handle is in the correct orientation, i.e. towards the sink. -// graph.for_each_occurrence_on_handle(source_handle, [&] (const occurrence_handle_t& occurrence) { -// // Each occurrence represents an embedded path -// // (note - in the case of a looped path, there will be multiple occurrences for one path.) -// // For each path represented by an occurrence, we need to walk along the path until we reach -// // the sink node. That series of handles represents the the sequence of the path. - -// string path = graph.get_path_name(graph.get_path_handle_of_occurrence(occurrence)); -// if (paths.find(path) != paths.end()){ // if there are multiple occurrences on the same path for source_handle (i.e. a loop) - -// //record this in multiple_occurrences, and get the number of times we've seen this occurrence. 
-// int occ_num; -// if (multiple_occurrences.find(path) == multiple_occurrences.end()){ -// occ_num = 1; // counting from 0, where the first ("zeroeth") occurrence doesn't get a special key name in paths. -// multiple_occurrences[path] = occ_num; -// } else { -// occ_num = multiple_occurrences[path]++; // also increments multiple_occurrences. -// } - -// //record the other occurrences with an added identifier to differentiate between paths. -// paths["occurrence_" + to_string(occ_num) + ":::" + path].emplace_back(occurrence); -// } -// else{ // this is the first time we've encountered this occurrence. -// paths[path].emplace_back(occurrence); -// } -// }); - -// //Now, for every occurrence, walk along the path until we reach the sink. -// for (pair > path : paths){ -// // cerr << "my name" << path.first << endl; -// // cerr << "my occurences:" << endl; -// // for (auto occ : path.second) { -// // cerr << "occurrence " << graph.get_sequence(graph.get_occurrence(occ)) << endl; -// // } -// // cerr << "testing get_next_occurrence:" << endl; -// // id_t cur_id = graph.get_id(graph.get_occurrence(path.second)); -// // cerr << cur_id; - -// // cur_occurence is the current handle while walking along the path -// occurrence_handle_t cur_occurrence = path.second.back(); -// id_t cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); -// // store the path in paths, in the occurrence_handle_t vector. 
-// while (cur_id != graph.get_id(sink_handle)){ -// paths[path.first].push_back(graph.get_next_occurrence(cur_occurrence)); -// // path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); -// cur_occurrence = paths[path.first].back(); -// // cur_occurrence = path.second.back(); -// cur_id = graph.get_id(graph.get_occurrence(cur_occurrence)); -// cerr << "cur id " << cur_id << " sink id " << graph.get_id(sink_handle) << endl; -// } -// path.second.emplace_back(graph.get_next_occurrence(cur_occurrence)); -// cerr << path.second.size() << endl; -// for (auto handle : path.second) { -// cerr << graph.get_sequence(graph.get_occurrence(handle)); -// } -// } -// cerr << "havin' issues here?" << endl; -// for (auto path : paths) { -// for (auto handle : path.second) { -// cerr << graph.get_sequence(graph.get_occurrence(handle)); -// } -// } -// // Resolve multiple_occurrences by identifying which entry in paths -// // (of those part of the same path) is longest - that will -// // represent the full breadth of the path through the snarl. 
-// for (pair element : multiple_occurrences){ -// // A vector of all the path entries in paths: -// vector same_path_names = {element.first}; - -// int max_len = paths[element.first].size(); -// string max_path = element.first; - -// for (int occ_num : range_vector(element.second)){ -// occ_num++; // we actually need range_vector[1, ..., end()] -// string cur_path = "occurrence_" + to_string(occ_num) + ":::" + element.first; -// int cur_len = paths[cur_path].size(); -// same_path_names.push_back(cur_path); - -// if (cur_len > max_len){ -// max_len = cur_len; -// max_path = cur_path; -// } -// } - -// // get rid of the smaller fragments of path: -// for (string name : same_path_names) { -// if (name != max_path){ -// paths.erase(name); -// } -// } -// } -// vector path_strings; -// // get just the strings from the unordered_map > paths object: -// for (auto path : paths) { -// string path_string; -// for (auto handle : path.second) { -// path_string += graph.get_sequence(graph.get_occurrence(handle)); -// } -// path_strings.push_back(path_string); -// } -// return path_strings; -// } - -// //TODO: does GBWTgraphs have names associated with haplotypes? -// //TODO: If so, I should change return value to an unordered map with key haplotype name -// //TODO: and value vector of all handles in haplotype (also, rename fxn). - -// //Depth first search here is based on get_exon_haplotypes from transcriptome.cpp. -// //However, is modified to include all haplotypes inside the source/sink handles, -// //even ones that don't include the source or sink handles. 
-// vector depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id){ -// ///GBWT graph construction stuff that belongs in mod_main: -// ifstream gbwt_stream; -// string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; -// gbwt_stream.open(gbwt_name); - -// unique_ptr gbwt; -// // Load the GBWT from its container -// gbwt = stream::VPKG::load_one(gbwt_stream); -// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); -// // ----------------------------------------------------------------------------------------- -// /// Perform depth first search, where whenever the search reaches sink_handle, convert -// /// vector of handles to string (should be equivalent to haplotype). -// //TODO: somehow note/account for how recording of haplotype will be terminated the first time it touches the sink_handle - -// //TODO: this function currently doesn't account for if it loops back. - -// //haplotype_queue contains all started exon_haplotypes not completed yet. -// //Every time we encounter a branch in the paths, the next node down the path -// //Is stored here, along with the vector of handles that represents the path up -// //to the SearchState. -// vector< pair< vector, gbwt::SearchState> > haplotype_queue; - -// // source and sink handle for haploGraph: -// handle_t source_handle = haploGraph.get_handle(source_id); -// handle_t sink_handle = haploGraph.get_handle(sink_id); - -// //place source in haplotype_queue. 
-// vector source_handle_vec(1, source_handle); -// gbwt::SearchState source_state = haploGraph.get_state(source_handle); -// // pair< vector, gbwt::SearchState> source = make_pair(source_handle_vec, source_state); -// haplotype_queue.push_back( make_pair( source_handle_vec, source_state ) ); -// /* -// cerr << "node id from original graph " << source_id << endl; -// cerr << "node id from handle (immediately after construction) " << haploGraph.get_id(haploGraph.get_handle(source_id)) << endl; -// cerr << "node id from handle " << haploGraph.get_id(haplotype_queue.back().first.back()) << endl; -// cerr << "node id from search state " << haplotype_queue.back().second.node << endl; - - -// cerr << "here's the code I want to run" << endl; -// cerr << "here's the handle I care about " << haploGraph.get_sequence(haplotype_queue.back().first.back()) << endl; -// haploGraph.node_to_handle(haplotype_queue.back().second.node); -// cerr<<" made a handle "<< endl; -// cerr << haploGraph.get_sequence(haploGraph.node_to_handle(haplotype_queue.back().second.node)) << endl; -// cerr << "finished running code. " << endl; -// */ -// //haplotypes contains all "finished" haplotypes - those that were either walked -// //to their conclusion, or until they reached the sink. -// vector< vector > final_haplotypes; - -// /* -// What if I made a list of start_search_states, which would keep track of places I'd like my backtrace to go -// (maybe make this a tuple to ensure that end_search_states find the start_search_state I intended for them.) -// Actually, right now I'm only worrying about paths that start at the source_node. The new paths I encounter -// partway through the traversal will be dealt with a seperate function call that will make use of bidirected -// search to ensure I find all the relevant components of the path. - -// Okay. Keep track of the start_search_state, which is the source_node. 
-// Then, have a vector of all bidirected SearchStates (variable expanding_search_states) -// that are at "the border" of my current search of the snarl. - -// On each iteration of the while loop, get and pop the cur_search_state -// out of the back of expanding_search_states. Using haploGraph.follow_paths, find every search_state that -// proceeds cur_search_state, and add them to either expanding_search_states or end_search_states. -// - add cur_search_state to end_search_states if next_search_state.empty() = true; or -// - add next_search_state to end_search_states if haploGraph.get_handle(next_search_state.node) == sink_handle. -// - - -// (as a bidirectional) if there are no -// */ - - - -// while (haplotype_queue.size() != 0) { -// cerr << "iteration! with haplotype_queue:" << endl; -// // for (auto hap : haplotype_queue){ -// // cerr << hap.first.size(); -// // } - -// pair< vector, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); // Tuple of (handles_traversed_so_far, last_touched_SearchState) - -// haplotype_queue.pop_back(); - -// vector next_searches; - -// haploGraph.follow_paths(cur_haplotype.second, [&](const gbwt::SearchState& next_search) -> bool { -// next_searches.push_back(next_search); -// return true; -// }); - -// if (next_searches.size()>1){ -// for (gbwt::SearchState next_search : next_searches){ -// // copy over the vector of cur_haplotype: -// vector next_handle_vec(cur_haplotype.first); -// // add the new handle to the vec: -// next_handle_vec.push_back(haploGraph.node_to_handle(next_search.node)); //TODO: next_search.node is of type node_type, not node_id. Is that okay? 
-// cerr << haploGraph.get_sequence(haploGraph.node_to_handle(next_search.node)) << endl; -// pair< vector, gbwt::SearchState> next_haplotype = make_pair(next_handle_vec, next_search); -// haplotype_queue.push_back(next_haplotype); -// } -// } else if (haploGraph.get_id(haploGraph.node_to_handle(next_searches.back().node)) == sink_id ) { //TODO: once again, is SearchState.node acceptable here? -// // Then we need to add cur_haplotype + next_search to final_haplotypes. -// cur_haplotype.first.push_back(haploGraph.node_to_handle(next_searches.back().node)); -// final_haplotypes.push_back(cur_haplotype.first); - -// } else if ( next_searches.back().empty()) { -// // Then we have reached the end of the path, and need to add cur_haplotype to final_haplotypes. -// final_haplotypes.push_back(cur_haplotype.first); -// } else { -// // Then there is just one next_search, and it's not the end of the path. -// // add (cur_haplotype + next_search to haplotype_queue -// cur_haplotype.first.push_back(haploGraph.node_to_handle(next_searches.back().node)); -// cur_haplotype.second = next_searches.back(); -// haplotype_queue.push_back(cur_haplotype); -// } - -// } - -// //Now, transform the each vector of handles in final_haplotypes into a string, and return as a vector of strings - -// vector string_haplotypes; -// for (vector vector_hap : final_haplotypes){ -// string hap; -// for (handle_t& handle : vector_hap){ -// hap += haploGraph.get_sequence(handle); -// } -// string_haplotypes.push_back(hap); -// } - -// return string_haplotypes; -// } - - - - - - - - - - - - - - - - -// //TODO: delete this function once I've decided I don't want it anymore. Should be replaced with (renamed) depth_first_haplotypes_to_strings. -// // Pull out each haplotype passing through a snarl (defined by source_id and sink_id) as a string. 
-// vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id){ - -// ///stuff that will go in mod_main: -// ifstream gbwt_stream; -// string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; -// gbwt_stream.open(gbwt_name); - -// unique_ptr gbwt; -// // Load the GBWT from its container -// gbwt = stream::VPKG::load_one(gbwt_stream); - -// // ----------------------------------------------------------------- -// /// make subgraph for the snarl: - -// // graph.for_each_handle([&] (const handle_t& handle)-> bool{ -// // cerr << "test for graph "; -// // cerr << graph.get_id(handle) << endl; -// // return true; -// // }); - -// SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); - -// // snarl.for_each_handle_impl([&] (const handle_t& handle)-> bool{ -// // cerr << "test for snarl "; -// // cerr << snarl.get_id(handle) << endl; -// // return true; -// // }); -// // cerr << "before 1 \n"; - -// // GBWTGraph haploGraph = GBWTGraph(*gbwt, snarl); //TODO: figure out how to prevent error msg here. -// GBWTGraph haploGraph = GBWTGraph(*gbwt, graph); -// // cerr << "after 1 \n"; - -// // cerr << "before \n"; -// // haploGraph.for_each_handle([&] (const handle_t& handle)-> bool{ -// // cerr << "test for haploGraph "; -// // cerr << haploGraph.get_id(handle) << endl; -// // return true; -// // }); -// // cerr << "after \n"; - - -// //TODO:identify source and sinks for troubleshooting! 
-// unordered_map> sequences; // will contain all haplotype walks through snarl -// handle_t source_handle = haploGraph.get_handle(source_id); -// sequences[source_handle].push_back(haploGraph.get_sequence(source_handle)); - -// for (const handle_t& handle : algorithms::lazier_topological_order(&haploGraph)) { - -// vector seqs_here = sequences[handle]; -// gbwt::SearchState cur_state = haploGraph.get_state(handle); - -// // id_t cur_id = haploGraph.get_id(handle); -// // cerr << "cur_id" << cur_id << endl; - -// haploGraph.follow_paths(cur_state, [&](const gbwt::SearchState& next_search) -> bool { -// handle_t next_handle = GBWTGraph::node_to_handle(next_search.node); - -// id_t next_id = haploGraph.get_id(next_handle); -// cerr << "next_id" << next_id << endl; - -// string next_seq = haploGraph.get_sequence(next_handle); -// // transfer the sequences for the preceding handle to next_handle's sequences, -// // plus the new handle's sequence. -// for (string seq : seqs_here){ -// sequences[next_handle].push_back(seq + next_seq); -// } -// return true; - - -// }); -// } - -// // all the sequences at the sinks will be all the sequences in the snarl. -// handle_t sink_handle = haploGraph.get_handle(sink_id); -// return sequences[sink_handle]; -// // vector testVec; -// // return testVec; -// } - -// //Iterate over all snarls in a graph, and run clean_snarl on it. -// void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream){ -// SnarlManager* snarl_manager = new SnarlManager(snarl_stream); - -// /* Use this code to count number of snarls in graph. 
-// * int top_count = 0; -// * for (const Snarl* snarl : snarl_manager->top_level_snarls()){ -// * top_count++; -// * } -// * cerr << "number of top_level snarls in graph: " << top_count << endl; -// * -// * int general_count = 0; -// * snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ -// * general_count++; -// * }); -// * cerr << "number of total snarls in graph: " << general_count << endl; -// */ - - -// vector snarl_roots = snarl_manager->top_level_snarls(); -// for (auto roots : snarl_roots){ -// clean_snarl(graph, roots->start().node_id(), roots->end().node_id()); -// } - -// delete snarl_manager; - - -// } - -// // Given a graph and a start_id and end_id representing the beginning and end of the snarl, -// // replaces the nodes between start_id and end_id (inclusive) with the sequence of interest. -// void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ -// //Convert subgraph of graph, defined by start_id and end_id, into a vector of strings -// //representing all possible walks through the snarl: -// vector walks = graph_to_strings(graph, start_id, end_id); - -// //Make a new snarl from walks: -// VG new_snarl = strings_to_graph(walks); - -// integrate_snarl(graph, new_snarl, start_id, end_id); - -// } - -// // Given a larger graph and a (usually cleaned snarl) subgraph, integrate new_snarl into the graph at start_id and end_id. -// void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id){ -// //Get old graph snarl -// SubHandleGraph graph_snarl = extract_subgraph(graph, start_id, end_id); - -// //Identify old and new snarl start and sink -// pair, vector> graph_snarl_defining_handles = get_sources_and_sinks(graph_snarl); -// pair, vector> new_snarl_defining_handles = get_sources_and_sinks(new_snarl); - -// //Check to make sure that newly made snarl has only one start and end. 
-// if(new_snarl_defining_handles.first.size() > 1 || new_snarl_defining_handles.second.size() > 1){ -// cerr << "newly made snarl with more than one start or end. # of starts: " << new_snarl_defining_handles.first.size() << " # of ends: " << new_snarl_defining_handles.second.size() << endl; -// return; -// } -// //extract old and new snarl start and sink: -// handle_t new_snarl_start = new_snarl_defining_handles.first[0]; -// handle_t new_snarl_end = new_snarl_defining_handles.second[0]; - -// handle_t graph_snarl_start = graph_snarl_defining_handles.first[0]; -// handle_t graph_snarl_end = graph_snarl_defining_handles.second[0]; - -// ///Replace start and end handles of old graph snarl with new_snarl start and end, and delete -// ///rest of old graph snarl. - -// //Get everything needed to replace graph start and sink. -// string new_start_seq = new_snarl.get_sequence(new_snarl_start); -// string new_end_seq = new_snarl.get_sequence(new_snarl_end); -// id_t new_start_id = graph.get_id(graph_snarl_start); -// id_t new_end_id = graph.get_id(graph_snarl_end); -// vector left_of_start; -// graph.follow_edges(graph_snarl_start, true, [&](const handle_t& handle){ -// left_of_start.emplace_back(handle); -// }); -// vector right_of_end; -// graph.follow_edges(graph_snarl_end, false, [&](const handle_t& handle){ -// right_of_end.emplace_back(handle); -// }); - -// //Delete all handles in graph_snarl -// graph_snarl.for_each_handle([&](const handle_t& handle){ -// graph.destroy_handle(handle); -// }, false); - -// //Make start and end handles for snarl in graph: -// handle_t new_start_handle = graph.create_handle(new_start_seq, new_start_id); -// handle_t new_end_handle = graph.create_handle(new_end_seq, new_end_id); - -// //Insert start and end handles: -// for (handle_t handle : left_of_start) { -// graph.create_edge(handle, new_start_handle); -// } -// for (handle_t handle : right_of_end) { -// graph.create_edge(new_end_handle, handle); -// } - -// ///Reintegrate rest 
of new_snarl. -// //topologically ordered new_snarl. As I progress through each node in topo_order, -// //I can add all the nodes to the right of the snarl. The final node will be the -// //end node, which, instead of adding as a new node to graph, I'll re-connect -// //to the modified end_node, above. -// vector new_snarl_topo_order = algorithms::lazier_topological_order(&new_snarl); - -// //Construct a parallel graph_snarl_topo_order to identify -// //paralogous nodes between new_snarl and graph. -// vector graph_snarl_topo_order = {new_start_handle}; - -// for (auto it = ++new_snarl_topo_order.begin(); it != --new_snarl_topo_order.end(); it++){ -// //For every handle in new_snarl, make an (unconnected) handle in graph. -// string handle_seq = new_snarl.get_sequence(*it); -// handle_t graph_handle = graph.create_handle(handle_seq); -// graph_snarl_topo_order.push_back(graph_handle); -// } - -// graph_snarl_topo_order.push_back(new_end_handle); - -// //Connect the rest of the nodes: -// for (int i = 0; i < new_snarl_topo_order.size(); i++){ -// // cerr << new_snarl.get_id(new_snarl_topo_order[i]) << endl; - -// new_snarl.follow_edges(new_snarl_topo_order[i], false, [&](const handle_t& snarl_handle){ -// //get topo_index of nodes to be connected to graph start handle -// auto it = find(new_snarl_topo_order.begin(), new_snarl_topo_order.end(), snarl_handle); -// int topo_index = it - new_snarl_topo_order.begin(); -// // cerr << "topo_index" << topo_index << endl; -// // cerr << "i" << i << endl; - -// //connect graph start handle -// graph.create_edge(graph_snarl_topo_order[i], graph_snarl_topo_order[topo_index]); -// }); -// } - -// } - -// //Returns tuple of two handles, first being start and second being sink. 
-// pair, vector> get_sources_and_sinks(HandleGraph& graph){ -// vector sink; -// vector source; - -// // identify sources and sinks -// graph.for_each_handle([&](const handle_t& handle) { -// bool is_source = true, is_sink = true; -// graph.follow_edges(handle, true, [&](const handle_t& prev) { -// is_source = false; -// return false; -// }); -// graph.follow_edges(handle, false, [&](const handle_t& next) { -// is_sink = false; -// return false; -// }); - -// // base case for dynamic programming -// if (is_source) { -// source.push_back(handle); -// } -// if (is_sink) { -// sink.emplace_back(handle); -// } -// }); - -// return pair, vector>(source, sink); - -// } - - -// VG strings_to_graph(const vector& walks){ -// seqan::Align align; // create multiple_sequence_alignment object - -// seqan::resize(rows(align), walks.size()); -// for (int i = 0; i < walks.size(); ++i){ -// assignSource(row(align, i), walks[i].c_str()); -// } - - -// globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); - -// stringstream ss; -// ss << align; -// MSAConverter myMSAConverter = MSAConverter(); -// myMSAConverter.load_alignments(ss, "seqan"); -// VG snarl = myMSAConverter.make_graph(); -// snarl.clear_paths(); - - -// // snarl.serialize_to_ostream(cerr); -// return snarl; -// } - - - - -// vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id){ -// SubHandleGraph snarl = extract_subgraph(graph, start_id, end_id); - -// unordered_map> sequences; -// vector sinks; -// unordered_map count; -// count.reserve(snarl.node_size()); // resize count to contain enough buckets for size of snarl -// sequences.reserve(snarl.node_size()); // resize sequences to contain enough buckets for size of snarl - -// // identify sources and sinks //TODO: once we've established that this fxn works, we can just use start_id and end_id. 
-// snarl.for_each_handle([&](const handle_t& handle) { -// bool is_source = true, is_sink = true; -// snarl.follow_edges(handle, true, [&](const handle_t& prev) { -// is_source = false; -// return false; -// }); -// snarl.follow_edges(handle, false, [&](const handle_t& next) { -// is_sink = false; -// return false; -// }); - -// // base case for dynamic programming -// if (is_source) { -// count[handle] = 1; -// sequences[handle].push_back(snarl.get_sequence(handle)); //TODO: presented in the handle's local forward orientation. An issue? -// } -// if (is_sink) { -// sinks.emplace_back(handle); -// } -// }); - - -// // count walks by dynamic programming -// bool overflowed = false; -// for (const handle_t& handle : algorithms::lazier_topological_order(&snarl)) { -// size_t count_here = count[handle]; -// vector seqs_here = sequences[handle]; - -// snarl.follow_edges(handle, false, [&](const handle_t& next) { - -// size_t& count_next = count[next]; -// string seq_next = snarl.get_sequence(next); - -// if (numeric_limits::max() - count_here < count_next) { -// overflowed = true; -// } - -// else { -// count_next += count_here; -// // for (auto it = seqs_here.begin(); it == seqs_here.end(); it++){ -// for (string seq : seqs_here){ -// sequences[next].push_back(seq + seq_next); -// } -// // cerr << "next_seqs: "; -// // for (string seq : sequences[next]){ -// // cerr << seq << endl; -// // } -// } -// }); -// ///TODO: figure out how to deal with overflow. -// // if (overflowed) { -// // return numeric_limits::max(); -// // } -// } - -// // total up the walks at the sinks -// size_t total_count = 0; -// for (handle_t& sink : sinks) { -// total_count += count[sink]; -// } - -// // all the sequences at the sinks will be all the sequences in the snarl. 
-// vector walks; -// for (handle_t& sink : sinks) { -// for (string seq : sequences[sink]){ -// walks.push_back(seq); -// } -// } - -// return walks; -// } - - -// // given a start and end node id, construct an extract subgraph between the two nodes (inclusive). -// // TODO: change the arguments to handles, which contain orientation within themselves. -// // That way, iteration to extract the subgraph will have direction contained within themselves. -// // This may actually end up looking like simply parsing an input text file with the handles -// // described from the find_snarl output. -// SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id){ -// /// make a subgraph containing only nodes of interest. (e.g. a snarl) -// // make empty subgraph -// SubHandleGraph subgraph = SubHandleGraph(&graph); - -// unordered_set visited; // to avoid counting the same node twice. -// unordered_set to_visit; // nodes found that belong in the subgraph. - -// // TODO: how to ensure that "to the right" of start_handle is the correct direction? 
-// // initialize with start_handle (because we move only to the right of start_handle): -// handle_t start_handle = graph.get_handle(start_id); -// subgraph.add_handle(start_handle); -// visited.insert(graph.get_id(start_handle)); - -// // look only to the right of start_handle -// graph.follow_edges(start_handle, false, [&](const handle_t& handle){ -// // mark the nodes to come as to_visit -// if (visited.find(graph.get_id(handle)) == visited.end()) { -// to_visit.insert(graph.get_id(handle)); -// } -// }); - -// /// explore the rest of the snarl: -// while (to_visit.size() != 0) { -// // remove cur_handle from to_visit -// unordered_set::iterator cur_index = to_visit.begin(); -// handle_t cur_handle = graph.get_handle(*cur_index); - -// to_visit.erase(cur_index); - -// /// visit cur_handle -// visited.insert(graph.get_id(cur_handle)); - -// subgraph.add_handle(cur_handle); - -// if (graph.get_id(cur_handle) != end_id){ // don't iterate past end node! -// // look for all nodes connected to cur_handle that need to be added -// // looking to the left, -// graph.follow_edges(cur_handle, true, [&](const handle_t& handle){ -// // mark the nodes to come as to_visit -// if (visited.find(graph.get_id(handle)) == visited.end()) { -// to_visit.insert(graph.get_id(handle)); -// } -// }); -// // looking to the right, -// graph.follow_edges(cur_handle, false, [&](const handle_t& handle){ -// // mark the nodes to come as to_visit -// if (visited.find(graph.get_id(handle)) == visited.end()) { -// to_visit.insert(graph.get_id(handle)); -// } -// }); -// } -// } -// return subgraph; -// } -// } \ No newline at end of file diff --git a/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp b/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp deleted file mode 100644 index 5154b47101f..00000000000 --- a/src/algorithms/0_old_drafts/0_demo_final_old_0-only_source_paths.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// /* -// Robin Rounthwaite -// Find function 
call in ./subcommand/main.cpp -// */ -// #include -// #include "../vg.hpp" -// #include "../handle.hpp" -// #include "../subgraph.hpp" -// #include "count_walks.hpp" - -// namespace vg { - -// void clean_snarl_from_haplotypes(MutablePathDeletableHandleGraph& graph, const id_t& source_id, const id_t& sink_id); - -// // vector get_path_strings(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle); - -// // unordered_map > get_paths(PathHandleGraph& graph, handle_t& source_handle, handle_t& sink_handle); -// vector get_paths(const PathHandleGraph& graph, const handle_t& source_handle, const handle_t& sink_handle); - -// vector depth_first_haplotypes_to_strings(const HandleGraph& graph, const id_t& source_id, const id_t& sink_id); - -// vector haplotypes_to_strings(MutablePathDeletableHandleGraph& graph, id_t& source_id, id_t& sink_id); - -// void clean_all_snarls(MutablePathDeletableHandleGraph& graph, ifstream& snarl_stream); - -// void clean_snarl(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); - -// SubHandleGraph extract_subgraph(MutablePathDeletableHandleGraph& graph, const id_t& start_id, const id_t& end_id); - -// vector graph_to_strings(MutablePathDeletableHandleGraph& graph, id_t start_id, id_t end_id); - -// VG strings_to_graph(const vector& walks); - -// void integrate_snarl(MutablePathDeletableHandleGraph& graph, HandleGraph& new_snarl, const id_t& start_id, const id_t& end_id); - -// pair, vector> get_sources_and_sinks(HandleGraph& graph); -// } diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp new file mode 100644 index 00000000000..ff7b187f4ae --- /dev/null +++ b/src/subcommand/0_normalize_main.cpp @@ -0,0 +1,176 @@ +// mod.cpp: define the "normalize" subcommand, which realigns snarls to produce more efficient representations of snarls. 
+ +#include +#include +#include + +#include "subcommand.hpp" + +#include "../hash_graph.hpp" +#include "../algorithms/0_draft_haplotype_realignment.hpp" +#include "../gbwt_helper.hpp" +#include "../../include/vg/io/vpkg.hpp" + + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void help_normalize(char** argv) { + cerr << "usage: " << argv[0] << " normalize [options] >[mod.vg]" << endl + << "Modifies snarls, outputs modified on stdout." << endl + << endl + << "options:" << endl + << " -n, --normalize normalizes a currently-hardcoded snarl from a graph." << endl; +} + +int main_normalize(int argc, char** argv) { + + if (argc == 2) { + help_normalize(argv); + return 1; + } + + bool normalize = false; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + + { + {"help", no_argument, 0, 'h'}, + {"normalize", no_argument, 0, 'n'}, //TODO: change no_argument to required_argument, assuming we want one. + {0, 0, 0, 0} + }; + + int option_index = 0; + c = getopt_long (argc, argv, "n", //TODO: change to "n:" later, when we have something to specify. + long_options, &option_index); + + + // Detect the end of the options. 
+ if (c == -1) + break; + + switch (c) + { + + case 'n': + normalize = true; + } + } + + HashGraph* graph; + get_input_file(optind, argc, argv, [&](istream& in) { + graph = new HashGraph(in); + }); + + if ( normalize ) + { + /// Build the gbwt: + ifstream gbwt_stream; + string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.gbwt"; //Nodes 23493 to 23505 + gbwt_stream.open(gbwt_name); + + // Load the GBWT from its container + unique_ptr gbwt; + gbwt = vg::io::VPKG::load_one(gbwt_stream); + GBWTGraph haploGraph = vg::GBWTGraph(*gbwt, *graph); + + std::ifstream snarl_stream; + string snarl_file = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.snarls"; + snarl_stream.open(snarl_file); + + if (!snarl_stream) { + cerr << "error:[vg mod] Cannot open Snarls file " << snarl_file << endl; + exit(1); + } + + // run test code on all snarls in graph. + disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream); + + + // /// Run test code on a single snarl: + // vg::id_t source = 23493; vg::id_t sink = 23505; + // disambiguate_snarl(*graph, haploGraph, source, sink); + + } + + graph->serialize(std::cout); + delete graph; + + return 0; +} + +// Register subcommand +static Subcommand vg_normalize("normalize", "edit snarls to reduce information duplication", TOOLKIT, main_normalize); + + + + + + + + + + + + + + + + + +//TODO: Remove JUNK: + + // vg::id_t source = 23251;//for robin_haplotypes/simple + // vg::id_t sink = 23257;//for robin_haplotypes/simple + // /Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. + // vg::id_t source = 23251;//for robin_haplotypes/simple + // vg::id_t sink = 23257;//for robin_haplotypes/simple + // clean_snarl_from_haplotypes(*graph, source, sink); + // cerr << "done!" 
<< endl; + // vg::handle_t source_handle = graph->get_handle(source); + // vg::handle_t sink_handle = graph->get_handle(sink); + + // vector haplotypes = depth_first_haplotypes_to_strings(*graph, source, sink); + // cerr << "finished depth_first, now on to reference." << endl; + // vector reference = get_paths(*graph, source_handle, sink_handle); + + // haplotypes.insert(end(haplotypes), begin(reference), end(reference)); + + // cerr << "here goes!" << endl; + // for(string haplotype : haplotypes) { + + // cerr << haplotype << endl; + // } + // cerr << "done" << endl; + + + + + + + + + + + + + + + + + // std::ifstream snarl_stream; + // snarl_stream.open(demo_0); + + // if (!snarl_stream) { + // cerr << "error:[vg mod] Cannot open Snarls file " << demo_0 << endl; + // exit(1); + // } + + // clean_all_snarls(*graph, snarl_stream); + + // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; + From 488df51b5cde29eccfabb662214ebe913faac8c3 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 1 Jul 2019 14:31:04 -0700 Subject: [PATCH 14/63] jemalloc compilation issue fixed. Changes to be committed: modified: Makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5cc724877a3..03f78ad9519 100644 --- a/Makefile +++ b/Makefile @@ -262,8 +262,8 @@ endif $(BIN_DIR)/vg: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIGURATION_OBJ) $(DEPS) . 
./source_me.sh && $(CXX) $(CXXFLAGS) -o $(BIN_DIR)/vg $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIGURATION_OBJ) -lvg $(LD_INCLUDE_FLAGS) $(LD_LIB_FLAGS) $(ROCKSDB_LDFLAGS) -static: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) - $(CXX) $(CXXFLAGS) -o $(BIN_DIR)/vg $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) -lvg $(STATIC_FLAGS) $(LD_INCLUDE_FLAGS) $(LD_LIB_FLAGS) $(ROCKSDB_LDFLAGS) +static: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIGURATION_OBJ) $(DEPS) $(LINK_DEPS) + $(CXX) $(CXXFLAGS) -o $(BIN_DIR)/vg $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIGURATION_OBJ) -lvg $(STATIC_FLAGS) $(LD_INCLUDE_FLAGS) $(LD_LIB_FLAGS) $(ROCKSDB_LDFLAGS) $(LIB_DIR)/libvg.a: $(OBJ) $(ALGORITHMS_OBJ) $(IO_OBJ) $(DEP_OBJ) $(DEPS) rm -f $@ From 3cddb6c4cee33ad422439cbf5d534ffb8793d601 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 1 Jul 2019 15:03:44 -0700 Subject: [PATCH 15/63] reverted mod_main.cpp (hackily) and continued to edit 0_draft_haplotype_realignment.cpp to debug move embedded paths. --- .../0_draft_haplotype_realignment.cpp | 6 + src/subcommand/mod_main.cpp | 108 +----------------- 2 files changed, 10 insertions(+), 104 deletions(-) diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index ba57ad05bb8..d8600daaedd 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -876,6 +876,12 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // since it could begin in the middle of the handle. vector starting_indices = check_handle_as_start_of_path_seq(handle_seq, path_seq); + //TODO: debug_code: indices of start? + cerr << "indices of start?" 
<< endl; + for (auto start : starting_indices){ + cerr << start << " "; + } + cerr << endl; // if there is a starting index, if (starting_indices.size() != 0) { // if the starting_indices implies that the starting handle entirely contains diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index ce35afef577..36c2194f3a1 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -17,11 +17,6 @@ #include "../algorithms/topological_sort.hpp" #include "../algorithms/remove_high_degree.hpp" -#include "../algorithms/0_draft_haplotype_realignment.hpp" -#include "../gbwt_helper.hpp" -#include "../stream/vpkg.hpp" - - using namespace std; using namespace vg; using namespace vg::subcommand; @@ -80,9 +75,7 @@ void help_mod(char** argv) { << " -a, --cactus convert to cactus graph representation" << endl << " -v, --sample-vcf FILE for a graph with allele paths, compute the sample graph from the given VCF" << endl << " -G, --sample-graph FILE subset an augmented graph to a sample graph using a Locus file" << endl - << " -t, --threads N for tasks that can be done in parallel, use this many threads" << endl - << " -F, --demo_0 FILE Given a .snarls file (from command vg snarls) and the corresponding graph," << endl - << " simplifies redundancy in graph's snarls." 
<< endl; + << " -t, --threads N for tasks that can be done in parallel, use this many threads" << endl; } int main_mod(int argc, char** argv) { @@ -130,7 +123,6 @@ int main_mod(int argc, char** argv) { string vcf_filename; string loci_filename; int max_degree = 0; - string demo_0; int c; optind = 2; // force optind past command positional argument @@ -183,12 +175,11 @@ int main_mod(int argc, char** argv) { {"sample-vcf", required_argument, 0, 'v'}, {"sample-graph", required_argument, 0, 'G'}, {"max-degree", required_argument, 0, 'M'}, - {"demo_0", required_argument, 0, 'F'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hk:oi:q:Q:cpl:e:mt:SX:KPsunzNAf:CDr:Ig:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:F:", + c = getopt_long (argc, argv, "hk:oi:q:Q:cpl:e:mt:SX:KPsunzNAf:CDr:Ig:x:RTU:Bbd:Ow:L:y:Z:Eav:G:M:", long_options, &option_index); @@ -374,10 +365,6 @@ int main_mod(int argc, char** argv) { max_degree = parse(optarg); break; - case 'F': - demo_0 = optarg; - break; - case 'h': case '?': help_mod(argv); @@ -771,99 +758,12 @@ int main_mod(int argc, char** argv) { graph->paths = Paths(); } - if ( !demo_0.empty() ) { - /// Build the gbwt: - ifstream gbwt_stream; - string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new_2.gbwt"; //Nodes 23493 to 23505 - gbwt_stream.open(gbwt_name); + graph->serialize_to_ostream(std::cout); - // Load the GBWT from its container - unique_ptr gbwt; - gbwt = stream::VPKG::load_one(gbwt_stream); - GBWTGraph haploGraph = vg::GBWTGraph(*gbwt, *graph); - - /// Run test code: - vg::id_t source = 23493; vg::id_t sink = 23505; - pair< vector< vector >, vector< vector > > haplotypes = extract_haplotypes(haploGraph, source, sink); - align_haplotypes(haploGraph, haplotypes); - - } - - // graph->serialize_to_ostream(std::cout); delete graph; return 0; } // Register subcommand -static Subcommand vg_mod("mod", "filter, transform, and edit the graph", TOOLKIT, main_mod); - - - - - - - - - - - - - - - - - - 
-//TODO: Remove JUNK: - - // vg::id_t source = 23251;//for robin_haplotypes/simple - // vg::id_t sink = 23257;//for robin_haplotypes/simple - // /Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. - // vg::id_t source = 23251;//for robin_haplotypes/simple - // vg::id_t sink = 23257;//for robin_haplotypes/simple - // clean_snarl_from_haplotypes(*graph, source, sink); - // cerr << "done!" << endl; - // vg::handle_t source_handle = graph->get_handle(source); - // vg::handle_t sink_handle = graph->get_handle(sink); - - // vector haplotypes = depth_first_haplotypes_to_strings(*graph, source, sink); - // cerr << "finished depth_first, now on to reference." << endl; - // vector reference = get_paths(*graph, source_handle, sink_handle); - - // haplotypes.insert(end(haplotypes), begin(reference), end(reference)); - - // cerr << "here goes!" << endl; - // for(string haplotype : haplotypes) { - - // cerr << haplotype << endl; - // } - // cerr << "done" << endl; - - - - - - - - - - - - - - - - - // std::ifstream snarl_stream; - // snarl_stream.open(demo_0); - - // if (!snarl_stream) { - // cerr << "error:[vg mod] Cannot open Snarls file " << demo_0 << endl; - // exit(1); - // } - - // clean_all_snarls(*graph, snarl_stream); - - // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; - +static Subcommand vg_mod("mod", "filter, transform, and edit the graph", TOOLKIT, main_mod); \ No newline at end of file From 5dfb8f3d6e913440499c55a07c29f34f8ee75ef3 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 5 Jul 2019 15:34:02 -0700 Subject: [PATCH 16/63] embedded paths now move to new snarls successfully. 
--- robin_bash/normalize_snarls.sh | 133 ++++++++++++ .../0_draft_haplotype_realignment.cpp | 204 +++++++++--------- .../0_draft_haplotype_realignment.hpp | 22 +- src/msa_converter.cpp | 3 +- src/subcommand/0_normalize_main.cpp | 6 +- 5 files changed, 252 insertions(+), 116 deletions(-) create mode 100755 robin_bash/normalize_snarls.sh diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh new file mode 100755 index 00000000000..24654426c56 --- /dev/null +++ b/robin_bash/normalize_snarls.sh @@ -0,0 +1,133 @@ +#!/bin/bash +## to give permission to run script: +# chmod +x test_F_on_x.sh +## this bash to be run on dir /vg/ + +# VG=chr10_subgraph_2dels-shift-729006 +# VG=clean_chr10_subgraph_0_new +# VG=chr10_subgraph_2dels-323159 +# VG=chr10_subgraph_3ins-1558671 + +## useful debug tools: +# export VG_FULL_TRACEBACK=1 +# valgrind vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg +## in terminal: +# gdb vg +# run mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg + +export VG_FULL_TRACEBACK=1 +set -e + +##for testing GBWT: +. ./source_me.sh && make -j 8 +echo running! + +#hg-oriented commands for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). +vg normalize -n test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.hg >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg +vg convert -a test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg -V >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg +./bin/vg view -dpn test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg | + dot -Tsvg -o test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.svg +chromium-browser test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.svg + +# #for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). 
[the last use of mod_main before using normalize_main]. +# ./bin/vg mod -F blah test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.vg >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.vg +# ./bin/vg view -dpn test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.vg| \ +# dot -Tsvg -o test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.svg +# chromium-browser test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.svg + +# vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg + +# To produce a gbwt file: +# vg index -x chr10_subgraph_0_new.xg -G chr10_subgraph_0_new.gbwt -v HGSVC.haps.chr10.vcf.gz chr10_subgraph_0_new.vg + +# vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg >test/robin_snarl_examples/cleaned_snarl.vg +# # valgrind vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg +# bin/vg view -dpn test/robin_snarl_examples/chr10_subgraph_0_new.vg| \ +# dot -Tsvg -o test/robin_snarl_examples/chr10_subgraph_0_new_1.svg +# chromium-browser test/robin_snarl_examples/chr10_subgraph_0_new_1.svg + +# vg mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg +# vg mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg >test/robin_haplotypes/simple/modified_snarl/modified_snarl.vg +# bin/vg view -dpn test/robin_haplotypes/simple/modified_snarl/modified_snarl.vg| \ +# dot -Tsvg -o test/robin_haplotypes/simple/modified_snarl/modified_snarl.svg +# chromium-browser test/robin_haplotypes/simple/modified_snarl/modified_snarl.svg + +# gdb vg mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg + +##for looking at snarl number: +# . ./source_me.sh && make -j 8 +# bin/vg snarls test/$VG.vg > test/$VG.snarls +# bin/vg mod -F test/$VG.snarls test/$VG.vg + +## for running clean_all_snarls: +# . 
./source_me.sh && make -j 8 +# bin/vg snarls test/$VG.vg > test/$VG.snarls +# bin/vg mod -F test/$VG.snarls test/$VG.vg >test/clean_$VG.vg +# bin/vg view -dpn test/clean_$VG.vg| \ +# dot -Tsvg -o test/clean_$VG.svg +# chromium-browser test/clean_$VG.svg + +# . ./source_me.sh && make -j 8 +# bin/vg mod -F "220 218" test/0_cluttered_snarl_simple.vg # for misc. terminal output (0_demo_final_0) +# bin/vg mod -F "1 6" test/bash_x_out_1.vg # for misc. terminal output (old) + +##View "cluttered" snarl for 0_demo_final_0: +# bin/vg view -dpn test/0_cluttered_snarl_simple.vg| \ +# dot -Tsvg -o test/0_cluttered_snarl_simple.svg +# chromium-browser test/0_cluttered_snarl_simple.svg + +##Testing 0_demo_final_0 on 0_cluttered_snarl_simple.vg: +# bin/vg mod -F "220 218" test/0_cluttered_snarl_simple.vg >test/0_clean_snarl_simple.vg +# bin/vg view -dpn test/0_clean_snarl_simple.vg| \ +# dot -Tsvg -o test/0_clean_snarl_simple.svg +# chromium-browser test/0_clean_snarl_simple.svg + +##Testing 0_demo_final_0 clean_all_snarls on 0_cluttered_snarl_simple.vg with snarl: +# bin/vg snarls test/0_cluttered_snarl_simple.vg > test/0_cluttered_snarl_simple.snarls + +##view terminal output: +# bin/vg mod -F "test/0_cluttered_snarl_simple.snarls" test/0_cluttered_snarl_simple.vg + +##view graph: +# bin/vg mod -F "test/0_cluttered_snarl_simple.snarls" test/0_cluttered_snarl_simple.vg >test/0_clean_all_snarl_simple.vg +# bin/vg view -dpn test/0_clean_all_snarl_simple.vg| \ +# dot -Tsvg -o test/0_clean_all_snarl_simple.svg +# chromium-browser test/0_clean_all_snarl_simple.svg + +##view snarls: +# vg view -Rj test/0_cluttered_snarl_simple.snarls + +## The following was for when F was adding 6 T's to all the nodes in the first snarl. 
+## its output was bash_x_out_1.vg and bash_x_out_1.svg: + +# bin/vg mod -F "1 6" test/x.vg >test/bash_x_out_1.vg +# bin/vg view -dpn test/bash_x_out_1.vg| \ +# dot -Tsvg -o test/bash_x_out_1.svg + +# chromium-browser test/bash_x_out_1.svg + +## Now, using graph bash_x_out.vg, consolidate duplicated T's into a single node. + +# bin/vg mod -F "1 6" test/bash_x_out_1.vg >test/bash_x_out_clean_snarl.vg +# bin/vg view -dpn test/bash_x_out_1_clean_snarl.vg| \ +# dot -Tsvg -o test/bash_x_out_1_clean_snarl.svg + +# chromium-browser test/bash_x_out_1_clean_snarl.svg + +## testing 0_demo_align_strings.cpp + +# bin/vg mod -F "1 6" test/bash_x_out_1.vg >test/0_align_strings.vg +# bin/vg view -dpn test/0_align_strings.vg| \ +# dot -Tsvg -o test/0_align_strings.svg + +# chromium-browser test/0_align_strings.svg + +# bin/vg mod -F "1 6" test/bash_x_out_1.vg >test/0_align_strings.vg +# bin/vg view -dpn test/0_align_strings.vg| \ +# dot -Tsvg -o test/0_align_strings.svg + +# chromium-browser test/0_align_strings.svg + +# vg view -dpn chr10_subgraph_0_new.vg| \ +# dot -Tsvg -o chr10_subgraph_0_new_2.svg +# chromium-browser chr10_subgraph_0_new_2.svg diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index d8600daaedd..731b47eb9ae 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -47,21 +47,24 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, * cerr << "number of total snarls in graph: " << general_count << endl; */ - int i = 0; + int num_snarls_normalized = 0; + int num_snarls_skipped = 0; vector snarl_roots = snarl_manager->top_level_snarls(); for (auto roots : snarl_roots) { - // TODO: debug_code: - cerr << "return to root node ids, disambiguate snarl with.. 
" << endl; - cerr << "root node ids: " << roots->start().node_id() << " " - << roots->end().node_id() << endl; - disambiguate_snarl(graph, haploGraph, roots->start().node_id(), - roots->end().node_id()); - i += 1; - cerr << endl << endl << "normalized " << i << " snarl(s)." << endl; - if (i == 2) { - break; + bool success = disambiguate_snarl(graph, haploGraph, roots->start().node_id(), + roots->end().node_id()); + if (success) { + num_snarls_normalized += 1; + } else { + num_snarls_skipped += 1; } } + cerr << endl + << "normalized " << num_snarls_normalized << " snarl(s), skipped " + << num_snarls_skipped + << " snarls b/c they had haplotypes starting/ending in the middle " + "of the snarl." + << endl; delete snarl_manager; } @@ -79,7 +82,7 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, // Returns: none. // TODO: allow for snarls that have haplotypes that begin or end in the middle of the // snarl. -void disambiguate_snarl(MutablePathDeletableHandleGraph &graph, +bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id) { cerr << "disambiguate_snarl" << endl; @@ -114,22 +117,14 @@ void disambiguate_snarl(MutablePathDeletableHandleGraph &graph, vector> embedded_paths = extract_embedded_paths_in_snarl(graph, source_id, sink_id); - cerr << "paths: " << endl; - for (auto path : embedded_paths){ - cerr << " path " << graph.get_path_name(graph.get_path_handle_of_step(path.first)) << endl; - for (auto step : {path.first, graph.get_previous_step(path.second)}){ - cerr << "\t" << graph.get_id(graph.get_handle_of_step(step)) << " "; - } - cerr << endl; - } - // integrate the new_snarl into the graph, removing the old snarl as you go. integrate_snarl(graph, new_snarl, embedded_paths, source_id, sink_id); cerr << endl; - + return true; } else { cerr << "found a snarl with haplotypes in the middle. 
Start: " << source_id << " sink is " << sink_id << endl; + return false; } } @@ -511,7 +506,6 @@ VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotyp sink, true, [&](const handle_t &handle) { snarl.create_edge(handle, new_sink); }); snarl.destroy_handle(sink); - // snarl.serialize_to_ostream(cerr); return snarl; } @@ -757,23 +751,29 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, }); } + // save the source and sink values of new_snarl_topo_order, since topological order is + // not necessarily preserved by move_path_to_snarl. Is temporary b/c we need to + // replace the handles with ones with the right id_t label for source and sink later + // on. + id_t temp_snarl_source_id = graph.get_id(new_snarl_topo_order.front()); + id_t temp_snarl_sink_id = graph.get_id(new_snarl_topo_order.back()); + // Add the neighbors of the source and sink of the original snarl to the new_snarl's // source and sink. // source integration: - for (bool id : {source_id, sink_id}) { - graph.follow_edges(graph.get_handle(source_id), true, - [&](const handle_t &prev_handle) { - graph.create_edge(prev_handle, new_snarl_topo_order[0]); - }); - graph.follow_edges( - graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { - graph.create_edge(new_snarl_topo_order.back(), next_handle); - }); - } + graph.follow_edges( + graph.get_handle(source_id), true, [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, graph.get_handle(temp_snarl_source_id)); + }); + graph.follow_edges( + graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { + graph.create_edge(graph.get_handle(temp_snarl_sink_id), next_handle); + }); // For each path of interest, move it onto the new_snarl. for (auto path : embedded_paths) { - move_path_to_snarl(graph, path, new_snarl_topo_order); + move_path_to_snarl(graph, path, new_snarl_topo_order, temp_snarl_source_id, + temp_snarl_sink_id); } // Destroy the old snarl. 
@@ -784,87 +784,79 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, // (for compatibility with future iterations on neighboring top-level snarls using the // same snarl manager. Couldn't replace it before b/c we needed the old handles to // move the paths. - handle_t new_source_handle = - graph.create_handle(graph.get_sequence(new_snarl_topo_order.front()), source_id); + handle_t new_source_handle = graph.create_handle( + graph.get_sequence(graph.get_handle(temp_snarl_source_id)), source_id); handle_t new_sink_handle = graph.create_handle(graph.get_sequence(new_snarl_topo_order.back()), sink_id); // move the source edges: // TODO: note the copy/paste. Ask if there's a better way to do this (I totally could // in Python!) - graph.follow_edges(new_snarl_topo_order.front(), true, + graph.follow_edges(graph.get_handle(temp_snarl_source_id), true, [&](const handle_t &prev_handle) { graph.create_edge(prev_handle, new_source_handle); }); - graph.follow_edges(new_snarl_topo_order.front(), false, + graph.follow_edges(graph.get_handle(temp_snarl_source_id), false, [&](const handle_t &next_handle) { graph.create_edge(new_source_handle, next_handle); }); // move the sink edges: - graph.follow_edges(new_snarl_topo_order.back(), true, + graph.follow_edges(graph.get_handle(temp_snarl_sink_id), true, [&](const handle_t &prev_handle) { graph.create_edge(prev_handle, new_sink_handle); }); - graph.follow_edges(new_snarl_topo_order.back(), false, + graph.follow_edges(graph.get_handle(temp_snarl_sink_id), false, [&](const handle_t &next_handle) { graph.create_edge(new_sink_handle, next_handle); }); // move the paths: - graph.for_each_step_on_handle(new_snarl_topo_order.front(), [&](step_handle_t step) { - graph.rewrite_segment(step, graph.get_next_step(step), - vector{new_source_handle}); - }); - graph.for_each_step_on_handle(new_snarl_topo_order.back(), [&](step_handle_t step) { - graph.rewrite_segment(step, graph.get_next_step(step), - vector{new_sink_handle}); - }); + 
graph.for_each_step_on_handle( + graph.get_handle(temp_snarl_source_id), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_source_handle}); + }); + graph.for_each_step_on_handle( + graph.get_handle(temp_snarl_sink_id), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_sink_handle}); + }); // delete the previously created source and sink: - for (handle_t handle : {new_snarl_topo_order.front(), new_snarl_topo_order.back()}) { + for (handle_t handle : + {graph.get_handle(temp_snarl_source_id), graph.get_handle(temp_snarl_sink_id)}) { graph.destroy_handle(handle); } } // Moves a path from its original location in the graph to a new snarl, // defined by a vector of interconnected handles. +// NOTE: the handles in new_snarl_handles may not preserve topological order after +// being passed to this method, if they were ordered before. // Arguments: graph: the graph containing the old_embedded_path and the handles in -// new_snarl_handles +// new_snarl_topo_order // old_embedded_path: a pair, where // pair.first is the first step_handle of interest in the // old_embedded_path, and pair.second is the step_handle *after* // the last step_handle of interest in the old_embedded_path (can // be the null step at the end of the path.) -// new_snarl_handles: all the handles in the new snarl, inside the graph. +// new_snarl_topo_order: all the handles in the new snarl, inside the graph. // Return: None. 
void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, const pair &old_embedded_path, - const vector &new_snarl_handles) { - cerr << endl << "move_path_to_snarl" << endl; - // cerr << "new_snarl_handles: " << endl; - // for (handle_t handle: new_snarl_handles){ - // cerr << graph.get_id(handle) << endl; - // } - cerr << "for path " - << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) - << endl; + vector &new_snarl_handles, id_t &source_id, + id_t &sink_id) { + cerr << "move_path_to_snarl" << endl; + // get the sequence associated with the path string path_seq; step_handle_t cur_step = old_embedded_path.first; - cerr << " old_embedded path looks like: " - << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) << " " - << graph.get_id(graph.get_handle_of_step( - graph.get_previous_step(old_embedded_path.second))) - << endl; - cerr << "this is the original path handle ids: "; + while (cur_step != old_embedded_path.second) { - cerr << graph.get_id(graph.get_handle_of_step(cur_step)); path_seq += graph.get_sequence(graph.get_handle_of_step(cur_step)); cur_step = graph.get_next_step(cur_step); } - cerr << endl; - cerr << "pathseq " << path_seq << endl; // for the given path, find every good possible starting handle in the new_snarl // format of pair is < possible_path_handle_vec, @@ -876,27 +868,20 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // since it could begin in the middle of the handle. vector starting_indices = check_handle_as_start_of_path_seq(handle_seq, path_seq); - //TODO: debug_code: indices of start? - cerr << "indices of start?" 
<< endl; - for (auto start : starting_indices){ - cerr << start << " "; - } - cerr << endl; + // if there is a starting index, if (starting_indices.size() != 0) { // if the starting_indices implies that the starting handle entirely contains // the path_seq of interest: + if ((handle_seq.size() - starting_indices.back()) >= path_seq.size()) { // then we've already found the full mapping location of the path! Move // the path, end the method. - // TODO: move the path to the new vector of handles, splitting start and - // end handles if need be. vector new_path{handle}; graph.rewrite_segment(old_embedded_path.first, old_embedded_path.second, new_path); return; } else { - cerr << "adding possible path at node " << graph.get_id(handle) << endl; // add it as a possible_path. vector possible_path_handle_vec{handle}; for (auto starting_index : starting_indices) { @@ -911,16 +896,20 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // for every possible path, extend it to determine if it really is the path we're // looking for: while (!possible_paths.empty()) { - // take a path off of possible_paths: - tuple, int, int> possible_path = possible_paths.back(); + // take a path off of possible_paths, which will be copied for every iteration through graph.follow_edges, below: + tuple, int, int> possible_path_query = possible_paths.back(); possible_paths.pop_back(); // extend the path through all right-extending edges to see if any subsequent - // paths still - // satisfy the requirements for bein a possible_path: + // paths still satisfy the requirements for being a possible_path: bool no_path = graph.follow_edges( - get<0>(possible_path).back(), false, [&](const handle_t &next) { + get<0>(possible_path_query).back(), false, [&](const handle_t &next) { + // make a copy to be extended for through each possible next handle in follow edges. + tuple, int, int> possible_path = possible_path_query; + + // extract relevant information to make code more readable. 
string next_seq = graph.get_sequence(next);
+                            id_t next_id = graph.get_id(next);
                             int &cur_index_in_path = get<2>(possible_path);

                             // if the next handle would be the ending handle for the path,
@@ -932,22 +921,42 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph,
                                         compare_length) == 0) {
                                     // we've found the new path! Move path to the new sequence, and
                                     // end the function.
-                                    get<0>(possible_path).push_back(next);
-                                    // TODO: move the path to the new vector of handles, splitting
-                                    // start and end handles if need be.
+
+                                    if (compare_length < next_seq.size()) {
+                                        // If the path ends before the end of next_seq, then split the
+                                        // handle so that the path ends flush with the end of the
+                                        // first of the two split handles.
+
+                                        // divide the handle where the path ends;
+                                        pair divided_next =
+                                            graph.divide_handle(next, compare_length);
+                                        get<0>(possible_path).push_back(divided_next.first);
+
+                                        // Special case if next is the sink or the source, to preserve
+                                        // the reassignment of source and sink ids in integrate_snarl.
+                                        if (next_id == sink_id) {
+                                            sink_id = graph.get_id(divided_next.second);
+                                        }
+
+                                        // TODO: NOTE: finding the old "next" handle is expensive.
+                                        // TODO: Use different container?
+                                        auto it = find(new_snarl_handles.begin(),
+                                                       new_snarl_handles.end(), next);
+
+                                        // replace the old invalidated handle with one of the new ones
+                                        *it = divided_next.first;
+                                        // stick the other new handle on the end of new_snarl_handles.
+                                        new_snarl_handles.push_back(divided_next.second);
+
+                                    } else {
+                                        // otherwise, the end of the path already coincides with the
+                                        // end of the handle. In that case, just add it to the path.
+                                        get<0>(possible_path).push_back(next);
+                                    }
                                     graph.rewrite_segment(old_embedded_path.first,
                                                           old_embedded_path.second,
                                                           get<0>(possible_path));
-                                    // TODO: test_code: show when we find a path:
-                                    cerr << "found a full path named "
-                                         << graph.get_path_name(graph.get_path_handle_of_step(
-                                                old_embedded_path.first))
-                                         << "! 
Here is the sequence of handles:" << endl; - for (handle_t handle : get<0>(possible_path)) { - cerr << graph.get_id(handle) << ": " - << graph.get_sequence(handle) << " " << endl; - } return false; } } @@ -985,11 +994,6 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) << " from the old snarl in the newly aligned snarl." << endl << endl; - cerr << "Here's the sequence of the path: " << path_seq << endl - << "Here's the start and end node ids of the path: " - << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) << " " - << graph.get_id(graph.get_handle_of_step(old_embedded_path.second)) << endl - << endl; } // Determines whether some subsequence in a handle satisfies the condition of being the diff --git a/src/algorithms/0_draft_haplotype_realignment.hpp b/src/algorithms/0_draft_haplotype_realignment.hpp index 932d663bcf9..cb31e1a7768 100644 --- a/src/algorithms/0_draft_haplotype_realignment.hpp +++ b/src/algorithms/0_draft_haplotype_realignment.hpp @@ -49,7 +49,7 @@ namespace vg { void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, ifstream &snarl_stream); -void disambiguate_snarl(MutablePathDeletableHandleGraph &graph, +bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id); @@ -81,7 +81,8 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, const HandleGraph & void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, const pair &old_embedded_path, - const vector &new_snarl_handles); + vector &new_snarl_handles, id_t &source_id, + id_t &sink_id); vector check_handle_as_start_of_path_seq(const string &handle_seq, const string &path_seq); @@ -108,9 +109,9 @@ void jordan_bug(MutablePathDeletableHandleGraph& graph){ handle_t example = graph.get_handle(23448); handle_t replacement = graph.create_handle("GATTACA", 
1); - // move the source edges: - //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could in Python!) - graph.follow_edges(example, true, + // move the source edges: + //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could +in Python!) graph.follow_edges(example, true, [&](const handle_t &prev_handle) { graph.create_edge(prev_handle, replacement); }); @@ -122,7 +123,8 @@ void jordan_bug(MutablePathDeletableHandleGraph& graph){ // move the paths: graph.for_each_step_on_handle(example, [&](step_handle_t step) { - graph.rewrite_segment(step, graph.get_next_step(step), vector{replacement}); + graph.rewrite_segment(step, graph.get_next_step(step), +vector{replacement}); }); // example with two nodes: @@ -131,9 +133,9 @@ void jordan_bug(MutablePathDeletableHandleGraph& graph){ handle_t replacement_2 = graph.create_handle("GATTACA", 3); graph.create_edge(replacement_1, replacement_2); - // move the source edges: - //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could in Python!) - graph.follow_edges(example_1, true, + // move the source edges: + //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could +in Python!) 
graph.follow_edges(example_1, true, [&](const handle_t &prev_handle) { graph.create_edge(prev_handle, replacement_1); }); @@ -149,5 +151,3 @@ void jordan_bug(MutablePathDeletableHandleGraph& graph){ }); } */ - - diff --git a/src/msa_converter.cpp b/src/msa_converter.cpp index fa75a993a9d..e191a2361a5 100644 --- a/src/msa_converter.cpp +++ b/src/msa_converter.cpp @@ -7,8 +7,7 @@ #include "vg.hpp" #include "msa_converter.hpp" -//TODO: remove definition -#define debug_msa_converter +// #define debug_msa_converter namespace vg { diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index ff7b187f4ae..cf9b5441892 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -6,7 +6,7 @@ #include "subcommand.hpp" -#include "../hash_graph.hpp" +#include "../../include/sglib/hash_graph.hpp" #include "../algorithms/0_draft_haplotype_realignment.hpp" #include "../gbwt_helper.hpp" #include "../../include/vg/io/vpkg.hpp" @@ -61,9 +61,9 @@ int main_normalize(int argc, char** argv) { } } - HashGraph* graph; + sglib::HashGraph* graph; get_input_file(optind, argc, argv, [&](istream& in) { - graph = new HashGraph(in); + graph = new sglib::HashGraph(in); }); if ( normalize ) From d92fa392f792ef71bb75b862bc83870082bdd3d8 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Tue, 9 Jul 2019 13:55:32 -0700 Subject: [PATCH 17/63] shell script update --- robin_bash/normalize_snarls.sh | 165 +++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100755 robin_bash/normalize_snarls.sh diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh new file mode 100755 index 00000000000..36c7a59feff --- /dev/null +++ b/robin_bash/normalize_snarls.sh @@ -0,0 +1,165 @@ +#!/bin/bash +## to give permission to run script: +# chmod +x test_F_on_x.sh +## this bash to be run on dir /vg/ + +# VG=chr10_subgraph_2dels-shift-729006 +# VG=clean_chr10_subgraph_0_new +# VG=chr10_subgraph_2dels-323159 +# 
VG=chr10_subgraph_3ins-1558671
+
+## useful debug tools:
+# export VG_FULL_TRACEBACK=1
+# valgrind vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg
+## in terminal:
+# gdb vg
+# run mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg
+
+export VG_FULL_TRACEBACK=0
+set -e
+
+echo compiling!
+. ./source_me.sh && make -j 8
+echo running!
+
+##running normalize_snarls on a full chromosome.
+VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10
+TEST_DIR=test/robin_chromosomes/chr10
+FILE_BASENAME=hgsvc_chr10_construct
+
+# Jordan's buggy command example
+vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -V >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.vg
+echo halfway_there
+vg convert -a test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.pg
+echo all_done
+
+# # To produce .snarls:
+# vg snarls $VG_DIR/$FILE_BASENAME.vg >$TEST_DIR/$FILE_BASENAME.snarls
+# echo "SNARLS MADE"
+# # To produce .gbwt:
+# vg index -G $TEST_DIR/$FILE_BASENAME.gbwt -v $VG_DIR/HGSVC.haps.chr10.vcf.gz $VG_DIR/$FILE_BASENAME.vg
+# echo "GBWT MADE"
+# Convert .vg to .hg:
+# vg convert -v $VG_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg
+# echo "CONVERTED VG TO HG"
+# Run normalize algorithm:
+vg normalize -n -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/${FILE_BASENAME}_normalized.hg
+echo "NORMALIZED HG MADE"
+# convert .hg to .vg
+vg convert -a $TEST_DIR/${FILE_BASENAME}_normalized.hg -V >$TEST_DIR/${FILE_BASENAME}_normalized.vg
+echo "CONVERTED BACK TO VG."
+# # visualize +# ./bin/vg view -dpn $TEST_DIR/$FILE_BASENAME_normalized.vg| \ +# dot -Tsvg -o $TEST_DIR/$FILE_BASENAME_normalized.svg +# # chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg + +# #hg-oriented commands for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). +# vg normalize -n test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.hg >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg +# vg convert -a test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg -V >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg +# ./bin/vg view -dpn test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg | +# dot -Tsvg -o test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.svg +# chromium-browser test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.svg + +# #for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). [the last use of mod_main before using normalize_main]. 
+# ./bin/vg mod -F blah test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.vg >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.vg +# ./bin/vg view -dpn test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.vg| \ +# dot -Tsvg -o test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.svg +# chromium-browser test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.svg + +# vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg + +# To produce a gbwt file: +# vg index -x chr10_subgraph_0_new.xg -G chr10_subgraph_0_new.gbwt -v HGSVC.haps.chr10.vcf.gz chr10_subgraph_0_new.vg + +# vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg >test/robin_snarl_examples/cleaned_snarl.vg +# # valgrind vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg +# bin/vg view -dpn test/robin_snarl_examples/chr10_subgraph_0_new.vg| \ +# dot -Tsvg -o test/robin_snarl_examples/chr10_subgraph_0_new_1.svg +# chromium-browser test/robin_snarl_examples/chr10_subgraph_0_new_1.svg + +# vg mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg +# vg mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg >test/robin_haplotypes/simple/modified_snarl/modified_snarl.vg +# bin/vg view -dpn test/robin_haplotypes/simple/modified_snarl/modified_snarl.vg| \ +# dot -Tsvg -o test/robin_haplotypes/simple/modified_snarl/modified_snarl.svg +# chromium-browser test/robin_haplotypes/simple/modified_snarl/modified_snarl.svg + +# gdb vg mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg + +##for looking at snarl number: +# . ./source_me.sh && make -j 8 +# bin/vg snarls test/$VG.vg > test/$VG.snarls +# bin/vg mod -F test/$VG.snarls test/$VG.vg + +## for running clean_all_snarls: +# . 
./source_me.sh && make -j 8 +# bin/vg snarls test/$VG.vg > test/$VG.snarls +# bin/vg mod -F test/$VG.snarls test/$VG.vg >test/clean_$VG.vg +# bin/vg view -dpn test/clean_$VG.vg| \ +# dot -Tsvg -o test/clean_$VG.svg +# chromium-browser test/clean_$VG.svg + +# . ./source_me.sh && make -j 8 +# bin/vg mod -F "220 218" test/0_cluttered_snarl_simple.vg # for misc. terminal output (0_demo_final_0) +# bin/vg mod -F "1 6" test/bash_x_out_1.vg # for misc. terminal output (old) + +##View "cluttered" snarl for 0_demo_final_0: +# bin/vg view -dpn test/0_cluttered_snarl_simple.vg| \ +# dot -Tsvg -o test/0_cluttered_snarl_simple.svg +# chromium-browser test/0_cluttered_snarl_simple.svg + +##Testing 0_demo_final_0 on 0_cluttered_snarl_simple.vg: +# bin/vg mod -F "220 218" test/0_cluttered_snarl_simple.vg >test/0_clean_snarl_simple.vg +# bin/vg view -dpn test/0_clean_snarl_simple.vg| \ +# dot -Tsvg -o test/0_clean_snarl_simple.svg +# chromium-browser test/0_clean_snarl_simple.svg + +##Testing 0_demo_final_0 clean_all_snarls on 0_cluttered_snarl_simple.vg with snarl: +# bin/vg snarls test/0_cluttered_snarl_simple.vg > test/0_cluttered_snarl_simple.snarls + +##view terminal output: +# bin/vg mod -F "test/0_cluttered_snarl_simple.snarls" test/0_cluttered_snarl_simple.vg + +##view graph: +# bin/vg mod -F "test/0_cluttered_snarl_simple.snarls" test/0_cluttered_snarl_simple.vg >test/0_clean_all_snarl_simple.vg +# bin/vg view -dpn test/0_clean_all_snarl_simple.vg| \ +# dot -Tsvg -o test/0_clean_all_snarl_simple.svg +# chromium-browser test/0_clean_all_snarl_simple.svg + +##view snarls: +# vg view -Rj test/0_cluttered_snarl_simple.snarls + +## The following was for when F was adding 6 T's to all the nodes in the first snarl. 
+## its output was bash_x_out_1.vg and bash_x_out_1.svg: + +# bin/vg mod -F "1 6" test/x.vg >test/bash_x_out_1.vg +# bin/vg view -dpn test/bash_x_out_1.vg| \ +# dot -Tsvg -o test/bash_x_out_1.svg + +# chromium-browser test/bash_x_out_1.svg + +## Now, using graph bash_x_out.vg, consolidate duplicated T's into a single node. + +# bin/vg mod -F "1 6" test/bash_x_out_1.vg >test/bash_x_out_clean_snarl.vg +# bin/vg view -dpn test/bash_x_out_1_clean_snarl.vg| \ +# dot -Tsvg -o test/bash_x_out_1_clean_snarl.svg + +# chromium-browser test/bash_x_out_1_clean_snarl.svg + +## testing 0_demo_align_strings.cpp + +# bin/vg mod -F "1 6" test/bash_x_out_1.vg >test/0_align_strings.vg +# bin/vg view -dpn test/0_align_strings.vg| \ +# dot -Tsvg -o test/0_align_strings.svg + +# chromium-browser test/0_align_strings.svg + +# bin/vg mod -F "1 6" test/bash_x_out_1.vg >test/0_align_strings.vg +# bin/vg view -dpn test/0_align_strings.vg| \ +# dot -Tsvg -o test/0_align_strings.svg + +# chromium-browser test/0_align_strings.svg + +# vg view -dpn chr10_subgraph_0_new.vg| \ +# dot -Tsvg -o chr10_subgraph_0_new_2.svg +# chromium-browser chr10_subgraph_0_new_2.svg + From 0dfca26041852d7598e39732f450e5a05233d2f7 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 9 Jul 2019 15:02:59 -0700 Subject: [PATCH 18/63] update shell --- robin_bash/normalize_snarls.sh | 40 ++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index 32b16ec750d..dcc14c17810 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -22,16 +22,21 @@ echo compiling! . ./source_me.sh && make -j 8 echo running! -##running normalize_snarls on a full chromosome. 
-VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 -TEST_DIR=test/robin_chromosomes/chr10 -FILE_BASENAME=hgsvc_chr10_construct - -# Jordan's buggy command example -vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -V >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.vg -echo halfway_there -vg convert -a test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.pg -echo all_done +## testing vg convert +TEST_CONVERT_FILE=test/robin_haplotypes/test_convert/test_convert +vg convert -v $TEST_CONVERT_FILE.vg -A >$TEST_CONVERT_FILE.hg +vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg + +# ##running normalize_snarls on a full chromosome. +# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 +# TEST_DIR=test/robin_chromosomes/chr10 +# FILE_BASENAME=hgsvc_chr10_construct + +# # Jordan's buggy command example +# vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -V >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.vg +# echo halfway_there +# vg convert -a test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.pg +# echo all_done # # To produce .snarls: # vg snarls $VG_DIR/$FILE_BASENAME.vg >$TEST_DIR/$FILE_BASENAME.snarls @@ -42,12 +47,12 @@ echo all_done # Convert .vg to .hg: # vg convert -v $VG_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg # echo "CONVERTED VG TO HG" -Run normalize algorithm: -vg normalize -n -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg -echo "NORMALIZED HG MADE" -# convert .hg to .vg -vg convert -a $TEST_DIR/$FILE_BASENAME_normalized.hg -V 
$TEST_DIR/$FILE_BASENAME_normalized.vg -echo "CONVERTED BACK TO VG." +# # Run normalize algorithm: +# vg normalize -n -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +# echo "NORMALIZED HG MADE" +# # convert .hg to .vg +# vg convert -a $TEST_DIR/$FILE_BASENAME_normalized.hg -V $TEST_DIR/$FILE_BASENAME_normalized.vg +# echo "CONVERTED BACK TO VG." # # visualize # ./bin/vg view -dpn $TEST_DIR/$FILE_BASENAME_normalized.vg| \ # dot -Tsvg -o $TEST_DIR/$FILE_BASENAME_normalized.svg @@ -162,7 +167,4 @@ echo "CONVERTED BACK TO VG." # vg view -dpn chr10_subgraph_0_new.vg| \ # dot -Tsvg -o chr10_subgraph_0_new_2.svg # chromium-browser chr10_subgraph_0_new_2.svg -<<<<<<< HEAD -======= ->>>>>>> origin From 82d6e53db0368507191afff23dc3ebfe605c11f2 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Tue, 9 Jul 2019 16:18:44 -0700 Subject: [PATCH 19/63] added normalize arguments -g and -s --- robin_bash/normalize_snarls.sh | 23 ++++++++++++----------- src/subcommand/0_normalize_main.cpp | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index dcc14c17810..8e5dc4bac44 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -23,17 +23,18 @@ echo compiling! echo running! ## testing vg convert -TEST_CONVERT_FILE=test/robin_haplotypes/test_convert/test_convert -vg convert -v $TEST_CONVERT_FILE.vg -A >$TEST_CONVERT_FILE.hg -vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg +# TEST_CONVERT_FILE=test/robin_haplotypes/test_convert/test_convert +# TEST_CONVERT_FILE=test/robin_chromosomes/test_convert/test_convert +# vg convert -v $TEST_CONVERT_FILE.vg -A >$TEST_CONVERT_FILE.hg +# vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg -# ##running normalize_snarls on a full chromosome. 
-# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 -# TEST_DIR=test/robin_chromosomes/chr10 -# FILE_BASENAME=hgsvc_chr10_construct +##running normalize_snarls on a full chromosome. +VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 +TEST_DIR=test/robin_chromosomes/chr10 +FILE_BASENAME=hgsvc_chr10_construct # # Jordan's buggy command example -# vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -V >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.vg +# vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg # echo halfway_there # vg convert -a test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.pg # echo all_done @@ -47,9 +48,9 @@ vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg # Convert .vg to .hg: # vg convert -v $VG_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg # echo "CONVERTED VG TO HG" -# # Run normalize algorithm: -# vg normalize -n -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg -# echo "NORMALIZED HG MADE" +# Run normalize algorithm: +vg normalize -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +echo "NORMALIZED HG MADE" # # convert .hg to .vg # vg convert -a $TEST_DIR/$FILE_BASENAME_normalized.hg -V $TEST_DIR/$FILE_BASENAME_normalized.vg # echo "CONVERTED BACK TO VG." 
diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index cf9b5441892..9366d2e5dc4 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -22,6 +22,8 @@ void help_normalize(char** argv) { << endl << "options:" << endl << " -n, --normalize normalizes a currently-hardcoded snarl from a graph." << endl; + << " -g, --gbwt gbwt corresponding to hashgraph." << endl; + << " -s, --snarls snarls file corresponding to hashgraph." << endl; } int main_normalize(int argc, char** argv) { @@ -32,6 +34,8 @@ int main_normalize(int argc, char** argv) { } bool normalize = false; + string gbwt; + string snarls; int c; optind = 2; // force optind past command positional argument @@ -40,12 +44,13 @@ int main_normalize(int argc, char** argv) { { {"help", no_argument, 0, 'h'}, - {"normalize", no_argument, 0, 'n'}, //TODO: change no_argument to required_argument, assuming we want one. + {"gbwt", required_argument, 0, 'g'}, + {"snarls", required_argument, 0, 's'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "n", //TODO: change to "n:" later, when we have something to specify. 
+ c = getopt_long (argc, argv, "g:s:h", long_options, &option_index); @@ -56,9 +61,13 @@ int main_normalize(int argc, char** argv) { switch (c) { - case 'n': + case 'g': + gbwt = optarg; normalize = true; - } + + case 's': + snarls = optarg; + } sglib::HashGraph* graph; From a3931b6387e76f9dbdfa8ad7378c8b30ad5d9539 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Wed, 10 Jul 2019 11:19:57 -0700 Subject: [PATCH 20/63] added arguments to normalize main, further updates --- robin_bash/normalize_snarls.sh | 26 ++++++++++++++++++++------ src/subcommand/0_normalize_main.cpp | 10 +++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index 8e5dc4bac44..40544cfef7f 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -27,11 +27,25 @@ echo running! # TEST_CONVERT_FILE=test/robin_chromosomes/test_convert/test_convert # vg convert -v $TEST_CONVERT_FILE.vg -A >$TEST_CONVERT_FILE.hg # vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg +## testing vg normalize on smaller graph (checking that serialization still works): +TEST_NORMALIZE_FILE=test/robin_chromosomes/test_normalize/test_normalize +# To produce .snarls: +vg snarls $TEST_NORMALIZE_FILE.vg >$TEST_NORMALIZE_FILE.snarls +echo "SNARLS MADE" +# To produce .gbwt: +vg index -G $TEST_NORMALIZE_FILE.gbwt -v test/robin_chromosomes/test_normalize/HGSVC.haps.chr10.vcf.gz $TEST_NORMALIZE_FILE.vg +echo "GBWT MADE" +# Convert .vg to .hg: +vg convert -v $TEST_NORMALIZE_FILE.vg -A >$TEST_NORMALIZE_FILE.hg +echo "CONVERTED VG TO HG" +# Run normalize algorithm: +vg normalize -g TEST_NORMALIZE_FILE.gbwt -s TEST_NORMALIZE_FILE.snarls TEST_NORMALIZE_FILE.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +echo "normalized." -##running normalize_snarls on a full chromosome. 
-VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 -TEST_DIR=test/robin_chromosomes/chr10 -FILE_BASENAME=hgsvc_chr10_construct +# ##running normalize_snarls on a full chromosome. +# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 +# TEST_DIR=test/robin_chromosomes/chr10 +# FILE_BASENAME=hgsvc_chr10_construct # # Jordan's buggy command example # vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg @@ -49,8 +63,8 @@ FILE_BASENAME=hgsvc_chr10_construct # vg convert -v $VG_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg # echo "CONVERTED VG TO HG" # Run normalize algorithm: -vg normalize -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg -echo "NORMALIZED HG MADE" +# vg normalize -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +# echo "NORMALIZED HG MADE" # # convert .hg to .vg # vg convert -a $TEST_DIR/$FILE_BASENAME_normalized.hg -V $TEST_DIR/$FILE_BASENAME_normalized.vg # echo "CONVERTED BACK TO VG." diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 9366d2e5dc4..38f6b54915c 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -21,13 +21,14 @@ void help_normalize(char** argv) { << "Modifies snarls, outputs modified on stdout." << endl << endl << "options:" << endl - << " -n, --normalize normalizes a currently-hardcoded snarl from a graph." << endl; - << " -g, --gbwt gbwt corresponding to hashgraph." << endl; + << " -n, --normalize normalizes a currently-hardcoded snarl from a graph." << endl + << " -g, --gbwt gbwt corresponding to hashgraph." << endl << " -s, --snarls snarls file corresponding to hashgraph." 
<< endl; } int main_normalize(int argc, char** argv) { + if (argc == 2) { help_normalize(argv); return 1; @@ -67,7 +68,10 @@ int main_normalize(int argc, char** argv) { case 's': snarls = optarg; - + + // default: + // abort(); + } } sglib::HashGraph* graph; From 5f3f9550710eeaa25a49d5a32749c02d22d26b6a Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Wed, 10 Jul 2019 11:34:18 -0700 Subject: [PATCH 21/63] checking code runs on local machine --- robin_bash/normalize_snarls.sh | 60 +++++++++++++++-------------- src/subcommand/0_normalize_main.cpp | 3 +- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index 40544cfef7f..74788f29ec5 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -22,47 +22,38 @@ echo compiling! . ./source_me.sh && make -j 8 echo running! -## testing vg convert -# TEST_CONVERT_FILE=test/robin_haplotypes/test_convert/test_convert -# TEST_CONVERT_FILE=test/robin_chromosomes/test_convert/test_convert -# vg convert -v $TEST_CONVERT_FILE.vg -A >$TEST_CONVERT_FILE.hg -# vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg + + ## testing vg normalize on smaller graph (checking that serialization still works): -TEST_NORMALIZE_FILE=test/robin_chromosomes/test_normalize/test_normalize -# To produce .snarls: -vg snarls $TEST_NORMALIZE_FILE.vg >$TEST_NORMALIZE_FILE.snarls -echo "SNARLS MADE" -# To produce .gbwt: -vg index -G $TEST_NORMALIZE_FILE.gbwt -v test/robin_chromosomes/test_normalize/HGSVC.haps.chr10.vcf.gz $TEST_NORMALIZE_FILE.vg -echo "GBWT MADE" -# Convert .vg to .hg: -vg convert -v $TEST_NORMALIZE_FILE.vg -A >$TEST_NORMALIZE_FILE.hg -echo "CONVERTED VG TO HG" -# Run normalize algorithm: -vg normalize -g TEST_NORMALIZE_FILE.gbwt -s TEST_NORMALIZE_FILE.snarls TEST_NORMALIZE_FILE.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg -echo "normalized." 
+# TEST_NORMALIZE_FILE=test/robin_chromosomes/test_normalize/test_normalize +# # To produce .snarls: +# vg snarls $TEST_NORMALIZE_FILE.vg >$TEST_NORMALIZE_FILE.snarls +# echo "SNARLS MADE" +# # To produce .gbwt: +# vg index -G $TEST_NORMALIZE_FILE.gbwt -v test/robin_chromosomes/test_normalize/HGSVC.haps.chr10.vcf.gz $TEST_NORMALIZE_FILE.vg +# echo "GBWT MADE" +# # Convert .vg to .hg: +# vg convert -v $TEST_NORMALIZE_FILE.vg -A >$TEST_NORMALIZE_FILE.hg +# echo "CONVERTED VG TO HG" +# # Run normalize algorithm: +# vg normalize -g TEST_NORMALIZE_FILE.gbwt -s TEST_NORMALIZE_FILE.snarls TEST_NORMALIZE_FILE.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +# echo "normalized." # ##running normalize_snarls on a full chromosome. # VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 # TEST_DIR=test/robin_chromosomes/chr10 # FILE_BASENAME=hgsvc_chr10_construct -# # Jordan's buggy command example -# vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -# echo halfway_there -# vg convert -a test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.pg -# echo all_done - # # To produce .snarls: # vg snarls $VG_DIR/$FILE_BASENAME.vg >$TEST_DIR/$FILE_BASENAME.snarls # echo "SNARLS MADE" # # To produce .gbwt: # vg index -G $TEST_DIR/$FILE_BASENAME.gbwt -v $VG_DIR/HGSVC.haps.chr10.vcf.gz $VG_DIR/$FILE_BASENAME.vg # echo "GBWT MADE" -# Convert .vg to .hg: +# # Convert .vg to .hg: # vg convert -v $VG_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg # echo "CONVERTED VG TO HG" -# Run normalize algorithm: +# # Run normalize algorithm: # vg normalize -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg # echo "NORMALIZED HG MADE" # # convert .hg to .vg @@ -73,8 +64,21 @@ echo "normalized." 
# dot -Tsvg -o $TEST_DIR/$FILE_BASENAME_normalized.svg # # chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg -# #hg-oriented commands for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). -# vg normalize -n test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.hg >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg + +# ## Jordan's buggy command example +# vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg +# echo halfway_there +# vg convert -a test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.pg +# echo all_done +# ## testing vg convert +# TEST_CONVERT_FILE=test/robin_haplotypes/test_convert/test_convert +# TEST_CONVERT_FILE=test/robin_chromosomes/test_convert/test_convert +# vg convert -v $TEST_CONVERT_FILE.vg -A >$TEST_CONVERT_FILE.hg +# vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg + +##hg-oriented commands for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). 
+TEST=test/robin_haplotypes/threads_in_middle_example +vg normalize -g $TEST/chr10_subgraph_0_new.gbwt -s $TEST/chr10_subgraph_0_new.snarls $TEST/chr10_subgraph_0_new.hg >$TEST/cleaned_mid_hap_snarl.hg # vg convert -a test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg -V >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg # ./bin/vg view -dpn test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg | # dot -Tsvg -o test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.svg diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 38f6b54915c..3ea95d8db1b 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -21,7 +21,6 @@ void help_normalize(char** argv) { << "Modifies snarls, outputs modified on stdout." << endl << endl << "options:" << endl - << " -n, --normalize normalizes a currently-hardcoded snarl from a graph." << endl << " -g, --gbwt gbwt corresponding to hashgraph." << endl << " -s, --snarls snarls file corresponding to hashgraph." << endl; } @@ -69,7 +68,7 @@ int main_normalize(int argc, char** argv) { case 's': snarls = optarg; - // default: + // default: //TODO: get this to work, instead of always causing crash. // abort(); } } From 1f7435143b633e2e6cb48eac389dde32dd090f21 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Wed, 10 Jul 2019 12:07:20 -0700 Subject: [PATCH 22/63] shell update --- robin_bash/normalize_snarls.sh | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index 74788f29ec5..d461e616b67 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -25,25 +25,24 @@ echo running! 
## testing vg normalize on smaller graph (checking that serialization still works): -# TEST_NORMALIZE_FILE=test/robin_chromosomes/test_normalize/test_normalize -# # To produce .snarls: -# vg snarls $TEST_NORMALIZE_FILE.vg >$TEST_NORMALIZE_FILE.snarls -# echo "SNARLS MADE" -# # To produce .gbwt: -# vg index -G $TEST_NORMALIZE_FILE.gbwt -v test/robin_chromosomes/test_normalize/HGSVC.haps.chr10.vcf.gz $TEST_NORMALIZE_FILE.vg -# echo "GBWT MADE" -# # Convert .vg to .hg: -# vg convert -v $TEST_NORMALIZE_FILE.vg -A >$TEST_NORMALIZE_FILE.hg -# echo "CONVERTED VG TO HG" -# # Run normalize algorithm: -# vg normalize -g TEST_NORMALIZE_FILE.gbwt -s TEST_NORMALIZE_FILE.snarls TEST_NORMALIZE_FILE.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg -# echo "normalized." +TEST_NORMALIZE_FILE=test/robin_tests/test_normalize/chr10_subgraph_0_new +# To produce .snarls: +vg snarls $TEST_NORMALIZE_FILE.vg >$TEST_NORMALIZE_FILE.snarls +echo "SNARLS MADE" +# To produce .gbwt: +vg index -G $TEST_NORMALIZE_FILE.gbwt -v test/robin_chromosomes/test_normalize/HGSVC.haps.chr10.vcf.gz $TEST_NORMALIZE_FILE.vg +echo "GBWT MADE" +# Convert .vg to .hg: +vg convert -v $TEST_NORMALIZE_FILE.vg -A >$TEST_NORMALIZE_FILE.hg +echo "CONVERTED VG TO HG" +# Run normalize algorithm: +vg normalize -g TEST_NORMALIZE_FILE.gbwt -s TEST_NORMALIZE_FILE.snarls TEST_NORMALIZE_FILE.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +echo "normalized." # ##running normalize_snarls on a full chromosome. # VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 # TEST_DIR=test/robin_chromosomes/chr10 # FILE_BASENAME=hgsvc_chr10_construct - # # To produce .snarls: # vg snarls $VG_DIR/$FILE_BASENAME.vg >$TEST_DIR/$FILE_BASENAME.snarls # echo "SNARLS MADE" @@ -76,9 +75,9 @@ echo running! # vg convert -v $TEST_CONVERT_FILE.vg -A >$TEST_CONVERT_FILE.hg # vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg -##hg-oriented commands for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). 
-TEST=test/robin_haplotypes/threads_in_middle_example -vg normalize -g $TEST/chr10_subgraph_0_new.gbwt -s $TEST/chr10_subgraph_0_new.snarls $TEST/chr10_subgraph_0_new.hg >$TEST/cleaned_mid_hap_snarl.hg +# ##hg-oriented commands for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). +# TEST=test/robin_haplotypes/threads_in_middle_example +# vg normalize -g $TEST/chr10_subgraph_0_new.gbwt -s $TEST/chr10_subgraph_0_new.snarls $TEST/chr10_subgraph_0_new.hg >$TEST/cleaned_mid_hap_snarl.hg # vg convert -a test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg -V >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg # ./bin/vg view -dpn test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg | # dot -Tsvg -o test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.svg From b2549bd8f3aa413af38da5ff8203d4dce93f7038 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Wed, 10 Jul 2019 13:19:09 -0700 Subject: [PATCH 23/63] fixing issues bash, also debugging arguments in normalize main --- robin_bash/normalize_snarls.sh | 14 +++++++------- src/subcommand/0_normalize_main.cpp | 5 ++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index d461e616b67..b8ea90f1ddd 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -23,20 +23,20 @@ echo compiling! echo running! 
- ## testing vg normalize on smaller graph (checking that serialization still works): -TEST_NORMALIZE_FILE=test/robin_tests/test_normalize/chr10_subgraph_0_new +TEST_DIR=test/robin_tests/robin_haplotypes/threads_in_middle_example # To produce .snarls: -vg snarls $TEST_NORMALIZE_FILE.vg >$TEST_NORMALIZE_FILE.snarls +vg snarls $TEST_DIR/chr10_subgraph_0_new.vg >$TEST_DIR/new_remake_test/normalize.snarls echo "SNARLS MADE" # To produce .gbwt: -vg index -G $TEST_NORMALIZE_FILE.gbwt -v test/robin_chromosomes/test_normalize/HGSVC.haps.chr10.vcf.gz $TEST_NORMALIZE_FILE.vg +vg index -G $TEST_DIR/new_remake_test/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/chr10_subgraph_0_new.vg echo "GBWT MADE" # Convert .vg to .hg: -vg convert -v $TEST_NORMALIZE_FILE.vg -A >$TEST_NORMALIZE_FILE.hg +vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg +# vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg echo "CONVERTED VG TO HG" # Run normalize algorithm: -vg normalize -g TEST_NORMALIZE_FILE.gbwt -s TEST_NORMALIZE_FILE.snarls TEST_NORMALIZE_FILE.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +vg normalize -g $TEST_DIR/new_remake_test/normalize.gbwt -s $TEST_DIR/new_remake_test/normalize.snarls $TEST_DIR/new_remake_test/normalize.hg >$TEST_DIR/new_remake_test/normalize_out.hg echo "normalized." # ##running normalize_snarls on a full chromosome. @@ -76,7 +76,7 @@ echo "normalized." # vg convert -a $TEST_CONVERT_FILE.hg -V >$TEST_CONVERT_FILE.vg # ##hg-oriented commands for working on aligning haplotype in middle of snarl. (snarl nodes 23493-23505). 
-# TEST=test/robin_haplotypes/threads_in_middle_example +# TEST=test/robin_tests/robin_haplotypes/threads_in_middle_example # vg normalize -g $TEST/chr10_subgraph_0_new.gbwt -s $TEST/chr10_subgraph_0_new.snarls $TEST/chr10_subgraph_0_new.hg >$TEST/cleaned_mid_hap_snarl.hg # vg convert -a test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl.hg -V >test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg # ./bin/vg view -dpn test/robin_haplotypes/threads_in_middle_example/cleaned_mid_hap_snarl_from_hash.vg | diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 3ea95d8db1b..c2ef6e9011d 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -82,8 +82,7 @@ int main_normalize(int argc, char** argv) { { /// Build the gbwt: ifstream gbwt_stream; - string gbwt_name = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.gbwt"; //Nodes 23493 to 23505 - gbwt_stream.open(gbwt_name); + gbwt_stream.open(gbwt); // Load the GBWT from its container unique_ptr gbwt; @@ -91,7 +90,7 @@ int main_normalize(int argc, char** argv) { GBWTGraph haploGraph = vg::GBWTGraph(*gbwt, *graph); std::ifstream snarl_stream; - string snarl_file = "test/robin_haplotypes/threads_in_middle_example/chr10_subgraph_0_new.snarls"; + string snarl_file = snarls; snarl_stream.open(snarl_file); if (!snarl_stream) { From 422e42d11de0321cd28a728e72fd8b94ab65e653 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Wed, 10 Jul 2019 14:15:42 -0700 Subject: [PATCH 24/63] bash commands for normalize smaller graph works --- robin_bash/normalize_snarls.sh | 37 ++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index b8ea90f1ddd..749e0a379ab 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -23,25 +23,40 @@ echo compiling! echo running! 
-## testing vg normalize on smaller graph (checking that serialization still works): -TEST_DIR=test/robin_tests/robin_haplotypes/threads_in_middle_example +## testing vg normalize in Courtyard on smaller graph: +TEST_DIR=test/robin_tests/normalize_2 # To produce .snarls: -vg snarls $TEST_DIR/chr10_subgraph_0_new.vg >$TEST_DIR/new_remake_test/normalize.snarls +vg snarls $TEST_DIR/normalize.vg >$TEST_DIR/normalize.snarls echo "SNARLS MADE" # To produce .gbwt: -vg index -G $TEST_DIR/new_remake_test/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/chr10_subgraph_0_new.vg +vg index -G $TEST_DIR/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/normalize.vg echo "GBWT MADE" # Convert .vg to .hg: -vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg -# vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg +vg convert -v $TEST_DIR/normalize.vg -A >$TEST_DIR/normalize.hg echo "CONVERTED VG TO HG" # Run normalize algorithm: -vg normalize -g $TEST_DIR/new_remake_test/normalize.gbwt -s $TEST_DIR/new_remake_test/normalize.snarls $TEST_DIR/new_remake_test/normalize.hg >$TEST_DIR/new_remake_test/normalize_out.hg +vg normalize -g $TEST_DIR/normalize.gbwt -s $TEST_DIR/normalize.snarls $TEST_DIR/normalize.hg >$TEST_DIR/normalize_out.hg echo "normalized." 
+# ## testing vg normalize in local machine on smaller graph (checking that serialization still works): +# TEST_DIR=test/robin_tests/robin_haplotypes/threads_in_middle_example +# # To produce .snarls: +# vg snarls $TEST_DIR/chr10_subgraph_0_new.vg >$TEST_DIR/new_remake_test/normalize.snarls +# echo "SNARLS MADE" +# # To produce .gbwt: +# vg index -G $TEST_DIR/new_remake_test/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/chr10_subgraph_0_new.vg +# echo "GBWT MADE" +# # Convert .vg to .hg: +# vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg +# # vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg +# echo "CONVERTED VG TO HG" +# # Run normalize algorithm: +# vg normalize -g $TEST_DIR/new_remake_test/normalize.gbwt -s $TEST_DIR/new_remake_test/normalize.snarls $TEST_DIR/new_remake_test/normalize.hg >$TEST_DIR/new_remake_test/normalize_out.hg +# echo "normalized." + # ##running normalize_snarls on a full chromosome. # VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 -# TEST_DIR=test/robin_chromosomes/chr10 +# TEST_DIR=test/robin_tests/chr10 # FILE_BASENAME=hgsvc_chr10_construct # # To produce .snarls: # vg snarls $VG_DIR/$FILE_BASENAME.vg >$TEST_DIR/$FILE_BASENAME.snarls @@ -52,7 +67,7 @@ echo "normalized." # # Convert .vg to .hg: # vg convert -v $VG_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg # echo "CONVERTED VG TO HG" -# # Run normalize algorithm: +# Run normalize algorithm: # vg normalize -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg # echo "NORMALIZED HG MADE" # # convert .hg to .vg @@ -65,9 +80,9 @@ echo "normalized." 
# ## Jordan's buggy command example -# vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg +# vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A >test/robin_tests/chr10/hgsvc_chr10_construct_test.hg # echo halfway_there -# vg convert -a test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_chromosomes/chr10/hgsvc_chr10_construct_test.pg +# vg convert -a test/robin_tests/chr10/hgsvc_chr10_construct_test.hg -P >test/robin_tests/chr10/hgsvc_chr10_construct_test.pg # echo all_done # ## testing vg convert # TEST_CONVERT_FILE=test/robin_haplotypes/test_convert/test_convert From 3a862974784a81ac21486853d1761c4f37f9a962 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Wed, 10 Jul 2019 16:26:37 -0700 Subject: [PATCH 25/63] made a subset of chromosome 10 for debugging --- robin_bash/normalize_snarls.sh | 73 +++++++++++-------- .../0_draft_haplotype_realignment.cpp | 9 ++- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index 749e0a379ab..bee072816c7 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -23,36 +23,20 @@ echo compiling! echo running! -## testing vg normalize in Courtyard on smaller graph: -TEST_DIR=test/robin_tests/normalize_2 -# To produce .snarls: -vg snarls $TEST_DIR/normalize.vg >$TEST_DIR/normalize.snarls -echo "SNARLS MADE" -# To produce .gbwt: -vg index -G $TEST_DIR/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/normalize.vg -echo "GBWT MADE" -# Convert .vg to .hg: -vg convert -v $TEST_DIR/normalize.vg -A >$TEST_DIR/normalize.hg -echo "CONVERTED VG TO HG" -# Run normalize algorithm: -vg normalize -g $TEST_DIR/normalize.gbwt -s $TEST_DIR/normalize.snarls $TEST_DIR/normalize.hg >$TEST_DIR/normalize_out.hg -echo "normalized." 
+## run normalize_snarls on subsetted full chromosome 10: +TEST_DIR=test/robin_tests/chr10 +vg normalize -g $TEST_DIR/hgsvc_chr10_construct.gbwt -s $TEST_DIR/hgsvc_chr10_construct.snarls $TEST_DIR/hgsvc_chr10_construct.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +echo "NORMALIZED HG MADE" -# ## testing vg normalize in local machine on smaller graph (checking that serialization still works): -# TEST_DIR=test/robin_tests/robin_haplotypes/threads_in_middle_example -# # To produce .snarls: -# vg snarls $TEST_DIR/chr10_subgraph_0_new.vg >$TEST_DIR/new_remake_test/normalize.snarls -# echo "SNARLS MADE" -# # To produce .gbwt: -# vg index -G $TEST_DIR/new_remake_test/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/chr10_subgraph_0_new.vg -# echo "GBWT MADE" -# # Convert .vg to .hg: -# vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg -# # vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg -# echo "CONVERTED VG TO HG" -# # Run normalize algorithm: -# vg normalize -g $TEST_DIR/new_remake_test/normalize.gbwt -s $TEST_DIR/new_remake_test/normalize.snarls $TEST_DIR/new_remake_test/normalize.hg >$TEST_DIR/new_remake_test/normalize_out.hg -# echo "normalized." +# ## split off the first few snarls from chromosome ten: (aiming for nodes between 1883 and 12677) +# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 +# TEST_DIR=test/robin_tests/chr10_subset +# vg mod -g 7280 -x 5360 $VG_DIR/hgsvc_chr10_construct.vg >$TEST_DIR/hgsvc_chr10_construct_first_few_snarls.vg #produces snarl from 1879:12785 + +# ## run normalize_snarls on already made full chromosome 10: +# TEST_DIR=test/robin_tests/chr10 +# vg normalize -g $TEST_DIR/hgsvc_chr10_construct.gbwt -s $TEST_DIR/hgsvc_chr10_construct.snarls $TEST_DIR/hgsvc_chr10_construct.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +# echo "NORMALIZED HG MADE" # ##running normalize_snarls on a full chromosome. 
# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 @@ -79,6 +63,37 @@ echo "normalized." # # chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg +# ## testing vg normalize in Courtyard on smaller graph: +# TEST_DIR=test/robin_tests/normalize_2 +# # To produce .snarls: +# vg snarls $TEST_DIR/normalize.vg >$TEST_DIR/normalize.snarls +# echo "SNARLS MADE" +# # To produce .gbwt: +# vg index -G $TEST_DIR/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/normalize.vg +# echo "GBWT MADE" +# # Convert .vg to .hg: +# vg convert -v $TEST_DIR/normalize.vg -A >$TEST_DIR/normalize.hg +# echo "CONVERTED VG TO HG" +# # Run normalize algorithm: +# vg normalize -g $TEST_DIR/normalize.gbwt -s $TEST_DIR/normalize.snarls $TEST_DIR/normalize.hg >$TEST_DIR/normalize_out.hg +# echo "normalized." + +# ## testing vg normalize in local machine on smaller graph (checking that serialization still works): +# TEST_DIR=test/robin_tests/robin_haplotypes/threads_in_middle_example +# # To produce .snarls: +# vg snarls $TEST_DIR/chr10_subgraph_0_new.vg >$TEST_DIR/new_remake_test/normalize.snarls +# echo "SNARLS MADE" +# # To produce .gbwt: +# vg index -G $TEST_DIR/new_remake_test/normalize.gbwt -v $TEST_DIR/HGSVC.haps.chr10.vcf.gz $TEST_DIR/chr10_subgraph_0_new.vg +# echo "GBWT MADE" +# # Convert .vg to .hg: +# vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg +# # vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/new_remake_test/normalize.hg +# echo "CONVERTED VG TO HG" +# # Run normalize algorithm: +# vg normalize -g $TEST_DIR/new_remake_test/normalize.gbwt -s $TEST_DIR/new_remake_test/normalize.snarls $TEST_DIR/new_remake_test/normalize.hg >$TEST_DIR/new_remake_test/normalize_out.hg +# echo "normalized." 
+ # ## Jordan's buggy command example # vg convert -v /public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10/hgsvc_chr10_construct.vg -A >test/robin_tests/chr10/hgsvc_chr10_construct_test.hg # echo halfway_there diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index 731b47eb9ae..104fbb06a65 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -51,6 +51,9 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, int num_snarls_skipped = 0; vector snarl_roots = snarl_manager->top_level_snarls(); for (auto roots : snarl_roots) { + cerr << "disambiguating snarl #" << (num_snarls_normalized + num_snarls_skipped) + << " source: " << roots->start().node_id() << " sink: " + << roots->end().node_id() << endl; bool success = disambiguate_snarl(graph, haploGraph, roots->start().node_id(), roots->end().node_id()); if (success) { @@ -896,7 +899,8 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // for every possible path, extend it to determine if it really is the path we're // looking for: while (!possible_paths.empty()) { - // take a path off of possible_paths, which will be copied for every iteration through graph.follow_edges, below: + // take a path off of possible_paths, which will be copied for every iteration + // through graph.follow_edges, below: tuple, int, int> possible_path_query = possible_paths.back(); possible_paths.pop_back(); @@ -904,7 +908,8 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // paths still satisfy the requirements for being a possible_path: bool no_path = graph.follow_edges( get<0>(possible_path_query).back(), false, [&](const handle_t &next) { - // make a copy to be extended for through each possible next handle in follow edges. + // make a copy to be extended for through each possible next handle in + // follow edges. 
tuple, int, int> possible_path = possible_path_query; // extract relevant information to make code more readable. From 34927580421430b7c84722a1c0adb0b90dbc143b Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 11 Jul 2019 09:28:45 -0700 Subject: [PATCH 26/63] local machine run subsetted chr10 commands added --- robin_bash/normalize_snarls.sh | 41 ++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index bee072816c7..3fd519c656e 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -22,11 +22,44 @@ echo compiling! . ./source_me.sh && make -j 8 echo running! - ## run normalize_snarls on subsetted full chromosome 10: -TEST_DIR=test/robin_tests/chr10 -vg normalize -g $TEST_DIR/hgsvc_chr10_construct.gbwt -s $TEST_DIR/hgsvc_chr10_construct.snarls $TEST_DIR/hgsvc_chr10_construct.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg -echo "NORMALIZED HG MADE" +TEST_DIR=test/robin_tests/chr10_subset +FILE_NAME=chr10_subset +# To produce .snarls: +vg snarls $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/$FILE_NAME.snarls +echo "SNARLS MADE" +# To produce .gbwt: +vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg +echo "GBWT MADE" +# Convert .vg to .hg: +vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg +echo "CONVERTED VG TO HG" +# vg convert -a $TEST_DIR/$FILE_NAME.hg -V >$TEST_DIR/$FILE_NAME.vg +# echo "CONVERTED HG TO VG" +# Run normalize algorithm: +ls $TEST_DIR +# echo $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg $TEST_DIR/chr10_subset_normalized.hg +vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/chr10_subset_normalized.hg +echo "normalized." 
+ + +# ## run normalize_snarls on local machine small test: +# TEST_DIR=test/robin_tests/test_normalize +# FILE_NAME=chr10_subgraph_0_new +# # To produce .snarls: +# vg snarls $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/$FILE_NAME.snarls +# echo "SNARLS MADE" +# # To produce .gbwt: +# vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg +# echo "GBWT MADE" +# # Convert .vg to .hg: +# vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg +# # vg convert -v $TEST_DIR/chr10_subgraph_0_new.vg -A >$TEST_DIR/$FILE_NAME.hg +# echo "CONVERTED VG TO HG" +# # Run normalize algorithm: +# echo $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg $TEST_DIR/chr10_subgraph_0_new_normalized.hg +# vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/chr10_subgraph_0_new_normalized.hg +# echo "normalized." # ## split off the first few snarls from chromosome ten: (aiming for nodes between 1883 and 12677) # VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 From a0b4bf04642f5049cac57f0d8c2669145fa8ee90 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Thu, 11 Jul 2019 12:30:16 -0700 Subject: [PATCH 27/63] bash debug find command --- robin_bash/normalize_snarls.sh | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index 3fd519c656e..e263d8b5a92 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -22,14 +22,24 @@ echo compiling! . ./source_me.sh && make -j 8 echo running! 
+# ## split off the first few snarls from chromosome ten: (aiming for nodes between 1883 and 12677) +# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 +# TEST_DIR=test/robin_tests/chr10_subset/set_1 +# FILE_NAME=chr10_subset_vg_find +# # vg find -x $VG_DIR/hgsvc_chr10_construct.xg -p "chr10:1883-12677" -c 10 >$TEST_DIR/$FILE_NAME.vg +# vg find -x $VG_DIR/hgsvc_chr10_construct.xg -n 7280 -c 5360 >$TEST_DIR/$FILE_NAME.vg #1878:12785 node range. +# # vg mod -g 3000 -x 5 $VG_DIR/hgsvc_chr10_construct.vg >$TEST_DIR/$FILE_NAME.vg +# echo "vg subgraph made!" + + ## run normalize_snarls on subsetted full chromosome 10: -TEST_DIR=test/robin_tests/chr10_subset -FILE_NAME=chr10_subset +TEST_DIR=test/robin_tests/chr10_subset/set_1 +FILE_NAME=chr10_subset_vg_find # To produce .snarls: vg snarls $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/$FILE_NAME.snarls echo "SNARLS MADE" # To produce .gbwt: -vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg +vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg echo "GBWT MADE" # Convert .vg to .hg: vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg @@ -39,7 +49,7 @@ echo "CONVERTED VG TO HG" # Run normalize algorithm: ls $TEST_DIR # echo $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg $TEST_DIR/chr10_subset_normalized.hg -vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/chr10_subset_normalized.hg +vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/chr10_subset_vg_find_normalized.hg echo "normalized." 
From e2bfeb1fbba1ea87a329762c83e64ba42962a9e5 Mon Sep 17 00:00:00 2001 From: Robin Rounthwaite Date: Thu, 11 Jul 2019 12:38:31 -0700 Subject: [PATCH 28/63] bash run on full chromosome on local machine --- robin_bash/normalize_snarls.sh | 68 ++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index e263d8b5a92..c1d814a659c 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -22,6 +22,36 @@ echo compiling! . ./source_me.sh && make -j 8 echo running! +##running normalize_snarls on a full chromosome - local machine. +# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 +TEST_DIR=test/robin_tests/full_chr10 +FILE_BASENAME=hgsvc_chr10_construct +# visualize unchanged chr10 +./bin/vg view -dpn $TEST_DIR/$FILE_BASENAME.vg| \ +dot -Tsvg -o $TEST_DIR/$FILE_BASENAME.svg +echo "visualization made" +# To produce .snarls: +vg snarls $TEST_DIR/$FILE_BASENAME.vg >$TEST_DIR/$FILE_BASENAME.snarls +echo "SNARLS MADE" +# To produce .gbwt: +vg index -G $TEST_DIR/$FILE_BASENAME.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_BASENAME.vg +echo "GBWT MADE" +# Convert .vg to .hg: +vg convert -v $TEST_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg +echo "CONVERTED VG TO HG" +Run normalize algorithm: +vg normalize -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg +echo "NORMALIZED HG MADE" +# convert .hg to .vg +vg convert -a $TEST_DIR/$FILE_BASENAME_normalized.hg -V $TEST_DIR/$FILE_BASENAME_normalized.vg +echo "CONVERTED BACK TO VG." 
+# visualize +./bin/vg view -dpn $TEST_DIR/$FILE_BASENAME_normalized.vg| \ +dot -Tsvg -o $TEST_DIR/$FILE_BASENAME_normalized.svg +# chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg + + + # ## split off the first few snarls from chromosome ten: (aiming for nodes between 1883 and 12677) # VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 # TEST_DIR=test/robin_tests/chr10_subset/set_1 @@ -32,25 +62,25 @@ echo running! # echo "vg subgraph made!" -## run normalize_snarls on subsetted full chromosome 10: -TEST_DIR=test/robin_tests/chr10_subset/set_1 -FILE_NAME=chr10_subset_vg_find -# To produce .snarls: -vg snarls $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/$FILE_NAME.snarls -echo "SNARLS MADE" -# To produce .gbwt: -vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg -echo "GBWT MADE" -# Convert .vg to .hg: -vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg -echo "CONVERTED VG TO HG" -# vg convert -a $TEST_DIR/$FILE_NAME.hg -V >$TEST_DIR/$FILE_NAME.vg -# echo "CONVERTED HG TO VG" -# Run normalize algorithm: -ls $TEST_DIR -# echo $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg $TEST_DIR/chr10_subset_normalized.hg -vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/chr10_subset_vg_find_normalized.hg -echo "normalized." 
+# ## run normalize_snarls on subsetted full chromosome 10: +# TEST_DIR=test/robin_tests/chr10_subset/set_1 +# FILE_NAME=chr10_subset_vg_find +# # To produce .snarls: +# vg snarls $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/$FILE_NAME.snarls +# echo "SNARLS MADE" +# # To produce .gbwt: +# vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg +# echo "GBWT MADE" +# # Convert .vg to .hg: +# vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg +# echo "CONVERTED VG TO HG" +# # vg convert -a $TEST_DIR/$FILE_NAME.hg -V >$TEST_DIR/$FILE_NAME.vg +# # echo "CONVERTED HG TO VG" +# # Run normalize algorithm: +# ls $TEST_DIR +# # echo $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg $TEST_DIR/chr10_subset_normalized.hg +# vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/chr10_subset_vg_find_normalized.hg +# echo "normalized." # ## run normalize_snarls on local machine small test: From 6ef96ad8b9cca384d97620fa6be3f9cd059af2e2 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 12 Jul 2019 12:14:31 -0700 Subject: [PATCH 29/63] bash update --- robin_bash/normalize_snarls.sh | 63 +++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh index c1d814a659c..0384193dc59 100755 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -22,34 +22,51 @@ echo compiling! . ./source_me.sh && make -j 8 echo running! +# TEST_DIR=test/robin_tests/vis_vg_find_sample +# FILE_NAME=chr10_subset_vg_find + +# ./bin/vg view -dpn $TEST_DIR/$FILE_NAME.vg| \ +# dot -Tsvg -o $TEST_DIR/$FILE_NAME.svg +# chromium-browser $TEST_DIR/$FILE_NAME.svg + ##running normalize_snarls on a full chromosome - local machine. 
# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 TEST_DIR=test/robin_tests/full_chr10 -FILE_BASENAME=hgsvc_chr10_construct -# visualize unchanged chr10 -./bin/vg view -dpn $TEST_DIR/$FILE_BASENAME.vg| \ -dot -Tsvg -o $TEST_DIR/$FILE_BASENAME.svg -echo "visualization made" -# To produce .snarls: -vg snarls $TEST_DIR/$FILE_BASENAME.vg >$TEST_DIR/$FILE_BASENAME.snarls -echo "SNARLS MADE" -# To produce .gbwt: -vg index -G $TEST_DIR/$FILE_BASENAME.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_BASENAME.vg -echo "GBWT MADE" -# Convert .vg to .hg: -vg convert -v $TEST_DIR/$FILE_BASENAME.vg -A >$TEST_DIR/$FILE_BASENAME.hg -echo "CONVERTED VG TO HG" -Run normalize algorithm: -vg normalize -g $TEST_DIR/$FILE_BASENAME.gbwt -s $TEST_DIR/$FILE_BASENAME.snarls $TEST_DIR/$FILE_BASENAME.hg >$TEST_DIR/$FILE_BASENAME_normalized.hg -echo "NORMALIZED HG MADE" -# convert .hg to .vg -vg convert -a $TEST_DIR/$FILE_BASENAME_normalized.hg -V $TEST_DIR/$FILE_BASENAME_normalized.vg -echo "CONVERTED BACK TO VG." -# visualize -./bin/vg view -dpn $TEST_DIR/$FILE_BASENAME_normalized.vg| \ -dot -Tsvg -o $TEST_DIR/$FILE_BASENAME_normalized.svg +FILE_NAME=hgsvc_chr10_construct +# # visualize subsetted chr10 +# vg mod -g 7280 -x 5360 $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/hgsvc_chr10_construct_first_few_snarls.vg #produces snarl from 1879:12785 +# echo "subgraph made" +# # vg find -x $TEST_DIR/hgsvc_chr10_construct.xg -n 7280 -c 5360 >$TEST_DIR/$FILE_NAME.vg #1878:12785 node range. 
+# ./bin/vg view -dpn $TEST_DIR/hgsvc_chr10_construct_first_few_snarls.vg| \ +# dot -Tsvg -o $TEST_DIR/hgsvc_chr10_construct_first_few_snarls.svg # chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg +# echo "visualization made" +# # To produce .snarls: +# vg snarls $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/$FILE_NAME.snarls +# echo "SNARLS MADE" +# # To produce .gbwt: +# vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg +# echo "GBWT MADE" +# # Convert .vg to .hg: +# vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg +# echo "CONVERTED VG TO HG" +# # Run normalize algorithm: +# vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/hgsvc_chr10_construct_normalized.hg +# echo "NORMALIZED HG MADE" +# # convert .hg to .vg +# vg convert -a $TEST_DIR/hgsvc_chr10_construct_normalized.hg -V $TEST_DIR/hgsvc_chr10_construct_normalized.vg +# echo "CONVERTED BACK TO VG." +# # visualize +# ./bin/vg view -dpn $TEST_DIR/hgsvc_chr10_construct_normalized.vg| \ +# dot -Tsvg -o $TEST_DIR/hgsvc_chr10_construct_normalized.svg +# chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg + +# ./bin/vg view -dpn $TEST_DIR/$FILE_NAME.vg| \ +# dot -Tsvg -o $TEST_DIR/$FILE_NAME.svg +# chromium-browser $TEST_DIR/$FILE_NAME.svg + + # ## split off the first few snarls from chromosome ten: (aiming for nodes between 1883 and 12677) From 9f39f44dc5a4914802fecec9e32ee388d02b8a28 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Wed, 7 Aug 2019 16:53:11 -0700 Subject: [PATCH 30/63] normalize snarls now runs on full hg38 thousand genomes chr10 graph. Evaluation pipeline started. 
--- robin_bash/normalize_snarls.sh | 81 +- .../0_draft_haplotype_realignment.cpp | 784 +++++++++++++----- .../0_draft_haplotype_realignment.hpp | 21 +- ...0_draft_snarl_normalization_evaluation.cpp | 32 + src/msa_converter.cpp | 164 ++-- src/subcommand/0_normalize_main.cpp | 99 +-- 6 files changed, 804 insertions(+), 377 deletions(-) mode change 100755 => 100644 robin_bash/normalize_snarls.sh create mode 100644 src/algorithms/0_draft_snarl_normalization_evaluation.cpp diff --git a/robin_bash/normalize_snarls.sh b/robin_bash/normalize_snarls.sh old mode 100755 new mode 100644 index 0384193dc59..b9b60286492 --- a/robin_bash/normalize_snarls.sh +++ b/robin_bash/normalize_snarls.sh @@ -10,18 +10,25 @@ ## useful debug tools: # export VG_FULL_TRACEBACK=1 -# valgrind vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg + +# valgrind vg mod -F blah test/robin_snarl_examples/chr10_subgraph_0_new.vg + #Note: to make more informative, commment out the two lines in MakeFile under "use + #jemalloc", delete bin/vg, and recompile. + ## in terminal: # gdb vg -# run mod -F blah test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.vg +# run normalize -g test/robin_tests/full_chr10/hgsvc_chr10_construct.gbwt -s test/robin_tests/full_chr10/hgsvc_chr10_construct.snarls test/robin_tests/full_chr10/hgsvc_chr10_construct.hg >test/robin_tests/full_chr10/hgsvc_chr10_construct_normalized.hg -export VG_FULL_TRACEBACK=0 +export VG_FULL_TRACEBACK=1 set -e echo compiling! . ./source_me.sh && make -j 8 echo running! +## constructing a smaller graph from a larger one - one method of subsetting a graph. +# vg construct -r test/small/x.fa -v test/small/x.vcf.gz -R x:1-10 + # TEST_DIR=test/robin_tests/vis_vg_find_sample # FILE_NAME=chr10_subset_vg_find @@ -30,44 +37,68 @@ echo running! # chromium-browser $TEST_DIR/$FILE_NAME.svg ##running normalize_snarls on a full chromosome - local machine. 
-# VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 TEST_DIR=test/robin_tests/full_chr10 FILE_NAME=hgsvc_chr10_construct -# # visualize subsetted chr10 -# vg mod -g 7280 -x 5360 $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/hgsvc_chr10_construct_first_few_snarls.vg #produces snarl from 1879:12785 -# echo "subgraph made" -# # vg find -x $TEST_DIR/hgsvc_chr10_construct.xg -n 7280 -c 5360 >$TEST_DIR/$FILE_NAME.vg #1878:12785 node range. -# ./bin/vg view -dpn $TEST_DIR/hgsvc_chr10_construct_first_few_snarls.vg| \ -# dot -Tsvg -o $TEST_DIR/hgsvc_chr10_construct_first_few_snarls.svg -# chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg -# echo "visualization made" -# # To produce .snarls: -# vg snarls $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/$FILE_NAME.snarls -# echo "SNARLS MADE" -# # To produce .gbwt: -# vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg -# echo "GBWT MADE" -# # Convert .vg to .hg: -# vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg +## for printing out the subsnarl: +vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/full_chr10_normalized.vg +# visualize +# ./bin/vg view -dpn $TEST_DIR/graph3.vg| \ +# dot -Tsvg -o $TEST_DIR/graph3.svg +# chromium-browser $TEST_DIR/graph3.svg + +# vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/hgsvc_chr10_construct_normalized_one_snarl_pre_long_chr10.hg +# echo "NORMALIZED HG MADE" +# # convert .hg to .vg +# vg convert -a $TEST_DIR/hgsvc_chr10_construct_normalized_one_snarl_pre_long_chr10.hg -V $TEST_DIR/hgsvc_chr10_construct_normalized_one_snarl_pre_long_chr10.vg +# echo "CONVERTED BACK TO VG." 
+# #extract subsnarl: +# vg find + +# # visualize +# ./bin/vg view -dpn hgsvc_chr10_one_snarl_pre_long_chr10_extracted.vg| \ +# dot -Tsvg -o hgsvc_chr10_one_snarl_pre_long_chr10_extracted.svg +# chromium-browser hgsvc_chr10_one_snarl_pre_long_chr10_extracted.svg + + + + + + # echo "CONVERTED VG TO HG" -# # Run normalize algorithm: +# Run normalize algorithm: +# valgrind --leak-check=full vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/hgsvc_chr10_construct_normalized.hg +# vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >big_chr10_path.vg # vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/hgsvc_chr10_construct_normalized.hg # echo "NORMALIZED HG MADE" # # convert .hg to .vg # vg convert -a $TEST_DIR/hgsvc_chr10_construct_normalized.hg -V $TEST_DIR/hgsvc_chr10_construct_normalized.vg # echo "CONVERTED BACK TO VG." # # visualize -# ./bin/vg view -dpn $TEST_DIR/hgsvc_chr10_construct_normalized.vg| \ -# dot -Tsvg -o $TEST_DIR/hgsvc_chr10_construct_normalized.svg -# chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg - # ./bin/vg view -dpn $TEST_DIR/$FILE_NAME.vg| \ # dot -Tsvg -o $TEST_DIR/$FILE_NAME.svg # chromium-browser $TEST_DIR/$FILE_NAME.svg +# ./bin/vg view -dpn big_chr10_path.vg| \ +# dot -Tsvg -o big_chr10_path.svg +# chromium-browser big_chr10_path.svg +# # visualize subsetted chr10 +# vg mod -g 7280 -x 5360 $TEST_DIR/$FILE_NAME.vg >$TEST_DIR/hgsvc_chr10_construct_first_few_snarls.vg #produces snarl from 1879:12785 +# echo "subgraph made" +# # vg find -x $TEST_DIR/hgsvc_chr10_construct.xg -n 7280 -c 5360 >$TEST_DIR/$FILE_NAME.vg #1878:12785 node range. 
+# ./bin/vg view -dpn $TEST_DIR/hgsvc_chr10_construct_first_few_snarls.vg| \ +# dot -Tsvg -o $TEST_DIR/hgsvc_chr10_construct_first_few_snarls.svg +# chromium-browser $TEST_DIR/$FILE_BASENAME_normalized.svg +# ## split off the first few snarls from chromosome ten: (aiming for nodes between 1883 and 12677) +# # VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 +# TEST_DIR=test/robin_tests/chr10_subset/set_1 +# FILE_NAME=chr10_subset_vg_find +# # vg find -x $VG_DIR/hgsvc_chr10_construct.xg -p "chr10:1883-12677" -c 10 >$TEST_DIR/$FILE_NAME.vg +# vg find -x $TEST_DIR/hgsvc_chr10_construct.xg -n 7280 -c 5360 >$TEST_DIR/$FILE_NAME.vg #1878:12785 node range. +# # vg mod -g 3000 -x 5 $VG_DIR/hgsvc_chr10_construct.vg >$TEST_DIR/$FILE_NAME.vg +# echo "vg subgraph made!" # ## split off the first few snarls from chromosome ten: (aiming for nodes between 1883 and 12677) # VG_DIR=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps/chr10 diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index 104fbb06a65..e886893e1b0 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -1,3 +1,10 @@ +// TODO: I remove snarls where a haplotype begins/ends in the middle +// TODO: of the snarl. Get rid of this once alignment issue is addressed! +// TODO: also, limits the number of haplotypes to be aligned, since snarl starting at +// TODO: 2049699 with 258 haplotypes is taking many minutes. + +// TODO: another had 146 haplotypes and took maybe 5 minutes to align. (kept that one +// in tho' ) #pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? 
#include "0_draft_haplotype_realignment.hpp" @@ -13,10 +20,11 @@ #include "../msa_converter.hpp" #include "../snarls.hpp" #include "../vg.hpp" -#include -#include -#include -// #include "../../deps/libhandlegraph/src/include/handlegraph/path_handle_graph.hpp" +#include "is_acyclic.hpp" + +#include "../types.hpp" +#include "extract_containing_graph.hpp" + namespace vg { @@ -50,25 +58,47 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, int num_snarls_normalized = 0; int num_snarls_skipped = 0; vector snarl_roots = snarl_manager->top_level_snarls(); + bool success; for (auto roots : snarl_roots) { - cerr << "disambiguating snarl #" << (num_snarls_normalized + num_snarls_skipped) - << " source: " << roots->start().node_id() << " sink: " - << roots->end().node_id() << endl; - bool success = disambiguate_snarl(graph, haploGraph, roots->start().node_id(), - roots->end().node_id()); - if (success) { - num_snarls_normalized += 1; - } else { - num_snarls_skipped += 1; - } + + // if (roots->start().node_id() == 1883 or roots->start().node_id() == 4211565) { + cerr << "disambiguating snarl #" + << (num_snarls_normalized + num_snarls_skipped) + << " source: " << roots->start().node_id() + << " sink: " << roots->end().node_id() << endl; + success = disambiguate_snarl(graph, haploGraph, roots->start().node_id(), + roots->end().node_id()); + + if (success) { + num_snarls_normalized += 1; + } else { + num_snarls_skipped += 1; + } + // } } cerr << endl << "normalized " << num_snarls_normalized << " snarl(s), skipped " << num_snarls_skipped << " snarls b/c they had haplotypes starting/ending in the middle " - "of the snarl." + "of the snarl, the snarl was cyclic, it exceeded the size limit, or there " + "were handles not connected by the gbwt info." 
<< endl; + /// Args: + /// source graph to extract subgraph from + /// into graph to extract into + /// positions search outward from these positions + /// max_dist include all nodes and edges that can be reached in at most + /// this distance reversing_walk_length also find graph material that can be reached + + // //todo: debug_statement + // VG outGraph; + // pos_t source_pos = make_pos_t(4211565, false, 0); + // vector pos_vec; + // pos_vec.push_back(source_pos); + // algorithms::extract_containing_graph(&graph, &outGraph, pos_vec, 150); + // outGraph.serialize_to_ostream(cout); + delete snarl_manager; } @@ -88,45 +118,109 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id) { - cerr << "disambiguate_snarl" << endl; + // cerr << "disambiguate_snarl" << endl; + + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + + if (!algorithms::is_acyclic(&snarl)) { + cerr << "snarl at " << source_id << " is cyclic. Skipping." << endl; + return false; + } // First, find all haplotypes encoded by the GBWT, in order to create the new snarl. // Return value is pair< haplotypes_that_stretch_from_source_to_sink, // haplotypes_that_end/start_prematurely > - pair>, vector>> haplotypes = - extract_gbwt_haplotypes(haploGraph, source_id, sink_id); + tuple>, vector>, unordered_set> + haplotypes = extract_gbwt_haplotypes(snarl, haploGraph, source_id, sink_id); + + // check to make sure that the gbwt graph has threads connecting all handles: + // ( needs the unordered_set from extract_gbwt haplotypes to be equal to the number of + // handles in the snarl). + int handles_in_snarl = 0; + snarl.for_each_handle([&](const handle_t handle) { + handles_in_snarl++; + }); // TODO: this if statement removes snarls where a haplotype begins/ends in the middle // TODO: of the snarl. 
Get rid of this once alignment issue is addressed! - if (haplotypes.second.empty()) { + // TODO: also, limits the number of haplotypes to be aligned, since snarl starting at + // TODO: 2049699 with 258 haplotypes is taking many minutes. + if (get<1>(haplotypes).empty() && get<0>(haplotypes).size() < 200 && + get<2>(haplotypes).size() == handles_in_snarl) { + // if (get<1>(haplotypes).empty()) { // Convert the haplotypes from vector format to string format. vector haplotypes_from_source_to_sink = - format_handle_haplotypes_to_strings(haploGraph, haplotypes.first); + format_handle_haplotypes_to_strings(haploGraph, get<0>(haplotypes)); // vector< string > other_haplotypes = - // format_handle_haplotypes_to_strings(haploGraph, haplotypes.second); - - // Align the new snarl: - // TODO: find better way to improve disamiguation of beginning/ending regions of - // nodes - // TODO: than by adding leading/trailing AAA seq (essentially a special - // character). - for (string &hap : haplotypes_from_source_to_sink) { - hap = "AAAAAAAA" + hap + "AAAAAAAA"; - } - VG new_snarl = align_source_to_sink_haplotypes(haplotypes_from_source_to_sink); + // format_handle_haplotypes_to_strings(haploGraph, get<1>(haplotypes)); // Get the embedded paths in the snarl out of the graph, for the purposes of - // moving them into the new snarl. + // moving them into the new snarl. In addition, any embedded paths that stretch + // from source to sink are aligned in the new snarl. + // TODO: once haplotypes that begin/end in the middle of the snarl have been + // TODO: accounted for in the code, align all embedded paths? (and remove next + // TODO: chunk of code that finds source-to-sink paths)? vector> embedded_paths = extract_embedded_paths_in_snarl(graph, source_id, sink_id); + // todo: debug_statment (lots of em) + // cerr << "\t gonna try to find an embedded path stretching from source to sink!" 
+ // << endl; find the paths that stretch from source to sink: + for (auto path : embedded_paths) { + // cerr << "checking path of name " << + // graph.get_path_name(graph.get_path_handle_of_step(path.first)) << " with + // start " << graph.get_id(graph.get_handle_of_step(path.first)) << " and sink + // " << + // graph.get_id(graph.get_handle_of_step(graph.get_previous_step(path.second))) + // << endl; + if (graph.get_id(graph.get_handle_of_step(path.first)) == source_id && + graph.get_id(graph.get_handle_of_step( + graph.get_previous_step(path.second))) == sink_id) { + // cerr << "adding path of name " << + // graph.get_path_name(graph.get_path_handle_of_step(path.first)) << endl; + // get the sequence of the source to sink path, and add it to the paths to + // be aligned. + string path_seq; + step_handle_t cur_step = path.first; + while (cur_step != path.second) { + path_seq += graph.get_sequence(graph.get_handle_of_step(cur_step)); + cur_step = graph.get_next_step(cur_step); + } + haplotypes_from_source_to_sink.push_back(path_seq); + } + } + // Align the new snarl: + VG new_snarl = align_source_to_sink_haplotypes(haplotypes_from_source_to_sink); + + // todo: debug_statement + // new_snarl.for_each_handle([&](const handle_t& handle) { + // cerr << new_snarl.get_id(handle) << " " << new_snarl.get_sequence(handle) + // << "\t"; + // }); + + // //todo: debug_statement: + // new_snarl.serialize_to_ostream(cout); + // return true; + // integrate the new_snarl into the graph, removing the old snarl as you go. integrate_snarl(graph, new_snarl, embedded_paths, source_id, sink_id); cerr << endl; return true; } else { - cerr << "found a snarl with haplotypes in the middle. Start: " << source_id - << " sink is " << sink_id << endl; + if (!get<1>(haplotypes).empty()) { + cerr << "found a snarl with haplotypes in the middle. Start: " << source_id + << " sink is " << sink_id << ". Skipping." 
<< endl; + } + if (get<0>(haplotypes).size() > 200) { + cerr << "found a snarl with too many haplotypes (" + << get<0>(haplotypes).size() << ") to efficiently align. Skipping." + << endl; + } + if (get<2>(haplotypes).size() != handles_in_snarl) { + cerr << "some handles in the snarl aren't accounted for by the gbwt graph. " + "Skipping." + << endl; + } return false; } } @@ -145,15 +239,11 @@ bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, // first in the pair represents all paths reaching from source to sink in the snarl, // and the second representing all other paths in the snarl (e.g. any that don't // reach both source and sink in the graph.) -pair>, vector>> -extract_gbwt_haplotypes(const GBWTGraph &haploGraph, const id_t &source_id, - const id_t &sink_id) { - cerr << "extract_gbwt_haplotypes" << endl; - - // touched_handles contains all handles that have been touched by the - // depth_first_search, for later use in other_haplotypes_to_strings, which identifies - // paths that didn't stretch from source to sink in the snarl. - unordered_set touched_handles; +// pair>, vector>> +tuple>, vector>, unordered_set> +extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph, + const id_t &source_id, const id_t &sink_id) { + // cerr << "extract_gbwt_haplotypes" << endl; // haplotype_queue contains all started exon_haplotypes not completed yet. 
// Every time we encounter a branch in the paths, the next node down the path @@ -169,16 +259,31 @@ extract_gbwt_haplotypes(const GBWTGraph &haploGraph, const id_t &source_id, vector source_handle_vec(1, source_handle); gbwt::SearchState source_state = haploGraph.get_state(source_handle); haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); - touched_handles.emplace(source_handle); + + // touched_handles contains all handles that have been touched by the + // depth first search below, for later use in other_haplotypes_to_strings, which + // identifies paths that didn't stretch from source to sink in the snarl. + unordered_set touched_handles{source_handle, sink_handle}; // haplotypes contains all "finished" haplotypes - those that were either walked // to their conclusion, or until they reached the sink. vector> haplotypes_from_source_to_sink; vector> other_haplotypes; + // int prev_size = 0; // for every partly-extracted thread, extend the thread until it either reaches // the sink of the snarl or the end of the thread. while (!haplotype_queue.empty()) { + // todo: debug_statement + // cerr << "haplotype queue: "; + // cerr << "size of queue:" << haplotype_queue.size() << " " << endl; + // for (auto hap : haplotype_queue) { + // cerr << "size: " << hap.first.size() << endl << "handle_ids: "; + // for (handle_t handle : hap.first) { + // cerr << haploGraph.get_id(handle) << " "; + // } + // cerr << endl; + // } // get a haplotype out of haplotype_queue to extend - // a tuple of (handles_traversed_so_far, last_touched_SearchState) @@ -203,7 +308,16 @@ extract_gbwt_haplotypes(const GBWTGraph &haploGraph, const id_t &source_id, // the other_haplotypes if haplotype ends before reaching sink. 
for (gbwt::SearchState next_search : next_searches) { handle_t next_handle = haploGraph.node_to_handle(next_search.node); - + if (!snarl.has_node(snarl.get_id(next_handle))) { + cerr << "snarl starting at node " << source_id + << " has a thread that incorrectly connects a node contained " + "within the snarl (node " + << haploGraph.get_id(cur_haplotype.first.back()) + << ") and a node outside the snarl (" + << haploGraph.get_id(next_handle) + << "). This thread connection will be ignored." << endl; + continue; + } // copy over the vector of cur_haplotype: vector next_handle_vec(cur_haplotype.first); @@ -228,8 +342,9 @@ extract_gbwt_haplotypes(const GBWTGraph &haploGraph, const id_t &source_id, // We have reached the end of the path, but it doesn't reach the sink. // we need to add cur_haplotype to other_haplotypes. other_haplotypes.push_back(cur_haplotype.first); + } - // if new_handle is the sink, put in haplotypes_from_source_to_sink + // if next_handle is the sink, put in haplotypes_from_source_to_sink else if (haploGraph.get_id( haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { // Then we need to add cur_haplotype + next_search to @@ -270,7 +385,9 @@ extract_gbwt_haplotypes(const GBWTGraph &haploGraph, const id_t &source_id, move(haplotypes_not_starting_at_source.begin(), haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); - return make_pair(haplotypes_from_source_to_sink, other_haplotypes); + return tuple>, vector>, + unordered_set>{haplotypes_from_source_to_sink, + other_haplotypes, touched_handles}; } // Used to complete the traversal of a snarl along its haplotype threads, when there are @@ -288,7 +405,7 @@ vector> find_haplotypes_not_at_source(const GBWTGraph &haploGraph, unordered_set &touched_handles, const id_t &sink_id) { - cerr << "find_haplotypes_not_at_source" << endl; + // cerr << "find_haplotypes_not_at_source" << endl; /// Search every handle in touched handles for haplotypes starting at that point. 
// Any new haplotypes will be added to haplotype_queue. @@ -305,7 +422,7 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, // We don't need to ever check the sink handle, since paths from the sink handle // extend beyond snarl. handle_t sink_handle = haploGraph.get_handle(sink_id); - touched_handles.erase(sink_handle); + // touched_handles.erase(sink_handle); // Nested function for making a new_search. Identifies threads starting at a given // handle and @@ -316,11 +433,6 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, gbwt::SearchState new_search = haploGraph.index.prefix(haploGraph.handle_to_node(handle)); if (!new_search.empty()) { - // TODO: test_code code: are searchstates empty? - cerr << "apparently new thread starts at node: " << haploGraph.get_id(handle) - << endl; - cerr << "is the searchstate empty? " << new_search.empty() - << " size: " << new_search.size() << endl; // Then add them to haplotype_queue. haploGraph.follow_paths( new_search, [&](const gbwt::SearchState &next_search) -> bool { @@ -447,7 +559,6 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, vector format_handle_haplotypes_to_strings( const GBWTGraph &haploGraph, const vector> &haplotype_handle_vectors) { - cerr << "format_handle_haplotypes_to_strings" << endl; vector haplotype_strings; for (vector haplotype_handles : haplotype_handle_vectors) { string hap; @@ -467,9 +578,51 @@ vector format_handle_haplotypes_to_strings( // handle sequences). // Returns: // VG object representing the newly realigned snarl. 
-VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotypes) { - cerr << "align_source_to_sink_haplotypes" << endl; - seqan::Align align; // create multiple_sequence_alignment object +VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes) { + // cerr << "align_source_to_sink_haplotypes" << endl; + cerr << "number of strings to align: " << source_to_sink_haplotypes.size() << endl; + // TODO: make the following comment true, so that I can normalize haplotypes that + // TODO: aren't source_to_sink by adding a similar special character to strings in + // TODO: the middle of the snarl. + // modify source_to_sink_haplotypes to replace the leading and + // trailing character with a special character. This ensures that the leading char of + // the haplotype becomes the first character in the newly aligned snarl's source - it + // maintains the context of the snarl. + + // store the source/sink chars for later reattachment to source and sink. + string source_char(1, source_to_sink_haplotypes.back().front()); + string sink_char(1, source_to_sink_haplotypes.back().back()); + + for (string &hap : source_to_sink_haplotypes) { + hap.replace(0, 1, "X"); + hap.replace(hap.size() - 1, 1, "X"); + } + + // /// make a new scoring matrix with _match=5, _mismatch = -3, _gap_extend = -1, and + // _gap_open = -3, EXCEPT that Q has to be matched with Q (so match score between Q + // and Q =len(seq)+1) + // // 1. Define type and constants. + // // + // // Define types for the score value and the scoring scheme. + // typedef int TValue; + // typedef seqan::Score > + // TScoringScheme; + // // Define our gap scores in some constants. 
+ // int const gapOpenScore = -1; + // int const gapExtendScore = -1; + + // static int const _data[TAB_SIZE] = + // { + // 1, 0, 0, 0, 0, + // 0, 1, 0, 0, 0, + // 0, 0, 1, 0, 0, + // 0, 0, 0, 1, 0, + // 0, 0, 0, 0, 0 + // }; + + // create seqan multiple_sequence_alignment object + //// seqan::Align align; + seqan::Align align; seqan::resize(rows(align), source_to_sink_haplotypes.size()); for (int i = 0; i < source_to_sink_haplotypes.size(); ++i) { @@ -478,37 +631,48 @@ VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotyp globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); + vector row_strings; + for (auto &row : rows(align)) { + string row_string; + auto it = begin(row); + auto itEnd = end(row); + for (; it != itEnd; it++) { + row_string += *it; + } + // todo: debug_statement + // cerr << "ROW_STRING: " << row_string << endl; + // edit the row so that the proper source and sink chars are added to the + // haplotype instead of the special characters added to ensure correct alignment + // of source and sink. + row_string.replace(0, 1, source_char); + row_string.replace(row_string.size() - 1, 1, sink_char); + row_strings.push_back(row_string); + } + stringstream ss; - ss << align; + for (string seq : row_strings) { + ss << endl << seq; + } + // ss << align; MSAConverter myMSAConverter = MSAConverter(); myMSAConverter.load_alignments(ss, "seqan"); VG snarl = myMSAConverter.make_graph(); snarl.clear_paths(); - // TODO: find better way to improve disamiguation of beginning/ending regions of nodes - // TODO: than by adding leading/trailing AAA seq (essentially a special - // character). pair, vector> source_and_sink = debug_get_sources_and_sinks(snarl); - // Replace source with a handle that has the leading AAA seq removed. 
- handle_t source = source_and_sink.first.back(); - string source_seq = snarl.get_sequence(source); - id_t source_id = snarl.get_id(source); - handle_t new_source = snarl.create_handle(source_seq.substr(8, source_seq.size())); - snarl.follow_edges(source, false, [&](const handle_t &handle) { - snarl.create_edge(new_source, handle); - }); - snarl.destroy_handle(source); - - handle_t sink = source_and_sink.second.back(); - string sink_seq = snarl.get_sequence(sink); - id_t sink_id = snarl.get_id(sink); - handle_t new_sink = snarl.create_handle(sink_seq.substr(0, (sink_seq.size() - 8))); - snarl.follow_edges( - sink, true, [&](const handle_t &handle) { snarl.create_edge(handle, new_sink); }); - snarl.destroy_handle(sink); + // TODO: throw exception(?) instead of cerr, or remove these messages if I'm confident + // TODO: code works. + if (source_and_sink.first.size() != 1) { + cerr << "WARNING! Snarl realignment has generated " + << source_and_sink.first.size() << " source nodes." << endl; + } + if (source_and_sink.second.size() != 1) { + cerr << "WARNING! Snarl realignment has generated " + << source_and_sink.second.size() << " sink nodes." << endl; + } return snarl; } @@ -530,7 +694,20 @@ VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotyp vector> extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, const id_t &sink_id) { - cerr << "extract_embedded_paths_in_snarl" << endl; + // cerr << "extract_embedded_paths_in_snarl" << endl; + // cerr << "source id: " << source_id << endl; + // cerr << "source id contains what paths?: " << endl; + // for (auto step : graph.steps_of_handle(graph.get_handle(source_id))) { + // cerr << "\t" << graph.get_path_name(graph.get_path_handle_of_step(step)) << + // endl; + // } + // cerr << "neighbors of 71104? 
(should include 71097):" << endl; + // handle_t test_handle = graph.get_handle(71104); + // graph.follow_edges(test_handle, true, [&](const handle_t &handle) { + // cerr << graph.get_id(handle) << endl; + // }); + // cerr << "can I still access source handle?" + // << graph.get_sequence(graph.get_handle(source_id)) << endl; // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't // extend the path to a point beyond the source or sink. @@ -558,6 +735,13 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source } }); + // todo: debug_statement + // cerr << "################looking for new paths################" << endl; + // for (auto path : paths_found) { + // cerr << graph.get_path_name(path.first) << " " + // << graph.get_id(graph.get_handle_of_step(path.second)) << endl; + // } + /// for each step_handle_t corresponding to a unique path, we want to get the steps /// closest to both the end and beginning step that still remains in the snarl. // TODO: Note copy paste of code here. 
In python I'd do "for fxn in [fxn1, fxn2]:", @@ -576,7 +760,7 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source id_t begin_in_snarl_id = graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); - while ((begin_in_snarl_id != source_id) && (begin_in_snarl_id != sink_id) && + while ((begin_in_snarl_id != source_id) && graph.has_previous_step(begin_in_snarl_step)) { begin_in_snarl_step = graph.get_previous_step(begin_in_snarl_step); begin_in_snarl_id = @@ -588,8 +772,9 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source step_handle_t end_in_snarl_step = step; id_t end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); - while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and - graph.has_next_step(end_in_snarl_step)) { + // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and + // graph.has_next_step(end_in_snarl_step)) { + while (end_in_snarl_id != sink_id and graph.has_next_step(end_in_snarl_step)) { end_in_snarl_step = graph.get_next_step(end_in_snarl_step); end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); } @@ -616,7 +801,7 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source // and sink_id. SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, const id_t &sink_id) { - cerr << "extract_subgraph" << endl; + // cerr << "extract_subgraph" << endl; /// make a subgraph containing only nodes of interest. (e.g. 
a snarl) // make empty subgraph SubHandleGraph subgraph = SubHandleGraph(&graph); @@ -699,12 +884,20 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, const HandleGraph &to_insert_snarl, const vector> embedded_paths, const id_t &source_id, const id_t &sink_id) { - cerr << "integrate_snarl" << endl; + // cerr << "integrate_snarl" << endl; + + // //todo: debug_statement + // cerr << "handles in to_insert_snarl:" << endl; + // to_insert_snarl.for_each_handle([&](const handle_t &handle) { + // cerr << to_insert_snarl.get_id(handle) << " " + // << to_insert_snarl.get_sequence(handle) << " \t"; + // }); + cerr << endl; // Get old graph snarl SubHandleGraph old_snarl = extract_subgraph(graph, source_id, sink_id); - // TODO: test_code: Check to make sure that newly made snarl has only one start and - // end. + // TODO: debug_statement: Check to make sure that newly made snarl has only one start + // and end. // TODO: (shouldn't be necessary once we've implemented alignment with // leading/trailing special chars.) Identify old and new snarl start and sink pair, vector> to_insert_snarl_defining_handles = @@ -775,8 +968,10 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, // For each path of interest, move it onto the new_snarl. for (auto path : embedded_paths) { + // //todo: debug_statement + // cerr << "the new sink id: " << temp_snarl_sink_id << endl; move_path_to_snarl(graph, path, new_snarl_topo_order, temp_snarl_source_id, - temp_snarl_sink_id); + temp_snarl_sink_id, source_id, sink_id); } // Destroy the old snarl. @@ -829,6 +1024,7 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, // delete the previously created source and sink: for (handle_t handle : {graph.get_handle(temp_snarl_source_id), graph.get_handle(temp_snarl_sink_id)}) { + graph.destroy_handle(handle); } } @@ -848,25 +1044,71 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, // Return: None. 
void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, const pair &old_embedded_path, - vector &new_snarl_handles, id_t &source_id, - id_t &sink_id) { - cerr << "move_path_to_snarl" << endl; + vector &new_snarl_handles, id_t &new_source_id, + id_t &new_sink_id, const id_t &old_source_id, + const id_t &old_sink_id) { + // cerr << "move_path_to_snarl" << endl; + // //TODO: debug_statement: + // cerr << "path name: " + // << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // << endl; + // cerr << "source: " << new_source_id << " sink: " << new_sink_id << endl; + // if (graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) == + // "chr10") { + // cerr << "\t\tstart and end of old embedded path: " + // << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) + // << "end id" + // << graph.get_id(graph.get_handle_of_step(old_embedded_path.second)) << + // endl; + // } + // cerr << "#### handles in snarl (according to move_path_to_snarl): ####" << endl; + // for (handle_t handle : new_snarl_handles) { + // cerr << "\t" << graph.get_id(handle) << " " << graph.get_sequence(handle); + // } + // cerr << endl << endl; + // cerr << "~~~~~ Handles following each handle:" << endl; + // for (handle_t handle : new_snarl_handles) { + // cerr << "neighbors of handle " << graph.get_id(handle) << " (" + // < + // //todo: debug_statement + // cerr << "checking handles as start of path-seq" << endl; vector, int, int>> possible_paths; for (handle_t handle : new_snarl_handles) { string handle_seq = graph.get_sequence(handle); + // starting index is where the path would begin in the handle, // since it could begin in the middle of the handle. 
vector starting_indices = @@ -874,20 +1116,27 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // if there is a starting index, if (starting_indices.size() != 0) { - // if the starting_indices implies that the starting handle entirely contains - // the path_seq of interest: - - if ((handle_seq.size() - starting_indices.back()) >= path_seq.size()) { - // then we've already found the full mapping location of the path! Move - // the path, end the method. - vector new_path{handle}; - graph.rewrite_segment(old_embedded_path.first, old_embedded_path.second, - new_path); - return; - } else { - // add it as a possible_path. - vector possible_path_handle_vec{handle}; - for (auto starting_index : starting_indices) { + for (int starting_index : starting_indices) { + if ((handle_seq.size() - starting_index) >= path_seq.size() && + source_and_sink_handles_map_properly(graph, new_source_id, + new_sink_id, touching_source, + touching_sink, handle, handle)) { + // if the entire path fits inside the current handle, and if any + // paths that touched source and sink in the old snarl would be + // touching source and sink in the new snarl, then we've already + // found the full mapping location of the path! Move the path, end + // the method. + vector new_path{handle}; + graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, new_path); + // //todo: debug_statement + // cerr << "found a full mapping at " << graph.get_id(handle) + // << " w/ seq " << graph.get_sequence(handle) << endl; + return; + } else { + // this is a potential starting handle for the path. Add as a + // possible_path. 
+ vector possible_path_handle_vec{handle}; possible_paths.push_back( make_tuple(possible_path_handle_vec, starting_index, handle_seq.size() - starting_index)); @@ -896,6 +1145,19 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, } } + // //todo: debug_statement: + // cerr << "done checking handles as start of path seq" << endl; + + // //TODO: debug_statement: + // cerr << "possible paths so far: " << endl; + // for (tuple, int, int> path : possible_paths) { + // cerr << " possible start: "; + // for (handle_t handle : get<0>(path)) { + // cerr << graph.get_id(handle) << " "; + // } + // cerr << endl; + // } + // for every possible path, extend it to determine if it really is the path we're // looking for: while (!possible_paths.empty()) { @@ -904,10 +1166,22 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, tuple, int, int> possible_path_query = possible_paths.back(); possible_paths.pop_back(); + // //TODO: debug_statement: + // for (tuple, int, int> path : possible_paths) { + // cerr << "*\tpossible path query: "; + // for (handle_t handle : get<0>(possible_path_query)) { + // cerr << graph.get_id(handle) << " " << graph.get_sequence(handle) << " "; + // } + // cerr << endl; + // } + // extend the path through all right-extending edges to see if any subsequent // paths still satisfy the requirements for being a possible_path: bool no_path = graph.follow_edges( get<0>(possible_path_query).back(), false, [&](const handle_t &next) { + // //todo: debug_statement + // cerr << "next handle id and seq: " << graph.get_id(next) << " " + // << graph.get_sequence(next) << endl; // make a copy to be extended for through each possible next handle in // follow edges. 
tuple, int, int> possible_path = possible_path_query; @@ -916,107 +1190,217 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, string next_seq = graph.get_sequence(next); id_t next_id = graph.get_id(next); int &cur_index_in_path = get<2>(possible_path); - - // if the next handle would be the ending handle for the path, - if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { - // check to see if the sequence in the handle is suitable for ending - // the path: - int compare_length = path_seq.size() - cur_index_in_path; - if (next_seq.compare(0, compare_length, path_seq, cur_index_in_path, - compare_length) == 0) { - // we've found the new path! Move path to the new sequence, and - // end the function. - - if (compare_length < next_seq.size()) { - // If the path ends before the end of next_seq, then split the - // handle so that the path ends flush with the end of the - // first of the two split handles. - - // divide the handle where the path ends; - pair divided_next = - graph.divide_handle(next, compare_length); - get<0>(possible_path).push_back(divided_next.first); - - // Special case if next is the sink or the source, to preserve - // the reassignment of source and sink ids in integrate_snarl. - if (next_id = sink_id) { - sink_id = graph.get_id(divided_next.second); + if (cur_index_in_path <= path_seq.size() && + (find(new_snarl_handles.cbegin(), new_snarl_handles.cend(), next) != + new_snarl_handles.cend())) { + // if the next handle would be the ending handle for the path, + if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { + // cerr << "next handle would be the ending handle for the path" + // << endl; check to see if the sequence in the handle is suitable + // for ending the path: + int compare_length = path_seq.size() - cur_index_in_path; + + // //todo: debug_statement + // cerr << "about to compare. 
compare val: " + // << (next_seq.compare(0, compare_length, path_seq, + // cur_index_in_path, compare_length) == + // 0) + // << " source_and_sink_handles_map " + // << source_and_sink_handles_map_properly( + // graph, new_source_id, new_sink_id, touching_source, + // touching_sink, get<0>(possible_path).front(), next) + // << endl; + if ((next_seq.compare(0, compare_length, path_seq, + cur_index_in_path, compare_length) == 0) && + source_and_sink_handles_map_properly( + graph, new_source_id, new_sink_id, touching_source, + touching_sink, get<0>(possible_path).front(), next)) { + // todo: debug_statement + // cerr << "compared." << endl; + + // we've found the new path! Move path to the new sequence, + // and end the function. + + if (compare_length < next_seq.size()) { + // If the path ends before the end of next_seq, then split + // the handle so that the path ends flush with the end of + // the first of the two split handles. + + // divide the handle where the path ends; + pair divided_next = + graph.divide_handle(next, compare_length); + get<0>(possible_path).push_back(divided_next.first); + + // Special case if next is the sink or the source, to + // preserve the reassignment of source and sink ids in + // integrate_snarl. + if (next_id == new_sink_id) { + new_sink_id = graph.get_id(divided_next.second); + } + + // TODO: NOTE: finding the old "next" handle is expensive. + // TODO: Use different container? + auto it = find(new_snarl_handles.begin(), + new_snarl_handles.end(), next); + + // replace the old invalidated handle with one of the new + // ones + *it = divided_next.first; + // stick the other new handle on the end of + // new_snarl_handles. + new_snarl_handles.push_back(divided_next.second); + + } else { + // otherwise, the end of the path already coincides with + // the end of the handle. In that case, just add it to the + // path. 
+ get<0>(possible_path).push_back(next); } + graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, + get<0>(possible_path)); + // //todo: debug_statement: + // cerr << "got a full path: "; + // for (handle_t handle : get<0>(possible_path)) { + // cerr << graph.get_id(handle) << " "; + // } + // cerr << endl; + + // we've already found the path. No need to keep looking for + // more paths. + return false; + } + } + // see if the next handle would be the continuation of the path, but + // not the end, + else { + + // check to see if the sequence in the handle is suitable for + // extending the path: + int compare_length = next_seq.size(); + // //todo: debug_statement + // cerr << "compare returned false" << endl; + // cerr << "compare in returned false: " + // << " next_seq len " << next_seq.size() << " compare_length + // " + // << compare_length << " path_seq len " << path_seq.size() + // << " cur_index_in_path " << cur_index_in_path << endl; + // cerr << "if statement eval: cur_index_in_path <= + // next_seq.size() " + // << (cur_index_in_path <= next_seq.size()) + // << " next_seq.compare(0, compare_length, path_seq, " + // "cur_index_in_path, compare_length) == 0) " + // << (next_seq.compare(0, compare_length, path_seq, + // cur_index_in_path, compare_length) == + // 0) + // << endl; + if (next_seq.compare(0, compare_length, path_seq, + cur_index_in_path, compare_length) == 0) { + // cerr << "compared in return false" << endl; + // extend the path + get<0>(possible_path).push_back(next); - // TODO: NOTE: finding the old "next" handle is expensive. - // TODO: Use different container? - auto it = find(new_snarl_handles.begin(), - new_snarl_handles.end(), next); - - // replace the old invalidated handle with one of the new ones - *it = divided_next.first; - // stick the other new handle on the end of new_snarl_handles. - new_snarl_handles.push_back(divided_next.second); + // update the current index in path_seq. 
+ get<2>(possible_path) += next_seq.size(); - } else { - // otherwise, the end of the path already coincides with the - // end of the handle. In that case, just add it to the path. - get<0>(possible_path).push_back(next); + // place back into possible_paths + possible_paths.push_back(possible_path); + // cerr << "extending the path!" << endl; } - graph.rewrite_segment(old_embedded_path.first, - old_embedded_path.second, - get<0>(possible_path)); - - return false; - } - } - // see if the next handle would be the continuation of the path, but not - // the end, - else { - // check to see if the sequence in the handle is suitable for - // extending the path: - int compare_length = next_seq.size(); - if (next_seq.compare(0, compare_length, path_seq, cur_index_in_path, - compare_length) == 0) { - // extend the path - get<0>(possible_path).push_back(next); - - // update the current index in path_seq. - get<2>(possible_path) += next_seq.size(); - - // place back into possible_paths - possible_paths.push_back(possible_path); } } // continue to iterate through follow_edges. return true; }); - // if we've found a complete path in the above follow_edges, then we've already - // moved the path, and we're done. + // //todo: debug_statement: + // if + // (graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // == + // "_alt_19f9bc9ad2826f58f113965edf36bb93740df46d_0") { + // cerr << "mystery node 4214930: " + // << graph.get_sequence(graph.get_handle(4214930)) << endl; + // } + + // if we've found a complete path in the above follow_edges, then we've + // already moved the path, and we're done. if (!no_path) { return; } } + // //todo: figure out how to do some better error message instead of cerr. // if we failed to find a path, show an error message. - // TODO: make this better! Throw an exception? cerr << "Warning! 
Didn't find a corresponding path of name " << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) - << " from the old snarl in the newly aligned snarl." << endl + << " from the old snarl in the newly aligned snarl. This snarl will not be " + "normalized." + "########################################################" + << endl << endl; + // throw graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)); + // assert(true && "Warning! Didn't find a corresponding path of name " + + // graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // + " from the old snarl in the newly aligned snarl."); +} + +/** Used to help move_path_to_snarl map paths from an old snarl to its newly + * normalized counterpart. In particular, ensures that any paths which touch the + * source and/or sink of the old snarl still do so in the new snarl (which is + * important to ensure that we don't break any paths partway through the snarl.) + * + * @param {HandleGraph} graph : the graph that contains the old and new snarl + * nodes. + * @param {id_t} new_source_id : the node id of the newly created source. + * @param {id_t} new_sink_id : the node id of the newly created sink. + * @param {bool} touching_source : true if the path is connected to the old + * source. + * @param {bool} touching_sink : true if the path is connected to the old + * sink. + * @param {handle_t} potential_source : proposed source for the path in the new snarl. + * @param {handle_t} potential_sink : proposed sink for the path in the new snarl. + * @return {bool} : true if the path satisfies the requirement + * that, if the original path covered the old source or sink, the new path also covers + * the same respective nodes in the new snarl. 
+ */ +bool source_and_sink_handles_map_properly( + const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, + const bool &touching_source, const bool &touching_sink, + const handle_t &potential_source, const handle_t &potential_sink) { + + bool path_map = false; + // cerr << "touching source? " << touching_source << "touching_sink" << touching_sink + // << "source is source?" << (graph.get_id(potential_source) == new_source_id) + // << " sink is sink: " << (graph.get_id(potential_sink) == new_sink_id) << endl; + if (touching_source && touching_sink) { + path_map = ((graph.get_id(potential_source) == new_source_id) && + (graph.get_id(potential_sink) == new_sink_id)); + } else if (touching_source) { + path_map = (graph.get_id(potential_source) == new_source_id); + } else if (touching_sink) { + path_map = (graph.get_id(potential_sink) == new_sink_id); + } else { + path_map = true; + } + // cerr << "path_map " << path_map << endl; + return path_map; } -// Determines whether some subsequence in a handle satisfies the condition of being the -// beginning of a path. -// If the path_seq is longer than the handle_seq, only checks subsequences that reach -// from the beginning/middle of the handle_seq to the end. If path_seq is shorter -// than handle_seq, checks for any substring of length path_seq within the -// handle_seq, as well as substrings smaller than length path_seq that extend beyond -// the current handle. +// Determines whether some subsequence in a handle satisfies the condition of being +// the beginning of a path. +// If the path_seq is longer than the handle_seq, only checks subsequences that +// reach from the beginning/middle of the handle_seq to the end. If path_seq is +// shorter than handle_seq, checks for any substring of length path_seq within +// the handle_seq, as well as substrings smaller than length path_seq that extend +// beyond the current handle. 
// Arguments: // handle_seq: the sequence in the handle we're trying to identify as a // start_of_path_seq. path_seq: the sequence in the path we're trying to find // starting points for in handle_seq -// Return: a vector of all potential starting index of the subsequence in the handle_seq. +// Return: a vector of all potential starting index of the subsequence in the +// handle_seq. vector check_handle_as_start_of_path_seq(const string &handle_seq, const string &path_seq) { vector possible_start_indices; - cerr << "check_handle_as_start_of_path_seq" << endl; // If the handle_seq.size <= path_seq.size, look for subsequences reaching from // beginning/middle of handle_seq to the end - where path_seq may run off the end // of this handle to the next in the snarl. @@ -1035,16 +1419,16 @@ vector check_handle_as_start_of_path_seq(const string &handle_seq, } } } - // if handle_seq.size > path_seq.size, look for any subsequence within handle_seq of - // path_seq.size, as well as any subsequence smaller than path_seq reaching from - // middle of handle_seq to the end of handle_seq. + // if handle_seq.size > path_seq.size, look for any subsequence within handle_seq + // of path_seq.size, as well as any subsequence smaller than path_seq reaching + // from middle of handle_seq to the end of handle_seq. else { // first, search through all handle_seq for any comparable subsequence of - // path_seq.size. Note: only differences between this for loop and above for loop - // is that handle_start_i stops at (<= path_seq.size() - handle_seq.size()), and - // subseq.size() = path_seq.size() + // path_seq.size. 
Note: only differences between this for loop and above for + // loop is that handle_start_i stops at (<= path_seq.size() - + // handle_seq.size()), and subseq.size() = path_seq.size() for (int handle_start_i = 0; - handle_start_i < (handle_seq.size() - path_seq.size()); handle_start_i++) { + handle_start_i <= (handle_seq.size() - path_seq.size()); handle_start_i++) { int subseq_size = path_seq.size(); // The path_seq subsequence of interest is from 0 to subseq_size; // The handle_seq subsequence of interest starts at handle_start_i @@ -1055,8 +1439,9 @@ vector check_handle_as_start_of_path_seq(const string &handle_seq, possible_start_indices.push_back(handle_start_i); } } - // second, search through the last few bases of handle_seq for the beginning of - // path_seq. Note: nearly identical for loop to the one in "if (handle_seq.size() + // second, search through the last few bases of handle_seq for the beginning + // of path_seq. Note: nearly identical for loop to the one in "if + // (handle_seq.size() // <= path_seq.size())" for (int handle_start_i = (handle_seq.size() - path_seq.size() + 1); handle_start_i < handle_seq.size(); handle_start_i++) { @@ -1071,8 +1456,8 @@ vector check_handle_as_start_of_path_seq(const string &handle_seq, } } } - // Note: if we passed through the above check without returning anything, then there - // isn't any satisfactory subsequence. + // Note: if we passed through the above check without returning anything, then + // there isn't any satisfactory subsequence and we'll return an empty vector. return possible_start_indices; } @@ -1084,7 +1469,7 @@ vector check_handle_as_start_of_path_seq(const string &handle_seq, // snarl, there should only be one source and sink each. 
pair, vector> debug_get_sources_and_sinks(const HandleGraph &graph) { - cerr << "debug_get_source_and_sinks" << endl; + // cerr << "debug_get_source_and_sinks" << endl; vector sink; vector source; @@ -1108,28 +1493,27 @@ debug_get_sources_and_sinks(const HandleGraph &graph) { sink.emplace_back(handle); } }); - return pair, vector>(source, sink); } -// Runs through the whole snarl and generates all possible strings representing walks from -// source to sink. Generates a combinatorial number of possible paths with splits in the -// snarl. +// Runs through the whole snarl and generates all possible strings representing walks +// from source to sink. Generates a combinatorial number of possible paths with splits +// in the snarl. vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, id_t start_id, id_t sink_id) { - cerr << "debug_graph_to_strings" << endl; + // cerr << "debug_graph_to_strings" << endl; SubHandleGraph snarl = extract_subgraph(graph, start_id, sink_id); unordered_map> sequences; vector sinks; unordered_map count; - count.reserve(snarl.get_node_count()); // resize count to contain enough buckets for - // size of snarl + count.reserve(snarl.get_node_count()); // resize count to contain enough buckets + // for size of snarl sequences.reserve(snarl.get_node_count()); // resize sequences to contain enough // buckets for size of snarl - // identify sources and sinks //TODO: once we've established that this fxn works, we - // can just use start_id and sink_id. + // identify sources and sinks //TODO: once we've established that this fxn works, + // we can just use start_id and sink_id. 
snarl.for_each_handle([&](const handle_t &handle) { bool is_source = true, is_sink = true; snarl.follow_edges(handle, true, [&](const handle_t &prev) { diff --git a/src/algorithms/0_draft_haplotype_realignment.hpp b/src/algorithms/0_draft_haplotype_realignment.hpp index cb31e1a7768..889f086b17c 100644 --- a/src/algorithms/0_draft_haplotype_realignment.hpp +++ b/src/algorithms/0_draft_haplotype_realignment.hpp @@ -53,9 +53,9 @@ bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id); -pair>, vector>> -extract_gbwt_haplotypes(const GBWTGraph &graph, const id_t &source_id, - const id_t &sink_id); +tuple>, vector>, unordered_set> +extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &graph, + const id_t &source_id, const id_t &sink_id); vector> find_haplotypes_not_at_source(const GBWTGraph &haploGraph, @@ -66,7 +66,7 @@ vector format_handle_haplotypes_to_strings( const GBWTGraph &haploGraph, const vector> &haplotype_handle_vectors); -VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotypes); +VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes); vector> extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, @@ -81,8 +81,17 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, const HandleGraph & void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, const pair &old_embedded_path, - vector &new_snarl_handles, id_t &source_id, - id_t &sink_id); + vector &new_snarl_handles, id_t &new_source_id, + id_t &new_sink_id, const id_t &old_source_id, + const id_t &old_sink_id); + +bool source_and_sink_handles_map_properly(const HandleGraph &graph, + const id_t &new_source_id, + const id_t &new_sink_id, + const bool &touching_source, + const bool &touching_sink, + const handle_t &potential_source, + const handle_t &potential_sink); vector check_handle_as_start_of_path_seq(const string &handle_seq, const 
string &path_seq); diff --git a/src/algorithms/0_draft_snarl_normalization_evaluation.cpp b/src/algorithms/0_draft_snarl_normalization_evaluation.cpp new file mode 100644 index 00000000000..b92550d1d23 --- /dev/null +++ b/src/algorithms/0_draft_snarl_normalization_evaluation.cpp @@ -0,0 +1,32 @@ +#pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? +#include +#include +#include "../snarls.hpp" +namespace vg { + +/** + * evaluates the performance of the snarl normalizer. + * 42794 - another snarl with the issue where the path jumps over a node of interest? Is this my fault or..? + * 43899 + * 71109 + * stops for several minuts at 2049699 b/c large size snarl now at 1:40 + */ + +void evaluate_normalized_snarls(ifstream &snarl_stream) { + cerr << "evaluate_normalized_snarls" << endl; + SnarlManager *snarl_manager = new SnarlManager(snarl_stream); + + // Use this code to count number of snarls in graph. + int top_count = 0; + for (const Snarl* snarl : snarl_manager->top_level_snarls()){ + top_count++; + } + cerr << "number of top_level snarls in graph: " << top_count << endl; + int general_count = 0; + snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ + general_count++; + }); + cerr << "number of total snarls in graph: " << general_count << endl; + +} +} \ No newline at end of file diff --git a/src/msa_converter.cpp b/src/msa_converter.cpp index e191a2361a5..fdbbf21882e 100644 --- a/src/msa_converter.cpp +++ b/src/msa_converter.cpp @@ -36,84 +36,112 @@ using namespace std; }; if (format == "seqan") { - // conservation line starts with int. 
- unordered_set conservation_chars{'0','1','2','3','4','5','6','7','8','9'}; - - auto is_conservation_line = [&](string& line) { - bool conservation_line = false; - for (char c : line) { - if (!isspace(c)) { - if (conservation_chars.count(c)) { - conservation_line = true; - } - else { - conservation_line = false; - } - break; - } - } - return conservation_line; - }; - - auto is_blank = [](const string& str) { - return all_of(str.begin(), str.end(), [](char c){return isspace(c);}); - }; - - auto is_draw_line = [&](string& line) { - bool draw_line = false; - for (char c : line) { - if (!isspace(c)) { - if (c == '|') { - draw_line = true; - } - else { - draw_line = false; - } - break; - } - } - return draw_line; - }; + // in actuality, this works with an istream that has just the alignment + // sequence rows from a seqan alignment file. To parse the full file format, + // see commented-out code below. - // removes leading whitespace of line. - auto get_next_line = [&](istream& in) { - string line; - getline(in, line); + // remove first empty line. 
+ string line; + getline(in, line); - for (auto it = line.begin(); it != line.end(); it++){ - if (!isspace(*it)) { - line.erase(line.begin(), it); - break; - } - } - return line; - }; + int seq_count = 0; // make an alignment block alignments.emplace_back(); auto& alignment = alignments.back(); + // alignment[to_string(seq_count)].append(line); - int seq_count = 0; - string line; - line = get_next_line(in); + // while there's still more lines to extract from the istream: while (!in.eof()) { - if (is_conservation_line(line)){ - seq_count = 0; - } - else if (!is_draw_line(line) && !is_blank(line)) { - auto iter = alignment.find(to_string(seq_count)); - if (iter != alignment.end()) { - iter->second.append(line); - } - else { - alignment[to_string(seq_count)] = line; - } - seq_count++; - } - line = get_next_line(in); - + getline(in, line); + alignment[to_string(seq_count)].append(line); + seq_count++; } + + //Note: this is old code, which parses the full seqan align file format. + // Above code instead works with all the rows in a seqan Align object, + // appended to a stream with \n for separators. (rows accessed with rows(align)). + + + // // conservation line starts with int. 
+ // unordered_set conservation_chars{'0','1','2','3','4','5','6','7','8','9'}; + + // auto is_conservation_line = [&](string& line) { + // bool conservation_line = false; + // for (char c : line) { + // if (!isspace(c)) { + // if (conservation_chars.count(c)) { + // conservation_line = true; + // } + // else { + // conservation_line = false; + // } + // break; + // } + // } + // return conservation_line; + // }; + + // auto is_blank = [](const string& str) { + // return all_of(str.begin(), str.end(), [](char c){return isspace(c);}); + // }; + + // auto is_draw_line = [&](string& line) { + // bool draw_line = false; + // for (char c : line) { + // if (!isspace(c)) { + // if (c == '|') { + // draw_line = true; + // } + // else { + // draw_line = false; + // } + // break; + // } + // } + // return draw_line; + // }; + + // // removes leading whitespace of line. + // auto get_next_line = [&](istream& in) { + // string line; + // getline(in, line); + + // for (auto it = line.begin(); it != line.end(); it++){ + // if (!isspace(*it)) { + // line.erase(line.begin(), it); + // break; + // } + // } + // return line; + // }; + + // // make an alignment block + // alignments.emplace_back(); + // auto& alignment = alignments.back(); + + // int seq_count = 0; + // string line; + // line = get_next_line(in); + // while (!in.eof()) { + // if (is_conservation_line(line)){ + // seq_count = 0; + // } + // else if (!is_draw_line(line) && !is_blank(line)) { + // auto iter = alignment.find(to_string(seq_count)); + // if (iter != alignment.end()) { + // iter->second.append(line); + // } + // else { + // alignment[to_string(seq_count)] = line; + // } + // seq_count++; + // } + // line = get_next_line(in); + + // } + } diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index c2ef6e9011d..7b79a09694a 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -10,6 +10,7 @@ #include 
"../algorithms/0_draft_haplotype_realignment.hpp" #include "../gbwt_helper.hpp" #include "../../include/vg/io/vpkg.hpp" +#include "../algorithms/0_draft_snarl_normalization_evaluation.cpp" using namespace std; @@ -33,6 +34,7 @@ int main_normalize(int argc, char** argv) { return 1; } + bool evaluate = false; bool normalize = false; string gbwt; string snarls; @@ -46,11 +48,12 @@ int main_normalize(int argc, char** argv) { {"help", no_argument, 0, 'h'}, {"gbwt", required_argument, 0, 'g'}, {"snarls", required_argument, 0, 's'}, + {"evaluate", no_argument, 0, 'e'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "g:s:h", + c = getopt_long (argc, argv, "g:s:eh", long_options, &option_index); @@ -64,12 +67,18 @@ int main_normalize(int argc, char** argv) { case 'g': gbwt = optarg; normalize = true; + break; + case 'e': + evaluate = true; + break; + case 's': snarls = optarg; + break; - // default: //TODO: get this to work, instead of always causing crash. - // abort(); + default: + abort(); } } @@ -101,87 +110,21 @@ int main_normalize(int argc, char** argv) { // run test code on all snarls in graph. 
disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream); + } - // /// Run test code on a single snarl: - // vg::id_t source = 23493; vg::id_t sink = 23505; - // disambiguate_snarl(*graph, haploGraph, source, sink); - + if ( evaluate ) { + std::ifstream snarl_stream; + string snarl_file = snarls; + snarl_stream.open(snarl_file); + cerr << "about to evaluate normalized snarls" << endl; + vg::evaluate_normalized_snarls(snarl_stream); } - graph->serialize(std::cout); + // graph->serialize(std::cout); delete graph; return 0; } // Register subcommand -static Subcommand vg_normalize("normalize", "edit snarls to reduce information duplication", TOOLKIT, main_normalize); - - - - - - - - - - - - - - - - - -//TODO: Remove JUNK: - - // vg::id_t source = 23251;//for robin_haplotypes/simple - // vg::id_t sink = 23257;//for robin_haplotypes/simple - // /Testing gbwt_helper.hpp's for_each_kmer function. This issue is that I don't know how to construct a gbwt::GBWT haplotypes object. Nor do I know how to determine what size k I should use. - // vg::id_t source = 23251;//for robin_haplotypes/simple - // vg::id_t sink = 23257;//for robin_haplotypes/simple - // clean_snarl_from_haplotypes(*graph, source, sink); - // cerr << "done!" << endl; - // vg::handle_t source_handle = graph->get_handle(source); - // vg::handle_t sink_handle = graph->get_handle(sink); - - // vector haplotypes = depth_first_haplotypes_to_strings(*graph, source, sink); - // cerr << "finished depth_first, now on to reference." << endl; - // vector reference = get_paths(*graph, source_handle, sink_handle); - - // haplotypes.insert(end(haplotypes), begin(reference), end(reference)); - - // cerr << "here goes!" 
<< endl; - // for(string haplotype : haplotypes) { - - // cerr << haplotype << endl; - // } - // cerr << "done" << endl; - - - - - - - - - - - - - - - - - // std::ifstream snarl_stream; - // snarl_stream.open(demo_0); - - // if (!snarl_stream) { - // cerr << "error:[vg mod] Cannot open Snarls file " << demo_0 << endl; - // exit(1); - // } - - // clean_all_snarls(*graph, snarl_stream); - - // string gbwt_name = "test/robin_haplotypes/simple/chr10_subgraph_2dels-shift-729006.gbwt"; - +static Subcommand vg_normalize("normalize", "edit snarls to reduce information duplication", TOOLKIT, main_normalize); \ No newline at end of file From 537ef1e0f691762e5cb15b27144ee975969d2504 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 9 Aug 2019 09:42:52 -0700 Subject: [PATCH 31/63] extract_gbwt_haplotypes now catches incorrect gbwt connections between handles within the snarl that have no connecting edge in the graph. --- .../0_draft_haplotype_realignment.cpp | 179 ++++++++++++------ .../0_draft_haplotype_realignment.hpp | 15 +- ...0_draft_snarl_normalization_evaluation.cpp | 5 + src/subcommand/0_normalize_main.cpp | 61 +++--- 4 files changed, 167 insertions(+), 93 deletions(-) diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index e886893e1b0..e5fc0f7aae6 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -25,7 +25,6 @@ #include "../types.hpp" #include "extract_containing_graph.hpp" - namespace vg { // TODO: allow for snarls that have haplotypes that begin or end in the middle of the @@ -58,18 +57,30 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, int num_snarls_normalized = 0; int num_snarls_skipped = 0; vector snarl_roots = snarl_manager->top_level_snarls(); - bool success; - for (auto roots : snarl_roots) { + // error_record's bools are: + // + tuple one_snarl_error_record; + tuple full_error_record; + 
int num_of_errors = 4; - // if (roots->start().node_id() == 1883 or roots->start().node_id() == 4211565) { - cerr << "disambiguating snarl #" - << (num_snarls_normalized + num_snarls_skipped) - << " source: " << roots->start().node_id() - << " sink: " << roots->end().node_id() << endl; - success = disambiguate_snarl(graph, haploGraph, roots->start().node_id(), - roots->end().node_id()); + for (auto roots : snarl_roots) { - if (success) { + // if (roots->start().node_id() == 4181165) { + // cerr << "disambiguating snarl #" << (num_snarls_normalized + + // num_snarls_skipped) + // << " source: " << roots->start().node_id() + // << " sink: " << roots->end().node_id() << endl; + one_snarl_error_record = disambiguate_snarl( + graph, haploGraph, roots->start().node_id(), roots->end().node_id()); + get<0>(full_error_record) += get<0>(one_snarl_error_record); + get<1>(full_error_record) += get<1>(one_snarl_error_record); + get<2>(full_error_record) += get<2>(one_snarl_error_record); + get<3>(full_error_record) += get<3>(one_snarl_error_record); + if (!(get<0>(one_snarl_error_record) || get<1>(one_snarl_error_record) || + get<2>(one_snarl_error_record) || get<3>(one_snarl_error_record))) { num_snarls_normalized += 1; } else { num_snarls_skipped += 1; @@ -78,11 +89,14 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, } cerr << endl << "normalized " << num_snarls_normalized << " snarl(s), skipped " - << num_snarls_skipped - << " snarls b/c they had haplotypes starting/ending in the middle " - "of the snarl, the snarl was cyclic, it exceeded the size limit, or there " - "were handles not connected by the gbwt info." - << endl; + << num_snarls_skipped << " snarls because. . 
.\nthey exceeded the size limit (" + << get<0>(full_error_record) + << "snarls),\nhad haplotypes starting/ending in the middle of the snarl (" + << get<1>(full_error_record) << "),\nthe snarl was cyclic (" + << get<3>(full_error_record) + << " snarls),\nor there " + "were handles not connected by the gbwt info (" + << get<2>(full_error_record) << " snarls)." << endl; /// Args: /// source graph to extract subgraph from @@ -115,16 +129,22 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, // Returns: none. // TODO: allow for snarls that have haplotypes that begin or end in the middle of the // snarl. -bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, - const GBWTGraph &haploGraph, const id_t &source_id, - const id_t &sink_id) { +tuple disambiguate_snarl(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, + const id_t &source_id, + const id_t &sink_id) { // cerr << "disambiguate_snarl" << endl; - + // error_record's bools are: + // + tuple error_record{0, 0, 0, 0}; SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); if (!algorithms::is_acyclic(&snarl)) { cerr << "snarl at " << source_id << " is cyclic. Skipping." << endl; - return false; + get<3>(error_record) = true; } // First, find all haplotypes encoded by the GBWT, in order to create the new snarl. @@ -137,9 +157,7 @@ bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, // ( needs the unordered_set from extract_gbwt haplotypes to be equal to the number of // handles in the snarl). int handles_in_snarl = 0; - snarl.for_each_handle([&](const handle_t handle) { - handles_in_snarl++; - }); + snarl.for_each_handle([&](const handle_t handle) { handles_in_snarl++; }); // TODO: this if statement removes snarls where a haplotype begins/ends in the middle // TODO: of the snarl. Get rid of this once alignment issue is addressed! 
@@ -163,9 +181,7 @@ bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, vector> embedded_paths = extract_embedded_paths_in_snarl(graph, source_id, sink_id); - // todo: debug_statment (lots of em) - // cerr << "\t gonna try to find an embedded path stretching from source to sink!" - // << endl; find the paths that stretch from source to sink: + // find the paths that stretch from source to sink: for (auto path : embedded_paths) { // cerr << "checking path of name " << // graph.get_path_name(graph.get_path_handle_of_step(path.first)) << " with @@ -192,36 +208,38 @@ bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, // Align the new snarl: VG new_snarl = align_source_to_sink_haplotypes(haplotypes_from_source_to_sink); + // todo: make 32 a part of the general object maximum handle_size info. + force_maximum_handle_size(new_snarl, 32); + // todo: debug_statement // new_snarl.for_each_handle([&](const handle_t& handle) { // cerr << new_snarl.get_id(handle) << " " << new_snarl.get_sequence(handle) // << "\t"; // }); - // //todo: debug_statement: - // new_snarl.serialize_to_ostream(cout); - // return true; - // integrate the new_snarl into the graph, removing the old snarl as you go. integrate_snarl(graph, new_snarl, embedded_paths, source_id, sink_id); - cerr << endl; - return true; + return error_record; } else { if (!get<1>(haplotypes).empty()) { - cerr << "found a snarl with haplotypes in the middle. Start: " << source_id - << " sink is " << sink_id << ". Skipping." << endl; + cerr << "found a snarl starting at " << source_id << " and ending at " << sink_id + << " with haplotypes that start or end in the middle. Skipping." << endl; + get<1>(error_record) = true; } if (get<0>(haplotypes).size() > 200) { - cerr << "found a snarl with too many haplotypes (" + cerr << "found a snarl starting at " << source_id << " and ending at " << sink_id << " with too many haplotypes (" << get<0>(haplotypes).size() << ") to efficiently align. Skipping." 
<< endl; + get<0>(error_record) = true; } if (get<2>(haplotypes).size() != handles_in_snarl) { - cerr << "some handles in the snarl aren't accounted for by the gbwt graph. " + cerr << "some handles in the snarl starting at " << source_id << " and ending at " << sink_id + << " aren't accounted for by the gbwt graph. " "Skipping." << endl; + get<2>(error_record) = true; } - return false; + return error_record; } } @@ -270,6 +288,10 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph vector> haplotypes_from_source_to_sink; vector> other_haplotypes; + // sometimes a gbwt thread will indicate a connection between two handles that doesn't + // actually exist in the graph. These connections need to be ignored. + unordered_set incorrect_connections; + // int prev_size = 0; // for every partly-extracted thread, extend the thread until it either reaches // the sink of the snarl or the end of the thread. @@ -308,14 +330,29 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph // the other_haplotypes if haplotype ends before reaching sink. for (gbwt::SearchState next_search : next_searches) { handle_t next_handle = haploGraph.node_to_handle(next_search.node); - if (!snarl.has_node(snarl.get_id(next_handle))) { - cerr << "snarl starting at node " << source_id - << " has a thread that incorrectly connects a node contained " - "within the snarl (node " - << haploGraph.get_id(cur_haplotype.first.back()) - << ") and a node outside the snarl (" - << haploGraph.get_id(next_handle) - << "). This thread connection will be ignored." 
<< endl; + // if (!snarl.has_node(snarl.get_id(next_handle)) && + // make_pair(haploGraph.get_id(cur_haplotype.first.back()),haploGraph.get_id(next_handle))) + // { + if (!snarl.has_edge(cur_haplotype.first.back(), next_handle)) { + if (incorrect_connections.find( + snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == + incorrect_connections.end()) { + cerr << "snarl starting at node " << source_id << " and ending at " << sink_id + << " has a thread that incorrectly connects two nodes that don't have any edge connecting them. These two nodes are " + << haploGraph.get_id(cur_haplotype.first.back()) + << " and " + << haploGraph.get_id(next_handle) + << ". This thread connection will be ignored." << endl; + incorrect_connections.emplace(snarl.edge_handle( + cur_haplotype.first.back(), next_handle)); + + //todo: debug_statement + cerr << "next handle(s) of handle " << snarl.get_id(cur_haplotype.first.back()) << " according to snarl:" << endl; + snarl.follow_edges(cur_haplotype.first.back(), false, [&] (const handle_t handle){ + cerr << "\t" < of cur_haplotype: @@ -580,7 +617,7 @@ vector format_handle_haplotypes_to_strings( // VG object representing the newly realigned snarl. VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes) { // cerr << "align_source_to_sink_haplotypes" << endl; - cerr << "number of strings to align: " << source_to_sink_haplotypes.size() << endl; + // cerr << "number of strings to align: " << source_to_sink_haplotypes.size() << endl; // TODO: make the following comment true, so that I can normalize haplotypes that // TODO: aren't source_to_sink by adding a similar special character to strings in // TODO: the middle of the snarl. @@ -676,6 +713,39 @@ VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes) { return snarl; } +/** For each handle in a given graph, divides any handles greater than max_size into parts + * that are equal to or less than the size of max_size. 
+ * + * @param {MutableHandleGraph} graph : the graph in which we want to force a maximum + * handle size for all handles. + * @param {size_t} max_size : the maximum size we want a handle to be. + */ +void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size) { + // forcing each handle in the graph to have a maximum sequence length of max_size: + graph.for_each_handle([&](handle_t handle) { + // all the positions we want to make in the handle are in offsets. + vector offsets; + + size_t sequence_len = graph.get_sequence(handle).size(); + int number_of_divisions = floor(sequence_len / max_size); + + // if the handle divides evenly into subhandles of size max_size, we don't need to + // make the last cut (which would be at the very end of the handle - cutting off + // no sequence). + if (sequence_len % max_size == 0) { + number_of_divisions--; + } + + // calculate the position of all the divisions we want to make. + for (int i = 1; i <= number_of_divisions; i++) { + offsets.push_back(i * max_size); + } + + // divide the handle into parts. + graph.divide_handle(handle, offsets); + }); +} + // Finds all embedded paths that either start or end in a snarl (or both) defined by // source_id, sink_id. // returns a vector of the embedded paths, where each entry in the vector is defined @@ -892,7 +962,7 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, // cerr << to_insert_snarl.get_id(handle) << " " // << to_insert_snarl.get_sequence(handle) << " \t"; // }); - cerr << endl; + // cerr << endl; // Get old graph snarl SubHandleGraph old_snarl = extract_subgraph(graph, source_id, sink_id); @@ -905,7 +975,8 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, if (to_insert_snarl_defining_handles.first.size() > 1 || to_insert_snarl_defining_handles.second.size() > 1) { - cerr << "ERROR: newly made snarl with more than one start or end. 
# of starts: " + cerr << "ERROR: newly made snarl from a snarl starting at " << source_id + << " has more than one start or end. # of starts: " << to_insert_snarl_defining_handles.first.size() << " # of ends: " << to_insert_snarl_defining_handles.second.size() << endl; return; @@ -1330,11 +1401,13 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, } // //todo: figure out how to do some better error message instead of cerr. // if we failed to find a path, show an error message. - cerr << "Warning! Didn't find a corresponding path of name " + cerr << "##########################\nWarning! Didn't find a corresponding path of " + "name " << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) - << " from the old snarl in the newly aligned snarl. This snarl will not be " - "normalized." - "########################################################" + << " from the old snarl at " << old_source_id + << " in the newly aligned snarl. This snarl WILL be " + "normalized, resulting in a probably incorrectly-constructed snarl." 
+ "\n##########################" << endl << endl; // throw graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)); diff --git a/src/algorithms/0_draft_haplotype_realignment.hpp b/src/algorithms/0_draft_haplotype_realignment.hpp index 889f086b17c..c3c81c86426 100644 --- a/src/algorithms/0_draft_haplotype_realignment.hpp +++ b/src/algorithms/0_draft_haplotype_realignment.hpp @@ -49,7 +49,7 @@ namespace vg { void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, ifstream &snarl_stream); -bool disambiguate_snarl(MutablePathDeletableHandleGraph &graph, +tuple disambiguate_snarl(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id); @@ -68,6 +68,8 @@ vector format_handle_haplotypes_to_strings( VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes); +void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); + vector> extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, const id_t &sink_id); @@ -85,13 +87,10 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, id_t &new_sink_id, const id_t &old_source_id, const id_t &old_sink_id); -bool source_and_sink_handles_map_properly(const HandleGraph &graph, - const id_t &new_source_id, - const id_t &new_sink_id, - const bool &touching_source, - const bool &touching_sink, - const handle_t &potential_source, - const handle_t &potential_sink); +bool source_and_sink_handles_map_properly( + const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, + const bool &touching_source, const bool &touching_sink, + const handle_t &potential_source, const handle_t &potential_sink); vector check_handle_as_start_of_path_seq(const string &handle_seq, const string &path_seq); diff --git a/src/algorithms/0_draft_snarl_normalization_evaluation.cpp b/src/algorithms/0_draft_snarl_normalization_evaluation.cpp index 
b92550d1d23..758d4ea4d71 100644 --- a/src/algorithms/0_draft_snarl_normalization_evaluation.cpp +++ b/src/algorithms/0_draft_snarl_normalization_evaluation.cpp @@ -10,6 +10,11 @@ namespace vg { * 43899 * 71109 * stops for several minuts at 2049699 b/c large size snarl now at 1:40 + * Stats of the new graph:number of top_level snarls in graph: 3418 + * number of total snarls in graph: 7443 + * nodes 4218503 + * edges 4227433 + * length 134506805 */ void evaluate_normalized_snarls(ifstream &snarl_stream) { diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 7b79a09694a..0ae69826d5a 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -1,23 +1,23 @@ -// mod.cpp: define the "normalize" subcommand, which realigns snarls to produce more efficient representations of snarls. +// mod.cpp: define the "normalize" subcommand, which realigns snarls to produce more +// efficient representations of snarls. +#include #include #include -#include #include "subcommand.hpp" #include "../../include/sglib/hash_graph.hpp" -#include "../algorithms/0_draft_haplotype_realignment.hpp" -#include "../gbwt_helper.hpp" #include "../../include/vg/io/vpkg.hpp" +#include "../algorithms/0_draft_haplotype_realignment.hpp" #include "../algorithms/0_draft_snarl_normalization_evaluation.cpp" - +#include "../gbwt_helper.hpp" using namespace std; using namespace vg; using namespace vg::subcommand; -void help_normalize(char** argv) { +void help_normalize(char **argv) { cerr << "usage: " << argv[0] << " normalize [options] >[mod.vg]" << endl << "Modifies snarls, outputs modified on stdout." << endl << endl @@ -26,8 +26,7 @@ void help_normalize(char** argv) { << " -s, --snarls snarls file corresponding to hashgraph." 
<< endl; } -int main_normalize(int argc, char** argv) { - +int main_normalize(int argc, char **argv) { if (argc == 2) { help_normalize(argv); @@ -44,31 +43,26 @@ int main_normalize(int argc, char** argv) { while (true) { static struct option long_options[] = - { - {"help", no_argument, 0, 'h'}, - {"gbwt", required_argument, 0, 'g'}, - {"snarls", required_argument, 0, 's'}, - {"evaluate", no_argument, 0, 'e'}, - {0, 0, 0, 0} - }; + {{"help", no_argument, 0, 'h'}, + {"gbwt", required_argument, 0, 'g'}, + {"snarls", required_argument, 0, 's'}, + {"evaluate", no_argument, 0, 'e'}, + {0, 0, 0, 0}}; int option_index = 0; - c = getopt_long (argc, argv, "g:s:eh", - long_options, &option_index); - + c = getopt_long(argc, argv, "g:s:eh", long_options, &option_index); // Detect the end of the options. if (c == -1) break; - switch (c) - { + switch (c) { case 'g': gbwt = optarg; normalize = true; break; - + case 'e': evaluate = true; break; @@ -82,13 +76,11 @@ int main_normalize(int argc, char** argv) { } } - sglib::HashGraph* graph; - get_input_file(optind, argc, argv, [&](istream& in) { - graph = new sglib::HashGraph(in); - }); + sglib::HashGraph *graph; + get_input_file(optind, argc, argv, + [&](istream &in) { graph = new sglib::HashGraph(in); }); - if ( normalize ) - { + if (normalize) { /// Build the gbwt: ifstream gbwt_stream; gbwt_stream.open(gbwt); @@ -101,7 +93,7 @@ int main_normalize(int argc, char** argv) { std::ifstream snarl_stream; string snarl_file = snarls; snarl_stream.open(snarl_file); - + if (!snarl_stream) { cerr << "error:[vg mod] Cannot open Snarls file " << snarl_file << endl; exit(1); @@ -109,10 +101,9 @@ int main_normalize(int argc, char** argv) { // run test code on all snarls in graph. 
disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream); - } - if ( evaluate ) { + if (evaluate) { std::ifstream snarl_stream; string snarl_file = snarls; snarl_stream.open(snarl_file); @@ -120,11 +111,17 @@ int main_normalize(int argc, char** argv) { vg::evaluate_normalized_snarls(snarl_stream); } - // graph->serialize(std::cout); + // TODO: NOTE: this may be cumbersome code if we decide to add more argument types. + // Consider changing. + if (normalize) { + graph->serialize(std::cout); + } delete graph; return 0; } // Register subcommand -static Subcommand vg_normalize("normalize", "edit snarls to reduce information duplication", TOOLKIT, main_normalize); \ No newline at end of file +static Subcommand vg_normalize("normalize", + "edit snarls to reduce information duplication", TOOLKIT, + main_normalize); \ No newline at end of file From 4186794c6d55016d62f22aacbad3eb7ae3c10aff Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 9 Aug 2019 10:21:26 -0700 Subject: [PATCH 32/63] removed 200 thread limit (to see how long it takes to normalize all snarls); also added timing to the main file. --- robin_bash/debug_normalize_snarl.sh | 49 ++++++++++ .../0_draft_haplotype_realignment.cpp | 93 ++++++++++--------- src/subcommand/0_normalize_main.cpp | 9 +- 3 files changed, 108 insertions(+), 43 deletions(-) create mode 100755 robin_bash/debug_normalize_snarl.sh diff --git a/robin_bash/debug_normalize_snarl.sh b/robin_bash/debug_normalize_snarl.sh new file mode 100755 index 00000000000..19dee860c9b --- /dev/null +++ b/robin_bash/debug_normalize_snarl.sh @@ -0,0 +1,49 @@ + +export VG_FULL_TRACEBACK=1 +set -e + +echo compiling! +. ./source_me.sh && make -j 8 +echo running! + +###Before and During Normalization +##running normalize_snarls on a full chromosome - local machine. 
+TEST_DIR=test/robin_tests/full_chr10 +FILE_NAME=hgsvc_chr10_construct +FILE_NAME_OUT=hgsvc_chr10_construct_normalized_no_max_size + +##running full chr10 +echo "running normalize (w/ evaluation)" +vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/$FILE_NAME_OUT.hg + +## for printing out the normalized subsnarl: +# vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >graph_out.vg +# ./bin/vg view -dpn graph_out.vg| \ +# dot -Tsvg -o graph_out.svg +# chromium-browser graph_out.svg + +## for extracting a prenormalized subgraph for looking at chr10 +# vg find -x test/robin_tests/full_chr10/hgsvc_chr10_construct.xg -n 1236806 -c 25 >graph_in.vg +# ./bin/vg view -dpn graph_in.vg| \ +# dot -Tsvg -o graph_in.svg +# chromium-browser graph_in.svg + + +### After Normalization: +# TEST_DIR=test/robin_tests/full_chr10 +# FILE_NAME_OUT=hgsvc_chr10_construct_normalized_2 + +## for making a snarls file: +vg convert -a $TEST_DIR/$FILE_NAME_OUT.hg -V >$TEST_DIR/$FILE_NAME_OUT.vg +echo "hg converted to vg" +vg snarls $TEST_DIR/$FILE_NAME_OUT.vg >$TEST_DIR/$FILE_NAME_OUT.snarls +echo ".snarls made" + +## for evaluating normalized graph: +echo "evaluating. . ." 
+# vg normalize -e -s $TEST_DIR/$FILE_NAME_OUT.snarls $TEST_DIR/$FILE_NAME_OUT.hg +vg stats -z -l $TEST_DIR/$FILE_NAME_OUT.vg + +## creating a new gbwt graph from the outgraph: +vg index -G $TEST_DIR/$FILE_NAME_OUT.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME_OUT.vg +echo "gbwt made" \ No newline at end of file diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index e5fc0f7aae6..7deaaea1069 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -25,6 +25,7 @@ #include "../types.hpp" #include "extract_containing_graph.hpp" + namespace vg { // TODO: allow for snarls that have haplotypes that begin or end in the middle of the @@ -69,22 +70,22 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, for (auto roots : snarl_roots) { // if (roots->start().node_id() == 4181165) { - // cerr << "disambiguating snarl #" << (num_snarls_normalized + - // num_snarls_skipped) - // << " source: " << roots->start().node_id() - // << " sink: " << roots->end().node_id() << endl; - one_snarl_error_record = disambiguate_snarl( - graph, haploGraph, roots->start().node_id(), roots->end().node_id()); - get<0>(full_error_record) += get<0>(one_snarl_error_record); - get<1>(full_error_record) += get<1>(one_snarl_error_record); - get<2>(full_error_record) += get<2>(one_snarl_error_record); - get<3>(full_error_record) += get<3>(one_snarl_error_record); - if (!(get<0>(one_snarl_error_record) || get<1>(one_snarl_error_record) || - get<2>(one_snarl_error_record) || get<3>(one_snarl_error_record))) { - num_snarls_normalized += 1; - } else { - num_snarls_skipped += 1; - } + // cerr << "disambiguating snarl #" << (num_snarls_normalized + + // num_snarls_skipped) + // << " source: " << roots->start().node_id() + // << " sink: " << roots->end().node_id() << endl; + one_snarl_error_record = disambiguate_snarl( + graph, haploGraph, 
roots->start().node_id(), roots->end().node_id()); + get<0>(full_error_record) += get<0>(one_snarl_error_record); + get<1>(full_error_record) += get<1>(one_snarl_error_record); + get<2>(full_error_record) += get<2>(one_snarl_error_record); + get<3>(full_error_record) += get<3>(one_snarl_error_record); + if (!(get<0>(one_snarl_error_record) || get<1>(one_snarl_error_record) || + get<2>(one_snarl_error_record) || get<3>(one_snarl_error_record))) { + num_snarls_normalized += 1; + } else { + num_snarls_skipped += 1; + } // } } cerr << endl @@ -163,8 +164,9 @@ tuple disambiguate_snarl(MutablePathDeletableHandleGraph // TODO: of the snarl. Get rid of this once alignment issue is addressed! // TODO: also, limits the number of haplotypes to be aligned, since snarl starting at // TODO: 2049699 with 258 haplotypes is taking many minutes. - if (get<1>(haplotypes).empty() && get<0>(haplotypes).size() < 200 && - get<2>(haplotypes).size() == handles_in_snarl) { + // if (get<1>(haplotypes).empty() && get<0>(haplotypes).size() < 200 && + // get<2>(haplotypes).size() == handles_in_snarl) { + if (get<1>(haplotypes).empty() && get<2>(haplotypes).size() == handles_in_snarl) { // if (get<1>(haplotypes).empty()) { // Convert the haplotypes from vector format to string format. vector haplotypes_from_source_to_sink = @@ -222,18 +224,20 @@ tuple disambiguate_snarl(MutablePathDeletableHandleGraph return error_record; } else { if (!get<1>(haplotypes).empty()) { - cerr << "found a snarl starting at " << source_id << " and ending at " << sink_id + cerr << "found a snarl starting at " << source_id << " and ending at " + << sink_id << " with haplotypes that start or end in the middle. Skipping." << endl; get<1>(error_record) = true; } - if (get<0>(haplotypes).size() > 200) { - cerr << "found a snarl starting at " << source_id << " and ending at " << sink_id << " with too many haplotypes (" - << get<0>(haplotypes).size() << ") to efficiently align. Skipping." 
- << endl; - get<0>(error_record) = true; - } + // if (get<0>(haplotypes).size() > 200) { + // cerr << "found a snarl starting at " << source_id << " and ending at " + // << sink_id << " with too many haplotypes (" << get<0>(haplotypes).size() + // << ") to efficiently align. Skipping." << endl; + // get<0>(error_record) = true; + // } if (get<2>(haplotypes).size() != handles_in_snarl) { - cerr << "some handles in the snarl starting at " << source_id << " and ending at " << sink_id + cerr << "some handles in the snarl starting at " << source_id + << " and ending at " << sink_id << " aren't accounted for by the gbwt graph. " "Skipping." << endl; @@ -337,22 +341,27 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph if (incorrect_connections.find( snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == incorrect_connections.end()) { - cerr << "snarl starting at node " << source_id << " and ending at " << sink_id - << " has a thread that incorrectly connects two nodes that don't have any edge connecting them. These two nodes are " - << haploGraph.get_id(cur_haplotype.first.back()) - << " and " - << haploGraph.get_id(next_handle) - << ". This thread connection will be ignored." 
<< endl; - incorrect_connections.emplace(snarl.edge_handle( - cur_haplotype.first.back(), next_handle)); - - //todo: debug_statement - cerr << "next handle(s) of handle " << snarl.get_id(cur_haplotype.first.back()) << " according to snarl:" << endl; - snarl.follow_edges(cur_haplotype.first.back(), false, [&] (const handle_t handle){ - cerr << "\t" < of cur_haplotype: diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 0ae69826d5a..78495058326 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -13,6 +13,8 @@ #include "../algorithms/0_draft_snarl_normalization_evaluation.cpp" #include "../gbwt_helper.hpp" +#include // for high_resolution_clock + using namespace std; using namespace vg; using namespace vg::subcommand; @@ -98,9 +100,14 @@ int main_normalize(int argc, char **argv) { cerr << "error:[vg mod] Cannot open Snarls file " << snarl_file << endl; exit(1); } - + // Record start time + auto start = std::chrono::high_resolution_clock::now(); // run test code on all snarls in graph. disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream); + // Record end time + auto finish = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed = finish - start; + std::cout << "Elapsed time: " << elapsed.count() << " s\n"; } if (evaluate) { From 56445faf3b39be6ba3595bb31ed656f3bbadda5c Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 9 Aug 2019 10:32:20 -0700 Subject: [PATCH 33/63] debugged timing code --- robin_bash/debug_normalize_snarl.sh | 8 ++------ src/algorithms/0_draft_haplotype_realignment.cpp | 4 ++-- src/subcommand/0_normalize_main.cpp | 6 +++--- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/robin_bash/debug_normalize_snarl.sh b/robin_bash/debug_normalize_snarl.sh index 19dee860c9b..9aeaa2eb936 100755 --- a/robin_bash/debug_normalize_snarl.sh +++ b/robin_bash/debug_normalize_snarl.sh @@ -11,6 +11,7 @@ echo running! 
TEST_DIR=test/robin_tests/full_chr10 FILE_NAME=hgsvc_chr10_construct FILE_NAME_OUT=hgsvc_chr10_construct_normalized_no_max_size +# FILE_NAME_OUT=junk ##running full chr10 echo "running normalize (w/ evaluation)" @@ -28,11 +29,7 @@ vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TES # dot -Tsvg -o graph_in.svg # chromium-browser graph_in.svg - ### After Normalization: -# TEST_DIR=test/robin_tests/full_chr10 -# FILE_NAME_OUT=hgsvc_chr10_construct_normalized_2 - ## for making a snarls file: vg convert -a $TEST_DIR/$FILE_NAME_OUT.hg -V >$TEST_DIR/$FILE_NAME_OUT.vg echo "hg converted to vg" @@ -40,8 +37,7 @@ vg snarls $TEST_DIR/$FILE_NAME_OUT.vg >$TEST_DIR/$FILE_NAME_OUT.snarls echo ".snarls made" ## for evaluating normalized graph: -echo "evaluating. . ." -# vg normalize -e -s $TEST_DIR/$FILE_NAME_OUT.snarls $TEST_DIR/$FILE_NAME_OUT.hg +echo "getting vg stats:" vg stats -z -l $TEST_DIR/$FILE_NAME_OUT.vg ## creating a new gbwt graph from the outgraph: diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_draft_haplotype_realignment.cpp index 7deaaea1069..57f953d7e9a 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_draft_haplotype_realignment.cpp @@ -69,7 +69,7 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, for (auto roots : snarl_roots) { - // if (roots->start().node_id() == 4181165) { + if (roots->start().node_id() == 4181165) { // cerr << "disambiguating snarl #" << (num_snarls_normalized + // num_snarls_skipped) // << " source: " << roots->start().node_id() @@ -86,7 +86,7 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, } else { num_snarls_skipped += 1; } - // } + } } cerr << endl << "normalized " << num_snarls_normalized << " snarl(s), skipped " diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 78495058326..bb0041638dc 100644 --- a/src/subcommand/0_normalize_main.cpp +++ 
b/src/subcommand/0_normalize_main.cpp @@ -101,13 +101,13 @@ int main_normalize(int argc, char **argv) { exit(1); } // Record start time - auto start = std::chrono::high_resolution_clock::now(); + auto start = chrono::high_resolution_clock::now(); // run test code on all snarls in graph. disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream); // Record end time auto finish = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed = finish - start; - std::cout << "Elapsed time: " << elapsed.count() << " s\n"; + chrono::duration elapsed = finish - start; + cerr << "Elapsed time: " << elapsed.count() << " s\n"; } if (evaluate) { From badf5f79a40649bcb2288ca29be64b20df10e442 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 9 Aug 2019 12:28:04 -0700 Subject: [PATCH 34/63] added shebang --- robin_bash/debug_normalize_snarl.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/robin_bash/debug_normalize_snarl.sh b/robin_bash/debug_normalize_snarl.sh index 9aeaa2eb936..fad23c806bf 100755 --- a/robin_bash/debug_normalize_snarl.sh +++ b/robin_bash/debug_normalize_snarl.sh @@ -1,3 +1,4 @@ +#!/bin/bash export VG_FULL_TRACEBACK=1 set -e From aa7166356d444676b7d26e2d823cea3f83171f18 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 27 Sep 2019 11:24:00 -0700 Subject: [PATCH 35/63] object-orientified the snarl normalizer, also renamed cpp/hpp files --- .../0_draft_haplotype_realignment.hpp | 161 ---- ...0_draft_snarl_normalization_evaluation.cpp | 1 + ...lignment.cpp => 0_oo_normalize_snarls.cpp} | 815 ++++++++++-------- src/algorithms/0_oo_normalize_snarls.hpp | 83 ++ src/subcommand/0_normalize_main.cpp | 62 +- 5 files changed, 569 insertions(+), 553 deletions(-) delete mode 100644 src/algorithms/0_draft_haplotype_realignment.hpp rename src/algorithms/{0_draft_haplotype_realignment.cpp => 0_oo_normalize_snarls.cpp} (68%) create mode 100644 src/algorithms/0_oo_normalize_snarls.hpp diff --git 
a/src/algorithms/0_draft_haplotype_realignment.hpp b/src/algorithms/0_draft_haplotype_realignment.hpp deleted file mode 100644 index c3c81c86426..00000000000 --- a/src/algorithms/0_draft_haplotype_realignment.hpp +++ /dev/null @@ -1,161 +0,0 @@ -/* -Robin Rounthwaite -Find function call in ./subcommand/main.cpp -*/ -#include "../gbwt_helper.hpp" -#include "../handle.hpp" -#include "../subgraph.hpp" -#include "../vg.hpp" -#include "count_walks.hpp" -#include - -/* TODO for improving haplotype_realignment. -Tomorrow: -* scale code upwards so that you can run code on every snarl in given graph. -* also add requirement that haps entering snarl = haps exiting snarl. -TODO: align haplotypes_not_at_source once we have a solution for alignments that insert -TODO: the haplotype in a specified location -TODO: (use more unique marker signals to identify where in other strings the -TODO: middle-haplotype should align?) - -TODO: consider splitting handles where embedded paths begin/end in the middle of a handle. -TODO: (Note: would need to dynamically change other paths containing that handle. :-/) -TODO: Or simply split the handles of interest and then realign the paths - expensive. -TODO: Or insert *yet another* marker char to id where embedded paths begin/end, so its -TODO: easily find where to split the handles afterwards. AND! it makes moving the -TODO: paths less expensive. -TODO: (fewer spots to check alignment in the snarl). If we have unique markers for -TODO: each path, then -TODO: it becomes O(N) time, instead of ~O(N*M*n) (N: number of bases in snarl; M: -TODO: number of bases in path; -TODO: n: number of potential starting places in the snarl (note: slightly less -TODO: expensive since n is -TODO: divided up among the M paths).) -TODO: this would also addres the possibility of an embedded path being moved to an -TODO: alternative location -TODO: when it overlaps a repetitive sequence. 
(previous thought, tho' above one is -TODO: better): do I Need -TODO: to account for this with a sense of "bases distant from source"? - -TODO: make it so that gbwt file is customized by user rather than hardcoded. - -TODO: make it so that you pass the gbwt file directory to a one-liner function -TODO: (ran in normalize_main) that generates gbwt graph, extracts haps, -TODO: aligns haps, and reintegrates haps. (eventually will do it for every -TODO: snarl in the given graph). - -*/ -namespace vg { -void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, - const GBWTGraph &haploGraph, ifstream &snarl_stream); - -tuple disambiguate_snarl(MutablePathDeletableHandleGraph &graph, - const GBWTGraph &haploGraph, const id_t &source_id, - const id_t &sink_id); - -tuple>, vector>, unordered_set> -extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &graph, - const id_t &source_id, const id_t &sink_id); - -vector> -find_haplotypes_not_at_source(const GBWTGraph &haploGraph, - unordered_set &touched_handles, - const id_t &sink_id); - -vector format_handle_haplotypes_to_strings( - const GBWTGraph &haploGraph, - const vector> &haplotype_handle_vectors); - -VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes); - -void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); - -vector> -extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, - const id_t &sink_id); - -SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, - const id_t &end_id); - -void integrate_snarl(MutablePathDeletableHandleGraph &graph, const HandleGraph &new_snarl, - const vector> embedded_paths, - const id_t &source_id, const id_t &sink_id); - -void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, - const pair &old_embedded_path, - vector &new_snarl_handles, id_t &new_source_id, - id_t &new_sink_id, const id_t &old_source_id, - const id_t &old_sink_id); - -bool 
source_and_sink_handles_map_properly( - const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, - const bool &touching_source, const bool &touching_sink, - const handle_t &potential_source, const handle_t &potential_sink); - -vector check_handle_as_start_of_path_seq(const string &handle_seq, - const string &path_seq); - -// -------------------------------- DEBUG CODE BELOW: ------------------------------------ - -pair, vector> -debug_get_sources_and_sinks(const HandleGraph &graph); - -vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, - id_t start_id, id_t end_id); - -vector debug_get_embedded_paths_from_source_to_sink(const PathHandleGraph &graph, - const handle_t &source_handle, - const handle_t &sink_handle); -} // namespace vg - -/* -Deleted stuff: - -void jordan_bug(MutablePathDeletableHandleGraph& graph){ - - // example with one node: - handle_t example = graph.get_handle(23448); - handle_t replacement = graph.create_handle("GATTACA", 1); - - // move the source edges: - //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could -in Python!) graph.follow_edges(example, true, - [&](const handle_t &prev_handle) { - graph.create_edge(prev_handle, replacement); - }); - graph.follow_edges(example, false, - [&](const handle_t &next_handle) { - graph.create_edge(replacement, next_handle); - }); - - // move the paths: - graph.for_each_step_on_handle(example, [&](step_handle_t step) - { - graph.rewrite_segment(step, graph.get_next_step(step), -vector{replacement}); - }); - - // example with two nodes: - handle_t example_1 = graph.get_handle(23450); - handle_t replacement_1 = graph.create_handle("GATTACA", 2); - handle_t replacement_2 = graph.create_handle("GATTACA", 3); - graph.create_edge(replacement_1, replacement_2); - - // move the source edges: - //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could -in Python!) 
graph.follow_edges(example_1, true, - [&](const handle_t &prev_handle) { - graph.create_edge(prev_handle, replacement_1); - }); - graph.follow_edges(example_1, false, - [&](const handle_t &next_handle) { - graph.create_edge(replacement_2, next_handle); - }); - - // move the paths: - graph.for_each_step_on_handle(example_1, [&](step_handle_t step) - { - graph.rewrite_segment(step, step, vector{replacement_1, replacement_2}); - }); -} - */ diff --git a/src/algorithms/0_draft_snarl_normalization_evaluation.cpp b/src/algorithms/0_draft_snarl_normalization_evaluation.cpp index 758d4ea4d71..6de2928065d 100644 --- a/src/algorithms/0_draft_snarl_normalization_evaluation.cpp +++ b/src/algorithms/0_draft_snarl_normalization_evaluation.cpp @@ -33,5 +33,6 @@ void evaluate_normalized_snarls(ifstream &snarl_stream) { }); cerr << "number of total snarls in graph: " << general_count << endl; + delete snarl_manager; } } \ No newline at end of file diff --git a/src/algorithms/0_draft_haplotype_realignment.cpp b/src/algorithms/0_oo_normalize_snarls.cpp similarity index 68% rename from src/algorithms/0_draft_haplotype_realignment.cpp rename to src/algorithms/0_oo_normalize_snarls.cpp index 57f953d7e9a..7e0eefe14be 100644 --- a/src/algorithms/0_draft_haplotype_realignment.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -1,13 +1,5 @@ -// TODO: I remove snarls where a haplotype begins/ends in the middle -// TODO: of the snarl. Get rid of this once alignment issue is addressed! -// TODO: also, limits the number of haplotypes to be aligned, since snarl starting at -// TODO: 2049699 with 258 haplotypes is taking many minutes. - -// TODO: another had 146 haplotypes and took maybe 5 minutes to align. (kept that one -// in tho' ) #pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? 
-#include "0_draft_haplotype_realignment.hpp" - +#include "0_oo_normalize_snarls.hpp" #include #include @@ -25,86 +17,95 @@ #include "../types.hpp" #include "extract_containing_graph.hpp" - namespace vg { +SnarlNormalizer::SnarlNormalizer(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, + const int &max_alignment_size, const string &path_finder) + : _haploGraph(haploGraph), _graph(graph), _max_alignment_size(max_alignment_size), + _path_finder(path_finder) {} // TODO: allow for snarls that have haplotypes that begin or end in the middle of the // snarl -// Runs disambiguate_snarl on every top-level snarl in the graph, so long as the +// Runs disambiguate_snarl on every top-level snarl in the _graph, so long as the // snarl only contains haplotype threads that extend fully from source to sink. // Arguments: -// graph: the full-sized handlegraph that will undergo edits in a snarl. -// haploGraph: the corresponding GBWTGraph of graph. -// snarl_stream: the file stream from .snarl file corresponding to graph. -void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, - const GBWTGraph &haploGraph, ifstream &snarl_stream) { - cerr << "disambiguate_top_level_snarls" << endl; +// _graph: the full-sized handlegraph that will undergo edits in a snarl. +// _haploGraph: the corresponding GBWTGraph of _graph. +// snarl_stream: the file stream from .snarl file corresponding to _graph. +void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { + // cerr << "disambiguate_top_level_snarls" << endl; SnarlManager *snarl_manager = new SnarlManager(snarl_stream); - /** Use this code to count number of snarls in graph. 
- * int top_count = 0; - * for (const Snarl* snarl : snarl_manager->top_level_snarls()){ - * top_count++; - * } - * cerr << "number of top_level snarls in graph: " << top_count << endl; - * - * int general_count = 0; - * snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ - * general_count++; - * }); - * cerr << "number of total snarls in graph: " << general_count << endl; - */ - int num_snarls_normalized = 0; int num_snarls_skipped = 0; vector snarl_roots = snarl_manager->top_level_snarls(); // error_record's bools are: - // - tuple one_snarl_error_record; - tuple full_error_record; - int num_of_errors = 4; + // 0) snarl exceeds max number of threads that can be efficiently aligned, + // 1) snarl has haplotypes starting/ending in the middle, + // 2) some handles in the snarl aren't connected by a thread, + // 3) snarl is cyclic. + // there are two additional ints for tracking the changing size of sequence in the + // snarl: + // 4) number of bases in the snarl before normalization + // 5) number of bases in the snarl after normalization. 
+ int error_record_size = 5; + vector one_snarl_error_record(error_record_size, 0); + vector full_error_record(error_record_size, 0); + + pair snarl_sequence_change; for (auto roots : snarl_roots) { - if (roots->start().node_id() == 4181165) { - // cerr << "disambiguating snarl #" << (num_snarls_normalized + - // num_snarls_skipped) - // << " source: " << roots->start().node_id() - // << " sink: " << roots->end().node_id() << endl; - one_snarl_error_record = disambiguate_snarl( - graph, haploGraph, roots->start().node_id(), roots->end().node_id()); - get<0>(full_error_record) += get<0>(one_snarl_error_record); - get<1>(full_error_record) += get<1>(one_snarl_error_record); - get<2>(full_error_record) += get<2>(one_snarl_error_record); - get<3>(full_error_record) += get<3>(one_snarl_error_record); - if (!(get<0>(one_snarl_error_record) || get<1>(one_snarl_error_record) || - get<2>(one_snarl_error_record) || get<3>(one_snarl_error_record))) { - num_snarls_normalized += 1; - } else { - num_snarls_skipped += 1; - } - } + // if (roots->start().node_id() == 6165) { + // if (roots->start().node_id() < 50000) { + // if (roots->start().node_id() == 1883) { + // cerr << "disambiguating snarl #" + // << (num_snarls_normalized + num_snarls_skipped) + // << " source: " << roots->start().node_id() + // << " sink: " << roots->end().node_id() << endl; + one_snarl_error_record = + normalize_snarl(roots->start().node_id(), roots->end().node_id()); + + if (!((one_snarl_error_record[0]) || (one_snarl_error_record[1]) || + (one_snarl_error_record[2]) || (one_snarl_error_record[3]))) { + // if there are no errors, then we've successfully normalized a snarl. + num_snarls_normalized += 1; + // track the change in size of the snarl. + snarl_sequence_change.first += one_snarl_error_record[4]; + snarl_sequence_change.second += one_snarl_error_record[5]; + } else { + // else, there was an error. Track which errors caused the snarl to not + // normalize. 
+ // note: the last two ints are ignored here b/c they're for + // recording the changing size of snarls that are successfully normalized. + for (int i = 0; i < error_record_size - 2; i++) { + full_error_record[i] += one_snarl_error_record[i]; + } + num_snarls_skipped += 1; + } + // } } cerr << endl << "normalized " << num_snarls_normalized << " snarl(s), skipped " << num_snarls_skipped << " snarls because. . .\nthey exceeded the size limit (" - << get<0>(full_error_record) + << full_error_record[0] << "snarls),\nhad haplotypes starting/ending in the middle of the snarl (" - << get<1>(full_error_record) << "),\nthe snarl was cyclic (" - << get<3>(full_error_record) + << full_error_record[1] << "),\nthe snarl was cyclic (" << full_error_record[3] << " snarls),\nor there " "were handles not connected by the gbwt info (" - << get<2>(full_error_record) << " snarls)." << endl; + << full_error_record[2] << " snarls)." << endl; + cerr << "amount of sequence in normalized snarls before normalization: " + << snarl_sequence_change.first << endl; + cerr << "amount of sequence in normalized snarls after normalization: " + << snarl_sequence_change.second << endl; /// Args: - /// source graph to extract subgraph from - /// into graph to extract into + /// source _graph to extract subgraph from + /// into _graph to extract into /// positions search outward from these positions /// max_dist include all nodes and edges that can be reached in at most - /// this distance reversing_walk_length also find graph material that can be reached + /// this distance reversing_walk_length also find _graph material that can be + /// reached // //todo: debug_statement // VG outGraph; @@ -117,135 +118,174 @@ void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, delete snarl_manager; } -// For a snarl in the given graph, with every edge covered by at least one haplotype +// For a snarl in the given _graph, with every edge covered by at least one haplotype // thread in the 
GBWTGraph, // extract all sequences in the snarl corresponding to the haplotype threads and // re-align them with MSAConverter/seqan to form a new snarl. Embedded paths are // preserved; GBWT haplotypes in the snarl are not conserved. // Arguments: -// graph: the full-sized handlegraph that will undergo edits in a snarl. -// haploGraph: the corresponding GBWTGraph of graph. +// _graph: the full-sized handlegraph that will undergo edits in a snarl. +// _haploGraph: the corresponding GBWTGraph of _graph. // source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. // Returns: none. // TODO: allow for snarls that have haplotypes that begin or end in the middle of the // snarl. -tuple disambiguate_snarl(MutablePathDeletableHandleGraph &graph, - const GBWTGraph &haploGraph, - const id_t &source_id, - const id_t &sink_id) { +vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t &sink_id) { // cerr << "disambiguate_snarl" << endl; + _cur_source_id = source_id; + _cur_sink_id = sink_id; + // error_record's bools are: - // - tuple error_record{0, 0, 0, 0}; - SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + // 0) snarl exceeds max number of threads that can be efficiently aligned, + // 1) snarl has haplotypes starting/ending in the middle, + // 2) some handles in the snarl aren't connected by a thread, + // 3) snarl is cyclic. + // there are two additional ints for tracking the changing size of sequence in the + // snarl: + // 4) number of bases in the snarl before normalization + // 5) number of bases in the snarl after normalization. + vector error_record(5, 0); + SubHandleGraph snarl = extract_subgraph(_graph, _cur_source_id, sink_id); if (!algorithms::is_acyclic(&snarl)) { - cerr << "snarl at " << source_id << " is cyclic. Skipping." << endl; - get<3>(error_record) = true; + cerr << "snarl at " << _cur_source_id << " is cyclic. Skipping." 
<< endl; + error_record[3] = true; } - // First, find all haplotypes encoded by the GBWT, in order to create the new snarl. - // Return value is pair< haplotypes_that_stretch_from_source_to_sink, - // haplotypes_that_end/start_prematurely > - tuple>, vector>, unordered_set> - haplotypes = extract_gbwt_haplotypes(snarl, haploGraph, source_id, sink_id); + // extract threads + tuple, vector>, unordered_set> haplotypes; + if (_path_finder == "GBWT") { + // First, find all haplotypes encoded by the GBWT, in order to create the new + // snarl. Return value is tuple< haplotypes_that_stretch_from_source_to_sink, + // haplotypes_that_end/start_prematurely, set of all handles in the haplotypes > + tuple>, vector>, unordered_set> + gbwt_haplotypes = + extract_gbwt_haplotypes(snarl, _haploGraph, _cur_source_id, sink_id); + // Convert the haplotypes from vector format to string format. + get<0>(haplotypes) = format_handle_haplotypes_to_strings(get<0>(gbwt_haplotypes)); + get<1>(haplotypes) = get<1>(gbwt_haplotypes); + get<2>(haplotypes) = get<2>(gbwt_haplotypes); + } else if (_path_finder == "exhaustive") { + pair, unordered_set> exhaustive_haplotypes = + source_to_sink_exhaustive_path_finder(); + get<0>(haplotypes) = exhaustive_haplotypes.first; + get<2>(haplotypes) = exhaustive_haplotypes.second; + } else { + cerr << "path_finder type must be 'GBWT' or 'exhaustive', not " << _path_finder + << endl; + } - // check to make sure that the gbwt graph has threads connecting all handles: + // check to make sure that the gbwt _graph has threads connecting all handles: // ( needs the unordered_set from extract_gbwt haplotypes to be equal to the number of // handles in the snarl). - int handles_in_snarl = 0; - snarl.for_each_handle([&](const handle_t handle) { handles_in_snarl++; }); + unordered_set handles_in_snarl; + snarl.for_each_handle([&](const handle_t handle) { + handles_in_snarl.emplace(handle); + // count the number of bases in the snarl. 
+ error_record[4] += snarl.get_sequence(handle).size(); + }); // TODO: this if statement removes snarls where a haplotype begins/ends in the middle // TODO: of the snarl. Get rid of this once alignment issue is addressed! // TODO: also, limits the number of haplotypes to be aligned, since snarl starting at // TODO: 2049699 with 258 haplotypes is taking many minutes. - // if (get<1>(haplotypes).empty() && get<0>(haplotypes).size() < 200 && - // get<2>(haplotypes).size() == handles_in_snarl) { - if (get<1>(haplotypes).empty() && get<2>(haplotypes).size() == handles_in_snarl) { - // if (get<1>(haplotypes).empty()) { - // Convert the haplotypes from vector format to string format. - vector haplotypes_from_source_to_sink = - format_handle_haplotypes_to_strings(haploGraph, get<0>(haplotypes)); + if (get<1>(haplotypes).empty() && get<0>(haplotypes).size() < _max_alignment_size && + get<2>(haplotypes).size() == handles_in_snarl.size()) { + // if (get<1>(haplotypes).empty() && get<2>(haplotypes).size() == + // handles_in_snarl) { if (get<1>(haplotypes).empty()) { Convert the haplotypes + // from vector format to string format. // vector< string > other_haplotypes = // format_handle_haplotypes_to_strings(haploGraph, get<1>(haplotypes)); - // Get the embedded paths in the snarl out of the graph, for the purposes of + // Get the embedded paths in the snarl out of the _graph, for the purposes of // moving them into the new snarl. In addition, any embedded paths that stretch // from source to sink are aligned in the new snarl. // TODO: once haplotypes that begin/end in the middle of the snarl have been // TODO: accounted for in the code, align all embedded paths? (and remove next // TODO: chunk of code that finds source-to-sink paths)? 
vector> embedded_paths = - extract_embedded_paths_in_snarl(graph, source_id, sink_id); + extract_embedded_paths_in_snarl(_graph, _cur_source_id, sink_id); // find the paths that stretch from source to sink: for (auto path : embedded_paths) { // cerr << "checking path of name " << - // graph.get_path_name(graph.get_path_handle_of_step(path.first)) << " with - // start " << graph.get_id(graph.get_handle_of_step(path.first)) << " and sink - // " << - // graph.get_id(graph.get_handle_of_step(graph.get_previous_step(path.second))) + // _graph.get_path_name(graph.get_path_handle_of_step(path.first)) << " with + // start " << _graph.get_id(graph.get_handle_of_step(path.first)) << " and + // sink " << + // _graph.get_id(graph.get_handle_of_step(graph.get_previous_step(path.second))) // << endl; - if (graph.get_id(graph.get_handle_of_step(path.first)) == source_id && - graph.get_id(graph.get_handle_of_step( - graph.get_previous_step(path.second))) == sink_id) { + if (_graph.get_id(_graph.get_handle_of_step(path.first)) == _cur_source_id && + _graph.get_id(_graph.get_handle_of_step( + _graph.get_previous_step(path.second))) == sink_id) { // cerr << "adding path of name " << - // graph.get_path_name(graph.get_path_handle_of_step(path.first)) << endl; - // get the sequence of the source to sink path, and add it to the paths to - // be aligned. + // _graph.get_path_name(graph.get_path_handle_of_step(path.first)) << + // endl; get the sequence of the source to sink path, and add it to the + // paths to be aligned. 
string path_seq; step_handle_t cur_step = path.first; while (cur_step != path.second) { - path_seq += graph.get_sequence(graph.get_handle_of_step(cur_step)); - cur_step = graph.get_next_step(cur_step); + path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); + cur_step = _graph.get_next_step(cur_step); } - haplotypes_from_source_to_sink.push_back(path_seq); + get<0>(haplotypes).push_back(path_seq); } } // Align the new snarl: - VG new_snarl = align_source_to_sink_haplotypes(haplotypes_from_source_to_sink); + VG new_snarl = align_source_to_sink_haplotypes(get<0>(haplotypes)); - // todo: make 32 a part of the general object maximum handle_size info. - force_maximum_handle_size(new_snarl, 32); + // count the number of bases in the snarl. + new_snarl.for_each_handle([&](const handle_t handle) { + error_record[5] += new_snarl.get_sequence(handle).size(); + }); - // todo: debug_statement - // new_snarl.for_each_handle([&](const handle_t& handle) { + force_maximum_handle_size(new_snarl, _max_alignment_size); + + // //todo: debug_statement + // new_snarl.for_each_handle([&](const handle_t &handle) { // cerr << new_snarl.get_id(handle) << " " << new_snarl.get_sequence(handle) - // << "\t"; + // << "\t"; // }); - // integrate the new_snarl into the graph, removing the old snarl as you go. - integrate_snarl(graph, new_snarl, embedded_paths, source_id, sink_id); + // integrate the new_snarl into the _graph, removing the old snarl as you go. + integrate_snarl(new_snarl, embedded_paths); return error_record; } else { if (!get<1>(haplotypes).empty()) { - cerr << "found a snarl starting at " << source_id << " and ending at " + cerr << "found a snarl starting at " << _cur_source_id << " and ending at " << sink_id << " with haplotypes that start or end in the middle. Skipping." 
<< endl; - get<1>(error_record) = true; + error_record[1] = true; } - // if (get<0>(haplotypes).size() > 200) { - // cerr << "found a snarl starting at " << source_id << " and ending at " - // << sink_id << " with too many haplotypes (" << get<0>(haplotypes).size() - // << ") to efficiently align. Skipping." << endl; - // get<0>(error_record) = true; - // } - if (get<2>(haplotypes).size() != handles_in_snarl) { - cerr << "some handles in the snarl starting at " << source_id + if (get<0>(haplotypes).size() > _max_alignment_size) { + cerr << "found a snarl starting at " << _cur_source_id << " and ending at " + << sink_id << " with too many haplotypes (" << get<0>(haplotypes).size() + << ") to efficiently align. Skipping." << endl; + error_record[0] = true; + } + if (get<2>(haplotypes).size() != handles_in_snarl.size()) { + cerr << "some handles in the snarl starting at " << _cur_source_id << " and ending at " << sink_id - << " aren't accounted for by the gbwt graph. " + << " aren't accounted for by the gbwt_graph. " "Skipping." << endl; - get<2>(error_record) = true; + cerr << "these handles are:" << endl << "\t"; + for (auto handle : handles_in_snarl) { + if (get<2>(haplotypes).find(handle) == get<2>(haplotypes).end()) { + cerr << _graph.get_id(handle) << " "; + } + } + cerr << endl; + error_record[2] = true; + } + if (error_record[5] > error_record[4]) { + cerr << "NOTE: normalized a snarl which *increased* in sequence quantity, " + "rather than decreased." + << endl; } return error_record; } -} +} // namespace vg // TODO: test that it successfully extracts any haplotypes that start/end in the middle of // TODO: the snarl. @@ -253,18 +293,19 @@ tuple disambiguate_snarl(MutablePathDeletableHandleGraph // are represented // by vectors of handles, representing the chain of handles in a thread. // Arguments: -// haploGraph: the GBWTGraph containing the snarl. -// source_id: the source of the snarl of interest. +// _haploGraph: the GBWTGraph containing the snarl. 
+// _cur_source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. // Returns: // a pair containting two sets of paths (each represented by a vector). The // first in the pair represents all paths reaching from source to sink in the snarl, // and the second representing all other paths in the snarl (e.g. any that don't -// reach both source and sink in the graph.) +// reach both source and sink in the _graph.) // pair>, vector>> tuple>, vector>, unordered_set> -extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph, - const id_t &source_id, const id_t &sink_id) { +SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, + const GBWTGraph &haploGraph, + const id_t &_source_id, const id_t &sink_id) { // cerr << "extract_gbwt_haplotypes" << endl; // haplotype_queue contains all started exon_haplotypes not completed yet. @@ -273,13 +314,13 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph // to the SearchState. vector, gbwt::SearchState>> haplotype_queue; - // source and sink handle for haploGraph: - handle_t source_handle = haploGraph.get_handle(source_id); - handle_t sink_handle = haploGraph.get_handle(sink_id); + // source and sink handle for _haploGraph: + handle_t source_handle = _haploGraph.get_handle(_source_id); + handle_t sink_handle = _haploGraph.get_handle(sink_id); // place source in haplotype_queue. 
vector source_handle_vec(1, source_handle); - gbwt::SearchState source_state = haploGraph.get_state(source_handle); + gbwt::SearchState source_state = _haploGraph.get_state(source_handle); haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); // touched_handles contains all handles that have been touched by the @@ -293,7 +334,7 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph vector> other_haplotypes; // sometimes a gbwt thread will indicate a connection between two handles that doesn't - // actually exist in the graph. These connections need to be ignored. + // actually exist in the _graph. These connections need to be ignored. unordered_set incorrect_connections; // int prev_size = 0; @@ -306,7 +347,7 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph // for (auto hap : haplotype_queue) { // cerr << "size: " << hap.first.size() << endl << "handle_ids: "; // for (handle_t handle : hap.first) { - // cerr << haploGraph.get_id(handle) << " "; + // cerr << _haploGraph.get_id(handle) << " "; // } // cerr << endl; // } @@ -319,11 +360,11 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph // get all the subsequent search_states that immediately follow the searchstate // from cur_haplotype. vector next_searches; - haploGraph.follow_paths(cur_haplotype.second, - [&](const gbwt::SearchState next_search) -> bool { - next_searches.push_back(next_search); - return true; - }); + _haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); // if next_searches > 1, then we need to make multiple new haplotypes to be // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. 
@@ -333,7 +374,7 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph // haplotypes_from_source_to_sink if haplotype extends to sink, or place in // the other_haplotypes if haplotype ends before reaching sink. for (gbwt::SearchState next_search : next_searches) { - handle_t next_handle = haploGraph.node_to_handle(next_search.node); + handle_t next_handle = _haploGraph.node_to_handle(next_search.node); // if (!snarl.has_node(snarl.get_id(next_handle)) && // make_pair(haploGraph.get_id(cur_haplotype.first.back()),haploGraph.get_id(next_handle))) // { @@ -342,25 +383,25 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == incorrect_connections.end()) { cerr - << "snarl starting at node " << source_id << " and ending at " - << sink_id + << "snarl starting at node " << _cur_source_id + << " and ending at " << sink_id << " has a thread that incorrectly connects two nodes that " "don't have any edge connecting them. These two nodes are " - << haploGraph.get_id(cur_haplotype.first.back()) << " and " - << haploGraph.get_id(next_handle) + << _haploGraph.get_id(cur_haplotype.first.back()) << " and " + << _haploGraph.get_id(next_handle) << ". This thread connection will be ignored." 
<< endl; incorrect_connections.emplace( snarl.edge_handle(cur_haplotype.first.back(), next_handle)); // todo: debug_statement - cerr << "next handle(s) of handle " - << snarl.get_id(cur_haplotype.first.back()) - << " according to snarl:" << endl; - snarl.follow_edges(cur_haplotype.first.back(), false, - [&](const handle_t handle) { - cerr << "\t" << snarl.get_id(handle); - }); - cerr << endl; + // cerr << "next handle(s) of handle " + // << snarl.get_id(cur_haplotype.first.back()) + // << " according to snarl:" << endl; + // snarl.follow_edges(cur_haplotype.first.back(), false, + // [&](const handle_t handle) { + // cerr << "\t" << snarl.get_id(handle); + // }); + // cerr << endl; } continue; } @@ -392,10 +433,10 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph } // if next_handle is the sink, put in haplotypes_from_source_to_sink else if (haploGraph.get_id( - haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { + _haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { // Then we need to add cur_haplotype + next_search to // haplotypes_from_source_to_sink. - handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); cur_haplotype.first.push_back(next_handle); haplotypes_from_source_to_sink.push_back(cur_haplotype.first); @@ -407,7 +448,7 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph // haplotype_queue. else { // get the next_handle from the one next_search. - handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); // modify cur_haplotype with next_handle and next_search. 
cur_haplotype.first.push_back(next_handle); @@ -423,7 +464,7 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph // Find any haplotypes starting from handles not starting at the source, but which // still start somewhere inside the snarl. vector> haplotypes_not_starting_at_source = - find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); + find_haplotypes_not_at_source(touched_handles, sink_id); // move haplotypes_not_starting_at_source into other_haplotypes: other_haplotypes.reserve(other_haplotypes.size() + @@ -437,20 +478,18 @@ extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &haploGraph } // Used to complete the traversal of a snarl along its haplotype threads, when there are -// handles connected to the snarl by -// threads that start after the source handle. (Threads that merely end before the -// sink handle are addressed in extract_gbwt_haplotypes). +// handles connected to the snarl by threads that start after the source handle. (Threads +// that merely end before the sink handle are addressed in extract_gbwt_haplotypes). // Arguments: -// haploGraph: the GBWTgraph containing the haplotype threads. +// _haploGraph: the GBWTgraph containing the haplotype threads. // touched_handles: any handles found in the snarl so far. // sink_id: the id of the final handle in the snarl. // Returns: // a vector of haplotypes in vector format that start in the middle of the // snarl. vector> -find_haplotypes_not_at_source(const GBWTGraph &haploGraph, - unordered_set &touched_handles, - const id_t &sink_id) { +SnarlNormalizer::find_haplotypes_not_at_source(unordered_set &touched_handles, + const id_t &sink_id) { // cerr << "find_haplotypes_not_at_source" << endl; /// Search every handle in touched handles for haplotypes starting at that point. 
@@ -467,7 +506,7 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, // We don't need to ever check the sink handle, since paths from the sink handle // extend beyond snarl. - handle_t sink_handle = haploGraph.get_handle(sink_id); + handle_t sink_handle = _haploGraph.get_handle(sink_id); // touched_handles.erase(sink_handle); // Nested function for making a new_search. Identifies threads starting at a given @@ -477,12 +516,12 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, auto make_new_search = [&](handle_t handle) { // Are there any new threads starting at this handle? gbwt::SearchState new_search = - haploGraph.index.prefix(haploGraph.handle_to_node(handle)); + _haploGraph.index->prefix(_haploGraph.handle_to_node(handle)); if (!new_search.empty()) { // Then add them to haplotype_queue. - haploGraph.follow_paths( + _haploGraph.follow_paths( new_search, [&](const gbwt::SearchState &next_search) -> bool { - handle_t next_handle = haploGraph.node_to_handle(next_search.node); + handle_t next_handle = _haploGraph.node_to_handle(next_search.node); /// check to make sure that the thread isn't already finished: // if next_handle is the sink, or if this thread is only one handle @@ -530,14 +569,14 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, // get all the subsequent search_states that immediately follow the // searchstate from cur_haplotype. 
vector next_searches; - haploGraph.follow_paths(cur_haplotype.second, - [&](const gbwt::SearchState &next_search) -> bool { - next_searches.push_back(next_search); - return true; - }); + _haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState &next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); for (gbwt::SearchState next_search : next_searches) { - handle_t next_handle = haploGraph.node_to_handle(next_search.node); + handle_t next_handle = _haploGraph.node_to_handle(next_search.node); // if next_search is empty, then we've fallen off the thread, // and cur_haplotype can be placed in finished_haplotypes as is for this @@ -548,7 +587,7 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, // if next_search is on the sink_handle, // then cur_haplotype.first + next_search goes to finished_haplotypes. - else if (haploGraph.get_id(next_handle) == sink_id) { + else if (_haploGraph.get_id(next_handle) == sink_id) { // copy over the vector of cur_haplotype: vector next_handle_vec(cur_haplotype.first); @@ -597,19 +636,18 @@ find_haplotypes_not_at_source(const GBWTGraph &haploGraph, // haplotypes of // format string (which is the concatenated sequences in the handles). // Arguments: -// haploGraph: a GBWTGraph which contains the handles in vector< handle_t > +// _haploGraph: a GBWTGraph which contains the handles in vector< handle_t > // haplotypes. haplotypte_handle_vectors: a vector of haplotypes in vector< handle_t // > format. // Returns: a vector of haplotypes of format string (which is the concatenated sequences // in the handles). 
-vector format_handle_haplotypes_to_strings( - const GBWTGraph &haploGraph, +vector SnarlNormalizer::format_handle_haplotypes_to_strings( const vector> &haplotype_handle_vectors) { vector haplotype_strings; for (vector haplotype_handles : haplotype_handle_vectors) { string hap; for (handle_t &handle : haplotype_handles) { - hap += haploGraph.get_sequence(handle); + hap += _haploGraph.get_sequence(handle); } haplotype_strings.push_back(hap); } @@ -617,15 +655,20 @@ vector format_handle_haplotypes_to_strings( } // TODO: eventually change to deal with haplotypes that start/end in middle of snarl. -// Aligns haplotypes to create a new graph using MSAConverter's seqan converter. +// Aligns haplotypes to create a new _graph using MSAConverter's seqan converter. // Assumes that each haplotype stretches from source to sink. // Arguments: // source_to_sink_haplotypes: a vector of haplotypes in string format (concat of // handle sequences). // Returns: // VG object representing the newly realigned snarl. -VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes) { +VG SnarlNormalizer::align_source_to_sink_haplotypes( + vector source_to_sink_haplotypes) { // cerr << "align_source_to_sink_haplotypes" << endl; + // cerr << " haplotypes in source_to_sink_haplotypes: " << endl; + // for (string hap : source_to_sink_haplotypes) { + // cerr << hap << endl; + // } // cerr << "number of strings to align: " << source_to_sink_haplotypes.size() << endl; // TODO: make the following comment true, so that I can normalize haplotypes that // TODO: aren't source_to_sink by adding a similar special character to strings in @@ -719,23 +762,25 @@ VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes) { cerr << "WARNING! Snarl realignment has generated " << source_and_sink.second.size() << " sink nodes." 
<< endl; } + return snarl; } -/** For each handle in a given graph, divides any handles greater than max_size into parts - * that are equal to or less than the size of max_size. +/** For each handle in a given _graph, divides any handles greater than max_size into + * parts that are equal to or less than the size of max_size. * - * @param {MutableHandleGraph} graph : the graph in which we want to force a maximum + * @param {MutableHandleGraph} _graph : the _graph in which we want to force a maximum * handle size for all handles. * @param {size_t} max_size : the maximum size we want a handle to be. */ -void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size) { - // forcing each handle in the graph to have a maximum sequence length of max_size: - graph.for_each_handle([&](handle_t handle) { +void SnarlNormalizer::force_maximum_handle_size(MutableHandleGraph &graph, + const size_t &max_size) { + // forcing each handle in the _graph to have a maximum sequence length of max_size: + _graph.for_each_handle([&](handle_t handle) { // all the positions we want to make in the handle are in offsets. vector offsets; - size_t sequence_len = graph.get_sequence(handle).size(); + size_t sequence_len = _graph.get_sequence(handle).size(); int number_of_divisions = floor(sequence_len / max_size); // if the handle divides evenly into subhandles of size max_size, we don't need to @@ -751,19 +796,19 @@ void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size } // divide the handle into parts. - graph.divide_handle(handle, offsets); + _graph.divide_handle(handle, offsets); }); } // Finds all embedded paths that either start or end in a snarl (or both) defined by -// source_id, sink_id. +// _cur_source_id, sink_id. // returns a vector of the embedded paths, where each entry in the vector is defined // by the pair of step_handles closest to the beginning and end of the path. 
If the // path is fully contained within the snarl, these step_handles will the be the // leftmost and rightmost handles in the path. // Arguments: -// graph: a pathhandlegraph containing the snarl with embedded paths. -// source_id: the source of the snarl of interest. +// _graph: a pathhandlegraph containing the snarl with embedded paths. +// _cur_source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. // Returns: // a vector containing all the embedded paths in the snarl, in pair< step_handle_t, @@ -771,22 +816,23 @@ void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size // interest, and pair.second is the step *after* the last step in the path's range of // interest (can be the null step at end of path). vector> -extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, - const id_t &sink_id) { +SnarlNormalizer::extract_embedded_paths_in_snarl(const PathHandleGraph &graph, + const id_t &source_id, + const id_t &sink_id) { // cerr << "extract_embedded_paths_in_snarl" << endl; // cerr << "source id: " << source_id << endl; // cerr << "source id contains what paths?: " << endl; - // for (auto step : graph.steps_of_handle(graph.get_handle(source_id))) { - // cerr << "\t" << graph.get_path_name(graph.get_path_handle_of_step(step)) << + // for (auto step : _graph.steps_of_handle(graph.get_handle(_source_id))) { + // cerr << "\t" << _graph.get_path_name(graph.get_path_handle_of_step(step)) << // endl; // } // cerr << "neighbors of 71104? (should include 71097):" << endl; - // handle_t test_handle = graph.get_handle(71104); - // graph.follow_edges(test_handle, true, [&](const handle_t &handle) { - // cerr << graph.get_id(handle) << endl; + // handle_t test_handle = _graph.get_handle(71104); + // _graph.follow_edges(test_handle, true, [&](const handle_t &handle) { + // cerr << _graph.get_id(handle) << endl; // }); // cerr << "can I still access source handle?" 
- // << graph.get_sequence(graph.get_handle(source_id)) << endl; + // << _graph.get_sequence(graph.get_handle(_source_id)) << endl; // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't // extend the path to a point beyond the source or sink. @@ -817,8 +863,8 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source // todo: debug_statement // cerr << "################looking for new paths################" << endl; // for (auto path : paths_found) { - // cerr << graph.get_path_name(path.first) << " " - // << graph.get_id(graph.get_handle_of_step(path.second)) << endl; + // cerr << _graph.get_path_name(path.first) << " " + // << _graph.get_id(graph.get_handle_of_step(path.second)) << endl; // } /// for each step_handle_t corresponding to a unique path, we want to get the steps @@ -837,22 +883,22 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source // snarl. step_handle_t begin_in_snarl_step = step; id_t begin_in_snarl_id = - graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); while ((begin_in_snarl_id != source_id) && - graph.has_previous_step(begin_in_snarl_step)) { - begin_in_snarl_step = graph.get_previous_step(begin_in_snarl_step); + _graph.has_previous_step(begin_in_snarl_step)) { + begin_in_snarl_step = _graph.get_previous_step(begin_in_snarl_step); begin_in_snarl_id = - graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); } path_in_snarl.first = begin_in_snarl_step; // Look for the step closest to the end of the path, as constrained by the snarl. 
step_handle_t end_in_snarl_step = step; - id_t end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); + id_t end_in_snarl_id = _graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and - // graph.has_next_step(end_in_snarl_step)) { + // _graph.has_next_step(end_in_snarl_step)) { while (end_in_snarl_id != sink_id and graph.has_next_step(end_in_snarl_step)) { end_in_snarl_step = graph.get_next_step(end_in_snarl_step); end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); @@ -861,7 +907,7 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source // the null step at the end of the path (or the next arbitrary step, in the case // of a path that extends beyond our snarl.) // TODO: do we want the next arbitrary step in that latter case? - path_in_snarl.second = graph.get_next_step(end_in_snarl_step); + path_in_snarl.second = _graph.get_next_step(end_in_snarl_step); paths_in_snarl.push_back(path_in_snarl); } @@ -872,14 +918,15 @@ extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source // TODO: change the arguments to handles, which contain orientation within themselves. // Given a start and end node id, construct an extract subgraph between the two nodes // (inclusive). Arguments: -// graph: a pathhandlegraph containing the snarl with embedded paths. -// source_id: the source of the snarl of interest. +// _graph: a pathhandlegraph containing the snarl with embedded paths. +// _cur_source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. // Returns: -// a SubHandleGraph containing only the handles in graph that are between start_id +// a SubHandleGraph containing only the handles in _graph that are between start_id // and sink_id. 
-SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, - const id_t &sink_id) { +SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, + const id_t &start_id, + const id_t &sink_id) { // cerr << "extract_subgraph" << endl; /// make a subgraph containing only nodes of interest. (e.g. a snarl) // make empty subgraph @@ -890,12 +937,12 @@ SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, // TODO: how to ensure that "to the right" of start_handle is the correct direction? // initialize with start_handle (because we move only to the right of start_handle): - handle_t start_handle = graph.get_handle(start_id); + handle_t start_handle = _graph.get_handle(start_id); subgraph.add_handle(start_handle); visited.insert(graph.get_id(start_handle)); // look only to the right of start_handle - graph.follow_edges(start_handle, false, [&](const handle_t &handle) { + _graph.follow_edges(start_handle, false, [&](const handle_t &handle) { // mark the nodes to come as to_visit if (visited.find(graph.get_id(handle)) == visited.end()) { to_visit.insert(graph.get_id(handle)); @@ -906,7 +953,7 @@ SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, while (to_visit.size() != 0) { // remove cur_handle from to_visit unordered_set::iterator cur_index = to_visit.begin(); - handle_t cur_handle = graph.get_handle(*cur_index); + handle_t cur_handle = _graph.get_handle(*cur_index); to_visit.erase(cur_index); @@ -918,14 +965,14 @@ SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, if (graph.get_id(cur_handle) != sink_id) { // don't iterate past end node! 
// look for all nodes connected to cur_handle that need to be added // looking to the left, - graph.follow_edges(cur_handle, true, [&](const handle_t &handle) { + _graph.follow_edges(cur_handle, true, [&](const handle_t &handle) { // mark the nodes to come as to_visit if (visited.find(graph.get_id(handle)) == visited.end()) { to_visit.insert(graph.get_id(handle)); } }); // looking to the right, - graph.follow_edges(cur_handle, false, [&](const handle_t &handle) { + _graph.follow_edges(cur_handle, false, [&](const handle_t &handle) { // mark the nodes to come as to_visit if (visited.find(graph.get_id(handle)) == visited.end()) { to_visit.insert(graph.get_id(handle)); @@ -936,20 +983,20 @@ SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, return subgraph; } -// Integrates the snarl into the graph, replacing the snarl occupying the space between -// source_id and sink_id. +// Integrates the snarl into the _graph, replacing the snarl occupying the space between +// _cur_source_id and sink_id. // In the process, transfers any embedded paths traversing the old snarl into the new // snarl. // Arguments: -// graph: the graph in which we want to insert the snarl. -// to_insert_snarl: a *separate* handle_graph from graph, often generated from +// _graph: the _graph in which we want to insert the snarl. +// to_insert_snarl: a *separate* handle_graph from _graph, often generated from // MSAconverter. embedded_paths: a vector of paths, where each is a pair. // pair.first is the first step_handle of interest in the // old_embedded_path, and pair.second is the step_handle *after* // the last step_handle of interest in the old_embedded_path (can // be the null step at the end of the path.) -// source_id: the source of the old (to be replaced) snarl in graph -// sink_id: the sink of the old (to be replaced) snarl in graph. 
+// _cur_source_id: the source of the old (to be replaced) snarl in _graph +// sink_id: the sink of the old (to be replaced) snarl in _graph. // Return: None. // TODO: Note: How to ensure that step_handle_t's walk along the snarl in the same // TODO: orientation as we expect? i.e. that they don't move backward? I think @@ -959,10 +1006,9 @@ SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, // TODO: It may also be that we *don't want match_orientation to be true, // TODO: if we're tracking a path that loops backward in the snarl. Hmm... Will think // about this. -void integrate_snarl(MutablePathDeletableHandleGraph &graph, - const HandleGraph &to_insert_snarl, - const vector> embedded_paths, - const id_t &source_id, const id_t &sink_id) { +void SnarlNormalizer::integrate_snarl( + const HandleGraph &to_insert_snarl, + const vector> embedded_paths) { // cerr << "integrate_snarl" << endl; // //todo: debug_statement @@ -972,8 +1018,8 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, // << to_insert_snarl.get_sequence(handle) << " \t"; // }); // cerr << endl; - // Get old graph snarl - SubHandleGraph old_snarl = extract_subgraph(graph, source_id, sink_id); + // Get old _graph snarl + SubHandleGraph old_snarl = extract_subgraph(_graph, _cur_source_id, _cur_sink_id); // TODO: debug_statement: Check to make sure that newly made snarl has only one start // and end. @@ -984,46 +1030,51 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, if (to_insert_snarl_defining_handles.first.size() > 1 || to_insert_snarl_defining_handles.second.size() > 1) { - cerr << "ERROR: newly made snarl from a snarl starting at " << source_id + cerr << "ERROR: newly made snarl from a snarl starting at " << _cur_source_id << " has more than one start or end. 
# of starts: " << to_insert_snarl_defining_handles.first.size() << " # of ends: " << to_insert_snarl_defining_handles.second.size() << endl; return; } - /// Replace start and end handles of old graph snarl with to_insert_snarl start and - /// end, and delete rest of old graph snarl: + /// Replace start and end handles of old _graph snarl with to_insert_snarl start and + /// end, and delete rest of old _graph snarl: - // add to_insert_snarl into graph without directly attaching the snarl to the graph + // add to_insert_snarl into _graph without directly attaching the snarl to the _graph // (yet). vector to_insert_snarl_topo_order = algorithms::lazier_topological_order(&to_insert_snarl); // Construct a parallel new_snarl_topo_order to identify - // paralogous nodes between to_insert_snarl and the new snarl inserted in graph. + // paralogous nodes between to_insert_snarl and the new snarl inserted in _graph. vector new_snarl_topo_order; - // integrate the handles from to_insert_snarl into the graph, and keep track of their + // integrate the handles from to_insert_snarl into the _graph, and keep track of their // identities by adding them to new_snarl_topo_order. 
for (handle_t to_insert_snarl_handle : to_insert_snarl_topo_order) { + // //todo: debug_statement: + // cerr << " pre-inserted snarl handle: " + // << to_insert_snarl.get_id(to_insert_snarl_handle) << " " + // << to_insert_snarl.get_sequence(to_insert_snarl_handle) << endl; + handle_t graph_handle = - graph.create_handle(to_insert_snarl.get_sequence(to_insert_snarl_handle)); + _graph.create_handle(to_insert_snarl.get_sequence(to_insert_snarl_handle)); new_snarl_topo_order.push_back(graph_handle); } - // Connect the newly made handles in the graph together the way they were connected in - // to_insert_snarl: + // Connect the newly made handles in the _graph together the way they were connected + // in to_insert_snarl: for (int i = 0; i < to_insert_snarl_topo_order.size(); i++) { to_insert_snarl.follow_edges( to_insert_snarl_topo_order[i], false, [&](const handle_t &snarl_handle) { - // get topo_index of nodes to be connected to graph start handle + // get topo_index of nodes to be connected to _graph start handle auto it = find(to_insert_snarl_topo_order.begin(), to_insert_snarl_topo_order.end(), snarl_handle); int topo_index = it - to_insert_snarl_topo_order.begin(); - // connect graph start handle - graph.create_edge(new_snarl_topo_order[i], - new_snarl_topo_order[topo_index]); + // connect _graph start handle + _graph.create_edge(new_snarl_topo_order[i], + new_snarl_topo_order[topo_index]); }); } @@ -1031,128 +1082,132 @@ void integrate_snarl(MutablePathDeletableHandleGraph &graph, // not necessarily preserved by move_path_to_snarl. Is temporary b/c we need to // replace the handles with ones with the right id_t label for source and sink later // on. 
- id_t temp_snarl_source_id = graph.get_id(new_snarl_topo_order.front()); - id_t temp_snarl_sink_id = graph.get_id(new_snarl_topo_order.back()); + id_t temp_snarl_source_id = _graph.get_id(new_snarl_topo_order.front()); + id_t temp_snarl_sink_id = _graph.get_id(new_snarl_topo_order.back()); // Add the neighbors of the source and sink of the original snarl to the new_snarl's // source and sink. // source integration: - graph.follow_edges( - graph.get_handle(source_id), true, [&](const handle_t &prev_handle) { - graph.create_edge(prev_handle, graph.get_handle(temp_snarl_source_id)); + _graph.follow_edges( + _graph.get_handle(_cur_source_id), true, [&](const handle_t &prev_handle) { + _graph.create_edge(prev_handle, _graph.get_handle(temp_snarl_source_id)); }); - graph.follow_edges( - graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { - graph.create_edge(graph.get_handle(temp_snarl_sink_id), next_handle); + _graph.follow_edges( + _graph.get_handle(_cur_sink_id), false, [&](const handle_t &next_handle) { + _graph.create_edge(_graph.get_handle(temp_snarl_sink_id), next_handle); }); // For each path of interest, move it onto the new_snarl. for (auto path : embedded_paths) { // //todo: debug_statement // cerr << "the new sink id: " << temp_snarl_sink_id << endl; - move_path_to_snarl(graph, path, new_snarl_topo_order, temp_snarl_source_id, - temp_snarl_sink_id, source_id, sink_id); + move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_source_id, + temp_snarl_sink_id, _cur_source_id, _cur_sink_id); } // Destroy the old snarl. 
old_snarl.for_each_handle( - [&](const handle_t &handle) { graph.destroy_handle(handle); }); + + [&](const handle_t &handle) { + // //todo: debug_statement these are the handles in old_snarl: + // cerr << old_snarl.get_id(handle) << old_snarl.get_sequence(handle) << endl; + _graph.destroy_handle(handle); + }); // Replace the source and sink handles with ones that have the original source/sink id // (for compatibility with future iterations on neighboring top-level snarls using the // same snarl manager. Couldn't replace it before b/c we needed the old handles to // move the paths. - handle_t new_source_handle = graph.create_handle( - graph.get_sequence(graph.get_handle(temp_snarl_source_id)), source_id); - handle_t new_sink_handle = - graph.create_handle(graph.get_sequence(new_snarl_topo_order.back()), sink_id); + handle_t new_source_handle = _graph.create_handle( + _graph.get_sequence(_graph.get_handle(temp_snarl_source_id)), _cur_source_id); + handle_t new_sink_handle = _graph.create_handle( + _graph.get_sequence(new_snarl_topo_order.back()), _cur_sink_id); // move the source edges: // TODO: note the copy/paste. Ask if there's a better way to do this (I totally could // in Python!) 
- graph.follow_edges(graph.get_handle(temp_snarl_source_id), true, - [&](const handle_t &prev_handle) { - graph.create_edge(prev_handle, new_source_handle); - }); - graph.follow_edges(graph.get_handle(temp_snarl_source_id), false, - [&](const handle_t &next_handle) { - graph.create_edge(new_source_handle, next_handle); - }); + _graph.follow_edges(_graph.get_handle(temp_snarl_source_id), true, + [&](const handle_t &prev_handle) { + _graph.create_edge(prev_handle, new_source_handle); + }); + _graph.follow_edges(_graph.get_handle(temp_snarl_source_id), false, + [&](const handle_t &next_handle) { + _graph.create_edge(new_source_handle, next_handle); + }); // move the sink edges: - graph.follow_edges(graph.get_handle(temp_snarl_sink_id), true, - [&](const handle_t &prev_handle) { - graph.create_edge(prev_handle, new_sink_handle); - }); - graph.follow_edges(graph.get_handle(temp_snarl_sink_id), false, - [&](const handle_t &next_handle) { - graph.create_edge(new_sink_handle, next_handle); - }); + _graph.follow_edges(_graph.get_handle(temp_snarl_sink_id), true, + [&](const handle_t &prev_handle) { + _graph.create_edge(prev_handle, new_sink_handle); + }); + _graph.follow_edges(_graph.get_handle(temp_snarl_sink_id), false, + [&](const handle_t &next_handle) { + _graph.create_edge(new_sink_handle, next_handle); + }); // move the paths: - graph.for_each_step_on_handle( - graph.get_handle(temp_snarl_source_id), [&](step_handle_t step) { - graph.rewrite_segment(step, graph.get_next_step(step), - vector{new_source_handle}); + _graph.for_each_step_on_handle( + _graph.get_handle(temp_snarl_source_id), [&](step_handle_t step) { + _graph.rewrite_segment(step, _graph.get_next_step(step), + vector{new_source_handle}); }); - graph.for_each_step_on_handle( - graph.get_handle(temp_snarl_sink_id), [&](step_handle_t step) { - graph.rewrite_segment(step, graph.get_next_step(step), - vector{new_sink_handle}); + _graph.for_each_step_on_handle( + _graph.get_handle(temp_snarl_sink_id), 
[&](step_handle_t step) { + _graph.rewrite_segment(step, _graph.get_next_step(step), + vector{new_sink_handle}); }); // delete the previously created source and sink: - for (handle_t handle : - {graph.get_handle(temp_snarl_source_id), graph.get_handle(temp_snarl_sink_id)}) { + for (handle_t handle : {_graph.get_handle(temp_snarl_source_id), + _graph.get_handle(temp_snarl_sink_id)}) { - graph.destroy_handle(handle); + _graph.destroy_handle(handle); } } -// Moves a path from its original location in the graph to a new snarl, +// Moves a path from its original location in the _graph to a new snarl, // defined by a vector of interconnected handles. // NOTE: the handles in new_snarl_handles may not preserve topological order after // being passed to this method, if they were ordered before. -// Arguments: graph: the graph containing the old_embedded_path and the handles in +// Arguments: _graph: the _graph containing the old_embedded_path and the handles in // new_snarl_topo_order // old_embedded_path: a pair, where // pair.first is the first step_handle of interest in the // old_embedded_path, and pair.second is the step_handle *after* // the last step_handle of interest in the old_embedded_path (can // be the null step at the end of the path.) -// new_snarl_topo_order: all the handles in the new snarl, inside the graph. +// new_snarl_topo_order: all the handles in the new snarl, inside the _graph. // Return: None. 
-void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, - const pair &old_embedded_path, - vector &new_snarl_handles, id_t &new_source_id, - id_t &new_sink_id, const id_t &old_source_id, - const id_t &old_sink_id) { - // cerr << "move_path_to_snarl" << endl; +void SnarlNormalizer::move_path_to_snarl( + const pair &old_embedded_path, + vector &new_snarl_handles, id_t &new_source_id, id_t &new_sink_id, + const id_t &old_source_id, const id_t &old_sink_id) { + // cerr << "\nmove_path_to_snarl" << endl; // //TODO: debug_statement: // cerr << "path name: " - // << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // << _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) // << endl; // cerr << "source: " << new_source_id << " sink: " << new_sink_id << endl; - // if (graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) == + // if (_graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) == // "chr10") { // cerr << "\t\tstart and end of old embedded path: " - // << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) + // << _graph.get_id(_graph.get_handle_of_step(old_embedded_path.first)) // << "end id" - // << graph.get_id(graph.get_handle_of_step(old_embedded_path.second)) << - // endl; + // << _graph.get_id(_graph.get_handle_of_step(old_embedded_path.second)) + // << endl; // } // cerr << "#### handles in snarl (according to move_path_to_snarl): ####" << endl; // for (handle_t handle : new_snarl_handles) { - // cerr << "\t" << graph.get_id(handle) << " " << graph.get_sequence(handle); + // cerr << "\t" << _graph.get_id(handle) << " " << _graph.get_sequence(handle); // } // cerr << endl << endl; // cerr << "~~~~~ Handles following each handle:" << endl; // for (handle_t handle : new_snarl_handles) { - // cerr << "neighbors of handle " << graph.get_id(handle) << " (" - // <, int, int>> possible_paths; for (handle_t handle : new_snarl_handles) { - string 
handle_seq = graph.get_sequence(handle); + string handle_seq = _graph.get_sequence(handle); // starting index is where the path would begin in the handle, // since it could begin in the middle of the handle. @@ -1198,7 +1254,7 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, if (starting_indices.size() != 0) { for (int starting_index : starting_indices) { if ((handle_seq.size() - starting_index) >= path_seq.size() && - source_and_sink_handles_map_properly(graph, new_source_id, + source_and_sink_handles_map_properly(_graph, new_source_id, new_sink_id, touching_source, touching_sink, handle, handle)) { // if the entire path fits inside the current handle, and if any @@ -1207,11 +1263,11 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // found the full mapping location of the path! Move the path, end // the method. vector new_path{handle}; - graph.rewrite_segment(old_embedded_path.first, - old_embedded_path.second, new_path); + _graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, new_path); // //todo: debug_statement - // cerr << "found a full mapping at " << graph.get_id(handle) - // << " w/ seq " << graph.get_sequence(handle) << endl; + // cerr << "found a full mapping at " << _graph.get_id(handle) + // << " w/ seq " << _graph.get_sequence(handle) << endl; return; } else { // this is a potential starting handle for the path. 
Add as a @@ -1233,7 +1289,7 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // for (tuple, int, int> path : possible_paths) { // cerr << " possible start: "; // for (handle_t handle : get<0>(path)) { - // cerr << graph.get_id(handle) << " "; + // cerr << _graph.get_id(handle) << " "; // } // cerr << endl; // } @@ -1242,33 +1298,37 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // looking for: while (!possible_paths.empty()) { // take a path off of possible_paths, which will be copied for every iteration - // through graph.follow_edges, below: + // through _graph.follow_edges, below: tuple, int, int> possible_path_query = possible_paths.back(); possible_paths.pop_back(); // //TODO: debug_statement: // for (tuple, int, int> path : possible_paths) { - // cerr << "*\tpossible path query: "; - // for (handle_t handle : get<0>(possible_path_query)) { - // cerr << graph.get_id(handle) << " " << graph.get_sequence(handle) << " "; - // } - // cerr << endl; + // cerr << "*\tpossible path query: "; + // for (handle_t handle : get<0>(possible_path_query)) { + // cerr << _graph.get_id(handle) << " " << _graph.get_sequence(handle) + // << " "; + // } + // cerr << endl; // } // extend the path through all right-extending edges to see if any subsequent // paths still satisfy the requirements for being a possible_path: - bool no_path = graph.follow_edges( + bool no_path = _graph.follow_edges( get<0>(possible_path_query).back(), false, [&](const handle_t &next) { // //todo: debug_statement - // cerr << "next handle id and seq: " << graph.get_id(next) << " " - // << graph.get_sequence(next) << endl; + // cerr << "cur handle id: " + // << _graph.get_id(get<0>(possible_path_query).back()) << endl; + + // cerr << "next handle id and seq: " << _graph.get_id(next) << " " + // << _graph.get_sequence(next) << endl; // make a copy to be extended for through each possible next handle in // follow edges. 
tuple, int, int> possible_path = possible_path_query; // extract relevant information to make code more readable. - string next_seq = graph.get_sequence(next); - id_t next_id = graph.get_id(next); + string next_seq = _graph.get_sequence(next); + id_t next_id = _graph.get_id(next); int &cur_index_in_path = get<2>(possible_path); if (cur_index_in_path <= path_seq.size() && (find(new_snarl_handles.cbegin(), new_snarl_handles.cend(), next) != @@ -1276,24 +1336,28 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // if the next handle would be the ending handle for the path, if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { // cerr << "next handle would be the ending handle for the path" - // << endl; check to see if the sequence in the handle is suitable + // << endl; + // check to see if the sequence in the handle is suitable // for ending the path: int compare_length = path_seq.size() - cur_index_in_path; // //todo: debug_statement // cerr << "about to compare. compare val: " // << (next_seq.compare(0, compare_length, path_seq, - // cur_index_in_path, compare_length) == - // 0) + // cur_index_in_path, compare_length) == 0) // << " source_and_sink_handles_map " // << source_and_sink_handles_map_properly( - // graph, new_source_id, new_sink_id, touching_source, + // _graph, new_source_id, new_sink_id, touching_source, // touching_sink, get<0>(possible_path).front(), next) // << endl; + // cerr << "arguments of compare: " + // << " " << 0 << " " << compare_length << " " << path_seq + // << " " << cur_index_in_path << " " << compare_length << " " + // << endl; if ((next_seq.compare(0, compare_length, path_seq, cur_index_in_path, compare_length) == 0) && source_and_sink_handles_map_properly( - graph, new_source_id, new_sink_id, touching_source, + _graph, new_source_id, new_sink_id, touching_source, touching_sink, get<0>(possible_path).front(), next)) { // todo: debug_statement // cerr << "compared." 
<< endl; @@ -1308,14 +1372,14 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // divide the handle where the path ends; pair divided_next = - graph.divide_handle(next, compare_length); + _graph.divide_handle(next, compare_length); get<0>(possible_path).push_back(divided_next.first); // Special case if next is the sink or the source, to // preserve the reassignment of source and sink ids in // integrate_snarl. if (next_id == new_sink_id) { - new_sink_id = graph.get_id(divided_next.second); + new_sink_id = _graph.get_id(divided_next.second); } // TODO: NOTE: finding the old "next" handle is expensive. @@ -1336,13 +1400,13 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // path. get<0>(possible_path).push_back(next); } - graph.rewrite_segment(old_embedded_path.first, - old_embedded_path.second, - get<0>(possible_path)); + _graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, + get<0>(possible_path)); // //todo: debug_statement: // cerr << "got a full path: "; // for (handle_t handle : get<0>(possible_path)) { - // cerr << graph.get_id(handle) << " "; + // cerr << _graph.get_id(handle) << " "; // } // cerr << endl; @@ -1399,7 +1463,7 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // == // "_alt_19f9bc9ad2826f58f113965edf36bb93740df46d_0") { // cerr << "mystery node 4214930: " - // << graph.get_sequence(graph.get_handle(4214930)) << endl; + // << _graph.get_sequence(graph.get_handle(4214930)) << endl; // } // if we've found a complete path in the above follow_edges, then we've @@ -1412,16 +1476,16 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, // if we failed to find a path, show an error message. cerr << "##########################\nWarning! 
Didn't find a corresponding path of " "name " - << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + << _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) << " from the old snarl at " << old_source_id << " in the newly aligned snarl. This snarl WILL be " "normalized, resulting in a probably incorrectly-constructed snarl." "\n##########################" << endl << endl; - // throw graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)); + // throw _graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)); // assert(true && "Warning! Didn't find a corresponding path of name " + - // graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // _graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) // + " from the old snarl in the newly aligned snarl."); } @@ -1430,7 +1494,7 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, * source and/or sink of the old snarl still do so in the new snarl (which is * important to ensure that we don't break any paths partway through the snarl.) * - * @param {HandleGraph} graph : the graph that contains the old and new snarl + * @param {HandleGraph} _graph : the _graph that contains the old and new snarl * nodes. * @param {id_t} new_source_id : the node id of the newly created source. * @param {id_t} new_sink_id : the node id of the newly created sink. @@ -1438,28 +1502,28 @@ void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, * source. * @param {bool} touching_sink : true if the path is connected to the old * sink. - * @param {handle_t} potential_source : proposed source for the path in the new snarl. - * @param {handle_t} potential_sink : proposed sink for the path in the new snarl. + * @param {handle_t} path_start : proposed source for the path in the new snarl. + * @param {handle_t} path_end : proposed sink for the path in the new snarl. 
* @return {bool} : true if the path satisfies the requirement * that, if the original path covered the old source or sink, the new path also covers * the same respective nodes in the new snarl. */ -bool source_and_sink_handles_map_properly( +bool SnarlNormalizer::source_and_sink_handles_map_properly( const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, - const bool &touching_source, const bool &touching_sink, - const handle_t &potential_source, const handle_t &potential_sink) { + const bool &touching_source, const bool &touching_sink, const handle_t &path_start, + const handle_t &path_end) { bool path_map = false; // cerr << "touching source? " << touching_source << "touching_sink" << touching_sink - // << "source is source?" << (graph.get_id(potential_source) == new_source_id) - // << " sink is sink: " << (graph.get_id(potential_sink) == new_sink_id) << endl; + // << "source is source?" << (graph.get_id(path_start) == new_source_id) + // << " sink is sink: " << (graph.get_id(path_end) == new_sink_id) << endl; if (touching_source && touching_sink) { - path_map = ((graph.get_id(potential_source) == new_source_id) && - (graph.get_id(potential_sink) == new_sink_id)); + path_map = ((graph.get_id(path_start) == new_source_id) && + (graph.get_id(path_end) == new_sink_id)); } else if (touching_source) { - path_map = (graph.get_id(potential_source) == new_source_id); + path_map = (graph.get_id(path_start) == new_source_id); } else if (touching_sink) { - path_map = (graph.get_id(potential_sink) == new_sink_id); + path_map = (graph.get_id(path_end) == new_sink_id); } else { path_map = true; } @@ -1480,8 +1544,8 @@ bool source_and_sink_handles_map_properly( // starting points for in handle_seq // Return: a vector of all potential starting index of the subsequence in the // handle_seq. 
-vector check_handle_as_start_of_path_seq(const string &handle_seq, - const string &path_seq) { +vector SnarlNormalizer::check_handle_as_start_of_path_seq(const string &handle_seq, + const string &path_seq) { vector possible_start_indices; // If the handle_seq.size <= path_seq.size, look for subsequences reaching from // beginning/middle of handle_seq to the end - where path_seq may run off the end @@ -1546,23 +1610,23 @@ vector check_handle_as_start_of_path_seq(const string &handle_seq, // ------------------------------ DEBUG CODE BELOW: // ------------------------------------------ -// Returns pair where pair.first is a vector of all sources of the given graph and -// path.second is all the sinks of the given graph. If graph is a subhandlegraph of a +// Returns pair where pair.first is a vector of all sources of the given _graph and +// path.second is all the sinks of the given _graph. If _graph is a subhandlegraph of a // snarl, there should only be one source and sink each. pair, vector> -debug_get_sources_and_sinks(const HandleGraph &graph) { +SnarlNormalizer::debug_get_sources_and_sinks(const HandleGraph &graph) { // cerr << "debug_get_source_and_sinks" << endl; vector sink; vector source; // identify sources and sinks - graph.for_each_handle([&](const handle_t &handle) { + _graph.for_each_handle([&](const handle_t &handle) { bool is_source = true, is_sink = true; - graph.follow_edges(handle, true, [&](const handle_t &prev) { + _graph.follow_edges(handle, true, [&](const handle_t &prev) { is_source = false; return false; }); - graph.follow_edges(handle, false, [&](const handle_t &next) { + _graph.follow_edges(handle, false, [&](const handle_t &next) { is_sink = false; return false; }); @@ -1581,10 +1645,12 @@ debug_get_sources_and_sinks(const HandleGraph &graph) { // Runs through the whole snarl and generates all possible strings representing walks // from source to sink. Generates a combinatorial number of possible paths with splits // in the snarl. 
-vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, - id_t start_id, id_t sink_id) { +pair, unordered_set> +SnarlNormalizer::source_to_sink_exhaustive_path_finder() { // cerr << "debug_graph_to_strings" << endl; - SubHandleGraph snarl = extract_subgraph(graph, start_id, sink_id); + SubHandleGraph snarl = extract_subgraph(_graph, _cur_source_id, _cur_sink_id); + + unordered_set touched_handles; unordered_map> sequences; vector sinks; @@ -1622,6 +1688,7 @@ vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, // count walks by dynamic programming bool overflowed = false; for (const handle_t &handle : algorithms::lazier_topological_order(&snarl)) { + touched_handles.emplace(handle); size_t count_here = count[handle]; vector seqs_here = sequences[handle]; @@ -1660,7 +1727,7 @@ vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, } } - return walks; + return make_pair(walks, touched_handles); } -} // namespace vg +} // namespace vg \ No newline at end of file diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp new file mode 100644 index 00000000000..d410221fc1f --- /dev/null +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -0,0 +1,83 @@ +#include "../gbwt_helper.hpp" +#include "../handle.hpp" +#include "../subgraph.hpp" +#include "../vg.hpp" +#include "count_walks.hpp" +#include + +namespace vg { + +class SnarlNormalizer { + public: + virtual ~SnarlNormalizer() = default; + + SnarlNormalizer(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, + const int &max_alignment_size = 200, + const string &path_finder = "GBWT" /*alternative is "exhaustive"*/); + + virtual void normalize_top_level_snarls(ifstream &snarl_stream); + + virtual vector normalize_snarl(const id_t &source_id, const id_t &sink_id); + + protected: + // member variables: + // the handle graph with snarls to normalize + MutablePathDeletableHandleGraph &_graph; + // GBWT graph with snarls to 
normalize, includes the embedded threads needed for the + // GBWTPathFinder approach. + const GBWTGraph &_haploGraph; + // the maximum number of threads allowed to align in a given snarl. If the number of + // threads exceeds this threshold, the snarl is skipped. + int _max_alignment_size; + id_t _cur_source_id; + id_t _cur_sink_id; + const string &_path_finder; + + tuple>, vector>, unordered_set> + extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &graph, + const id_t &source_id, const id_t &sink_id); + + pair, unordered_set> source_to_sink_exhaustive_path_finder(); + + vector> + find_haplotypes_not_at_source(unordered_set &touched_handles, + const id_t &sink_id); + + vector format_handle_haplotypes_to_strings( + const vector> &haplotype_handle_vectors); + + VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes); + + void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); + + vector> + extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, + const id_t &sink_id); + + SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, + const id_t &end_id); + + void integrate_snarl(const HandleGraph &new_snarl, + const vector> embedded_paths); + + void move_path_to_snarl(const pair &old_embedded_path, + vector &new_snarl_handles, id_t &new_source_id, + id_t &new_sink_id, const id_t &old_source_id, + const id_t &old_sink_id); + + bool source_and_sink_handles_map_properly( + const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, + const bool &touching_source, const bool &touching_sink, + const handle_t &potential_source, const handle_t &potential_sink); + + vector check_handle_as_start_of_path_seq(const string &handle_seq, + const string &path_seq); + + // -------------------------------- DEBUG CODE BELOW: + // ------------------------------------ + + pair, vector> + debug_get_sources_and_sinks(const HandleGraph &graph); +}; + +} // namespace 
vg diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index bb0041638dc..dc1764f5338 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -7,25 +7,32 @@ #include "subcommand.hpp" -#include "../../include/sglib/hash_graph.hpp" +#include "../../include/bdsg/hash_graph.hpp" #include "../../include/vg/io/vpkg.hpp" -#include "../algorithms/0_draft_haplotype_realignment.hpp" +// #include "../algorithms/0_draft_haplotype_realignment.hpp" +#include "../algorithms/0_oo_normalize_snarls.hpp" #include "../algorithms/0_draft_snarl_normalization_evaluation.cpp" #include "../gbwt_helper.hpp" -#include // for high_resolution_clock +#include // for high_resolution_clock using namespace std; using namespace vg; using namespace vg::subcommand; void help_normalize(char **argv) { - cerr << "usage: " << argv[0] << " normalize [options] >[mod.vg]" << endl - << "Modifies snarls, outputs modified on stdout." << endl - << endl - << "options:" << endl - << " -g, --gbwt gbwt corresponding to hashgraph." << endl - << " -s, --snarls snarls file corresponding to hashgraph." << endl; + cerr + << "usage: " << argv[0] << " normalize [options] >[mod.vg]" << endl + << "Modifies snarls, outputs modified on stdout." << endl + << endl + << "options:" << endl + << " -g, --gbwt gbwt corresponding to hashgraph." << endl + << " -s, --snarls snarls file corresponding to hashgraph." << endl + << " -m, --max_alignment_size limits the number of threads that will " + "be aligned in any snarl. If exceeded, program skips snarl. Default is 200 " + "threads. If you don't want to skip any snarls based on thread count, enter 0." + << endl + << " -s, --snarls snarls file corresponding to hashgraph." << endl; } int main_normalize(int argc, char **argv) { @@ -37,6 +44,7 @@ int main_normalize(int argc, char **argv) { bool evaluate = false; bool normalize = false; + int max_alignment_size = 200; // default cutoff is 200 threads in a snarl. 
string gbwt; string snarls; @@ -48,11 +56,12 @@ int main_normalize(int argc, char **argv) { {{"help", no_argument, 0, 'h'}, {"gbwt", required_argument, 0, 'g'}, {"snarls", required_argument, 0, 's'}, + {"max_alignment_size", optional_argument, 0, 'm'}, {"evaluate", no_argument, 0, 'e'}, {0, 0, 0, 0}}; int option_index = 0; - c = getopt_long(argc, argv, "g:s:eh", long_options, &option_index); + c = getopt_long(argc, argv, "hg:s:m:e", long_options, &option_index); // Detect the end of the options. if (c == -1) @@ -69,6 +78,14 @@ int main_normalize(int argc, char **argv) { evaluate = true; break; + case 'm': + max_alignment_size = parse(optarg); + // if max_alignment_size is 0, then that signifies that it should actually be + // infinite, i.e. that we should not exclude any snarls. + if (max_alignment_size == 0) { + max_alignment_size = INT_MAX; + } + case 's': snarls = optarg; break; @@ -78,11 +95,13 @@ int main_normalize(int argc, char **argv) { } } - sglib::HashGraph *graph; + bdsg::HashGraph *graph; get_input_file(optind, argc, argv, - [&](istream &in) { graph = new sglib::HashGraph(in); }); + [&](istream &in) { graph = new bdsg::HashGraph(in); }); if (normalize) { + cerr << "running normalize!" << endl; + /// Build the gbwt: ifstream gbwt_stream; gbwt_stream.open(gbwt); @@ -102,8 +121,15 @@ int main_normalize(int argc, char **argv) { } // Record start time auto start = chrono::high_resolution_clock::now(); + + SnarlNormalizer normalizer = SnarlNormalizer(*graph, haploGraph, max_alignment_size); + // run test code on all snarls in graph. - disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream); + normalizer.normalize_top_level_snarls(snarl_stream); + + // // run test code on all snarls in graph. 
(non obj-oriented code) + // disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream, max_alignment_size); + // Record end time auto finish = std::chrono::high_resolution_clock::now(); chrono::duration elapsed = finish - start; @@ -111,11 +137,11 @@ int main_normalize(int argc, char **argv) { } if (evaluate) { - std::ifstream snarl_stream; - string snarl_file = snarls; - snarl_stream.open(snarl_file); - cerr << "about to evaluate normalized snarls" << endl; - vg::evaluate_normalized_snarls(snarl_stream); + // std::ifstream snarl_stream; + // string snarl_file = snarls; + // snarl_stream.open(snarl_file); + // cerr << "about to evaluate normalized snarls" << endl; + // vg::evaluate_normalized_snarls(snarl_stream); } // TODO: NOTE: this may be cumbersome code if we decide to add more argument types. From 579028249fd2576df97e35262ab884d2385adb99 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 4 Nov 2019 11:18:15 -0800 Subject: [PATCH 36/63] misc. updates while testing data for normalize. 
Fixed a bug with vpkg dealing with hashgraphs --- .../0_analyze_mapping_quality.py | 118 ++ .../0_demo_of_seqan_defined_matrix.cpp | 185 ++ ..._realignment-debug_code_for_move_paths.cpp | 1326 +++++++++++++ .../0_draft_haplotype_realignment.cpp | 1680 +++++++++++++++++ .../0_draft_haplotype_realignment.hpp | 163 ++ src/algorithms/0_oo_normalize_snarls.cpp | 89 +- src/algorithms/0_oo_normalize_snarls.hpp | 8 +- src/io/register_loader_saver_hash_graph.cpp | 2 +- src/subcommand/0_normalize_main.cpp | 42 +- 9 files changed, 3559 insertions(+), 54 deletions(-) create mode 100644 src/algorithms/0_old_copies_haplotype_realignment/0_analyze_mapping_quality.py create mode 100644 src/algorithms/0_old_copies_haplotype_realignment/0_demo_of_seqan_defined_matrix.cpp create mode 100644 src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment-debug_code_for_move_paths.cpp create mode 100644 src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.cpp create mode 100644 src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.hpp diff --git a/src/algorithms/0_old_copies_haplotype_realignment/0_analyze_mapping_quality.py b/src/algorithms/0_old_copies_haplotype_realignment/0_analyze_mapping_quality.py new file mode 100644 index 00000000000..e7dde0a7548 --- /dev/null +++ b/src/algorithms/0_old_copies_haplotype_realignment/0_analyze_mapping_quality.py @@ -0,0 +1,118 @@ +#%% + +import json +import numpy as np +import matplotlib.pyplot as plt +import collections as col + + +#%% + +def measure_map_qualities(json_file): + mapping_qualities = list() + mapping_qualities_not_60 = list() + no_quality = int() + more_than_1_qual = int() + with open(mapped_reads_file) as file: + line_ct = 0 + for line in file: + line_ct +=1 + # print(line_ct) + line_split = (line.split('"mapping_quality":')) + if len(line_split) == 2: + # print(line_split[1]) + # print(([line_split[1].split(',"')])) + # print(([line_split[1].split(',"')])[0]) + 
# print() + map_qual = int(line_split[1].split(',"')[0]) + if map_qual != 60: + mapping_qualities_not_60.append(map_qual) + mapping_qualities.append(int(line_split[1].split(',"')[0])) + elif len(line_split) == 1: + no_quality += 1 + else: + more_than_1_qual += 1 + + print("mean of mapping_qualities", np.mean(mapping_qualities)) + print("mean of mapping_qualities that aren't perfect (!= 60)", np.mean(mapping_qualities_not_60)) + print("no_quality", no_quality) + print("more_than_1_qual", more_than_1_qual) + + qual_count = col.Counter(mapping_qualities) + print(qual_count) + # plt.plot(qual_count) + plt.plot(mapping_qualities_not_60) + # plt.bar(mapping_qualities_not_60, 60) + +#%% + + +# mapped_reads_file = "~/test/robin_tests/full_chr10/test_map/hgsvc_chr10_construct_test_map.json" +mapped_reads_file = "hgsvc_chr10_construct_test_map.json" +# mapped_reads_file = "0_test.txt" +measure_map_qualities(mapped_reads_file) + + +#%% +""" +Current output with no normalization: +map_qual avg: 59.059475806451616 +22.68 +no_quality 8 +more_than_1_qual 0 +""" + + + + + + + + + + + + + + +""" +import json +import numpy as np +mapped_reads_file = "../../../test/robin_tests/full_chr10/test_map/hgsvc_chr10_construct_test_map.gam" + +mapping_qualities = list() +mapping_qualities_not_60 = list() +no_quality = int() +more_than_1_qual = int() +with open(mapped_reads_file) as file: + line_ct = 0 + for line in file: + line_ct +=1 + # print(line_ct) + line_split = (line.split('"mapping_quality":')) + if len(line_split) == 2: + # print(line_split[1]) + # print(([line_split[1].split(',"')])) + # print(([line_split[1].split(',"')])[0]) + # print() + map_qual = int(line_split[1].split(',"')[0]) + if map_qual != 60: + mapping_qualities_not_60.append(map_qual) + mapping_qualities.append(int(line_split[1].split(',"')[0])) + elif len(line_split) == 1: + no_quality += 1 + else: + more_than_1_qual += 1 +# print(mapping_qualities) +print(np.mean(mapping_qualities)) 
+print(np.mean(mapping_qualities_not_60)) +print("no_quality", no_quality) +print("more_than_1_qual", more_than_1_qual) + + + # mapped_reads = json.load(file) + # print(mapped_reads) + + + +""" \ No newline at end of file diff --git a/src/algorithms/0_old_copies_haplotype_realignment/0_demo_of_seqan_defined_matrix.cpp b/src/algorithms/0_old_copies_haplotype_realignment/0_demo_of_seqan_defined_matrix.cpp new file mode 100644 index 00000000000..eade05499e3 --- /dev/null +++ b/src/algorithms/0_old_copies_haplotype_realignment/0_demo_of_seqan_defined_matrix.cpp @@ -0,0 +1,185 @@ +// #include + +// #include +// #include // For printing strings. +// #include // The module score. + +// using namespace seqan; +// // Extend SeqAn by a user-define scoring matrix. +// namespace seqan { + +// // We have to create a new specialization of the ScoringMatrix_ class +// // for the DNA alphabet. For this, we first create a new tag. +// struct UserDefinedMatrix {}; + +// // Then, we specialize the class ScoringMatrix_ for the Dna5 alphabet. +// template <> +// struct ScoringMatrixData_ +// { +// enum +// { +// VALUE_SIZE = ValueSize::VALUE, +// TAB_SIZE = VALUE_SIZE * VALUE_SIZE +// }; + +// static inline int const * getData() +// { +// // The user defined data table. In this case, we use the data from BLOSUM-30. +// static int const _data[TAB_SIZE] = +// { +// 1, 0, 0, 0, 0, +// 0, 1, 0, 0, 0, +// 0, 0, 1, 0, 0, +// 0, 0, 0, 1, 0, +// 0, 0, 0, 0, 0 +// }; +// return _data; +// } + +// }; +// } // namespace seqan +// We define a function showScoringMatrix for displaying a matrix. + +// // Print a scoring scheme matrix to stdout. +// template +// void showScoringMatrix(Score > const & scoringScheme) +// { +// // Print top row. +// for (unsigned i = 0; i < ValueSize::VALUE; ++i) +// std::cout << "\t" << TSequenceValue(i); +// std::cout << std::endl; +// // Print each row. 
+// for (unsigned i = 0; i < ValueSize::VALUE; ++i) +// { +// std::cout << TSequenceValue(i); +// for (unsigned j = 0; j < ValueSize::VALUE; ++j) +// { +// std::cout << "\t" << score(scoringScheme, TSequenceValue(i), TSequenceValue(j)); +// } +// std::cout << std::endl; +// } +// } +// int main() +// { +// // 1. Define type and constants. +// // +// // Define types for the score value and the scoring scheme. +// typedef int TValue; +// typedef Score > TScoringScheme; +// // Define our gap scores in some constants. +// int const gapOpenScore = -1; +// int const gapExtendScore = -1; + +// // 2. Construct scoring scheme with default/empty matrix. +// // +// // Construct new scoring scheme, alternatively only give one score +// // that is used for both opening and extension. +// TScoringScheme scoringScheme(gapExtendScore, gapOpenScore); + +// // 3. Fill the now-existing ScoreMatrix +// // +// // The scoring scheme now already has a matrix of the size +// // ValueSize::VALUE x ValueSize::VALUE which +// // we can now fill. + +// // 3.1 We fill the scoring scheme with the product of the coordinates. +// std::cout << std::endl << "Coordinate Products" << std::endl; +// for (unsigned i = 0; i < ValueSize::VALUE; ++i) +// { +// for (unsigned j = 0; j < ValueSize::VALUE; ++j) +// { +// setScore(scoringScheme, Dna5(i), Dna5(j), i * j); +// } +// } +// showScoringMatrix(scoringScheme); + +// // 3.2 Now, we fill it with the user defined matrix above. +// std::cout << "User defined matrix (also Dna5 scoring matrix)..." << std::endl; +// setDefaultScoreMatrix(scoringScheme, UserDefinedMatrix()); +// showScoringMatrix(scoringScheme); + +// // 4. Show our user-defined Dna5 scoring matrix. +// std::cout << "User DNA scoring scheme..." 
<< std::endl; +// Score > userScoringSchemeDna; +// showScoringMatrix(userScoringSchemeDna); + +// return 0; +// } +// Here is the output of the program: + +// Coordinate Products +// A C G T N +// A 0 0 0 0 0 +// C 0 1 2 3 4 +// G 0 2 4 6 8 +// T 0 3 6 9 12 +// N 0 4 8 12 16 +// User defined matrix (also Dna5 scoring matrix)... +// A C G T N +// A 1 0 0 0 0 +// C 0 1 0 0 0 +// G 0 0 1 0 0 +// T 0 0 0 1 0 +// N 0 0 0 0 0 +// User DNA scoring scheme... +// A C G T N +// A 1 0 0 0 0 +// C 0 1 0 0 0 +// G 0 0 1 0 0 +// T 0 0 0 1 0 +// N 0 0 0 0 0 +// Loading Score Matrices From File +// This small demo program shows how to load a score matrix from a file. Examples for score file are demos/howto/scores/dna_example.txt for DNA alphabets and tests/sPAM250 for amino acids. + +// Include the necessary headers. + +// #include + +// #include +// #include // For printing strings. +// #include // The module score. + +// using namespace seqan; +// We define a function that can show a scoring matrix. + +// // Print a scoring scheme matrix to stdout. +// template +// void showScoringMatrix(Score > const & scoringScheme) +// { +// // Print top row. +// for (unsigned i = 0; i < ValueSize::VALUE; ++i) +// std::cout << "\t" << TSequenceValue(i); +// std::cout << std::endl; +// // Print each row. +// for (unsigned i = 0; i < ValueSize::VALUE; ++i) +// { +// std::cout << TSequenceValue(i); +// for (unsigned j = 0; j < ValueSize::VALUE; ++j) +// { +// std::cout << "\t" << score(scoringScheme, TSequenceValue(i), TSequenceValue(j)); +// } +// std::cout << std::endl; +// } +// } +// Finally, the main program loads the scoring matrix and then shows it. + +// int main(int argc, char ** argv) +// { +// typedef int TScoreValue; + +// Score > scoreMatrix; +// loadScoreMatrix(scoreMatrix, toCString(getAbsolutePath("demos/howto/scores/dna_example.txt"))); +// showScoringMatrix(scoreMatrix); + +// return 0; +// } +// Here’s the program output. 
+ +// A C G T +// A 1 -1 -1 -1 +// C -1 1 -1 -1 +// G -1 -1 1 -1 +// T -1 -1 -1 1 +// © Copyright 2015, The SeqAn Team. Revision 88e3a0bb. + +// Built with Sphinx using a theme provided by Read the Docs. \ No newline at end of file diff --git a/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment-debug_code_for_move_paths.cpp b/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment-debug_code_for_move_paths.cpp new file mode 100644 index 00000000000..c9162c5c934 --- /dev/null +++ b/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment-debug_code_for_move_paths.cpp @@ -0,0 +1,1326 @@ +#pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? +#include "0_draft_haplotype_realignment.hpp" + +#include +#include + +#include +#include +#include + +#include "../gbwt_helper.hpp" +#include "../handle.hpp" +#include "../msa_converter.hpp" +#include "../snarls.hpp" +#include "../vg.hpp" +#include +#include +#include +// #include "../../deps/libhandlegraph/src/include/handlegraph/path_handle_graph.hpp" + +namespace vg { + +// TODO: allow for snarls that have haplotypes that begin or end in the middle of the +// snarl +// Runs disambiguate_snarl on every top-level snarl in the graph, so long as the +// snarl only contains haplotype threads that extend fully from source to sink. +// Arguments: +// graph: the full-sized handlegraph that will undergo edits in a snarl. +// haploGraph: the corresponding GBWTGraph of graph. +// snarl_stream: the file stream from .snarl file corresponding to graph. +void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, ifstream &snarl_stream) { + cerr << "disambiguate_top_level_snarls" << endl; + SnarlManager *snarl_manager = new SnarlManager(snarl_stream); + + /** Use this code to count number of snarls in graph. 
+ * int top_count = 0; + * for (const Snarl* snarl : snarl_manager->top_level_snarls()){ + * top_count++; + * } + * cerr << "number of top_level snarls in graph: " << top_count << endl; + * + * int general_count = 0; + * snarl_manager->for_each_snarl_preorder([&](const vg::Snarl * ignored){ + * general_count++; + * }); + * cerr << "number of total snarls in graph: " << general_count << endl; + */ + + int i = 0; + vector snarl_roots = snarl_manager->top_level_snarls(); + for (auto roots : snarl_roots) { + if (i == 2) { + // TODO: debug_code: + cerr << "return to root node ids, disambiguate snarl with.. " << endl; + cerr << "root node ids: " << roots->start().node_id() << " " + << roots->end().node_id() << endl; + disambiguate_snarl(graph, haploGraph, roots->start().node_id(), + roots->end().node_id()); + } + + + // // TODO: debug_code: + // cerr << "return to root node ids, disambiguate snarl with.. " << endl; + // cerr << "root node ids: " << roots->start().node_id() << " " + // << roots->end().node_id() << endl; + // disambiguate_snarl(graph, haploGraph, roots->start().node_id(), + // roots->end().node_id()); + i += 1; + cerr << endl << endl << "normalized " << i << " snarl(s)." << endl; + if (i == 3) { + break; + } + } + + delete snarl_manager; +} + +// For a snarl in the given graph, with every edge covered by at least one haplotype +// thread in the GBWTGraph, +// extract all sequences in the snarl corresponding to the haplotype threads and +// re-align them with MSAConverter/seqan to form a new snarl. Embedded paths are +// preserved; GBWT haplotypes in the snarl are not conserved. +// Arguments: +// graph: the full-sized handlegraph that will undergo edits in a snarl. +// haploGraph: the corresponding GBWTGraph of graph. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: none. +// TODO: allow for snarls that have haplotypes that begin or end in the middle of the +// snarl. 
+void disambiguate_snarl(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, const id_t &source_id, + const id_t &sink_id) { + cerr << "disambiguate_snarl" << endl; + + // First, find all haplotypes encoded by the GBWT, in order to create the new snarl. + // Return value is pair< haplotypes_that_stretch_from_source_to_sink, + // haplotypes_that_end/start_prematurely > + pair>, vector>> haplotypes = + extract_gbwt_haplotypes(haploGraph, source_id, sink_id); + + // TODO: this if statement removes snarls where a haplotype begins/ends in the middle + // TODO: of the snarl. Get rid of this once alignment issue is addressed! + if (haplotypes.second.empty()) { + // Convert the haplotypes from vector format to string format. + vector haplotypes_from_source_to_sink = + format_handle_haplotypes_to_strings(haploGraph, haplotypes.first); + // vector< string > other_haplotypes = + // format_handle_haplotypes_to_strings(haploGraph, haplotypes.second); + + // Align the new snarl: + // TODO: find better way to improve disamiguation of beginning/ending regions of + // nodes + // TODO: than by adding leading/trailing AAA seq (essentially a special + // character). + cerr << "strings to be aligned: " << endl; + for (string &hap : haplotypes_from_source_to_sink) { + cerr << hap << endl; + hap = "AAAAAAAA" + hap + "AAAAAAAA"; + } + VG new_snarl = align_source_to_sink_haplotypes(haplotypes_from_source_to_sink); + + // Get the embedded paths in the snarl out of the graph, for the purposes of + // moving them into the new snarl. 
+ vector> embedded_paths = + extract_embedded_paths_in_snarl(graph, source_id, sink_id); + + cerr << "paths: " << endl; + for (auto path : embedded_paths) { + cerr << " path " + << graph.get_path_name(graph.get_path_handle_of_step(path.first)) + << endl; + for (auto step : {path.first, graph.get_previous_step(path.second)}) { + cerr << "\t" << graph.get_id(graph.get_handle_of_step(step)) << " "; + } + cerr << endl; + } + + // integrate the new_snarl into the graph, removing the old snarl as you go. + integrate_snarl(graph, new_snarl, embedded_paths, source_id, sink_id); + cerr << endl; + + } else { + cerr << "found a snarl with haplotypes in the middle. Start: " << source_id + << " sink is " << sink_id << endl; + } +} + +// TODO: test that it successfully extracts any haplotypes that start/end in the middle of +// TODO: the snarl. +// For a snarl in a given GBWTGraph, extract all the haplotypes in the snarl. Haplotypes +// are represented +// by vectors of handles, representing the chain of handles in a thread. +// Arguments: +// haploGraph: the GBWTGraph containing the snarl. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a pair containting two sets of paths (each represented by a vector). The +// first in the pair represents all paths reaching from source to sink in the snarl, +// and the second representing all other paths in the snarl (e.g. any that don't +// reach both source and sink in the graph.) +pair>, vector>> +extract_gbwt_haplotypes(const GBWTGraph &haploGraph, const id_t &source_id, + const id_t &sink_id) { + cerr << "extract_gbwt_haplotypes" << endl; + + // touched_handles contains all handles that have been touched by the + // depth_first_search, for later use in other_haplotypes_to_strings, which identifies + // paths that didn't stretch from source to sink in the snarl. 
+ unordered_set touched_handles; + + // haplotype_queue contains all started exon_haplotypes not completed yet. + // Every time we encounter a branch in the paths, the next node down the path + // Is stored here, along with the vector of handles that represents the path up + // to the SearchState. + vector, gbwt::SearchState>> haplotype_queue; + + // source and sink handle for haploGraph: + handle_t source_handle = haploGraph.get_handle(source_id); + handle_t sink_handle = haploGraph.get_handle(sink_id); + + // place source in haplotype_queue. + vector source_handle_vec(1, source_handle); + gbwt::SearchState source_state = haploGraph.get_state(source_handle); + haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); + touched_handles.emplace(source_handle); + + // haplotypes contains all "finished" haplotypes - those that were either walked + // to their conclusion, or until they reached the sink. + vector> haplotypes_from_source_to_sink; + vector> other_haplotypes; + + // for every partly-extracted thread, extend the thread until it either reaches + // the sink of the snarl or the end of the thread. + while (!haplotype_queue.empty()) { + + // get a haplotype out of haplotype_queue to extend - + // a tuple of (handles_traversed_so_far, last_touched_SearchState) + pair, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the searchstate + // from cur_haplotype. + vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + // if next_searches > 1, then we need to make multiple new haplotypes to be + // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. 
+ if (next_searches.size() > 1) { + // for every next_search in next_searches, either create a new, extended + // cur_haplotype to push into haplotype queue, or place in the + // haplotypes_from_source_to_sink if haplotype extends to sink, or place in + // the other_haplotypes if haplotype ends before reaching sink. + for (gbwt::SearchState next_search : next_searches) { + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + + // add the new handle to the vec: + next_handle_vec.push_back(next_handle); + + // if new_handle is the sink, put in haplotypes_from_source_to_sink + if (haploGraph.get_id(next_handle) == sink_id) { + haplotypes_from_source_to_sink.push_back(next_handle_vec); + } else // keep extending the haplotype! + { + pair, gbwt::SearchState> next_haplotype = + make_pair(next_handle_vec, next_search); + haplotype_queue.push_back(next_haplotype); + } + // next_handle will be touched. + touched_handles.emplace(next_handle); + } + } + // if next_searches is empty, the path has ended but not reached sink. + else if (next_searches.empty()) { + // We have reached the end of the path, but it doesn't reach the sink. + // we need to add cur_haplotype to other_haplotypes. + other_haplotypes.push_back(cur_haplotype.first); + } + // if new_handle is the sink, put in haplotypes_from_source_to_sink + else if (haploGraph.get_id( + haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { + // Then we need to add cur_haplotype + next_search to + // haplotypes_from_source_to_sink. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + cur_haplotype.first.push_back(next_handle); + haplotypes_from_source_to_sink.push_back(cur_haplotype.first); + + // touched next_search's handle + touched_handles.emplace(next_handle); + } + // else, there is just one next_search, and it's not the end of the path. 
+ // just extend the search by adding (cur_haplotype + next_search to + // haplotype_queue. + else { + // get the next_handle from the one next_search. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + + // modify cur_haplotype with next_handle and next_search. + cur_haplotype.first.push_back(next_handle); + cur_haplotype.second = + next_searches.back(); // there's only one next_search in next_searches. + + // put cur_haplotype back in haplotype_queue. + haplotype_queue.push_back(cur_haplotype); + touched_handles.emplace(next_handle); + } + } + + // Find any haplotypes starting from handles not starting at the source, but which + // still start somewhere inside the snarl. + vector> haplotypes_not_starting_at_source = + find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); + + // move haplotypes_not_starting_at_source into other_haplotypes: + other_haplotypes.reserve(other_haplotypes.size() + + haplotypes_not_starting_at_source.size()); + move(haplotypes_not_starting_at_source.begin(), + haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); + + return make_pair(haplotypes_from_source_to_sink, other_haplotypes); +} + +// Used to complete the traversal of a snarl along its haplotype threads, when there are +// handles connected to the snarl by +// threads that start after the source handle. (Threads that merely end before the +// sink handle are addressed in extract_gbwt_haplotypes). +// Arguments: +// haploGraph: the GBWTgraph containing the haplotype threads. +// touched_handles: any handles found in the snarl so far. +// sink_id: the id of the final handle in the snarl. +// Returns: +// a vector of haplotypes in vector format that start in the middle of the +// snarl. 
+vector> +find_haplotypes_not_at_source(const GBWTGraph &haploGraph, + unordered_set &touched_handles, + const id_t &sink_id) { + cerr << "find_haplotypes_not_at_source" << endl; + + /// Search every handle in touched handles for haplotypes starting at that point. + // Any new haplotypes will be added to haplotype_queue. + vector, gbwt::SearchState>> haplotype_queue; + + // Fully extended haplotypes (or haplotypes extended to the snarl's sink) + // will be added to finished_haplotypes. + vector> finished_haplotypes; + + // In addition, we need to put the new handle into to_search, because a path may have + // started on the new handle (which means we need to start a searchstate there.) + unordered_set to_search; + + // We don't need to ever check the sink handle, since paths from the sink handle + // extend beyond snarl. + handle_t sink_handle = haploGraph.get_handle(sink_id); + touched_handles.erase(sink_handle); + + // Nested function for making a new_search. Identifies threads starting at a given + // handle and + // either adds them as a full haplotype (if the haplotype is one handle long) or + // makes a new entry to haplotype_queue. + auto make_new_search = [&](handle_t handle) { + // Are there any new threads starting at this handle? + gbwt::SearchState new_search = + haploGraph.index.prefix(haploGraph.handle_to_node(handle)); + if (!new_search.empty()) { + // TODO: test_code code: are searchstates empty? + cerr << "apparently new thread starts at node: " << haploGraph.get_id(handle) + << endl; + cerr << "is the searchstate empty? " << new_search.empty() + << " size: " << new_search.size() << endl; + // Then add them to haplotype_queue. 
+ haploGraph.follow_paths( + new_search, [&](const gbwt::SearchState &next_search) -> bool { + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + /// check to make sure that the thread isn't already finished: + // if next_handle is the sink, or if this thread is only one handle + // long, then there isn't any useful string to extract from this. + if (next_handle != sink_handle || + next_search == gbwt::SearchState()) { + // establish a new thread to walk along. + vector new_path; + new_path.push_back(handle); + new_path.push_back(next_handle); + + pair, gbwt::SearchState> mypair = + make_pair(new_path, next_search); + + // add the new path to haplotype_queue to be extended. + haplotype_queue.push_back(make_pair(new_path, next_search)); + + // if next_handle hasn't been checked for starting threads, add to + // to_search. + if (touched_handles.find(next_handle) == touched_handles.end()) { + to_search.emplace(next_handle); + } + } + return true; + }); + } + }; + + /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. + /// Then, check to see if there are any new threads on handles in to_search. + /// Extend those threads, and add any newly found handles to to_search, + /// then search for threads again in to_search again... repeat until to_search remains + /// emptied of new handles. + + // for tracking whether the haplotype thread is still extending: + bool still_extending; + while (!to_search.empty() || !haplotype_queue.empty()) { + while (!haplotype_queue.empty()) { + // get a haplotype to extend out of haplotype_queue - a tuple of + // (handles_traversed_so_far, last_touched_SearchState) + pair, gbwt::SearchState> cur_haplotype = + haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the + // searchstate from cur_haplotype. 
+ vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState &next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + for (gbwt::SearchState next_search : next_searches) { + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + // if next_search is empty, then we've fallen off the thread, + // and cur_haplotype can be placed in finished_haplotypes as is for this + // thread. + if (next_search == gbwt::SearchState()) { + finished_haplotypes.push_back(cur_haplotype.first); + } + + // if next_search is on the sink_handle, + // then cur_haplotype.first + next_search goes to finished_haplotypes. + else if (haploGraph.get_id(next_handle) == sink_id) { + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + // add next_handle + next_handle_vec.push_back(next_handle); + // place in finished_haplotypes + finished_haplotypes.push_back(next_handle_vec); + + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { + to_search.emplace(next_handle); + } + + } + // otherwise, just place an extended cur_haplotype in haplotype_queue. + else { + // copy over cur_haplotype: + pair, gbwt::SearchState> cur_haplotype_copy = + cur_haplotype; + // modify with next_handle/search + cur_haplotype_copy.first.push_back(next_handle); + cur_haplotype_copy.second = next_search; + // place back in haplotype_queue for further extension. + haplotype_queue.push_back(cur_haplotype_copy); + + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { + to_search.emplace(next_handle); + } + } + } + } + // Then, make more new_searches from the handles in to_search. 
+ for (handle_t handle : to_search) { + make_new_search(handle); // will add to haplotype_queue if there's any + // new_searches to be had. + } + to_search.clear(); + } + return finished_haplotypes; +} + +// Given a vector of haplotypes of format vector< handle_t >, returns a vector of +// haplotypes of +// format string (which is the concatenated sequences in the handles). +// Arguments: +// haploGraph: a GBWTGraph which contains the handles in vector< handle_t > +// haplotypes. haplotypte_handle_vectors: a vector of haplotypes in vector< handle_t +// > format. +// Returns: a vector of haplotypes of format string (which is the concatenated sequences +// in the handles). +vector format_handle_haplotypes_to_strings( + const GBWTGraph &haploGraph, + const vector> &haplotype_handle_vectors) { + cerr << "format_handle_haplotypes_to_strings" << endl; + vector haplotype_strings; + for (vector haplotype_handles : haplotype_handle_vectors) { + string hap; + for (handle_t &handle : haplotype_handles) { + hap += haploGraph.get_sequence(handle); + } + haplotype_strings.push_back(hap); + } + return haplotype_strings; +} + +// TODO: eventually change to deal with haplotypes that start/end in middle of snarl. +// Aligns haplotypes to create a new graph using MSAConverter's seqan converter. +// Assumes that each haplotype stretches from source to sink. +// Arguments: +// source_to_sink_haplotypes: a vector of haplotypes in string format (concat of +// handle sequences). +// Returns: +// VG object representing the newly realigned snarl. 
+VG align_source_to_sink_haplotypes(const vector &source_to_sink_haplotypes) { + cerr << "align_source_to_sink_haplotypes" << endl; + seqan::Align align; // create multiple_sequence_alignment object + + seqan::resize(rows(align), source_to_sink_haplotypes.size()); + for (int i = 0; i < source_to_sink_haplotypes.size(); ++i) { + assignSource(row(align, i), source_to_sink_haplotypes[i].c_str()); + } + + globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); + + stringstream ss; + ss << align; + MSAConverter myMSAConverter = MSAConverter(); + myMSAConverter.load_alignments(ss, "seqan"); + VG snarl = myMSAConverter.make_graph(); + snarl.clear_paths(); + + // TODO: find better way to improve disamiguation of beginning/ending regions of nodes + // TODO: than by adding leading/trailing AAA seq (essentially a special + // character). + pair, vector> source_and_sink = + debug_get_sources_and_sinks(snarl); + + // Replace source with a handle that has the leading AAA seq removed. + handle_t source = source_and_sink.first.back(); + string source_seq = snarl.get_sequence(source); + id_t source_id = snarl.get_id(source); + handle_t new_source = snarl.create_handle(source_seq.substr(8, source_seq.size())); + snarl.follow_edges(source, false, [&](const handle_t &handle) { + snarl.create_edge(new_source, handle); + }); + snarl.destroy_handle(source); + + handle_t sink = source_and_sink.second.back(); + string sink_seq = snarl.get_sequence(sink); + id_t sink_id = snarl.get_id(sink); + handle_t new_sink = snarl.create_handle(sink_seq.substr(0, (sink_seq.size() - 8))); + snarl.follow_edges( + sink, true, [&](const handle_t &handle) { snarl.create_edge(handle, new_sink); }); + snarl.destroy_handle(sink); + + return snarl; +} + +// Finds all embedded paths that either start or end in a snarl (or both) defined by +// source_id, sink_id. 
+// returns a vector of the embedded paths, where each entry in the vector is defined +// by the pair of step_handles closest to the beginning and end of the path. If the +// path is fully contained within the snarl, these step_handles will the be the +// leftmost and rightmost handles in the path. +// Arguments: +// graph: a pathhandlegraph containing the snarl with embedded paths. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a vector containing all the embedded paths in the snarl, in pair< step_handle_t, +// step_handle_t > > format. Pair.first is the first step in the path's range of +// interest, and pair.second is the step *after* the last step in the path's range of +// interest (can be the null step at end of path). +vector> +extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, + const id_t &sink_id) { + cerr << "extract_embedded_paths_in_snarl" << endl; + + // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't + // extend the path to a point beyond the source or sink. + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + // key is path_handle, value is a step in that path from which to extend. + unordered_map paths_found; + + // look for handles with paths we haven't touched yet. + snarl.for_each_handle([&](const handle_t &handle) { + vector steps = graph.steps_of_handle(handle); + // do any of these steps belong to a path not in paths_found? + for (step_handle_t &step : steps) { + path_handle_t path = graph.get_path_handle_of_step(step); + // If it's a step along a new path, save the first step to that path we find. + // In addtion, if there are multiple steps found in the path, (The avoidance + // of source and sink here is to ensure that we can properly check to see if + // we've reached the end of an embedded path walking in any arbitrary + // direction (i.e. source towards sink or sink towards source). 
+ if (paths_found.find(path) == paths_found.end() || + graph.get_id(graph.get_handle_of_step(paths_found[path])) == source_id || + graph.get_id(graph.get_handle_of_step(paths_found[path])) == sink_id) { + // then we need to mark it as found and save the step. + paths_found[path] = step; + } + } + }); + + /// for each step_handle_t corresponding to a unique path, we want to get the steps + /// closest to both the end and beginning step that still remains in the snarl. + // TODO: Note copy paste of code here. In python I'd do "for fxn in [fxn1, fxn2]:", + // TODO so that I could iterate over the fxn. That sounds template-messy in C++ + // tho'. Should I? + vector> paths_in_snarl; + for (auto &it : paths_found) { + step_handle_t step = it.second; + // path_in_snarl describes the start and end steps in the path, + // as constrained by the snarl. + pair path_in_snarl; + + // Look for the step closest to the beginning of the path, as constrained by the + // snarl. + step_handle_t begin_in_snarl_step = step; + id_t begin_in_snarl_id = + graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + + while ((begin_in_snarl_id != source_id) && (begin_in_snarl_id != sink_id) && + graph.has_previous_step(begin_in_snarl_step)) { + begin_in_snarl_step = graph.get_previous_step(begin_in_snarl_step); + begin_in_snarl_id = + graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + } + path_in_snarl.first = begin_in_snarl_step; + + // Look for the step closest to the end of the path, as constrained by the snarl. 
+ step_handle_t end_in_snarl_step = step; + id_t end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); + + while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and + graph.has_next_step(end_in_snarl_step)) { + end_in_snarl_step = graph.get_next_step(end_in_snarl_step); + end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); + } + // Note: when adding the end step, path notation convention requires that we add + // the null step at the end of the path (or the next arbitrary step, in the case + // of a path that extends beyond our snarl.) + // TODO: do we want the next arbitrary step in that latter case? + path_in_snarl.second = graph.get_next_step(end_in_snarl_step); + + paths_in_snarl.push_back(path_in_snarl); + } + + return paths_in_snarl; +} + +// TODO: change the arguments to handles, which contain orientation within themselves. +// Given a start and end node id, construct an extract subgraph between the two nodes +// (inclusive). Arguments: +// graph: a pathhandlegraph containing the snarl with embedded paths. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a SubHandleGraph containing only the handles in graph that are between start_id +// and sink_id. +SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, + const id_t &sink_id) { + cerr << "extract_subgraph" << endl; + /// make a subgraph containing only nodes of interest. (e.g. a snarl) + // make empty subgraph + SubHandleGraph subgraph = SubHandleGraph(&graph); + + unordered_set visited; // to avoid counting the same node twice. + unordered_set to_visit; // nodes found that belong in the subgraph. + + // TODO: how to ensure that "to the right" of start_handle is the correct direction? 
+ // initialize with start_handle (because we move only to the right of start_handle): + handle_t start_handle = graph.get_handle(start_id); + subgraph.add_handle(start_handle); + visited.insert(graph.get_id(start_handle)); + + // look only to the right of start_handle + graph.follow_edges(start_handle, false, [&](const handle_t &handle) { + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + + /// explore the rest of the snarl: + while (to_visit.size() != 0) { + // remove cur_handle from to_visit + unordered_set::iterator cur_index = to_visit.begin(); + handle_t cur_handle = graph.get_handle(*cur_index); + + to_visit.erase(cur_index); + + /// visit cur_handle + visited.insert(graph.get_id(cur_handle)); + + subgraph.add_handle(cur_handle); + + if (graph.get_id(cur_handle) != sink_id) { // don't iterate past end node! + // look for all nodes connected to cur_handle that need to be added + // looking to the left, + graph.follow_edges(cur_handle, true, [&](const handle_t &handle) { + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + // looking to the right, + graph.follow_edges(cur_handle, false, [&](const handle_t &handle) { + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + } + } + return subgraph; +} + +// Integrates the snarl into the graph, replacing the snarl occupying the space between +// source_id and sink_id. +// In the process, transfers any embedded paths traversing the old snarl into the new +// snarl. +// Arguments: +// graph: the graph in which we want to insert the snarl. +// to_insert_snarl: a *separate* handle_graph from graph, often generated from +// MSAconverter. embedded_paths: a vector of paths, where each is a pair. 
+// pair.first is the first step_handle of interest in the +// old_embedded_path, and pair.second is the step_handle *after* +// the last step_handle of interest in the old_embedded_path (can +// be the null step at the end of the path.) +// source_id: the source of the old (to be replaced) snarl in graph +// sink_id: the sink of the old (to be replaced) snarl in graph. +// Return: None. +// TODO: Note: How to ensure that step_handle_t's walk along the snarl in the same +// TODO: orientation as we expect? i.e. that they don't move backward? I think +// TODO: we want match_orientation to be = true, but this may cause problems +// TODO: in some cases given the way we currently construct handles (fixed when we +// TODO: create snarl-scanning interface). +// TODO: It may also be that we *don't want match_orientation to be true, +// TODO: if we're tracking a path that loops backward in the snarl. Hmm... Will think +// about this. +void integrate_snarl(MutablePathDeletableHandleGraph &graph, + const HandleGraph &to_insert_snarl, + const vector> embedded_paths, + const id_t &source_id, const id_t &sink_id) { + cerr << "integrate_snarl" << endl; + // Get old graph snarl + SubHandleGraph old_snarl = extract_subgraph(graph, source_id, sink_id); + + // TODO: test_code: Check to make sure that newly made snarl has only one start and + // end. + // TODO: (shouldn't be necessary once we've implemented alignment with + // leading/trailing special chars.) Identify old and new snarl start and sink + pair, vector> to_insert_snarl_defining_handles = + debug_get_sources_and_sinks(to_insert_snarl); + + if (to_insert_snarl_defining_handles.first.size() > 1 || + to_insert_snarl_defining_handles.second.size() > 1) { + cerr << "ERROR: newly made snarl with more than one start or end. 
# of starts: " + << to_insert_snarl_defining_handles.first.size() + << " # of ends: " << to_insert_snarl_defining_handles.second.size() << endl; + return; + } + + /// Replace start and end handles of old graph snarl with to_insert_snarl start and + /// end, and delete rest of old graph snarl: + + // add to_insert_snarl into graph without directly attaching the snarl to the graph + // (yet). + vector to_insert_snarl_topo_order = + algorithms::lazier_topological_order(&to_insert_snarl); + + // Construct a parallel new_snarl_topo_order to identify + // paralogous nodes between to_insert_snarl and the new snarl inserted in graph. + vector new_snarl_topo_order; + + // integrate the handles from to_insert_snarl into the graph, and keep track of their + // identities by adding them to new_snarl_topo_order. + for (handle_t to_insert_snarl_handle : to_insert_snarl_topo_order) { + handle_t graph_handle = + graph.create_handle(to_insert_snarl.get_sequence(to_insert_snarl_handle)); + new_snarl_topo_order.push_back(graph_handle); + } + + // Connect the newly made handles in the graph together the way they were connected in + // to_insert_snarl: + for (int i = 0; i < to_insert_snarl_topo_order.size(); i++) { + to_insert_snarl.follow_edges( + to_insert_snarl_topo_order[i], false, [&](const handle_t &snarl_handle) { + // get topo_index of nodes to be connected to graph start handle + auto it = find(to_insert_snarl_topo_order.begin(), + to_insert_snarl_topo_order.end(), snarl_handle); + int topo_index = it - to_insert_snarl_topo_order.begin(); + + // connect graph start handle + graph.create_edge(new_snarl_topo_order[i], + new_snarl_topo_order[topo_index]); + }); + } + + // save the source and sink values of new_snarl_topo_order, since topological order is + // not necessarily preserved by move_path_to_snarl. Is temporary b/c we need to + // replace the handles with ones with the right id_t label for source and sink later + // on. 
+ id_t temp_snarl_source_id = graph.get_id(new_snarl_topo_order.front()); + id_t temp_snarl_sink_id = graph.get_id(new_snarl_topo_order.back()); + + // Add the neighbors of the source and sink of the original snarl to the new_snarl's + // source and sink. + // source integration: + graph.follow_edges( + graph.get_handle(source_id), true, [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, graph.get_handle(temp_snarl_source_id)); + }); + graph.follow_edges( + graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { + graph.create_edge(graph.get_handle(temp_snarl_sink_id), next_handle); + }); + + // For each path of interest, move it onto the new_snarl. + for (auto path : embedded_paths) { + move_path_to_snarl(graph, path, new_snarl_topo_order, temp_snarl_source_id, + temp_snarl_sink_id); + } + + // Destroy the old snarl. + old_snarl.for_each_handle( + [&](const handle_t &handle) { graph.destroy_handle(handle); }); + + // Replace the source and sink handles with ones that have the original source/sink id + // (for compatibility with future iterations on neighboring top-level snarls using the + // same snarl manager. Couldn't replace it before b/c we needed the old handles to + // move the paths. + handle_t new_source_handle = graph.create_handle( + graph.get_sequence(graph.get_handle(temp_snarl_source_id)), source_id); + handle_t new_sink_handle = + graph.create_handle(graph.get_sequence(new_snarl_topo_order.back()), sink_id); + + // move the source edges: + // TODO: note the copy/paste. Ask if there's a better way to do this (I totally could + // in Python!) 
+ graph.follow_edges(graph.get_handle(temp_snarl_source_id), true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, new_source_handle); + }); + graph.follow_edges(graph.get_handle(temp_snarl_source_id), false, + [&](const handle_t &next_handle) { + graph.create_edge(new_source_handle, next_handle); + }); + + // move the sink edges: + graph.follow_edges(graph.get_handle(temp_snarl_sink_id), true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, new_sink_handle); + }); + graph.follow_edges(graph.get_handle(temp_snarl_sink_id), false, + [&](const handle_t &next_handle) { + graph.create_edge(new_sink_handle, next_handle); + }); + + // move the paths: + graph.for_each_step_on_handle( + graph.get_handle(temp_snarl_source_id), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_source_handle}); + }); + graph.for_each_step_on_handle( + graph.get_handle(temp_snarl_sink_id), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_sink_handle}); + }); + + // delete the previously created source and sink: + for (handle_t handle : + {graph.get_handle(temp_snarl_source_id), graph.get_handle(temp_snarl_sink_id)}) { + graph.destroy_handle(handle); + } +} + +// Moves a path from its original location in the graph to a new snarl, +// defined by a vector of interconnected handles. +// NOTE: the handles in new_snarl_handles may not preserve topological order after +// being passed to this method, if they were ordered before. +// Arguments: graph: the graph containing the old_embedded_path and the handles in +// new_snarl_topo_order +// old_embedded_path: a pair, where +// pair.first is the first step_handle of interest in the +// old_embedded_path, and pair.second is the step_handle *after* +// the last step_handle of interest in the old_embedded_path (can +// be the null step at the end of the path.) 
+// new_snarl_topo_order: all the handles in the new snarl, inside the graph. +// Return: None. +void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, + const pair &old_embedded_path, + vector &new_snarl_handles, id_t &source_id, + id_t &sink_id) { + cerr << endl << "move_path_to_snarl" << endl; + + cerr << "for path " + << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + << endl; + // get the sequence associated with the path + string path_seq; + step_handle_t cur_step = old_embedded_path.first; + cerr << " old_embedded path looks like: " + << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) << " " + << graph.get_id(graph.get_handle_of_step( + graph.get_previous_step(old_embedded_path.second))) + << endl; + + cerr << "new snarl source and sink: " << source_id << " " << sink_id << endl; + + while (cur_step != old_embedded_path.second) { + path_seq += graph.get_sequence(graph.get_handle_of_step(cur_step)); + cur_step = graph.get_next_step(cur_step); + } + + cerr << "pathseq: " << path_seq << endl; + + // for the given path, find every good possible starting handle in the new_snarl + // format of pair is < possible_path_handle_vec, + // starting_index_in_the_first_handle, current_index_in_path_seq> + vector, int, int>> possible_paths; + for (handle_t handle : new_snarl_handles) { + string handle_seq = graph.get_sequence(handle); + // starting index is where the path would begin in the handle, + // since it could begin in the middle of the handle. + vector starting_indices = + check_handle_as_start_of_path_seq(handle_seq, path_seq); + // TODO: debug_code: indices of start? + // cerr << "indices of start in handle " << graph.get_id(handle) << " with sequence " + // << graph.get_sequence(handle) << "?" 
<< endl; + // for (auto start : starting_indices) { + // cerr << start << " " << graph.get_sequence(handle).substr(start) << endl; + // } + // cerr << endl; + // cerr << "this what the seq looks like: " << endl; + // for (auto start : starting_indices){ + // cerr << start << " " << + // } + // if there is a starting index, + if (starting_indices.size() != 0) { + // if the starting_indices implies that the starting handle entirely contains + // the path_seq of interest: + // cerr << " does the starting handle contain the pathseq? " + // << ((handle_seq.size() - starting_indices.back()) >= path_seq.size()) + // << endl; + // cerr << "handle_seq.size()" << handle_seq.size() << "starting_indices.back()" + // << starting_indices.back() << "path_seq.size()" << path_seq.size() + // << endl; + + if ((handle_seq.size() - starting_indices.back()) >= path_seq.size()) { + cerr << " found a full path at node " << graph.get_id(handle) << " at " << starting_indices.back() << endl; + // then we've already found the full mapping location of the path! Move + // the path, end the method. + vector new_path{handle}; + graph.rewrite_segment(old_embedded_path.first, old_embedded_path.second, + new_path); + return; + } else { + cerr << "adding possible path at node " << graph.get_id(handle) << " at " << endl; + for (auto index : starting_indices){ + cerr << index << " "; + } + cerr << endl; + // add it as a possible_path. 
+ vector possible_path_handle_vec{handle}; + for (auto starting_index : starting_indices) { + possible_paths.push_back( + make_tuple(possible_path_handle_vec, starting_index, + handle_seq.size() - starting_index)); + } + } + } + } + + // for every possible path, extend it to determine if it really is the path we're + // looking for: + while (!possible_paths.empty()) { + // take a path off of possible_paths, which will be copied for every iteration through graph.follow_edges, below: + tuple, int, int> possible_path_query = possible_paths.back(); + + cerr << "possible paths looks like: " << endl; + for (auto path: possible_paths){ + for (auto handle: get<0>(path)){ + cerr << graph.get_id(handle) << " "; + } + cerr << endl << "- - -" << endl; + } + cerr << endl; + possible_paths.pop_back(); + + // extend the path through all right-extending edges to see if any subsequent + // paths still satisfy the requirements for being a possible_path: + bool no_path = graph.follow_edges( + get<0>(possible_path_query).back(), false, [&](const handle_t &next) { + // make a copy to be extended for through each possible next handle in follow edges. + tuple, int, int> possible_path = possible_path_query; + + + string next_seq = graph.get_sequence(next); + id_t next_id = graph.get_id(next); + cerr << "iterating through possible paths loop. 
id of next is: " << graph.get_id(next) << endl; + cerr << "ALSO: possible paths looks like: "; + for (auto handle: get<0>(possible_path)){ + cerr << graph.get_id(handle) << " "; + } + cerr << endl; + int &cur_index_in_path = get<2>(possible_path); + + + // // if the next handle would be the ending handle for the path, + // if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { + // // check to see if the sequence in the handle is suitable for ending + // // the path: + // int compare_length = path_seq.size() - cur_index_in_path; + // if (next_seq.compare(0, compare_length, path_seq, cur_index_in_path, + // compare_length) == 0) { + // // we've found the new path! Move path to the new sequence, and + // // end the function. + // // TODO: move the path to the new vector of handles, splitting + // // start and end handles if need be. NOTE: if sink handle, we need + // // to ensure that the sink is properly placed at the end of the + // // new_snarl_topo_order (for future re-naming of sink id to be the + // // same as the original snarl). + // // if the path ends before the end of next_seq, then split the + // // handle so that the path ends flush with the end of the + // // first of the two split handles. + // if (compare_length < next_seq.size()) { + // pair divided_next = + // graph.divide_handle(next, compare_length); + // get<0>(possible_path).push_back(divided_next.first); + // find(new_snarl_handles.begin(), new_snarl_handles.end(), next); + // new_snarl_handles.push_back(divided_next.first); + // } else { + // get<0>(possible_path).push_back(next); + // } + // graph.rewrite_segment(old_embedded_path.first, + // old_embedded_path.second, + // get<0>(possible_path)); + + // // TODO: test_code: show when we find a path: + // cerr << "found a full path named " + // << graph.get_path_name(graph.get_path_handle_of_step( + // old_embedded_path.first)) + // << "! 
Here is the sequence of handles:" << endl; + // for (handle_t handle : get<0>(possible_path)) { + // cerr << graph.get_id(handle) << ": " + // << graph.get_sequence(handle) << " " << endl; + // } + // return false; + // } + // } + + // if the next handle would be the ending handle for the path, + if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { + // check to see if the sequence in the handle is suitable for ending + // the path: + int compare_length = path_seq.size() - cur_index_in_path; + if (next_seq.compare(0, compare_length, path_seq, cur_index_in_path, + compare_length) == 0) { + // we've found the new path! Move path to the new sequence, and + // end the function. + + if (compare_length < next_seq.size()) { + // If the path ends before the end of next_seq, then split the + // handle so that the path ends flush with the end of the + // first of the two split handles. + + // divide the handle where the path ends; + pair divided_next = + graph.divide_handle(next, compare_length); + get<0>(possible_path).push_back(divided_next.first); + // Special case if next is the sink or the source, to preserve + // the reassignment of source and sink ids in integrate_snarl. + if (next_id = sink_id) { + sink_id = graph.get_id(divided_next.second); + } + + // TODO: NOTE: finding the old "next" handle is expensive. + // TODO: Use different container? + auto it = find(new_snarl_handles.begin(), + new_snarl_handles.end(), next); + + // replace the old invalidated handle with one of the new ones + *it = divided_next.first; + // stick the other new handle on the end of new_snarl_handles. + new_snarl_handles.push_back(divided_next.second); + + } else { + // otherwise, the end of the path already coincides with the + // end of the handle. In that case, just add it to the path. 
+ get<0>(possible_path).push_back(next); + } + graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, + get<0>(possible_path)); + + // TODO: test_code: show when we find a path: + cerr << "found a full path named " + << graph.get_path_name(graph.get_path_handle_of_step( + old_embedded_path.first)) + << "! Here is the sequence of handles:" << endl; + for (handle_t handle : get<0>(possible_path)) { + cerr << graph.get_id(handle) << ": " + << graph.get_sequence(handle) << " " << endl; + } + return false; + } + } + // see if the next handle would be the continuation of the path, but not + // the end, + else { + // check to see if the sequence in the handle is suitable for + // extending the path: + int compare_length = next_seq.size(); + if (next_seq.compare(0, compare_length, path_seq, cur_index_in_path, + compare_length) == 0) { + // extend the path + get<0>(possible_path).push_back(next); + + cerr << "we've found an extension for a path starting at " << get<1>(possible_path) << ": "; + for (auto handle : get<0>(possible_path)){ + cerr << graph.get_id(handle) << " "; + } + cerr << endl; + + // update the current index in path_seq. + get<2>(possible_path) += next_seq.size(); + + // place back into possible_paths + possible_paths.push_back(possible_path); + } + } + // continue to iterate through follow_edges. + return true; + }); + + // if we've found a complete path in the above follow_edges, then we've already + // moved the path, and we're done. + if (!no_path) { + return; + } + } + // if we failed to find a path, show an error message. + // TODO: make this better! Throw an exception? + cerr << "Warning! Didn't find a corresponding path of name " + << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + << " from the old snarl in the newly aligned snarl." 
<< endl + << endl; + // cerr << "Here's the sequence of the path: " << path_seq << endl + // << "Here's the start and end node ids of the path: " + // << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) << " " + // << graph.get_id(graph.get_handle_of_step(old_embedded_path.second)) << endl + // << endl; +} + +// Determines whether some subsequence in a handle satisfies the condition of being the +// beginning of a path. +// If the path_seq is longer than the handle_seq, only checks subsequences that reach +// from the beginning/middle of the handle_seq to the end. If path_seq is shorter +// than handle_seq, checks for any substring of length path_seq within the +// handle_seq, as well as substrings smaller than length path_seq that extend beyond +// the current handle. +// Arguments: +// handle_seq: the sequence in the handle we're trying to identify as a +// start_of_path_seq. path_seq: the sequence in the path we're trying to find +// starting points for in handle_seq +// Return: a vector of all potential starting index of the subsequence in the handle_seq. +vector check_handle_as_start_of_path_seq(const string &handle_seq, + const string &path_seq) { + vector possible_start_indices; + cerr << "check_handle_as_start_of_path_seq" << endl; + // If the handle_seq.size <= path_seq.size, look for subsequences reaching from + // beginning/middle of handle_seq to the end - where path_seq may run off the end + // of this handle to the next in the snarl. + if (handle_seq.size() <= path_seq.size()) { + // iterate through all possible starting positions in the handle_seq. + for (int handle_start_i = 0; handle_start_i < handle_seq.size(); + handle_start_i++) { + int subseq_size = handle_seq.size() - handle_start_i; + // The path_seq subsequence of interest is from 0 to subseq_size; + // The handle_seq subsequence of interest starts at handle_start_i + // and ends at the end of the handle_seq (len subseq_size). + // if compare returns 0, the substring matches. 
+ if (path_seq.compare(0, subseq_size, handle_seq, handle_start_i, + subseq_size) == 0) { + possible_start_indices.push_back(handle_start_i); + } + } + } + // if handle_seq.size > path_seq.size, look for any subsequence within handle_seq of + // path_seq.size, as well as any subsequence smaller than path_seq reaching from + // middle of handle_seq to the end of handle_seq. + else { + // first, search through all handle_seq for any comparable subsequence of + // path_seq.size. Note: only differences between this for loop and above for loop + // is that handle_start_i stops at (<= path_seq.size() - handle_seq.size()), and + // subseq.size() = path_seq.size() + for (int handle_start_i = 0; + handle_start_i < (handle_seq.size() - path_seq.size()); handle_start_i++) { + int subseq_size = path_seq.size(); + // The path_seq subsequence of interest is from 0 to subseq_size; + // The handle_seq subsequence of interest starts at handle_start_i + // and ends at the end of the handle_seq (len subseq_size). + // if compare returns 0, the substring matches. + if (path_seq.compare(0, subseq_size, handle_seq, handle_start_i, + subseq_size) == 0) { + possible_start_indices.push_back(handle_start_i); + } + } + // second, search through the last few bases of handle_seq for the beginning of + // path_seq. Note: nearly identical for loop to the one in "if (handle_seq.size() + // <= path_seq.size())" + for (int handle_start_i = (handle_seq.size() - path_seq.size() + 1); + handle_start_i < handle_seq.size(); handle_start_i++) { + int subseq_size = handle_seq.size() - handle_start_i; + // The path_seq subsequence of interest is from 0 to subseq_size; + // The handle_seq subsequence of interest starts at handle_start_i + // and ends at the end of the handle_seq (len subseq_size). + // if compare returns 0, the substring matches. 
+ if (path_seq.compare(0, subseq_size, handle_seq, handle_start_i, + subseq_size) == 0) { + possible_start_indices.push_back(handle_start_i); + } + } + } + // Note: if we passed through the above check without returning anything, then there + // isn't any satisfactory subsequence. + return possible_start_indices; +} + +// ------------------------------ DEBUG CODE BELOW: +// ------------------------------------------ + +// Returns pair where pair.first is a vector of all sources of the given graph and +// path.second is all the sinks of the given graph. If graph is a subhandlegraph of a +// snarl, there should only be one source and sink each. +pair, vector> +debug_get_sources_and_sinks(const HandleGraph &graph) { + cerr << "debug_get_source_and_sinks" << endl; + vector sink; + vector source; + + // identify sources and sinks + graph.for_each_handle([&](const handle_t &handle) { + bool is_source = true, is_sink = true; + graph.follow_edges(handle, true, [&](const handle_t &prev) { + is_source = false; + return false; + }); + graph.follow_edges(handle, false, [&](const handle_t &next) { + is_sink = false; + return false; + }); + + // base case for dynamic programming + if (is_source) { + source.push_back(handle); + } + if (is_sink) { + sink.emplace_back(handle); + } + }); + + return pair, vector>(source, sink); +} + +// Runs through the whole snarl and generates all possible strings representing walks from +// source to sink. Generates a combinatorial number of possible paths with splits in the +// snarl. 
// Enumerates every source-to-sink walk in the snarl between start_id and sink_id as a
// string, by dynamic programming over a topological order. Beware: the number of walks
// is combinatorial in the number of splits in the snarl.
vector<string> debug_graph_to_strings(MutablePathDeletableHandleGraph &graph,
                                      id_t start_id, id_t sink_id) {
    cerr << "debug_graph_to_strings" << endl;
    SubHandleGraph snarl = extract_subgraph(graph, start_id, sink_id);

    // sequences[h] = all walk-prefixes that end at handle h.
    unordered_map<handle_t, vector<string>> sequences;
    vector<handle_t> sinks;
    // count[h] = number of distinct walks from a source to handle h.
    unordered_map<handle_t, size_t> count;
    count.reserve(snarl.get_node_count()); // resize count to contain enough buckets for
                                           // size of snarl
    sequences.reserve(snarl.get_node_count()); // resize sequences to contain enough
                                               // buckets for size of snarl

    // identify sources and sinks //TODO: once we've established that this fxn works, we
    // can just use start_id and sink_id.
    snarl.for_each_handle([&](const handle_t &handle) {
        bool is_source = true, is_sink = true;
        snarl.follow_edges(handle, true, [&](const handle_t &prev) {
            is_source = false;
            return false;
        });
        snarl.follow_edges(handle, false, [&](const handle_t &next) {
            is_sink = false;
            return false;
        });

        // base case for dynamic programming
        if (is_source) {
            count[handle] = 1;
            sequences[handle].push_back(
                snarl.get_sequence(handle)); // TODO: presented in the handle's local
                                             // forward orientation. An issue?
        }
        if (is_sink) {
            sinks.emplace_back(handle);
        }
    });

    // count walks by dynamic programming
    bool overflowed = false;
    for (const handle_t &handle : algorithms::lazier_topological_order(&snarl)) {
        size_t count_here = count[handle];
        vector<string> seqs_here = sequences[handle];

        snarl.follow_edges(handle, false, [&](const handle_t &next) {
            size_t &count_next = count[next];
            string seq_next = snarl.get_sequence(next);

            // guard against size_t overflow of the walk count before adding.
            if (numeric_limits<size_t>::max() - count_here < count_next) {
                overflowed = true;
            }

            else {
                count_next += count_here;
                // extend every prefix ending here with next's sequence.
                for (string seq : seqs_here) {
                    sequences[next].push_back(seq + seq_next);
                }
            }
        });
        /// TODO: figure out how to deal with overflow.
        // NOTE(review): `overflowed` is set but never acted on — the commented-out
        // early return below is the missing handling. Confirm intended behavior.
        // if (overflowed) {
        //     return numeric_limits<size_t>::max();
        // }
    }

    // total up the walks at the sinks
    // NOTE(review): total_count is computed but unused below — kept for debugging?
    size_t total_count = 0;
    for (handle_t &sink : sinks) {
        total_count += count[sink];
    }

    // all the sequences at the sinks will be all the sequences in the snarl.
    vector<string> walks;
    for (handle_t &sink : sinks) {
        for (string seq : sequences[sink]) {
            walks.push_back(seq);
        }
    }

    return walks;
}

} // namespace vg
diff --git a/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.cpp b/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.cpp
new file mode 100644
index 00000000000..4757ebc2eec
--- /dev/null
+++ b/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.cpp
@@ -0,0 +1,1680 @@
+// TODO: I remove snarls where a haplotype begins/ends in the middle
+// TODO: of the snarl. Get rid of this once alignment issue is addressed!
+// TODO: also, limits the number of haplotypes to be aligned, since snarl starting at
+// TODO: 2049699 with 258 haplotypes is taking many minutes.
+
+// TODO: another had 146 haplotypes and took maybe 5 minutes to align. (kept that one
+// in tho' )
+#pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice?
+#include "0_draft_haplotype_realignment.hpp"
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "../gbwt_helper.hpp"
+#include "../handle.hpp"
+#include "../msa_converter.hpp"
+#include "../snarls.hpp"
+#include "../vg.hpp"
+#include "is_acyclic.hpp"
+#include
+
+#include "../types.hpp"
+#include "extract_containing_graph.hpp"
+
+namespace vg {
+
+// TODO: allow for snarls that have haplotypes that begin or end in the middle of the
+// snarl
+// Runs disambiguate_snarl on every top-level snarl in the graph, so long as the
+// snarl only contains haplotype threads that extend fully from source to sink.
+// Arguments:
+//      graph: the full-sized handlegraph that will undergo edits in a snarl.
+// haploGraph: the corresponding gbwtgraph::GBWTGraph of graph. +// snarl_stream: the file stream from .snarl file corresponding to graph. +void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, + const gbwtgraph::GBWTGraph &haploGraph, ifstream &snarl_stream, + const int &max_alignment_size) { + // cerr << "disambiguate_top_level_snarls" << endl; + SnarlManager *snarl_manager = new SnarlManager(snarl_stream); + + int num_snarls_normalized = 0; + int num_snarls_skipped = 0; + vector snarl_roots = snarl_manager->top_level_snarls(); + // error_record's bools are: + // + tuple one_snarl_error_record; + tuple full_error_record; + int num_of_errors = 4; + + pair snarl_sequence_change; + + for (auto roots : snarl_roots) { + + if (roots->start().node_id() == 42777) { + cerr << "disambiguating snarl #" << (num_snarls_normalized + num_snarls_skipped) + << " source: " << roots->start().node_id() + << " sink: " << roots->end().node_id() << endl; + one_snarl_error_record = + disambiguate_snarl(graph, haploGraph, roots->start().node_id(), + roots->end().node_id(), max_alignment_size); + get<0>(full_error_record) += get<0>(one_snarl_error_record); + get<1>(full_error_record) += get<1>(one_snarl_error_record); + get<2>(full_error_record) += get<2>(one_snarl_error_record); + get<3>(full_error_record) += get<3>(one_snarl_error_record); + if (!(get<0>(one_snarl_error_record) || get<1>(one_snarl_error_record) || + get<2>(one_snarl_error_record) || get<3>(one_snarl_error_record))) { + num_snarls_normalized += 1; + snarl_sequence_change.first += get<4>(one_snarl_error_record); + snarl_sequence_change.second += get<5>(one_snarl_error_record); + } else { + num_snarls_skipped += 1; + } + } + } + cerr << endl + << "normalized " << num_snarls_normalized << " snarl(s), skipped " + << num_snarls_skipped << " snarls because. . 
.\nthey exceeded the size limit (" + << get<0>(full_error_record) + << "snarls),\nhad haplotypes starting/ending in the middle of the snarl (" + << get<1>(full_error_record) << "),\nthe snarl was cyclic (" + << get<3>(full_error_record) + << " snarls),\nor there " + "were handles not connected by the gbwt info (" + << get<2>(full_error_record) << " snarls)." << endl; + cerr << "amount of sequence in normalized snarls before normalization: " << snarl_sequence_change.first << endl; + cerr << "amount of sequence in normalized snarls after normalization: " << snarl_sequence_change.second << endl; + + /// Args: + /// source graph to extract subgraph from + /// into graph to extract into + /// positions search outward from these positions + /// max_dist include all nodes and edges that can be reached in at most + /// this distance reversing_walk_length also find graph material that can be reached + + // //todo: debug_statement + // VG outGraph; + // pos_t source_pos = make_pos_t(4211565, false, 0); + // vector pos_vec; + // pos_vec.push_back(source_pos); + // algorithms::extract_containing_graph(&graph, &outGraph, pos_vec, 150); + // outGraph.serialize_to_ostream(cout); + + delete snarl_manager; +} + +// For a snarl in the given graph, with every edge covered by at least one haplotype +// thread in the gbwtgraph::GBWTGraph, +// extract all sequences in the snarl corresponding to the haplotype threads and +// re-align them with MSAConverter/seqan to form a new snarl. Embedded paths are +// preserved; GBWT haplotypes in the snarl are not conserved. +// Arguments: +// graph: the full-sized handlegraph that will undergo edits in a snarl. +// haploGraph: the corresponding gbwtgraph::GBWTGraph of graph. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: none. +// TODO: allow for snarls that have haplotypes that begin or end in the middle of the +// snarl. 
+tuple +disambiguate_snarl(MutablePathDeletableHandleGraph &graph, const gbwtgraph::GBWTGraph &haploGraph, + const id_t &source_id, const id_t &sink_id, + const int &max_alignment_size) { + // cerr << "disambiguate_snarl" << endl; + // error_record's bools are: + // + tuple error_record{0, 0, 0, 0, 0, 0}; + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + + if (!algorithms::is_acyclic(&snarl)) { + cerr << "snarl at " << source_id << " is cyclic. Skipping." << endl; + get<3>(error_record) = true; + } + + // First, find all haplotypes encoded by the GBWT, in order to create the new snarl. + // Return value is tuple< haplotypes_that_stretch_from_source_to_sink, + // haplotypes_that_end/start_prematurely, set of all handles in the haplotypes > + tuple>, vector>, unordered_set> + haplotypes = extract_gbwt_haplotypes(snarl, haploGraph, source_id, sink_id); + + // check to make sure that the gbwt graph has threads connecting all handles: + // ( needs the unordered_set from extract_gbwt haplotypes to be equal to the number of + // handles in the snarl). + unordered_set handles_in_snarl; + snarl.for_each_handle([&](const handle_t handle) { + handles_in_snarl.emplace(handle); + // count the number of bases in the snarl. + get<4>(error_record) += snarl.get_sequence(handle).size(); + }); + + // TODO: this if statement removes snarls where a haplotype begins/ends in the middle + // TODO: of the snarl. Get rid of this once alignment issue is addressed! + // TODO: also, limits the number of haplotypes to be aligned, since snarl starting at + // TODO: 2049699 with 258 haplotypes is taking many minutes. + if (get<1>(haplotypes).empty() && get<0>(haplotypes).size() < max_alignment_size && + get<2>(haplotypes).size() == handles_in_snarl.size()) { + // if (get<1>(haplotypes).empty() && get<2>(haplotypes).size() == + // handles_in_snarl) { if (get<1>(haplotypes).empty()) { Convert the haplotypes + // from vector format to string format. 
+ vector haplotypes_from_source_to_sink = + format_handle_haplotypes_to_strings(haploGraph, get<0>(haplotypes)); + // vector< string > other_haplotypes = + // format_handle_haplotypes_to_strings(haploGraph, get<1>(haplotypes)); + + // Get the embedded paths in the snarl out of the graph, for the purposes of + // moving them into the new snarl. In addition, any embedded paths that stretch + // from source to sink are aligned in the new snarl. + // TODO: once haplotypes that begin/end in the middle of the snarl have been + // TODO: accounted for in the code, align all embedded paths? (and remove next + // TODO: chunk of code that finds source-to-sink paths)? + vector> embedded_paths = + extract_embedded_paths_in_snarl(graph, source_id, sink_id); + + // find the paths that stretch from source to sink: + for (auto path : embedded_paths) { + // cerr << "checking path of name " << + // graph.get_path_name(graph.get_path_handle_of_step(path.first)) << " with + // start " << graph.get_id(graph.get_handle_of_step(path.first)) << " and sink + // " << + // graph.get_id(graph.get_handle_of_step(graph.get_previous_step(path.second))) + // << endl; + if (graph.get_id(graph.get_handle_of_step(path.first)) == source_id && + graph.get_id(graph.get_handle_of_step( + graph.get_previous_step(path.second))) == sink_id) { + // cerr << "adding path of name " << + // graph.get_path_name(graph.get_path_handle_of_step(path.first)) << endl; + // get the sequence of the source to sink path, and add it to the paths to + // be aligned. + string path_seq; + step_handle_t cur_step = path.first; + while (cur_step != path.second) { + path_seq += graph.get_sequence(graph.get_handle_of_step(cur_step)); + cur_step = graph.get_next_step(cur_step); + } + haplotypes_from_source_to_sink.push_back(path_seq); + } + } + // Align the new snarl: + VG new_snarl = align_source_to_sink_haplotypes(haplotypes_from_source_to_sink); + + // count the number of bases in the snarl. 
+ new_snarl.for_each_handle([&](const handle_t handle) { + get<5>(error_record) += new_snarl.get_sequence(handle).size(); + }); + + // todo: make 32 a part of the general object maximum handle_size info. + force_maximum_handle_size(new_snarl, 32); + + // todo: debug_statement + // new_snarl.for_each_handle([&](const handle_t& handle) { + // cerr << new_snarl.get_id(handle) << " " << new_snarl.get_sequence(handle) + // << "\t"; + // }); + + // integrate the new_snarl into the graph, removing the old snarl as you go. + integrate_snarl(graph, new_snarl, embedded_paths, source_id, sink_id); + return error_record; + } else { + if (!get<1>(haplotypes).empty()) { + cerr << "found a snarl starting at " << source_id << " and ending at " + << sink_id + << " with haplotypes that start or end in the middle. Skipping." << endl; + get<1>(error_record) = true; + } + if (get<0>(haplotypes).size() > max_alignment_size) { + cerr << "found a snarl starting at " << source_id << " and ending at " + << sink_id << " with too many haplotypes (" << get<0>(haplotypes).size() + << ") to efficiently align. Skipping." << endl; + get<0>(error_record) = true; + } + if (get<2>(haplotypes).size() != handles_in_snarl.size()) { + cerr << "some handles in the snarl starting at " << source_id + << " and ending at " << sink_id + << " aren't accounted for by the gbwt graph. " + "Skipping." + << endl; + cerr << "these handles are:" << endl << "\t"; + for (auto handle : handles_in_snarl) { + if (get<2>(haplotypes).find(handle) == get<2>(haplotypes).end()) { + cerr << graph.get_id(handle) << " "; + } + } + cerr << endl; + get<2>(error_record) = true; + } + if (get<5>(error_record) > get<4>(error_record)){ + cerr << "NOTE: normalized a snarl which *increased* in sequence quantity, rather than decreased." << endl; + } + return error_record; + } +} // namespace vg + +// TODO: test that it successfully extracts any haplotypes that start/end in the middle of +// TODO: the snarl. 
+// For a snarl in a given gbwtgraph::GBWTGraph, extract all the haplotypes in the snarl. Haplotypes +// are represented +// by vectors of handles, representing the chain of handles in a thread. +// Arguments: +// haploGraph: the gbwtgraph::GBWTGraph containing the snarl. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a pair containting two sets of paths (each represented by a vector). The +// first in the pair represents all paths reaching from source to sink in the snarl, +// and the second representing all other paths in the snarl (e.g. any that don't +// reach both source and sink in the graph.) +// pair>, vector>> +tuple>, vector>, unordered_set> +extract_gbwt_haplotypes(const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &haploGraph, + const id_t &source_id, const id_t &sink_id) { + // cerr << "extract_gbwt_haplotypes" << endl; + + // haplotype_queue contains all started exon_haplotypes not completed yet. + // Every time we encounter a branch in the paths, the next node down the path + // Is stored here, along with the vector of handles that represents the path up + // to the SearchState. + vector, gbwt::SearchState>> haplotype_queue; + + // source and sink handle for haploGraph: + handle_t source_handle = haploGraph.get_handle(source_id); + handle_t sink_handle = haploGraph.get_handle(sink_id); + + // place source in haplotype_queue. + vector source_handle_vec(1, source_handle); + gbwt::SearchState source_state = haploGraph.get_state(source_handle); + haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); + + // touched_handles contains all handles that have been touched by the + // depth first search below, for later use in other_haplotypes_to_strings, which + // identifies paths that didn't stretch from source to sink in the snarl. 
+ unordered_set touched_handles{source_handle, sink_handle}; + + // haplotypes contains all "finished" haplotypes - those that were either walked + // to their conclusion, or until they reached the sink. + vector> haplotypes_from_source_to_sink; + vector> other_haplotypes; + + // sometimes a gbwt thread will indicate a connection between two handles that doesn't + // actually exist in the graph. These connections need to be ignored. + unordered_set incorrect_connections; + + // int prev_size = 0; + // for every partly-extracted thread, extend the thread until it either reaches + // the sink of the snarl or the end of the thread. + while (!haplotype_queue.empty()) { + // todo: debug_statement + // cerr << "haplotype queue: "; + // cerr << "size of queue:" << haplotype_queue.size() << " " << endl; + // for (auto hap : haplotype_queue) { + // cerr << "size: " << hap.first.size() << endl << "handle_ids: "; + // for (handle_t handle : hap.first) { + // cerr << haploGraph.get_id(handle) << " "; + // } + // cerr << endl; + // } + + // get a haplotype out of haplotype_queue to extend - + // a tuple of (handles_traversed_so_far, last_touched_SearchState) + pair, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the searchstate + // from cur_haplotype. + vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + // if next_searches > 1, then we need to make multiple new haplotypes to be + // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. 
+ if (next_searches.size() > 1) { + // for every next_search in next_searches, either create a new, extended + // cur_haplotype to push into haplotype queue, or place in the + // haplotypes_from_source_to_sink if haplotype extends to sink, or place in + // the other_haplotypes if haplotype ends before reaching sink. + for (gbwt::SearchState next_search : next_searches) { + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + // if (!snarl.has_node(snarl.get_id(next_handle)) && + // make_pair(haploGraph.get_id(cur_haplotype.first.back()),haploGraph.get_id(next_handle))) + // { + if (!snarl.has_edge(cur_haplotype.first.back(), next_handle)) { + if (incorrect_connections.find( + snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == + incorrect_connections.end()) { + cerr + << "snarl starting at node " << source_id << " and ending at " + << sink_id + << " has a thread that incorrectly connects two nodes that " + "don't have any edge connecting them. These two nodes are " + << haploGraph.get_id(cur_haplotype.first.back()) << " and " + << haploGraph.get_id(next_handle) + << ". This thread connection will be ignored." << endl; + incorrect_connections.emplace( + snarl.edge_handle(cur_haplotype.first.back(), next_handle)); + + // todo: debug_statement + cerr << "next handle(s) of handle " + << snarl.get_id(cur_haplotype.first.back()) + << " according to snarl:" << endl; + snarl.follow_edges(cur_haplotype.first.back(), false, + [&](const handle_t handle) { + cerr << "\t" << snarl.get_id(handle); + }); + cerr << endl; + } + continue; + } + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + + // add the new handle to the vec: + next_handle_vec.push_back(next_handle); + + // if new_handle is the sink, put in haplotypes_from_source_to_sink + if (haploGraph.get_id(next_handle) == sink_id) { + haplotypes_from_source_to_sink.push_back(next_handle_vec); + } else // keep extending the haplotype! 
+ { + pair, gbwt::SearchState> next_haplotype = + make_pair(next_handle_vec, next_search); + haplotype_queue.push_back(next_haplotype); + } + // next_handle will be touched. + touched_handles.emplace(next_handle); + } + } + // if next_searches is empty, the path has ended but not reached sink. + else if (next_searches.empty()) { + // We have reached the end of the path, but it doesn't reach the sink. + // we need to add cur_haplotype to other_haplotypes. + other_haplotypes.push_back(cur_haplotype.first); + + } + // if next_handle is the sink, put in haplotypes_from_source_to_sink + else if (haploGraph.get_id( + haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { + // Then we need to add cur_haplotype + next_search to + // haplotypes_from_source_to_sink. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + cur_haplotype.first.push_back(next_handle); + haplotypes_from_source_to_sink.push_back(cur_haplotype.first); + + // touched next_search's handle + touched_handles.emplace(next_handle); + } + // else, there is just one next_search, and it's not the end of the path. + // just extend the search by adding (cur_haplotype + next_search to + // haplotype_queue. + else { + // get the next_handle from the one next_search. + handle_t next_handle = haploGraph.node_to_handle(next_searches.back().node); + + // modify cur_haplotype with next_handle and next_search. + cur_haplotype.first.push_back(next_handle); + cur_haplotype.second = + next_searches.back(); // there's only one next_search in next_searches. + + // put cur_haplotype back in haplotype_queue. + haplotype_queue.push_back(cur_haplotype); + touched_handles.emplace(next_handle); + } + } + + // Find any haplotypes starting from handles not starting at the source, but which + // still start somewhere inside the snarl. 
+ vector> haplotypes_not_starting_at_source = + find_haplotypes_not_at_source(haploGraph, touched_handles, sink_id); + + // move haplotypes_not_starting_at_source into other_haplotypes: + other_haplotypes.reserve(other_haplotypes.size() + + haplotypes_not_starting_at_source.size()); + move(haplotypes_not_starting_at_source.begin(), + haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); + + return tuple>, vector>, + unordered_set>{haplotypes_from_source_to_sink, + other_haplotypes, touched_handles}; +} + +// Used to complete the traversal of a snarl along its haplotype threads, when there are +// handles connected to the snarl by +// threads that start after the source handle. (Threads that merely end before the +// sink handle are addressed in extract_gbwt_haplotypes). +// Arguments: +// haploGraph: the GBWTgraph containing the haplotype threads. +// touched_handles: any handles found in the snarl so far. +// sink_id: the id of the final handle in the snarl. +// Returns: +// a vector of haplotypes in vector format that start in the middle of the +// snarl. +vector> +find_haplotypes_not_at_source(const gbwtgraph::GBWTGraph &haploGraph, + unordered_set &touched_handles, + const id_t &sink_id) { + // cerr << "find_haplotypes_not_at_source" << endl; + + /// Search every handle in touched handles for haplotypes starting at that point. + // Any new haplotypes will be added to haplotype_queue. + vector, gbwt::SearchState>> haplotype_queue; + + // Fully extended haplotypes (or haplotypes extended to the snarl's sink) + // will be added to finished_haplotypes. + vector> finished_haplotypes; + + // In addition, we need to put the new handle into to_search, because a path may have + // started on the new handle (which means we need to start a searchstate there.) + unordered_set to_search; + + // We don't need to ever check the sink handle, since paths from the sink handle + // extend beyond snarl. 
+ handle_t sink_handle = haploGraph.get_handle(sink_id); + // touched_handles.erase(sink_handle); + + // Nested function for making a new_search. Identifies threads starting at a given + // handle and + // either adds them as a full haplotype (if the haplotype is one handle long) or + // makes a new entry to haplotype_queue. + auto make_new_search = [&](handle_t handle) { + // Are there any new threads starting at this handle? + gbwt::SearchState new_search = + haploGraph.index->prefix(haploGraph.handle_to_node(handle)); + if (!new_search.empty()) { + // Then add them to haplotype_queue. + haploGraph.follow_paths( + new_search, [&](const gbwt::SearchState &next_search) -> bool { + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + /// check to make sure that the thread isn't already finished: + // if next_handle is the sink, or if this thread is only one handle + // long, then there isn't any useful string to extract from this. + if (next_handle != sink_handle || + next_search == gbwt::SearchState()) { + // establish a new thread to walk along. + vector new_path; + new_path.push_back(handle); + new_path.push_back(next_handle); + + pair, gbwt::SearchState> mypair = + make_pair(new_path, next_search); + + // add the new path to haplotype_queue to be extended. + haplotype_queue.push_back(make_pair(new_path, next_search)); + + // if next_handle hasn't been checked for starting threads, add to + // to_search. + if (touched_handles.find(next_handle) == touched_handles.end()) { + to_search.emplace(next_handle); + } + } + return true; + }); + } + }; + + /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. + /// Then, check to see if there are any new threads on handles in to_search. + /// Extend those threads, and add any newly found handles to to_search, + /// then search for threads again in to_search again... repeat until to_search remains + /// emptied of new handles. 
+ + // for tracking whether the haplotype thread is still extending: + bool still_extending; + while (!to_search.empty() || !haplotype_queue.empty()) { + while (!haplotype_queue.empty()) { + // get a haplotype to extend out of haplotype_queue - a tuple of + // (handles_traversed_so_far, last_touched_SearchState) + pair, gbwt::SearchState> cur_haplotype = + haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the + // searchstate from cur_haplotype. + vector next_searches; + haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState &next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + for (gbwt::SearchState next_search : next_searches) { + handle_t next_handle = haploGraph.node_to_handle(next_search.node); + + // if next_search is empty, then we've fallen off the thread, + // and cur_haplotype can be placed in finished_haplotypes as is for this + // thread. + if (next_search == gbwt::SearchState()) { + finished_haplotypes.push_back(cur_haplotype.first); + } + + // if next_search is on the sink_handle, + // then cur_haplotype.first + next_search goes to finished_haplotypes. + else if (haploGraph.get_id(next_handle) == sink_id) { + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + // add next_handle + next_handle_vec.push_back(next_handle); + // place in finished_haplotypes + finished_haplotypes.push_back(next_handle_vec); + + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { + to_search.emplace(next_handle); + } + + } + // otherwise, just place an extended cur_haplotype in haplotype_queue. 
+ else { + // copy over cur_haplotype: + pair, gbwt::SearchState> cur_haplotype_copy = + cur_haplotype; + // modify with next_handle/search + cur_haplotype_copy.first.push_back(next_handle); + cur_haplotype_copy.second = next_search; + // place back in haplotype_queue for further extension. + haplotype_queue.push_back(cur_haplotype_copy); + + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { + to_search.emplace(next_handle); + } + } + } + } + // Then, make more new_searches from the handles in to_search. + for (handle_t handle : to_search) { + make_new_search(handle); // will add to haplotype_queue if there's any + // new_searches to be had. + } + to_search.clear(); + } + return finished_haplotypes; +} + +// Given a vector of haplotypes of format vector< handle_t >, returns a vector of +// haplotypes of +// format string (which is the concatenated sequences in the handles). +// Arguments: +// haploGraph: a gbwtgraph::GBWTGraph which contains the handles in vector< handle_t > +// haplotypes. haplotypte_handle_vectors: a vector of haplotypes in vector< handle_t +// > format. +// Returns: a vector of haplotypes of format string (which is the concatenated sequences +// in the handles). +vector format_handle_haplotypes_to_strings( + const gbwtgraph::GBWTGraph &haploGraph, + const vector> &haplotype_handle_vectors) { + vector haplotype_strings; + for (vector haplotype_handles : haplotype_handle_vectors) { + string hap; + for (handle_t &handle : haplotype_handles) { + hap += haploGraph.get_sequence(handle); + } + haplotype_strings.push_back(hap); + } + return haplotype_strings; +} + +// TODO: eventually change to deal with haplotypes that start/end in middle of snarl. +// Aligns haplotypes to create a new graph using MSAConverter's seqan converter. +// Assumes that each haplotype stretches from source to sink. 
+// Arguments: +// source_to_sink_haplotypes: a vector of haplotypes in string format (concat of +// handle sequences). +// Returns: +// VG object representing the newly realigned snarl. +VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes) { + // cerr << "align_source_to_sink_haplotypes" << endl; + // cerr << "number of strings to align: " << source_to_sink_haplotypes.size() << endl; + // TODO: make the following comment true, so that I can normalize haplotypes that + // TODO: aren't source_to_sink by adding a similar special character to strings in + // TODO: the middle of the snarl. + // modify source_to_sink_haplotypes to replace the leading and + // trailing character with a special character. This ensures that the leading char of + // the haplotype becomes the first character in the newly aligned snarl's source - it + // maintains the context of the snarl. + + // store the source/sink chars for later reattachment to source and sink. + string source_char(1, source_to_sink_haplotypes.back().front()); + string sink_char(1, source_to_sink_haplotypes.back().back()); + + // for (string &hap : source_to_sink_haplotypes) { + // hap.replace(0, 1, "X"); + // hap.replace(hap.size() - 1, 1, "X"); + // } + + // /// make a new scoring matrix with _match=5, _mismatch = -3, _gap_extend = -1, and + // _gap_open = -3, EXCEPT that Q has to be matched with Q (so match score between Q + // and Q =len(seq)+1) + // // 1. Define type and constants. + // // + // // Define types for the score value and the scoring scheme. + // typedef int TValue; + // typedef seqan::Score > + // TScoringScheme; + // // Define our gap scores in some constants. 
+ // int const gapOpenScore = -1; + // int const gapExtendScore = -1; + + // static int const _data[TAB_SIZE] = + // { + // 1, 0, 0, 0, 0, + // 0, 1, 0, 0, 0, + // 0, 0, 1, 0, 0, + // 0, 0, 0, 1, 0, + // 0, 0, 0, 0, 0 + // }; + + // create seqan multiple_sequence_alignment object + //// seqan::Align align; + seqan::Align align; + + seqan::resize(rows(align), source_to_sink_haplotypes.size()); + for (int i = 0; i < source_to_sink_haplotypes.size(); ++i) { + assignSource(row(align, i), source_to_sink_haplotypes[i].c_str()); + } + + globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); + + vector row_strings; + for (auto &row : rows(align)) { + string row_string; + auto it = begin(row); + auto itEnd = end(row); + for (; it != itEnd; it++) { + row_string += *it; + } + // todo: debug_statement + cerr << "ROW_STRING: " << row_string << endl; + // edit the row so that the proper source and sink chars are added to the + // haplotype instead of the special characters added to ensure correct alignment + // of source and sink. + row_string.replace(0, 1, source_char); + row_string.replace(row_string.size() - 1, 1, sink_char); + row_strings.push_back(row_string); + } + + stringstream ss; + for (string seq : row_strings) { + ss << endl << seq; + } + // ss << align; + MSAConverter myMSAConverter = MSAConverter(); + myMSAConverter.load_alignments(ss, "seqan"); + VG snarl = myMSAConverter.make_graph(); + snarl.clear_paths(); + + pair, vector> source_and_sink = + debug_get_sources_and_sinks(snarl); + + // TODO: throw exception(?) instead of cerr, or remove these messages if I'm confident + // TODO: code works. + if (source_and_sink.first.size() != 1) { + cerr << "WARNING! Snarl realignment has generated " + << source_and_sink.first.size() << " source nodes." << endl; + } + + if (source_and_sink.second.size() != 1) { + cerr << "WARNING! Snarl realignment has generated " + << source_and_sink.second.size() << " sink nodes." 
<< endl; + } + return snarl; +} + +/** For each handle in a given graph, divides any handles greater than max_size into parts + * that are equal to or less than the size of max_size. + * + * @param {MutableHandleGraph} graph : the graph in which we want to force a maximum + * handle size for all handles. + * @param {size_t} max_size : the maximum size we want a handle to be. + */ +void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size) { + // forcing each handle in the graph to have a maximum sequence length of max_size: + graph.for_each_handle([&](handle_t handle) { + // all the positions we want to make in the handle are in offsets. + vector offsets; + + size_t sequence_len = graph.get_sequence(handle).size(); + int number_of_divisions = floor(sequence_len / max_size); + + // if the handle divides evenly into subhandles of size max_size, we don't need to + // make the last cut (which would be at the very end of the handle - cutting off + // no sequence). + if (sequence_len % max_size == 0) { + number_of_divisions--; + } + + // calculate the position of all the divisions we want to make. + for (int i = 1; i <= number_of_divisions; i++) { + offsets.push_back(i * max_size); + } + + // divide the handle into parts. + graph.divide_handle(handle, offsets); + }); +} + +// Finds all embedded paths that either start or end in a snarl (or both) defined by +// source_id, sink_id. +// returns a vector of the embedded paths, where each entry in the vector is defined +// by the pair of step_handles closest to the beginning and end of the path. If the +// path is fully contained within the snarl, these step_handles will the be the +// leftmost and rightmost handles in the path. +// Arguments: +// graph: a pathhandlegraph containing the snarl with embedded paths. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. 
+// Returns: +// a vector containing all the embedded paths in the snarl, in pair< step_handle_t, +// step_handle_t > > format. Pair.first is the first step in the path's range of +// interest, and pair.second is the step *after* the last step in the path's range of +// interest (can be the null step at end of path). +vector> +extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, + const id_t &sink_id) { + // cerr << "extract_embedded_paths_in_snarl" << endl; + // cerr << "source id: " << source_id << endl; + // cerr << "source id contains what paths?: " << endl; + // for (auto step : graph.steps_of_handle(graph.get_handle(source_id))) { + // cerr << "\t" << graph.get_path_name(graph.get_path_handle_of_step(step)) << + // endl; + // } + // cerr << "neighbors of 71104? (should include 71097):" << endl; + // handle_t test_handle = graph.get_handle(71104); + // graph.follow_edges(test_handle, true, [&](const handle_t &handle) { + // cerr << graph.get_id(handle) << endl; + // }); + // cerr << "can I still access source handle?" + // << graph.get_sequence(graph.get_handle(source_id)) << endl; + + // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't + // extend the path to a point beyond the source or sink. + SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); + // key is path_handle, value is a step in that path from which to extend. + unordered_map paths_found; + + // look for handles with paths we haven't touched yet. + snarl.for_each_handle([&](const handle_t &handle) { + vector steps = graph.steps_of_handle(handle); + // do any of these steps belong to a path not in paths_found? + for (step_handle_t &step : steps) { + path_handle_t path = graph.get_path_handle_of_step(step); + // If it's a step along a new path, save the first step to that path we find. 
+ // In addtion, if there are multiple steps found in the path, (The avoidance + // of source and sink here is to ensure that we can properly check to see if + // we've reached the end of an embedded path walking in any arbitrary + // direction (i.e. source towards sink or sink towards source). + if (paths_found.find(path) == paths_found.end() || + graph.get_id(graph.get_handle_of_step(paths_found[path])) == source_id || + graph.get_id(graph.get_handle_of_step(paths_found[path])) == sink_id) { + // then we need to mark it as found and save the step. + paths_found[path] = step; + } + } + }); + + // todo: debug_statement + // cerr << "################looking for new paths################" << endl; + // for (auto path : paths_found) { + // cerr << graph.get_path_name(path.first) << " " + // << graph.get_id(graph.get_handle_of_step(path.second)) << endl; + // } + + /// for each step_handle_t corresponding to a unique path, we want to get the steps + /// closest to both the end and beginning step that still remains in the snarl. + // TODO: Note copy paste of code here. In python I'd do "for fxn in [fxn1, fxn2]:", + // TODO so that I could iterate over the fxn. That sounds template-messy in C++ + // tho'. Should I? + vector> paths_in_snarl; + for (auto &it : paths_found) { + step_handle_t step = it.second; + // path_in_snarl describes the start and end steps in the path, + // as constrained by the snarl. + pair path_in_snarl; + + // Look for the step closest to the beginning of the path, as constrained by the + // snarl. 
+ step_handle_t begin_in_snarl_step = step; + id_t begin_in_snarl_id = + graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + + while ((begin_in_snarl_id != source_id) && + graph.has_previous_step(begin_in_snarl_step)) { + begin_in_snarl_step = graph.get_previous_step(begin_in_snarl_step); + begin_in_snarl_id = + graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + } + path_in_snarl.first = begin_in_snarl_step; + + // Look for the step closest to the end of the path, as constrained by the snarl. + step_handle_t end_in_snarl_step = step; + id_t end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); + + // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and + // graph.has_next_step(end_in_snarl_step)) { + while (end_in_snarl_id != sink_id and graph.has_next_step(end_in_snarl_step)) { + end_in_snarl_step = graph.get_next_step(end_in_snarl_step); + end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); + } + // Note: when adding the end step, path notation convention requires that we add + // the null step at the end of the path (or the next arbitrary step, in the case + // of a path that extends beyond our snarl.) + // TODO: do we want the next arbitrary step in that latter case? + path_in_snarl.second = graph.get_next_step(end_in_snarl_step); + + paths_in_snarl.push_back(path_in_snarl); + } + + return paths_in_snarl; +} + +// TODO: change the arguments to handles, which contain orientation within themselves. +// Given a start and end node id, construct an extract subgraph between the two nodes +// (inclusive). Arguments: +// graph: a pathhandlegraph containing the snarl with embedded paths. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. +// Returns: +// a SubHandleGraph containing only the handles in graph that are between start_id +// and sink_id. 
+SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, + const id_t &sink_id) { + // cerr << "extract_subgraph" << endl; + /// make a subgraph containing only nodes of interest. (e.g. a snarl) + // make empty subgraph + SubHandleGraph subgraph = SubHandleGraph(&graph); + + unordered_set visited; // to avoid counting the same node twice. + unordered_set to_visit; // nodes found that belong in the subgraph. + + // TODO: how to ensure that "to the right" of start_handle is the correct direction? + // initialize with start_handle (because we move only to the right of start_handle): + handle_t start_handle = graph.get_handle(start_id); + subgraph.add_handle(start_handle); + visited.insert(graph.get_id(start_handle)); + + // look only to the right of start_handle + graph.follow_edges(start_handle, false, [&](const handle_t &handle) { + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + + /// explore the rest of the snarl: + while (to_visit.size() != 0) { + // remove cur_handle from to_visit + unordered_set::iterator cur_index = to_visit.begin(); + handle_t cur_handle = graph.get_handle(*cur_index); + + to_visit.erase(cur_index); + + /// visit cur_handle + visited.insert(graph.get_id(cur_handle)); + + subgraph.add_handle(cur_handle); + + if (graph.get_id(cur_handle) != sink_id) { // don't iterate past end node! 
+ // look for all nodes connected to cur_handle that need to be added + // looking to the left, + graph.follow_edges(cur_handle, true, [&](const handle_t &handle) { + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + // looking to the right, + graph.follow_edges(cur_handle, false, [&](const handle_t &handle) { + // mark the nodes to come as to_visit + if (visited.find(graph.get_id(handle)) == visited.end()) { + to_visit.insert(graph.get_id(handle)); + } + }); + } + } + return subgraph; +} + +// Integrates the snarl into the graph, replacing the snarl occupying the space between +// source_id and sink_id. +// In the process, transfers any embedded paths traversing the old snarl into the new +// snarl. +// Arguments: +// graph: the graph in which we want to insert the snarl. +// to_insert_snarl: a *separate* handle_graph from graph, often generated from +// MSAconverter. embedded_paths: a vector of paths, where each is a pair. +// pair.first is the first step_handle of interest in the +// old_embedded_path, and pair.second is the step_handle *after* +// the last step_handle of interest in the old_embedded_path (can +// be the null step at the end of the path.) +// source_id: the source of the old (to be replaced) snarl in graph +// sink_id: the sink of the old (to be replaced) snarl in graph. +// Return: None. +// TODO: Note: How to ensure that step_handle_t's walk along the snarl in the same +// TODO: orientation as we expect? i.e. that they don't move backward? I think +// TODO: we want match_orientation to be = true, but this may cause problems +// TODO: in some cases given the way we currently construct handles (fixed when we +// TODO: create snarl-scanning interface). +// TODO: It may also be that we *don't want match_orientation to be true, +// TODO: if we're tracking a path that loops backward in the snarl. Hmm... Will think +// about this. 
+void integrate_snarl(MutablePathDeletableHandleGraph &graph, + const HandleGraph &to_insert_snarl, + const vector> embedded_paths, + const id_t &source_id, const id_t &sink_id) { + // cerr << "integrate_snarl" << endl; + + // //todo: debug_statement + // cerr << "handles in to_insert_snarl:" << endl; + // to_insert_snarl.for_each_handle([&](const handle_t &handle) { + // cerr << to_insert_snarl.get_id(handle) << " " + // << to_insert_snarl.get_sequence(handle) << " \t"; + // }); + // cerr << endl; + // Get old graph snarl + SubHandleGraph old_snarl = extract_subgraph(graph, source_id, sink_id); + + // TODO: debug_statement: Check to make sure that newly made snarl has only one start + // and end. + // TODO: (shouldn't be necessary once we've implemented alignment with + // leading/trailing special chars.) Identify old and new snarl start and sink + pair, vector> to_insert_snarl_defining_handles = + debug_get_sources_and_sinks(to_insert_snarl); + + if (to_insert_snarl_defining_handles.first.size() > 1 || + to_insert_snarl_defining_handles.second.size() > 1) { + cerr << "ERROR: newly made snarl from a snarl starting at " << source_id + << " has more than one start or end. # of starts: " + << to_insert_snarl_defining_handles.first.size() + << " # of ends: " << to_insert_snarl_defining_handles.second.size() << endl; + return; + } + + /// Replace start and end handles of old graph snarl with to_insert_snarl start and + /// end, and delete rest of old graph snarl: + + // add to_insert_snarl into graph without directly attaching the snarl to the graph + // (yet). + vector to_insert_snarl_topo_order = + algorithms::lazier_topological_order(&to_insert_snarl); + + // Construct a parallel new_snarl_topo_order to identify + // paralogous nodes between to_insert_snarl and the new snarl inserted in graph. 
+ vector new_snarl_topo_order; + + // integrate the handles from to_insert_snarl into the graph, and keep track of their + // identities by adding them to new_snarl_topo_order. + for (handle_t to_insert_snarl_handle : to_insert_snarl_topo_order) { + handle_t graph_handle = + graph.create_handle(to_insert_snarl.get_sequence(to_insert_snarl_handle)); + new_snarl_topo_order.push_back(graph_handle); + } + + // Connect the newly made handles in the graph together the way they were connected in + // to_insert_snarl: + for (int i = 0; i < to_insert_snarl_topo_order.size(); i++) { + to_insert_snarl.follow_edges( + to_insert_snarl_topo_order[i], false, [&](const handle_t &snarl_handle) { + // get topo_index of nodes to be connected to graph start handle + auto it = find(to_insert_snarl_topo_order.begin(), + to_insert_snarl_topo_order.end(), snarl_handle); + int topo_index = it - to_insert_snarl_topo_order.begin(); + + // connect graph start handle + graph.create_edge(new_snarl_topo_order[i], + new_snarl_topo_order[topo_index]); + }); + } + + // save the source and sink values of new_snarl_topo_order, since topological order is + // not necessarily preserved by move_path_to_snarl. Is temporary b/c we need to + // replace the handles with ones with the right id_t label for source and sink later + // on. + id_t temp_snarl_source_id = graph.get_id(new_snarl_topo_order.front()); + id_t temp_snarl_sink_id = graph.get_id(new_snarl_topo_order.back()); + + // Add the neighbors of the source and sink of the original snarl to the new_snarl's + // source and sink. 
+ // source integration: + graph.follow_edges( + graph.get_handle(source_id), true, [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, graph.get_handle(temp_snarl_source_id)); + }); + graph.follow_edges( + graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { + graph.create_edge(graph.get_handle(temp_snarl_sink_id), next_handle); + }); + + // For each path of interest, move it onto the new_snarl. + for (auto path : embedded_paths) { + // //todo: debug_statement + // cerr << "the new sink id: " << temp_snarl_sink_id << endl; + move_path_to_snarl(graph, path, new_snarl_topo_order, temp_snarl_source_id, + temp_snarl_sink_id, source_id, sink_id); + } + + // Destroy the old snarl. + old_snarl.for_each_handle( + [&](const handle_t &handle) { graph.destroy_handle(handle); }); + + // Replace the source and sink handles with ones that have the original source/sink id + // (for compatibility with future iterations on neighboring top-level snarls using the + // same snarl manager. Couldn't replace it before b/c we needed the old handles to + // move the paths. + handle_t new_source_handle = graph.create_handle( + graph.get_sequence(graph.get_handle(temp_snarl_source_id)), source_id); + handle_t new_sink_handle = + graph.create_handle(graph.get_sequence(new_snarl_topo_order.back()), sink_id); + + // move the source edges: + // TODO: note the copy/paste. Ask if there's a better way to do this (I totally could + // in Python!) 
+ graph.follow_edges(graph.get_handle(temp_snarl_source_id), true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, new_source_handle); + }); + graph.follow_edges(graph.get_handle(temp_snarl_source_id), false, + [&](const handle_t &next_handle) { + graph.create_edge(new_source_handle, next_handle); + }); + + // move the sink edges: + graph.follow_edges(graph.get_handle(temp_snarl_sink_id), true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, new_sink_handle); + }); + graph.follow_edges(graph.get_handle(temp_snarl_sink_id), false, + [&](const handle_t &next_handle) { + graph.create_edge(new_sink_handle, next_handle); + }); + + // move the paths: + graph.for_each_step_on_handle( + graph.get_handle(temp_snarl_source_id), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_source_handle}); + }); + graph.for_each_step_on_handle( + graph.get_handle(temp_snarl_sink_id), [&](step_handle_t step) { + graph.rewrite_segment(step, graph.get_next_step(step), + vector{new_sink_handle}); + }); + + // delete the previously created source and sink: + for (handle_t handle : + {graph.get_handle(temp_snarl_source_id), graph.get_handle(temp_snarl_sink_id)}) { + + graph.destroy_handle(handle); + } +} + +// Moves a path from its original location in the graph to a new snarl, +// defined by a vector of interconnected handles. +// NOTE: the handles in new_snarl_handles may not preserve topological order after +// being passed to this method, if they were ordered before. +// Arguments: graph: the graph containing the old_embedded_path and the handles in +// new_snarl_topo_order +// old_embedded_path: a pair, where +// pair.first is the first step_handle of interest in the +// old_embedded_path, and pair.second is the step_handle *after* +// the last step_handle of interest in the old_embedded_path (can +// be the null step at the end of the path.) 
+// new_snarl_topo_order: all the handles in the new snarl, inside the graph. +// Return: None. +void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, + const pair &old_embedded_path, + vector &new_snarl_handles, id_t &new_source_id, + id_t &new_sink_id, const id_t &old_source_id, + const id_t &old_sink_id) { + // cerr << "move_path_to_snarl" << endl; + // //TODO: debug_statement: + // cerr << "path name: " + // << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // << endl; + // cerr << "source: " << new_source_id << " sink: " << new_sink_id << endl; + // if (graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) == + // "chr10") { + // cerr << "\t\tstart and end of old embedded path: " + // << graph.get_id(graph.get_handle_of_step(old_embedded_path.first)) + // << "end id" + // << graph.get_id(graph.get_handle_of_step(old_embedded_path.second)) << + // endl; + // } + // cerr << "#### handles in snarl (according to move_path_to_snarl): ####" << endl; + // for (handle_t handle : new_snarl_handles) { + // cerr << "\t" << graph.get_id(handle) << " " << graph.get_sequence(handle); + // } + // cerr << endl << endl; + // cerr << "~~~~~ Handles following each handle:" << endl; + // for (handle_t handle : new_snarl_handles) { + // cerr << "neighbors of handle " << graph.get_id(handle) << " (" + // < + // //todo: debug_statement + // cerr << "checking handles as start of path-seq" << endl; + vector, int, int>> possible_paths; + for (handle_t handle : new_snarl_handles) { + string handle_seq = graph.get_sequence(handle); + + // starting index is where the path would begin in the handle, + // since it could begin in the middle of the handle. 
+ vector starting_indices = + check_handle_as_start_of_path_seq(handle_seq, path_seq); + + // if there is a starting index, + if (starting_indices.size() != 0) { + for (int starting_index : starting_indices) { + if ((handle_seq.size() - starting_index) >= path_seq.size() && + source_and_sink_handles_map_properly(graph, new_source_id, + new_sink_id, touching_source, + touching_sink, handle, handle)) { + // if the entire path fits inside the current handle, and if any + // paths that touched source and sink in the old snarl would be + // touching source and sink in the new snarl, then we've already + // found the full mapping location of the path! Move the path, end + // the method. + vector new_path{handle}; + graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, new_path); + // //todo: debug_statement + // cerr << "found a full mapping at " << graph.get_id(handle) + // << " w/ seq " << graph.get_sequence(handle) << endl; + return; + } else { + // this is a potential starting handle for the path. Add as a + // possible_path. 
+ vector possible_path_handle_vec{handle}; + possible_paths.push_back( + make_tuple(possible_path_handle_vec, starting_index, + handle_seq.size() - starting_index)); + } + } + } + } + + // //todo: debug_statement: + // cerr << "done checking handles as start of path seq" << endl; + + // //TODO: debug_statement: + // cerr << "possible paths so far: " << endl; + // for (tuple, int, int> path : possible_paths) { + // cerr << " possible start: "; + // for (handle_t handle : get<0>(path)) { + // cerr << graph.get_id(handle) << " "; + // } + // cerr << endl; + // } + + // for every possible path, extend it to determine if it really is the path we're + // looking for: + while (!possible_paths.empty()) { + // take a path off of possible_paths, which will be copied for every iteration + // through graph.follow_edges, below: + tuple, int, int> possible_path_query = possible_paths.back(); + possible_paths.pop_back(); + + // //TODO: debug_statement: + // for (tuple, int, int> path : possible_paths) { + // cerr << "*\tpossible path query: "; + // for (handle_t handle : get<0>(possible_path_query)) { + // cerr << graph.get_id(handle) << " " << graph.get_sequence(handle) << " "; + // } + // cerr << endl; + // } + + // extend the path through all right-extending edges to see if any subsequent + // paths still satisfy the requirements for being a possible_path: + bool no_path = graph.follow_edges( + get<0>(possible_path_query).back(), false, [&](const handle_t &next) { + // //todo: debug_statement + // cerr << "next handle id and seq: " << graph.get_id(next) << " " + // << graph.get_sequence(next) << endl; + // make a copy to be extended for through each possible next handle in + // follow edges. + tuple, int, int> possible_path = possible_path_query; + + // extract relevant information to make code more readable. 
+ string next_seq = graph.get_sequence(next); + id_t next_id = graph.get_id(next); + int &cur_index_in_path = get<2>(possible_path); + if (cur_index_in_path <= path_seq.size() && + (find(new_snarl_handles.cbegin(), new_snarl_handles.cend(), next) != + new_snarl_handles.cend())) { + // if the next handle would be the ending handle for the path, + if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { + // cerr << "next handle would be the ending handle for the path" + // << endl; check to see if the sequence in the handle is suitable + // for ending the path: + int compare_length = path_seq.size() - cur_index_in_path; + + // //todo: debug_statement + // cerr << "about to compare. compare val: " + // << (next_seq.compare(0, compare_length, path_seq, + // cur_index_in_path, compare_length) == + // 0) + // << " source_and_sink_handles_map " + // << source_and_sink_handles_map_properly( + // graph, new_source_id, new_sink_id, touching_source, + // touching_sink, get<0>(possible_path).front(), next) + // << endl; + if ((next_seq.compare(0, compare_length, path_seq, + cur_index_in_path, compare_length) == 0) && + source_and_sink_handles_map_properly( + graph, new_source_id, new_sink_id, touching_source, + touching_sink, get<0>(possible_path).front(), next)) { + // todo: debug_statement + // cerr << "compared." << endl; + + // we've found the new path! Move path to the new sequence, + // and end the function. + + if (compare_length < next_seq.size()) { + // If the path ends before the end of next_seq, then split + // the handle so that the path ends flush with the end of + // the first of the two split handles. + + // divide the handle where the path ends; + pair divided_next = + graph.divide_handle(next, compare_length); + get<0>(possible_path).push_back(divided_next.first); + + // Special case if next is the sink or the source, to + // preserve the reassignment of source and sink ids in + // integrate_snarl. 
+ if (next_id == new_sink_id) { + new_sink_id = graph.get_id(divided_next.second); + } + + // TODO: NOTE: finding the old "next" handle is expensive. + // TODO: Use different container? + auto it = find(new_snarl_handles.begin(), + new_snarl_handles.end(), next); + + // replace the old invalidated handle with one of the new + // ones + *it = divided_next.first; + // stick the other new handle on the end of + // new_snarl_handles. + new_snarl_handles.push_back(divided_next.second); + + } else { + // otherwise, the end of the path already coincides with + // the end of the handle. In that case, just add it to the + // path. + get<0>(possible_path).push_back(next); + } + graph.rewrite_segment(old_embedded_path.first, + old_embedded_path.second, + get<0>(possible_path)); + // //todo: debug_statement: + // cerr << "got a full path: "; + // for (handle_t handle : get<0>(possible_path)) { + // cerr << graph.get_id(handle) << " "; + // } + // cerr << endl; + + // we've already found the path. No need to keep looking for + // more paths. 
+ return false; + } + } + // see if the next handle would be the continuation of the path, but + // not the end, + else { + + // check to see if the sequence in the handle is suitable for + // extending the path: + int compare_length = next_seq.size(); + // //todo: debug_statement + // cerr << "compare returned false" << endl; + // cerr << "compare in returned false: " + // << " next_seq len " << next_seq.size() << " compare_length + // " + // << compare_length << " path_seq len " << path_seq.size() + // << " cur_index_in_path " << cur_index_in_path << endl; + // cerr << "if statement eval: cur_index_in_path <= + // next_seq.size() " + // << (cur_index_in_path <= next_seq.size()) + // << " next_seq.compare(0, compare_length, path_seq, " + // "cur_index_in_path, compare_length) == 0) " + // << (next_seq.compare(0, compare_length, path_seq, + // cur_index_in_path, compare_length) == + // 0) + // << endl; + if (next_seq.compare(0, compare_length, path_seq, + cur_index_in_path, compare_length) == 0) { + // cerr << "compared in return false" << endl; + // extend the path + get<0>(possible_path).push_back(next); + + // update the current index in path_seq. + get<2>(possible_path) += next_seq.size(); + + // place back into possible_paths + possible_paths.push_back(possible_path); + // cerr << "extending the path!" << endl; + } + } + } + // continue to iterate through follow_edges. + return true; + }); + + // //todo: debug_statement: + // if + // (graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // == + // "_alt_19f9bc9ad2826f58f113965edf36bb93740df46d_0") { + // cerr << "mystery node 4214930: " + // << graph.get_sequence(graph.get_handle(4214930)) << endl; + // } + + // if we've found a complete path in the above follow_edges, then we've + // already moved the path, and we're done. + if (!no_path) { + return; + } + } + // //todo: figure out how to do some better error message instead of cerr. 
+ // if we failed to find a path, show an error message. + cerr << "##########################\nWarning! Didn't find a corresponding path of " + "name " + << graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + << " from the old snarl at " << old_source_id + << " in the newly aligned snarl. This snarl WILL be " + "normalized, resulting in a probably incorrectly-constructed snarl." + "\n##########################" + << endl + << endl; + // throw graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)); + // assert(true && "Warning! Didn't find a corresponding path of name " + + // graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) + // + " from the old snarl in the newly aligned snarl."); +} + +/** Used to help move_path_to_snarl map paths from an old snarl to its newly + * normalized counterpart. In particular, ensures that any paths which touch the + * source and/or sink of the old snarl still do so in the new snarl (which is + * important to ensure that we don't break any paths partway through the snarl.) + * + * @param {HandleGraph} graph : the graph that contains the old and new snarl + * nodes. + * @param {id_t} new_source_id : the node id of the newly created source. + * @param {id_t} new_sink_id : the node id of the newly created sink. + * @param {bool} touching_source : true if the path is connected to the old + * source. + * @param {bool} touching_sink : true if the path is connected to the old + * sink. + * @param {handle_t} potential_source : proposed source for the path in the new snarl. + * @param {handle_t} potential_sink : proposed sink for the path in the new snarl. + * @return {bool} : true if the path satisfies the requirement + * that, if the original path covered the old source or sink, the new path also covers + * the same respective nodes in the new snarl. 
+ */ +bool source_and_sink_handles_map_properly( + const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, + const bool &touching_source, const bool &touching_sink, + const handle_t &potential_source, const handle_t &potential_sink) { + + bool path_map = false; + // cerr << "touching source? " << touching_source << "touching_sink" << touching_sink + // << "source is source?" << (graph.get_id(potential_source) == new_source_id) + // << " sink is sink: " << (graph.get_id(potential_sink) == new_sink_id) << endl; + if (touching_source && touching_sink) { + path_map = ((graph.get_id(potential_source) == new_source_id) && + (graph.get_id(potential_sink) == new_sink_id)); + } else if (touching_source) { + path_map = (graph.get_id(potential_source) == new_source_id); + } else if (touching_sink) { + path_map = (graph.get_id(potential_sink) == new_sink_id); + } else { + path_map = true; + } + // cerr << "path_map " << path_map << endl; + return path_map; +} + +// Determines whether some subsequence in a handle satisfies the condition of being +// the beginning of a path. +// If the path_seq is longer than the handle_seq, only checks subsequences that +// reach from the beginning/middle of the handle_seq to the end. If path_seq is +// shorter than handle_seq, checks for any substring of length path_seq within +// the handle_seq, as well as substrings smaller than length path_seq that extend +// beyond the current handle. +// Arguments: +// handle_seq: the sequence in the handle we're trying to identify as a +// start_of_path_seq. path_seq: the sequence in the path we're trying to find +// starting points for in handle_seq +// Return: a vector of all potential starting index of the subsequence in the +// handle_seq. 
vector<int> check_handle_as_start_of_path_seq(const string &handle_seq,
                                              const string &path_seq) {
    vector<int> possible_start_indices;

    // Helper: record handle_start_i when handle_seq[handle_start_i, +subseq_size)
    // equals path_seq[0, subseq_size) (i.e. compare() returns 0).
    auto record_if_match = [&](int handle_start_i, int subseq_size) {
        if (path_seq.compare(0, subseq_size, handle_seq, handle_start_i,
                             subseq_size) == 0) {
            possible_start_indices.push_back(handle_start_i);
        }
    };

    if (handle_seq.size() <= path_seq.size()) {
        // The path is at least as long as the handle, so the only candidates are
        // suffixes of handle_seq matching a prefix of path_seq — the path may run
        // off the end of this handle into the next one in the snarl.
        for (int handle_start_i = 0; handle_start_i < handle_seq.size();
             handle_start_i++) {
            record_if_match(handle_start_i, handle_seq.size() - handle_start_i);
        }
    } else {
        // The handle is longer than the path. First: every window of length
        // path_seq.size() fully contained in handle_seq.
        for (int handle_start_i = 0;
             handle_start_i <= (handle_seq.size() - path_seq.size());
             handle_start_i++) {
            record_if_match(handle_start_i, path_seq.size());
        }
        // Second: the trailing suffixes of handle_seq shorter than path_seq, where
        // the path would begin here and continue into the next handle.
        for (int handle_start_i = (handle_seq.size() - path_seq.size() + 1);
             handle_start_i < handle_seq.size(); handle_start_i++) {
            record_if_match(handle_start_i, handle_seq.size() - handle_start_i);
        }
    }
    // If nothing matched, this is empty: there is no satisfactory subsequence.
    return possible_start_indices;
}

// ------------------------------ DEBUG CODE BELOW:
// ------------------------------------------

// Returns pair where pair.first is a vector of all sources of the given graph and
// path.second is all the sinks of the given graph. If graph is a subhandlegraph of a
// snarl, there should only be one source and sink each.
+pair, vector> +debug_get_sources_and_sinks(const HandleGraph &graph) { + // cerr << "debug_get_source_and_sinks" << endl; + vector sink; + vector source; + + // identify sources and sinks + graph.for_each_handle([&](const handle_t &handle) { + bool is_source = true, is_sink = true; + graph.follow_edges(handle, true, [&](const handle_t &prev) { + is_source = false; + return false; + }); + graph.follow_edges(handle, false, [&](const handle_t &next) { + is_sink = false; + return false; + }); + + // base case for dynamic programming + if (is_source) { + source.push_back(handle); + } + if (is_sink) { + sink.emplace_back(handle); + } + }); + return pair, vector>(source, sink); +} + +// Runs through the whole snarl and generates all possible strings representing walks +// from source to sink. Generates a combinatorial number of possible paths with splits +// in the snarl. +vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, + id_t start_id, id_t sink_id) { + // cerr << "debug_graph_to_strings" << endl; + SubHandleGraph snarl = extract_subgraph(graph, start_id, sink_id); + + unordered_map> sequences; + vector sinks; + unordered_map count; + count.reserve(snarl.get_node_count()); // resize count to contain enough buckets + // for size of snarl + sequences.reserve(snarl.get_node_count()); // resize sequences to contain enough + // buckets for size of snarl + + // identify sources and sinks //TODO: once we've established that this fxn works, + // we can just use start_id and sink_id. 
+ snarl.for_each_handle([&](const handle_t &handle) { + bool is_source = true, is_sink = true; + snarl.follow_edges(handle, true, [&](const handle_t &prev) { + is_source = false; + return false; + }); + snarl.follow_edges(handle, false, [&](const handle_t &next) { + is_sink = false; + return false; + }); + + // base case for dynamic programming + if (is_source) { + count[handle] = 1; + sequences[handle].push_back( + snarl.get_sequence(handle)); // TODO: presented in the handle's local + // forward orientation. An issue? + } + if (is_sink) { + sinks.emplace_back(handle); + } + }); + + // count walks by dynamic programming + bool overflowed = false; + for (const handle_t &handle : algorithms::lazier_topological_order(&snarl)) { + size_t count_here = count[handle]; + vector seqs_here = sequences[handle]; + + snarl.follow_edges(handle, false, [&](const handle_t &next) { + size_t &count_next = count[next]; + string seq_next = snarl.get_sequence(next); + + if (numeric_limits::max() - count_here < count_next) { + overflowed = true; + } + + else { + count_next += count_here; + for (string seq : seqs_here) { + sequences[next].push_back(seq + seq_next); + } + } + }); + /// TODO: figure out how to deal with overflow. + // if (overflowed) { + // return numeric_limits::max(); + // } + } + + // total up the walks at the sinks + size_t total_count = 0; + for (handle_t &sink : sinks) { + total_count += count[sink]; + } + + // all the sequences at the sinks will be all the sequences in the snarl. 
+ vector walks; + for (handle_t &sink : sinks) { + for (string seq : sequences[sink]) { + walks.push_back(seq); + } + } + + return walks; +} + +} diff --git a/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.hpp b/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.hpp new file mode 100644 index 00000000000..bb102dd4470 --- /dev/null +++ b/src/algorithms/0_old_copies_haplotype_realignment/0_draft_haplotype_realignment.hpp @@ -0,0 +1,163 @@ +/* +Robin Rounthwaite +Find function call in ./subcommand/main.cpp +*/ +#include "../gbwt_helper.hpp" +#include "../handle.hpp" +#include "../subgraph.hpp" +#include "../vg.hpp" +#include "count_walks.hpp" +#include + +/* TODO for improving haplotype_realignment. +Tomorrow: +* scale code upwards so that you can run code on every snarl in given graph. +* also add requirement that haps entering snarl = haps exiting snarl. +TODO: align haplotypes_not_at_source once we have a solution for alignments that insert +TODO: the haplotype in a specified location +TODO: (use more unique marker signals to identify where in other strings the +TODO: middle-haplotype should align?) + +TODO: consider splitting handles where embedded paths begin/end in the middle of a handle. +TODO: (Note: would need to dynamically change other paths containing that handle. :-/) +TODO: Or simply split the handles of interest and then realign the paths - expensive. +TODO: Or insert *yet another* marker char to id where embedded paths begin/end, so its +TODO: easily find where to split the handles afterwards. AND! it makes moving the +TODO: paths less expensive. +TODO: (fewer spots to check alignment in the snarl). 
If we have unique markers for +TODO: each path, then +TODO: it becomes O(N) time, instead of ~O(N*M*n) (N: number of bases in snarl; M: +TODO: number of bases in path; +TODO: n: number of potential starting places in the snarl (note: slightly less +TODO: expensive since n is +TODO: divided up among the M paths).) +TODO: this would also addres the possibility of an embedded path being moved to an +TODO: alternative location +TODO: when it overlaps a repetitive sequence. (previous thought, tho' above one is +TODO: better): do I Need +TODO: to account for this with a sense of "bases distant from source"? + +TODO: make it so that gbwt file is customized by user rather than hardcoded. + +TODO: make it so that you pass the gbwt file directory to a one-liner function +TODO: (ran in normalize_main) that generates gbwt graph, extracts haps, +TODO: aligns haps, and reintegrates haps. (eventually will do it for every +TODO: snarl in the given graph). + +*/ +namespace vg { +void disambiguate_top_level_snarls(MutablePathDeletableHandleGraph &graph, + const GBWTGraph &haploGraph, ifstream &snarl_stream, + const int &max_alignment_size); + +tuple +disambiguate_snarl(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, + const id_t &source_id, const id_t &sink_id, + const int &max_alignment_size); + +tuple>, vector>, unordered_set> +extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &graph, + const id_t &source_id, const id_t &sink_id); + +vector> +find_haplotypes_not_at_source(const GBWTGraph &haploGraph, + unordered_set &touched_handles, + const id_t &sink_id); + +vector format_handle_haplotypes_to_strings( + const GBWTGraph &haploGraph, + const vector> &haplotype_handle_vectors); + +VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes); + +void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); + +vector> +extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, + const id_t 
&sink_id); + +SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, + const id_t &end_id); + +void integrate_snarl(MutablePathDeletableHandleGraph &graph, const HandleGraph &new_snarl, + const vector> embedded_paths, + const id_t &source_id, const id_t &sink_id); + +void move_path_to_snarl(MutablePathDeletableHandleGraph &graph, + const pair &old_embedded_path, + vector &new_snarl_handles, id_t &new_source_id, + id_t &new_sink_id, const id_t &old_source_id, + const id_t &old_sink_id); + +bool source_and_sink_handles_map_properly( + const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, + const bool &touching_source, const bool &touching_sink, + const handle_t &potential_source, const handle_t &potential_sink); + +vector check_handle_as_start_of_path_seq(const string &handle_seq, + const string &path_seq); + +// -------------------------------- DEBUG CODE BELOW: ------------------------------------ + +pair, vector> +debug_get_sources_and_sinks(const HandleGraph &graph); + +vector debug_graph_to_strings(MutablePathDeletableHandleGraph &graph, + id_t start_id, id_t end_id); + +vector debug_get_embedded_paths_from_source_to_sink(const PathHandleGraph &graph, + const handle_t &source_handle, + const handle_t &sink_handle); +} // namespace vg + +/* +Deleted stuff: + +void jordan_bug(MutablePathDeletableHandleGraph& graph){ + + // example with one node: + handle_t example = graph.get_handle(23448); + handle_t replacement = graph.create_handle("GATTACA", 1); + + // move the source edges: + //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could +in Python!) 
graph.follow_edges(example, true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, replacement); + }); + graph.follow_edges(example, false, + [&](const handle_t &next_handle) { + graph.create_edge(replacement, next_handle); + }); + + // move the paths: + graph.for_each_step_on_handle(example, [&](step_handle_t step) + { + graph.rewrite_segment(step, graph.get_next_step(step), +vector{replacement}); + }); + + // example with two nodes: + handle_t example_1 = graph.get_handle(23450); + handle_t replacement_1 = graph.create_handle("GATTACA", 2); + handle_t replacement_2 = graph.create_handle("GATTACA", 3); + graph.create_edge(replacement_1, replacement_2); + + // move the source edges: + //TODO: note the copy/paste. Ask if there's a better way to do this (I totally could +in Python!) graph.follow_edges(example_1, true, + [&](const handle_t &prev_handle) { + graph.create_edge(prev_handle, replacement_1); + }); + graph.follow_edges(example_1, false, + [&](const handle_t &next_handle) { + graph.create_edge(replacement_2, next_handle); + }); + + // move the paths: + graph.for_each_step_on_handle(example_1, [&](step_handle_t step) + { + graph.rewrite_segment(step, step, vector{replacement_1, replacement_2}); + }); +} + */ diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 7e0eefe14be..e7ddcb398bc 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -7,6 +7,8 @@ #include #include +#include + #include "../gbwt_helper.hpp" #include "../handle.hpp" #include "../msa_converter.hpp" @@ -19,7 +21,7 @@ namespace vg { SnarlNormalizer::SnarlNormalizer(MutablePathDeletableHandleGraph &graph, - const GBWTGraph &haploGraph, + const gbwtgraph::GBWTGraph &haploGraph, const int &max_alignment_size, const string &path_finder) : _haploGraph(haploGraph), _graph(graph), _max_alignment_size(max_alignment_size), _path_finder(path_finder) {} @@ -30,10 +32,10 @@ 
SnarlNormalizer::SnarlNormalizer(MutablePathDeletableHandleGraph &graph, // snarl only contains haplotype threads that extend fully from source to sink. // Arguments: // _graph: the full-sized handlegraph that will undergo edits in a snarl. -// _haploGraph: the corresponding GBWTGraph of _graph. +// _haploGraph: the corresponding gbwtgraph::GBWTGraph of _graph. // snarl_stream: the file stream from .snarl file corresponding to _graph. void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { - // cerr << "disambiguate_top_level_snarls" << endl; + cerr << "disambiguate_top_level_snarls" << endl; SnarlManager *snarl_manager = new SnarlManager(snarl_stream); int num_snarls_normalized = 0; @@ -55,14 +57,13 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { pair snarl_sequence_change; for (auto roots : snarl_roots) { - - // if (roots->start().node_id() == 6165) { - // if (roots->start().node_id() < 50000) { - // if (roots->start().node_id() == 1883) { - // cerr << "disambiguating snarl #" - // << (num_snarls_normalized + num_snarls_skipped) - // << " source: " << roots->start().node_id() - // << " sink: " << roots->end().node_id() << endl; + // if (roots->start().node_id() > 269600 && roots->start().node_id() < 269700) { + // if (roots->start().node_id() == 0) { // as in, don't normalize any snarls + // if (roots->start().node_id() < 50000) { + cerr << "disambiguating snarl #" + << (num_snarls_normalized + num_snarls_skipped) + << " source: " << roots->start().node_id() + << " sink: " << roots->end().node_id() << endl; one_snarl_error_record = normalize_snarl(roots->start().node_id(), roots->end().node_id()); @@ -109,23 +110,23 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { // //todo: debug_statement // VG outGraph; - // pos_t source_pos = make_pos_t(4211565, false, 0); + // pos_t source_pos = make_pos_t(269695, false, 0); // vector pos_vec; // pos_vec.push_back(source_pos); - // 
algorithms::extract_containing_graph(&graph, &outGraph, pos_vec, 150); + // algorithms::extract_containing_graph(&_graph, &outGraph, pos_vec, 1000); // outGraph.serialize_to_ostream(cout); delete snarl_manager; } // For a snarl in the given _graph, with every edge covered by at least one haplotype -// thread in the GBWTGraph, +// thread in the gbwtgraph::GBWTGraph, // extract all sequences in the snarl corresponding to the haplotype threads and // re-align them with MSAConverter/seqan to form a new snarl. Embedded paths are // preserved; GBWT haplotypes in the snarl are not conserved. // Arguments: // _graph: the full-sized handlegraph that will undergo edits in a snarl. -// _haploGraph: the corresponding GBWTGraph of _graph. +// _haploGraph: the corresponding gbwtgraph::GBWTGraph of _graph. // source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. // Returns: none. @@ -145,7 +146,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // snarl: // 4) number of bases in the snarl before normalization // 5) number of bases in the snarl after normalization. - vector error_record(5, 0); + vector error_record(6, 0); SubHandleGraph snarl = extract_subgraph(_graph, _cur_source_id, sink_id); if (!algorithms::is_acyclic(&snarl)) { @@ -235,6 +236,11 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & VG new_snarl = align_source_to_sink_haplotypes(get<0>(haplotypes)); // count the number of bases in the snarl. + //todo: debug_statement + // cerr << "size of snarl before any counting: " << error_record[5] << endl; + // for (auto rec : error_record){ + // cerr << " " << rec << endl; + // } new_snarl.for_each_handle([&](const handle_t handle) { error_record[5] += new_snarl.get_sequence(handle).size(); }); @@ -249,7 +255,6 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // integrate the new_snarl into the _graph, removing the old snarl as you go. 
integrate_snarl(new_snarl, embedded_paths); - return error_record; } else { if (!get<1>(haplotypes).empty()) { cerr << "found a snarl starting at " << _cur_source_id << " and ending at " @@ -278,22 +283,29 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & cerr << endl; error_record[2] = true; } - if (error_record[5] > error_record[4]) { - cerr << "NOTE: normalized a snarl which *increased* in sequence quantity, " - "rather than decreased." - << endl; - } - return error_record; } + // todo: decide if this should be a requirement for the integration of normalized + // snarl + if (error_record[5] > error_record[4]) { + cerr << "NOTE: normalized a snarl which *increased* in sequence quantity, " + "starting at " + << _cur_source_id << endl + << "\tsize before: " << error_record[4] << " size after: " << error_record[5] + << endl; + } else if (error_record[5] <= 0) { + cerr << "normalized snarl size is <= zero: " << error_record[5] << endl; + } + return error_record; + } // namespace vg // TODO: test that it successfully extracts any haplotypes that start/end in the middle of // TODO: the snarl. -// For a snarl in a given GBWTGraph, extract all the haplotypes in the snarl. Haplotypes -// are represented +// For a snarl in a given gbwtgraph::GBWTGraph, extract all the haplotypes in the snarl. +// Haplotypes are represented // by vectors of handles, representing the chain of handles in a thread. // Arguments: -// _haploGraph: the GBWTGraph containing the snarl. +// _haploGraph: the gbwtgraph::GBWTGraph containing the snarl. // _cur_source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. 
// Returns: @@ -304,7 +316,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // pair>, vector>> tuple>, vector>, unordered_set> SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, - const GBWTGraph &haploGraph, + const gbwtgraph::GBWTGraph &haploGraph, const id_t &_source_id, const id_t &sink_id) { // cerr << "extract_gbwt_haplotypes" << endl; @@ -636,9 +648,9 @@ SnarlNormalizer::find_haplotypes_not_at_source(unordered_set &touched_ // haplotypes of // format string (which is the concatenated sequences in the handles). // Arguments: -// _haploGraph: a GBWTGraph which contains the handles in vector< handle_t > -// haplotypes. haplotypte_handle_vectors: a vector of haplotypes in vector< handle_t -// > format. +// _haploGraph: a gbwtgraph::GBWTGraph which contains the handles in vector< handle_t +// > haplotypes. haplotypte_handle_vectors: a vector of haplotypes in vector< +// handle_t > format. // Returns: a vector of haplotypes of format string (which is the concatenated sequences // in the handles). vector SnarlNormalizer::format_handle_haplotypes_to_strings( @@ -1185,10 +1197,12 @@ void SnarlNormalizer::move_path_to_snarl( // cerr << "\nmove_path_to_snarl" << endl; // //TODO: debug_statement: // cerr << "path name: " - // << _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) + // << + // _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) // << endl; // cerr << "source: " << new_source_id << " sink: " << new_sink_id << endl; - // if (_graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) == + // if (_graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) + // == // "chr10") { // cerr << "\t\tstart and end of old embedded path: " // << _graph.get_id(_graph.get_handle_of_step(old_embedded_path.first)) @@ -1344,15 +1358,18 @@ void SnarlNormalizer::move_path_to_snarl( // //todo: debug_statement // cerr << "about to compare. 
compare val: " // << (next_seq.compare(0, compare_length, path_seq, - // cur_index_in_path, compare_length) == 0) + // cur_index_in_path, compare_length) == + // 0) // << " source_and_sink_handles_map " // << source_and_sink_handles_map_properly( - // _graph, new_source_id, new_sink_id, touching_source, - // touching_sink, get<0>(possible_path).front(), next) + // _graph, new_source_id, new_sink_id, + // touching_source, touching_sink, + // get<0>(possible_path).front(), next) // << endl; // cerr << "arguments of compare: " // << " " << 0 << " " << compare_length << " " << path_seq - // << " " << cur_index_in_path << " " << compare_length << " " + // << " " << cur_index_in_path << " " << compare_length << " + // " // << endl; if ((next_seq.compare(0, compare_length, path_seq, cur_index_in_path, compare_length) == 0) && diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index d410221fc1f..509662aee66 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -4,6 +4,8 @@ #include "../vg.hpp" #include "count_walks.hpp" #include +#include + namespace vg { @@ -11,7 +13,7 @@ class SnarlNormalizer { public: virtual ~SnarlNormalizer() = default; - SnarlNormalizer(MutablePathDeletableHandleGraph &graph, const GBWTGraph &haploGraph, + SnarlNormalizer(MutablePathDeletableHandleGraph &graph, const gbwtgraph::GBWTGraph &haploGraph, const int &max_alignment_size = 200, const string &path_finder = "GBWT" /*alternative is "exhaustive"*/); @@ -25,7 +27,7 @@ class SnarlNormalizer { MutablePathDeletableHandleGraph &_graph; // GBWT graph with snarls to normalize, includes the embedded threads needed for the // GBWTPathFinder approach. - const GBWTGraph &_haploGraph; + const gbwtgraph::GBWTGraph &_haploGraph; // the maximum number of threads allowed to align in a given snarl. If the number of // threads exceeds this threshold, the snarl is skipped. 
int _max_alignment_size; @@ -34,7 +36,7 @@ class SnarlNormalizer { const string &_path_finder; tuple>, vector>, unordered_set> - extract_gbwt_haplotypes(const SubHandleGraph &snarl, const GBWTGraph &graph, + extract_gbwt_haplotypes(const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &graph, const id_t &source_id, const id_t &sink_id); pair, unordered_set> source_to_sink_exhaustive_path_finder(); diff --git a/src/io/register_loader_saver_hash_graph.cpp b/src/io/register_loader_saver_hash_graph.cpp index f2d4b1fdc10..fb8db9ad538 100644 --- a/src/io/register_loader_saver_hash_graph.cpp +++ b/src/io/register_loader_saver_hash_graph.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_hash_graph() { - Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { // Allocate a HashGraph bdsg::HashGraph* hash_graph = new bdsg::HashGraph(); diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index dc1764f5338..a7199702640 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -5,13 +5,15 @@ #include #include +#include + #include "subcommand.hpp" #include "../../include/bdsg/hash_graph.hpp" #include "../../include/vg/io/vpkg.hpp" // #include "../algorithms/0_draft_haplotype_realignment.hpp" -#include "../algorithms/0_oo_normalize_snarls.hpp" #include "../algorithms/0_draft_snarl_normalization_evaluation.cpp" +#include "../algorithms/0_oo_normalize_snarls.hpp" #include "../gbwt_helper.hpp" #include // for high_resolution_clock @@ -95,9 +97,16 @@ int main_normalize(int argc, char **argv) { } } - bdsg::HashGraph *graph; - get_input_file(optind, argc, argv, - [&](istream &in) { graph = new bdsg::HashGraph(in); }); + cerr << "getting hashgraph" << endl; + unique_ptr graph; + get_input_file(optind, argc, argv, [&](istream &in) { + graph = 
vg::io::VPKG::load_one(in); + }); + + // bdsg::HashGraph *graph; + // get_input_file(optind, argc, argv, + // [&](istream &in) { graph = new bdsg::HashGraph(in); }); + cerr << "got hashgraph" << endl; if (normalize) { cerr << "running normalize!" << endl; @@ -109,7 +118,7 @@ int main_normalize(int argc, char **argv) { // Load the GBWT from its container unique_ptr gbwt; gbwt = vg::io::VPKG::load_one(gbwt_stream); - GBWTGraph haploGraph = vg::GBWTGraph(*gbwt, *graph); + gbwtgraph::GBWTGraph haploGraph = gbwtgraph::GBWTGraph(*gbwt, *graph); std::ifstream snarl_stream; string snarl_file = snarls; @@ -122,13 +131,15 @@ int main_normalize(int argc, char **argv) { // Record start time auto start = chrono::high_resolution_clock::now(); - SnarlNormalizer normalizer = SnarlNormalizer(*graph, haploGraph, max_alignment_size); + SnarlNormalizer normalizer = + SnarlNormalizer(*graph, haploGraph, max_alignment_size); // run test code on all snarls in graph. normalizer.normalize_top_level_snarls(snarl_stream); // // run test code on all snarls in graph. (non obj-oriented code) - // disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream, max_alignment_size); + // disambiguate_top_level_snarls(*graph, haploGraph, snarl_stream, + // max_alignment_size); // Record end time auto finish = std::chrono::high_resolution_clock::now(); @@ -137,19 +148,22 @@ int main_normalize(int argc, char **argv) { } if (evaluate) { - // std::ifstream snarl_stream; - // string snarl_file = snarls; - // snarl_stream.open(snarl_file); - // cerr << "about to evaluate normalized snarls" << endl; - // vg::evaluate_normalized_snarls(snarl_stream); + std::ifstream snarl_stream; + string snarl_file = snarls; + snarl_stream.open(snarl_file); + cerr << "about to evaluate normalized snarls" << endl; + vg::evaluate_normalized_snarls(snarl_stream); } // TODO: NOTE: this may be cumbersome code if we decide to add more argument types. // Consider changing. 
+ if (normalize) { - graph->serialize(std::cout); + vg::io::VPKG::save(*dynamic_cast(graph.get()), cout); + + // graph->serialize(std::cout); } - delete graph; + // delete graph; return 0; } From 36ae967c5cc8df7da97116244f4ff99afa1047ae Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 4 Nov 2019 13:19:45 -0800 Subject: [PATCH 37/63] normalize update --- src/subcommand/0_normalize_main.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index a7199702640..226737368fe 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -97,16 +97,26 @@ int main_normalize(int argc, char **argv) { } } - cerr << "getting hashgraph" << endl; + // //get a VG graph: + // cerr << "getting a vg graph" << endl; + // unique_ptr graph; + // get_input_file(optind, argc, argv, [&](istream &in) { + // graph = vg::io::VPKG::load_one(in); + // }); + // cerr << "got a vg graph" << endl; + + + //getting graph of any type, requires vpkg wrapping: unique_ptr graph; get_input_file(optind, argc, argv, [&](istream &in) { graph = vg::io::VPKG::load_one(in); }); + // cerr << "getting hashgraph" << endl; // bdsg::HashGraph *graph; // get_input_file(optind, argc, argv, // [&](istream &in) { graph = new bdsg::HashGraph(in); }); - cerr << "got hashgraph" << endl; + // cerr << "got hashgraph" << endl; if (normalize) { cerr << "running normalize!" << endl; From f13be40cc91f4315ccb6c5b24d0ac8defe4cc4ef Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 4 Nov 2019 16:58:56 -0800 Subject: [PATCH 38/63] made a simple vpkg wrap function for old vg files. 
--- src/subcommand/0_normalize_main.cpp | 5 ++ src/subcommand/0_vpkg_wrap_main.cpp | 105 ++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 src/subcommand/0_vpkg_wrap_main.cpp diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 226737368fe..2c47c0358ec 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -105,6 +105,11 @@ int main_normalize(int argc, char **argv) { // }); // cerr << "got a vg graph" << endl; + // //using vpkg for just HashGraph: + // unique_ptr graph; + // get_input_file(optind, argc, argv, [&](istream &in) { + // graph = vg::io::VPKG::load_one(in); + // }); //getting graph of any type, requires vpkg wrapping: unique_ptr graph; diff --git a/src/subcommand/0_vpkg_wrap_main.cpp b/src/subcommand/0_vpkg_wrap_main.cpp new file mode 100644 index 00000000000..0efd5ff07f4 --- /dev/null +++ b/src/subcommand/0_vpkg_wrap_main.cpp @@ -0,0 +1,105 @@ +#pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? +#include +#include +#include + +#include + +#include "subcommand.hpp" + +#include "../../include/bdsg/hash_graph.hpp" +#include "../../include/vg/io/vpkg.hpp" +// #include "../algorithms/0_draft_haplotype_realignment.hpp" +#include "../algorithms/0_oo_normalize_snarls.hpp" +#include "../gbwt_helper.hpp" + +#include "../../include/vg/io/vpkg.hpp" + +using namespace std; +using namespace vg; +using namespace vg::subcommand; + +void test(const HandleGraph &graph){ + handle_t handle = graph.get_handle(2); + cerr << "seq of handle 2 " << graph.get_sequence(handle); +} + +void help_vpkg_wrap(char **argv) { + cerr + << "usage: " << argv[0] << " vpkg_wrap [options] >[vpkg_wrapped.hg]" << endl + << "Wraps given vg graph into vpkg format, saves as hg." << endl + << endl + << "options:" << endl; + // << " -v, --input_vg unwrapped vg." 
<< endl; +} + + +int main_vpkg_wrap(int argc, char **argv) { + + if (argc == 2) { + help_vpkg_wrap(argv); + return 1; + } + + string input_vg; + + int c; + optind = 2; // force optind past command positional argument + while (true) { + static struct option long_options[] = + + {{"help", no_argument, 0, 'h'}, + {"input_vg", required_argument, 0, 'v'}, + {0, 0, 0, 0}}; + + int option_index = 0; + c = getopt_long(argc, argv, "hv:", long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) { + + case 'v': + input_vg = optarg; + break; + + default: + abort(); + } + } + + // if (input_vg.size() > 0) { + // //get a HashGraph graph: + // cerr << "getting a HashGraph graph" << endl; + // unique_ptr graph; + // get_input_file(optind, argc, argv, [&](istream &in) { + // graph = vg::io::VPKG::load_one(in); + // }); + + // cerr << "got a HashGraph graph" << endl; + // PathHandleGraph *hand_graph = dynamic_cast(graph.get()); + // cerr << "pointer location " << hand_graph << endl; + // // test(*hand_graph); + //get a VG graph: + cerr << "getting a vg graph" << endl; + unique_ptr graph; + get_input_file(optind, argc, argv, [&](istream &in) { + graph = vg::io::VPKG::load_one(in); + }); + cerr << "got a vg graph" << endl; + // PathHandleGraph *hand_graph = dynamic_cast(graph.get()); + + //write graph + // vg::io::VPKG::save(*dynamic_cast(graph.get()), cout); + vg::io::VPKG::save(*dynamic_cast(graph.get()), cout); + cerr << "saved the vg graph, now wrapped." 
<< endl; + // } + return 0; +} + +// Register subcommand +static Subcommand vg_vpkg_wrap("vpkg_wrap", + "Wraps given vg graph into vpkg format, saves as hg.", TOOLKIT, + main_vpkg_wrap); \ No newline at end of file From 92bf363fe9105df4fa56653cbf80f72abf375faa Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 2 Nov 2020 10:17:18 -0800 Subject: [PATCH 39/63] old snarl normalizer updates --- robin_bash/debug_normalize_snarl.sh | 130 ++++++++++++++++++++++++---- src/subcommand/0_vpkg_wrap_main.cpp | 2 +- 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/robin_bash/debug_normalize_snarl.sh b/robin_bash/debug_normalize_snarl.sh index fad23c806bf..465d6693591 100755 --- a/robin_bash/debug_normalize_snarl.sh +++ b/robin_bash/debug_normalize_snarl.sh @@ -7,16 +7,97 @@ echo compiling! . ./source_me.sh && make -j 8 echo running! +###After Normalization +dir=/home/robin/paten_lab/vg/test/robin_tests/chr21 +# base=hgsvc_construct.chr21.robin_made +base=hgsvc_construct.chr21.robin_made.normalized +haps=HGSVC.haps.vcf.gz +threads=16 +# meta=200bp.60000num +reads=hgsvc_construct.chr21.robin_made.normalized.read_sim.200bp.60000num.txt +# reads=hgsvc_construct.chr21.robin_made.normalized.read_sim.400bp.5000num.txt + +vg map -t $threads -d $dir/$base -x $dir/$base.xg -T $dir/$reads >$dir/$base.alignment.gam +# vg view -a $dir/$base.alignment.gam -j >$dir/$base.alignment.$meta.json + +echo finished mapping. + +#jq arbitrary queries for gam files (vg view is easier, more weighty), vg gamcompare with an empty file lets you look at full gam. 
+ +###Before Normalization (orignally used in sh file on courtyard - see reconstruct_jmonlong_chr21): +# in_dir=/public/groups/cgl/graph-genomes/jmonlong/hgsvc/haps +# ref=hg38.fa +# in_dir=/home/robin/paten_lab/vg/test/robin_tests/chr21 +# vars=HGSVC.haps.vcf.gz +# base=hgsvc_construct.chr21.robin_made +# base_out=hgsvc_construct.chr21.robin_made.test_out +# chrom=chr21 +# threads=8 + +#make graph +#vg construct -r $in_dir/$ref -v $in_dir/$vars -R $chrom -C -m 32 -a -f > $base.vg + +#index graph. Note: currently doesn't make .gcsa for some reason. +# vg index -t $threads -x $in_dir/$base_out.xg -G $in_dir/$base_out.gbwt -v $in_dir/$vars -g $in_dir/$base_out.gcsa $in_dir/$base.vg +# vg index -G hgsvc_construct.chr21.robin_made.normalized.gbwt -g hgsvc_construct.chr21.robin_made.normalized.gcsa -v HGSVC.haps.vcf.gz hgsvc_construct.chr21.robin_made.normalized.vg + +#make snarls. +# vg snarls -v $in_dir/$vars $base.vg > $base.snarls.pb + +# chunk graph? +# vg chunk -x hgsvc_construct.chr21.robin_made.normalized.xg -G hgsvc_construct.chr21.robin_made.normalized.gbwt -r 0:16608 >chunk_normalized_0_to_16608.vg + + + ###Before and During Normalization ##running normalize_snarls on a full chromosome - local machine. 
-TEST_DIR=test/robin_tests/full_chr10 -FILE_NAME=hgsvc_chr10_construct -FILE_NAME_OUT=hgsvc_chr10_construct_normalized_no_max_size +# TEST_DIR=test/robin_tests/full_chr10 +# FILE_NAME=hgsvc_chr10_construct # FILE_NAME_OUT=junk +# FILE_NAME_OUT=chr10_no_gbwt_handles_at_25128 +# FILE_NAME_OUT=hgsvc_chr10_construct_normalized_no_max_size + +# TEST_DIR=test/robin_tests/chr21 +# FILE_NAME=hgsvc_construct.chr21.robin_made.normalized +# FILE_NAME_OUT=hgsvc_construct.chr21.robin_made.normalized +# FILE_NAME_OUT=hgsvc_construct.chr21.robin_made.normalized.subgraph.301929.exp_context + +## running full chr21: +# vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls.pb $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/$FILE_NAME_OUT.hg + +## running subset of chr21 +# vg find -x $TEST_DIR/$FILE_NAME.xg -n 301929 -c 100 >$FILE_NAME_OUT.vg +# ./bin/vg view -dpn $FILE_NAME_OUT.vg| \ +# dot -Tsvg -o $FILE_NAME_OUT.svg +# chromium-browser $FILE_NAME_OUT.svg + + +# vg view -dpn $FILE_NAME_OUT.vg| \ +# dot -Tsvg -o $FILE_NAME_OUT.svg +# chromium-browser $FILE_NAME_OUT.svg + +## for extracting a prenormalized subgraph for looking at chr10 + +# TEST_DIR=test/robin_tests/chr21 +# FILE_NAME=hgsvc_construct.chr21.robin_made +# FILE_NAME_OUT=hgsvc_construct.chr21.robin_made.subgraph.301929.exp_context + +# vg find -x $TEST_DIR/$FILE_NAME.xg -n 301929 -c 100 >$FILE_NAME_OUT.vg +# ./bin/vg view -dpn $FILE_NAME_OUT.vg| \ +# dot -Tsvg -o $FILE_NAME_OUT.svg +# chromium-browser $FILE_NAME_OUT.svg + ##running full chr10 -echo "running normalize (w/ evaluation)" -vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/$FILE_NAME_OUT.hg +# echo "running normalize (w/ evaluation)" +# valgrind --leak-check=full vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/$FILE_NAME_OUT.hg +# vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls.pb 
$TEST_DIR/$FILE_NAME.hg >$TEST_DIR/$FILE_NAME_OUT.hg + + +# ##running full chr10 with no max size. +# echo "running normalize (w/ evaluation)" +# vg normalize -e -m 0 -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/$FILE_NAME_OUT.hg + ## for printing out the normalized subsnarl: # vg normalize -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >graph_out.vg @@ -25,22 +106,37 @@ vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TES # chromium-browser graph_out.svg ## for extracting a prenormalized subgraph for looking at chr10 -# vg find -x test/robin_tests/full_chr10/hgsvc_chr10_construct.xg -n 1236806 -c 25 >graph_in.vg -# ./bin/vg view -dpn graph_in.vg| \ -# dot -Tsvg -o graph_in.svg -# chromium-browser graph_in.svg +# vg find -x $TEST_DIR/$FILE_NAME.xg -n 25128 -c 25 >$FILE_NAME_OUT.vg +# ./bin/vg view -dpn $FILE_NAME_OUT.vg| \disambiguating snarl #85 source: 23053 sink: 23075 +# dot -Tsvg -o $FILE_NAME_OUT.svg +# chromium-browser $FILE_NAME_OUT.svg + +## looking at an old example +# TEST_DIR=test/robin_tests/robin_haplotypes/complex +# FILE_NAME=chr10_subgraph_0_new +# FILE_NAME_OUT=chr10_subgraph_0_new_normalized_200_max_thread_size +# # vg index -G $TEST_DIR/$FILE_NAME.gbwt -v $TEST_DIR/../../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME.vg +# # vg convert -v $TEST_DIR/$FILE_NAME.vg -A >$TEST_DIR/$FILE_NAME.hg +# vg normalize -e -g $TEST_DIR/$FILE_NAME.gbwt -s $TEST_DIR/$FILE_NAME.snarls $TEST_DIR/$FILE_NAME.hg >$TEST_DIR/$FILE_NAME_OUT.hg +# vg convert -a $TEST_DIR/$FILE_NAME_OUT.hg -V >$TEST_DIR/$FILE_NAME_OUT.vg +# vg mod -g 609548 -x 65 $TEST_DIR/$FILE_NAME_OUT.vg | vg view -dpn - | dot -Tsvg -o $TEST_DIR_OUT/$FILE_NAME_OUT.svg +# chromium-browser $TEST_DIR_OUT/$FILE_NAME_OUT.svg + + + ### After Normalization: ## for making a snarls file: -vg convert -a $TEST_DIR/$FILE_NAME_OUT.hg -V >$TEST_DIR/$FILE_NAME_OUT.vg -echo "hg converted to vg" -vg snarls 
$TEST_DIR/$FILE_NAME_OUT.vg >$TEST_DIR/$FILE_NAME_OUT.snarls -echo ".snarls made" +# vg convert -a $TEST_DIR/$FILE_NAME_OUT.hg -V >$TEST_DIR/$FILE_NAME_OUT.vg +# echo "hg converted to vg" +# vg snarls $TEST_DIR/$FILE_NAME_OUT.vg >$TEST_DIR/$FILE_NAME_OUT.snarls +# echo ".snarls made" ## for evaluating normalized graph: -echo "getting vg stats:" -vg stats -z -l $TEST_DIR/$FILE_NAME_OUT.vg +# echo "getting vg stats:" +# vg stats -z -l $TEST_DIR/$FILE_NAME_OUT.vg ## creating a new gbwt graph from the outgraph: -vg index -G $TEST_DIR/$FILE_NAME_OUT.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME_OUT.vg -echo "gbwt made" \ No newline at end of file +# vg index -G $TEST_DIR/$FILE_NAME_OUT.gbwt -v $TEST_DIR/../HGSVC.haps.chr10.vcf.gz $TEST_DIR/$FILE_NAME_OUT.vg +# echo "gbwt made" + diff --git a/src/subcommand/0_vpkg_wrap_main.cpp b/src/subcommand/0_vpkg_wrap_main.cpp index 0efd5ff07f4..026c27984ff 100644 --- a/src/subcommand/0_vpkg_wrap_main.cpp +++ b/src/subcommand/0_vpkg_wrap_main.cpp @@ -27,7 +27,7 @@ void test(const HandleGraph &graph){ void help_vpkg_wrap(char **argv) { cerr << "usage: " << argv[0] << " vpkg_wrap [options] >[vpkg_wrapped.hg]" << endl - << "Wraps given vg graph into vpkg format, saves as hg." << endl + << "Wraps given vg graph into vpkg format, saves as vg." << endl << endl << "options:" << endl; // << " -v, --input_vg unwrapped vg." 
<< endl; From b3650bb4668fc6543c031b2fda1026155891ea9b Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 24 Nov 2020 11:46:25 -0800 Subject: [PATCH 40/63] cleaned some comments --- src/algorithms/0_oo_normalize_snarls.cpp | 108 +++++++++++++++-------- src/algorithms/0_oo_normalize_snarls.hpp | 2 +- src/subcommand/0_normalize_main.cpp | 3 +- 3 files changed, 71 insertions(+), 42 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index e7ddcb398bc..7fc59bc37a9 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -19,21 +19,29 @@ #include "../types.hpp" #include "extract_containing_graph.hpp" +/* +TODO: allow for snarls that have haplotypes that begin or end in the middle of the snarl +TODO: allow normalization of multiple adjacent snarls in one combined realignment. +*/ namespace vg { + +/** + * To "normalize" a snarl, SnarlNormalizer extracts all the sequences in the snarl as + * represented in the gbwt, and then realigns them to create a replacement snarl. + * This process hopefully results in a snarl with less redundant sequence, and with + * duplicate variation combined into a single variant. +*/ SnarlNormalizer::SnarlNormalizer(MutablePathDeletableHandleGraph &graph, const gbwtgraph::GBWTGraph &haploGraph, const int &max_alignment_size, const string &path_finder) : _haploGraph(haploGraph), _graph(graph), _max_alignment_size(max_alignment_size), _path_finder(path_finder) {} -// TODO: allow for snarls that have haplotypes that begin or end in the middle of the -// snarl -// Runs disambiguate_snarl on every top-level snarl in the _graph, so long as the -// snarl only contains haplotype threads that extend fully from source to sink. -// Arguments: -// _graph: the full-sized handlegraph that will undergo edits in a snarl. -// _haploGraph: the corresponding gbwtgraph::GBWTGraph of _graph. 
-// snarl_stream: the file stream from .snarl file corresponding to _graph. + +/** + * Iterates over all top-level snarls in _graph, and normalizes them. + * @param snarl_stream file stream from .snarl.pb output of vg snarls +*/ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { cerr << "disambiguate_top_level_snarls" << endl; SnarlManager *snarl_manager = new SnarlManager(snarl_stream); @@ -41,15 +49,19 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { int num_snarls_normalized = 0; int num_snarls_skipped = 0; vector snarl_roots = snarl_manager->top_level_snarls(); - // error_record's bools are: - // 0) snarl exceeds max number of threads that can be efficiently aligned, - // 1) snarl has haplotypes starting/ending in the middle, - // 2) some handles in the snarl aren't connected by a thread, - // 3) snarl is cyclic. - // there are two additional ints for tracking the changing size of sequence in the - // snarl: - // 4) number of bases in the snarl before normalization - // 5) number of bases in the snarl after normalization. + + /** + * We keep an error record to observe when snarls are skipped because they aren't + * normalizable under current restraints. Bools: + * 0) snarl exceeds max number of threads that can be efficiently aligned, + * 1) snarl has haplotypes starting/ending in the middle, + * 2) some handles in the snarl aren't connected by a thread, + * 3) snarl is cyclic. + * There are two additional ints for tracking the changing size of sequence in the + * snarl. Ints: + * 4) number of bases in the snarl before normalization + * 5) number of bases in the snarl after normalization. 
+ */ int error_record_size = 5; vector one_snarl_error_record(error_record_size, 0); vector full_error_record(error_record_size, 0); @@ -58,12 +70,11 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { for (auto roots : snarl_roots) { // if (roots->start().node_id() > 269600 && roots->start().node_id() < 269700) { - // if (roots->start().node_id() == 0) { // as in, don't normalize any snarls - // if (roots->start().node_id() < 50000) { cerr << "disambiguating snarl #" << (num_snarls_normalized + num_snarls_skipped) << " source: " << roots->start().node_id() << " sink: " << roots->end().node_id() << endl; + one_snarl_error_record = normalize_snarl(roots->start().node_id(), roots->end().node_id()); @@ -162,7 +173,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // haplotypes_that_end/start_prematurely, set of all handles in the haplotypes > tuple>, vector>, unordered_set> gbwt_haplotypes = - extract_gbwt_haplotypes(snarl, _haploGraph, _cur_source_id, sink_id); + extract_gbwt_haplotypes(snarl, _cur_source_id, sink_id); // Convert the haplotypes from vector format to string format. 
get<0>(haplotypes) = format_handle_haplotypes_to_strings(get<0>(gbwt_haplotypes)); get<1>(haplotypes) = get<1>(gbwt_haplotypes); @@ -208,14 +219,15 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & vector> embedded_paths = extract_embedded_paths_in_snarl(_graph, _cur_source_id, sink_id); + //todo: debug_statement + cerr << "Let's see what sequences I have before adding embedded paths to seq info:" << endl; + for (string seq : get<0>(haplotypes)) { + cerr << seq << endl; + } + // find the paths that stretch from source to sink: for (auto path : embedded_paths) { - // cerr << "checking path of name " << - // _graph.get_path_name(graph.get_path_handle_of_step(path.first)) << " with - // start " << _graph.get_id(graph.get_handle_of_step(path.first)) << " and - // sink " << - // _graph.get_id(graph.get_handle_of_step(graph.get_previous_step(path.second))) - // << endl; + cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with start " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; if (_graph.get_id(_graph.get_handle_of_step(path.first)) == _cur_source_id && _graph.get_id(_graph.get_handle_of_step( _graph.get_previous_step(path.second))) == sink_id) { @@ -316,7 +328,6 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // pair>, vector>> tuple>, vector>, unordered_set> SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, - const gbwtgraph::GBWTGraph &haploGraph, const id_t &_source_id, const id_t &sink_id) { // cerr << "extract_gbwt_haplotypes" << endl; @@ -388,7 +399,7 @@ SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, for (gbwt::SearchState next_search : next_searches) { handle_t next_handle = _haploGraph.node_to_handle(next_search.node); // if (!snarl.has_node(snarl.get_id(next_handle)) && - // 
make_pair(haploGraph.get_id(cur_haplotype.first.back()),haploGraph.get_id(next_handle))) + // make_pair(_haploGraph.get_id(cur_haplotype.first.back()),_haploGraph.get_id(next_handle))) // { if (!snarl.has_edge(cur_haplotype.first.back(), next_handle)) { if (incorrect_connections.find( @@ -424,7 +435,7 @@ SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, next_handle_vec.push_back(next_handle); // if new_handle is the sink, put in haplotypes_from_source_to_sink - if (haploGraph.get_id(next_handle) == sink_id) { + if (_haploGraph.get_id(next_handle) == sink_id) { haplotypes_from_source_to_sink.push_back(next_handle_vec); } else // keep extending the haplotype! { @@ -444,7 +455,7 @@ SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, } // if next_handle is the sink, put in haplotypes_from_source_to_sink - else if (haploGraph.get_id( + else if (_haploGraph.get_id( _haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { // Then we need to add cur_haplotype + next_search to // haplotypes_from_source_to_sink. 
@@ -484,6 +495,15 @@ SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, move(haplotypes_not_starting_at_source.begin(), haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); + //todo: debug_statement + cerr << "lets look through all the haplotypes after extraction:" << endl; + for (vector hap_vec : haplotypes_from_source_to_sink) { + cerr << "new hap:" << endl; + for (handle_t handle : hap_vec){ + cerr << _haploGraph.get_id(handle) << " " << _haploGraph.get_sequence(handle) << endl; + } + } + return tuple>, vector>, unordered_set>{haplotypes_from_source_to_sink, other_haplotypes, touched_handles}; @@ -699,6 +719,9 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( hap.replace(hap.size() - 1, 1, "X"); } + // //todo: debug_statement + // source_to_sink_haplotypes.emplace_back("XX"); + // /// make a new scoring matrix with _match=5, _mismatch = -3, _gap_extend = -1, and // _gap_open = -3, EXCEPT that Q has to be matched with Q (so match score between Q // and Q =len(seq)+1) @@ -741,7 +764,7 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( row_string += *it; } // todo: debug_statement - // cerr << "ROW_STRING: " << row_string << endl; + cerr << "ROW_STRING: " << row_string << endl; // edit the row so that the proper source and sink chars are added to the // haplotype instead of the special characters added to ensure correct alignment // of source and sink. 
@@ -752,6 +775,8 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( stringstream ss; for (string seq : row_strings) { + // todo: debug_statement + cerr << "seq in alignment:" << seq << endl; ss << endl << seq; } // ss << align; @@ -1023,13 +1048,13 @@ void SnarlNormalizer::integrate_snarl( const vector> embedded_paths) { // cerr << "integrate_snarl" << endl; - // //todo: debug_statement - // cerr << "handles in to_insert_snarl:" << endl; - // to_insert_snarl.for_each_handle([&](const handle_t &handle) { - // cerr << to_insert_snarl.get_id(handle) << " " - // << to_insert_snarl.get_sequence(handle) << " \t"; - // }); - // cerr << endl; + //todo: debug_statement + cerr << "handles in to_insert_snarl:" << endl; + to_insert_snarl.for_each_handle([&](const handle_t &handle) { + cerr << to_insert_snarl.get_id(handle) << " " + << to_insert_snarl.get_sequence(handle) << " \t"; + }); + cerr << endl; // Get old _graph snarl SubHandleGraph old_snarl = extract_subgraph(_graph, _cur_source_id, _cur_sink_id); @@ -1072,6 +1097,7 @@ void SnarlNormalizer::integrate_snarl( handle_t graph_handle = _graph.create_handle(to_insert_snarl.get_sequence(to_insert_snarl_handle)); new_snarl_topo_order.push_back(graph_handle); + cerr << "graph handle being inserted into new_snarl_topo_order:" << _graph.get_id(graph_handle) << endl; } // Connect the newly made handles in the _graph together the way they were connected @@ -1096,6 +1122,8 @@ void SnarlNormalizer::integrate_snarl( // on. id_t temp_snarl_source_id = _graph.get_id(new_snarl_topo_order.front()); id_t temp_snarl_sink_id = _graph.get_id(new_snarl_topo_order.back()); + cerr << "the temp source id: " << temp_snarl_source_id << endl; + cerr << "the temp sink id: " << temp_snarl_sink_id << endl; // Add the neighbors of the source and sink of the original snarl to the new_snarl's // source and sink. 
@@ -1168,11 +1196,13 @@ void SnarlNormalizer::integrate_snarl( _graph.rewrite_segment(step, _graph.get_next_step(step), vector{new_sink_handle}); }); + cerr << "the temp source id: " << temp_snarl_source_id << endl; + cerr << "the temp sink id: " << temp_snarl_sink_id << endl; // delete the previously created source and sink: for (handle_t handle : {_graph.get_handle(temp_snarl_source_id), _graph.get_handle(temp_snarl_sink_id)}) { - + cerr << "id of handle to delete from tem source/sink: " << _graph.get_id(handle) << endl; _graph.destroy_handle(handle); } } diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index 509662aee66..af487ef7060 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -36,7 +36,7 @@ class SnarlNormalizer { const string &_path_finder; tuple>, vector>, unordered_set> - extract_gbwt_haplotypes(const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &graph, + extract_gbwt_haplotypes(const SubHandleGraph &snarl, const id_t &source_id, const id_t &sink_id); pair, unordered_set> source_to_sink_exhaustive_path_finder(); diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 2c47c0358ec..d0d278dab12 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -33,8 +33,7 @@ void help_normalize(char **argv) { << " -m, --max_alignment_size limits the number of threads that will " "be aligned in any snarl. If exceeded, program skips snarl. Default is 200 " "threads. If you don't want to skip any snarls based on thread count, enter 0." - << endl - << " -s, --snarls snarls file corresponding to hashgraph." << endl; + << endl; } int main_normalize(int argc, char **argv) { From 01d2892acc8d111608b1dbc3664a72f6cbf52a00 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 24 Nov 2020 15:23:54 -0800 Subject: [PATCH 41/63] made SnarlSequences class for gbwt sequences. 
--- src/algorithms/0_oo_normalize_snarls.cpp | 856 +++++++++++------------ src/algorithms/0_oo_normalize_snarls.hpp | 8 +- src/algorithms/0_snarl_sequences.cpp | 361 ++++++++++ src/algorithms/0_snarl_sequences.hpp | 31 + 4 files changed, 808 insertions(+), 448 deletions(-) create mode 100644 src/algorithms/0_snarl_sequences.cpp create mode 100644 src/algorithms/0_snarl_sequences.hpp diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 7fc59bc37a9..b1149e5f3b3 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -1,5 +1,7 @@ #pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? + #include "0_oo_normalize_snarls.hpp" +#include "0_snarl_sequences.hpp" #include #include @@ -21,7 +23,11 @@ /* TODO: allow for snarls that have haplotypes that begin or end in the middle of the snarl + TODO: allow normalization of multiple adjacent snarls in one combined realignment. + +TODO: test that extract_gbwt haplotypes successfully extracts any haplotypes that start/end in the middle of +TODO: the snarl. */ namespace vg { @@ -57,8 +63,7 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { * 1) snarl has haplotypes starting/ending in the middle, * 2) some handles in the snarl aren't connected by a thread, * 3) snarl is cyclic. - * There are two additional ints for tracking the changing size of sequence in the - * snarl. Ints: + * There are two additional ints for tracking the snarl size. Ints: * 4) number of bases in the snarl before normalization * 5) number of bases in the snarl after normalization. 
*/ @@ -74,9 +79,8 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { << (num_snarls_normalized + num_snarls_skipped) << " source: " << roots->start().node_id() << " sink: " << roots->end().node_id() << endl; - - one_snarl_error_record = - normalize_snarl(roots->start().node_id(), roots->end().node_id()); + + normalize_snarl(roots->start().node_id(), roots->end().node_id()); if (!((one_snarl_error_record[0]) || (one_snarl_error_record[1]) || (one_snarl_error_record[2]) || (one_snarl_error_record[3]))) { @@ -111,15 +115,7 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { cerr << "amount of sequence in normalized snarls after normalization: " << snarl_sequence_change.second << endl; - /// Args: - /// source _graph to extract subgraph from - /// into _graph to extract into - /// positions search outward from these positions - /// max_dist include all nodes and edges that can be reached in at most - /// this distance reversing_walk_length also find _graph material that can be - /// reached - - // //todo: debug_statement + // //todo: debug_statement for extracting snarl of interest. // VG outGraph; // pos_t source_pos = make_pos_t(269695, false, 0); // vector pos_vec; @@ -130,57 +126,51 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { delete snarl_manager; } -// For a snarl in the given _graph, with every edge covered by at least one haplotype -// thread in the gbwtgraph::GBWTGraph, -// extract all sequences in the snarl corresponding to the haplotype threads and -// re-align them with MSAConverter/seqan to form a new snarl. Embedded paths are -// preserved; GBWT haplotypes in the snarl are not conserved. -// Arguments: -// _graph: the full-sized handlegraph that will undergo edits in a snarl. -// _haploGraph: the corresponding gbwtgraph::GBWTGraph of _graph. -// source_id: the source of the snarl of interest. -// sink_id: the sink of the snarl of interest. 
+/** + * Normalize a single snarl defined by a source and sink. Only extracts and realigns + * sequences found in the gbwt. + * @param source_id the source of the snarl of interest. + * @param sink_id the sink of the snarl of interest. + * @param error_record an empty vector of 6 integers. +*/ // Returns: none. // TODO: allow for snarls that have haplotypes that begin or end in the middle of the // snarl. vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t &sink_id) { - // cerr << "disambiguate_snarl" << endl; - _cur_source_id = source_id; - _cur_sink_id = sink_id; - - // error_record's bools are: - // 0) snarl exceeds max number of threads that can be efficiently aligned, - // 1) snarl has haplotypes starting/ending in the middle, - // 2) some handles in the snarl aren't connected by a thread, - // 3) snarl is cyclic. - // there are two additional ints for tracking the changing size of sequence in the - // snarl: - // 4) number of bases in the snarl before normalization - // 5) number of bases in the snarl after normalization. + /** + * We keep an error record to observe when snarls are skipped because they aren't + * normalizable under current restraints. Bools: + * 0) snarl exceeds max number of threads that can be efficiently aligned, + * 1) snarl has haplotypes starting/ending in the middle, + * 2) some handles in the snarl aren't connected by a thread, + * 3) snarl is cyclic. + * There are two additional ints for tracking the snarl size. Ints: + * 4) number of bases in the snarl before normalization + * 5) number of bases in the snarl after normalization. + */ vector error_record(6, 0); - SubHandleGraph snarl = extract_subgraph(_graph, _cur_source_id, sink_id); + SubHandleGraph snarl = extract_subgraph(_graph, source_id, sink_id); if (!algorithms::is_acyclic(&snarl)) { - cerr << "snarl at " << _cur_source_id << " is cyclic. Skipping." << endl; + cerr << "snarl at " << source_id << " is cyclic. Skipping." 
<< endl; error_record[3] = true; + return error_record; } // extract threads tuple, vector>, unordered_set> haplotypes; + SnarlSequences sequence_finder = SnarlSequences(snarl, _haploGraph, source_id, sink_id); + if (_path_finder == "GBWT") { - // First, find all haplotypes encoded by the GBWT, in order to create the new - // snarl. Return value is tuple< haplotypes_that_stretch_from_source_to_sink, - // haplotypes_that_end/start_prematurely, set of all handles in the haplotypes > tuple>, vector>, unordered_set> - gbwt_haplotypes = - extract_gbwt_haplotypes(snarl, _cur_source_id, sink_id); + gbwt_haplotypes = sequence_finder.find_gbwt_haps(); // Convert the haplotypes from vector format to string format. get<0>(haplotypes) = format_handle_haplotypes_to_strings(get<0>(gbwt_haplotypes)); get<1>(haplotypes) = get<1>(gbwt_haplotypes); get<2>(haplotypes) = get<2>(gbwt_haplotypes); } else if (_path_finder == "exhaustive") { pair, unordered_set> exhaustive_haplotypes = - source_to_sink_exhaustive_path_finder(); + source_to_sink_exhaustive_path_finder(source_id, sink_id); get<0>(haplotypes) = exhaustive_haplotypes.first; get<2>(haplotypes) = exhaustive_haplotypes.second; } else { @@ -204,20 +194,10 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // TODO: 2049699 with 258 haplotypes is taking many minutes. if (get<1>(haplotypes).empty() && get<0>(haplotypes).size() < _max_alignment_size && get<2>(haplotypes).size() == handles_in_snarl.size()) { - // if (get<1>(haplotypes).empty() && get<2>(haplotypes).size() == - // handles_in_snarl) { if (get<1>(haplotypes).empty()) { Convert the haplotypes - // from vector format to string format. - // vector< string > other_haplotypes = - // format_handle_haplotypes_to_strings(haploGraph, get<1>(haplotypes)); - - // Get the embedded paths in the snarl out of the _graph, for the purposes of - // moving them into the new snarl. 
In addition, any embedded paths that stretch - // from source to sink are aligned in the new snarl. - // TODO: once haplotypes that begin/end in the middle of the snarl have been - // TODO: accounted for in the code, align all embedded paths? (and remove next - // TODO: chunk of code that finds source-to-sink paths)? + // Get the embedded paths in the snarl from _graph, to move them to new_snarl. + // Any embedded paths not in gbwt are aligned in the new snarl. vector> embedded_paths = - extract_embedded_paths_in_snarl(_graph, _cur_source_id, sink_id); + extract_embedded_paths_in_snarl(_graph, source_id, sink_id); //todo: debug_statement cerr << "Let's see what sequences I have before adding embedded paths to seq info:" << endl; @@ -225,10 +205,13 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & cerr << seq << endl; } + // TODO: once haplotypes that begin/end in the middle of the snarl have been + // TODO: accounted for in the code, remove next chunk of code that finds + // TODO: source-to-sink paths. 
// find the paths that stretch from source to sink: for (auto path : embedded_paths) { - cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with start " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; - if (_graph.get_id(_graph.get_handle_of_step(path.first)) == _cur_source_id && + // cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with start " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; + if (_graph.get_id(_graph.get_handle_of_step(path.first)) == source_id && _graph.get_id(_graph.get_handle_of_step( _graph.get_previous_step(path.second))) == sink_id) { // cerr << "adding path of name " << @@ -248,40 +231,29 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & VG new_snarl = align_source_to_sink_haplotypes(get<0>(haplotypes)); // count the number of bases in the snarl. - //todo: debug_statement - // cerr << "size of snarl before any counting: " << error_record[5] << endl; - // for (auto rec : error_record){ - // cerr << " " << rec << endl; - // } new_snarl.for_each_handle([&](const handle_t handle) { error_record[5] += new_snarl.get_sequence(handle).size(); }); force_maximum_handle_size(new_snarl, _max_alignment_size); - // //todo: debug_statement - // new_snarl.for_each_handle([&](const handle_t &handle) { - // cerr << new_snarl.get_id(handle) << " " << new_snarl.get_sequence(handle) - // << "\t"; - // }); - // integrate the new_snarl into the _graph, removing the old snarl as you go. 
- integrate_snarl(new_snarl, embedded_paths); + integrate_snarl(new_snarl, embedded_paths, source_id, sink_id); } else { if (!get<1>(haplotypes).empty()) { - cerr << "found a snarl starting at " << _cur_source_id << " and ending at " + cerr << "found a snarl starting at " << source_id << " and ending at " << sink_id << " with haplotypes that start or end in the middle. Skipping." << endl; error_record[1] = true; } if (get<0>(haplotypes).size() > _max_alignment_size) { - cerr << "found a snarl starting at " << _cur_source_id << " and ending at " + cerr << "found a snarl starting at " << source_id << " and ending at " << sink_id << " with too many haplotypes (" << get<0>(haplotypes).size() << ") to efficiently align. Skipping." << endl; error_record[0] = true; } if (get<2>(haplotypes).size() != handles_in_snarl.size()) { - cerr << "some handles in the snarl starting at " << _cur_source_id + cerr << "some handles in the snarl starting at " << source_id << " and ending at " << sink_id << " aren't accounted for by the gbwt_graph. " "Skipping." @@ -296,12 +268,11 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & error_record[2] = true; } } - // todo: decide if this should be a requirement for the integration of normalized - // snarl + // todo: decide if we should only normalize snarls that decrease in size. if (error_record[5] > error_record[4]) { cerr << "NOTE: normalized a snarl which *increased* in sequence quantity, " "starting at " - << _cur_source_id << endl + << source_id << endl << "\tsize before: " << error_record[4] << " size after: " << error_record[5] << endl; } else if (error_record[5] <= 0) { @@ -309,360 +280,356 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & } return error_record; -} // namespace vg - -// TODO: test that it successfully extracts any haplotypes that start/end in the middle of -// TODO: the snarl. 
-// For a snarl in a given gbwtgraph::GBWTGraph, extract all the haplotypes in the snarl. -// Haplotypes are represented -// by vectors of handles, representing the chain of handles in a thread. -// Arguments: -// _haploGraph: the gbwtgraph::GBWTGraph containing the snarl. -// _cur_source_id: the source of the snarl of interest. -// sink_id: the sink of the snarl of interest. -// Returns: -// a pair containting two sets of paths (each represented by a vector). The -// first in the pair represents all paths reaching from source to sink in the snarl, -// and the second representing all other paths in the snarl (e.g. any that don't -// reach both source and sink in the _graph.) -// pair>, vector>> -tuple>, vector>, unordered_set> -SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, - const id_t &_source_id, const id_t &sink_id) { - // cerr << "extract_gbwt_haplotypes" << endl; - - // haplotype_queue contains all started exon_haplotypes not completed yet. - // Every time we encounter a branch in the paths, the next node down the path - // Is stored here, along with the vector of handles that represents the path up - // to the SearchState. - vector, gbwt::SearchState>> haplotype_queue; - - // source and sink handle for _haploGraph: - handle_t source_handle = _haploGraph.get_handle(_source_id); - handle_t sink_handle = _haploGraph.get_handle(sink_id); - - // place source in haplotype_queue. - vector source_handle_vec(1, source_handle); - gbwt::SearchState source_state = _haploGraph.get_state(source_handle); - haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); - - // touched_handles contains all handles that have been touched by the - // depth first search below, for later use in other_haplotypes_to_strings, which - // identifies paths that didn't stretch from source to sink in the snarl. 
- unordered_set touched_handles{source_handle, sink_handle}; - - // haplotypes contains all "finished" haplotypes - those that were either walked - // to their conclusion, or until they reached the sink. - vector> haplotypes_from_source_to_sink; - vector> other_haplotypes; - - // sometimes a gbwt thread will indicate a connection between two handles that doesn't - // actually exist in the _graph. These connections need to be ignored. - unordered_set incorrect_connections; - - // int prev_size = 0; - // for every partly-extracted thread, extend the thread until it either reaches - // the sink of the snarl or the end of the thread. - while (!haplotype_queue.empty()) { - // todo: debug_statement - // cerr << "haplotype queue: "; - // cerr << "size of queue:" << haplotype_queue.size() << " " << endl; - // for (auto hap : haplotype_queue) { - // cerr << "size: " << hap.first.size() << endl << "handle_ids: "; - // for (handle_t handle : hap.first) { - // cerr << _haploGraph.get_id(handle) << " "; - // } - // cerr << endl; - // } - - // get a haplotype out of haplotype_queue to extend - - // a tuple of (handles_traversed_so_far, last_touched_SearchState) - pair, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); - haplotype_queue.pop_back(); - - // get all the subsequent search_states that immediately follow the searchstate - // from cur_haplotype. - vector next_searches; - _haploGraph.follow_paths(cur_haplotype.second, - [&](const gbwt::SearchState next_search) -> bool { - next_searches.push_back(next_search); - return true; - }); - - // if next_searches > 1, then we need to make multiple new haplotypes to be - // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. 
- if (next_searches.size() > 1) { - // for every next_search in next_searches, either create a new, extended - // cur_haplotype to push into haplotype queue, or place in the - // haplotypes_from_source_to_sink if haplotype extends to sink, or place in - // the other_haplotypes if haplotype ends before reaching sink. - for (gbwt::SearchState next_search : next_searches) { - handle_t next_handle = _haploGraph.node_to_handle(next_search.node); - // if (!snarl.has_node(snarl.get_id(next_handle)) && - // make_pair(_haploGraph.get_id(cur_haplotype.first.back()),_haploGraph.get_id(next_handle))) - // { - if (!snarl.has_edge(cur_haplotype.first.back(), next_handle)) { - if (incorrect_connections.find( - snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == - incorrect_connections.end()) { - cerr - << "snarl starting at node " << _cur_source_id - << " and ending at " << sink_id - << " has a thread that incorrectly connects two nodes that " - "don't have any edge connecting them. These two nodes are " - << _haploGraph.get_id(cur_haplotype.first.back()) << " and " - << _haploGraph.get_id(next_handle) - << ". This thread connection will be ignored." 
<< endl; - incorrect_connections.emplace( - snarl.edge_handle(cur_haplotype.first.back(), next_handle)); - - // todo: debug_statement - // cerr << "next handle(s) of handle " - // << snarl.get_id(cur_haplotype.first.back()) - // << " according to snarl:" << endl; - // snarl.follow_edges(cur_haplotype.first.back(), false, - // [&](const handle_t handle) { - // cerr << "\t" << snarl.get_id(handle); - // }); - // cerr << endl; - } - continue; - } - // copy over the vector of cur_haplotype: - vector next_handle_vec(cur_haplotype.first); - - // add the new handle to the vec: - next_handle_vec.push_back(next_handle); - - // if new_handle is the sink, put in haplotypes_from_source_to_sink - if (_haploGraph.get_id(next_handle) == sink_id) { - haplotypes_from_source_to_sink.push_back(next_handle_vec); - } else // keep extending the haplotype! - { - pair, gbwt::SearchState> next_haplotype = - make_pair(next_handle_vec, next_search); - haplotype_queue.push_back(next_haplotype); - } - // next_handle will be touched. - touched_handles.emplace(next_handle); - } - } - // if next_searches is empty, the path has ended but not reached sink. - else if (next_searches.empty()) { - // We have reached the end of the path, but it doesn't reach the sink. - // we need to add cur_haplotype to other_haplotypes. - other_haplotypes.push_back(cur_haplotype.first); - - } - // if next_handle is the sink, put in haplotypes_from_source_to_sink - else if (_haploGraph.get_id( - _haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { - // Then we need to add cur_haplotype + next_search to - // haplotypes_from_source_to_sink. - handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); - cur_haplotype.first.push_back(next_handle); - haplotypes_from_source_to_sink.push_back(cur_haplotype.first); - - // touched next_search's handle - touched_handles.emplace(next_handle); - } - // else, there is just one next_search, and it's not the end of the path. 
- // just extend the search by adding (cur_haplotype + next_search to - // haplotype_queue. - else { - // get the next_handle from the one next_search. - handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); - - // modify cur_haplotype with next_handle and next_search. - cur_haplotype.first.push_back(next_handle); - cur_haplotype.second = - next_searches.back(); // there's only one next_search in next_searches. - - // put cur_haplotype back in haplotype_queue. - haplotype_queue.push_back(cur_haplotype); - touched_handles.emplace(next_handle); - } - } - - // Find any haplotypes starting from handles not starting at the source, but which - // still start somewhere inside the snarl. - vector> haplotypes_not_starting_at_source = - find_haplotypes_not_at_source(touched_handles, sink_id); - - // move haplotypes_not_starting_at_source into other_haplotypes: - other_haplotypes.reserve(other_haplotypes.size() + - haplotypes_not_starting_at_source.size()); - move(haplotypes_not_starting_at_source.begin(), - haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); - - //todo: debug_statement - cerr << "lets look through all the haplotypes after extraction:" << endl; - for (vector hap_vec : haplotypes_from_source_to_sink) { - cerr << "new hap:" << endl; - for (handle_t handle : hap_vec){ - cerr << _haploGraph.get_id(handle) << " " << _haploGraph.get_sequence(handle) << endl; - } - } - - return tuple>, vector>, - unordered_set>{haplotypes_from_source_to_sink, - other_haplotypes, touched_handles}; } -// Used to complete the traversal of a snarl along its haplotype threads, when there are -// handles connected to the snarl by threads that start after the source handle. (Threads -// that merely end before the sink handle are addressed in extract_gbwt_haplotypes). -// Arguments: -// _haploGraph: the GBWTgraph containing the haplotype threads. -// touched_handles: any handles found in the snarl so far. 
-// sink_id: the id of the final handle in the snarl. -// Returns: -// a vector of haplotypes in vector format that start in the middle of the -// snarl. -vector> -SnarlNormalizer::find_haplotypes_not_at_source(unordered_set &touched_handles, - const id_t &sink_id) { - // cerr << "find_haplotypes_not_at_source" << endl; - - /// Search every handle in touched handles for haplotypes starting at that point. - // Any new haplotypes will be added to haplotype_queue. - vector, gbwt::SearchState>> haplotype_queue; - - // Fully extended haplotypes (or haplotypes extended to the snarl's sink) - // will be added to finished_haplotypes. - vector> finished_haplotypes; - - // In addition, we need to put the new handle into to_search, because a path may have - // started on the new handle (which means we need to start a searchstate there.) - unordered_set to_search; - - // We don't need to ever check the sink handle, since paths from the sink handle - // extend beyond snarl. - handle_t sink_handle = _haploGraph.get_handle(sink_id); - // touched_handles.erase(sink_handle); - - // Nested function for making a new_search. Identifies threads starting at a given - // handle and - // either adds them as a full haplotype (if the haplotype is one handle long) or - // makes a new entry to haplotype_queue. - auto make_new_search = [&](handle_t handle) { - // Are there any new threads starting at this handle? - gbwt::SearchState new_search = - _haploGraph.index->prefix(_haploGraph.handle_to_node(handle)); - if (!new_search.empty()) { - // Then add them to haplotype_queue. - _haploGraph.follow_paths( - new_search, [&](const gbwt::SearchState &next_search) -> bool { - handle_t next_handle = _haploGraph.node_to_handle(next_search.node); - - /// check to make sure that the thread isn't already finished: - // if next_handle is the sink, or if this thread is only one handle - // long, then there isn't any useful string to extract from this. 
- if (next_handle != sink_handle || - next_search == gbwt::SearchState()) { - // establish a new thread to walk along. - vector new_path; - new_path.push_back(handle); - new_path.push_back(next_handle); - - pair, gbwt::SearchState> mypair = - make_pair(new_path, next_search); - - // add the new path to haplotype_queue to be extended. - haplotype_queue.push_back(make_pair(new_path, next_search)); - - // if next_handle hasn't been checked for starting threads, add to - // to_search. - if (touched_handles.find(next_handle) == touched_handles.end()) { - to_search.emplace(next_handle); - } - } - return true; - }); - } - }; - - /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. - /// Then, check to see if there are any new threads on handles in to_search. - /// Extend those threads, and add any newly found handles to to_search, - /// then search for threads again in to_search again... repeat until to_search remains - /// emptied of new handles. - - // for tracking whether the haplotype thread is still extending: - bool still_extending; - while (!to_search.empty() || !haplotype_queue.empty()) { - while (!haplotype_queue.empty()) { - // get a haplotype to extend out of haplotype_queue - a tuple of - // (handles_traversed_so_far, last_touched_SearchState) - pair, gbwt::SearchState> cur_haplotype = - haplotype_queue.back(); - haplotype_queue.pop_back(); - - // get all the subsequent search_states that immediately follow the - // searchstate from cur_haplotype. - vector next_searches; - _haploGraph.follow_paths(cur_haplotype.second, - [&](const gbwt::SearchState &next_search) -> bool { - next_searches.push_back(next_search); - return true; - }); - - for (gbwt::SearchState next_search : next_searches) { - handle_t next_handle = _haploGraph.node_to_handle(next_search.node); - - // if next_search is empty, then we've fallen off the thread, - // and cur_haplotype can be placed in finished_haplotypes as is for this - // thread. 
- if (next_search == gbwt::SearchState()) { - finished_haplotypes.push_back(cur_haplotype.first); - } - - // if next_search is on the sink_handle, - // then cur_haplotype.first + next_search goes to finished_haplotypes. - else if (_haploGraph.get_id(next_handle) == sink_id) { - - // copy over the vector of cur_haplotype: - vector next_handle_vec(cur_haplotype.first); - // add next_handle - next_handle_vec.push_back(next_handle); - // place in finished_haplotypes - finished_haplotypes.push_back(next_handle_vec); - - // also, if next_handle hasn't been checked for new threads, add to - // to_search. - if (touched_handles.find(next_handle) != touched_handles.end()) { - to_search.emplace(next_handle); - } - } - // otherwise, just place an extended cur_haplotype in haplotype_queue. - else { - // copy over cur_haplotype: - pair, gbwt::SearchState> cur_haplotype_copy = - cur_haplotype; - // modify with next_handle/search - cur_haplotype_copy.first.push_back(next_handle); - cur_haplotype_copy.second = next_search; - // place back in haplotype_queue for further extension. - haplotype_queue.push_back(cur_haplotype_copy); - - // also, if next_handle hasn't been checked for new threads, add to - // to_search. - if (touched_handles.find(next_handle) != touched_handles.end()) { - to_search.emplace(next_handle); - } - } - } - } - // Then, make more new_searches from the handles in to_search. - for (handle_t handle : to_search) { - make_new_search(handle); // will add to haplotype_queue if there's any - // new_searches to be had. - } - to_search.clear(); - } - return finished_haplotypes; -} +// // TODO: test that it successfully extracts any haplotypes that start/end in the middle of +// // TODO: the snarl. +// /** +// * Finds all haps in gbwt associated with snarl. +// * @param snarl The subhandlegraph of the snarl to be normalized. +// * @param source_id The source of the snarl. +// * @param sink_id The sink of the snarl. 
+// * @return A 3-tuple containing 1) a vector of haps stretching from source to sink, in +// * vector format; 2) a second vector containing all other haps in snarl; +// * 3) a vector of all handles oberved by the method. +// */ +// tuple>, vector>, unordered_set> +// SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, +// const id_t &source_id, const id_t &sink_id) { +// /** +// * haplotype_queue contains all started exon_haplotypes not completed yet. +// * Every time we encounter a branch in the paths, the next node down the path +// * Is stored here, along with the vector of handles that represents the path up +// * to the SearchState. +// */ +// vector, gbwt::SearchState>> haplotype_queue; + +// // source and sink handle for _haploGraph: +// handle_t source_handle = _haploGraph.get_handle(source_id); +// handle_t sink_handle = _haploGraph.get_handle(sink_id); + +// // place source in haplotype_queue. +// vector source_handle_vec(1, source_handle); +// gbwt::SearchState source_state = _haploGraph.get_state(source_handle); +// haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); + +// // touched_handles contains all handles that have been touched by the +// // depth first search below, for later use in other_haplotypes_to_strings, which +// // identifies paths that didn't stretch from source to sink in the snarl. +// unordered_set touched_handles{source_handle, sink_handle}; + +// // these haplotype vecs contains all "finished" haplotypes - those that were either +// // walked to their conclusion, or until they reached the sink. +// vector> haplotypes_from_source_to_sink; +// vector> other_haplotypes; + +// // sometimes a gbwt thread will indicate a connection between two handles that doesn't +// // actually exist in the _graph. These connections need to be ignored. 
+// unordered_set incorrect_connections; + +// // for every partly-extracted thread, extend the thread until it either reaches +// // the sink of the snarl or the end of the thread. +// while (!haplotype_queue.empty()) { +// // todo: debug_statement +// // cerr << "haplotype queue: "; +// // cerr << "size of queue:" << haplotype_queue.size() << " " << endl; +// // for (auto hap : haplotype_queue) { +// // cerr << "size: " << hap.first.size() << endl << "handle_ids: "; +// // for (handle_t handle : hap.first) { +// // cerr << _haploGraph.get_id(handle) << " "; +// // } +// // cerr << endl; +// // } + +// // get a haplotype out of haplotype_queue to extend - +// // a tuple of (handles_traversed_so_far, last_touched_SearchState) +// pair, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); +// haplotype_queue.pop_back(); + +// // get all the subsequent search_states that immediately follow the searchstate +// // from cur_haplotype. +// vector next_searches; +// _haploGraph.follow_paths(cur_haplotype.second, +// [&](const gbwt::SearchState next_search) -> bool { +// next_searches.push_back(next_search); +// return true; +// }); + +// // if next_searches > 1, then we need to make multiple new haplotypes to be +// // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. +// if (next_searches.size() > 1) { +// // for every next_search in next_searches, either create a new, extended +// // cur_haplotype to push into haplotype queue, or place in the +// // haplotypes_from_source_to_sink if haplotype extends to sink, or place in +// // the other_haplotypes if haplotype ends before reaching sink. 
+// for (gbwt::SearchState next_search : next_searches) { +// handle_t next_handle = _haploGraph.node_to_handle(next_search.node); +// // if (!snarl.has_node(snarl.get_id(next_handle)) && +// // make_pair(_haploGraph.get_id(cur_haplotype.first.back()),_haploGraph.get_id(next_handle))) +// // { +// if (!snarl.has_edge(cur_haplotype.first.back(), next_handle)) { +// if (incorrect_connections.find( +// snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == +// incorrect_connections.end()) { +// cerr << "snarl starting at node " << source_id +// << " and ending at " << sink_id +// << " has a thread that incorrectly connects two nodes that " +// "don't have any edge connecting them. These two nodes are " +// << _haploGraph.get_id(cur_haplotype.first.back()) << " and " +// << _haploGraph.get_id(next_handle) +// << ". This thread connection will be ignored." << endl; +// incorrect_connections.emplace( +// snarl.edge_handle(cur_haplotype.first.back(), next_handle)); + +// // todo: debug_statement +// // cerr << "next handle(s) of handle " +// // << snarl.get_id(cur_haplotype.first.back()) +// // << " according to snarl:" << endl; +// // snarl.follow_edges(cur_haplotype.first.back(), false, +// // [&](const handle_t handle) { +// // cerr << "\t" << snarl.get_id(handle); +// // }); +// // cerr << endl; +// } +// continue; +// } +// // copy over the vector of cur_haplotype: +// vector next_handle_vec(cur_haplotype.first); + +// // add the new handle to the vec: +// next_handle_vec.push_back(next_handle); + +// // if new_handle is the sink, put in haplotypes_from_source_to_sink +// if (_haploGraph.get_id(next_handle) == sink_id) { +// haplotypes_from_source_to_sink.push_back(next_handle_vec); +// } else // keep extending the haplotype! +// { +// pair, gbwt::SearchState> next_haplotype = +// make_pair(next_handle_vec, next_search); +// haplotype_queue.push_back(next_haplotype); +// } +// // next_handle will be touched. 
+// touched_handles.emplace(next_handle); +// } +// } +// // if next_searches is empty, the path has ended but not reached sink. +// else if (next_searches.empty()) { +// // We have reached the end of the path, but it doesn't reach the sink. +// // we need to add cur_haplotype to other_haplotypes. +// other_haplotypes.push_back(cur_haplotype.first); + +// } +// // if next_handle is the sink, put in haplotypes_from_source_to_sink +// else if (_haploGraph.get_id( +// _haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { +// // Then we need to add cur_haplotype + next_search to +// // haplotypes_from_source_to_sink. +// handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); +// cur_haplotype.first.push_back(next_handle); +// haplotypes_from_source_to_sink.push_back(cur_haplotype.first); + +// // touched next_search's handle +// touched_handles.emplace(next_handle); +// } +// // else, there is just one next_search, and it's not the end of the path. +// // just extend the search by adding (cur_haplotype + next_search to +// // haplotype_queue. +// else { +// // get the next_handle from the one next_search. +// handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); + +// // modify cur_haplotype with next_handle and next_search. +// cur_haplotype.first.push_back(next_handle); +// cur_haplotype.second = +// next_searches.back(); // there's only one next_search in next_searches. + +// // put cur_haplotype back in haplotype_queue. +// haplotype_queue.push_back(cur_haplotype); +// touched_handles.emplace(next_handle); +// } +// } + +// // Find any haplotypes starting from handles not starting at the source, but which +// // still start somewhere inside the snarl. 
+// vector> haplotypes_not_starting_at_source = +// find_haplotypes_not_at_source(touched_handles, sink_id); + +// // move haplotypes_not_starting_at_source into other_haplotypes: +// other_haplotypes.reserve(other_haplotypes.size() + +// haplotypes_not_starting_at_source.size()); +// move(haplotypes_not_starting_at_source.begin(), +// haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); + +// //todo: debug_statement +// cerr << "lets look through all the haplotypes after extraction:" << endl; +// for (vector hap_vec : haplotypes_from_source_to_sink) { +// cerr << "new hap:" << endl; +// for (handle_t handle : hap_vec){ +// cerr << _haploGraph.get_id(handle) << " " << _haploGraph.get_sequence(handle) << endl; +// } +// } + +// return tuple>, vector>, +// unordered_set>{haplotypes_from_source_to_sink, +// other_haplotypes, touched_handles}; +// } + + +// // Used to complete the traversal of a snarl along its haplotype threads, when there are +// // handles connected to the snarl by threads that start after the source handle. (Threads +// // that merely end before the sink handle are addressed in extract_gbwt_haplotypes). +// // Arguments: +// // _haploGraph: the GBWTgraph containing the haplotype threads. +// // touched_handles: any handles found in the snarl so far. +// // sink_id: the id of the final handle in the snarl. +// // Returns: +// // a vector of haplotypes in vector format that start in the middle of the +// // snarl. +// vector> +// SnarlNormalizer::find_haplotypes_not_at_source(unordered_set &touched_handles, +// const id_t &sink_id) { +// // cerr << "find_haplotypes_not_at_source" << endl; + +// /// Search every handle in touched handles for haplotypes starting at that point. +// // Any new haplotypes will be added to haplotype_queue. +// vector, gbwt::SearchState>> haplotype_queue; + +// // Fully extended haplotypes (or haplotypes extended to the snarl's sink) +// // will be added to finished_haplotypes. 
+// vector> finished_haplotypes; + +// // In addition, we need to put the new handle into to_search, because a path may have +// // started on the new handle (which means we need to start a searchstate there.) +// unordered_set to_search; + +// // We don't need to ever check the sink handle, since paths from the sink handle +// // extend beyond snarl. +// handle_t sink_handle = _haploGraph.get_handle(sink_id); +// // touched_handles.erase(sink_handle); + +// // Nested function for making a new_search. Identifies threads starting at a given +// // handle and +// // either adds them as a full haplotype (if the haplotype is one handle long) or +// // makes a new entry to haplotype_queue. +// auto make_new_search = [&](handle_t handle) { +// // Are there any new threads starting at this handle? +// gbwt::SearchState new_search = +// _haploGraph.index->prefix(_haploGraph.handle_to_node(handle)); +// if (!new_search.empty()) { +// // Then add them to haplotype_queue. +// _haploGraph.follow_paths( +// new_search, [&](const gbwt::SearchState &next_search) -> bool { +// handle_t next_handle = _haploGraph.node_to_handle(next_search.node); + +// /// check to make sure that the thread isn't already finished: +// // if next_handle is the sink, or if this thread is only one handle +// // long, then there isn't any useful string to extract from this. +// if (next_handle != sink_handle || +// next_search == gbwt::SearchState()) { +// // establish a new thread to walk along. +// vector new_path; +// new_path.push_back(handle); +// new_path.push_back(next_handle); + +// pair, gbwt::SearchState> mypair = +// make_pair(new_path, next_search); + +// // add the new path to haplotype_queue to be extended. +// haplotype_queue.push_back(make_pair(new_path, next_search)); + +// // if next_handle hasn't been checked for starting threads, add to +// // to_search. 
+// if (touched_handles.find(next_handle) == touched_handles.end()) { +// to_search.emplace(next_handle); +// } +// } +// return true; +// }); +// } +// }; + +// /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. +// /// Then, check to see if there are any new threads on handles in to_search. +// /// Extend those threads, and add any newly found handles to to_search, +// /// then search for threads again in to_search again... repeat until to_search remains +// /// emptied of new handles. + +// // for tracking whether the haplotype thread is still extending: +// bool still_extending; +// while (!to_search.empty() || !haplotype_queue.empty()) { +// while (!haplotype_queue.empty()) { +// // get a haplotype to extend out of haplotype_queue - a tuple of +// // (handles_traversed_so_far, last_touched_SearchState) +// pair, gbwt::SearchState> cur_haplotype = +// haplotype_queue.back(); +// haplotype_queue.pop_back(); + +// // get all the subsequent search_states that immediately follow the +// // searchstate from cur_haplotype. +// vector next_searches; +// _haploGraph.follow_paths(cur_haplotype.second, +// [&](const gbwt::SearchState &next_search) -> bool { +// next_searches.push_back(next_search); +// return true; +// }); + +// for (gbwt::SearchState next_search : next_searches) { +// handle_t next_handle = _haploGraph.node_to_handle(next_search.node); + +// // if next_search is empty, then we've fallen off the thread, +// // and cur_haplotype can be placed in finished_haplotypes as is for this +// // thread. +// if (next_search == gbwt::SearchState()) { +// finished_haplotypes.push_back(cur_haplotype.first); +// } + +// // if next_search is on the sink_handle, +// // then cur_haplotype.first + next_search goes to finished_haplotypes. 
+// else if (_haploGraph.get_id(next_handle) == sink_id) { + +// // copy over the vector of cur_haplotype: +// vector next_handle_vec(cur_haplotype.first); +// // add next_handle +// next_handle_vec.push_back(next_handle); +// // place in finished_haplotypes +// finished_haplotypes.push_back(next_handle_vec); + +// // also, if next_handle hasn't been checked for new threads, add to +// // to_search. +// if (touched_handles.find(next_handle) != touched_handles.end()) { +// to_search.emplace(next_handle); +// } + +// } +// // otherwise, just place an extended cur_haplotype in haplotype_queue. +// else { +// // copy over cur_haplotype: +// pair, gbwt::SearchState> cur_haplotype_copy = +// cur_haplotype; +// // modify with next_handle/search +// cur_haplotype_copy.first.push_back(next_handle); +// cur_haplotype_copy.second = next_search; +// // place back in haplotype_queue for further extension. +// haplotype_queue.push_back(cur_haplotype_copy); + +// // also, if next_handle hasn't been checked for new threads, add to +// // to_search. +// if (touched_handles.find(next_handle) != touched_handles.end()) { +// to_search.emplace(next_handle); +// } +// } +// } +// } +// // Then, make more new_searches from the handles in to_search. +// for (handle_t handle : to_search) { +// make_new_search(handle); // will add to haplotype_queue if there's any +// // new_searches to be had. +// } +// to_search.clear(); +// } +// return finished_haplotypes; +// } // Given a vector of haplotypes of format vector< handle_t >, returns a vector of // haplotypes of @@ -838,14 +805,14 @@ void SnarlNormalizer::force_maximum_handle_size(MutableHandleGraph &graph, } // Finds all embedded paths that either start or end in a snarl (or both) defined by -// _cur_source_id, sink_id. +// source_id, sink_id. // returns a vector of the embedded paths, where each entry in the vector is defined // by the pair of step_handles closest to the beginning and end of the path. 
If the // path is fully contained within the snarl, these step_handles will the be the // leftmost and rightmost handles in the path. // Arguments: // _graph: a pathhandlegraph containing the snarl with embedded paths. -// _cur_source_id: the source of the snarl of interest. +// source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. // Returns: // a vector containing all the embedded paths in the snarl, in pair< step_handle_t, @@ -956,7 +923,7 @@ SnarlNormalizer::extract_embedded_paths_in_snarl(const PathHandleGraph &graph, // Given a start and end node id, construct an extract subgraph between the two nodes // (inclusive). Arguments: // _graph: a pathhandlegraph containing the snarl with embedded paths. -// _cur_source_id: the source of the snarl of interest. +// source_id: the source of the snarl of interest. // sink_id: the sink of the snarl of interest. // Returns: // a SubHandleGraph containing only the handles in _graph that are between start_id @@ -1021,7 +988,7 @@ SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, } // Integrates the snarl into the _graph, replacing the snarl occupying the space between -// _cur_source_id and sink_id. +// source_id and sink_id. // In the process, transfers any embedded paths traversing the old snarl into the new // snarl. // Arguments: @@ -1032,7 +999,7 @@ SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, // old_embedded_path, and pair.second is the step_handle *after* // the last step_handle of interest in the old_embedded_path (can // be the null step at the end of the path.) -// _cur_source_id: the source of the old (to be replaced) snarl in _graph +// source_id: the source of the old (to be replaced) snarl in _graph // sink_id: the sink of the old (to be replaced) snarl in _graph. // Return: None. 
// TODO: Note: How to ensure that step_handle_t's walk along the snarl in the same @@ -1045,7 +1012,8 @@ SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, // about this. void SnarlNormalizer::integrate_snarl( const HandleGraph &to_insert_snarl, - const vector> embedded_paths) { + const vector> embedded_paths, + const id_t &source_id, const id_t &sink_id) { // cerr << "integrate_snarl" << endl; //todo: debug_statement @@ -1056,7 +1024,7 @@ void SnarlNormalizer::integrate_snarl( }); cerr << endl; // Get old _graph snarl - SubHandleGraph old_snarl = extract_subgraph(_graph, _cur_source_id, _cur_sink_id); + SubHandleGraph old_snarl = extract_subgraph(_graph, source_id, sink_id); // TODO: debug_statement: Check to make sure that newly made snarl has only one start // and end. @@ -1067,7 +1035,7 @@ void SnarlNormalizer::integrate_snarl( if (to_insert_snarl_defining_handles.first.size() > 1 || to_insert_snarl_defining_handles.second.size() > 1) { - cerr << "ERROR: newly made snarl from a snarl starting at " << _cur_source_id + cerr << "ERROR: newly made snarl from a snarl starting at " << source_id << " has more than one start or end. # of starts: " << to_insert_snarl_defining_handles.first.size() << " # of ends: " << to_insert_snarl_defining_handles.second.size() << endl; @@ -1129,11 +1097,11 @@ void SnarlNormalizer::integrate_snarl( // source and sink. 
// source integration: _graph.follow_edges( - _graph.get_handle(_cur_source_id), true, [&](const handle_t &prev_handle) { + _graph.get_handle(source_id), true, [&](const handle_t &prev_handle) { _graph.create_edge(prev_handle, _graph.get_handle(temp_snarl_source_id)); }); _graph.follow_edges( - _graph.get_handle(_cur_sink_id), false, [&](const handle_t &next_handle) { + _graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { _graph.create_edge(_graph.get_handle(temp_snarl_sink_id), next_handle); }); @@ -1142,7 +1110,7 @@ void SnarlNormalizer::integrate_snarl( // //todo: debug_statement // cerr << "the new sink id: " << temp_snarl_sink_id << endl; move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_source_id, - temp_snarl_sink_id, _cur_source_id, _cur_sink_id); + temp_snarl_sink_id, source_id, sink_id); } // Destroy the old snarl. @@ -1159,9 +1127,9 @@ void SnarlNormalizer::integrate_snarl( // same snarl manager. Couldn't replace it before b/c we needed the old handles to // move the paths. handle_t new_source_handle = _graph.create_handle( - _graph.get_sequence(_graph.get_handle(temp_snarl_source_id)), _cur_source_id); + _graph.get_sequence(_graph.get_handle(temp_snarl_source_id)), source_id); handle_t new_sink_handle = _graph.create_handle( - _graph.get_sequence(new_snarl_topo_order.back()), _cur_sink_id); + _graph.get_sequence(new_snarl_topo_order.back()), sink_id); // move the source edges: // TODO: note the copy/paste. Ask if there's a better way to do this (I totally could @@ -1693,9 +1661,9 @@ SnarlNormalizer::debug_get_sources_and_sinks(const HandleGraph &graph) { // from source to sink. Generates a combinatorial number of possible paths with splits // in the snarl. 
pair, unordered_set> -SnarlNormalizer::source_to_sink_exhaustive_path_finder() { +SnarlNormalizer::source_to_sink_exhaustive_path_finder(const id_t &source_id, const id_t &sink_id) { // cerr << "debug_graph_to_strings" << endl; - SubHandleGraph snarl = extract_subgraph(_graph, _cur_source_id, _cur_sink_id); + SubHandleGraph snarl = extract_subgraph(_graph, source_id, sink_id); unordered_set touched_handles; diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index af487ef7060..6ea796a5907 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -31,15 +31,14 @@ class SnarlNormalizer { // the maximum number of threads allowed to align in a given snarl. If the number of // threads exceeds this threshold, the snarl is skipped. int _max_alignment_size; - id_t _cur_source_id; - id_t _cur_sink_id; const string &_path_finder; tuple>, vector>, unordered_set> extract_gbwt_haplotypes(const SubHandleGraph &snarl, const id_t &source_id, const id_t &sink_id); - pair, unordered_set> source_to_sink_exhaustive_path_finder(); + pair, unordered_set> + source_to_sink_exhaustive_path_finder(const id_t &source_id, const id_t &sink_id); vector> find_haplotypes_not_at_source(unordered_set &touched_handles, @@ -60,7 +59,8 @@ class SnarlNormalizer { const id_t &end_id); void integrate_snarl(const HandleGraph &new_snarl, - const vector> embedded_paths); + const vector> embedded_paths, + const id_t &source_id, const id_t &sink_id); void move_path_to_snarl(const pair &old_embedded_path, vector &new_snarl_handles, id_t &new_source_id, diff --git a/src/algorithms/0_snarl_sequences.cpp b/src/algorithms/0_snarl_sequences.cpp new file mode 100644 index 00000000000..1b41d8e5554 --- /dev/null +++ b/src/algorithms/0_snarl_sequences.cpp @@ -0,0 +1,361 @@ +#include "0_snarl_sequences.hpp" +// #include +#include +#include "../handle.hpp" + +#include "../subgraph.hpp" + + +namespace vg { 
+SnarlSequences::SnarlSequences(const SubHandleGraph &snarl, + const gbwtgraph::GBWTGraph &haploGraph, + const id_t &source_id, const id_t &sink_id) + : _haploGraph(haploGraph), _snarl(snarl), _source_id(source_id), _sink_id(sink_id) {} + +// TODO: test that it successfully extracts any haplotypes that start/end in the middle of +// TODO: the snarl. +/** + * Finds all haplotypes in gbwt associated with snarl. + * @param snarl The subhandlegraph of the snarl to be normalized. + * @param source_id The source of the snarl. + * @param sink_id The sink of the snarl. + * @return A 3-tuple containing 1) a vector of haps stretching from source to sink, in + * vector format; 2) a second vector containing all other haps in snarl; + * 3) a vector of all handles oberved by the method. +*/ +tuple>, vector>, unordered_set> +SnarlSequences::find_gbwt_haps() { + /** + * haplotype_queue contains all started exon_haplotypes not completed yet. + * Every time we encounter a branch in the paths, the next node down the path + * Is stored here, along with the vector of handles that represents the path up + * to the SearchState. + */ + vector, gbwt::SearchState>> haplotype_queue; + + // source and sink handle for _haploGraph: + handle_t source_handle = _haploGraph.get_handle(_source_id); + handle_t sink_handle = _haploGraph.get_handle(_sink_id); + + // place source in haplotype_queue. + vector source_handle_vec(1, source_handle); + gbwt::SearchState source_state = _haploGraph.get_state(source_handle); + haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); + + // touched_handles contains all handles that have been touched by the + // depth first search below, for later use in other_haplotypes_to_strings, which + // identifies paths that didn't stretch from source to sink in the snarl. 
+ unordered_set touched_handles{source_handle, sink_handle}; + + // these haplotype vecs contains all "finished" haplotypes - those that were either + // walked to their conclusion, or until they reached the sink. + vector> haplotypes_from_source_to_sink; + vector> other_haplotypes; + + // sometimes a gbwt thread will indicate a connection between two handles that doesn't + // actually exist in the _graph. These connections need to be ignored. + unordered_set incorrect_connections; + + // for every partly-extracted thread, extend the thread until it either reaches + // the sink of the snarl or the end of the thread. + while (!haplotype_queue.empty()) { + // todo: debug_statement + // cerr << "haplotype queue: "; + // cerr << "size of queue:" << haplotype_queue.size() << " " << endl; + // for (auto hap : haplotype_queue) { + // cerr << "size: " << hap.first.size() << endl << "handle_ids: "; + // for (handle_t handle : hap.first) { + // cerr << _haploGraph.get_id(handle) << " "; + // } + // cerr << endl; + // } + + // get a haplotype out of haplotype_queue to extend - + // a tuple of (handles_traversed_so_far, last_touched_SearchState) + pair, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the searchstate + // from cur_haplotype. + vector next_searches; + _haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + // if next_searches > 1, then we need to make multiple new haplotypes to be + // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. 
+ if (next_searches.size() > 1) { + // for every next_search in next_searches, either create a new, extended + // cur_haplotype to push into haplotype queue, or place in the + // haplotypes_from_source_to_sink if haplotype extends to sink, or place in + // the other_haplotypes if haplotype ends before reaching sink. + for (gbwt::SearchState next_search : next_searches) { + handle_t next_handle = _haploGraph.node_to_handle(next_search.node); + // if (!snarl.has_node(snarl.get_id(next_handle)) && + // make_pair(_haploGraph.get_id(cur_haplotype.first.back()),_haploGraph.get_id(next_handle))) + // { + if (!_snarl.has_edge(cur_haplotype.first.back(), next_handle)) { + if (incorrect_connections.find( + _snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == + incorrect_connections.end()) { + cerr << "_snarl starting at node " << _source_id + << " and ending at " << _sink_id + << " has a thread that incorrectly connects two nodes that " + "don't have any edge connecting them. These two nodes are " + << _haploGraph.get_id(cur_haplotype.first.back()) << " and " + << _haploGraph.get_id(next_handle) + << ". This thread connection will be ignored." 
<< endl; + incorrect_connections.emplace( + _snarl.edge_handle(cur_haplotype.first.back(), next_handle)); + + // todo: debug_statement + // cerr << "next handle(s) of handle " + // << _snarl.get_id(cur_haplotype.first.back()) + // << " according to _snarl:" << endl; + // _snarl.follow_edges(cur_haplotype.first.back(), false, + // [&](const handle_t handle) { + // cerr << "\t" << _snarl.get_id(handle); + // }); + // cerr << endl; + } + continue; + } + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + + // add the new handle to the vec: + next_handle_vec.push_back(next_handle); + + // if new_handle is the sink, put in haplotypes_from_source_to_sink + if (_haploGraph.get_id(next_handle) == _sink_id) { + haplotypes_from_source_to_sink.push_back(next_handle_vec); + } else // keep extending the haplotype! + { + pair, gbwt::SearchState> next_haplotype = + make_pair(next_handle_vec, next_search); + haplotype_queue.push_back(next_haplotype); + } + // next_handle will be touched. + touched_handles.emplace(next_handle); + } + } + // if next_searches is empty, the path has ended but not reached sink. + else if (next_searches.empty()) { + // We have reached the end of the path, but it doesn't reach the sink. + // we need to add cur_haplotype to other_haplotypes. + other_haplotypes.push_back(cur_haplotype.first); + + } + // if next_handle is the sink, put in haplotypes_from_source_to_sink + else if (_haploGraph.get_id( + _haploGraph.node_to_handle(next_searches.back().node)) == _sink_id) { + // Then we need to add cur_haplotype + next_search to + // haplotypes_from_source_to_sink. + handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); + cur_haplotype.first.push_back(next_handle); + haplotypes_from_source_to_sink.push_back(cur_haplotype.first); + + // touched next_search's handle + touched_handles.emplace(next_handle); + } + // else, there is just one next_search, and it's not the end of the path. 
+ // just extend the search by adding (cur_haplotype + next_search to + // haplotype_queue. + else { + // get the next_handle from the one next_search. + handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); + + // modify cur_haplotype with next_handle and next_search. + cur_haplotype.first.push_back(next_handle); + cur_haplotype.second = + next_searches.back(); // there's only one next_search in next_searches. + + // put cur_haplotype back in haplotype_queue. + haplotype_queue.push_back(cur_haplotype); + touched_handles.emplace(next_handle); + } + } + + // Find any haplotypes starting from handles not starting at the source, but which + // still start somewhere inside the snarl. + vector> haplotypes_not_starting_at_source = + find_haplotypes_not_at_source(touched_handles); + + // move haplotypes_not_starting_at_source into other_haplotypes: + other_haplotypes.reserve(other_haplotypes.size() + + haplotypes_not_starting_at_source.size()); + move(haplotypes_not_starting_at_source.begin(), + haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); + + //todo: debug_statement + cerr << "lets look through all the haplotypes after extraction:" << endl; + for (vector hap_vec : haplotypes_from_source_to_sink) { + cerr << "new hap:" << endl; + for (handle_t handle : hap_vec){ + cerr << _haploGraph.get_id(handle) << " " << _haploGraph.get_sequence(handle) << endl; + } + } + + return tuple>, vector>, + unordered_set>{haplotypes_from_source_to_sink, + other_haplotypes, touched_handles}; +} + + +// Used to complete the traversal of a snarl along its haplotype threads, when there are +// handles connected to the snarl by threads that start after the source handle. (Threads +// that merely end before the sink handle are addressed in extract_gbwt_haplotypes). +// Arguments: +// _haploGraph: the GBWTgraph containing the haplotype threads. +// touched_handles: any handles found in the snarl so far. 
+// sink_id: the id of the final handle in the snarl. +// Returns: +// a vector of haplotypes in vector format that start in the middle of the +// snarl. +vector> +SnarlSequences::find_haplotypes_not_at_source(unordered_set &touched_handles) { + // cerr << "find_haplotypes_not_at_source" << endl; + + /// Search every handle in touched handles for haplotypes starting at that point. + // Any new haplotypes will be added to haplotype_queue. + vector, gbwt::SearchState>> haplotype_queue; + + // Fully extended haplotypes (or haplotypes extended to the snarl's sink) + // will be added to finished_haplotypes. + vector> finished_haplotypes; + + // In addition, we need to put the new handle into to_search, because a path may have + // started on the new handle (which means we need to start a searchstate there.) + unordered_set to_search; + + // We don't need to ever check the sink handle, since paths from the sink handle + // extend beyond snarl. + handle_t sink_handle = _haploGraph.get_handle(_sink_id); + // touched_handles.erase(sink_handle); + + // Nested function for making a new_search. Identifies threads starting at a given + // handle and + // either adds them as a full haplotype (if the haplotype is one handle long) or + // makes a new entry to haplotype_queue. + auto make_new_search = [&](handle_t handle) { + // Are there any new threads starting at this handle? + gbwt::SearchState new_search = + _haploGraph.index->prefix(_haploGraph.handle_to_node(handle)); + if (!new_search.empty()) { + // Then add them to haplotype_queue. + _haploGraph.follow_paths( + new_search, [&](const gbwt::SearchState &next_search) -> bool { + handle_t next_handle = _haploGraph.node_to_handle(next_search.node); + + /// check to make sure that the thread isn't already finished: + // if next_handle is the sink, or if this thread is only one handle + // long, then there isn't any useful string to extract from this. 
+ if (next_handle != sink_handle || + next_search == gbwt::SearchState()) { + // establish a new thread to walk along. + vector new_path; + new_path.push_back(handle); + new_path.push_back(next_handle); + + pair, gbwt::SearchState> mypair = + make_pair(new_path, next_search); + + // add the new path to haplotype_queue to be extended. + haplotype_queue.push_back(make_pair(new_path, next_search)); + + // if next_handle hasn't been checked for starting threads, add to + // to_search. + if (touched_handles.find(next_handle) == touched_handles.end()) { + to_search.emplace(next_handle); + } + } + return true; + }); + } + }; + + /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. + /// Then, check to see if there are any new threads on handles in to_search. + /// Extend those threads, and add any newly found handles to to_search, + /// then search for threads again in to_search again... repeat until to_search remains + /// emptied of new handles. + + // for tracking whether the haplotype thread is still extending: + bool still_extending; + while (!to_search.empty() || !haplotype_queue.empty()) { + while (!haplotype_queue.empty()) { + // get a haplotype to extend out of haplotype_queue - a tuple of + // (handles_traversed_so_far, last_touched_SearchState) + pair, gbwt::SearchState> cur_haplotype = + haplotype_queue.back(); + haplotype_queue.pop_back(); + + // get all the subsequent search_states that immediately follow the + // searchstate from cur_haplotype. + vector next_searches; + _haploGraph.follow_paths(cur_haplotype.second, + [&](const gbwt::SearchState &next_search) -> bool { + next_searches.push_back(next_search); + return true; + }); + + for (gbwt::SearchState next_search : next_searches) { + handle_t next_handle = _haploGraph.node_to_handle(next_search.node); + + // if next_search is empty, then we've fallen off the thread, + // and cur_haplotype can be placed in finished_haplotypes as is for this + // thread. 
+ if (next_search == gbwt::SearchState()) { + finished_haplotypes.push_back(cur_haplotype.first); + } + + // if next_search is on the sink_handle, + // then cur_haplotype.first + next_search goes to finished_haplotypes. + else if (_haploGraph.get_id(next_handle) == _sink_id) { + + // copy over the vector of cur_haplotype: + vector next_handle_vec(cur_haplotype.first); + // add next_handle + next_handle_vec.push_back(next_handle); + // place in finished_haplotypes + finished_haplotypes.push_back(next_handle_vec); + + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { + to_search.emplace(next_handle); + } + + } + // otherwise, just place an extended cur_haplotype in haplotype_queue. + else { + // copy over cur_haplotype: + pair, gbwt::SearchState> cur_haplotype_copy = + cur_haplotype; + // modify with next_handle/search + cur_haplotype_copy.first.push_back(next_handle); + cur_haplotype_copy.second = next_search; + // place back in haplotype_queue for further extension. + haplotype_queue.push_back(cur_haplotype_copy); + + // also, if next_handle hasn't been checked for new threads, add to + // to_search. + if (touched_handles.find(next_handle) != touched_handles.end()) { + to_search.emplace(next_handle); + } + } + } + } + // Then, make more new_searches from the handles in to_search. + for (handle_t handle : to_search) { + make_new_search(handle); // will add to haplotype_queue if there's any + // new_searches to be had. 
+ } + to_search.clear(); + } + return finished_haplotypes; +} + +} \ No newline at end of file diff --git a/src/algorithms/0_snarl_sequences.hpp b/src/algorithms/0_snarl_sequences.hpp new file mode 100644 index 00000000000..49340559bc5 --- /dev/null +++ b/src/algorithms/0_snarl_sequences.hpp @@ -0,0 +1,31 @@ +#include +#include "../handle.hpp" +#include "../subgraph.hpp" + + +namespace vg { + +class SnarlSequences { + public: + virtual ~SnarlSequences() = default; + + SnarlSequences(const SubHandleGraph &snarl, + const gbwtgraph::GBWTGraph &haploGraph, const id_t &source_id, + const id_t &sink_id); + + tuple>, vector>, unordered_set> + find_gbwt_haps(); + + protected: + // member variables: + // the handle graph with snarls to normalize + const SubHandleGraph &_snarl; + // GBWT graph with snarls to normalize, includes the embedded threads needed for the + // GBWTPathFinder approach. + const gbwtgraph::GBWTGraph &_haploGraph; + const id_t _source_id; + const id_t _sink_id; + + vector> find_haplotypes_not_at_source(unordered_set &touched_handles); +}; +} \ No newline at end of file From 825bea678a37bd6c8d0577d5e4e209a737d6a432 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 24 Nov 2020 16:35:32 -0800 Subject: [PATCH 42/63] exhaustive sequence finding moved to SnarlSequenceFinder --- src/algorithms/0_oo_normalize_snarls.cpp | 15 +- src/algorithms/0_oo_normalize_snarls.hpp | 3 +- ...uences.cpp => 0_snarl_sequence_finder.cpp} | 132 +++++++++++++++++- ...uences.hpp => 0_snarl_sequence_finder.hpp} | 10 +- src/subcommand/0_normalize_main.cpp | 4 +- 5 files changed, 146 insertions(+), 18 deletions(-) rename src/algorithms/{0_snarl_sequences.cpp => 0_snarl_sequence_finder.cpp} (80%) rename src/algorithms/{0_snarl_sequences.hpp => 0_snarl_sequence_finder.hpp} (77%) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index b1149e5f3b3..ecc84bea1c9 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ 
b/src/algorithms/0_oo_normalize_snarls.cpp @@ -1,8 +1,8 @@ #pragma once // TODO: remove this, to avoid warnings + maybe bad coding practice? #include "0_oo_normalize_snarls.hpp" -#include "0_snarl_sequences.hpp" -#include +#include "0_snarl_sequence_finder.hpp" +// #include #include #include @@ -30,7 +30,7 @@ TODO: test that extract_gbwt haplotypes successfully extracts any haplotypes tha TODO: the snarl. */ namespace vg { - +namespace algorithms{ /** * To "normalize" a snarl, SnarlNormalizer extracts all the sequences in the snarl as * represented in the gbwt, and then realigns them to create a replacement snarl. @@ -159,7 +159,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // extract threads tuple, vector>, unordered_set> haplotypes; - SnarlSequences sequence_finder = SnarlSequences(snarl, _haploGraph, source_id, sink_id); + SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(snarl, _haploGraph, source_id, sink_id); if (_path_finder == "GBWT") { tuple>, vector>, unordered_set> @@ -170,7 +170,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & get<2>(haplotypes) = get<2>(gbwt_haplotypes); } else if (_path_finder == "exhaustive") { pair, unordered_set> exhaustive_haplotypes = - source_to_sink_exhaustive_path_finder(source_id, sink_id); + sequence_finder.find_exhaustive_paths(); get<0>(haplotypes) = exhaustive_haplotypes.first; get<2>(haplotypes) = exhaustive_haplotypes.second; } else { @@ -1702,7 +1702,7 @@ SnarlNormalizer::source_to_sink_exhaustive_path_finder(const id_t &source_id, co // count walks by dynamic programming bool overflowed = false; - for (const handle_t &handle : algorithms::lazier_topological_order(&snarl)) { + for (const handle_t &handle : lazier_topological_order(&snarl)) { touched_handles.emplace(handle); size_t count_here = count[handle]; vector seqs_here = sequences[handle]; @@ -1745,4 +1745,5 @@ SnarlNormalizer::source_to_sink_exhaustive_path_finder(const id_t &source_id, 
co return make_pair(walks, touched_handles); } -} // namespace vg \ No newline at end of file +} +} \ No newline at end of file diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index 6ea796a5907..44344f75f1d 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -8,6 +8,7 @@ namespace vg { +namespace algorithms { class SnarlNormalizer { public: @@ -81,5 +82,5 @@ class SnarlNormalizer { pair, vector> debug_get_sources_and_sinks(const HandleGraph &graph); }; - +} } // namespace vg diff --git a/src/algorithms/0_snarl_sequences.cpp b/src/algorithms/0_snarl_sequence_finder.cpp similarity index 80% rename from src/algorithms/0_snarl_sequences.cpp rename to src/algorithms/0_snarl_sequence_finder.cpp index 1b41d8e5554..32c2c3a99dc 100644 --- a/src/algorithms/0_snarl_sequences.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -1,13 +1,42 @@ -#include "0_snarl_sequences.hpp" +#include "0_snarl_sequence_finder.hpp" + +// #include "0_oo_normalize_snarls.hpp" +// #include "0_snarl_sequence_finder.hpp" // #include +// #include + +// #include +// #include +// #include + +// #include + +// #include "../gbwt_helper.hpp" +// #include "../handle.hpp" +// #include "../msa_converter.hpp" +// #include "../snarls.hpp" +// #include "../vg.hpp" +// #include "is_acyclic.hpp" + +// #include "../types.hpp" +// #include "extract_containing_graph.hpp" + +// #include + +// #include "../msa_converter.hpp" +// #include "vg.hpp" + +#include "topological_sort.hpp" + + #include #include "../handle.hpp" - #include "../subgraph.hpp" namespace vg { -SnarlSequences::SnarlSequences(const SubHandleGraph &snarl, +namespace algorithms{ +SnarlSequenceFinder::SnarlSequenceFinder(const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id) : _haploGraph(haploGraph), _snarl(snarl), _source_id(source_id), _sink_id(sink_id) {} @@ -24,7 +53,7 @@ 
SnarlSequences::SnarlSequences(const SubHandleGraph &snarl, * 3) a vector of all handles oberved by the method. */ tuple>, vector>, unordered_set> -SnarlSequences::find_gbwt_haps() { +SnarlSequenceFinder::find_gbwt_haps() { /** * haplotype_queue contains all started exon_haplotypes not completed yet. * Every time we encounter a branch in the paths, the next node down the path @@ -215,7 +244,7 @@ SnarlSequences::find_gbwt_haps() { // a vector of haplotypes in vector format that start in the middle of the // snarl. vector> -SnarlSequences::find_haplotypes_not_at_source(unordered_set &touched_handles) { +SnarlSequenceFinder::find_haplotypes_not_at_source(unordered_set &touched_handles) { // cerr << "find_haplotypes_not_at_source" << endl; /// Search every handle in touched handles for haplotypes starting at that point. @@ -358,4 +387,97 @@ SnarlSequences::find_haplotypes_not_at_source(unordered_set &touched_h return finished_haplotypes; } +/////////////////////////////////// +// exhaustive sequence extraction: +/////////////////////////////////// +// Runs through the whole snarl and generates all possible strings representing walks +// from source to sink. Generates a combinatorial number of possible paths with splits +// in the snarl. +//todo: for consistency, have source_to_sink_exhaustive_path_finder return paths in format +//todo: vector> instead of vector +pair, unordered_set> +SnarlSequenceFinder::find_exhaustive_paths() { + // cerr << "debug_graph_to_strings" << endl; + unordered_set touched_handles; + + unordered_map> sequences; + vector sinks; + unordered_map count; + count.reserve(_snarl.get_node_count()); // resize count to contain enough buckets + // for size of _snarl + sequences.reserve(_snarl.get_node_count()); // resize sequences to contain enough + // buckets for size of _snarl + + // identify sources and sinks //TODO: once we've established that this fxn works, + // we can just use start_id and sink_id. 
+ _snarl.for_each_handle([&](const handle_t &handle) { + bool is_source = true, is_sink = true; + _snarl.follow_edges(handle, true, [&](const handle_t &prev) { + is_source = false; + return false; + }); + _snarl.follow_edges(handle, false, [&](const handle_t &next) { + is_sink = false; + return false; + }); + + // base case for dynamic programming + if (is_source) { + count[handle] = 1; + sequences[handle].push_back( + _snarl.get_sequence(handle)); // TODO: presented in the handle's local + // forward orientation. An issue? + } + if (is_sink) { + sinks.emplace_back(handle); + } + }); + + // count walks by dynamic programming + bool overflowed = false; + for (const handle_t &handle : lazier_topological_order(&_snarl)) { + touched_handles.emplace(handle); + size_t count_here = count[handle]; + vector seqs_here = sequences[handle]; + + _snarl.follow_edges(handle, false, [&](const handle_t &next) { + size_t &count_next = count[next]; + string seq_next = _snarl.get_sequence(next); + + if (numeric_limits::max() - count_here < count_next) { + overflowed = true; + } + + else { + count_next += count_here; + for (string seq : seqs_here) { + sequences[next].push_back(seq + seq_next); + } + } + }); + /// TODO: figure out how to deal with overflow. + // if (overflowed) { + // return numeric_limits::max(); + // } + } + + // total up the walks at the sinks + size_t total_count = 0; + for (handle_t &sink : sinks) { + total_count += count[sink]; + } + + // all the sequences at the sinks will be all the sequences in the _snarl. 
+ vector walks; + for (handle_t &sink : sinks) { + for (string seq : sequences[sink]) { + walks.push_back(seq); + } + } + + return make_pair(walks, touched_handles); +} + + +} } \ No newline at end of file diff --git a/src/algorithms/0_snarl_sequences.hpp b/src/algorithms/0_snarl_sequence_finder.hpp similarity index 77% rename from src/algorithms/0_snarl_sequences.hpp rename to src/algorithms/0_snarl_sequence_finder.hpp index 49340559bc5..75d1e3176f5 100644 --- a/src/algorithms/0_snarl_sequences.hpp +++ b/src/algorithms/0_snarl_sequence_finder.hpp @@ -4,18 +4,21 @@ namespace vg { +namespace algorithms { -class SnarlSequences { +class SnarlSequenceFinder { public: - virtual ~SnarlSequences() = default; + virtual ~SnarlSequenceFinder() = default; - SnarlSequences(const SubHandleGraph &snarl, + SnarlSequenceFinder(const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id); tuple>, vector>, unordered_set> find_gbwt_haps(); + pair, unordered_set> find_exhaustive_paths(); + protected: // member variables: // the handle graph with snarls to normalize @@ -28,4 +31,5 @@ class SnarlSequences { vector> find_haplotypes_not_at_source(unordered_set &touched_handles); }; +} } \ No newline at end of file diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index d0d278dab12..add0df34343 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -145,8 +145,8 @@ int main_normalize(int argc, char **argv) { // Record start time auto start = chrono::high_resolution_clock::now(); - SnarlNormalizer normalizer = - SnarlNormalizer(*graph, haploGraph, max_alignment_size); + algorithms::SnarlNormalizer normalizer = + algorithms::SnarlNormalizer(*graph, haploGraph, max_alignment_size); // run test code on all snarls in graph. 
normalizer.normalize_top_level_snarls(snarl_stream); From 3d27382cd329c13c0266f5489715d704df111e72 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 24 Nov 2020 16:55:00 -0800 Subject: [PATCH 43/63] moved find_embedded_paths to snarl_sequence_finder --- src/algorithms/0_oo_normalize_snarls.cpp | 224 ++++++++++----------- src/algorithms/0_snarl_sequence_finder.cpp | 124 +++++++++++- src/algorithms/0_snarl_sequence_finder.hpp | 15 +- 3 files changed, 242 insertions(+), 121 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index ecc84bea1c9..05c7d3121e9 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -159,7 +159,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // extract threads tuple, vector>, unordered_set> haplotypes; - SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(snarl, _haploGraph, source_id, sink_id); + SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, source_id, sink_id); if (_path_finder == "GBWT") { tuple>, vector>, unordered_set> @@ -197,7 +197,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // Get the embedded paths in the snarl from _graph, to move them to new_snarl. // Any embedded paths not in gbwt are aligned in the new snarl. vector> embedded_paths = - extract_embedded_paths_in_snarl(_graph, source_id, sink_id); + sequence_finder.find_embedded_paths(); //todo: debug_statement cerr << "Let's see what sequences I have before adding embedded paths to seq info:" << endl; @@ -804,120 +804,120 @@ void SnarlNormalizer::force_maximum_handle_size(MutableHandleGraph &graph, }); } -// Finds all embedded paths that either start or end in a snarl (or both) defined by -// source_id, sink_id. 
-// returns a vector of the embedded paths, where each entry in the vector is defined -// by the pair of step_handles closest to the beginning and end of the path. If the -// path is fully contained within the snarl, these step_handles will the be the -// leftmost and rightmost handles in the path. -// Arguments: -// _graph: a pathhandlegraph containing the snarl with embedded paths. -// source_id: the source of the snarl of interest. -// sink_id: the sink of the snarl of interest. -// Returns: -// a vector containing all the embedded paths in the snarl, in pair< step_handle_t, -// step_handle_t > > format. Pair.first is the first step in the path's range of -// interest, and pair.second is the step *after* the last step in the path's range of -// interest (can be the null step at end of path). -vector> -SnarlNormalizer::extract_embedded_paths_in_snarl(const PathHandleGraph &graph, - const id_t &source_id, - const id_t &sink_id) { - // cerr << "extract_embedded_paths_in_snarl" << endl; - // cerr << "source id: " << source_id << endl; - // cerr << "source id contains what paths?: " << endl; - // for (auto step : _graph.steps_of_handle(graph.get_handle(_source_id))) { - // cerr << "\t" << _graph.get_path_name(graph.get_path_handle_of_step(step)) << - // endl; - // } - // cerr << "neighbors of 71104? (should include 71097):" << endl; - // handle_t test_handle = _graph.get_handle(71104); - // _graph.follow_edges(test_handle, true, [&](const handle_t &handle) { - // cerr << _graph.get_id(handle) << endl; - // }); - // cerr << "can I still access source handle?" - // << _graph.get_sequence(graph.get_handle(_source_id)) << endl; - - // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't - // extend the path to a point beyond the source or sink. - SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); - // key is path_handle, value is a step in that path from which to extend. 
- unordered_map paths_found; - - // look for handles with paths we haven't touched yet. - snarl.for_each_handle([&](const handle_t &handle) { - vector steps = graph.steps_of_handle(handle); - // do any of these steps belong to a path not in paths_found? - for (step_handle_t &step : steps) { - path_handle_t path = graph.get_path_handle_of_step(step); - // If it's a step along a new path, save the first step to that path we find. - // In addtion, if there are multiple steps found in the path, (The avoidance - // of source and sink here is to ensure that we can properly check to see if - // we've reached the end of an embedded path walking in any arbitrary - // direction (i.e. source towards sink or sink towards source). - if (paths_found.find(path) == paths_found.end() || - graph.get_id(graph.get_handle_of_step(paths_found[path])) == source_id || - graph.get_id(graph.get_handle_of_step(paths_found[path])) == sink_id) { - // then we need to mark it as found and save the step. - paths_found[path] = step; - } - } - }); - - // todo: debug_statement - // cerr << "################looking for new paths################" << endl; - // for (auto path : paths_found) { - // cerr << _graph.get_path_name(path.first) << " " - // << _graph.get_id(graph.get_handle_of_step(path.second)) << endl; - // } - - /// for each step_handle_t corresponding to a unique path, we want to get the steps - /// closest to both the end and beginning step that still remains in the snarl. - // TODO: Note copy paste of code here. In python I'd do "for fxn in [fxn1, fxn2]:", - // TODO so that I could iterate over the fxn. That sounds template-messy in C++ - // tho'. Should I? - vector> paths_in_snarl; - for (auto &it : paths_found) { - step_handle_t step = it.second; - // path_in_snarl describes the start and end steps in the path, - // as constrained by the snarl. - pair path_in_snarl; - - // Look for the step closest to the beginning of the path, as constrained by the - // snarl. 
- step_handle_t begin_in_snarl_step = step; - id_t begin_in_snarl_id = - _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); - - while ((begin_in_snarl_id != source_id) && - _graph.has_previous_step(begin_in_snarl_step)) { - begin_in_snarl_step = _graph.get_previous_step(begin_in_snarl_step); - begin_in_snarl_id = - _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); - } - path_in_snarl.first = begin_in_snarl_step; +// // Finds all embedded paths that either start or end in a snarl (or both) defined by +// // source_id, sink_id. +// // returns a vector of the embedded paths, where each entry in the vector is defined +// // by the pair of step_handles closest to the beginning and end of the path. If the +// // path is fully contained within the snarl, these step_handles will the be the +// // leftmost and rightmost handles in the path. +// // Arguments: +// // _graph: a pathhandlegraph containing the snarl with embedded paths. +// // source_id: the source of the snarl of interest. +// // sink_id: the sink of the snarl of interest. +// // Returns: +// // a vector containing all the embedded paths in the snarl, in pair< step_handle_t, +// // step_handle_t > > format. Pair.first is the first step in the path's range of +// // interest, and pair.second is the step *after* the last step in the path's range of +// // interest (can be the null step at end of path). +// vector> +// SnarlNormalizer::extract_embedded_paths_in_snarl(const PathHandleGraph &graph, +// const id_t &source_id, +// const id_t &sink_id) { +// // cerr << "extract_embedded_paths_in_snarl" << endl; +// // cerr << "source id: " << source_id << endl; +// // cerr << "source id contains what paths?: " << endl; +// // for (auto step : _graph.steps_of_handle(graph.get_handle(_source_id))) { +// // cerr << "\t" << _graph.get_path_name(graph.get_path_handle_of_step(step)) << +// // endl; +// // } +// // cerr << "neighbors of 71104? 
(should include 71097):" << endl; +// // handle_t test_handle = _graph.get_handle(71104); +// // _graph.follow_edges(test_handle, true, [&](const handle_t &handle) { +// // cerr << _graph.get_id(handle) << endl; +// // }); +// // cerr << "can I still access source handle?" +// // << _graph.get_sequence(graph.get_handle(_source_id)) << endl; + +// // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't +// // extend the path to a point beyond the source or sink. +// SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); +// // key is path_handle, value is a step in that path from which to extend. +// unordered_map paths_found; + +// // look for handles with paths we haven't touched yet. +// snarl.for_each_handle([&](const handle_t &handle) { +// vector steps = graph.steps_of_handle(handle); +// // do any of these steps belong to a path not in paths_found? +// for (step_handle_t &step : steps) { +// path_handle_t path = graph.get_path_handle_of_step(step); +// // If it's a step along a new path, save the first step to that path we find. +// // In addtion, if there are multiple steps found in the path, (The avoidance +// // of source and sink here is to ensure that we can properly check to see if +// // we've reached the end of an embedded path walking in any arbitrary +// // direction (i.e. source towards sink or sink towards source). +// if (paths_found.find(path) == paths_found.end() || +// graph.get_id(graph.get_handle_of_step(paths_found[path])) == source_id || +// graph.get_id(graph.get_handle_of_step(paths_found[path])) == sink_id) { +// // then we need to mark it as found and save the step. 
+// paths_found[path] = step; +// } +// } +// }); + +// // todo: debug_statement +// // cerr << "################looking for new paths################" << endl; +// // for (auto path : paths_found) { +// // cerr << _graph.get_path_name(path.first) << " " +// // << _graph.get_id(graph.get_handle_of_step(path.second)) << endl; +// // } + +// /// for each step_handle_t corresponding to a unique path, we want to get the steps +// /// closest to both the end and beginning step that still remains in the snarl. +// // TODO: Note copy paste of code here. In python I'd do "for fxn in [fxn1, fxn2]:", +// // TODO so that I could iterate over the fxn. That sounds template-messy in C++ +// // tho'. Should I? +// vector> paths_in_snarl; +// for (auto &it : paths_found) { +// step_handle_t step = it.second; +// // path_in_snarl describes the start and end steps in the path, +// // as constrained by the snarl. +// pair path_in_snarl; + +// // Look for the step closest to the beginning of the path, as constrained by the +// // snarl. +// step_handle_t begin_in_snarl_step = step; +// id_t begin_in_snarl_id = +// _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); + +// while ((begin_in_snarl_id != source_id) && +// _graph.has_previous_step(begin_in_snarl_step)) { +// begin_in_snarl_step = _graph.get_previous_step(begin_in_snarl_step); +// begin_in_snarl_id = +// _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); +// } +// path_in_snarl.first = begin_in_snarl_step; - // Look for the step closest to the end of the path, as constrained by the snarl. - step_handle_t end_in_snarl_step = step; - id_t end_in_snarl_id = _graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); +// // Look for the step closest to the end of the path, as constrained by the snarl. 
+// step_handle_t end_in_snarl_step = step; +// id_t end_in_snarl_id = _graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); - // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and - // _graph.has_next_step(end_in_snarl_step)) { - while (end_in_snarl_id != sink_id and graph.has_next_step(end_in_snarl_step)) { - end_in_snarl_step = graph.get_next_step(end_in_snarl_step); - end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); - } - // Note: when adding the end step, path notation convention requires that we add - // the null step at the end of the path (or the next arbitrary step, in the case - // of a path that extends beyond our snarl.) - // TODO: do we want the next arbitrary step in that latter case? - path_in_snarl.second = _graph.get_next_step(end_in_snarl_step); +// // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and +// // _graph.has_next_step(end_in_snarl_step)) { +// while (end_in_snarl_id != sink_id and graph.has_next_step(end_in_snarl_step)) { +// end_in_snarl_step = graph.get_next_step(end_in_snarl_step); +// end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); +// } +// // Note: when adding the end step, path notation convention requires that we add +// // the null step at the end of the path (or the next arbitrary step, in the case +// // of a path that extends beyond our snarl.) +// // TODO: do we want the next arbitrary step in that latter case? +// path_in_snarl.second = _graph.get_next_step(end_in_snarl_step); - paths_in_snarl.push_back(path_in_snarl); - } +// paths_in_snarl.push_back(path_in_snarl); +// } - return paths_in_snarl; -} +// return paths_in_snarl; +// } // TODO: change the arguments to handles, which contain orientation within themselves. 
// Given a start and end node id, construct an extract subgraph between the two nodes diff --git a/src/algorithms/0_snarl_sequence_finder.cpp b/src/algorithms/0_snarl_sequence_finder.cpp index 32c2c3a99dc..b8e5a9b6ef6 100644 --- a/src/algorithms/0_snarl_sequence_finder.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -36,10 +36,11 @@ namespace vg { namespace algorithms{ -SnarlSequenceFinder::SnarlSequenceFinder(const SubHandleGraph &snarl, +SnarlSequenceFinder::SnarlSequenceFinder(const PathHandleGraph & graph, + const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id) - : _haploGraph(haploGraph), _snarl(snarl), _source_id(source_id), _sink_id(sink_id) {} + : _graph(graph), _haploGraph(haploGraph), _snarl(snarl), _source_id(source_id), _sink_id(sink_id) {} // TODO: test that it successfully extracts any haplotypes that start/end in the middle of // TODO: the snarl. @@ -387,9 +388,124 @@ SnarlSequenceFinder::find_haplotypes_not_at_source(unordered_set &touc return finished_haplotypes; } -/////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////// +// embedded path finding: +////////////////////////////////////////////////////////////////////////////////////////// + +// Finds all embedded paths that either start or end in a snarl (or both) defined by +// source_id, sink_id. +// returns a vector of the embedded paths, where each entry in the vector is defined +// by the pair of step_handles closest to the beginning and end of the path. If the +// path is fully contained within the snarl, these step_handles will the be the +// leftmost and rightmost handles in the path. +// Arguments: +// _graph: a pathhandlegraph containing the snarl with embedded paths. +// source_id: the source of the snarl of interest. +// sink_id: the sink of the snarl of interest. 
+// Returns: +// a vector containing all the embedded paths in the snarl, in pair< step_handle_t, +// step_handle_t > > format. Pair.first is the first step in the path's range of +// interest, and pair.second is the step *after* the last step in the path's range of +// interest (can be the null step at end of path). +vector> +SnarlSequenceFinder::find_embedded_paths() { + // cerr << "extract_embedded_paths_in_snarl" << endl; + // cerr << "source id: " << source_id << endl; + // cerr << "source id contains what paths?: " << endl; + // for (auto step : _graph.steps_of_handle(graph.get_handle(_source_id))) { + // cerr << "\t" << _graph.get_path_name(graph.get_path_handle_of_step(step)) << + // endl; + // } + // cerr << "neighbors of 71104? (should include 71097):" << endl; + // handle_t test_handle = _graph.get_handle(71104); + // _graph.follow_edges(test_handle, true, [&](const handle_t &handle) { + // cerr << _graph.get_id(handle) << endl; + // }); + // cerr << "can I still access source handle?" + // << _graph.get_sequence(graph.get_handle(_source_id)) << endl; + + // key is path_handle, value is a step in that path from which to extend. + unordered_map paths_found; + + // look for handles with paths we haven't touched yet. + _snarl.for_each_handle([&](const handle_t &handle) { + vector steps = _graph.steps_of_handle(handle); + // do any of these steps belong to a path not in paths_found? + for (step_handle_t &step : steps) { + path_handle_t path = _graph.get_path_handle_of_step(step); + // If it's a step along a new path, save the first step to that path we find. + // In addtion, if there are multiple steps found in the path, (The avoidance + // of source and sink here is to ensure that we can properly check to see if + // we've reached the end of an embedded path walking in any arbitrary + // direction (i.e. source towards sink or sink towards source). 
+ if (paths_found.find(path) == paths_found.end() || + _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _source_id || + _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _sink_id) { + // then we need to mark it as found and save the step. + paths_found[path] = step; + } + } + }); + + // todo: debug_statement + // cerr << "################looking for new paths################" << endl; + // for (auto path : paths_found) { + // cerr << __graph.get_path_name(path.first) << " " + // << __graph.get_id(_graph.get_handle_of_step(path.second)) << endl; + // } + + /// for each step_handle_t corresponding to a unique path, we want to get the steps + /// closest to both the end and beginning step that still remains in the snarl. + // TODO: Note copy paste of code here. In python I'd do "for fxn in [fxn1, fxn2]:", + // TODO so that I could iterate over the fxn. That sounds template-messy in C++ + // tho'. Should I? + vector> paths_in_snarl; + for (auto &it : paths_found) { + step_handle_t step = it.second; + // path_in_snarl describes the start and end steps in the path, + // as constrained by the snarl. + pair path_in_snarl; + + // Look for the step closest to the beginning of the path, as constrained by the + // snarl. + step_handle_t begin_in_snarl_step = step; + id_t begin_in_snarl_id = + _graph.get_id(_graph.get_handle_of_step(begin_in_snarl_step)); + + while ((begin_in_snarl_id != _source_id) && + _graph.has_previous_step(begin_in_snarl_step)) { + begin_in_snarl_step = _graph.get_previous_step(begin_in_snarl_step); + begin_in_snarl_id = + _graph.get_id(_graph.get_handle_of_step(begin_in_snarl_step)); + } + path_in_snarl.first = begin_in_snarl_step; + + // Look for the step closest to the end of the path, as constrained by the snarl. 
+ step_handle_t end_in_snarl_step = step; + id_t end_in_snarl_id = _graph.get_id(_graph.get_handle_of_step(end_in_snarl_step)); + + // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and + // _graph.has_next_step(end_in_snarl_step)) { + while (end_in_snarl_id != _sink_id and _graph.has_next_step(end_in_snarl_step)) { + end_in_snarl_step = _graph.get_next_step(end_in_snarl_step); + end_in_snarl_id = _graph.get_id(_graph.get_handle_of_step(end_in_snarl_step)); + } + // Note: when adding the end step, path notation convention requires that we add + // the null step at the end of the path (or the next arbitrary step, in the case + // of a path that extends beyond our snarl.) + // TODO: do we want the next arbitrary step in that latter case? + path_in_snarl.second = _graph.get_next_step(end_in_snarl_step); + + paths_in_snarl.push_back(path_in_snarl); + } + + return paths_in_snarl; +} + +////////////////////////////////////////////////////////////////////////////////////////// // exhaustive sequence extraction: -/////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////// + // Runs through the whole snarl and generates all possible strings representing walks // from source to sink. Generates a combinatorial number of possible paths with splits // in the snarl. 
diff --git a/src/algorithms/0_snarl_sequence_finder.hpp b/src/algorithms/0_snarl_sequence_finder.hpp index 75d1e3176f5..ca7dc4ce953 100644 --- a/src/algorithms/0_snarl_sequence_finder.hpp +++ b/src/algorithms/0_snarl_sequence_finder.hpp @@ -10,7 +10,7 @@ class SnarlSequenceFinder { public: virtual ~SnarlSequenceFinder() = default; - SnarlSequenceFinder(const SubHandleGraph &snarl, + SnarlSequenceFinder(const PathHandleGraph & graph, const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &haploGraph, const id_t &source_id, const id_t &sink_id); @@ -18,18 +18,23 @@ class SnarlSequenceFinder { find_gbwt_haps(); pair, unordered_set> find_exhaustive_paths(); + + vector> find_embedded_paths(); protected: // member variables: - // the handle graph with snarls to normalize + // the handle graph containing the snarl + const PathHandleGraph &_graph; + // a subhandlegraph with only the nodes in the snarl const SubHandleGraph &_snarl; // GBWT graph with snarls to normalize, includes the embedded threads needed for the // GBWTPathFinder approach. 
const gbwtgraph::GBWTGraph &_haploGraph; - const id_t _source_id; - const id_t _sink_id; + const id_t &_source_id; + const id_t &_sink_id; - vector> find_haplotypes_not_at_source(unordered_set &touched_handles); + vector> + find_haplotypes_not_at_source(unordered_set &touched_handles); }; } } \ No newline at end of file From ab4805642710e73772f0d7c938d0464cfa129dcd Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 24 Nov 2020 16:57:55 -0800 Subject: [PATCH 44/63] deleted now-redundant fxns from SnarlNormalizer --- src/algorithms/0_oo_normalize_snarls.cpp | 552 ----------------------- 1 file changed, 552 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 05c7d3121e9..e3bbd0c46f4 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -282,355 +282,6 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & } - -// // TODO: test that it successfully extracts any haplotypes that start/end in the middle of -// // TODO: the snarl. -// /** -// * Finds all haps in gbwt associated with snarl. -// * @param snarl The subhandlegraph of the snarl to be normalized. -// * @param source_id The source of the snarl. -// * @param sink_id The sink of the snarl. -// * @return A 3-tuple containing 1) a vector of haps stretching from source to sink, in -// * vector format; 2) a second vector containing all other haps in snarl; -// * 3) a vector of all handles oberved by the method. -// */ -// tuple>, vector>, unordered_set> -// SnarlNormalizer::extract_gbwt_haplotypes(const SubHandleGraph &snarl, -// const id_t &source_id, const id_t &sink_id) { -// /** -// * haplotype_queue contains all started exon_haplotypes not completed yet. -// * Every time we encounter a branch in the paths, the next node down the path -// * Is stored here, along with the vector of handles that represents the path up -// * to the SearchState. 
-// */ -// vector, gbwt::SearchState>> haplotype_queue; - -// // source and sink handle for _haploGraph: -// handle_t source_handle = _haploGraph.get_handle(source_id); -// handle_t sink_handle = _haploGraph.get_handle(sink_id); - -// // place source in haplotype_queue. -// vector source_handle_vec(1, source_handle); -// gbwt::SearchState source_state = _haploGraph.get_state(source_handle); -// haplotype_queue.push_back(make_pair(source_handle_vec, source_state)); - -// // touched_handles contains all handles that have been touched by the -// // depth first search below, for later use in other_haplotypes_to_strings, which -// // identifies paths that didn't stretch from source to sink in the snarl. -// unordered_set touched_handles{source_handle, sink_handle}; - -// // these haplotype vecs contains all "finished" haplotypes - those that were either -// // walked to their conclusion, or until they reached the sink. -// vector> haplotypes_from_source_to_sink; -// vector> other_haplotypes; - -// // sometimes a gbwt thread will indicate a connection between two handles that doesn't -// // actually exist in the _graph. These connections need to be ignored. -// unordered_set incorrect_connections; - -// // for every partly-extracted thread, extend the thread until it either reaches -// // the sink of the snarl or the end of the thread. 
-// while (!haplotype_queue.empty()) { -// // todo: debug_statement -// // cerr << "haplotype queue: "; -// // cerr << "size of queue:" << haplotype_queue.size() << " " << endl; -// // for (auto hap : haplotype_queue) { -// // cerr << "size: " << hap.first.size() << endl << "handle_ids: "; -// // for (handle_t handle : hap.first) { -// // cerr << _haploGraph.get_id(handle) << " "; -// // } -// // cerr << endl; -// // } - -// // get a haplotype out of haplotype_queue to extend - -// // a tuple of (handles_traversed_so_far, last_touched_SearchState) -// pair, gbwt::SearchState> cur_haplotype = haplotype_queue.back(); -// haplotype_queue.pop_back(); - -// // get all the subsequent search_states that immediately follow the searchstate -// // from cur_haplotype. -// vector next_searches; -// _haploGraph.follow_paths(cur_haplotype.second, -// [&](const gbwt::SearchState next_search) -> bool { -// next_searches.push_back(next_search); -// return true; -// }); - -// // if next_searches > 1, then we need to make multiple new haplotypes to be -// // recorded in haplotype_queue or one of the finished haplotype_handle_vectors. -// if (next_searches.size() > 1) { -// // for every next_search in next_searches, either create a new, extended -// // cur_haplotype to push into haplotype queue, or place in the -// // haplotypes_from_source_to_sink if haplotype extends to sink, or place in -// // the other_haplotypes if haplotype ends before reaching sink. 
-// for (gbwt::SearchState next_search : next_searches) { -// handle_t next_handle = _haploGraph.node_to_handle(next_search.node); -// // if (!snarl.has_node(snarl.get_id(next_handle)) && -// // make_pair(_haploGraph.get_id(cur_haplotype.first.back()),_haploGraph.get_id(next_handle))) -// // { -// if (!snarl.has_edge(cur_haplotype.first.back(), next_handle)) { -// if (incorrect_connections.find( -// snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == -// incorrect_connections.end()) { -// cerr << "snarl starting at node " << source_id -// << " and ending at " << sink_id -// << " has a thread that incorrectly connects two nodes that " -// "don't have any edge connecting them. These two nodes are " -// << _haploGraph.get_id(cur_haplotype.first.back()) << " and " -// << _haploGraph.get_id(next_handle) -// << ". This thread connection will be ignored." << endl; -// incorrect_connections.emplace( -// snarl.edge_handle(cur_haplotype.first.back(), next_handle)); - -// // todo: debug_statement -// // cerr << "next handle(s) of handle " -// // << snarl.get_id(cur_haplotype.first.back()) -// // << " according to snarl:" << endl; -// // snarl.follow_edges(cur_haplotype.first.back(), false, -// // [&](const handle_t handle) { -// // cerr << "\t" << snarl.get_id(handle); -// // }); -// // cerr << endl; -// } -// continue; -// } -// // copy over the vector of cur_haplotype: -// vector next_handle_vec(cur_haplotype.first); - -// // add the new handle to the vec: -// next_handle_vec.push_back(next_handle); - -// // if new_handle is the sink, put in haplotypes_from_source_to_sink -// if (_haploGraph.get_id(next_handle) == sink_id) { -// haplotypes_from_source_to_sink.push_back(next_handle_vec); -// } else // keep extending the haplotype! -// { -// pair, gbwt::SearchState> next_haplotype = -// make_pair(next_handle_vec, next_search); -// haplotype_queue.push_back(next_haplotype); -// } -// // next_handle will be touched. 
-// touched_handles.emplace(next_handle); -// } -// } -// // if next_searches is empty, the path has ended but not reached sink. -// else if (next_searches.empty()) { -// // We have reached the end of the path, but it doesn't reach the sink. -// // we need to add cur_haplotype to other_haplotypes. -// other_haplotypes.push_back(cur_haplotype.first); - -// } -// // if next_handle is the sink, put in haplotypes_from_source_to_sink -// else if (_haploGraph.get_id( -// _haploGraph.node_to_handle(next_searches.back().node)) == sink_id) { -// // Then we need to add cur_haplotype + next_search to -// // haplotypes_from_source_to_sink. -// handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); -// cur_haplotype.first.push_back(next_handle); -// haplotypes_from_source_to_sink.push_back(cur_haplotype.first); - -// // touched next_search's handle -// touched_handles.emplace(next_handle); -// } -// // else, there is just one next_search, and it's not the end of the path. -// // just extend the search by adding (cur_haplotype + next_search to -// // haplotype_queue. -// else { -// // get the next_handle from the one next_search. -// handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); - -// // modify cur_haplotype with next_handle and next_search. -// cur_haplotype.first.push_back(next_handle); -// cur_haplotype.second = -// next_searches.back(); // there's only one next_search in next_searches. - -// // put cur_haplotype back in haplotype_queue. -// haplotype_queue.push_back(cur_haplotype); -// touched_handles.emplace(next_handle); -// } -// } - -// // Find any haplotypes starting from handles not starting at the source, but which -// // still start somewhere inside the snarl. 
-// vector> haplotypes_not_starting_at_source = -// find_haplotypes_not_at_source(touched_handles, sink_id); - -// // move haplotypes_not_starting_at_source into other_haplotypes: -// other_haplotypes.reserve(other_haplotypes.size() + -// haplotypes_not_starting_at_source.size()); -// move(haplotypes_not_starting_at_source.begin(), -// haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); - -// //todo: debug_statement -// cerr << "lets look through all the haplotypes after extraction:" << endl; -// for (vector hap_vec : haplotypes_from_source_to_sink) { -// cerr << "new hap:" << endl; -// for (handle_t handle : hap_vec){ -// cerr << _haploGraph.get_id(handle) << " " << _haploGraph.get_sequence(handle) << endl; -// } -// } - -// return tuple>, vector>, -// unordered_set>{haplotypes_from_source_to_sink, -// other_haplotypes, touched_handles}; -// } - - -// // Used to complete the traversal of a snarl along its haplotype threads, when there are -// // handles connected to the snarl by threads that start after the source handle. (Threads -// // that merely end before the sink handle are addressed in extract_gbwt_haplotypes). -// // Arguments: -// // _haploGraph: the GBWTgraph containing the haplotype threads. -// // touched_handles: any handles found in the snarl so far. -// // sink_id: the id of the final handle in the snarl. -// // Returns: -// // a vector of haplotypes in vector format that start in the middle of the -// // snarl. -// vector> -// SnarlNormalizer::find_haplotypes_not_at_source(unordered_set &touched_handles, -// const id_t &sink_id) { -// // cerr << "find_haplotypes_not_at_source" << endl; - -// /// Search every handle in touched handles for haplotypes starting at that point. -// // Any new haplotypes will be added to haplotype_queue. -// vector, gbwt::SearchState>> haplotype_queue; - -// // Fully extended haplotypes (or haplotypes extended to the snarl's sink) -// // will be added to finished_haplotypes. 
-// vector> finished_haplotypes; - -// // In addition, we need to put the new handle into to_search, because a path may have -// // started on the new handle (which means we need to start a searchstate there.) -// unordered_set to_search; - -// // We don't need to ever check the sink handle, since paths from the sink handle -// // extend beyond snarl. -// handle_t sink_handle = _haploGraph.get_handle(sink_id); -// // touched_handles.erase(sink_handle); - -// // Nested function for making a new_search. Identifies threads starting at a given -// // handle and -// // either adds them as a full haplotype (if the haplotype is one handle long) or -// // makes a new entry to haplotype_queue. -// auto make_new_search = [&](handle_t handle) { -// // Are there any new threads starting at this handle? -// gbwt::SearchState new_search = -// _haploGraph.index->prefix(_haploGraph.handle_to_node(handle)); -// if (!new_search.empty()) { -// // Then add them to haplotype_queue. -// _haploGraph.follow_paths( -// new_search, [&](const gbwt::SearchState &next_search) -> bool { -// handle_t next_handle = _haploGraph.node_to_handle(next_search.node); - -// /// check to make sure that the thread isn't already finished: -// // if next_handle is the sink, or if this thread is only one handle -// // long, then there isn't any useful string to extract from this. -// if (next_handle != sink_handle || -// next_search == gbwt::SearchState()) { -// // establish a new thread to walk along. -// vector new_path; -// new_path.push_back(handle); -// new_path.push_back(next_handle); - -// pair, gbwt::SearchState> mypair = -// make_pair(new_path, next_search); - -// // add the new path to haplotype_queue to be extended. -// haplotype_queue.push_back(make_pair(new_path, next_search)); - -// // if next_handle hasn't been checked for starting threads, add to -// // to_search. 
-// if (touched_handles.find(next_handle) == touched_handles.end()) { -// to_search.emplace(next_handle); -// } -// } -// return true; -// }); -// } -// }; - -// /// Extend any paths in haplotype_queue, and add any newly found handles to to_search. -// /// Then, check to see if there are any new threads on handles in to_search. -// /// Extend those threads, and add any newly found handles to to_search, -// /// then search for threads again in to_search again... repeat until to_search remains -// /// emptied of new handles. - -// // for tracking whether the haplotype thread is still extending: -// bool still_extending; -// while (!to_search.empty() || !haplotype_queue.empty()) { -// while (!haplotype_queue.empty()) { -// // get a haplotype to extend out of haplotype_queue - a tuple of -// // (handles_traversed_so_far, last_touched_SearchState) -// pair, gbwt::SearchState> cur_haplotype = -// haplotype_queue.back(); -// haplotype_queue.pop_back(); - -// // get all the subsequent search_states that immediately follow the -// // searchstate from cur_haplotype. -// vector next_searches; -// _haploGraph.follow_paths(cur_haplotype.second, -// [&](const gbwt::SearchState &next_search) -> bool { -// next_searches.push_back(next_search); -// return true; -// }); - -// for (gbwt::SearchState next_search : next_searches) { -// handle_t next_handle = _haploGraph.node_to_handle(next_search.node); - -// // if next_search is empty, then we've fallen off the thread, -// // and cur_haplotype can be placed in finished_haplotypes as is for this -// // thread. -// if (next_search == gbwt::SearchState()) { -// finished_haplotypes.push_back(cur_haplotype.first); -// } - -// // if next_search is on the sink_handle, -// // then cur_haplotype.first + next_search goes to finished_haplotypes. 
-// else if (_haploGraph.get_id(next_handle) == sink_id) { - -// // copy over the vector of cur_haplotype: -// vector next_handle_vec(cur_haplotype.first); -// // add next_handle -// next_handle_vec.push_back(next_handle); -// // place in finished_haplotypes -// finished_haplotypes.push_back(next_handle_vec); - -// // also, if next_handle hasn't been checked for new threads, add to -// // to_search. -// if (touched_handles.find(next_handle) != touched_handles.end()) { -// to_search.emplace(next_handle); -// } - -// } -// // otherwise, just place an extended cur_haplotype in haplotype_queue. -// else { -// // copy over cur_haplotype: -// pair, gbwt::SearchState> cur_haplotype_copy = -// cur_haplotype; -// // modify with next_handle/search -// cur_haplotype_copy.first.push_back(next_handle); -// cur_haplotype_copy.second = next_search; -// // place back in haplotype_queue for further extension. -// haplotype_queue.push_back(cur_haplotype_copy); - -// // also, if next_handle hasn't been checked for new threads, add to -// // to_search. -// if (touched_handles.find(next_handle) != touched_handles.end()) { -// to_search.emplace(next_handle); -// } -// } -// } -// } -// // Then, make more new_searches from the handles in to_search. -// for (handle_t handle : to_search) { -// make_new_search(handle); // will add to haplotype_queue if there's any -// // new_searches to be had. -// } -// to_search.clear(); -// } -// return finished_haplotypes; -// } - // Given a vector of haplotypes of format vector< handle_t >, returns a vector of // haplotypes of // format string (which is the concatenated sequences in the handles). @@ -804,121 +455,6 @@ void SnarlNormalizer::force_maximum_handle_size(MutableHandleGraph &graph, }); } -// // Finds all embedded paths that either start or end in a snarl (or both) defined by -// // source_id, sink_id. 
-// // returns a vector of the embedded paths, where each entry in the vector is defined -// // by the pair of step_handles closest to the beginning and end of the path. If the -// // path is fully contained within the snarl, these step_handles will the be the -// // leftmost and rightmost handles in the path. -// // Arguments: -// // _graph: a pathhandlegraph containing the snarl with embedded paths. -// // source_id: the source of the snarl of interest. -// // sink_id: the sink of the snarl of interest. -// // Returns: -// // a vector containing all the embedded paths in the snarl, in pair< step_handle_t, -// // step_handle_t > > format. Pair.first is the first step in the path's range of -// // interest, and pair.second is the step *after* the last step in the path's range of -// // interest (can be the null step at end of path). -// vector> -// SnarlNormalizer::extract_embedded_paths_in_snarl(const PathHandleGraph &graph, -// const id_t &source_id, -// const id_t &sink_id) { -// // cerr << "extract_embedded_paths_in_snarl" << endl; -// // cerr << "source id: " << source_id << endl; -// // cerr << "source id contains what paths?: " << endl; -// // for (auto step : _graph.steps_of_handle(graph.get_handle(_source_id))) { -// // cerr << "\t" << _graph.get_path_name(graph.get_path_handle_of_step(step)) << -// // endl; -// // } -// // cerr << "neighbors of 71104? (should include 71097):" << endl; -// // handle_t test_handle = _graph.get_handle(71104); -// // _graph.follow_edges(test_handle, true, [&](const handle_t &handle) { -// // cerr << _graph.get_id(handle) << endl; -// // }); -// // cerr << "can I still access source handle?" -// // << _graph.get_sequence(graph.get_handle(_source_id)) << endl; - -// // get the snarl subgraph of the PathHandleGraph, in order to ensure that we don't -// // extend the path to a point beyond the source or sink. 
-// SubHandleGraph snarl = extract_subgraph(graph, source_id, sink_id); -// // key is path_handle, value is a step in that path from which to extend. -// unordered_map paths_found; - -// // look for handles with paths we haven't touched yet. -// snarl.for_each_handle([&](const handle_t &handle) { -// vector steps = graph.steps_of_handle(handle); -// // do any of these steps belong to a path not in paths_found? -// for (step_handle_t &step : steps) { -// path_handle_t path = graph.get_path_handle_of_step(step); -// // If it's a step along a new path, save the first step to that path we find. -// // In addtion, if there are multiple steps found in the path, (The avoidance -// // of source and sink here is to ensure that we can properly check to see if -// // we've reached the end of an embedded path walking in any arbitrary -// // direction (i.e. source towards sink or sink towards source). -// if (paths_found.find(path) == paths_found.end() || -// graph.get_id(graph.get_handle_of_step(paths_found[path])) == source_id || -// graph.get_id(graph.get_handle_of_step(paths_found[path])) == sink_id) { -// // then we need to mark it as found and save the step. -// paths_found[path] = step; -// } -// } -// }); - -// // todo: debug_statement -// // cerr << "################looking for new paths################" << endl; -// // for (auto path : paths_found) { -// // cerr << _graph.get_path_name(path.first) << " " -// // << _graph.get_id(graph.get_handle_of_step(path.second)) << endl; -// // } - -// /// for each step_handle_t corresponding to a unique path, we want to get the steps -// /// closest to both the end and beginning step that still remains in the snarl. -// // TODO: Note copy paste of code here. In python I'd do "for fxn in [fxn1, fxn2]:", -// // TODO so that I could iterate over the fxn. That sounds template-messy in C++ -// // tho'. Should I? 
-// vector> paths_in_snarl; -// for (auto &it : paths_found) { -// step_handle_t step = it.second; -// // path_in_snarl describes the start and end steps in the path, -// // as constrained by the snarl. -// pair path_in_snarl; - -// // Look for the step closest to the beginning of the path, as constrained by the -// // snarl. -// step_handle_t begin_in_snarl_step = step; -// id_t begin_in_snarl_id = -// _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); - -// while ((begin_in_snarl_id != source_id) && -// _graph.has_previous_step(begin_in_snarl_step)) { -// begin_in_snarl_step = _graph.get_previous_step(begin_in_snarl_step); -// begin_in_snarl_id = -// _graph.get_id(graph.get_handle_of_step(begin_in_snarl_step)); -// } -// path_in_snarl.first = begin_in_snarl_step; - -// // Look for the step closest to the end of the path, as constrained by the snarl. -// step_handle_t end_in_snarl_step = step; -// id_t end_in_snarl_id = _graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); - -// // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and -// // _graph.has_next_step(end_in_snarl_step)) { -// while (end_in_snarl_id != sink_id and graph.has_next_step(end_in_snarl_step)) { -// end_in_snarl_step = graph.get_next_step(end_in_snarl_step); -// end_in_snarl_id = graph.get_id(graph.get_handle_of_step(end_in_snarl_step)); -// } -// // Note: when adding the end step, path notation convention requires that we add -// // the null step at the end of the path (or the next arbitrary step, in the case -// // of a path that extends beyond our snarl.) -// // TODO: do we want the next arbitrary step in that latter case? -// path_in_snarl.second = _graph.get_next_step(end_in_snarl_step); - -// paths_in_snarl.push_back(path_in_snarl); -// } - -// return paths_in_snarl; -// } - // TODO: change the arguments to handles, which contain orientation within themselves. 
// Given a start and end node id, construct an extract subgraph between the two nodes // (inclusive). Arguments: @@ -1657,93 +1193,5 @@ SnarlNormalizer::debug_get_sources_and_sinks(const HandleGraph &graph) { return pair, vector>(source, sink); } -// Runs through the whole snarl and generates all possible strings representing walks -// from source to sink. Generates a combinatorial number of possible paths with splits -// in the snarl. -pair, unordered_set> -SnarlNormalizer::source_to_sink_exhaustive_path_finder(const id_t &source_id, const id_t &sink_id) { - // cerr << "debug_graph_to_strings" << endl; - SubHandleGraph snarl = extract_subgraph(_graph, source_id, sink_id); - - unordered_set touched_handles; - - unordered_map> sequences; - vector sinks; - unordered_map count; - count.reserve(snarl.get_node_count()); // resize count to contain enough buckets - // for size of snarl - sequences.reserve(snarl.get_node_count()); // resize sequences to contain enough - // buckets for size of snarl - - // identify sources and sinks //TODO: once we've established that this fxn works, - // we can just use start_id and sink_id. - snarl.for_each_handle([&](const handle_t &handle) { - bool is_source = true, is_sink = true; - snarl.follow_edges(handle, true, [&](const handle_t &prev) { - is_source = false; - return false; - }); - snarl.follow_edges(handle, false, [&](const handle_t &next) { - is_sink = false; - return false; - }); - - // base case for dynamic programming - if (is_source) { - count[handle] = 1; - sequences[handle].push_back( - snarl.get_sequence(handle)); // TODO: presented in the handle's local - // forward orientation. An issue? 
- } - if (is_sink) { - sinks.emplace_back(handle); - } - }); - - // count walks by dynamic programming - bool overflowed = false; - for (const handle_t &handle : lazier_topological_order(&snarl)) { - touched_handles.emplace(handle); - size_t count_here = count[handle]; - vector seqs_here = sequences[handle]; - - snarl.follow_edges(handle, false, [&](const handle_t &next) { - size_t &count_next = count[next]; - string seq_next = snarl.get_sequence(next); - - if (numeric_limits::max() - count_here < count_next) { - overflowed = true; - } - - else { - count_next += count_here; - for (string seq : seqs_here) { - sequences[next].push_back(seq + seq_next); - } - } - }); - /// TODO: figure out how to deal with overflow. - // if (overflowed) { - // return numeric_limits::max(); - // } - } - - // total up the walks at the sinks - size_t total_count = 0; - for (handle_t &sink : sinks) { - total_count += count[sink]; - } - - // all the sequences at the sinks will be all the sequences in the snarl. 
- vector walks; - for (handle_t &sink : sinks) { - for (string seq : sequences[sink]) { - walks.push_back(seq); - } - } - - return make_pair(walks, touched_handles); -} - } } \ No newline at end of file From f4f197b589324c42c5ccfdcefec86ec5bdaeebd2 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 24 Nov 2020 17:09:56 -0800 Subject: [PATCH 45/63] cleaned normalize_snarls hpp --- src/algorithms/0_oo_normalize_snarls.hpp | 41 +++++++++++------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index 44344f75f1d..28c7f907f44 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -34,31 +34,22 @@ class SnarlNormalizer { int _max_alignment_size; const string &_path_finder; - tuple>, vector>, unordered_set> - extract_gbwt_haplotypes(const SubHandleGraph &snarl, - const id_t &source_id, const id_t &sink_id); + ////////////////////////////////////////////////////////////////////////////////////// + // finding information on original graph: + ////////////////////////////////////////////////////////////////////////////////////// - pair, unordered_set> - source_to_sink_exhaustive_path_finder(const id_t &source_id, const id_t &sink_id); - - vector> - find_haplotypes_not_at_source(unordered_set &touched_handles, - const id_t &sink_id); + SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, + const id_t &end_id); + + vector check_handle_as_start_of_path_seq(const string &handle_seq, + const string &path_seq); - vector format_handle_haplotypes_to_strings( - const vector> &haplotype_handle_vectors); + ////////////////////////////////////////////////////////////////////////////////////// + // creation of new graph: + ////////////////////////////////////////////////////////////////////////////////////// VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes); - void 
force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); - - vector> - extract_embedded_paths_in_snarl(const PathHandleGraph &graph, const id_t &source_id, - const id_t &sink_id); - - SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, - const id_t &end_id); - void integrate_snarl(const HandleGraph &new_snarl, const vector> embedded_paths, const id_t &source_id, const id_t &sink_id); @@ -73,8 +64,14 @@ class SnarlNormalizer { const bool &touching_source, const bool &touching_sink, const handle_t &potential_source, const handle_t &potential_sink); - vector check_handle_as_start_of_path_seq(const string &handle_seq, - const string &path_seq); + void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); + + ////////////////////////////////////////////////////////////////////////////////////// + // format-type switching: + ////////////////////////////////////////////////////////////////////////////////////// + vector format_handle_haplotypes_to_strings( + const vector> &haplotype_handle_vectors); + // -------------------------------- DEBUG CODE BELOW: // ------------------------------------ From 19c070660af95b576572ff88cc33993bac97af78 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 1 Dec 2020 10:38:02 -0800 Subject: [PATCH 46/63] resolved merge conflict --- src/subcommand/mod_main.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/subcommand/mod_main.cpp b/src/subcommand/mod_main.cpp index 073cbe6c34c..b799c64924c 100644 --- a/src/subcommand/mod_main.cpp +++ b/src/subcommand/mod_main.cpp @@ -777,8 +777,6 @@ int main_mod(int argc, char** argv) { vg_graph->paths = Paths(); } - graph->serialize_to_ostream(std::cout); - // Save the modified graph vg::io::save_handle_graph(graph.get(), std::cout); From 6602922985492413bbde9a0f90f9cece70e89e3e Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 3 Dec 2020 16:45:06 -0800 Subject: [PATCH 47/63] duplicate haplotypes removed 
before alignment --- src/algorithms/0_oo_normalize_snarls.cpp | 32 ++++++++++++++-------- src/algorithms/0_oo_normalize_snarls.hpp | 4 +-- src/algorithms/0_snarl_sequence_finder.cpp | 6 ++-- src/algorithms/0_snarl_sequence_finder.hpp | 2 +- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index e3bbd0c46f4..bd67645d6e2 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -158,7 +158,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & } // extract threads - tuple, vector>, unordered_set> haplotypes; + tuple, vector>, unordered_set> haplotypes; SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, source_id, sink_id); if (_path_finder == "GBWT") { @@ -169,7 +169,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & get<1>(haplotypes) = get<1>(gbwt_haplotypes); get<2>(haplotypes) = get<2>(gbwt_haplotypes); } else if (_path_finder == "exhaustive") { - pair, unordered_set> exhaustive_haplotypes = + pair, unordered_set> exhaustive_haplotypes = sequence_finder.find_exhaustive_paths(); get<0>(haplotypes) = exhaustive_haplotypes.first; get<2>(haplotypes) = exhaustive_haplotypes.second; @@ -224,7 +224,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); cur_step = _graph.get_next_step(cur_step); } - get<0>(haplotypes).push_back(path_seq); + get<0>(haplotypes).emplace(path_seq); } } // Align the new snarl: @@ -291,15 +291,15 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // handle_t > format. // Returns: a vector of haplotypes of format string (which is the concatenated sequences // in the handles). 
-vector SnarlNormalizer::format_handle_haplotypes_to_strings( +unordered_set SnarlNormalizer::format_handle_haplotypes_to_strings( const vector> &haplotype_handle_vectors) { - vector haplotype_strings; + unordered_set haplotype_strings; for (vector haplotype_handles : haplotype_handle_vectors) { string hap; for (handle_t &handle : haplotype_handles) { hap += _haploGraph.get_sequence(handle); } - haplotype_strings.push_back(hap); + haplotype_strings.emplace(hap); } return haplotype_strings; } @@ -313,7 +313,7 @@ vector SnarlNormalizer::format_handle_haplotypes_to_strings( // Returns: // VG object representing the newly realigned snarl. VG SnarlNormalizer::align_source_to_sink_haplotypes( - vector source_to_sink_haplotypes) { + unordered_set source_to_sink_haplotypes) { // cerr << "align_source_to_sink_haplotypes" << endl; // cerr << " haplotypes in source_to_sink_haplotypes: " << endl; // for (string hap : source_to_sink_haplotypes) { @@ -329,10 +329,16 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( // maintains the context of the snarl. // store the source/sink chars for later reattachment to source and sink. - string source_char(1, source_to_sink_haplotypes.back().front()); - string sink_char(1, source_to_sink_haplotypes.back().back()); + string random_element; + for (auto hap : source_to_sink_haplotypes){ + random_element = hap; + break; + } + string source_char(1, random_element.front()); + string sink_char(1, random_element.back()); - for (string &hap : source_to_sink_haplotypes) { + // replace the source and sink chars with X, to force match at source and sink. 
+ for (auto hap : source_to_sink_haplotypes) { hap.replace(0, 1, "X"); hap.replace(hap.size() - 1, 1, "X"); } @@ -367,8 +373,10 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( seqan::Align align; seqan::resize(rows(align), source_to_sink_haplotypes.size()); - for (int i = 0; i < source_to_sink_haplotypes.size(); ++i) { - assignSource(row(align, i), source_to_sink_haplotypes[i].c_str()); + int i = 0; + for (auto hap : source_to_sink_haplotypes) { + assignSource(row(align, i), hap.c_str()); + i++; } globalMsaAlignment(align, seqan::SimpleScore(5, -3, -1, -3)); diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index 28c7f907f44..e9d749490bb 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -48,7 +48,7 @@ class SnarlNormalizer { // creation of new graph: ////////////////////////////////////////////////////////////////////////////////////// - VG align_source_to_sink_haplotypes(vector source_to_sink_haplotypes); + VG align_source_to_sink_haplotypes(unordered_set source_to_sink_haplotypes); void integrate_snarl(const HandleGraph &new_snarl, const vector> embedded_paths, @@ -69,7 +69,7 @@ class SnarlNormalizer { ////////////////////////////////////////////////////////////////////////////////////// // format-type switching: ////////////////////////////////////////////////////////////////////////////////////// - vector format_handle_haplotypes_to_strings( + unordered_set format_handle_haplotypes_to_strings( const vector> &haplotype_handle_vectors); diff --git a/src/algorithms/0_snarl_sequence_finder.cpp b/src/algorithms/0_snarl_sequence_finder.cpp index b8e5a9b6ef6..2754f0b8cb9 100644 --- a/src/algorithms/0_snarl_sequence_finder.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -511,7 +511,7 @@ SnarlSequenceFinder::find_embedded_paths() { // in the snarl. 
//todo: for consistency, have source_to_sink_exhaustive_path_finder return paths in format //todo: vector> instead of vector -pair, unordered_set> +pair, unordered_set> SnarlSequenceFinder::find_exhaustive_paths() { // cerr << "debug_graph_to_strings" << endl; unordered_set touched_handles; @@ -584,10 +584,10 @@ SnarlSequenceFinder::find_exhaustive_paths() { } // all the sequences at the sinks will be all the sequences in the _snarl. - vector walks; + unordered_set walks; for (handle_t &sink : sinks) { for (string seq : sequences[sink]) { - walks.push_back(seq); + walks.emplace(seq); } } diff --git a/src/algorithms/0_snarl_sequence_finder.hpp b/src/algorithms/0_snarl_sequence_finder.hpp index ca7dc4ce953..9bce63d53c9 100644 --- a/src/algorithms/0_snarl_sequence_finder.hpp +++ b/src/algorithms/0_snarl_sequence_finder.hpp @@ -17,7 +17,7 @@ class SnarlSequenceFinder { tuple>, vector>, unordered_set> find_gbwt_haps(); - pair, unordered_set> find_exhaustive_paths(); + pair, unordered_set> find_exhaustive_paths(); vector> find_embedded_paths(); From 5e39863d12dc329cb69d08bf49211f58345f7d5e Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 17 Dec 2020 11:59:52 -0800 Subject: [PATCH 48/63] extraction of chosen subgraph, single snarl normalization. --- src/algorithms/0_oo_normalize_snarls.cpp | 31 +++++++++++++++++++++--- src/subcommand/0_normalize_main.cpp | 6 ++--- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index bd67645d6e2..9de8a8ead6c 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -67,20 +67,35 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { * 4) number of bases in the snarl before normalization * 5) number of bases in the snarl after normalization. 
*/ - int error_record_size = 5; + int error_record_size = 6; vector one_snarl_error_record(error_record_size, 0); vector full_error_record(error_record_size, 0); pair snarl_sequence_change; + // //todo: debug_code + int stop_size = 1; + int num_snarls_touched = 0; + + int skip_first_few = 1; + int skipped = 0; for (auto roots : snarl_roots) { + if (skipped < skip_first_few){ + skipped++; + continue; + } + if (num_snarls_touched == stop_size){ + break; + } else { + num_snarls_touched++; + } // if (roots->start().node_id() > 269600 && roots->start().node_id() < 269700) { cerr << "disambiguating snarl #" << (num_snarls_normalized + num_snarls_skipped) << " source: " << roots->start().node_id() << " sink: " << roots->end().node_id() << endl; - normalize_snarl(roots->start().node_id(), roots->end().node_id()); + one_snarl_error_record = normalize_snarl(roots->start().node_id(), roots->end().node_id()); if (!((one_snarl_error_record[0]) || (one_snarl_error_record[1]) || (one_snarl_error_record[2]) || (one_snarl_error_record[3]))) { @@ -100,6 +115,14 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { num_snarls_skipped += 1; } // } + //todo: debug_statement for extracting snarl of interest. + VG outGraph; + pos_t source_pos = make_pos_t(roots->start().node_id(), false, 0); + vector pos_vec; + pos_vec.push_back(source_pos); + algorithms::extract_containing_graph(&_graph, &outGraph, pos_vec, 1000); + // algorithms::extract_containing_graph(&_graph, &outGraph, pos_vec, roots->end().node_id() - roots->start().node_id() + 2); + outGraph.serialize_to_ostream(cout); } cerr << endl << "normalized " << num_snarls_normalized << " snarl(s), skipped " @@ -115,12 +138,14 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { cerr << "amount of sequence in normalized snarls after normalization: " << snarl_sequence_change.second << endl; - // //todo: debug_statement for extracting snarl of interest. 
+ //todo: debug_statement for extracting snarl of interest. // VG outGraph; // pos_t source_pos = make_pos_t(269695, false, 0); // vector pos_vec; // pos_vec.push_back(source_pos); // algorithms::extract_containing_graph(&_graph, &outGraph, pos_vec, 1000); + // _graph = outGraph; + // vg::io::VPKG::save(*dynamic_cast(outGraph.get()), cout); // outGraph.serialize_to_ostream(cout); delete snarl_manager; diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index add0df34343..24a67f715e9 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -169,11 +169,9 @@ int main_normalize(int argc, char **argv) { vg::evaluate_normalized_snarls(snarl_stream); } - // TODO: NOTE: this may be cumbersome code if we decide to add more argument types. - // Consider changing. - if (normalize) { - vg::io::VPKG::save(*dynamic_cast(graph.get()), cout); + //todo: maybe rewrite to mimic mod_main. + // vg::io::VPKG::save(*dynamic_cast(graph.get()), cout); // graph->serialize(std::cout); } From 12ca17237953caac38b3f919f5af6b057c2558d9 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 7 Jan 2021 17:29:35 -0800 Subject: [PATCH 49/63] added compatibility for snarls that are recorded 'backwards' in the graph --- src/algorithms/0_oo_normalize_snarls.cpp | 107 ++++++++++++++------- src/algorithms/0_snarl_sequence_finder.cpp | 4 + src/subcommand/0_normalize_main.cpp | 2 +- 3 files changed, 77 insertions(+), 36 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 9de8a8ead6c..7f7770dacaf 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -73,37 +73,43 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { pair snarl_sequence_change; - // //todo: debug_code - int stop_size = 1; - int num_snarls_touched = 0; + // // //todo: debug_code + // int stop_size = 1; + // int num_snarls_touched 
= 0; - int skip_first_few = 1; - int skipped = 0; + // int skip_first_few = 2; //#1, node 3702578 is a cyclic snarl. Don't recall about #0. #2 also cyclic. Looks like cyclic snarls weren't buggy? + // int skipped = 0; for (auto roots : snarl_roots) { - if (skipped < skip_first_few){ - skipped++; - continue; - } - if (num_snarls_touched == stop_size){ - break; - } else { - num_snarls_touched++; - } - // if (roots->start().node_id() > 269600 && roots->start().node_id() < 269700) { + // if (skipped < skip_first_few){ + // skipped++; + // continue; + // } + // if (num_snarls_touched == stop_size){ + // break; + // } else { + // num_snarls_touched++; + // } + // if (roots->start().node_id() == 3704138) { + cerr << "root backwards?" << roots->start().backward() << endl; cerr << "disambiguating snarl #" - << (num_snarls_normalized + num_snarls_skipped) - << " source: " << roots->start().node_id() - << " sink: " << roots->end().node_id() << endl; - - one_snarl_error_record = normalize_snarl(roots->start().node_id(), roots->end().node_id()); + << (num_snarls_normalized + num_snarls_skipped) + << " source: " << roots->start().node_id() + << " sink: " << roots->end().node_id() << endl; + if (!roots->start().backward()){ + one_snarl_error_record = normalize_snarl(roots->start().node_id(), roots->end().node_id()); + } + else { + one_snarl_error_record = normalize_snarl(roots->end().node_id(), roots->start().node_id()); + } if (!((one_snarl_error_record[0]) || (one_snarl_error_record[1]) || - (one_snarl_error_record[2]) || (one_snarl_error_record[3]))) { + (one_snarl_error_record[2]) || (one_snarl_error_record[3]))) { // if there are no errors, then we've successfully normalized a snarl. num_snarls_normalized += 1; // track the change in size of the snarl. 
snarl_sequence_change.first += one_snarl_error_record[4]; snarl_sequence_change.second += one_snarl_error_record[5]; + cerr << "normalized snarl starting at: " << roots->start().node_id() << endl; } else { // else, there was an error. Track which errors caused the snarl to not // normalize. @@ -114,15 +120,16 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { } num_snarls_skipped += 1; } + // //todo: debug_statement for extracting snarl of interest. + // VG outGraph; + // pos_t source_pos = make_pos_t(roots->start().node_id(), false, 0); + // vector pos_vec; + // pos_vec.push_back(source_pos); + // algorithms::extract_containing_graph(&_graph, &outGraph, pos_vec, roots->end().node_id() - roots->start().node_id() + 2); + // outGraph.serialize_to_ostream(cout); + // break; // } - //todo: debug_statement for extracting snarl of interest. - VG outGraph; - pos_t source_pos = make_pos_t(roots->start().node_id(), false, 0); - vector pos_vec; - pos_vec.push_back(source_pos); - algorithms::extract_containing_graph(&_graph, &outGraph, pos_vec, 1000); - // algorithms::extract_containing_graph(&_graph, &outGraph, pos_vec, roots->end().node_id() - roots->start().node_id() + 2); - outGraph.serialize_to_ostream(cout); + } cerr << endl << "normalized " << num_snarls_normalized << " snarl(s), skipped " @@ -174,8 +181,18 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & * 5) number of bases in the snarl after normalization. */ vector error_record(6, 0); + // //todo: debug_statement: determining whether cyclic problem in yeast graph goes away when I swapo source and sink. + // SubHandleGraph snarl = extract_subgraph(_graph, sink_id, source_id); SubHandleGraph snarl = extract_subgraph(_graph, source_id, sink_id); + //todo: debug_statement: Evaluate connections of all nodes in subgraph. 
+ snarl.for_each_handle([&](const handle_t handle){ + cerr << "examining left neighbors of handle " << snarl.get_id(handle) << ":" << endl; + snarl.follow_edges(handle, false, [&](const handle_t &next) { + cerr << " " << snarl.get_id(next) << endl; + }); + }); + if (!algorithms::is_acyclic(&snarl)) { cerr << "snarl at " << source_id << " is cyclic. Skipping." << endl; error_record[3] = true; @@ -184,6 +201,8 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // extract threads tuple, vector>, unordered_set> haplotypes; + // //todo: debug_statement: determining whether cyclic problem in yeast graph goes away when I swapo source and sink. + // SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, sink_id, source_id); SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, source_id, sink_id); if (_path_finder == "GBWT") { @@ -263,12 +282,17 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & force_maximum_handle_size(new_snarl, _max_alignment_size); // integrate the new_snarl into the _graph, removing the old snarl as you go. + // //todo: debug_statement + // integrate_snarl(new_snarl, embedded_paths, sink_id, source_id); integrate_snarl(new_snarl, embedded_paths, source_id, sink_id); } else { if (!get<1>(haplotypes).empty()) { cerr << "found a snarl starting at " << source_id << " and ending at " << sink_id << " with haplotypes that start or end in the middle. Skipping." << endl; + cerr << "There are " << sizeof(get<1>(haplotypes)) << " haplotypes of that description." << endl; + // vector string_haps = format_handle_haplotypes_to_strings(get<1>(haplotypes).front()); + // cerr << "First example: " << get<1>(haplotypes) << endl; error_record[1] = true; } if (get<0>(haplotypes).size() > _max_alignment_size) { @@ -283,6 +307,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & << " aren't accounted for by the gbwt_graph. 
" "Skipping." << endl; + cerr << "size of snarl:" << handles_in_snarl.size() << "number of handles touched by gbwt graph: " << get<2>(haplotypes).size() << endl; cerr << "these handles are:" << endl << "\t"; for (auto handle : handles_in_snarl) { if (get<2>(haplotypes).find(handle) == get<2>(haplotypes).end()) { @@ -586,10 +611,15 @@ void SnarlNormalizer::integrate_snarl( // cerr << "integrate_snarl" << endl; //todo: debug_statement - cerr << "handles in to_insert_snarl:" << endl; + cerr << "\nhandles in to_insert_snarl:" << endl; to_insert_snarl.for_each_handle([&](const handle_t &handle) { cerr << to_insert_snarl.get_id(handle) << " " - << to_insert_snarl.get_sequence(handle) << " \t"; + << to_insert_snarl.get_sequence(handle) << " "; + cerr << "neighbors: "; + to_insert_snarl.follow_edges(handle, false, [&](const handle_t &next) { + cerr << " " << to_insert_snarl.get_id(next) << endl; + }); + cerr << " \n"; }); cerr << endl; // Get old _graph snarl @@ -1204,22 +1234,29 @@ SnarlNormalizer::debug_get_sources_and_sinks(const HandleGraph &graph) { vector source; // identify sources and sinks - _graph.for_each_handle([&](const handle_t &handle) { + graph.for_each_handle([&](const handle_t &handle) { + //todo: debug_statements in code below: + cerr << "identifying if " << graph.get_id(handle) << "is a source/sink." <> SnarlSequenceFinder::find_haplotypes_not_at_source(unordered_set &touched_handles) { + //todo: debug_statement + for (handle_t handle : touched_handles){ + cerr << "touched handles find_gbwt_haps: " << _graph.get_id(handle) << endl; + } // cerr << "find_haplotypes_not_at_source" << endl; /// Search every handle in touched handles for haplotypes starting at that point. 
diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 24a67f715e9..7e73c651653 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -171,7 +171,7 @@ int main_normalize(int argc, char **argv) { if (normalize) { //todo: maybe rewrite to mimic mod_main. - // vg::io::VPKG::save(*dynamic_cast(graph.get()), cout); + vg::io::VPKG::save(*dynamic_cast(graph.get()), cout); // graph->serialize(std::cout); } From a7c67bee650d1ae1f5136f91e6256114cd037fcc Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 28 Jan 2021 15:28:53 -0800 Subject: [PATCH 50/63] added debug prints, prep for fixing path moving between snarls --- src/algorithms/0_oo_normalize_snarls.cpp | 57 +++++++++++++--------- src/algorithms/0_snarl_sequence_finder.cpp | 3 ++ 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 7f7770dacaf..d9dd761646e 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -74,22 +74,25 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { pair snarl_sequence_change; // // //todo: debug_code - // int stop_size = 1; - // int num_snarls_touched = 0; + int stop_size = 1; + int num_snarls_touched = 0; // int skip_first_few = 2; //#1, node 3702578 is a cyclic snarl. Don't recall about #0. #2 also cyclic. Looks like cyclic snarls weren't buggy? 
// int skipped = 0; + // int snarl_num = 0; for (auto roots : snarl_roots) { + // cerr << "normalizing snarl number " << snarl_num << endl; + // snarl_num++; // if (skipped < skip_first_few){ // skipped++; // continue; // } - // if (num_snarls_touched == stop_size){ - // break; - // } else { - // num_snarls_touched++; - // } - // if (roots->start().node_id() == 3704138) { + if (num_snarls_touched == stop_size){ + break; + } else { + num_snarls_touched++; + } + // if (roots->start().node_id() == 3881494) { cerr << "root backwards?" << roots->start().backward() << endl; cerr << "disambiguating snarl #" << (num_snarls_normalized + num_snarls_skipped) @@ -185,13 +188,13 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // SubHandleGraph snarl = extract_subgraph(_graph, sink_id, source_id); SubHandleGraph snarl = extract_subgraph(_graph, source_id, sink_id); - //todo: debug_statement: Evaluate connections of all nodes in subgraph. - snarl.for_each_handle([&](const handle_t handle){ - cerr << "examining left neighbors of handle " << snarl.get_id(handle) << ":" << endl; - snarl.follow_edges(handle, false, [&](const handle_t &next) { - cerr << " " << snarl.get_id(next) << endl; - }); - }); + // //todo: debug_statement: Evaluate connections of all nodes in subgraph. + // snarl.for_each_handle([&](const handle_t handle){ + // cerr << "examining left neighbors of handle " << snarl.get_id(handle) << ":" << endl; + // snarl.follow_edges(handle, false, [&](const handle_t &next) { + // cerr << " " << snarl.get_id(next) << endl; + // }); + // }); if (!algorithms::is_acyclic(&snarl)) { cerr << "snarl at " << source_id << " is cyclic. Skipping." 
<< endl; @@ -232,7 +235,8 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & error_record[4] += snarl.get_sequence(handle).size(); }); - // TODO: this if statement removes snarls where a haplotype begins/ends in the middle + // TODO: this if statement only permits snarls that satsify requirements, i.e. + // TODO: there are no haplotype begins/ends in the middle // TODO: of the snarl. Get rid of this once alignment issue is addressed! // TODO: also, limits the number of haplotypes to be aligned, since snarl starting at // TODO: 2049699 with 258 haplotypes is taking many minutes. @@ -253,18 +257,22 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // TODO: accounted for in the code, remove next chunk of code that finds // TODO: source-to-sink paths. // find the paths that stretch from source to sink: + cerr << "~~~~~~~~~~source: " << source_id << "sink: " << sink_id << endl; for (auto path : embedded_paths) { - // cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with start " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; + + cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with start " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; if (_graph.get_id(_graph.get_handle_of_step(path.first)) == source_id && _graph.get_id(_graph.get_handle_of_step( _graph.get_previous_step(path.second))) == sink_id) { - // cerr << "adding path of name " << - // _graph.get_path_name(graph.get_path_handle_of_step(path.first)) << - // endl; get the sequence of the source to sink path, and add it to the + cerr << "*****************************************\nadding path of name " << + 
_graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << + endl; + // get the sequence of the source to sink path, and add it to the // paths to be aligned. string path_seq; step_handle_t cur_step = path.first; while (cur_step != path.second) { + cerr << "while adding path, looking at node " << _graph.get_id(_graph.get_handle_of_step(cur_step)) << " with seq " << _graph.get_sequence(_graph.get_handle_of_step(cur_step)) << endl; path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); cur_step = _graph.get_next_step(cur_step); } @@ -708,6 +716,9 @@ void SnarlNormalizer::integrate_snarl( for (auto path : embedded_paths) { // //todo: debug_statement // cerr << "the new sink id: " << temp_snarl_sink_id << endl; + // //todo: debug_statement + // move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_sink_id, + // temp_snarl_source_id, sink_id, source_id); move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_source_id, temp_snarl_sink_id, source_id, sink_id); } @@ -731,8 +742,7 @@ void SnarlNormalizer::integrate_snarl( _graph.get_sequence(new_snarl_topo_order.back()), sink_id); // move the source edges: - // TODO: note the copy/paste. Ask if there's a better way to do this (I totally could - // in Python!) + // TODO: note the copy/paste. Fix? 
_graph.follow_edges(_graph.get_handle(temp_snarl_source_id), true, [&](const handle_t &prev_handle) { _graph.create_edge(prev_handle, new_source_handle); @@ -769,7 +779,7 @@ void SnarlNormalizer::integrate_snarl( // delete the previously created source and sink: for (handle_t handle : {_graph.get_handle(temp_snarl_source_id), _graph.get_handle(temp_snarl_sink_id)}) { - cerr << "id of handle to delete from tem source/sink: " << _graph.get_id(handle) << endl; + cerr << "id of handle to delete from the old source/sink: " << _graph.get_id(handle) << endl; _graph.destroy_handle(handle); } } @@ -842,6 +852,7 @@ void SnarlNormalizer::move_path_to_snarl( path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); cur_step = _graph.get_next_step(cur_step); } + cerr << "path_seq to move: " << path_seq << endl; // TODO: debug_statement: // cerr << "\t\tpath sequence length: " << path_seq.size() << endl; diff --git a/src/algorithms/0_snarl_sequence_finder.cpp b/src/algorithms/0_snarl_sequence_finder.cpp index b33750bcfbb..c2166123f6a 100644 --- a/src/algorithms/0_snarl_sequence_finder.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -433,10 +433,12 @@ SnarlSequenceFinder::find_embedded_paths() { // look for handles with paths we haven't touched yet. _snarl.for_each_handle([&](const handle_t &handle) { + cerr << "looking for paths at handle " << _graph.get_id(handle) << endl; vector steps = _graph.steps_of_handle(handle); // do any of these steps belong to a path not in paths_found? for (step_handle_t &step : steps) { path_handle_t path = _graph.get_path_handle_of_step(step); + cerr << "found a path. Is it new?" << endl; // If it's a step along a new path, save the first step to that path we find. 
// In addtion, if there are multiple steps found in the path, (The avoidance // of source and sink here is to ensure that we can properly check to see if @@ -445,6 +447,7 @@ SnarlSequenceFinder::find_embedded_paths() { if (paths_found.find(path) == paths_found.end() || _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _source_id || _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _sink_id) { + cerr << "found a new path." << endl; // then we need to mark it as found and save the step. paths_found[path] = step; } From b89f75be64cfb3c495c8874279c60a292aebed2b Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 26 Feb 2021 09:17:20 -0800 Subject: [PATCH 51/63] right-to-left directed snarls are now supported. --- .../0_move_embedded_paths_to_new_snarl.cpp | 581 +++++++++++++++++ src/algorithms/0_oo_normalize_snarls.cpp | 604 +++++------------- src/algorithms/0_oo_normalize_snarls.hpp | 26 +- src/algorithms/0_snarl_sequence_finder.cpp | 76 ++- src/algorithms/0_snarl_sequence_finder.hpp | 3 +- 5 files changed, 835 insertions(+), 455 deletions(-) create mode 100644 src/algorithms/0_move_embedded_paths_to_new_snarl.cpp diff --git a/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp new file mode 100644 index 00000000000..cce5d2769e9 --- /dev/null +++ b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp @@ -0,0 +1,581 @@ +#include "0_oo_normalize_snarls.hpp" + +namespace vg { +namespace algorithms { +/* +* Goal: break move_path_to_snarl into easier-to-read, composite parts. +* +* Make it easier to take into account "backwards" bool when wanting to iterate through +* snarl backwards. +* +* Temp notes to self: +* for compare syntax, see https://www.geeksforgeeks.org/stdstringcompare-in-c/ +* syntax 2 was what I wanted when I looked it up. 
+* Example Error message: +* if (get_path_handle_of_step(segment_begin) != get_path_handle_of_step(segment_end)) { +* cerr << "error:[VG] attempted to rewrite segment delimited by steps on two separate paths" << endl; +* exit(1); +* } +* +*/ + +vector, int>> SnarlNormalizer::find_possible_path_starts(const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right) +{ + vector, int>> possible_paths; + + // if path starts at leftmost handle, then path's leftmost extension is at the + // beginning of the leftmost handle. + if (path_spans_left_right.first) + { + vector path; + path.push_back(leftmost_handle); + pair, int> path_loc = make_pair(path, _graph.get_sequence(leftmost_handle).size()); + possible_paths.push_back(path_loc); + } + return possible_paths; +} +/** + * extend_possible_paths + * + * @param {vector} undefined : + * @param {int>>} possible_path_starts : + * @param {string} path_str : + * @param {handle_t} leftmost_handle : + * @param {handle_t} rightmost_handle : + * @param {pair} path_spans_left_right : + * @return {vector} : empty if no path; otherwise, the sequence of handles representing a path which matches the path_str. + */ +vector SnarlNormalizer::extend_possible_paths(vector, int>> &possible_path_starts, const string &path_str, const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right) +{ + // cerr << "path string (note: should be left-to-right at this point, e.g. TTACT, not AGTAA: " << path_str << endl; + // cerr << "leftmost handle id and seq: " << _graph.get_id(leftmost_handle) << " " << _graph.get_sequence(leftmost_handle) << endl; + vector correct_path; + // Now that we have all the possible leftmost starting positions for the path in + // possible_paths, search to the right of those positions for possible extensions + // of the path. + // + // If there are two possible extensions from a single position, make two entries in + // possible_paths. 
+ // + // Continue until extending each possible path location, until we find a path that + // reaches (and properly includes) the sink. If there is no such path, then throw an + // exception saying that we couldn't find the path. + int times=0; + while (!possible_path_starts.empty() && correct_path.empty()) + { + // cerr << "walked through while loop " << times << " times." << endl; + + times += 1; + // take a path off of possible_path_starts, which will be copied for every iteration + // through _graph.follow_edges, below: + pair, int> cur_possible_path = possible_path_starts.back(); + possible_path_starts.pop_back(); + + string possible_path_str; + for (handle_t handle : cur_possible_path.first) + { + // cerr << "cur_possible_step id: " << _graph.get_id(handle) << endl; + possible_path_str += " " + _graph.get_sequence(handle); + } + + // cerr << "cur_possible_path is size " << cur_possible_path.first.size() << " with seq length " << cur_possible_path.second << " and sequence " << possible_path_str << endl; + // extend the path through all right-extending edges to see if any subsequent + // paths still satisfy the requirements for being a possible_path: + _graph.follow_edges( + get<0>(cur_possible_path).back(), false, [&](const handle_t &next) { + // make a copy to be extended for through each possible next handle in + // follow edges. + pair, int> possible_path = cur_possible_path; + + // decide if "next" is a valid extension of possible_path. + string next_seq = _graph.get_sequence(next); + // cerr << "next_seq is from candidate node for extension. next_seq: " << next_seq << endl; + //todo: test that this compare functions properly. + if ((path_str.compare(possible_path.second, next_seq.size(), next_seq) == 0)) + { + // cerr << "candidate for extension passed!" 
<< endl; + possible_path.first.push_back(next); + pair, int> new_possible_path = make_pair(possible_path.first, possible_path.second + next_seq.size()); + // If we've reached the end of the path, we've either reached the proper end point (anywhere if path_spans_left_right.second==false; else, the rightmost node.), or we've yet to find a valid new_possible_path. + if ((!path_spans_left_right.second && new_possible_path.second>= path_str.size()) + || (path_spans_left_right.second && _graph.get_id(next) == _graph.get_id(rightmost_handle) && new_possible_path.second == path_str.size())) + { + // we found the correct path. + correct_path = new_possible_path.first; + return; + } + else + { + possible_path_starts.push_back(new_possible_path); + } + } + else + { + // cerr << "candidate for extension failed." << endl; + } + // return false; + }); + } + cerr << "************UNIT_TEST for extend_possible_paths************" << endl; + if (correct_path.size() == 0) + { + cerr << "no correct path in snarl found. output path size is zero." << endl; + } + cerr << "************END-UNIT_TEST for extend_possible_paths.************"<< endl; + + return correct_path; +} +/** + * SnarlNormalizer::move_path_to_new_snarl + * + * @param {pair} old_path : + * @param {id_t} source : the source for the new snarl. + * @param {id_t} sink : the sink for the new snarl. + * @param {pair} path_spans_left_right : + * @param {bool} path_directed_left_to_right : + */ +void SnarlNormalizer::move_path_to_new_snarl(const pair & old_path, const id_t &leftmost_id, const id_t &rightmost_id, const pair &path_spans_left_right, const bool &path_directed_left_to_right) +{ + /* +* This should return the series of handles, from left to right if path_left_to_right==true (else vice-versa), that the path should move to. +* +* Or returns None if the proposed "valid_starting_index" didn't pan out to give a good +* path in following handles. +*/ + // if path doesn't span both source and sink, I need to address that. 
But for the + // first iteration of this algorithm, I'll dodge that question. + // todo: address paths that don't span both source and sink. + if (!(path_spans_left_right.first and path_spans_left_right.second)) + { + cerr << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" << endl; + cerr << "PATH DOESN'T SPAN SOURCE AND SINK! THIS IS CURRENTLY UNSUPPORTED. SNARL WILL BE NORMALIZED, BUT PATH WON'T BE INCLUDED." << endl; + cerr << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" << endl; + vector no_path; + return; + } + + // get the path_string from the handles in the old_path: + string path_str; + step_handle_t cur_step = old_path.first; + string path_name = _graph.get_path_name(_graph.get_path_handle_of_step(old_path.first)); // used for unit tests at bottom. + while (cur_step != old_path.second) + { + path_str += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); + cur_step = _graph.get_next_step(cur_step); + } + + // cerr << "path string as originally extracted: " << path_str << endl; + handle_t leftmost_handle = _graph.get_handle(leftmost_id); + handle_t rightmost_handle = _graph.get_handle(rightmost_id); + + // make path_str read left-to-right. 
+ if (!path_directed_left_to_right) + { + path_str = reverse_complement(path_str); + } + + // leftmost_handle = _graph.get_handle(sink); + // rightmost_handle = _graph.get_handle(source); + // path_str = reverse_complement(path_str); + // path_spans_left_right = make_pair(path_spans_left_right.second, path_spans_left_right.first); + // } + // else + // { + // leftmost_handle = _graph.get_handle(source); + // rightmost_handle = _graph.get_handle(sink); + // path_spans_left_right = path_spans_left_right; + // } + // cerr << "in move_path_to_snarl: " << endl; + // cerr << "path_name: " << path_name << endl; + // cerr << "path_str: " << path_str << endl; + // cerr << "sink id and seq: " << sink << " " << _graph.get_sequence(_graph.get_handle(sink)) << endl; + // cerr << "leftmost handle id and seq: " << _graph.get_id(leftmost_handle) << " " << _graph.get_sequence(leftmost_handle) << endl; + + + // dealing with edge cases: + // if source == sink, the snarl has become one single node and the path should map directly to that + // one node. + //todo! + + // possible_paths tracks all the possible locations of the path as we extend the + // path_string-to-_graph alignment. + // + // Each "location" in the vector is a pair. pair.first is the vector of handles + // comprising the path found thus far. pair.second is the size of the extended path + // thus far, so that we know where we are in path_str. + // + // Each path will be filled left-to-right. If path_left_to_right==false, the final + // path will be reversed before it's returned. + vector, int>> possible_path_starts = find_possible_path_starts(leftmost_handle, rightmost_handle, path_spans_left_right); + // cerr << "size of possible_path_starts: " << possible_path_starts.size() << endl; + vector new_path_location = extend_possible_paths(possible_path_starts, path_str, leftmost_handle, rightmost_handle, path_spans_left_right); + // cerr << "size of new_path_location: " << new_path_location.size() << endl; + + //todo! 
debug comment_out below. + // flip the order of the handles if the path moves right-to-left. + if (!path_directed_left_to_right) + { + std::reverse(new_path_location.begin(), new_path_location.end()); + for (int i = 0; i != new_path_location.size(); i++) + { + new_path_location[i] = _graph.flip(new_path_location[i]); + } + } + + // cerr << "rewriting path " << _graph.get_path_name(_graph.get_path_handle_of_step(old_path.first)) << endl; + // step_handle_t cur_old_step = old_path.first; + // string old_path_series; + // string old_path_str; + // while (cur_old_step != old_path.second) + // { + // cerr << "cur_old_step id: " << _graph.get_id(_graph.get_handle_of_step(cur_old_step)) << endl; + // old_path_series += " " + _graph.get_id(_graph.get_handle_of_step(cur_old_step)); + // old_path_str += " " + _graph.get_sequence(_graph.get_handle_of_step(cur_old_step)); + // // I wonder if I'm messing up the path sequence because get_sequence is always left->right, but the real path seq should be right->left on some handles. + // cur_old_step = _graph.get_next_step(cur_old_step); + // } + // string new_path_series; + // string new_path_str; + // for (handle_t handle : new_path_location) + // { + // cerr << "cur_new_step id: " << _graph.get_id(handle) << endl; + // new_path_series += " " + _graph.get_id(handle); + // new_path_str += " " + _graph.get_sequence(handle); + // // I wonder if I'm messing up the path sequence because get_sequence is always left->right, but the real path seq should be right->left on some handles. 
+ // } + + // cerr << "request to rewrite segment with old_path: " << old_path_series << " " << old_path_str << endl; + // cerr << "and new path: " << new_path_series << " " << new_path_str << endl; + + cerr << "sequence according to old_path.first: " << _graph.get_id(_graph.get_handle_of_step(old_path.first)) << " " << _graph.get_sequence(_graph.get_handle_of_step(old_path.first)) << endl; + _graph.rewrite_segment(old_path.first, old_path.second, new_path_location); + cerr << "sequence at new_path_location first entry: " << _graph.get_id(new_path_location.front()) << " " << _graph.get_sequence(new_path_location.front()) << endl; + + cerr << "************UNIT_TEST for extend_possible_paths************" << endl; + // Test that the new path exists. + if (new_path_location.size() == 0) + { + cerr << "no new path location found." << endl; + } + // Test that the new path seq = old path seq. + else + { + step_handle_t new_path_source; + vector steps = _graph.steps_of_handle(new_path_location.front()); + for (auto step : steps) + { + if (_graph.get_path_name(_graph.get_path_handle_of_step(step)) == path_name) + { + new_path_source = step; + } + } + string new_path_str = _graph.get_sequence(_graph.get_handle_of_step(new_path_source)); + step_handle_t cur_step = new_path_source; + while (_graph.get_id(_graph.get_handle_of_step(cur_step)) != _graph.get_id(new_path_location.back())) + { + //todo: make sure that this process correctly extracts path string. + cur_step = _graph.get_next_step(cur_step); + new_path_str += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); + } + string old_path_str; + if (!path_directed_left_to_right) + { + old_path_str = reverse_complement(path_str); + } + else + { + old_path_str = path_str; + } + if (old_path_str != new_path_str) + { + cerr << "Once the path was moved into the new snarl, it didn't have the same sequence." 
<< endl; + cerr << "original seq: " << old_path_str << endl; + cerr << " new seq: " << new_path_str << endl; + } + } + + cerr << "************END-UNIT_TEST for extend_possible_paths.************"<< endl; + + +} + +// // Moves a path from its original location in the _graph to a new snarl, +// // defined by a vector of interconnected handles. +// // NOTE: the handles in new_snarl_handles may not preserve topological order after +// // being passed to this method, if they were ordered before. +// // Arguments: _graph: the _graph containing the old_embedded_path and the handles in +// // new_snarl_topo_order +// // old_embedded_path: a pair, where +// // pair.first is the first step_handle of interest in the +// // old_embedded_path, and pair.second is the step_handle *after* +// // the last step_handle of interest in the old_embedded_path (can +// // be the null step at the end of the path.) +// // new_snarl_topo_order: all the handles in the new snarl, inside the _graph. +// // Return: None. +// void SnarlNormalizer::move_embedded_path_to_snarl( +// const pair &old_embedded_path, +// vector &new_snarl_handles, id_t &new_source_id, id_t &new_sink_id, +// const id_t &old_source_id, const id_t &old_sink_id, const bool backwards) +// { + +// // get the sequence associated with the path +// string path_seq; +// step_handle_t cur_step = old_embedded_path.first; + +// // if the old path is touching either/both the source/sink, we want to make sure that +// // the newly moved path also touches those. Otherwise, any paths that extend beyond +// // the source or sink may be cut into pieces when the portion of the path overlapping +// // the snarl is moved to a region inside the snarl. 
+// bool touching_source = +// (_graph.get_id(_graph.get_handle_of_step(old_embedded_path.first)) == +// old_source_id); +// bool touching_sink = +// (_graph.get_id(_graph.get_handle_of_step( +// _graph.get_previous_step(old_embedded_path.second))) == old_sink_id); + +// cerr << "MOVE_PATH_TO_SNARL: touching_source and touching_sink: " << touching_source << " " << touching_sink << " new_source_id: " << new_source_id << " new_sink_id: " << new_sink_id << " old_source_id: " << old_source_id << " old_sink_id: " << old_sink_id << endl; +// cerr << "old_embedded_path.second: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(old_embedded_path.second))) << endl; +// // extract the path sequence of the embedded path: +// while (cur_step != old_embedded_path.second) +// { +// path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); +// cur_step = _graph.get_next_step(cur_step); +// } +// cerr << "path_seq to move: " << path_seq << endl; + +// vector, int, int>> possible_paths; +// for (handle_t handle : new_snarl_handles) +// { +// string handle_seq = _graph.get_sequence(handle); +// if (backwards) +// { +// handle_seq = reverse_complement(handle_seq); +// } + +// // starting index is where the path would begin in the handle, +// // since it could begin in the middle of the handle. 
+// vector starting_indices = +// check_handle_as_start_of_path_seq(handle_seq, path_seq); + +// cerr << "starting indices for path " << _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) << "(path_seq: " << path_seq << ")" +// << " in handle " << _graph.get_id(handle) << " (handle_seq: " << handle_seq << ")" << endl; +// for (int i : starting_indices) +// { +// cerr << i << endl; +// } +// cerr << "finished indices" << endl; + +// // if there is a starting index, +// if (starting_indices.size() != 0) +// { +// for (int starting_index : starting_indices) +// { +// if (((backwards && (starting_index >= path_seq.size())) || (!backwards && ((handle_seq.size() - starting_index) >= path_seq.size()))) && +// source_and_sink_handles_map_properly(_graph, new_source_id, +// new_sink_id, touching_source, +// touching_sink, handle, handle)) +// { +// // if the entire path fits inside the current handle, and if any +// // paths that touched source and sink in the old snarl would be +// // touching source and sink in the new snarl, then we've already +// // found the full mapping location of the path! Move the path, end +// // the method. +// vector new_path{handle}; +// _graph.rewrite_segment(old_embedded_path.first, +// old_embedded_path.second, new_path); +// // //todo: debug_statement +// // cerr << "found a full mapping at " << _graph.get_id(handle) +// // << " w/ seq " << _graph.get_sequence(handle) << endl; +// return; +// } +// else +// { +// // this is a potential starting handle for the path. Add as a +// // possible_path. 
+// vector possible_path_handle_vec{handle}; +// possible_paths.push_back( +// make_tuple(possible_path_handle_vec, starting_index, +// handle_seq.size() - starting_index)); +// } +// } +// } +// } + +// // for every possible path, extend it to determine if it really is the path we're +// // looking for: +// while (!possible_paths.empty()) +// { +// // take a path off of possible_paths, which will be copied for every iteration +// // through _graph.follow_edges, below: +// tuple, int, int> possible_path_query = possible_paths.back(); +// possible_paths.pop_back(); + +// // extend the path through all right-extending edges to see if any subsequent +// // paths still satisfy the requirements for being a possible_path: +// bool no_path = _graph.follow_edges( +// get<0>(possible_path_query).back(), false, [&](const handle_t &next) { +// // make a copy to be extended for through each possible next handle in +// // follow edges. +// tuple, int, int> possible_path = possible_path_query; + +// // extract relevant information to make code more readable. 
+// string next_seq = _graph.get_sequence(next); +// id_t next_id = _graph.get_id(next); +// int &cur_index_in_path = get<2>(possible_path); +// if (cur_index_in_path <= path_seq.size() && +// (find(new_snarl_handles.cbegin(), new_snarl_handles.cend(), next) != +// new_snarl_handles.cend())) +// { +// // if the next handle would be the ending handle for the path, +// if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) +// { +// // cerr << "next handle would be the ending handle for the path" +// // << endl; +// // check to see if the sequence in the handle is suitable +// // for ending the path: +// int compare_length = path_seq.size() - cur_index_in_path; + +// if ((next_seq.compare(0, compare_length, path_seq, +// cur_index_in_path, compare_length) == 0) && +// source_and_sink_handles_map_properly( +// _graph, new_source_id, new_sink_id, touching_source, +// touching_sink, get<0>(possible_path).front(), next)) +// { + +// // we've found the new path! Move path to the new sequence, +// // and end the function. + +// if (compare_length < next_seq.size()) +// { +// // If the path ends before the end of next_seq, then split +// // the handle so that the path ends flush with the end of +// // the first of the two split handles. + +// // divide the handle where the path ends; +// pair divided_next = +// _graph.divide_handle(next, compare_length); +// get<0>(possible_path).push_back(divided_next.first); + +// // Special case if next is the sink or the source, to +// // preserve the reassignment of source and sink ids in +// // integrate_snarl. +// if (next_id == new_sink_id) +// { +// new_sink_id = _graph.get_id(divided_next.second); +// } + +// // TODO: NOTE: finding the old "next" handle is expensive. +// // TODO: Use different container? 
+// auto it = find(new_snarl_handles.begin(), +// new_snarl_handles.end(), next); + +// // replace the old invalidated handle with one of the new +// // ones +// *it = divided_next.first; +// // stick the other new handle on the end of +// // new_snarl_handles. +// new_snarl_handles.push_back(divided_next.second); +// } +// else +// { +// // otherwise, the end of the path already coincides with +// // the end of the handle. In that case, just add it to the +// // path. +// get<0>(possible_path).push_back(next); +// } +// _graph.rewrite_segment(old_embedded_path.first, +// old_embedded_path.second, +// get<0>(possible_path)); +// // //todo: debug_statement: +// // cerr << "got a full path: "; +// // for (handle_t handle : get<0>(possible_path)) { +// // cerr << _graph.get_id(handle) << " "; +// // } +// // cerr << endl; + +// // we've already found the path. No need to keep looking for +// // more paths. +// return false; +// } +// } +// // see if the next handle would be the continuation of the path, but +// // not the end, +// else +// { + +// // check to see if the sequence in the handle is suitable for +// // extending the path: +// int compare_length = next_seq.size(); +// // //todo: debug_statement +// // cerr << "compare returned false" << endl; +// // cerr << "compare in returned false: " +// // << " next_seq len " << next_seq.size() << " compare_length +// // " +// // << compare_length << " path_seq len " << path_seq.size() +// // << " cur_index_in_path " << cur_index_in_path << endl; +// // cerr << "if statement eval: cur_index_in_path <= +// // next_seq.size() " +// // << (cur_index_in_path <= next_seq.size()) +// // << " next_seq.compare(0, compare_length, path_seq, " +// // "cur_index_in_path, compare_length) == 0) " +// // << (next_seq.compare(0, compare_length, path_seq, +// // cur_index_in_path, compare_length) == +// // 0) +// // << endl; +// if (next_seq.compare(0, compare_length, path_seq, +// cur_index_in_path, compare_length) == 0) +// { +// // 
cerr << "compared in return false" << endl; +// // extend the path +// get<0>(possible_path).push_back(next); + +// // update the current index in path_seq. +// get<2>(possible_path) += next_seq.size(); + +// // place back into possible_paths +// possible_paths.push_back(possible_path); +// // cerr << "extending the path!" << endl; +// } +// } +// } +// // continue to iterate through follow_edges. +// return true; +// }); + +// // //todo: debug_statement: +// // if +// // (graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) +// // == +// // "_alt_19f9bc9ad2826f58f113965edf36bb93740df46d_0") { +// // cerr << "mystery node 4214930: " +// // << _graph.get_sequence(graph.get_handle(4214930)) << endl; +// // } + +// // if we've found a complete path in the above follow_edges, then we've +// // already moved the path, and we're done. +// if (!no_path) +// { +// return; +// } +// } +// // //todo: figure out how to do some better error message instead of cerr. +// // if we failed to find a path, show an error message. +// cerr << "##########################\nWarning! Didn't find a corresponding path of " +// "name " +// << _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) +// << " from the old snarl at " << old_source_id +// << " in the newly aligned snarl. This snarl WILL be " +// "normalized, resulting in a probably incorrectly-constructed snarl." +// "\n##########################" +// << endl +// << endl; +// // throw _graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)); +// // assert(true && "Warning! 
Didn't find a corresponding path of name " + +// // _graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) +// // + " from the old snarl in the newly aligned snarl."); +// } + +} //VG +} //Algorithms \ No newline at end of file diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index d9dd761646e..3ad60f8078f 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -49,7 +49,7 @@ SnarlNormalizer::SnarlNormalizer(MutablePathDeletableHandleGraph &graph, * @param snarl_stream file stream from .snarl.pb output of vg snarls */ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { - cerr << "disambiguate_top_level_snarls" << endl; + // cerr << "disambiguate_top_level_snarls" << endl; SnarlManager *snarl_manager = new SnarlManager(snarl_stream); int num_snarls_normalized = 0; @@ -93,18 +93,13 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { num_snarls_touched++; } // if (roots->start().node_id() == 3881494) { - cerr << "root backwards?" << roots->start().backward() << endl; - cerr << "disambiguating snarl #" - << (num_snarls_normalized + num_snarls_skipped) - << " source: " << roots->start().node_id() - << " sink: " << roots->end().node_id() << endl; - - if (!roots->start().backward()){ - one_snarl_error_record = normalize_snarl(roots->start().node_id(), roots->end().node_id()); - } - else { - one_snarl_error_record = normalize_snarl(roots->end().node_id(), roots->start().node_id()); - } + // cerr << "root backwards?" 
<< roots->start().backward() << endl; + // cerr << "disambiguating snarl #" + // << (num_snarls_normalized + num_snarls_skipped) + // << " source: " << roots->start().node_id() + // << " sink: " << roots->end().node_id() << endl; + + one_snarl_error_record = normalize_snarl(roots->start().node_id(), roots->end().node_id(), roots->start().backward()); if (!((one_snarl_error_record[0]) || (one_snarl_error_record[1]) || (one_snarl_error_record[2]) || (one_snarl_error_record[3]))) { // if there are no errors, then we've successfully normalized a snarl. @@ -123,6 +118,8 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { } num_snarls_skipped += 1; } + //todo! Make unit test for shrinking snarls. + // //todo: debug_statement for extracting snarl of interest. // VG outGraph; // pos_t source_pos = make_pos_t(roots->start().node_id(), false, 0); @@ -171,7 +168,16 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { // Returns: none. // TODO: allow for snarls that have haplotypes that begin or end in the middle of the // snarl. -vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t &sink_id) { +vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const bool backwards) { + // if (backwards){ + // // swap the source and sink ids. Essentially, this guarantees I treat the leftmost node in snarl as "source". + // // (although some adjustments for paths need be made) + // id_t swap_source = sink_id; //temp storage of sink_id value. + // sink_id = source_id; + // source_id = swap_source; + // } + + /** * We keep an error record to observe when snarls are skipped because they aren't * normalizable under current restraints. Bools: @@ -186,7 +192,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & vector error_record(6, 0); // //todo: debug_statement: determining whether cyclic problem in yeast graph goes away when I swapo source and sink. 
// SubHandleGraph snarl = extract_subgraph(_graph, sink_id, source_id); - SubHandleGraph snarl = extract_subgraph(_graph, source_id, sink_id); + SubHandleGraph snarl = extract_subgraph(_graph, source_id, sink_id, backwards); // //todo: debug_statement: Evaluate connections of all nodes in subgraph. // snarl.for_each_handle([&](const handle_t handle){ @@ -206,7 +212,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & tuple, vector>, unordered_set> haplotypes; // //todo: debug_statement: determining whether cyclic problem in yeast graph goes away when I swapo source and sink. // SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, sink_id, source_id); - SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, source_id, sink_id); + SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, source_id, sink_id, backwards); if (_path_finder == "GBWT") { tuple>, vector>, unordered_set> @@ -221,8 +227,9 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & get<0>(haplotypes) = exhaustive_haplotypes.first; get<2>(haplotypes) = exhaustive_haplotypes.second; } else { - cerr << "path_finder type must be 'GBWT' or 'exhaustive', not " << _path_finder - << endl; + cerr << "path_finder type must be 'GBWT' or 'exhaustive', not '" << _path_finder + << "'." 
<< endl; + exit(1); } // check to make sure that the gbwt _graph has threads connecting all handles: @@ -248,10 +255,10 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & sequence_finder.find_embedded_paths(); //todo: debug_statement - cerr << "Let's see what sequences I have before adding embedded paths to seq info:" << endl; - for (string seq : get<0>(haplotypes)) { - cerr << seq << endl; - } + // cerr << "Let's see what sequences I have before adding embedded paths to seq info:" << endl; + // for (string seq : get<0>(haplotypes)) { + // cerr << seq << endl; + // } // TODO: once haplotypes that begin/end in the middle of the snarl have been // TODO: accounted for in the code, remove next chunk of code that finds @@ -260,23 +267,32 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & cerr << "~~~~~~~~~~source: " << source_id << "sink: " << sink_id << endl; for (auto path : embedded_paths) { - cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with start " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; + // cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with source " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; + // cerr << "SOURCE info: prev step: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << "prev prev step: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(_graph.get_previous_step(path.second)))) << " source: " << _graph.get_id(_graph.get_handle_of_step(path.second)) << " next step: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_next_step(path.second))) << endl; + // cerr << 
_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << " " << source_id << " source bool: " << (_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == source_id) << endl; if (_graph.get_id(_graph.get_handle_of_step(path.first)) == source_id && _graph.get_id(_graph.get_handle_of_step( - _graph.get_previous_step(path.second))) == sink_id) { - cerr << "*****************************************\nadding path of name " << - _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << - endl; + _graph.get_previous_step(path.second))) == sink_id) { + + // cerr << "******************************************\nadding path of name " << + // _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << + // endl; // get the sequence of the source to sink path, and add it to the // paths to be aligned. string path_seq; step_handle_t cur_step = path.first; while (cur_step != path.second) { - cerr << "while adding path, looking at node " << _graph.get_id(_graph.get_handle_of_step(cur_step)) << " with seq " << _graph.get_sequence(_graph.get_handle_of_step(cur_step)) << endl; + // cerr << "while adding path, looking at node " << _graph.get_id(_graph.get_handle_of_step(cur_step)) << " with seq " << _graph.get_sequence(_graph.get_handle_of_step(cur_step)) << endl; path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); cur_step = _graph.get_next_step(cur_step); } - get<0>(haplotypes).emplace(path_seq); + if (backwards) { + get<0>(haplotypes).emplace(reverse_complement(path_seq)); + } + else { + get<0>(haplotypes).emplace(path_seq); + + } } } // Align the new snarl: @@ -292,7 +308,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & // integrate the new_snarl into the _graph, removing the old snarl as you go. 
// //todo: debug_statement // integrate_snarl(new_snarl, embedded_paths, sink_id, source_id); - integrate_snarl(new_snarl, embedded_paths, source_id, sink_id); + integrate_snarl(snarl, new_snarl, embedded_paths, source_id, sink_id, backwards); } else { if (!get<1>(haplotypes).empty()) { cerr << "found a snarl starting at " << source_id << " and ending at " @@ -340,6 +356,7 @@ vector SnarlNormalizer::normalize_snarl(const id_t &source_id, const id_t & } + // Given a vector of haplotypes of format vector< handle_t >, returns a vector of // haplotypes of // format string (which is the concatenated sequences in the handles). @@ -448,7 +465,7 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( row_string += *it; } // todo: debug_statement - cerr << "ROW_STRING: " << row_string << endl; + // cerr << "ROW_STRING: " << row_string << endl; // edit the row so that the proper source and sink chars are added to the // haplotype instead of the special characters added to ensure correct alignment // of source and sink. @@ -460,7 +477,7 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( stringstream ss; for (string seq : row_strings) { // todo: debug_statement - cerr << "seq in alignment:" << seq << endl; + // cerr << "seq in alignment:" << seq << endl; ss << endl << seq; } // ss << align; @@ -531,8 +548,24 @@ void SnarlNormalizer::force_maximum_handle_size(MutableHandleGraph &graph, // a SubHandleGraph containing only the handles in _graph that are between start_id // and sink_id. SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, - const id_t &start_id, - const id_t &sink_id) { + id_t source_id, + id_t sink_id, + const bool backwards) { + // cerr << "extract_subgraph has source and sink: " << source_id << " " << sink_id << endl; + // because algorithm moves left to right, determine leftmost and rightmost nodes. + id_t leftmost_id; + id_t rightmost_id; + // if snarl's "backwards," source is rightmost node, sink is leftmost. 
+ if (backwards) + { + leftmost_id = sink_id; + rightmost_id = source_id; + } + else + { + leftmost_id = source_id; + rightmost_id = sink_id; + } // cerr << "extract_subgraph" << endl; /// make a subgraph containing only nodes of interest. (e.g. a snarl) // make empty subgraph @@ -541,14 +574,13 @@ SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, unordered_set visited; // to avoid counting the same node twice. unordered_set to_visit; // nodes found that belong in the subgraph. - // TODO: how to ensure that "to the right" of start_handle is the correct direction? - // initialize with start_handle (because we move only to the right of start_handle): - handle_t start_handle = _graph.get_handle(start_id); - subgraph.add_handle(start_handle); - visited.insert(graph.get_id(start_handle)); + // initialize with leftmost_handle (because we move only to the right of leftmost_handle): + handle_t leftmost_handle = _graph.get_handle(leftmost_id); + subgraph.add_handle(leftmost_handle); + visited.insert(graph.get_id(leftmost_handle)); - // look only to the right of start_handle - _graph.follow_edges(start_handle, false, [&](const handle_t &handle) { + // look only to the right of leftmost_handle + _graph.follow_edges(leftmost_handle, false, [&](const handle_t &handle) { // mark the nodes to come as to_visit if (visited.find(graph.get_id(handle)) == visited.end()) { to_visit.insert(graph.get_id(handle)); @@ -568,7 +600,7 @@ SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, subgraph.add_handle(cur_handle); - if (graph.get_id(cur_handle) != sink_id) { // don't iterate past end node! + if (graph.get_id(cur_handle) != rightmost_id) { // don't iterate past rightmost node! 
// look for all nodes connected to cur_handle that need to be added // looking to the left, _graph.follow_edges(cur_handle, true, [&](const handle_t &handle) { @@ -604,34 +636,26 @@ SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, // source_id: the source of the old (to be replaced) snarl in _graph // sink_id: the sink of the old (to be replaced) snarl in _graph. // Return: None. -// TODO: Note: How to ensure that step_handle_t's walk along the snarl in the same -// TODO: orientation as we expect? i.e. that they don't move backward? I think -// TODO: we want match_orientation to be = true, but this may cause problems -// TODO: in some cases given the way we currently construct handles (fixed when we -// TODO: create snarl-scanning interface). -// TODO: It may also be that we *don't want match_orientation to be true, -// TODO: if we're tracking a path that loops backward in the snarl. Hmm... Will think -// about this. -void SnarlNormalizer::integrate_snarl( +void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, const HandleGraph &to_insert_snarl, const vector> embedded_paths, - const id_t &source_id, const id_t &sink_id) { + const id_t &source_id, const id_t &sink_id, const bool backwards) { // cerr << "integrate_snarl" << endl; //todo: debug_statement - cerr << "\nhandles in to_insert_snarl:" << endl; - to_insert_snarl.for_each_handle([&](const handle_t &handle) { - cerr << to_insert_snarl.get_id(handle) << " " - << to_insert_snarl.get_sequence(handle) << " "; - cerr << "neighbors: "; - to_insert_snarl.follow_edges(handle, false, [&](const handle_t &next) { - cerr << " " << to_insert_snarl.get_id(next) << endl; - }); - cerr << " \n"; - }); - cerr << endl; + // cerr << "\nhandles in to_insert_snarl:" << endl; + // to_insert_snarl.for_each_handle([&](const handle_t &handle) { + // cerr << to_insert_snarl.get_id(handle) << " " + // << to_insert_snarl.get_sequence(handle) << " "; + // cerr << "neighbors: "; + // 
to_insert_snarl.follow_edges(handle, false, [&](const handle_t &next) { + // cerr << " " << to_insert_snarl.get_id(next) << endl; + // }); + // cerr << " \n"; + // }); + // cerr << endl; // Get old _graph snarl - SubHandleGraph old_snarl = extract_subgraph(_graph, source_id, sink_id); + // SubHandleGraph old_snarl = extract_subgraph(_graph, source_id, sink_id, backwards); // TODO: debug_statement: Check to make sure that newly made snarl has only one start // and end. @@ -672,7 +696,7 @@ void SnarlNormalizer::integrate_snarl( handle_t graph_handle = _graph.create_handle(to_insert_snarl.get_sequence(to_insert_snarl_handle)); new_snarl_topo_order.push_back(graph_handle); - cerr << "graph handle being inserted into new_snarl_topo_order:" << _graph.get_id(graph_handle) << endl; + // cerr << "graph handle being inserted into new_snarl_topo_order:" << _graph.get_id(graph_handle) << endl; } // Connect the newly made handles in the _graph together the way they were connected @@ -695,38 +719,62 @@ void SnarlNormalizer::integrate_snarl( // not necessarily preserved by move_path_to_snarl. Is temporary b/c we need to // replace the handles with ones with the right id_t label for source and sink later // on. - id_t temp_snarl_source_id = _graph.get_id(new_snarl_topo_order.front()); - id_t temp_snarl_sink_id = _graph.get_id(new_snarl_topo_order.back()); - cerr << "the temp source id: " << temp_snarl_source_id << endl; - cerr << "the temp sink id: " << temp_snarl_sink_id << endl; + id_t temp_snarl_leftmost_id = _graph.get_id(new_snarl_topo_order.front()); + id_t temp_snarl_rightmost_id = _graph.get_id(new_snarl_topo_order.back()); + // cerr << "the temp source id: " << temp_snarl_leftmost_id << endl; + // cerr << "the temp sink id: " << temp_snarl_rightmost_id << endl; // Add the neighbors of the source and sink of the original snarl to the new_snarl's // source and sink. 
// source integration: + if (!backwards) + { _graph.follow_edges( _graph.get_handle(source_id), true, [&](const handle_t &prev_handle) { - _graph.create_edge(prev_handle, _graph.get_handle(temp_snarl_source_id)); + _graph.create_edge(prev_handle, _graph.get_handle(temp_snarl_leftmost_id)); }); _graph.follow_edges( _graph.get_handle(sink_id), false, [&](const handle_t &next_handle) { - _graph.create_edge(_graph.get_handle(temp_snarl_sink_id), next_handle); + _graph.create_edge(_graph.get_handle(temp_snarl_rightmost_id), next_handle); }); - + } + else + { + _graph.follow_edges( + _graph.get_handle(source_id), false, [&](const handle_t &next_handle) { + _graph.create_edge(_graph.get_handle(temp_snarl_rightmost_id), next_handle); + }); + _graph.follow_edges( + _graph.get_handle(sink_id), true, [&](const handle_t &prev_handle) { + _graph.create_edge(prev_handle, _graph.get_handle(temp_snarl_leftmost_id)); + }); + } // For each path of interest, move it onto the new_snarl. for (auto path : embedded_paths) { // //todo: debug_statement - // cerr << "the new sink id: " << temp_snarl_sink_id << endl; + // cerr << "the new sink id: " << temp_snarl_rightmost_id << endl; // //todo: debug_statement - // move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_sink_id, - // temp_snarl_source_id, sink_id, source_id); - move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_source_id, - temp_snarl_sink_id, source_id, sink_id); + // move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_rightmost_id, + // temp_snarl_leftmost_id, sink_id, source_id); + // move_path_to_snarl(path, new_snarl_topo_order, temp_snarl_leftmost_id, + // temp_snarl_rightmost_id, source_id, sink_id, backwards); + // cerr << "is path backwards? 
" << backwards << endl; + // cerr << "path first: " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " step after path first: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_next_step(path.first))) << " path second: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; + // cerr << "source: " << source_id << " sink: " << sink_id << endl; + // pair path_spans_left_right; + // path_spans_left_right.first = (!backwards && _graph.get_id(_graph.get_handle_of_step(path.first)) == source_id) || (backwards && _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == source_id); + // path_spans_left_right.second = (!backwards && _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == sink_id) || (backwards && _graph.get_id(_graph.get_handle_of_step(path.first)) == sink_id); + // cerr << "first: " << path_spans_left_right.first << "second: " << path_spans_left_right.second << endl; + pair path_spans_left_right; + path_spans_left_right.first = (_graph.get_id(_graph.get_handle_of_step(path.first)) == source_id); + path_spans_left_right.second = (_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == sink_id); + + move_path_to_new_snarl(path, temp_snarl_leftmost_id, temp_snarl_rightmost_id, path_spans_left_right, !backwards); + } // Destroy the old snarl. - old_snarl.for_each_handle( - - [&](const handle_t &handle) { + old_snarl.for_each_handle([&](const handle_t &handle) { // //todo: debug_statement these are the handles in old_snarl: // cerr << old_snarl.get_id(handle) << old_snarl.get_sequence(handle) << endl; _graph.destroy_handle(handle); @@ -736,383 +784,79 @@ void SnarlNormalizer::integrate_snarl( // (for compatibility with future iterations on neighboring top-level snarls using the // same snarl manager. Couldn't replace it before b/c we needed the old handles to // move the paths. 
- handle_t new_source_handle = _graph.create_handle( - _graph.get_sequence(_graph.get_handle(temp_snarl_source_id)), source_id); - handle_t new_sink_handle = _graph.create_handle( - _graph.get_sequence(new_snarl_topo_order.back()), sink_id); - + handle_t new_leftmost_handle; + handle_t new_rightmost_handle; + if (!backwards) + { + new_leftmost_handle = _graph.create_handle( + _graph.get_sequence(_graph.get_handle(temp_snarl_leftmost_id)), source_id); + new_rightmost_handle = _graph.create_handle( + _graph.get_sequence(new_snarl_topo_order.back()), sink_id); + } + else + { + new_leftmost_handle = _graph.create_handle( + _graph.get_sequence(_graph.get_handle(temp_snarl_leftmost_id)), sink_id); + new_rightmost_handle = _graph.create_handle( + _graph.get_sequence(new_snarl_topo_order.back()), source_id); + } // move the source edges: // TODO: note the copy/paste. Fix? - _graph.follow_edges(_graph.get_handle(temp_snarl_source_id), true, + _graph.follow_edges(_graph.get_handle(temp_snarl_leftmost_id), true, [&](const handle_t &prev_handle) { - _graph.create_edge(prev_handle, new_source_handle); + _graph.create_edge(prev_handle, new_leftmost_handle); }); - _graph.follow_edges(_graph.get_handle(temp_snarl_source_id), false, + _graph.follow_edges(_graph.get_handle(temp_snarl_leftmost_id), false, [&](const handle_t &next_handle) { - _graph.create_edge(new_source_handle, next_handle); + _graph.create_edge(new_leftmost_handle, next_handle); }); // move the sink edges: - _graph.follow_edges(_graph.get_handle(temp_snarl_sink_id), true, + _graph.follow_edges(_graph.get_handle(temp_snarl_rightmost_id), true, [&](const handle_t &prev_handle) { - _graph.create_edge(prev_handle, new_sink_handle); + _graph.create_edge(prev_handle, new_rightmost_handle); }); - _graph.follow_edges(_graph.get_handle(temp_snarl_sink_id), false, + _graph.follow_edges(_graph.get_handle(temp_snarl_rightmost_id), false, [&](const handle_t &next_handle) { - _graph.create_edge(new_sink_handle, next_handle); + 
_graph.create_edge(new_rightmost_handle, next_handle); }); // move the paths: _graph.for_each_step_on_handle( - _graph.get_handle(temp_snarl_source_id), [&](step_handle_t step) { + _graph.get_handle(temp_snarl_leftmost_id), [&](step_handle_t step) { _graph.rewrite_segment(step, _graph.get_next_step(step), - vector{new_source_handle}); + vector{new_leftmost_handle}); }); _graph.for_each_step_on_handle( - _graph.get_handle(temp_snarl_sink_id), [&](step_handle_t step) { + _graph.get_handle(temp_snarl_rightmost_id), [&](step_handle_t step) { _graph.rewrite_segment(step, _graph.get_next_step(step), - vector{new_sink_handle}); + vector{new_rightmost_handle}); }); - cerr << "the temp source id: " << temp_snarl_source_id << endl; - cerr << "the temp sink id: " << temp_snarl_sink_id << endl; + cerr << "the temp leftmost id: " << temp_snarl_leftmost_id << endl; + cerr << "the temp rightmost id: " << temp_snarl_rightmost_id << endl; // delete the previously created source and sink: - for (handle_t handle : {_graph.get_handle(temp_snarl_source_id), - _graph.get_handle(temp_snarl_sink_id)}) { + for (handle_t handle : {_graph.get_handle(temp_snarl_leftmost_id), + _graph.get_handle(temp_snarl_rightmost_id)}) { cerr << "id of handle to delete from the old source/sink: " << _graph.get_id(handle) << endl; _graph.destroy_handle(handle); } } -// Moves a path from its original location in the _graph to a new snarl, -// defined by a vector of interconnected handles. -// NOTE: the handles in new_snarl_handles may not preserve topological order after -// being passed to this method, if they were ordered before. 
-// Arguments: _graph: the _graph containing the old_embedded_path and the handles in -// new_snarl_topo_order -// old_embedded_path: a pair, where -// pair.first is the first step_handle of interest in the -// old_embedded_path, and pair.second is the step_handle *after* -// the last step_handle of interest in the old_embedded_path (can -// be the null step at the end of the path.) -// new_snarl_topo_order: all the handles in the new snarl, inside the _graph. -// Return: None. -void SnarlNormalizer::move_path_to_snarl( - const pair &old_embedded_path, - vector &new_snarl_handles, id_t &new_source_id, id_t &new_sink_id, - const id_t &old_source_id, const id_t &old_sink_id) { - // cerr << "\nmove_path_to_snarl" << endl; - // //TODO: debug_statement: - // cerr << "path name: " - // << - // _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) - // << endl; - // cerr << "source: " << new_source_id << " sink: " << new_sink_id << endl; - // if (_graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) - // == - // "chr10") { - // cerr << "\t\tstart and end of old embedded path: " - // << _graph.get_id(_graph.get_handle_of_step(old_embedded_path.first)) - // << "end id" - // << _graph.get_id(_graph.get_handle_of_step(old_embedded_path.second)) - // << endl; - // } - // cerr << "#### handles in snarl (according to move_path_to_snarl): ####" << endl; - // for (handle_t handle : new_snarl_handles) { - // cerr << "\t" << _graph.get_id(handle) << " " << _graph.get_sequence(handle); - // } - // cerr << endl << endl; - // cerr << "~~~~~ Handles following each handle:" << endl; - // for (handle_t handle : new_snarl_handles) { - // cerr << "neighbors of handle " << _graph.get_id(handle) << " (" - // << _graph.get_sequence(handle) << "):" << endl; - // _graph.follow_edges(handle, false, [&](const handle_t &next_handle) { - // cerr << "\t" << _graph.get_id(next_handle) << " " - // << _graph.get_sequence(next_handle) << endl; - // }); - // 
} - - // get the sequence associated with the path - string path_seq; - step_handle_t cur_step = old_embedded_path.first; - - // if the old path is touching either/both the source/sink, we want to make sure that - // the newly moved path also touches those. Otherwise, any paths that extend beyond - // the source or sink may be cut into pieces when the portion of the path overlapping - // the snarl is moved to a region inside the snarl. - bool touching_source = - (_graph.get_id(_graph.get_handle_of_step(old_embedded_path.first)) == - old_source_id); - bool touching_sink = - (_graph.get_id(_graph.get_handle_of_step( - _graph.get_previous_step(old_embedded_path.second))) == old_sink_id); - - // extract the path sequence of the embedded path: - while (cur_step != old_embedded_path.second) { - path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); - cur_step = _graph.get_next_step(cur_step); - } - cerr << "path_seq to move: " << path_seq << endl; - - // TODO: debug_statement: - // cerr << "\t\tpath sequence length: " << path_seq.size() << endl; - // cerr << "path sequence: " << path_seq << endl; - - // for the given path, find every good possible starting handle in the new_snarl - // format of pair is < possible_path_handle_vec, - // starting_index_in_the_first_handle, current_index_in_path_seq> - // //todo: debug_statement - // cerr << "checking handles as start of path-seq" << endl; - vector, int, int>> possible_paths; - for (handle_t handle : new_snarl_handles) { - string handle_seq = _graph.get_sequence(handle); - - // starting index is where the path would begin in the handle, - // since it could begin in the middle of the handle. 
- vector starting_indices = - check_handle_as_start_of_path_seq(handle_seq, path_seq); - - // if there is a starting index, - if (starting_indices.size() != 0) { - for (int starting_index : starting_indices) { - if ((handle_seq.size() - starting_index) >= path_seq.size() && - source_and_sink_handles_map_properly(_graph, new_source_id, - new_sink_id, touching_source, - touching_sink, handle, handle)) { - // if the entire path fits inside the current handle, and if any - // paths that touched source and sink in the old snarl would be - // touching source and sink in the new snarl, then we've already - // found the full mapping location of the path! Move the path, end - // the method. - vector new_path{handle}; - _graph.rewrite_segment(old_embedded_path.first, - old_embedded_path.second, new_path); - // //todo: debug_statement - // cerr << "found a full mapping at " << _graph.get_id(handle) - // << " w/ seq " << _graph.get_sequence(handle) << endl; - return; - } else { - // this is a potential starting handle for the path. Add as a - // possible_path. 
- vector possible_path_handle_vec{handle}; - possible_paths.push_back( - make_tuple(possible_path_handle_vec, starting_index, - handle_seq.size() - starting_index)); - } - } - } - } - - // //todo: debug_statement: - // cerr << "done checking handles as start of path seq" << endl; - - // //TODO: debug_statement: - // cerr << "possible paths so far: " << endl; - // for (tuple, int, int> path : possible_paths) { - // cerr << " possible start: "; - // for (handle_t handle : get<0>(path)) { - // cerr << _graph.get_id(handle) << " "; - // } - // cerr << endl; - // } - - // for every possible path, extend it to determine if it really is the path we're - // looking for: - while (!possible_paths.empty()) { - // take a path off of possible_paths, which will be copied for every iteration - // through _graph.follow_edges, below: - tuple, int, int> possible_path_query = possible_paths.back(); - possible_paths.pop_back(); - - // //TODO: debug_statement: - // for (tuple, int, int> path : possible_paths) { - // cerr << "*\tpossible path query: "; - // for (handle_t handle : get<0>(possible_path_query)) { - // cerr << _graph.get_id(handle) << " " << _graph.get_sequence(handle) - // << " "; - // } - // cerr << endl; - // } - - // extend the path through all right-extending edges to see if any subsequent - // paths still satisfy the requirements for being a possible_path: - bool no_path = _graph.follow_edges( - get<0>(possible_path_query).back(), false, [&](const handle_t &next) { - // //todo: debug_statement - // cerr << "cur handle id: " - // << _graph.get_id(get<0>(possible_path_query).back()) << endl; - - // cerr << "next handle id and seq: " << _graph.get_id(next) << " " - // << _graph.get_sequence(next) << endl; - // make a copy to be extended for through each possible next handle in - // follow edges. - tuple, int, int> possible_path = possible_path_query; - - // extract relevant information to make code more readable. 
- string next_seq = _graph.get_sequence(next); - id_t next_id = _graph.get_id(next); - int &cur_index_in_path = get<2>(possible_path); - if (cur_index_in_path <= path_seq.size() && - (find(new_snarl_handles.cbegin(), new_snarl_handles.cend(), next) != - new_snarl_handles.cend())) { - // if the next handle would be the ending handle for the path, - if (next_seq.size() >= (path_seq.size() - cur_index_in_path)) { - // cerr << "next handle would be the ending handle for the path" - // << endl; - // check to see if the sequence in the handle is suitable - // for ending the path: - int compare_length = path_seq.size() - cur_index_in_path; - - // //todo: debug_statement - // cerr << "about to compare. compare val: " - // << (next_seq.compare(0, compare_length, path_seq, - // cur_index_in_path, compare_length) == - // 0) - // << " source_and_sink_handles_map " - // << source_and_sink_handles_map_properly( - // _graph, new_source_id, new_sink_id, - // touching_source, touching_sink, - // get<0>(possible_path).front(), next) - // << endl; - // cerr << "arguments of compare: " - // << " " << 0 << " " << compare_length << " " << path_seq - // << " " << cur_index_in_path << " " << compare_length << " - // " - // << endl; - if ((next_seq.compare(0, compare_length, path_seq, - cur_index_in_path, compare_length) == 0) && - source_and_sink_handles_map_properly( - _graph, new_source_id, new_sink_id, touching_source, - touching_sink, get<0>(possible_path).front(), next)) { - // todo: debug_statement - // cerr << "compared." << endl; - - // we've found the new path! Move path to the new sequence, - // and end the function. - - if (compare_length < next_seq.size()) { - // If the path ends before the end of next_seq, then split - // the handle so that the path ends flush with the end of - // the first of the two split handles. 
- - // divide the handle where the path ends; - pair divided_next = - _graph.divide_handle(next, compare_length); - get<0>(possible_path).push_back(divided_next.first); - - // Special case if next is the sink or the source, to - // preserve the reassignment of source and sink ids in - // integrate_snarl. - if (next_id == new_sink_id) { - new_sink_id = _graph.get_id(divided_next.second); - } - - // TODO: NOTE: finding the old "next" handle is expensive. - // TODO: Use different container? - auto it = find(new_snarl_handles.begin(), - new_snarl_handles.end(), next); - - // replace the old invalidated handle with one of the new - // ones - *it = divided_next.first; - // stick the other new handle on the end of - // new_snarl_handles. - new_snarl_handles.push_back(divided_next.second); - - } else { - // otherwise, the end of the path already coincides with - // the end of the handle. In that case, just add it to the - // path. - get<0>(possible_path).push_back(next); - } - _graph.rewrite_segment(old_embedded_path.first, - old_embedded_path.second, - get<0>(possible_path)); - // //todo: debug_statement: - // cerr << "got a full path: "; - // for (handle_t handle : get<0>(possible_path)) { - // cerr << _graph.get_id(handle) << " "; - // } - // cerr << endl; - - // we've already found the path. No need to keep looking for - // more paths. 
- return false; - } - } - // see if the next handle would be the continuation of the path, but - // not the end, - else { - - // check to see if the sequence in the handle is suitable for - // extending the path: - int compare_length = next_seq.size(); - // //todo: debug_statement - // cerr << "compare returned false" << endl; - // cerr << "compare in returned false: " - // << " next_seq len " << next_seq.size() << " compare_length - // " - // << compare_length << " path_seq len " << path_seq.size() - // << " cur_index_in_path " << cur_index_in_path << endl; - // cerr << "if statement eval: cur_index_in_path <= - // next_seq.size() " - // << (cur_index_in_path <= next_seq.size()) - // << " next_seq.compare(0, compare_length, path_seq, " - // "cur_index_in_path, compare_length) == 0) " - // << (next_seq.compare(0, compare_length, path_seq, - // cur_index_in_path, compare_length) == - // 0) - // << endl; - if (next_seq.compare(0, compare_length, path_seq, - cur_index_in_path, compare_length) == 0) { - // cerr << "compared in return false" << endl; - // extend the path - get<0>(possible_path).push_back(next); - - // update the current index in path_seq. - get<2>(possible_path) += next_seq.size(); - - // place back into possible_paths - possible_paths.push_back(possible_path); - // cerr << "extending the path!" << endl; - } - } - } - // continue to iterate through follow_edges. - return true; - }); - - // //todo: debug_statement: - // if - // (graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) - // == - // "_alt_19f9bc9ad2826f58f113965edf36bb93740df46d_0") { - // cerr << "mystery node 4214930: " - // << _graph.get_sequence(graph.get_handle(4214930)) << endl; - // } - - // if we've found a complete path in the above follow_edges, then we've - // already moved the path, and we're done. - if (!no_path) { - return; - } - } - // //todo: figure out how to do some better error message instead of cerr. 
- // if we failed to find a path, show an error message. - cerr << "##########################\nWarning! Didn't find a corresponding path of " - "name " - << _graph.get_path_name(_graph.get_path_handle_of_step(old_embedded_path.first)) - << " from the old snarl at " << old_source_id - << " in the newly aligned snarl. This snarl WILL be " - "normalized, resulting in a probably incorrectly-constructed snarl." - "\n##########################" - << endl - << endl; - // throw _graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)); - // assert(true && "Warning! Didn't find a corresponding path of name " + - // _graph.get_path_name(graph.get_path_handle_of_step(old_embedded_path.first)) - // + " from the old snarl in the newly aligned snarl."); -} +// /** +// * Deletes the given handle's underlying node, and returns a new handle to a new node +// * with the desired node_id +// * +// * @param {handle_t} handle : The handle to be replaced with a new handle & new node. +// * @param {id_t} node_id : The node id for the new node. Cannot be currently in use in +// * the graph. +// * @return {handle_t} : The new handle, in the same position as the original handle +// * in the graph, but with the new node_id. +// */ +// handle_t SnarlNormalizer::change_node_id(handle_t old_handle, const id_t& new_node_id) +// { + +// } /** Used to help move_path_to_snarl map paths from an old snarl to its newly * normalized counterpart. 
In particular, ensures that any paths which touch the diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index e9d749490bb..c256c3d3c82 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -20,7 +20,7 @@ class SnarlNormalizer { virtual void normalize_top_level_snarls(ifstream &snarl_stream); - virtual vector normalize_snarl(const id_t &source_id, const id_t &sink_id); + virtual vector normalize_snarl(id_t source_id, id_t sink_id, const bool backwards); protected: // member variables: @@ -38,8 +38,8 @@ class SnarlNormalizer { // finding information on original graph: ////////////////////////////////////////////////////////////////////////////////////// - SubHandleGraph extract_subgraph(const HandleGraph &graph, const id_t &start_id, - const id_t &end_id); + SubHandleGraph extract_subgraph(const HandleGraph &graph, id_t start_id, + id_t end_id, const bool backwards); vector check_handle_as_start_of_path_seq(const string &handle_seq, const string &path_seq); @@ -50,14 +50,10 @@ class SnarlNormalizer { VG align_source_to_sink_haplotypes(unordered_set source_to_sink_haplotypes); - void integrate_snarl(const HandleGraph &new_snarl, + void integrate_snarl(SubHandleGraph &old_snarl, const HandleGraph &new_snarl, const vector> embedded_paths, - const id_t &source_id, const id_t &sink_id); + const id_t &source_id, const id_t &sink_id, const bool backwards); - void move_path_to_snarl(const pair &old_embedded_path, - vector &new_snarl_handles, id_t &new_source_id, - id_t &new_sink_id, const id_t &old_source_id, - const id_t &old_sink_id); bool source_and_sink_handles_map_properly( const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, @@ -66,6 +62,18 @@ class SnarlNormalizer { void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); + void move_path_to_snarl(const pair &old_embedded_path, + vector &new_snarl_handles, id_t 
&new_source_id, + id_t &new_sink_id, const id_t &old_source_id, + const id_t &old_sink_id, const bool backwards); + + // moving paths to new graph (new draft functions) + vector, int> > find_possible_path_starts (const handle_t& leftmost_handle, const handle_t& rightmost_handle, const pair& path_spans_left_right); + + vector extend_possible_paths(vector, int>> &possible_path_starts, const string &path_str, const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right); + + void move_path_to_new_snarl(const pair & old_path, const id_t &source, const id_t &sink, const pair &path_spans_left_right, const bool &path_directed_left_to_right); + ////////////////////////////////////////////////////////////////////////////////////// // format-type switching: ////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/algorithms/0_snarl_sequence_finder.cpp b/src/algorithms/0_snarl_sequence_finder.cpp index c2166123f6a..b3c5cbd450d 100644 --- a/src/algorithms/0_snarl_sequence_finder.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -39,8 +39,8 @@ namespace algorithms{ SnarlSequenceFinder::SnarlSequenceFinder(const PathHandleGraph & graph, const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &haploGraph, - const id_t &source_id, const id_t &sink_id) - : _graph(graph), _haploGraph(haploGraph), _snarl(snarl), _source_id(source_id), _sink_id(sink_id) {} + const id_t &source_id, const id_t &sink_id, const bool &backwards) + : _graph(graph), _haploGraph(haploGraph), _snarl(snarl), _source_id(source_id), _sink_id(sink_id), _backwards(backwards) {} // TODO: test that it successfully extracts any haplotypes that start/end in the middle of // TODO: the snarl. 
@@ -55,6 +55,15 @@ SnarlSequenceFinder::SnarlSequenceFinder(const PathHandleGraph & graph, */ tuple>, vector>, unordered_set> SnarlSequenceFinder::find_gbwt_haps() { + // If snarl has been fed to us backwards, run the algorithm with righmost_id as source + // and vice-versa. Otherwise, keep source as leftmost_id. + id_t leftmost_id = _source_id; + id_t rightmost_id = _sink_id; + if (_backwards) { + leftmost_id = _sink_id; + rightmost_id = _source_id; + } + /** * haplotype_queue contains all started exon_haplotypes not completed yet. * Every time we encounter a branch in the paths, the next node down the path @@ -64,8 +73,8 @@ SnarlSequenceFinder::find_gbwt_haps() { vector, gbwt::SearchState>> haplotype_queue; // source and sink handle for _haploGraph: - handle_t source_handle = _haploGraph.get_handle(_source_id); - handle_t sink_handle = _haploGraph.get_handle(_sink_id); + handle_t source_handle = _haploGraph.get_handle(leftmost_id); + handle_t sink_handle = _haploGraph.get_handle(rightmost_id); // place source in haplotype_queue. vector source_handle_vec(1, source_handle); @@ -130,8 +139,8 @@ SnarlSequenceFinder::find_gbwt_haps() { if (incorrect_connections.find( _snarl.edge_handle(cur_haplotype.first.back(), next_handle)) == incorrect_connections.end()) { - cerr << "_snarl starting at node " << _source_id - << " and ending at " << _sink_id + cerr << "_snarl with source " << _source_id + << " and sink " << _sink_id << " has a thread that incorrectly connects two nodes that " "don't have any edge connecting them. These two nodes are " << _haploGraph.get_id(cur_haplotype.first.back()) << " and " @@ -159,7 +168,7 @@ SnarlSequenceFinder::find_gbwt_haps() { next_handle_vec.push_back(next_handle); // if new_handle is the sink, put in haplotypes_from_source_to_sink - if (_haploGraph.get_id(next_handle) == _sink_id) { + if (_haploGraph.get_id(next_handle) == rightmost_id) { haplotypes_from_source_to_sink.push_back(next_handle_vec); } else // keep extending the haplotype! 
{ @@ -178,9 +187,9 @@ SnarlSequenceFinder::find_gbwt_haps() { other_haplotypes.push_back(cur_haplotype.first); } - // if next_handle is the sink, put in haplotypes_from_source_to_sink + // if next_handle is the "sink"/rightmost_id, put in haplotypes_from_source_to_sink else if (_haploGraph.get_id( - _haploGraph.node_to_handle(next_searches.back().node)) == _sink_id) { + _haploGraph.node_to_handle(next_searches.back().node)) == rightmost_id) { // Then we need to add cur_haplotype + next_search to // haplotypes_from_source_to_sink. handle_t next_handle = _haploGraph.node_to_handle(next_searches.back().node); @@ -246,6 +255,14 @@ SnarlSequenceFinder::find_gbwt_haps() { // snarl. vector> SnarlSequenceFinder::find_haplotypes_not_at_source(unordered_set &touched_handles) { + // If snarl has been fed to us backwards, run the algorithm with righmost_id as source + // and vice-versa. Otherwise, keep source as leftmost_id. + id_t leftmost_id = _source_id; + id_t rightmost_id = _sink_id; + if (_backwards) { + leftmost_id = _sink_id; + rightmost_id = _source_id; + } //todo: debug_statement for (handle_t handle : touched_handles){ cerr << "touched handles find_gbwt_haps: " << _graph.get_id(handle) << endl; @@ -266,7 +283,7 @@ SnarlSequenceFinder::find_haplotypes_not_at_source(unordered_set &touc // We don't need to ever check the sink handle, since paths from the sink handle // extend beyond snarl. - handle_t sink_handle = _haploGraph.get_handle(_sink_id); + handle_t sink_handle = _haploGraph.get_handle(rightmost_id); // touched_handles.erase(sink_handle); // Nested function for making a new_search. Identifies threads starting at a given @@ -347,7 +364,7 @@ SnarlSequenceFinder::find_haplotypes_not_at_source(unordered_set &touc // if next_search is on the sink_handle, // then cur_haplotype.first + next_search goes to finished_haplotypes. 
- else if (_haploGraph.get_id(next_handle) == _sink_id) { + else if (_haploGraph.get_id(next_handle) == rightmost_id) { // copy over the vector of cur_haplotype: vector next_handle_vec(cur_haplotype.first); @@ -440,10 +457,11 @@ SnarlSequenceFinder::find_embedded_paths() { path_handle_t path = _graph.get_path_handle_of_step(step); cerr << "found a path. Is it new?" << endl; // If it's a step along a new path, save the first step to that path we find. - // In addtion, if there are multiple steps found in the path, (The avoidance + // (The avoidance // of source and sink here is to ensure that we can properly check to see if // we've reached the end of an embedded path walking in any arbitrary // direction (i.e. source towards sink or sink towards source). + //todo: should the following if statement only contain the first conditional? The other two conditionals don't do what the comment says, and also don't seem to make sense. if (paths_found.find(path) == paths_found.end() || _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _source_id || _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _sink_id) { @@ -479,7 +497,7 @@ SnarlSequenceFinder::find_embedded_paths() { id_t begin_in_snarl_id = _graph.get_id(_graph.get_handle_of_step(begin_in_snarl_step)); - while ((begin_in_snarl_id != _source_id) && + while (((begin_in_snarl_id != _source_id)) && _graph.has_previous_step(begin_in_snarl_step)) { begin_in_snarl_step = _graph.get_previous_step(begin_in_snarl_step); begin_in_snarl_id = @@ -493,19 +511,47 @@ SnarlSequenceFinder::find_embedded_paths() { // while (end_in_snarl_id != source_id and end_in_snarl_id != sink_id and // _graph.has_next_step(end_in_snarl_step)) { - while (end_in_snarl_id != _sink_id and _graph.has_next_step(end_in_snarl_step)) { + while ((end_in_snarl_id != _sink_id) and _graph.has_next_step(end_in_snarl_step)) { end_in_snarl_step = _graph.get_next_step(end_in_snarl_step); end_in_snarl_id = 
_graph.get_id(_graph.get_handle_of_step(end_in_snarl_step)); } // Note: when adding the end step, path notation convention requires that we add // the null step at the end of the path (or the next arbitrary step, in the case // of a path that extends beyond our snarl.) - // TODO: do we want the next arbitrary step in that latter case? path_in_snarl.second = _graph.get_next_step(end_in_snarl_step); paths_in_snarl.push_back(path_in_snarl); } + //todo: move the following to unit tests: + cerr << "************UNIT_TEST for find_embedded_paths************" << endl; + unordered_set path_names; + for (auto path : paths_in_snarl) { + if (!(_graph.get_id(_graph.get_handle_of_step(path.first)) == _source_id)) { + cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " doesn't start at source of snarl. " << " source: " << _source_id << "; start of path: " << _graph.get_id(_graph.get_handle_of_step(path.first)) << endl; + } + if (!(_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == _sink_id)) { + cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.second)) << " doesn't end at sink of snarl. " << " source: " << _sink_id << "; end of path: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; + cerr << "note that the 'true' end of the path is one step further than the sink. Print statement above corrects for that convention." << endl; + } + if (!(path_names.find(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))) == path_names.end())) { + cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.second)) << " has been found more than once in find_embedded_paths, when it should only have been extracted once. " << endl; + } + path_names.emplace(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))); + } + if ((path_names.size() == 0)) { + cerr << "no embedded paths found in find_embedded_paths." 
<< endl; + } + // for (auto path : paths_in_snarl) { + // cerr << "path starts at source? " << (_graph.get_id(_graph.get_handle_of_step(path.first)) == _source_id) << endl; + // cerr << "path ends at sink? " << (_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == _sink_id) << endl; + + // cerr << "is a path a duplicate of one we've already extracted? " << (path_names.find(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))) == path_names.end()) << endl; + // path_names.emplace(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))); + // } + // cerr << "tested " << path_names.size() << " paths in UNIT_TEST." << endl; + cerr << "************END-UNIT_TEST for find_embedded_paths. Tested " << path_names.size() << " paths in UNIT_TEST.************"<< endl; + return paths_in_snarl; } diff --git a/src/algorithms/0_snarl_sequence_finder.hpp b/src/algorithms/0_snarl_sequence_finder.hpp index 9bce63d53c9..62175836548 100644 --- a/src/algorithms/0_snarl_sequence_finder.hpp +++ b/src/algorithms/0_snarl_sequence_finder.hpp @@ -12,7 +12,7 @@ class SnarlSequenceFinder { SnarlSequenceFinder(const PathHandleGraph & graph, const SubHandleGraph &snarl, const gbwtgraph::GBWTGraph &haploGraph, const id_t &source_id, - const id_t &sink_id); + const id_t &sink_id, const bool &backwards); tuple>, vector>, unordered_set> find_gbwt_haps(); @@ -32,6 +32,7 @@ class SnarlSequenceFinder { const gbwtgraph::GBWTGraph &_haploGraph; const id_t &_source_id; const id_t &_sink_id; + const bool &_backwards; vector> find_haplotypes_not_at_source(unordered_set &touched_handles); From d1468ecd39d9f138b1b2e61f45c9b26488638ec1 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 26 Feb 2021 09:18:54 -0800 Subject: [PATCH 52/63] right-to-left directed snarls are now supported in normalize_snarls. 
--- src/algorithms/0_oo_normalize_snarls.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 3ad60f8078f..2750fece7e9 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -843,6 +843,7 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, } } + // /** // * Deletes the given handle's underlying node, and returns a new handle to a new node // * with the desired node_id From 18625c2ed82740fd323afce3f0d57fe27029ce85 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 26 Feb 2021 11:42:05 -0800 Subject: [PATCH 53/63] subdivided part of integrate_snarl into new overwrite_node_id fxn. --- src/algorithms/0_oo_normalize_snarls.cpp | 102 +++++++++-------------- src/algorithms/0_oo_normalize_snarls.hpp | 6 +- 2 files changed, 40 insertions(+), 68 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 2750fece7e9..17a9804021a 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -788,76 +788,52 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, handle_t new_rightmost_handle; if (!backwards) { - new_leftmost_handle = _graph.create_handle( - _graph.get_sequence(_graph.get_handle(temp_snarl_leftmost_id)), source_id); - new_rightmost_handle = _graph.create_handle( - _graph.get_sequence(new_snarl_topo_order.back()), sink_id); + new_leftmost_handle = overwrite_node_id(temp_snarl_leftmost_id, source_id); + new_rightmost_handle = overwrite_node_id(temp_snarl_rightmost_id, sink_id); } else { - new_leftmost_handle = _graph.create_handle( - _graph.get_sequence(_graph.get_handle(temp_snarl_leftmost_id)), sink_id); - new_rightmost_handle = _graph.create_handle( - _graph.get_sequence(new_snarl_topo_order.back()), source_id); - } - // move the source edges: - // TODO: note the copy/paste. Fix? 
- _graph.follow_edges(_graph.get_handle(temp_snarl_leftmost_id), true, - [&](const handle_t &prev_handle) { - _graph.create_edge(prev_handle, new_leftmost_handle); - }); - _graph.follow_edges(_graph.get_handle(temp_snarl_leftmost_id), false, - [&](const handle_t &next_handle) { - _graph.create_edge(new_leftmost_handle, next_handle); - }); - - // move the sink edges: - _graph.follow_edges(_graph.get_handle(temp_snarl_rightmost_id), true, - [&](const handle_t &prev_handle) { - _graph.create_edge(prev_handle, new_rightmost_handle); - }); - _graph.follow_edges(_graph.get_handle(temp_snarl_rightmost_id), false, - [&](const handle_t &next_handle) { - _graph.create_edge(new_rightmost_handle, next_handle); - }); - - // move the paths: - _graph.for_each_step_on_handle( - _graph.get_handle(temp_snarl_leftmost_id), [&](step_handle_t step) { - _graph.rewrite_segment(step, _graph.get_next_step(step), - vector{new_leftmost_handle}); - }); - _graph.for_each_step_on_handle( - _graph.get_handle(temp_snarl_rightmost_id), [&](step_handle_t step) { - _graph.rewrite_segment(step, _graph.get_next_step(step), - vector{new_rightmost_handle}); - }); - cerr << "the temp leftmost id: " << temp_snarl_leftmost_id << endl; - cerr << "the temp rightmost id: " << temp_snarl_rightmost_id << endl; - - // delete the previously created source and sink: - for (handle_t handle : {_graph.get_handle(temp_snarl_leftmost_id), - _graph.get_handle(temp_snarl_rightmost_id)}) { - cerr << "id of handle to delete from the old source/sink: " << _graph.get_id(handle) << endl; - _graph.destroy_handle(handle); + new_leftmost_handle = overwrite_node_id(temp_snarl_leftmost_id, sink_id); + new_rightmost_handle = overwrite_node_id(temp_snarl_rightmost_id, source_id); } } -// /** -// * Deletes the given handle's underlying node, and returns a new handle to a new node -// * with the desired node_id -// * -// * @param {handle_t} handle : The handle to be replaced with a new handle & new node. 
-// * @param {id_t} node_id : The node id for the new node. Cannot be currently in use in -// * the graph. -// * @return {handle_t} : The new handle, in the same position as the original handle -// * in the graph, but with the new node_id. -// */ -// handle_t SnarlNormalizer::change_node_id(handle_t old_handle, const id_t& new_node_id) -// { - -// } +/** + * Deletes the given handle's underlying node, and returns a new handle to a new node + * with the desired node_id + * + * @param {id_t} handle : The old node id, to be replaced with a new node id. + * @param {id_t} node_id : The node id for the new node. Cannot be currently in use in + * the graph. + * @return {handle_t} : The new handle, in the same position as the original handle + * in the graph, but with the new node_id. + */ +handle_t SnarlNormalizer::overwrite_node_id(const id_t& old_node_id, const id_t& new_node_id) +{ + handle_t old_handle = _graph.get_handle(old_node_id); + handle_t new_handle = _graph.create_handle(_graph.get_sequence(old_handle), new_node_id); + + // move the edges: + _graph.follow_edges(old_handle, true, [&](const handle_t &prev_handle) + { + _graph.create_edge(prev_handle, new_handle); + }); + _graph.follow_edges(old_handle, false, [&](const handle_t &next_handle) + { + _graph.create_edge(new_handle, next_handle); + }); + + // move the paths: + _graph.for_each_step_on_handle(old_handle, [&](step_handle_t step) + { + _graph.rewrite_segment(step, _graph.get_next_step(step), vector{new_handle}); + }); + + // delete the old_handle: + _graph.destroy_handle(old_handle); + return new_handle; +} /** Used to help move_path_to_snarl map paths from an old snarl to its newly * normalized counterpart. 
In particular, ensures that any paths which touch the diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index c256c3d3c82..50750cda616 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -54,6 +54,7 @@ class SnarlNormalizer { const vector> embedded_paths, const id_t &source_id, const id_t &sink_id, const bool backwards); + handle_t overwrite_node_id(const id_t& old_node_id, const id_t& new_node_id); bool source_and_sink_handles_map_properly( const HandleGraph &graph, const id_t &new_source_id, const id_t &new_sink_id, @@ -62,11 +63,6 @@ class SnarlNormalizer { void force_maximum_handle_size(MutableHandleGraph &graph, const size_t &max_size); - void move_path_to_snarl(const pair &old_embedded_path, - vector &new_snarl_handles, id_t &new_source_id, - id_t &new_sink_id, const id_t &old_source_id, - const id_t &old_sink_id, const bool backwards); - // moving paths to new graph (new draft functions) vector, int> > find_possible_path_starts (const handle_t& leftmost_handle, const handle_t& rightmost_handle, const pair& path_spans_left_right); From adbeb25b316d8bb12eed5ce51cfb211b328274ce Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 12 Mar 2021 11:32:55 -0800 Subject: [PATCH 54/63] added a few unit tests directly integrated with code; only prints if failed. 
--- .../0_move_embedded_paths_to_new_snarl.cpp | 49 +++++++++-------- src/algorithms/0_oo_normalize_snarls.cpp | 53 +++++++++++-------- src/algorithms/0_oo_normalize_snarls.hpp | 4 +- src/algorithms/0_snarl_sequence_finder.cpp | 37 +++++++------ 4 files changed, 79 insertions(+), 64 deletions(-) diff --git a/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp index cce5d2769e9..c8d0dbc4556 100644 --- a/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp +++ b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp @@ -117,12 +117,12 @@ vector SnarlNormalizer::extend_possible_paths(vector SnarlNormalizer::extend_possible_paths(vector} path_spans_left_right : * @param {bool} path_directed_left_to_right : + * @return {pair} : */ -void SnarlNormalizer::move_path_to_new_snarl(const pair & old_path, const id_t &leftmost_id, const id_t &rightmost_id, const pair &path_spans_left_right, const bool &path_directed_left_to_right) +pair SnarlNormalizer::move_path_to_new_snarl(const pair & old_path, const id_t &leftmost_id, const id_t &rightmost_id, const pair &path_spans_left_right, const bool &path_directed_left_to_right) { /* * This should return the series of handles, from left to right if path_left_to_right==true (else vice-versa), that the path should move to. @@ -152,17 +153,21 @@ void SnarlNormalizer::move_path_to_new_snarl(const pair no_path; - return; + pair no_path; + return no_path; } // get the path_string from the handles in the old_path: string path_str; step_handle_t cur_step = old_path.first; string path_name = _graph.get_path_name(_graph.get_path_handle_of_step(old_path.first)); // used for unit tests at bottom. + vector old_path_location; while (cur_step != old_path.second) { + // cerr << "orientation of cur_step: " << _graph.apply_orientation path_str += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); + //todo: note following line of for loop is for debug purposes. delete? 
+ old_path_location.push_back(_graph.get_handle_of_step(cur_step)); cur_step = _graph.get_next_step(cur_step); } @@ -199,28 +204,29 @@ void SnarlNormalizer::move_path_to_new_snarl(const pair, int>> possible_path_starts = find_possible_path_starts(leftmost_handle, rightmost_handle, path_spans_left_right); // cerr << "size of possible_path_starts: " << possible_path_starts.size() << endl; vector new_path_location = extend_possible_paths(possible_path_starts, path_str, leftmost_handle, rightmost_handle, path_spans_left_right); // cerr << "size of new_path_location: " << new_path_location.size() << endl; - //todo! debug comment_out below. + // flip the order of the handles if the path moves right-to-left. if (!path_directed_left_to_right) { std::reverse(new_path_location.begin(), new_path_location.end()); for (int i = 0; i != new_path_location.size(); i++) { - new_path_location[i] = _graph.flip(new_path_location[i]); + // new_path_location[i] = _graph.app(new_path_location[i]); + // cerr << "path_name: " << path_name << endl; + // cerr << "handle_id: " << _graph.get_id(new_path_location[i]) << endl; + // cerr << "handle seq: " << _graph.get_sequence(new_path_location[i]) << endl; + // cerr << "is the handle reversed?: " << _graph.get_is_reverse(new_path_location[i]) << endl; + // if (path_name != "CBS432.chrXIV") + // { + new_path_location[i] = _graph.flip(new_path_location[i]); + // } + // cerr << "is the handle reversed?: " << _graph.get_is_reverse(new_path_location[i]) << endl; + // cerr << "is the handle at old_path in this pos reversed? " << _graph.get_is_reverse(old_path_location[i]) << endl; } } @@ -249,14 +255,12 @@ void SnarlNormalizer::move_path_to_new_snarl(const pair new_path = _graph.rewrite_segment(old_path.first, old_path.second, new_path_location); - cerr << "************UNIT_TEST for extend_possible_paths************" << endl; // Test that the new path exists. 
if (new_path_location.size() == 0) { + cerr << "************in UNIT_TEST for move_path_to_new_snarl************" << endl; cerr << "no new path location found." << endl; } // Test that the new path seq = old path seq. @@ -290,15 +294,16 @@ void SnarlNormalizer::move_path_to_new_snarl(const pairstart().node_id() << endl; + // cerr << "normalized snarl starting at: " << roots->start().node_id() << endl; } else { // else, there was an error. Track which errors caused the snarl to not // normalize. @@ -118,7 +118,6 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { } num_snarls_skipped += 1; } - //todo! Make unit test for shrinking snarls. // //todo: debug_statement for extracting snarl of interest. // VG outGraph; @@ -264,7 +263,7 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const // TODO: accounted for in the code, remove next chunk of code that finds // TODO: source-to-sink paths. // find the paths that stretch from source to sink: - cerr << "~~~~~~~~~~source: " << source_id << "sink: " << sink_id << endl; + // cerr << "~~~~~~~~~~source: " << source_id << "sink: " << sink_id << endl; for (auto path : embedded_paths) { // cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with source " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; @@ -344,6 +343,7 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const } // todo: decide if we should only normalize snarls that decrease in size. 
if (error_record[5] > error_record[4]) { + cerr << "**************************in UNIT-TEST for normalize_snarl: **************************" << endl; cerr << "NOTE: normalized a snarl which *increased* in sequence quantity, " "starting at " << source_id << endl @@ -633,12 +633,14 @@ SubHandleGraph SnarlNormalizer::extract_subgraph(const HandleGraph &graph, // old_embedded_path, and pair.second is the step_handle *after* // the last step_handle of interest in the old_embedded_path (can // be the null step at the end of the path.) +// Note: these paths will be altered to represent the way they +// overlap in the new snarl. Otherwise, they would be invalidated. // source_id: the source of the old (to be replaced) snarl in _graph // sink_id: the sink of the old (to be replaced) snarl in _graph. // Return: None. void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, const HandleGraph &to_insert_snarl, - const vector> embedded_paths, + vector>& embedded_paths, const id_t &source_id, const id_t &sink_id, const bool backwards) { // cerr << "integrate_snarl" << endl; @@ -750,7 +752,8 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, }); } // For each path of interest, move it onto the new_snarl. 
- for (auto path : embedded_paths) { + for (int i = 0; i != embedded_paths.size(); i++) + { // //todo: debug_statement // cerr << "the new sink id: " << temp_snarl_rightmost_id << endl; // //todo: debug_statement @@ -766,19 +769,19 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, // path_spans_left_right.second = (!backwards && _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == sink_id) || (backwards && _graph.get_id(_graph.get_handle_of_step(path.first)) == sink_id); // cerr << "first: " << path_spans_left_right.first << "second: " << path_spans_left_right.second << endl; pair path_spans_left_right; - path_spans_left_right.first = (_graph.get_id(_graph.get_handle_of_step(path.first)) == source_id); - path_spans_left_right.second = (_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == sink_id); - - move_path_to_new_snarl(path, temp_snarl_leftmost_id, temp_snarl_rightmost_id, path_spans_left_right, !backwards); + path_spans_left_right.first = (_graph.get_id(_graph.get_handle_of_step(embedded_paths[i].first)) == source_id); + path_spans_left_right.second = (_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(embedded_paths[i].second))) == sink_id); + embedded_paths[i] = move_path_to_new_snarl(embedded_paths[i], temp_snarl_leftmost_id, temp_snarl_rightmost_id, path_spans_left_right, !backwards); } // Destroy the old snarl. 
- old_snarl.for_each_handle([&](const handle_t &handle) { - // //todo: debug_statement these are the handles in old_snarl: - // cerr << old_snarl.get_id(handle) << old_snarl.get_sequence(handle) << endl; - _graph.destroy_handle(handle); - }); + old_snarl.for_each_handle([&](const handle_t &handle) + { + // //todo: debug_statement these are the handles in old_snarl: + // cerr << old_snarl.get_id(handle) << old_snarl.get_sequence(handle) << endl; + _graph.destroy_handle(handle); + }); // Replace the source and sink handles with ones that have the original source/sink id // (for compatibility with future iterations on neighboring top-level snarls using the @@ -795,7 +798,7 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, { new_leftmost_handle = overwrite_node_id(temp_snarl_leftmost_id, sink_id); new_rightmost_handle = overwrite_node_id(temp_snarl_rightmost_id, source_id); - } + } } @@ -827,6 +830,11 @@ handle_t SnarlNormalizer::overwrite_node_id(const id_t& old_node_id, const id_t& // move the paths: _graph.for_each_step_on_handle(old_handle, [&](step_handle_t step) { + handle_t properly_oriented_old_handle = _graph.get_handle_of_step(step); + if (_graph.get_is_reverse(properly_oriented_old_handle) != _graph.get_is_reverse(new_handle)) + { + new_handle = _graph.flip(new_handle); + } _graph.rewrite_segment(step, _graph.get_next_step(step), vector{new_handle}); }); @@ -834,7 +842,6 @@ handle_t SnarlNormalizer::overwrite_node_id(const id_t& old_node_id, const id_t& _graph.destroy_handle(old_handle); return new_handle; } - /** Used to help move_path_to_snarl map paths from an old snarl to its newly * normalized counterpart. 
In particular, ensures that any paths which touch the * source and/or sink of the old snarl still do so in the new snarl (which is @@ -968,27 +975,27 @@ SnarlNormalizer::debug_get_sources_and_sinks(const HandleGraph &graph) { // identify sources and sinks graph.for_each_handle([&](const handle_t &handle) { //todo: debug_statements in code below: - cerr << "identifying if " << graph.get_id(handle) << "is a source/sink." < source_to_sink_haplotypes); void integrate_snarl(SubHandleGraph &old_snarl, const HandleGraph &new_snarl, - const vector> embedded_paths, + vector>& embedded_paths, const id_t &source_id, const id_t &sink_id, const bool backwards); handle_t overwrite_node_id(const id_t& old_node_id, const id_t& new_node_id); @@ -68,7 +68,7 @@ class SnarlNormalizer { vector extend_possible_paths(vector, int>> &possible_path_starts, const string &path_str, const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right); - void move_path_to_new_snarl(const pair & old_path, const id_t &source, const id_t &sink, const pair &path_spans_left_right, const bool &path_directed_left_to_right); + pair move_path_to_new_snarl(const pair & old_path, const id_t &source, const id_t &sink, const pair &path_spans_left_right, const bool &path_directed_left_to_right); ////////////////////////////////////////////////////////////////////////////////////// // format-type switching: diff --git a/src/algorithms/0_snarl_sequence_finder.cpp b/src/algorithms/0_snarl_sequence_finder.cpp index b3c5cbd450d..b23bbb84282 100644 --- a/src/algorithms/0_snarl_sequence_finder.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -228,14 +228,14 @@ SnarlSequenceFinder::find_gbwt_haps() { move(haplotypes_not_starting_at_source.begin(), haplotypes_not_starting_at_source.end(), back_inserter(other_haplotypes)); - //todo: debug_statement - cerr << "lets look through all the haplotypes after extraction:" << endl; - for (vector hap_vec : haplotypes_from_source_to_sink) 
{ - cerr << "new hap:" << endl; - for (handle_t handle : hap_vec){ - cerr << _haploGraph.get_id(handle) << " " << _haploGraph.get_sequence(handle) << endl; - } - } + // //todo: debug_statement + // cerr << "lets look through all the haplotypes after extraction:" << endl; + // for (vector hap_vec : haplotypes_from_source_to_sink) { + // cerr << "new hap:" << endl; + // for (handle_t handle : hap_vec){ + // cerr << _haploGraph.get_id(handle) << " " << _haploGraph.get_sequence(handle) << endl; + // } + // } return tuple>, vector>, unordered_set>{haplotypes_from_source_to_sink, @@ -263,10 +263,10 @@ SnarlSequenceFinder::find_haplotypes_not_at_source(unordered_set &touc leftmost_id = _sink_id; rightmost_id = _source_id; } - //todo: debug_statement - for (handle_t handle : touched_handles){ - cerr << "touched handles find_gbwt_haps: " << _graph.get_id(handle) << endl; - } + // //todo: debug_statement + // for (handle_t handle : touched_handles){ + // cerr << "touched handles find_gbwt_haps: " << _graph.get_id(handle) << endl; + // } // cerr << "find_haplotypes_not_at_source" << endl; /// Search every handle in touched handles for haplotypes starting at that point. @@ -450,12 +450,12 @@ SnarlSequenceFinder::find_embedded_paths() { // look for handles with paths we haven't touched yet. _snarl.for_each_handle([&](const handle_t &handle) { - cerr << "looking for paths at handle " << _graph.get_id(handle) << endl; + // cerr << "looking for paths at handle " << _graph.get_id(handle) << endl; vector steps = _graph.steps_of_handle(handle); // do any of these steps belong to a path not in paths_found? for (step_handle_t &step : steps) { path_handle_t path = _graph.get_path_handle_of_step(step); - cerr << "found a path. Is it new?" << endl; + // cerr << "found a path. Is it new?" << endl; // If it's a step along a new path, save the first step to that path we find. 
// (The avoidance // of source and sink here is to ensure that we can properly check to see if @@ -465,7 +465,7 @@ SnarlSequenceFinder::find_embedded_paths() { if (paths_found.find(path) == paths_found.end() || _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _source_id || _graph.get_id(_graph.get_handle_of_step(paths_found[path])) == _sink_id) { - cerr << "found a new path." << endl; + // cerr << "found a new path." << endl; // then we need to mark it as found and save the step. paths_found[path] = step; } @@ -524,22 +524,25 @@ SnarlSequenceFinder::find_embedded_paths() { } //todo: move the following to unit tests: - cerr << "************UNIT_TEST for find_embedded_paths************" << endl; unordered_set path_names; for (auto path : paths_in_snarl) { if (!(_graph.get_id(_graph.get_handle_of_step(path.first)) == _source_id)) { + cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " doesn't start at source of snarl. " << " source: " << _source_id << "; start of path: " << _graph.get_id(_graph.get_handle_of_step(path.first)) << endl; } if (!(_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == _sink_id)) { + cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.second)) << " doesn't end at sink of snarl. " << " source: " << _sink_id << "; end of path: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; cerr << "note that the 'true' end of the path is one step further than the sink. Print statement above corrects for that convention." 
<< endl; } if (!(path_names.find(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))) == path_names.end())) { + cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.second)) << " has been found more than once in find_embedded_paths, when it should only have been extracted once. " << endl; } path_names.emplace(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))); } if ((path_names.size() == 0)) { + cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; cerr << "no embedded paths found in find_embedded_paths." << endl; } // for (auto path : paths_in_snarl) { @@ -550,7 +553,7 @@ SnarlSequenceFinder::find_embedded_paths() { // path_names.emplace(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))); // } // cerr << "tested " << path_names.size() << " paths in UNIT_TEST." << endl; - cerr << "************END-UNIT_TEST for find_embedded_paths. Tested " << path_names.size() << " paths in UNIT_TEST.************"<< endl; + // cerr << "************END-UNIT_TEST for find_embedded_paths. 
Tested " << path_names.size() << " paths in UNIT_TEST.************"<< endl; return paths_in_snarl; } From 157dc0b6d196f9477386fd8ecb894b32626a9527 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Sat, 20 Mar 2021 11:36:28 -0700 Subject: [PATCH 55/63] commented out debug code --- src/algorithms/0_oo_normalize_snarls.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 540a1cf1282..c7202fb3858 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -74,8 +74,8 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { pair snarl_sequence_change; // // //todo: debug_code - int stop_size = 1; - int num_snarls_touched = 0; + // int stop_size = 1; + // int num_snarls_touched = 0; // int skip_first_few = 2; //#1, node 3702578 is a cyclic snarl. Don't recall about #0. #2 also cyclic. Looks like cyclic snarls weren't buggy? // int skipped = 0; @@ -87,11 +87,11 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { // skipped++; // continue; // } - if (num_snarls_touched == stop_size){ - break; - } else { - num_snarls_touched++; - } + // if (num_snarls_touched == stop_size){ + // break; + // } else { + // num_snarls_touched++; + // } // if (roots->start().node_id() == 3881494) { // cerr << "root backwards?" 
<< roots->start().backward() << endl; // cerr << "disambiguating snarl #" From b47a8b867e2dd93b6bce8692da9879005a41851a Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Sat, 20 Mar 2021 17:11:36 -0700 Subject: [PATCH 56/63] more informative debug info --- src/algorithms/0_move_embedded_paths_to_new_snarl.cpp | 4 ++++ src/algorithms/0_snarl_sequence_finder.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp index c8d0dbc4556..5f4f613fb79 100644 --- a/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp +++ b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp @@ -120,6 +120,7 @@ vector SnarlNormalizer::extend_possible_paths(vector SnarlNormalizer::move_path_to_new_snarl(const if (!(path_spans_left_right.first and path_spans_left_right.second)) { cerr << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" << endl; + cerr << "in snarl with leftmost_id: " << leftmost_id << " and rightmost_id " << rightmost_id << ":" << endl; cerr << "PATH DOESN'T SPAN SOURCE AND SINK! THIS IS CURRENTLY UNSUPPORTED. SNARL WILL BE NORMALIZED, BUT PATH WON'T BE INCLUDED." << endl; cerr << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" << endl; pair no_path; @@ -261,6 +263,7 @@ pair SnarlNormalizer::move_path_to_new_snarl(const if (new_path_location.size() == 0) { cerr << "************in UNIT_TEST for move_path_to_new_snarl************" << endl; + cerr << "in snarl with leftmost_id: " << _graph.get_id(leftmost_handle) << " and rightmost_id " << _graph.get_id(rightmost_handle) << ":" << endl; cerr << "no new path location found." << endl; } // Test that the new path seq = old path seq. 
@@ -295,6 +298,7 @@ pair SnarlNormalizer::move_path_to_new_snarl(const if (old_path_str != new_path_str) { cerr << "************in UNIT_TEST for move_path_to_new_snarl************" << endl; + cerr << "in snarl with leftmost_id: " << _graph.get_id(leftmost_handle) << " and rightmost_id " << _graph.get_id(rightmost_handle) << ":" << endl; cerr << "Once the path was moved into the new snarl, it didn't have the same sequence." << endl; cerr << "original seq: " << old_path_str << endl; cerr << " new seq: " << new_path_str << endl; diff --git a/src/algorithms/0_snarl_sequence_finder.cpp b/src/algorithms/0_snarl_sequence_finder.cpp index b23bbb84282..c782770834a 100644 --- a/src/algorithms/0_snarl_sequence_finder.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -528,21 +528,25 @@ SnarlSequenceFinder::find_embedded_paths() { for (auto path : paths_in_snarl) { if (!(_graph.get_id(_graph.get_handle_of_step(path.first)) == _source_id)) { cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; + cerr << "in snarl with source: " << _source_id << " and sink " << _sink_id << ":" << endl; cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " doesn't start at source of snarl. " << " source: " << _source_id << "; start of path: " << _graph.get_id(_graph.get_handle_of_step(path.first)) << endl; } if (!(_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) == _sink_id)) { cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; + cerr << "in snarl with source: " << _source_id << " and sink " << _sink_id << ":" << endl; cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.second)) << " doesn't end at sink of snarl. " << " source: " << _sink_id << "; end of path: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; cerr << "note that the 'true' end of the path is one step further than the sink. 
Print statement above corrects for that convention." << endl; } if (!(path_names.find(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))) == path_names.end())) { cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; + cerr << "in snarl with source: " << _source_id << " and sink " << _sink_id << ":" << endl; cerr << "path " << _graph.get_path_name(_graph.get_path_handle_of_step(path.second)) << " has been found more than once in find_embedded_paths, when it should only have been extracted once. " << endl; } path_names.emplace(_graph.get_path_name(_graph.get_path_handle_of_step(path.first))); } if ((path_names.size() == 0)) { cerr << "************in UNIT_TEST for find_embedded_paths************" << endl; + cerr << "in snarl with source: " << _source_id << " and sink " << _sink_id << ":" << endl; cerr << "no embedded paths found in find_embedded_paths." << endl; } // for (auto path : paths_in_snarl) { From f6ebd499b71ce3dbd364a7d00b5c658600ac876e Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Wed, 24 Mar 2021 14:22:57 -0700 Subject: [PATCH 57/63] update to most recent vg --- src/algorithms/0_oo_normalize_snarls.cpp | 5 ++--- src/algorithms/0_oo_normalize_snarls.hpp | 1 - src/algorithms/0_snarl_sequence_finder.cpp | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index c7202fb3858..39440c26f3b 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -16,7 +16,6 @@ #include "../msa_converter.hpp" #include "../snarls.hpp" #include "../vg.hpp" -#include "is_acyclic.hpp" #include "../types.hpp" #include "extract_containing_graph.hpp" @@ -201,7 +200,7 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const // }); // }); - if (!algorithms::is_acyclic(&snarl)) { + if (!handlealgs::is_acyclic(&snarl)) { cerr << "snarl at " << source_id << " is cyclic. 
Skipping." << endl; error_record[3] = true; return error_record; @@ -681,7 +680,7 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, // add to_insert_snarl into _graph without directly attaching the snarl to the _graph // (yet). vector to_insert_snarl_topo_order = - algorithms::lazier_topological_order(&to_insert_snarl); + handlealgs::lazier_topological_order(&to_insert_snarl); // Construct a parallel new_snarl_topo_order to identify // paralogous nodes between to_insert_snarl and the new snarl inserted in _graph. diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index e12cfc89dac..006bab073ea 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -2,7 +2,6 @@ #include "../handle.hpp" #include "../subgraph.hpp" #include "../vg.hpp" -#include "count_walks.hpp" #include #include diff --git a/src/algorithms/0_snarl_sequence_finder.cpp b/src/algorithms/0_snarl_sequence_finder.cpp index c782770834a..0efad1d05eb 100644 --- a/src/algorithms/0_snarl_sequence_finder.cpp +++ b/src/algorithms/0_snarl_sequence_finder.cpp @@ -26,7 +26,7 @@ // #include "../msa_converter.hpp" // #include "vg.hpp" -#include "topological_sort.hpp" +// #include "topological_sort.hpp" #include @@ -611,7 +611,7 @@ SnarlSequenceFinder::find_exhaustive_paths() { // count walks by dynamic programming bool overflowed = false; - for (const handle_t &handle : lazier_topological_order(&_snarl)) { + for (const handle_t &handle : handlealgs::lazier_topological_order(&_snarl)) { touched_handles.emplace(handle); size_t count_here = count[handle]; vector seqs_here = sequences[handle]; From 4629d2a57d3899adf7041329699bd9c4cacfdc3e Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Thu, 25 Mar 2021 11:19:20 -0700 Subject: [PATCH 58/63] minor edits --- .gitmodules | 3 +++ deps/seqan | 1 + src/algorithms/0_oo_normalize_snarls.cpp | 12 +++++++----- 3 files changed, 11 insertions(+), 5 deletions(-) 
create mode 160000 deps/seqan diff --git a/.gitmodules b/.gitmodules index 9818068ee35..fc28ba982c6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -128,3 +128,6 @@ [submodule "deps/atomic_queue"] path = deps/atomic_queue url = https://github.com/max0x7ba/atomic_queue.git +[submodule "deps/seqan"] + path = deps/seqan + url = https://github.com/seqan/seqan.git diff --git a/deps/seqan b/deps/seqan new file mode 160000 index 00000000000..f5f658343c3 --- /dev/null +++ b/deps/seqan @@ -0,0 +1 @@ +Subproject commit f5f658343c366c9c3d44ba358ffc9317e78a09ed diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 39440c26f3b..299b47b0ebe 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -86,11 +86,13 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { // skipped++; // continue; // } - // if (num_snarls_touched == stop_size){ - // break; - // } else { - // num_snarls_touched++; - // } + + // if (num_snarls_touched == stop_size){ + // break; + // } else { + // num_snarls_touched++; + // } + // if (roots->start().node_id() == 3881494) { // cerr << "root backwards?" << roots->start().backward() << endl; // cerr << "disambiguating snarl #" From c47c4e057c1ac164e41c00a7027ae8899e1faa7e Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 26 Mar 2021 14:45:15 -0700 Subject: [PATCH 59/63] fixed include error with seqan dependencies. 
--- src/algorithms/0_oo_normalize_snarls.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 299b47b0ebe..fe2b8a8ef0c 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -5,9 +5,12 @@ // #include #include -#include -#include -#include +#include +#include +#include +// #include +// #include +// #include #include @@ -73,8 +76,8 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { pair snarl_sequence_change; // // //todo: debug_code - // int stop_size = 1; - // int num_snarls_touched = 0; + int stop_size = 1; + int num_snarls_touched = 0; // int skip_first_few = 2; //#1, node 3702578 is a cyclic snarl. Don't recall about #0. #2 also cyclic. Looks like cyclic snarls weren't buggy? // int skipped = 0; @@ -87,11 +90,11 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { // continue; // } - // if (num_snarls_touched == stop_size){ - // break; - // } else { - // num_snarls_touched++; - // } + if (num_snarls_touched == stop_size){ + break; + } else { + num_snarls_touched++; + } // if (roots->start().node_id() == 3881494) { // cerr << "root backwards?" 
<< roots->start().backward() << endl; From 9435915eedac6cb09164f13eef6ff36153566045 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Tue, 30 Mar 2021 13:47:19 -0700 Subject: [PATCH 60/63] current draft of normalize snarls without debugging code --- .../0_move_embedded_paths_to_new_snarl.cpp | 15 +++++++++------ src/algorithms/0_oo_normalize_snarls.cpp | 16 ++++++++-------- src/algorithms/0_oo_normalize_snarls.hpp | 4 ++-- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp index 5f4f613fb79..e829308f5ae 100644 --- a/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp +++ b/src/algorithms/0_move_embedded_paths_to_new_snarl.cpp @@ -37,6 +37,9 @@ vector, int>> SnarlNormalizer::find_possible_path_starts(c /** * extend_possible_paths * + * Takes multiple potential starting positions for the path in possible_path_starts, and + * extends them until we find a viable path in the snarl. + * * @param {vector} undefined : * @param {int>>} possible_path_starts : * @param {string} path_str : @@ -46,7 +49,7 @@ vector, int>> SnarlNormalizer::find_possible_path_starts(c * @param {bool>} path_spans_left_right : * @return {vector} : empty if no path; otherwise, the sequence of handles representing a path which matches the path_str. */ -vector SnarlNormalizer::extend_possible_paths(vector, int>> &possible_path_starts, const string &path_str, const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right) +vector SnarlNormalizer::extend_possible_paths(vector, int>> &possible_path_starts, const string &path_str, const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right, const pair &main_graph_source_and_sink) { // cerr << "path string (note: should be left-to-right at this point, e.g. 
TTACT, not AGTAA: " << path_str << endl; // cerr << "leftmost handle id and seq: " << _graph.get_id(leftmost_handle) << " " << _graph.get_sequence(leftmost_handle) << endl; @@ -120,7 +123,7 @@ vector SnarlNormalizer::extend_possible_paths(vector SnarlNormalizer::extend_possible_paths(vector} : */ -pair SnarlNormalizer::move_path_to_new_snarl(const pair & old_path, const id_t &leftmost_id, const id_t &rightmost_id, const pair &path_spans_left_right, const bool &path_directed_left_to_right) +pair SnarlNormalizer::move_path_to_new_snarl(const pair & old_path, const id_t &leftmost_id, const id_t &rightmost_id, const pair &path_spans_left_right, const bool &path_directed_left_to_right, const pair &main_graph_source_and_sink) { /* * This should return the series of handles, from left to right if path_left_to_right==true (else vice-versa), that the path should move to. @@ -208,7 +211,7 @@ pair SnarlNormalizer::move_path_to_new_snarl(const vector, int>> possible_path_starts = find_possible_path_starts(leftmost_handle, rightmost_handle, path_spans_left_right); // cerr << "size of possible_path_starts: " << possible_path_starts.size() << endl; - vector new_path_location = extend_possible_paths(possible_path_starts, path_str, leftmost_handle, rightmost_handle, path_spans_left_right); + vector new_path_location = extend_possible_paths(possible_path_starts, path_str, leftmost_handle, rightmost_handle, path_spans_left_right, main_graph_source_and_sink); // cerr << "size of new_path_location: " << new_path_location.size() << endl; @@ -263,7 +266,7 @@ pair SnarlNormalizer::move_path_to_new_snarl(const if (new_path_location.size() == 0) { cerr << "************in UNIT_TEST for move_path_to_new_snarl************" << endl; - cerr << "in snarl with leftmost_id: " << _graph.get_id(leftmost_handle) << " and rightmost_id " << _graph.get_id(rightmost_handle) << ":" << endl; + cerr << "in snarl with source_id: " << main_graph_source_and_sink.first << " and sink_id " << 
main_graph_source_and_sink.second << ":" << endl; cerr << "no new path location found." << endl; } // Test that the new path seq = old path seq. @@ -298,7 +301,7 @@ pair SnarlNormalizer::move_path_to_new_snarl(const if (old_path_str != new_path_str) { cerr << "************in UNIT_TEST for move_path_to_new_snarl************" << endl; - cerr << "in snarl with leftmost_id: " << _graph.get_id(leftmost_handle) << " and rightmost_id " << _graph.get_id(rightmost_handle) << ":" << endl; + cerr << "in snarl with source_id: " << main_graph_source_and_sink.first << " and sink_id " << main_graph_source_and_sink.second << ":" << endl; cerr << "Once the path was moved into the new snarl, it didn't have the same sequence." << endl; cerr << "original seq: " << old_path_str << endl; cerr << " new seq: " << new_path_str << endl; diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index fe2b8a8ef0c..9911734269f 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -76,8 +76,8 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { pair snarl_sequence_change; // // //todo: debug_code - int stop_size = 1; - int num_snarls_touched = 0; + // int stop_size = 1; + // int num_snarls_touched = 0; // int skip_first_few = 2; //#1, node 3702578 is a cyclic snarl. Don't recall about #0. #2 also cyclic. Looks like cyclic snarls weren't buggy? // int skipped = 0; @@ -90,11 +90,11 @@ void SnarlNormalizer::normalize_top_level_snarls(ifstream &snarl_stream) { // continue; // } - if (num_snarls_touched == stop_size){ - break; - } else { - num_snarls_touched++; - } + // if (num_snarls_touched == stop_size){ + // break; + // } else { + // num_snarls_touched++; + // } // if (roots->start().node_id() == 3881494) { // cerr << "root backwards?" 
<< roots->start().backward() << endl; @@ -776,7 +776,7 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, path_spans_left_right.first = (_graph.get_id(_graph.get_handle_of_step(embedded_paths[i].first)) == source_id); path_spans_left_right.second = (_graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(embedded_paths[i].second))) == sink_id); - embedded_paths[i] = move_path_to_new_snarl(embedded_paths[i], temp_snarl_leftmost_id, temp_snarl_rightmost_id, path_spans_left_right, !backwards); + embedded_paths[i] = move_path_to_new_snarl(embedded_paths[i], temp_snarl_leftmost_id, temp_snarl_rightmost_id, path_spans_left_right, !backwards, make_pair(source_id, sink_id)); } // Destroy the old snarl. diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index 006bab073ea..5f81c66ac18 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -65,9 +65,9 @@ class SnarlNormalizer { // moving paths to new graph (new draft functions) vector, int> > find_possible_path_starts (const handle_t& leftmost_handle, const handle_t& rightmost_handle, const pair& path_spans_left_right); - vector extend_possible_paths(vector, int>> &possible_path_starts, const string &path_str, const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right); + vector extend_possible_paths(vector, int>> &possible_path_starts, const string &path_str, const handle_t &leftmost_handle, const handle_t &rightmost_handle, const pair &path_spans_left_right, const pair &main_graph_source_and_sink); - pair move_path_to_new_snarl(const pair & old_path, const id_t &source, const id_t &sink, const pair &path_spans_left_right, const bool &path_directed_left_to_right); + pair move_path_to_new_snarl(const pair & old_path, const id_t &source, const id_t &sink, const pair &path_spans_left_right, const bool &path_directed_left_to_right, const pair 
&main_graph_source_and_sink); ////////////////////////////////////////////////////////////////////////////////////// // format-type switching: From c05948d799742dc8deeada99be7141c412c25f53 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 2 Apr 2021 11:28:22 -0700 Subject: [PATCH 61/63] fixed bug with forcing end-character alignments --- src/algorithms/0_oo_normalize_snarls.cpp | 55 ++++++++++++++++++++---- src/algorithms/0_oo_normalize_snarls.hpp | 2 +- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index 9911734269f..c66c184951c 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -212,9 +212,12 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const } // extract threads + // haplotypes is of format: + // 0: a set of all the haplotypes which stretch from source to sink, in string format. + // - it's a set, so doesn't contain duplicates + // 1: a vector of all the other haps in the snarl (in vector format) + // 2: a vector of all the handles ever touched by the SnarlSequenceFinder. tuple, vector>, unordered_set> haplotypes; - // //todo: debug_statement: determining whether cyclic problem in yeast graph goes away when I swapo source and sink. 
- // SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, sink_id, source_id); SnarlSequenceFinder sequence_finder = SnarlSequenceFinder(_graph, snarl, _haploGraph, source_id, sink_id, backwards); if (_path_finder == "GBWT") { @@ -263,12 +266,19 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const // cerr << seq << endl; // } + cerr << "strings in path_seq before adding haplotypes: " << endl; + for (auto path : get<0>(haplotypes)) + { + cerr << path << endl; + } + // TODO: once haplotypes that begin/end in the middle of the snarl have been // TODO: accounted for in the code, remove next chunk of code that finds // TODO: source-to-sink paths. // find the paths that stretch from source to sink: // cerr << "~~~~~~~~~~source: " << source_id << "sink: " << sink_id << endl; - for (auto path : embedded_paths) { + for (auto path : embedded_paths) + { // cerr << "checking path of name " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << " with source " << _graph.get_id(_graph.get_handle_of_step(path.first)) << " and sink " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << endl; // cerr << "SOURCE info: prev step: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(path.second))) << "prev prev step: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_previous_step(_graph.get_previous_step(path.second)))) << " source: " << _graph.get_id(_graph.get_handle_of_step(path.second)) << " next step: " << _graph.get_id(_graph.get_handle_of_step(_graph.get_next_step(path.second))) << endl; @@ -276,6 +286,7 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const if (_graph.get_id(_graph.get_handle_of_step(path.first)) == source_id && _graph.get_id(_graph.get_handle_of_step( _graph.get_previous_step(path.second))) == sink_id) { + cerr << "path_seq added to haplotypes." 
<< _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << endl; // cerr << "******************************************\nadding path of name " << // _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << @@ -289,15 +300,27 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step)); cur_step = _graph.get_next_step(cur_step); } + // cerr << "path seq:" << path_seq << endl; if (backwards) { + cerr << "path seq emplaced (in reverse):" << reverse_complement(path_seq) << endl; + int init_hap_size = get<0>(haplotypes).size(); // Note: just for debug purposes. get<0>(haplotypes).emplace(reverse_complement(path_seq)); + cerr << "was path_seq a new string? " << get<0>(haplotypes).size() - init_hap_size << endl; } else { + cerr << "path seq emplaced (in forward):" << path_seq << endl; + int init_hap_size = get<0>(haplotypes).size(); // Note: just for debug purposes. get<0>(haplotypes).emplace(path_seq); + cerr << "was path_seq a copy? " << get<0>(haplotypes).size() - init_hap_size << endl; } } } + cerr << "haps in haplotypes: " << endl; + for (string hap : get<0>(haplotypes)) + { + cerr << hap << endl; + } // Align the new snarl: VG new_snarl = align_source_to_sink_haplotypes(get<0>(haplotypes)); @@ -392,7 +415,7 @@ unordered_set SnarlNormalizer::format_handle_haplotypes_to_strings( // Returns: // VG object representing the newly realigned snarl. 
VG SnarlNormalizer::align_source_to_sink_haplotypes( - unordered_set source_to_sink_haplotypes) { + const unordered_set& source_to_sink_haplotypes) { // cerr << "align_source_to_sink_haplotypes" << endl; // cerr << " haplotypes in source_to_sink_haplotypes: " << endl; // for (string hap : source_to_sink_haplotypes) { @@ -416,10 +439,22 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( string source_char(1, random_element.front()); string sink_char(1, random_element.back()); + // cerr << "strings in path_seq before replacing final character: " << endl; + // for (auto path : get<0>(haplotypes)) + // { + // cerr << path << endl;f + // } + // replace the source and sink chars with X, to force match at source and sink. - for (auto hap : source_to_sink_haplotypes) { + unordered_set edited_source_to_sink_haplotypes; + // for (auto it = source_to_sink_haplotypes.begin(); it != source_to_sink_haplotypes.end(); it++) + for (auto hap : source_to_sink_haplotypes) + { + cerr << "hap before replace: " << hap << endl; hap.replace(0, 1, "X"); hap.replace(hap.size() - 1, 1, "X"); + cerr << "hap after replace: " << hap << endl; + edited_source_to_sink_haplotypes.emplace(hap); } // //todo: debug_statement @@ -451,9 +486,11 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( //// seqan::Align align; seqan::Align align; - seqan::resize(rows(align), source_to_sink_haplotypes.size()); + seqan::resize(rows(align), edited_source_to_sink_haplotypes.size()); int i = 0; - for (auto hap : source_to_sink_haplotypes) { + for (auto hap : edited_source_to_sink_haplotypes) { + cerr << "hap as added to align: " << hap << endl; + cerr << "hap.c_str as added to align: " << hap.c_str() << endl; assignSource(row(align, i), hap.c_str()); i++; } @@ -473,15 +510,17 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( // edit the row so that the proper source and sink chars are added to the // haplotype instead of the special characters added to ensure correct alignment // of source and sink. 
+ cerr << "row_string before: " << row_string << endl; row_string.replace(0, 1, source_char); row_string.replace(row_string.size() - 1, 1, sink_char); row_strings.push_back(row_string); + cerr << "row_string after: " << row_string << endl; } stringstream ss; for (string seq : row_strings) { // todo: debug_statement - // cerr << "seq in alignment:" << seq << endl; + cerr << "seq in alignment:" << seq << endl; ss << endl << seq; } // ss << align; diff --git a/src/algorithms/0_oo_normalize_snarls.hpp b/src/algorithms/0_oo_normalize_snarls.hpp index 5f81c66ac18..b374ceceb7f 100644 --- a/src/algorithms/0_oo_normalize_snarls.hpp +++ b/src/algorithms/0_oo_normalize_snarls.hpp @@ -47,7 +47,7 @@ class SnarlNormalizer { // creation of new graph: ////////////////////////////////////////////////////////////////////////////////////// - VG align_source_to_sink_haplotypes(unordered_set source_to_sink_haplotypes); + VG align_source_to_sink_haplotypes(const unordered_set& source_to_sink_haplotypes); void integrate_snarl(SubHandleGraph &old_snarl, const HandleGraph &new_snarl, vector>& embedded_paths, From 37a079fd3d799f406cfd6c471ca269361e8b6462 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Fri, 2 Apr 2021 12:10:35 -0700 Subject: [PATCH 62/63] removed debug prints --- src/algorithms/0_oo_normalize_snarls.cpp | 49 ++++++++++-------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index c66c184951c..b431cf0c9dd 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -261,17 +261,12 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const sequence_finder.find_embedded_paths(); //todo: debug_statement - // cerr << "Let's see what sequences I have before adding embedded paths to seq info:" << endl; - // for (string seq : get<0>(haplotypes)) { - // cerr << seq << endl; + // cerr << "strings in path_seq 
before adding haplotypes: " << endl; + // for (auto path : get<0>(haplotypes)) + // { + // cerr << path << endl; // } - cerr << "strings in path_seq before adding haplotypes: " << endl; - for (auto path : get<0>(haplotypes)) - { - cerr << path << endl; - } - // TODO: once haplotypes that begin/end in the middle of the snarl have been // TODO: accounted for in the code, remove next chunk of code that finds // TODO: source-to-sink paths. @@ -286,7 +281,7 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const if (_graph.get_id(_graph.get_handle_of_step(path.first)) == source_id && _graph.get_id(_graph.get_handle_of_step( _graph.get_previous_step(path.second))) == sink_id) { - cerr << "path_seq added to haplotypes." << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << endl; + // cerr << "path_seq added to haplotypes. " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << endl; // cerr << "******************************************\nadding path of name " << // _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << @@ -302,25 +297,25 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const } // cerr << "path seq:" << path_seq << endl; if (backwards) { - cerr << "path seq emplaced (in reverse):" << reverse_complement(path_seq) << endl; - int init_hap_size = get<0>(haplotypes).size(); // Note: just for debug purposes. + // cerr << "path seq emplaced (in reverse):" << reverse_complement(path_seq) << endl; + // int init_hap_size = get<0>(haplotypes).size(); // Note: just for debug purposes. get<0>(haplotypes).emplace(reverse_complement(path_seq)); - cerr << "was path_seq a new string? " << get<0>(haplotypes).size() - init_hap_size << endl; + // cerr << "was path_seq a new string? 
" << get<0>(haplotypes).size() - init_hap_size << endl; } else { - cerr << "path seq emplaced (in forward):" << path_seq << endl; - int init_hap_size = get<0>(haplotypes).size(); // Note: just for debug purposes. + // cerr << "path seq emplaced (in forward):" << path_seq << endl; + // int init_hap_size = get<0>(haplotypes).size(); // Note: just for debug purposes. get<0>(haplotypes).emplace(path_seq); - cerr << "was path_seq a copy? " << get<0>(haplotypes).size() - init_hap_size << endl; + // cerr << "was path_seq a copy? " << get<0>(haplotypes).size() - init_hap_size << endl; } } } - cerr << "haps in haplotypes: " << endl; - for (string hap : get<0>(haplotypes)) - { - cerr << hap << endl; - } + // cerr << "haps in haplotypes: " << endl; + // for (string hap : get<0>(haplotypes)) + // { + // cerr << hap << endl; + // } // Align the new snarl: VG new_snarl = align_source_to_sink_haplotypes(get<0>(haplotypes)); @@ -450,10 +445,10 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( // for (auto it = source_to_sink_haplotypes.begin(); it != source_to_sink_haplotypes.end(); it++) for (auto hap : source_to_sink_haplotypes) { - cerr << "hap before replace: " << hap << endl; + // cerr << "hap before replace: " << hap << endl; hap.replace(0, 1, "X"); hap.replace(hap.size() - 1, 1, "X"); - cerr << "hap after replace: " << hap << endl; + // cerr << "hap after replace: " << hap << endl; edited_source_to_sink_haplotypes.emplace(hap); } @@ -489,8 +484,6 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( seqan::resize(rows(align), edited_source_to_sink_haplotypes.size()); int i = 0; for (auto hap : edited_source_to_sink_haplotypes) { - cerr << "hap as added to align: " << hap << endl; - cerr << "hap.c_str as added to align: " << hap.c_str() << endl; assignSource(row(align, i), hap.c_str()); i++; } @@ -510,17 +503,17 @@ VG SnarlNormalizer::align_source_to_sink_haplotypes( // edit the row so that the proper source and sink chars are added to the // haplotype instead of the 
special characters added to ensure correct alignment // of source and sink. - cerr << "row_string before: " << row_string << endl; + // cerr << "row_string before: " << row_string << endl; row_string.replace(0, 1, source_char); row_string.replace(row_string.size() - 1, 1, sink_char); row_strings.push_back(row_string); - cerr << "row_string after: " << row_string << endl; + // cerr << "row_string after: " << row_string << endl; } stringstream ss; for (string seq : row_strings) { // todo: debug_statement - cerr << "seq in alignment:" << seq << endl; + // cerr << "seq in alignment:" << seq << endl; ss << endl << seq; } // ss << align; From ab10f34413a488b0bd91c70eaf6f8293e257f522 Mon Sep 17 00:00:00 2001 From: Robin-Rounthwaite Date: Mon, 5 Apr 2021 17:00:36 -0700 Subject: [PATCH 63/63] fixed typo in comments --- src/algorithms/0_oo_normalize_snarls.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index b431cf0c9dd..ebed483acf4 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -332,7 +332,7 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const integrate_snarl(snarl, new_snarl, embedded_paths, source_id, sink_id, backwards); } else { if (!get<1>(haplotypes).empty()) { - cerr << "found a snarl starting at " << source_id << " and ending at " + cerr << "found a snarl with source " << source_id << " and sink " << sink_id << " with haplotypes that start or end in the middle. Skipping." << endl; cerr << "There are " << sizeof(get<1>(haplotypes)) << " haplotypes of that description." 
<< endl; @@ -341,14 +341,14 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const error_record[1] = true; } if (get<0>(haplotypes).size() > _max_alignment_size) { - cerr << "found a snarl starting at " << source_id << " and ending at " + cerr << "found a snarl with source " << source_id << " and sink " << sink_id << " with too many haplotypes (" << get<0>(haplotypes).size() << ") to efficiently align. Skipping." << endl; error_record[0] = true; } if (get<2>(haplotypes).size() != handles_in_snarl.size()) { - cerr << "some handles in the snarl starting at " << source_id - << " and ending at " << sink_id + cerr << "some handles in the snarl with source " << source_id + << " and sink " << sink_id << " aren't accounted for by the gbwt_graph. " "Skipping." << endl; @@ -367,7 +367,7 @@ vector SnarlNormalizer::normalize_snarl(id_t source_id, id_t sink_id, const if (error_record[5] > error_record[4]) { cerr << "**************************in UNIT-TEST for normalize_snarl: **************************" << endl; cerr << "NOTE: normalized a snarl which *increased* in sequence quantity, " - "starting at " + "with source " << source_id << endl << "\tsize before: " << error_record[4] << " size after: " << error_record[5] << endl; @@ -704,7 +704,7 @@ void SnarlNormalizer::integrate_snarl(SubHandleGraph &old_snarl, if (to_insert_snarl_defining_handles.first.size() > 1 || to_insert_snarl_defining_handles.second.size() > 1) { - cerr << "ERROR: newly made snarl from a snarl starting at " << source_id + cerr << "ERROR: newly made snarl from a snarl with source " << source_id << " has more than one start or end. # of starts: " << to_insert_snarl_defining_handles.first.size() << " # of ends: " << to_insert_snarl_defining_handles.second.size() << endl;