Skip to content

Commit

Permalink
Corrects querying of genes
Browse files Browse the repository at this point in the history
  • Loading branch information
samhorsfield96 committed Oct 17, 2024
1 parent b91fc7d commit 7f2c05b
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 30 deletions.
6 changes: 5 additions & 1 deletion ggCaller/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def main():
ORFMap_dir = os.path.join(ORFMap_dir, "")

# save the kmer_array to ORFMap_dir
graph[0].data_out(ORFMap_dir + "kmer_array.dat")
graph.data_out(ORFMap_dir + "kmer_array.dat")

# load models if required
if not options.no_filter:
Expand All @@ -530,6 +530,10 @@ def main():

ORF_file_paths, Edge_file_paths = file_tuple

# save the ORF file paths
with open(ORFMap_dir + "ORF_file_paths.dat", "wb") as o:
cPickle.dump(ORF_file_paths, o)

# generate ORF clusters
if not options.no_clustering:
with SharedMemoryManager() as smm:
Expand Down
88 changes: 59 additions & 29 deletions ggCaller/graph_traversal.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
import os, sys
import _pickle as cPickle
from Bio import SeqIO
import networkx as nx
from collections import defaultdict
import ggCaller_cpp

# TODO update this to work with ggCaller ORF map outputs and panaroo graph which contains annotation
def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_dir, query_id, num_threads):
# check if objects_dir present, if not exit
if not os.path.exists(objects_dir):
print("Please specify a ggc_data directory")
sys.exit(1)

ggc_data_dir = os.path.join(objects_dir, "ggc_data")
objects_dir = os.path.join(objects_dir, "")

if not os.path.exists(ggc_data_dir):
print("Please specify a directory from a ggCaller run with '--save'")
sys.exit(1)

# read in and parse sequences in queryfile
id_vec = []
query_vec = []
Expand All @@ -33,7 +37,7 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
query_vec.append(line.strip())

# read in graph object and high_scoring ORFs and node_index
graph.data_in(objects_dir + "ggc_graph.dat", graphfile, coloursfile, num_threads)
graph.data_in(os.path.join(objects_dir, "ORF_dir", "kmer_array.dat"), graphfile, coloursfile, num_threads)

# query the sequences in the graph
print("Querying unitigs in graph...")
Expand All @@ -44,39 +48,65 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
os.path.splitext(os.path.basename(x))[0] for x in input_colours
]

with open(objects_dir + "high_scoring_orfs.dat", "rb") as input_file:
high_scoring_ORFs = cPickle.load(input_file)

with open(objects_dir + "node_index.dat", "rb") as input_file:
# load in gene to DBG node mappings
with open(os.path.join(ggc_data_dir, "node_index.dat"), "rb") as input_file:
node_index = cPickle.load(input_file)

# load in paths to ORF data
with open(os.path.join(objects_dir, "ORF_dir", "ORF_file_paths.dat"), "rb") as input_file:
ORF_file_paths = cPickle.load(input_file)

# try to load panaroo graph, not present if panaroo not run
G = None
ids_len_stop = None
try:
G = nx.read_gml('t.gml')
# load in gene to panaroo node mappings
with open(ggc_data_dir + "ORF_to_node_map.dat", "rb") as input_file:
ids_len_stop = cPickle.load(input_file)
except:
pass

outfile = output_dir + "matched_queries.fasta"
print("Matching overlapping ORFs...")

ORF_dict = {}

ORF_dict = defaultdict(lambda: defaultdict(list))
for i in range(len(query_nodes)):
for node in query_nodes[i]:
for ORF_ID in node_index[node]:
if ORF_ID not in ORF_dict:
ORF_dict[ORF_ID] = []
ORF_dict[ORF_ID].append(i)
for ORF in node_index[node]:
split_ID = ORF.split("_")
colour = int(split_ID[0])
ORF_ID = int(split_ID[-1])
ORF_dict[colour][ORF_ID].append(i)

with open(outfile, "w") as f:
for ORF, aligned in ORF_dict.items():
split_ID = ORF.split("_")
colour = int(split_ID[0])
ORF_ID = int(split_ID[-1])
for colour, ORF_element in ORF_dict.items():
# load ORF info
ORF_map = ggCaller_cpp.read_ORF_file(ORF_file_paths[colour])
fasta_ID = isolate_names[colour] + "_" + str(ORF_ID)
ORF_info = high_scoring_ORFs[colour][ORF_ID]
seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
if fasta_file:
id_set = set([id_vec[i] for i in aligned])
queries = ";".join([i for i in id_set])
else:
queries = ";".join([query_vec[i] for i in aligned])

# add annotation
f.write(
">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + ORF_info[-1] + "\n" + seq + "\n")

# iterate over each ORF that has match
for ORF_ID, aligned in ORF_element.items():
ORF_info = ORF_map[ORF_ID]
seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
if fasta_file:
id_set = set([id_vec[i] for i in aligned])
queries = ";".join([i for i in id_set])
else:
queries = ";".join([query_vec[i] for i in aligned])

# get annotation for node
annotation = "NA"
if G is not None:
delim = "_0_" if ORF_ID >= 0 else "_refound_"
pan_ORF_id = str(colour) + delim + str(ORF_ID)
# if pan_ORF_id is not in ORF_map, the ORF has been removed, so remove it from ORF_map as well
node = ids_len_stop[pan_ORF_id][2]
annotation = G.nodes[node]["description"]

# add annotation
f.write(
">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + annotation + "\n" + seq + "\n")

return

0 comments on commit 7f2c05b

Please sign in to comment.