Skip to content

Commit

Permalink
Corrects querying of genes
Browse files Browse the repository at this point in the history
  • Loading branch information
samhorsfield96 committed Oct 17, 2024
1 parent b91fc7d commit 7f2c05b
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 30 deletions.
6 changes: 5 additions & 1 deletion ggCaller/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def main():
ORFMap_dir = os.path.join(ORFMap_dir, "")

# save the kmer_array to ORFMap_dir
graph[0].data_out(ORFMap_dir + "kmer_array.dat")
graph.data_out(ORFMap_dir + "kmer_array.dat")

# load models if required
if not options.no_filter:
Expand All @@ -530,6 +530,10 @@ def main():

ORF_file_paths, Edge_file_paths = file_tuple

# save the ORF file paths
with open(ORFMap_dir + "ORF_file_paths.dat", "wb") as o:
cPickle.dump(ORF_file_paths, o)

# generate ORF clusters
if not options.no_clustering:
with SharedMemoryManager() as smm:
Expand Down
88 changes: 59 additions & 29 deletions ggCaller/graph_traversal.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
import os, sys
import _pickle as cPickle
from Bio import SeqIO
import networkx as nx
from collections import defaultdict
import ggCaller_cpp

# TODO update this to work with ggCaller ORF map outputs and panaroo graph which contains annotation
def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_dir, query_id, num_threads):
# check if objects_dir present, if not exit
if not os.path.exists(objects_dir):
print("Please specify a ggc_data directory")
sys.exit(1)

ggc_data_dir = os.path.join(objects_dir, "ggc_data")
objects_dir = os.path.join(objects_dir, "")

if not os.path.exists(ggc_data_dir):
print("Please specify a directory from a ggCaller run with '--save'")
sys.exit(1)

# read in and parse sequences in queryfile
id_vec = []
query_vec = []
Expand All @@ -33,7 +37,7 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
query_vec.append(line.strip())

# read in graph object and high_scoring ORFs and node_index
graph.data_in(objects_dir + "ggc_graph.dat", graphfile, coloursfile, num_threads)
graph.data_in(os.path.join(objects_dir, "ORF_dir", "kmer_array.dat"), graphfile, coloursfile, num_threads)

# query the sequences in the graph
print("Querying unitigs in graph...")
Expand All @@ -44,39 +48,65 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
os.path.splitext(os.path.basename(x))[0] for x in input_colours
]

with open(objects_dir + "high_scoring_orfs.dat", "rb") as input_file:
high_scoring_ORFs = cPickle.load(input_file)

with open(objects_dir + "node_index.dat", "rb") as input_file:
# load in gene to DBG node mappings
with open(os.path.join(ggc_data_dir, "node_index.dat"), "rb") as input_file:
node_index = cPickle.load(input_file)

# load in paths to ORF data
with open(os.path.join(objects_dir, "ORF_dir", "ORF_file_paths.dat"), "rb") as input_file:
ORF_file_paths = cPickle.load(input_file)

# try to load panaroo graph, not present if panaroo not run
G = None
ids_len_stop = None
try:
G = nx.read_gml('t.gml')
# load in gene to panaroo node mappings
with open(ggc_data_dir + "ORF_to_node_map.dat", "rb") as input_file:
ids_len_stop = cPickle.load(input_file)
except:
pass

outfile = output_dir + "matched_queries.fasta"
print("Matching overlapping ORFs...")

ORF_dict = {}

ORF_dict = defaultdict(lambda: defaultdict(list))
for i in range(len(query_nodes)):
for node in query_nodes[i]:
for ORF_ID in node_index[node]:
if ORF_ID not in ORF_dict:
ORF_dict[ORF_ID] = []
ORF_dict[ORF_ID].append(i)
for ORF in node_index[node]:
split_ID = ORF.split("_")
colour = int(split_ID[0])
ORF_ID = int(split_ID[-1])
ORF_dict[colour][ORF_ID].append(i)

with open(outfile, "w") as f:
for ORF, aligned in ORF_dict.items():
split_ID = ORF.split("_")
colour = int(split_ID[0])
ORF_ID = int(split_ID[-1])
for colour, ORF_element in ORF_dict.items():
# load ORF info
ORF_map = ggCaller_cpp.read_ORF_file(ORF_file_paths[colour])
fasta_ID = isolate_names[colour] + "_" + str(ORF_ID)
ORF_info = high_scoring_ORFs[colour][ORF_ID]
seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
if fasta_file:
id_set = set([id_vec[i] for i in aligned])
queries = ";".join([i for i in id_set])
else:
queries = ";".join([query_vec[i] for i in aligned])

# add annotation
f.write(
">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + ORF_info[-1] + "\n" + seq + "\n")

# iterate over each ORF that has match
for ORF_ID, aligned in ORF_element.items():
ORF_info = ORF_map[ORF_ID]
seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
if fasta_file:
id_set = set([id_vec[i] for i in aligned])
queries = ";".join([i for i in id_set])
else:
queries = ";".join([query_vec[i] for i in aligned])

# get annotation for node
annotation = "NA"
if G is not None:
delim = "_0_" if ORF_ID >= 0 else "_refound_"
pan_ORF_id = str(colour) + delim + str(ORF_ID)
# if pan_ORF_id is not in ORF_map, the ORF has been removed, so remove it from ORF_map as well
node = ids_len_stop[pan_ORF_id][2]
annotation = G.nodes[node]["description"]

# add annotation
f.write(
">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + annotation + "\n" + seq + "\n")

return

0 comments on commit 7f2c05b

Please sign in to comment.