Skip to content

Commit

Permalink
Completes removal of high scoring ORF dict from panaroo runner
Browse files Browse the repository at this point in the history
  • Loading branch information
samhorsfield96 committed Oct 9, 2024
1 parent bc2b6cb commit 8ea266b
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 31 deletions.
15 changes: 8 additions & 7 deletions ggCaller/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def main():
options.truncation_threshold, options.save, options.refind)

else:
print_ORF_calls(high_scoring_ORFs, os.path.join(output_dir, "gene_calls"),
print_ORF_calls(ORF_file_paths, os.path.join(output_dir, "gene_calls"),
input_colours, overlap, graph)

if options.save:
Expand All @@ -544,16 +544,17 @@ def main():

# serialise graph object and high scoring ORFs to future reading
graph[0].data_out(objects_dir + "ggc_graph.dat")
with open(objects_dir + "high_scoring_orfs.dat", "wb") as o:
cPickle.dump(high_scoring_ORFs, o)

# create index of all high_scoring_ORFs node_IDs
node_index = defaultdict(list)
for colour, gene_dict in high_scoring_ORFs.items():
for ORF_ID, ORF_info in gene_dict.items():
entry_ID = str(colour) + "_" + str(ORF_ID)
for colour_ID, file_path in ORF_file_paths.items():
ORF_map = ggCaller_cpp.read_ORF_file(file_path)

for ORF_ID, ORF_info in ORF_map.items():
delim = "_0_" if ORF_ID > 0 else "_refound_"
entry_ID = str(colour_ID) + delim + str(ORF_ID)
for node in ORF_info[0]:
node_index[node].append(entry_ID)
node_index[abs(node)].append(entry_ID)

with open(objects_dir + "node_index.dat", "wb") as o:
cPickle.dump(node_index, o)
Expand Down
1 change: 1 addition & 0 deletions ggCaller/graph_traversal.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import _pickle as cPickle
from Bio import SeqIO

# TODO update this to work with ggCaller ORF map outputs and panaroo graph which contains annotation
def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_dir, query_id, num_threads):
# check if objects_dir present, if not exit
if not os.path.exists(objects_dir):
Expand Down
30 changes: 6 additions & 24 deletions panaroo_runner/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file
G = find_missing(G,
ORF_file_paths,
shd_arr_tup,
kmer=kmer,ƒ
kmer=kmer,
repeat=repeat,
overlap=overlap,
isolate_names=input_colours,
Expand Down Expand Up @@ -339,13 +339,6 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file
G.nodes[node]['dna'] = ";".join(conv_list(G.nodes[node]['dna']))
G.nodes[node]['protein'] = ";".join(conv_list(G.nodes[node]['protein']))

# TODO sort out saving object, don't necessarily need to anymore as they are saved by default
# # add node annotation
# if save_objects:
# high_scoring_ORFs[genome_id][local_id] = list(high_scoring_ORFs[genome_id][local_id])
# high_scoring_ORFs[genome_id][local_id][-1] = G.nodes[node]["description"]


for edge in G.edges():
G.edges[edge[0], edge[1]]['genomeIDs'] = ";".join(
[str(m) for m in G.edges[edge[0], edge[1]]['members']])
Expand All @@ -366,34 +359,23 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file
# make sure trailing forward slash is present
objects_dir = os.path.join(objects_dir, "")

to_remove = defaultdict(set)

# create index of all high_scoring_ORFs node_IDs, remove if not in panaroo graph
# create index of all high_scoring_ORFs node_IDs
node_index = defaultdict(list)
for colour_ID, file_path in ORF_file_paths.items():
ORF_map = ggCaller_cpp.read_ORF_file(file_path)

for ORF_ID, ORF_info in ORF_map.items():
if not isinstance(ORF_info, list):
to_remove[colour_ID].add(ORF_ID)
continue
delim = "_0_" if ORF_ID > 0 else "_refound_"
entry_ID = str(colour_ID) + delim + str(ORF_ID)
for node in ORF_info[0]:
node_index[abs(node)].append(entry_ID)

# remove genes not in panaroo graph
for colour_ID, file_path in ORF_file_paths.items():
ORF_map = ggCaller_cpp.read_ORF_file(file_path)

for ORF_ID in to_remove[colour_ID]:
del ORF_map[ORF_ID]

ggCaller_cpp.save_ORF_file(file_path, ORF_map)

with open(objects_dir + "node_index.dat", "wb") as o:
cPickle.dump(node_index, o)


# map each ORF to node in graph
with open(objects_dir + "ORF_to_node_map.dat", "wb") as o:
cPickle.dump(ids_len_stop, o)

return

Expand Down
10 changes: 10 additions & 0 deletions panaroo_runner/generate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,10 +273,15 @@ def print_ORF_calls(ORF_file_paths, outfile, input_colours, overlap, DBG, trunca
description=""))
else:
for colour, file_path in ORF_file_paths.items():
to_remove = set()
ORF_map = ggCaller_cpp.read_ORF_file(file_path)

for ORF_ID, ORF_info in ORF_map.items():
pan_ORF_id = str(colour) + "_0_" + str(ORF_ID)
# if pan_ORF_id not in ORF_map means has been removed so remove it from ORF_map
if pan_ORF_id not in ids_len_stop:
to_remove.add(ORF_ID)
continue
node = ids_len_stop[pan_ORF_id][2]
node_annotation = G.nodes[node]["description"]
length_centroid = G.nodes[node]['lengths'][G.nodes[node]['maxLenId']]
Expand All @@ -299,6 +304,11 @@ def print_ORF_calls(ORF_file_paths, outfile, input_colours, overlap, DBG, trunca
aa_records.append(
SeqRecord(Seq(gene).translate(), id=isolate_names[colour] + "_" + str(ORF_ID),
description="ggcID=" + pan_ORF_id + ";" + gene_annotation))

# remove from ORF_map and save
for ORF_ID in to_remove:
del ORF_map[ORF_ID]
ggCaller_cpp.save_ORF_file(file_path, ORF_map)
SeqIO.write(DNA_records, outfile + ".ffn", "fasta")
SeqIO.write(aa_records, outfile + ".faa", "fasta")
return
Expand Down

0 comments on commit 8ea266b

Please sign in to comment.