handle fasta inputs from different directories

iqbal-lab-org · Dec 15, 2023 · 40c69ff · 40c69ff
1 parent 0ffa81d
commit 40c69ff
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 13 deletions.
diff --git a/pling/align_snakemake/Snakefile b/pling/align_snakemake/Snakefile
@@ -7,9 +7,10 @@ configfile: "../config.yaml"
 
 OUTPUTPATH = config["output_dir"]
 PREFIX = config["prefix"]
-FASTAFILES, FASTAEXT, FASTAPATH = get_fasta_file_info(config["genomes_list"])
-GENOMES = list(FASTAEXT.keys())
 batch_size = config["batch_size"]
+FASTAFILES = [el[0] for el in pd.read_csv(config["genomes_list"], header=None).values]
+FASTAEXT = {os.path.splitext(os.path.basename(el))[0]:os.path.splitext(os.path.basename(el))[1] for el in FASTAFILES}
+GENOMES = list(FASTAEXT.keys())
 
 include: "../common_rules/common_rules.smk"
 

diff --git a/pling/align_snakemake/unimog.py b/pling/align_snakemake/unimog.py
@@ -11,15 +11,15 @@ def unimogs_to_ilp_core(genome_1_fasta, genome_2_fasta, genome_1, genome_2, iden
     unimog = f">{genome_1}~{genome_2}:{genome_1}\n{plasmid_1_unimogs}\n>{genome_1}~{genome_2}:{genome_2}\n{plasmid_2_unimogs}\n"
     return unimog, jaccard_distance, blocks_ref, blocks_query
 
-def batchwise_unimog(fastapath, fastaext, pairs, unimogpath, mappath, jaccardpath, identity_threshold, jaccard_distance):
+def batchwise_unimog(fastafiles, pairs, unimogpath, mappath, jaccardpath, identity_threshold, jaccard_distance):
     jaccards = []
     unimogs = []
     batch_blocks = {}
     for pair in pairs:
         genome_1 = pair[0]
         genome_2 = pair[1]
-        genome_1_fasta = f"{fastapath}/{genome_1}{fastaext[genome_1]}"
-        genome_2_fasta = f"{fastapath}/{genome_2}{fastaext[genome_2]}"
+        genome_1_fasta = fastafiles[genome_1]
+        genome_2_fasta = fastafiles[genome_2]
         unimog, jaccard, blocks_ref, blocks_query = unimogs_to_ilp_core(genome_1_fasta, genome_2_fasta, genome_1, genome_2, identity_threshold)
         jaccards.append(f"{genome_1}\t{genome_2}\t{jaccard}\n")
         if jaccard<=jaccard_distance:
@@ -66,7 +66,7 @@ def main():
 
     pairs=read_in_batch_pairs(f"{args.outputpath}/batches/batch_{args.batch}.txt")
 
-    batchwise_unimog(fastapath, fastaext, pairs, args.unimog_output, args.map_output, args.jaccard_output, args.identity_threshold, args.jaccard_distance)
+    batchwise_unimog(fastafiles, pairs, args.unimog_output, args.map_output, args.jaccard_output, args.identity_threshold, args.jaccard_distance)
 
 
 if __name__ == "__main__":

diff --git a/pling/jac_network_snakemake/seq_jaccard.py b/pling/jac_network_snakemake/seq_jaccard.py
@@ -61,13 +61,13 @@ def get_sequence_jaccard_distance(plasmid_1: Path, plasmid_2: Path, prefix: str,
     jaccard_distance = 1-jaccard_similarity
     return jaccard_distance
 
-def batchwise_jaccard(fastapath, fastaext, pairs, jaccardpath, identity_threshold):
+def batchwise_jaccard(fastafiles, pairs, jaccardpath, identity_threshold):
     jaccards = []
     for pair in pairs:
         genome_1 = pair[0]
         genome_2 = pair[1]
-        genome_1_fasta = f"{fastapath}/{genome_1}{fastaext[genome_1]}"
-        genome_2_fasta = f"{fastapath}/{genome_2}{fastaext[genome_2]}"
+        genome_1_fasta = fastafiles[genome_1]
+        genome_2_fasta = fastafiles[genome_2]
         jaccard_distance = get_sequence_jaccard_distance(genome_1_fasta, genome_2_fasta, f"{genome_1}~{genome_2}", identity_threshold)
         jaccards.append(f"{genome_1}\t{genome_2}\t{jaccard_distance}\n")
     with open(jaccardpath, 'w') as f:
@@ -80,7 +80,7 @@ def main(args):
 
     pairs=read_in_batch_pairs(f"{args.outputpath}/batches/batch_{args.batch}.txt")
 
-    batchwise_jaccard(fastapath, fastaext, pairs, args.jaccard_output, args.identity_threshold)
+    batchwise_jaccard(fastafiles, pairs, args.jaccard_output, args.identity_threshold)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Calculate Jaccard index for genome sequences.')

diff --git a/pling/utils.py b/pling/utils.py
@@ -19,7 +19,8 @@ def read_in_batch_pairs(filepath):
     return pairs
 
 def get_fasta_file_info(genomes_list):
-    FASTAFILES = [el[0] for el in pd.read_csv(genomes_list, header=None).values]
-    FASTAEXT = {os.path.splitext(os.path.basename(el))[0]:os.path.splitext(os.path.basename(el))[1] for el in FASTAFILES}
-    FASTAPATH = os.path.dirname(FASTAFILES[0])
+    FASTAFILES_LIST = [el[0] for el in pd.read_csv(genomes_list, header=None).values]
+    FASTAFILES = {os.path.splitext(os.path.basename(el))[0]:el for el in FASTAFILES_LIST}
+    FASTAEXT = {os.path.splitext(os.path.basename(el))[0]:os.path.splitext(os.path.basename(el))[1] for el in FASTAFILES_LIST}
+    FASTAPATH = os.path.dirname(FASTAFILES_LIST[0])
     return FASTAFILES, FASTAEXT, FASTAPATH