Skip to content

Commit

Permalink
handle fasta inputs from different directories
Browse files Browse the repository at this point in the history
  • Loading branch information
babayagaofficial committed Dec 15, 2023
1 parent 0ffa81d commit 40c69ff
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 13 deletions.
5 changes: 3 additions & 2 deletions pling/align_snakemake/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ configfile: "../config.yaml"

OUTPUTPATH = config["output_dir"]
PREFIX = config["prefix"]
FASTAFILES, FASTAEXT, FASTAPATH = get_fasta_file_info(config["genomes_list"])
GENOMES = list(FASTAEXT.keys())
batch_size = config["batch_size"]
FASTAFILES = [el[0] for el in pd.read_csv(config["genomes_list"], header=None).values]
FASTAEXT = {os.path.splitext(os.path.basename(el))[0]:os.path.splitext(os.path.basename(el))[1] for el in FASTAFILES}
GENOMES = list(FASTAEXT.keys())

include: "../common_rules/common_rules.smk"

Expand Down
8 changes: 4 additions & 4 deletions pling/align_snakemake/unimog.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ def unimogs_to_ilp_core(genome_1_fasta, genome_2_fasta, genome_1, genome_2, iden
unimog = f">{genome_1}~{genome_2}:{genome_1}\n{plasmid_1_unimogs}\n>{genome_1}~{genome_2}:{genome_2}\n{plasmid_2_unimogs}\n"
return unimog, jaccard_distance, blocks_ref, blocks_query

def batchwise_unimog(fastapath, fastaext, pairs, unimogpath, mappath, jaccardpath, identity_threshold, jaccard_distance):
def batchwise_unimog(fastafiles, pairs, unimogpath, mappath, jaccardpath, identity_threshold, jaccard_distance):
jaccards = []
unimogs = []
batch_blocks = {}
for pair in pairs:
genome_1 = pair[0]
genome_2 = pair[1]
genome_1_fasta = f"{fastapath}/{genome_1}{fastaext[genome_1]}"
genome_2_fasta = f"{fastapath}/{genome_2}{fastaext[genome_2]}"
genome_1_fasta = fastafiles[genome_1]
genome_2_fasta = fastafiles[genome_2]
unimog, jaccard, blocks_ref, blocks_query = unimogs_to_ilp_core(genome_1_fasta, genome_2_fasta, genome_1, genome_2, identity_threshold)
jaccards.append(f"{genome_1}\t{genome_2}\t{jaccard}\n")
if jaccard<=jaccard_distance:
Expand Down Expand Up @@ -66,7 +66,7 @@ def main():

pairs=read_in_batch_pairs(f"{args.outputpath}/batches/batch_{args.batch}.txt")

batchwise_unimog(fastapath, fastaext, pairs, args.unimog_output, args.map_output, args.jaccard_output, args.identity_threshold, args.jaccard_distance)
batchwise_unimog(fastafiles, pairs, args.unimog_output, args.map_output, args.jaccard_output, args.identity_threshold, args.jaccard_distance)


if __name__ == "__main__":
Expand Down
8 changes: 4 additions & 4 deletions pling/jac_network_snakemake/seq_jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,13 @@ def get_sequence_jaccard_distance(plasmid_1: Path, plasmid_2: Path, prefix: str,
jaccard_distance = 1-jaccard_similarity
return jaccard_distance

def batchwise_jaccard(fastapath, fastaext, pairs, jaccardpath, identity_threshold):
def batchwise_jaccard(fastafiles, pairs, jaccardpath, identity_threshold):
jaccards = []
for pair in pairs:
genome_1 = pair[0]
genome_2 = pair[1]
genome_1_fasta = f"{fastapath}/{genome_1}{fastaext[genome_1]}"
genome_2_fasta = f"{fastapath}/{genome_2}{fastaext[genome_2]}"
genome_1_fasta = fastafiles[genome_1]
genome_2_fasta = fastafiles[genome_2]
jaccard_distance = get_sequence_jaccard_distance(genome_1_fasta, genome_2_fasta, f"{genome_1}~{genome_2}", identity_threshold)
jaccards.append(f"{genome_1}\t{genome_2}\t{jaccard_distance}\n")
with open(jaccardpath, 'w') as f:
Expand All @@ -80,7 +80,7 @@ def main(args):

pairs=read_in_batch_pairs(f"{args.outputpath}/batches/batch_{args.batch}.txt")

batchwise_jaccard(fastapath, fastaext, pairs, args.jaccard_output, args.identity_threshold)
batchwise_jaccard(fastafiles, pairs, args.jaccard_output, args.identity_threshold)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Calculate Jaccard index for genome sequences.')
Expand Down
7 changes: 4 additions & 3 deletions pling/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def read_in_batch_pairs(filepath):
return pairs

def get_fasta_file_info(genomes_list):
FASTAFILES = [el[0] for el in pd.read_csv(genomes_list, header=None).values]
FASTAEXT = {os.path.splitext(os.path.basename(el))[0]:os.path.splitext(os.path.basename(el))[1] for el in FASTAFILES}
FASTAPATH = os.path.dirname(FASTAFILES[0])
FASTAFILES_LIST = [el[0] for el in pd.read_csv(genomes_list, header=None).values]
FASTAFILES = {os.path.splitext(os.path.basename(el))[0]:el for el in FASTAFILES_LIST}
FASTAEXT = {os.path.splitext(os.path.basename(el))[0]:os.path.splitext(os.path.basename(el))[1] for el in FASTAFILES_LIST}
FASTAPATH = os.path.dirname(FASTAFILES_LIST[0])
return FASTAFILES, FASTAEXT, FASTAPATH

0 comments on commit 40c69ff

Please sign in to comment.