From 15308baa29e40e3a5fd8ac4288c38e00e93a5c93 Mon Sep 17 00:00:00 2001 From: babayagaofficial Date: Thu, 11 Jan 2024 18:21:56 +0000 Subject: [PATCH 1/3] workaround to make blocks work for the pair NZ_CP029142.1, NZ_CP060660.1. Issue was that nucmer reported two matches with an overlap, where ref and query had different lengths, but no indels were reported (and the two matches probably ought to have been reported as one whole match. Workaround doesn't use the projection to get the overlapping intervals coordinates in split_match function anymore. --- pling/align_snakemake/matches.py | 30 ++++++++++++++++++++++++------ pling/align_snakemake/unimog.py | 1 + 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pling/align_snakemake/matches.py b/pling/align_snakemake/matches.py index 4bf98ac..abd493f 100644 --- a/pling/align_snakemake/matches.py +++ b/pling/align_snakemake/matches.py @@ -37,7 +37,9 @@ def __str__(self): def projection(self, coord, ref_bool): #if ref_bool=True, project from reference to query, else query to reference if ref_bool: + print(self.indels) dist = coord - self.rstart + print(dist) for indel in self.indels: if indel.rstart<=coordoverlap_threshold: overlap_matches = self.find_opposite_overlaps(i, False) + print(overlap_matches[0]) contain_overlap_1 = self.contain_interval(overlap_matches[0].rstart, overlap_matches[0].rend, True) for match in contain_overlap_1: - self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, True) + self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, self[i].qend, self[i+1].qstart, True) contain_overlap_2 = self.contain_interval(overlap_matches[1].rstart, overlap_matches[1].rend, True) for match in contain_overlap_2: - self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, True) + self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, self[i].qend, self[i+1].qstart, True) if containment: + print(self) current_match = self[i] + print(current_match) self.sort(False) i = self.list.index(current_match) + print("contained") i = i+1 self.sort(True) i=0 finished = False - while not finished: + while not finished and i<50: + print("ref", i) + print(self) overlap = 0 try: overlap = self[i].rend-self[i+1].rstart @@ -264,10 +282,10 @@ def resolve_overlaps(self, overlap_threshold): overlap_matches = self.find_opposite_overlaps(i, True) contain_overlap_1 = self.contain_interval(overlap_matches[0].qstart, overlap_matches[0].qend, False) for match in contain_overlap_1: - self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, False) + self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, self[i].rend, self[i+1].rstart, False) contain_overlap_2 = self.contain_interval(overlap_matches[1].qstart, overlap_matches[1].qend, False) for match in contain_overlap_2: - self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, False) + self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, self[i].rend, self[i+1].rstart, False) if containment: current_match = self[i] self.sort(True) diff --git a/pling/align_snakemake/unimog.py b/pling/align_snakemake/unimog.py index 0ec2f08..4884d4a 100644 --- a/pling/align_snakemake/unimog.py +++ b/pling/align_snakemake/unimog.py @@ -26,6 +26,7 @@ def batchwise_unimog(fastafiles, pairs, unimogpath, mappath, jaccardpath, identi unimogs.append(unimog) blocks = pd.concat([blocks_ref, blocks_query], ignore_index=True) batch_blocks[f"{genome_1}~{genome_2}"] = blocks + print(pair) with open(unimogpath, 'w') as f: for line in unimogs: f.write(line) From b4ac51512c904adad3c258650d4c128a9bcc33c1 Mon Sep 17 00:00:00 2001 From: babayagaofficial Date: Fri, 12 Jan 2024 17:20:47 +0000 Subject: [PATCH 2/3] Remove workaround from previous commit, since changing the parameters nucmer is run with fixes the issue. Switched from using dnadiff to using nucmer directly, and to run with --breaklen 500 and --diagdiff 20, which (anecdotely) should prevent breaking up matches like in the pair NZ_CP029142.1, NZ_CP060660.1. --- pling/align_snakemake/integerise_plasmids.py | 2 +- pling/align_snakemake/matches.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pling/align_snakemake/integerise_plasmids.py b/pling/align_snakemake/integerise_plasmids.py index 4a08ff2..9d246be 100644 --- a/pling/align_snakemake/integerise_plasmids.py +++ b/pling/align_snakemake/integerise_plasmids.py @@ -77,7 +77,7 @@ def get_coverage(tree): return coverage def integerise_plasmids(plasmid_1: Path, plasmid_2: Path, prefix: str, plasmid_1_name, plasmid_2_name, identity_threshold=80, length_threshold=200): - subprocess.check_call(f"perl -w $(which dnadiff) {plasmid_1} {plasmid_2} -p {prefix} 2>/dev/null", shell=True) + subprocess.check_call(f"nucmer --diagdiff 20 --breaklen 500 --maxmatch -p {prefix} {plasmid_1} {plasmid_2} && delta-filter -1 {prefix}.delta > {prefix}.1delta", shell=True) show_coords_output = subprocess.check_output(f"show-coords -TrcldH -I {identity_threshold} {prefix}.1delta", shell=True).strip().split(b'\n') # TODO: what about this threshold? show_snps_output = subprocess.check_output(f"show-snps -TrH {prefix}.1delta", shell=True).strip().split(b'\n') diff --git a/pling/align_snakemake/matches.py b/pling/align_snakemake/matches.py index abd493f..d245dfe 100644 --- a/pling/align_snakemake/matches.py +++ b/pling/align_snakemake/matches.py @@ -151,12 +151,10 @@ def contain_interval(self, start, end, ref_bool): #find in which matches interva matches.append(Match(interval.data[0], interval.data[1], interval.begin, interval.end, interval.data[2])) return matches - def split_match(self, match, start, end, interval_start, interval_end, ref_bool): #given an interval (start,end) on ref/query genome, split up the match (containing it) + def split_match(self, match, start, end, ref_bool): #given an interval (start,end) on ref/query genome, split up the match (containing it) index = self.list.index(match) projected_start = match.projection(start,ref_bool) projected_end = match.projection(end,ref_bool) - projected_start = interval_start - projected_end = interval_end print(projected_start, projected_end) if ref_bool: if match.strand == 1: @@ -250,10 +248,10 @@ def resolve_overlaps(self, overlap_threshold): print(overlap_matches[0]) contain_overlap_1 = self.contain_interval(overlap_matches[0].rstart, overlap_matches[0].rend, True) for match in contain_overlap_1: - self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, self[i].qend, self[i+1].qstart, True) + self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, True) contain_overlap_2 = self.contain_interval(overlap_matches[1].rstart, overlap_matches[1].rend, True) for match in contain_overlap_2: - self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, self[i].qend, self[i+1].qstart, True) + self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, True) if containment: print(self) current_match = self[i] @@ -282,10 +280,10 @@ def resolve_overlaps(self, overlap_threshold): overlap_matches = self.find_opposite_overlaps(i, True) contain_overlap_1 = self.contain_interval(overlap_matches[0].qstart, overlap_matches[0].qend, False) for match in contain_overlap_1: - self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, self[i].rend, self[i+1].rstart, False) + self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, False) contain_overlap_2 = self.contain_interval(overlap_matches[1].qstart, overlap_matches[1].qend, False) for match in contain_overlap_2: - self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, self[i].rend, self[i+1].rstart, False) + self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, False) if containment: current_match = self[i] self.sort(True) From 4294a642d295ec91962ae195117a70b56e7069c3 Mon Sep 17 00:00:00 2001 From: babayagaofficial Date: Fri, 12 Jan 2024 17:46:46 +0000 Subject: [PATCH 3/3] Made matches.py error out if number of iterations in one of the while loop in resolve_overlaps goes over 100000 to make catching issues quicker in future. Cleaned up mess from debugging. --- pling/align_snakemake/matches.py | 23 +++++------------------ pling/align_snakemake/unimog.py | 1 - 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/pling/align_snakemake/matches.py b/pling/align_snakemake/matches.py index d245dfe..8b8acff 100644 --- a/pling/align_snakemake/matches.py +++ b/pling/align_snakemake/matches.py @@ -37,9 +37,7 @@ def __str__(self): def projection(self, coord, ref_bool): #if ref_bool=True, project from reference to query, else query to reference if ref_bool: - print(self.indels) dist = coord - self.rstart - print(dist) for indel in self.indels: if indel.rstart<=coordoverlap_threshold: overlap_matches = self.find_opposite_overlaps(i, False) - print(overlap_matches[0]) contain_overlap_1 = self.contain_interval(overlap_matches[0].rstart, overlap_matches[0].rend, True) for match in contain_overlap_1: self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, True) @@ -253,24 +242,20 @@ def resolve_overlaps(self, overlap_threshold): for match in contain_overlap_2: self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, True) if containment: - print(self) current_match = self[i] - print(current_match) self.sort(False) i = self.list.index(current_match) - print("contained") i = i+1 + if i>100000: + raise Exception("The resolve_overlaps function in matches.py has been stuck in the while loop for 100000 iterations! There is likely a bug, please raise an issue.") self.sort(True) i=0 finished = False - while not finished and i<50: - print("ref", i) - print(self) + while not finished: overlap = 0 try: overlap = self[i].rend-self[i+1].rstart not_null = self[i].rstart!=self[i].rend and self[i+1].rstart!=self[i+1].rend - not_the_same = self[i].rstart!=self[i+1].rstart and self[i].rend!=self[i+1].rend #ignore multicopy containment = self[i+1].rend100000: + raise Exception("The resolve_overlaps function in matches.py has been stuck in the while loop for 100000 iterations! There is likely a bug, please raise an issue.") testing = False if testing == True: diff --git a/pling/align_snakemake/unimog.py b/pling/align_snakemake/unimog.py index 4884d4a..0ec2f08 100644 --- a/pling/align_snakemake/unimog.py +++ b/pling/align_snakemake/unimog.py @@ -26,7 +26,6 @@ def batchwise_unimog(fastafiles, pairs, unimogpath, mappath, jaccardpath, identi unimogs.append(unimog) blocks = pd.concat([blocks_ref, blocks_query], ignore_index=True) batch_blocks[f"{genome_1}~{genome_2}"] = blocks - print(pair) with open(unimogpath, 'w') as f: for line in unimogs: f.write(line)