From 15308baa29e40e3a5fd8ac4288c38e00e93a5c93 Mon Sep 17 00:00:00 2001
From: babayagaofficial <baba.yaga.df@gmail.com>
Date: Thu, 11 Jan 2024 18:21:56 +0000
Subject: [PATCH 1/3] workaround to make blocks work for the pair
 NZ_CP029142.1, NZ_CP060660.1. Issue was that nucmer reported two matches with
 an overlap, where ref and query had different lengths, but no indels were
 reported (and the two matches probably ought to have been reported as one
 whole match. Workaround doesn't use the projection to get the overlapping
 intervals coordinates in split_match function anymore.

---
 pling/align_snakemake/matches.py | 30 ++++++++++++++++++++++++------
 pling/align_snakemake/unimog.py  |  1 +
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/pling/align_snakemake/matches.py b/pling/align_snakemake/matches.py
index 4bf98ac..abd493f 100644
--- a/pling/align_snakemake/matches.py
+++ b/pling/align_snakemake/matches.py
@@ -37,7 +37,9 @@ def __str__(self):
 
     def projection(self, coord, ref_bool): #if ref_bool=True, project from reference to query, else query to reference
         if ref_bool:
+            print(self.indels)
             dist = coord - self.rstart
+            print(dist)
             for indel in self.indels:
                 if indel.rstart<=coord<indel.rend:
                     return indel.qstart
@@ -46,12 +48,15 @@ def projection(self, coord, ref_bool): #if ref_bool=True, project from reference
                         dist += indel.len
                     elif indel.type == "DEL":
                         dist -= indel.len
+            print(dist)
             if self.strand == 1:
                 projected_coord = self.qstart + dist
             else:
                 projected_coord = self.qend - dist
         else:
+            print(self.indels)
             dist = coord - self.qstart
+            print(dist)
             for indel in self.indels:
                 if indel.qstart<=coord<indel.qend:
                     return indel.rstart
@@ -60,6 +65,7 @@ def projection(self, coord, ref_bool): #if ref_bool=True, project from reference
                         dist += indel.len
                     elif indel.type == "DEL":
                         dist -= indel.len
+            print(dist)
             if self.strand == 1:
                 projected_coord = self.rstart + dist
             else:
@@ -145,10 +151,13 @@ def contain_interval(self, start, end, ref_bool): #find in which matches interva
                     matches.append(Match(interval.data[0], interval.data[1], interval.begin, interval.end, interval.data[2]))
         return matches
 
-    def split_match(self, match, start, end, ref_bool): #given an interval (start,end) on ref/query genome, split up the match (containing it)
+    def split_match(self, match, start, end, interval_start, interval_end, ref_bool): #given an interval (start,end) on ref/query genome, split up the match (containing it)
         index = self.list.index(match)
         projected_start = match.projection(start,ref_bool)
         projected_end = match.projection(end,ref_bool)
+        projected_start = interval_start
+        projected_end = interval_end
+        print(projected_start, projected_end)
         if ref_bool:
             if match.strand == 1:
                 lhs_split = Match(match.rstart, start, match.qstart, projected_start, 1)
@@ -167,6 +176,7 @@ def split_match(self, match, start, end, ref_bool): #given an interval (start,en
                 rhs_split = Match(projected_start, match.rend, match.qstart, start, -1 )
                 interval = Match(projected_end, projected_start, start, end, -1)
                 lhs_split = Match(match.rstart, projected_end, end, match.qend, -1)
+        print(lhs_split,interval,rhs_split)
         if lhs_split.rend != lhs_split.rstart or lhs_split.qend!=lhs_split.qstart:
             self[index] = lhs_split #add even if null interval bc otherwise will break the walk from left to right -- null interval from here will be removed later
             self.insert(index+1, interval)
@@ -224,6 +234,8 @@ def resolve_overlaps(self, overlap_threshold):
         i=0
         finished = False
         while not finished:
+            print("query", i)
+            print(self)
             overlap = 0
             try:
                 overlap = self[i].qend-self[i+1].qstart
@@ -235,21 +247,27 @@ def resolve_overlaps(self, overlap_threshold):
                 containment = False
             if not_null and overlap>overlap_threshold:
                 overlap_matches = self.find_opposite_overlaps(i, False)
+                print(overlap_matches[0])
                 contain_overlap_1 = self.contain_interval(overlap_matches[0].rstart, overlap_matches[0].rend, True)
                 for match in contain_overlap_1:
-                    self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, True)
+                    self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, self[i].qend, self[i+1].qstart, True)
                 contain_overlap_2 = self.contain_interval(overlap_matches[1].rstart, overlap_matches[1].rend, True)
                 for match in contain_overlap_2:
-                    self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, True)
+                    self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, self[i].qend, self[i+1].qstart, True)
                 if containment:
+                    print(self)
                     current_match = self[i]
+                    print(current_match)
                     self.sort(False)
                     i = self.list.index(current_match)
+                    print("contained")
             i = i+1
         self.sort(True)
         i=0
         finished = False
-        while not finished:
+        while not finished and i<50:
+            print("ref", i)
+            print(self)
             overlap = 0
             try:
                 overlap = self[i].rend-self[i+1].rstart
@@ -264,10 +282,10 @@ def resolve_overlaps(self, overlap_threshold):
                 overlap_matches = self.find_opposite_overlaps(i, True)
                 contain_overlap_1 = self.contain_interval(overlap_matches[0].qstart, overlap_matches[0].qend, False)
                 for match in contain_overlap_1:
-                    self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, False)
+                    self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, self[i].rend, self[i+1].rstart, False)
                 contain_overlap_2 = self.contain_interval(overlap_matches[1].qstart, overlap_matches[1].qend, False)
                 for match in contain_overlap_2:
-                    self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, False)
+                    self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, self[i].rend, self[i+1].rstart, False)
                 if containment:
                     current_match = self[i]
                     self.sort(True)
diff --git a/pling/align_snakemake/unimog.py b/pling/align_snakemake/unimog.py
index 0ec2f08..4884d4a 100644
--- a/pling/align_snakemake/unimog.py
+++ b/pling/align_snakemake/unimog.py
@@ -26,6 +26,7 @@ def batchwise_unimog(fastafiles, pairs, unimogpath, mappath, jaccardpath, identi
             unimogs.append(unimog)
             blocks = pd.concat([blocks_ref, blocks_query], ignore_index=True)
             batch_blocks[f"{genome_1}~{genome_2}"] = blocks
+        print(pair)
     with open(unimogpath, 'w') as f:
         for line in unimogs:
             f.write(line)

From b4ac51512c904adad3c258650d4c128a9bcc33c1 Mon Sep 17 00:00:00 2001
From: babayagaofficial <baba.yaga.df@gmail.com>
Date: Fri, 12 Jan 2024 17:20:47 +0000
Subject: [PATCH 2/3] Remove workaround from previous commit, since changing
 the parameters nucmer is run with fixes the issue. Switched from using
 dnadiff to using nucmer directly, and to run with --breaklen 500 and
 --diagdiff 20, which (anecdotely) should prevent breaking up matches like in
 the pair NZ_CP029142.1, NZ_CP060660.1.

---
 pling/align_snakemake/integerise_plasmids.py |  2 +-
 pling/align_snakemake/matches.py             | 12 +++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/pling/align_snakemake/integerise_plasmids.py b/pling/align_snakemake/integerise_plasmids.py
index 4a08ff2..9d246be 100644
--- a/pling/align_snakemake/integerise_plasmids.py
+++ b/pling/align_snakemake/integerise_plasmids.py
@@ -77,7 +77,7 @@ def get_coverage(tree):
     return coverage
 
 def integerise_plasmids(plasmid_1: Path, plasmid_2: Path, prefix: str, plasmid_1_name, plasmid_2_name, identity_threshold=80, length_threshold=200):
-    subprocess.check_call(f"perl -w $(which dnadiff) {plasmid_1} {plasmid_2} -p {prefix} 2>/dev/null", shell=True)
+    subprocess.check_call(f"nucmer --diagdiff 20 --breaklen 500  --maxmatch -p {prefix} {plasmid_1} {plasmid_2} && delta-filter -1 {prefix}.delta > {prefix}.1delta", shell=True)
     show_coords_output = subprocess.check_output(f"show-coords -TrcldH -I {identity_threshold} {prefix}.1delta", shell=True).strip().split(b'\n')  # TODO: what about this threshold?
     show_snps_output = subprocess.check_output(f"show-snps -TrH {prefix}.1delta", shell=True).strip().split(b'\n')
 
diff --git a/pling/align_snakemake/matches.py b/pling/align_snakemake/matches.py
index abd493f..d245dfe 100644
--- a/pling/align_snakemake/matches.py
+++ b/pling/align_snakemake/matches.py
@@ -151,12 +151,10 @@ def contain_interval(self, start, end, ref_bool): #find in which matches interva
                     matches.append(Match(interval.data[0], interval.data[1], interval.begin, interval.end, interval.data[2]))
         return matches
 
-    def split_match(self, match, start, end, interval_start, interval_end, ref_bool): #given an interval (start,end) on ref/query genome, split up the match (containing it)
+    def split_match(self, match, start, end, ref_bool): #given an interval (start,end) on ref/query genome, split up the match (containing it)
         index = self.list.index(match)
         projected_start = match.projection(start,ref_bool)
         projected_end = match.projection(end,ref_bool)
-        projected_start = interval_start
-        projected_end = interval_end
         print(projected_start, projected_end)
         if ref_bool:
             if match.strand == 1:
@@ -250,10 +248,10 @@ def resolve_overlaps(self, overlap_threshold):
                 print(overlap_matches[0])
                 contain_overlap_1 = self.contain_interval(overlap_matches[0].rstart, overlap_matches[0].rend, True)
                 for match in contain_overlap_1:
-                    self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, self[i].qend, self[i+1].qstart, True)
+                    self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, True)
                 contain_overlap_2 = self.contain_interval(overlap_matches[1].rstart, overlap_matches[1].rend, True)
                 for match in contain_overlap_2:
-                    self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, self[i].qend, self[i+1].qstart, True)
+                    self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, True)
                 if containment:
                     print(self)
                     current_match = self[i]
@@ -282,10 +280,10 @@ def resolve_overlaps(self, overlap_threshold):
                 overlap_matches = self.find_opposite_overlaps(i, True)
                 contain_overlap_1 = self.contain_interval(overlap_matches[0].qstart, overlap_matches[0].qend, False)
                 for match in contain_overlap_1:
-                    self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, self[i].rend, self[i+1].rstart, False)
+                    self.split_match(match, overlap_matches[0].qstart, overlap_matches[0].qend, False)
                 contain_overlap_2 = self.contain_interval(overlap_matches[1].qstart, overlap_matches[1].qend, False)
                 for match in contain_overlap_2:
-                    self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, self[i].rend, self[i+1].rstart, False)
+                    self.split_match(match, overlap_matches[1].qstart, overlap_matches[1].qend, False)
                 if containment:
                     current_match = self[i]
                     self.sort(True)

From 4294a642d295ec91962ae195117a70b56e7069c3 Mon Sep 17 00:00:00 2001
From: babayagaofficial <baba.yaga.df@gmail.com>
Date: Fri, 12 Jan 2024 17:46:46 +0000
Subject: [PATCH 3/3] Made matches.py error out if number of iterations in one
 of the while loop in resolve_overlaps goes over 100000 to make catching
 issues quicker in future. Cleaned up mess from debugging.

---
 pling/align_snakemake/matches.py | 23 +++++------------------
 pling/align_snakemake/unimog.py  |  1 -
 2 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/pling/align_snakemake/matches.py b/pling/align_snakemake/matches.py
index d245dfe..8b8acff 100644
--- a/pling/align_snakemake/matches.py
+++ b/pling/align_snakemake/matches.py
@@ -37,9 +37,7 @@ def __str__(self):
 
     def projection(self, coord, ref_bool): #if ref_bool=True, project from reference to query, else query to reference
         if ref_bool:
-            print(self.indels)
             dist = coord - self.rstart
-            print(dist)
             for indel in self.indels:
                 if indel.rstart<=coord<indel.rend:
                     return indel.qstart
@@ -48,15 +46,12 @@ def projection(self, coord, ref_bool): #if ref_bool=True, project from reference
                         dist += indel.len
                     elif indel.type == "DEL":
                         dist -= indel.len
-            print(dist)
             if self.strand == 1:
                 projected_coord = self.qstart + dist
             else:
                 projected_coord = self.qend - dist
         else:
-            print(self.indels)
             dist = coord - self.qstart
-            print(dist)
             for indel in self.indels:
                 if indel.qstart<=coord<indel.qend:
                     return indel.rstart
@@ -65,7 +60,6 @@ def projection(self, coord, ref_bool): #if ref_bool=True, project from reference
                         dist += indel.len
                     elif indel.type == "DEL":
                         dist -= indel.len
-            print(dist)
             if self.strand == 1:
                 projected_coord = self.rstart + dist
             else:
@@ -155,7 +149,6 @@ def split_match(self, match, start, end, ref_bool): #given an interval (start,en
         index = self.list.index(match)
         projected_start = match.projection(start,ref_bool)
         projected_end = match.projection(end,ref_bool)
-        print(projected_start, projected_end)
         if ref_bool:
             if match.strand == 1:
                 lhs_split = Match(match.rstart, start, match.qstart, projected_start, 1)
@@ -174,7 +167,6 @@ def split_match(self, match, start, end, ref_bool): #given an interval (start,en
                 rhs_split = Match(projected_start, match.rend, match.qstart, start, -1 )
                 interval = Match(projected_end, projected_start, start, end, -1)
                 lhs_split = Match(match.rstart, projected_end, end, match.qend, -1)
-        print(lhs_split,interval,rhs_split)
         if lhs_split.rend != lhs_split.rstart or lhs_split.qend!=lhs_split.qstart:
             self[index] = lhs_split #add even if null interval bc otherwise will break the walk from left to right -- null interval from here will be removed later
             self.insert(index+1, interval)
@@ -232,8 +224,6 @@ def resolve_overlaps(self, overlap_threshold):
         i=0
         finished = False
         while not finished:
-            print("query", i)
-            print(self)
             overlap = 0
             try:
                 overlap = self[i].qend-self[i+1].qstart
@@ -245,7 +235,6 @@ def resolve_overlaps(self, overlap_threshold):
                 containment = False
             if not_null and overlap>overlap_threshold:
                 overlap_matches = self.find_opposite_overlaps(i, False)
-                print(overlap_matches[0])
                 contain_overlap_1 = self.contain_interval(overlap_matches[0].rstart, overlap_matches[0].rend, True)
                 for match in contain_overlap_1:
                     self.split_match(match, overlap_matches[0].rstart, overlap_matches[0].rend, True)
@@ -253,24 +242,20 @@ def resolve_overlaps(self, overlap_threshold):
                 for match in contain_overlap_2:
                     self.split_match(match, overlap_matches[1].rstart, overlap_matches[1].rend, True)
                 if containment:
-                    print(self)
                     current_match = self[i]
-                    print(current_match)
                     self.sort(False)
                     i = self.list.index(current_match)
-                    print("contained")
             i = i+1
+            if i>100000:
+                raise Exception("The resolve_overlaps function in matches.py has been stuck in the while loop for 100000 iterations! There is likely a bug, please raise an issue.")
         self.sort(True)
         i=0
         finished = False
-        while not finished and i<50:
-            print("ref", i)
-            print(self)
+        while not finished:
             overlap = 0
             try:
                 overlap = self[i].rend-self[i+1].rstart
                 not_null = self[i].rstart!=self[i].rend and self[i+1].rstart!=self[i+1].rend
-                not_the_same = self[i].rstart!=self[i+1].rstart and self[i].rend!=self[i+1].rend #ignore multicopy
                 containment = self[i+1].rend<self[i].rend
             except IndexError:
                 finished = True
@@ -289,6 +274,8 @@ def resolve_overlaps(self, overlap_threshold):
                     self.sort(True)
                     i = self.list.index(current_match)
             i = i+1
+            if i>100000:
+                raise Exception("The resolve_overlaps function in matches.py has been stuck in the while loop for 100000 iterations! There is likely a bug, please raise an issue.")
 
 testing = False
 if testing == True:
diff --git a/pling/align_snakemake/unimog.py b/pling/align_snakemake/unimog.py
index 4884d4a..0ec2f08 100644
--- a/pling/align_snakemake/unimog.py
+++ b/pling/align_snakemake/unimog.py
@@ -26,7 +26,6 @@ def batchwise_unimog(fastafiles, pairs, unimogpath, mappath, jaccardpath, identi
             unimogs.append(unimog)
             blocks = pd.concat([blocks_ref, blocks_query], ignore_index=True)
             batch_blocks[f"{genome_1}~{genome_2}"] = blocks
-        print(pair)
     with open(unimogpath, 'w') as f:
         for line in unimogs:
             f.write(line)