Check Dam / Dcm / N*5 patterns

Edinburgh-Genome-Foundry · Oct 16, 2024 · ce7f353 · ce7f353
1 parent b32b46e
commit ce7f353
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 3 deletions.
diff --git a/ediacara/Comparator.py b/ediacara/Comparator.py
@@ -19,8 +19,11 @@
 from weighted_levenshtein import lev
 
 from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
 import cyvcf2
 
+import dnachisel
 import dna_features_viewer
 import geneblocks
 
@@ -39,6 +42,16 @@
     "uncertain": "LOW_COVERAGE",
 }
 
+patterns = [
+    "GATC",  # Dam
+    "CCTGG",  # Dcm
+    "CCAGG",  # Dcm
+    "AAAAA",
+    "TTTTT",
+    "CCCCC",
+    "GGGGG",
+]
+
 
 class CustomTranslator(dna_features_viewer.BiopythonTranslator):
     """Custom translator used in the coverage plot."""
@@ -611,18 +624,49 @@ def calculate_stats(self):
             for i, value in enumerate(self.yy)
             if value < self.median_yy * self.coverage_cutoff
         ]
+
+        # Group (or create blocks of) indices:
         G = (
             list(x)
             for _, x in itertools.groupby(
                 indices, lambda x, c=itertools.count(): next(c) - x
             )
         )
-        self.low_coverage_positions_string = ", ".join(
-            "-".join(map(str, (g[0], g[-1])[: len(g)])) for g in G
-        )
+        groups_of_indices = ["-".join(map(str, (g[0], g[-1])[: len(g)])) for g in G]
+
+        # Separate out low coverage positions at Dam / Dcm / N*5 patterns;
+        # A clean record for finding patterns:
+        rec_for_search = SeqRecord(Seq(self.record.seq.upper()))
+        for pattern in patterns:
+            rec_for_search = dnachisel.annotate_pattern_occurrences(
+                record=rec_for_search, pattern=dnachisel.DnaNotationPattern(pattern)
+            )
+
+        indices_true = []
+        indices_ignore = []
+        for position in groups_of_indices:
+            ignore = False
+            if "-" in position:  # do not check positions within ranges
+                indices_true += [position]
+                continue
+            for feature in rec_for_search.features:
+                if int(position) in feature:
+                    ignore = True
+                    break
+            if ignore:
+                indices_ignore += [position]
+            else:
+                indices_true += [position]
+
+        # As string for simple reporting:
+        self.low_coverage_positions_string = ", ".join(indices_true)
         if self.low_coverage_positions_string == "":
             self.low_coverage_positions_string = "-"  # looks better in the pdf report
 
+        self.low_coverage_pos_ignore_string = ", ".join(indices_ignore)
+        if self.low_coverage_pos_ignore_string == "":
+            self.low_coverage_pos_ignore_string = "-"  # looks better in the pdf report
+
     def plot_coverage(self):
         """Plot the reference with the coverage and weak reads."""
 

diff --git a/ediacara/report_assets/run_simulation_report.pug b/ediacara/report_assets/run_simulation_report.pug
@@ -84,6 +84,8 @@ each comparatorgroup in sequencinggroup.comparatorgroups
 
     p Positions with low coverage (<{{ comparator.coverage_cutoff_pct }}%): #[span.red {{ comparator.low_coverage_positions_string }} ]
 
+    p Low coverage positions at Dam / Dcm / N×5 patterns: #[span.grey {{ comparator.low_coverage_pos_ignore_string }} ]
+
     p Cumulative plot of longest unaligned interval in each read ({{ comparator.insert_pct_above_cutoff }}% above cutoff. mode: {{ comparator.insert_mode }} bp):
 
     img#diff-figure(src="{{ comparator.insert_plot_data }}")

diff --git a/ediacara/reports.py b/ediacara/reports.py
@@ -22,6 +22,7 @@
 )
 import pdf_reports.tools as pdf_tools
 
+
 from .version import __version__
 from .Comparator import result_keywords
 

diff --git a/setup.py b/setup.py
@@ -26,5 +26,6 @@
         "cyvcf2",
         "pdf_reports",
         "portion",
+        "dnachisel",
     ],
 )