Skip to content

Commit

Permalink
Check Dam / Dcm / N*5 patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
veghp committed Oct 16, 2024
1 parent b32b46e commit ce7f353
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 3 deletions.
50 changes: 47 additions & 3 deletions ediacara/Comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@
from weighted_levenshtein import lev

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import cyvcf2

import dnachisel
import dna_features_viewer
import geneblocks

Expand All @@ -39,6 +42,16 @@
"uncertain": "LOW_COVERAGE",
}

patterns = [
"GATC", # Dam
"CCTGG", # Dcm
"CCAGG", # Dcm
"AAAAA",
"TTTTT",
"CCCCC",
"GGGGG",
]


class CustomTranslator(dna_features_viewer.BiopythonTranslator):
"""Custom translator used in the coverage plot."""
Expand Down Expand Up @@ -611,18 +624,49 @@ def calculate_stats(self):
for i, value in enumerate(self.yy)
if value < self.median_yy * self.coverage_cutoff
]

# Group (or create blocks of) indices:
G = (
list(x)
for _, x in itertools.groupby(
indices, lambda x, c=itertools.count(): next(c) - x
)
)
self.low_coverage_positions_string = ", ".join(
"-".join(map(str, (g[0], g[-1])[: len(g)])) for g in G
)
groups_of_indices = ["-".join(map(str, (g[0], g[-1])[: len(g)])) for g in G]

# Separate out low coverage positions at Dam / Dcm / N*5 patterns;
# A clean record for finding patterns:
rec_for_search = SeqRecord(Seq(self.record.seq.upper()))
for pattern in patterns:
rec_for_search = dnachisel.annotate_pattern_occurrences(
record=rec_for_search, pattern=dnachisel.DnaNotationPattern(pattern)
)

indices_true = []
indices_ignore = []
for position in groups_of_indices:
ignore = False
if "-" in position: # do not check positions within ranges
indices_true += [position]
continue
for feature in rec_for_search.features:
if int(position) in feature:
ignore = True
break
if ignore:
indices_ignore += [position]
else:
indices_true += [position]

# As string for simple reporting:
self.low_coverage_positions_string = ", ".join(indices_true)
if self.low_coverage_positions_string == "":
self.low_coverage_positions_string = "-" # looks better in the pdf report

self.low_coverage_pos_ignore_string = ", ".join(indices_ignore)
if self.low_coverage_pos_ignore_string == "":
self.low_coverage_pos_ignore_string = "-" # looks better in the pdf report

def plot_coverage(self):
"""Plot the reference with the coverage and weak reads."""

Expand Down
2 changes: 2 additions & 0 deletions ediacara/report_assets/run_simulation_report.pug
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ each comparatorgroup in sequencinggroup.comparatorgroups

p Positions with low coverage (<{{ comparator.coverage_cutoff_pct }}%): #[span.red {{ comparator.low_coverage_positions_string }} ]

p Low coverage positions at Dam / Dcm / N×5 patterns: #[span.grey {{ comparator.low_coverage_pos_ignore_string }} ]

p Cumulative plot of longest unaligned interval in each read ({{ comparator.insert_pct_above_cutoff }}% above cutoff. mode: {{ comparator.insert_mode }} bp):

img#diff-figure(src="{{ comparator.insert_plot_data }}")
Expand Down
1 change: 1 addition & 0 deletions ediacara/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
import pdf_reports.tools as pdf_tools


from .version import __version__
from .Comparator import result_keywords

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@
"cyvcf2",
"pdf_reports",
"portion",
"dnachisel",
],
)

0 comments on commit ce7f353

Please sign in to comment.