Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(#741): Pretty print #742

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c254b47
feat(pretty print): adding some experimental code for pretty printing…
andreasprlic May 13, 2024
8986d20
feat(pretty print): making sure we don't bloat the test cache.
andreasprlic May 13, 2024
e9fde79
feat(pretty print): making type py3.8 compatible
andreasprlic May 13, 2024
d3a5e6e
feat(pretty print): breaking up creation of each line in display into…
andreasprlic May 28, 2024
f2018df
feat(pretty print): breaking up creation of each line in display into…
andreasprlic May 29, 2024
2f7069b
feat(pretty print): feat(pretty print): improvements for RNA coding t…
andreasprlic Jun 1, 2024
27ae2cd
feat(repeat-detection): using fully justified representation.
andreasprlic Jun 3, 2024
b5b3d64
feat(repeat-detection): using fully justified representation.
andreasprlic Jun 3, 2024
2a72299
formatting
andreasprlic Jun 3, 2024
0dcab28
feat(prot-pos): now showing the protein (amino acid) positions more c…
andreasprlic Jun 6, 2024
e9f578a
feat(repeat-detection): now with better detection of repetitive units…
andreasprlic Jun 7, 2024
99a614f
feat(repeat-detection): hooking repeat detection improvements in pret…
andreasprlic Jun 7, 2024
da48e01
Merge branch 'main' into pretty_print
andreasprlic Jun 9, 2024
f1e9703
feat(pretty_print): now showing hgvs_p, making code more accessible f…
andreasprlic Jun 30, 2024
7ad7f0c
Merge branch 'main' of github.com:biocommons/hgvs into pretty_print
andreasprlic Jun 30, 2024
a31cd26
feat(ci): trying to get pytest to work again
andreasprlic Jun 30, 2024
30addf2
feat(cleanup): removing cache from test_repeats. They are creating to…
andreasprlic Jul 1, 2024
bc483b5
cleanup
andreasprlic Aug 13, 2024
cf22aae
#741 small refactoring to update directory structure, based on PR com…
andreasprlic Sep 2, 2024
d4f2ac4
Merge branch 'main' into pretty_print
andreasprlic Sep 2, 2024
d08f02d
#741 small refactoring to update directory structure, based on PR com…
andreasprlic Sep 2, 2024
401c5a3
#741 excluding data intensive tests from CI. They would create too mu…
andreasprlic Sep 6, 2024
336ea2c
trying to fix CI
andreasprlic Sep 6, 2024
d216bba
feat(repeats): found an issue with count_pattern_occurences, fixed an…
andreasprlic Sep 7, 2024
9d15a22
feat(repeats): adding examples from HGVS repeats recommendations page…
andreasprlic Sep 16, 2024
99f27bf
feat(repeats): adding an example of a large variant to make sure we d…
andreasprlic Sep 17, 2024
703efbf
feat(repeats): moving max repeat config to global config. Also a smal…
andreasprlic Sep 19, 2024
bb7eb3c
skipfeat(repeats): skipping more tests that are too data heavy.
andreasprlic Sep 19, 2024
fed5af4
feat(repeats): improvements in repeat check.
andreasprlic Sep 19, 2024
9a2e049
feat(reverse-strand): Now showing reverse strand transcript in 5'-3' …
andreasprlic Oct 3, 2024
b82ab35
feat(is_rna): moving the check if a transcript is RNA only to datacom…
andreasprlic Oct 3, 2024
cdd87a5
fix
andreasprlic Oct 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/hgvs/pretty/console/tx_pos.py
andreasprlic marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ def display(self, data: VariantData) -> str:
"""show the position of the transcript seq"""
var_str = ""

rna_coding = data.var_c_or_n.ac.startswith("NR_")
if data.var_c_or_n:
rna_coding = data.var_c_or_n.ac.startswith("NR_")
else:
rna_coding = False

count = -1
for pdata in data.position_details:
Expand Down
4 changes: 3 additions & 1 deletion src/hgvs/pretty/datacompiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ def data(
# for p in position_details:
# print(f"{p}\t{p.protein_data}\t")

strand = mapper.strand if mapper else 1

vd = VariantData(
seq_start,
seq_end,
Expand All @@ -355,7 +357,7 @@ def data(
tx_seq,
mapper,
var_g,
mapper.strand,
strand,
var_c_or_n,
var_p,
position_details,
Expand Down
255 changes: 209 additions & 46 deletions src/hgvs/repeats.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,236 @@
# -*- coding: utf-8 -*-
""" A class to manage conversion of SequenceVariants to repeat representation"""

from dataclasses import dataclass
import re
from typing import Optional

from hgvs.pretty.models import VariantCoords


def count_pattern_occurence(pattern, string):
"""Counts how often a pattern can be found in the string."""
matches = re.finditer(f'(?={pattern})', string)
return sum(1 for _ in matches)


def count_repetitive_units(s):
length = len(s)

for i in range(1, length + 1):
unit = s[:i]
if length % i == 0:
if unit * (length // i) == s:
return length // i, unit

return 1, s
@dataclass(eq=True, repr=True, frozen=True, order=True)
class RepeatUnit:
repeat_count: int
repeat_unit:str
block_size:int
block: str


class RepeatAnalyser:

def __init__(self, fs: VariantCoords) -> None:
def __init__(self, fs: VariantCoords, reverse:bool=False) -> None:

self.is_repeat = False
self.ref_count = 0
self.alt_count = 0
self.repeat_units_ref = None
self.repeat_units_alt = None
self.ref_str = fs.ref
self.alt_str = fs.alt
self.reverse = reverse

self.repeat_unit = self._get_repeat_unit(fs)
if self.repeat_unit is None:
self.repeat_units_ref = detect_repetitive_block_lengths(self.ref_str, self.reverse)
self.repeat_units_alt = detect_repetitive_block_lengths(self.alt_str, self.reverse)

if len(self.repeat_units_ref) == 0 and len(self.repeat_units_alt) ==0 :
return

# check longest repeat blocks:
# we only look at ref to determine if there are repeats
# If ref has no repeat, we don't call this a repeat variant, even if alt would have a repetitive unit
longest_r_unit = self._get_longest_repeat_unit(self.repeat_units_ref)
if longest_r_unit is None:
return

# filter our too fragmented results
expected_size = len(self.ref_str) / 3
if longest_r_unit.block_size < expected_size:
return

if longest_r_unit.repeat_unit not in self.alt_str:
return


self.is_repeat = True

self.ref_count = count_pattern_occurence(self.repeat_unit, fs.ref)
self.alt_count = count_pattern_occurence(self.repeat_unit, fs.alt)
self.ref_str = f"{self.repeat_unit}[{self.ref_count}]"
self.alt_str = f"{self.repeat_unit}[{self.alt_count}]"
ref_repeat = get_repeat_str(self.ref_str, self.alt_str, self.repeat_units_ref, self.repeat_units_alt, self.reverse)
alt_repeat = get_repeat_str(self.alt_str, self.ref_str, self.repeat_units_alt, self.repeat_units_ref, self.reverse)

self.ref_str = ref_repeat
self.alt_str = alt_repeat

def __repr__(self):
return f"{self.ref_str}>{self.alt_str}"

def _get_repeat_unit(self, fs: VariantCoords) -> Optional[str]:
"""Takes fully justified coordiantes and tries to detect a repeat in them."""
# analyze for repeat:
if len(fs.ref) == len(fs.alt) > 0:
# seems we cant shuffle. is an SVN or delins
return None

if len(fs.alt) > 0:
if fs.alt in fs.ref:
if fs.ref.startswith(fs.alt):
d = fs.ref[len(fs.alt) :]
if count_pattern_occurence(d, fs.ref) > 1:
c, u = count_repetitive_units(d)
return u
elif fs.ref in fs.alt:
if fs.alt.startswith(fs.ref):
d = fs.alt[len(fs.ref) :]
if count_pattern_occurence(d, fs.ref) > 1:
c, u = count_repetitive_units(d)
return u
def _get_longest_repeat_unit(self, repeat_units:list[RepeatUnit])->RepeatUnit:
lru = None
for ru in repeat_units:
if not lru:
lru = ru
continue

if ru.block_size > len(lru.block):
lru = ru

return lru


def detect_repetitive_block_lengths(sequence: str, reverse: bool = False) -> list[RepeatUnit]:
"""Detects the length of repetitive blocks in a string, with an option to search from left to right or reverse.

In reverse mode, it creates the largest possible blocks of the smallest possible units.
"""
result: list[RepeatUnit] = []
seq_len = len(sequence)

if reverse:
i = seq_len # Start from the end of the sequence
while i > 0:
matched = False
# Iterate over block sizes from smallest to largest
for block_size in range(1, seq_len // 2 + 1):
if i - block_size < 0:
continue # Not enough characters to form a repeat unit

# Extract the potential repeat unit ending at position i
repeat_unit = sequence[i - block_size:i]

# Calculate the maximum possible number of repeats for this block size
max_possible_repeats = i // block_size

# Initialize repeat_count to the maximum possible and decrease
for repeat_count in range(max_possible_repeats, 1, -1):
start_index = i - repeat_count * block_size
if start_index < 0:
continue # Not enough characters to form the repetitive block

# Extract the substring that could be the repetitive block
substr = sequence[start_index:i]

# Build the regex pattern for the current repeat unit and count
pattern = rf'({re.escape(repeat_unit)})' + r'{' + f'{repeat_count}' + r'}'

# Check if the substring matches the pattern
if re.fullmatch(pattern, substr):
repetitive_block = substr
ru = RepeatUnit(
block=repetitive_block,
block_size=len(repetitive_block),
repeat_unit=repeat_unit,
repeat_count=repeat_count
)
result.append(ru)
# Move the index `i` backward by the length of the repetitive block
i -= len(repetitive_block)
matched = True
break # Found the largest block for this unit size

if matched:
break # Proceed to the next position `i`

if not matched:
# No repeat found, remove one character
ru = RepeatUnit(
block=sequence[i - 1],
block_size=1,
repeat_unit=sequence[i - 1],
repeat_count=1
)
result.append(ru)
i -= 1 # Move back by one character if no match is found

else:
i = 0 # Start from the beginning of the sequence
while i < seq_len:
matched = False
for block_size in range(1, seq_len // 2 + 1):
# Build the regex pattern for the current block size
pattern = rf'(.{{{block_size}}})\1+'
match = re.match(pattern, sequence[i:])

if match:
repetitive_block = match.group() # The full repeating pattern
repeated_unit = match.group(1) # The repeating unit
repetition_count = len(repetitive_block) // len(repeated_unit)

# Add the repetitive block and its details to the result
ru = RepeatUnit(
block=repetitive_block,
block_size=len(repetitive_block),
repeat_unit=repeated_unit,
repeat_count=repetition_count
)
result.append(ru)

# Move the index `i` forward by the length of the repetitive block
i += len(repetitive_block)
matched = True
break # Found a match, break the loop for current block size

if not matched:
i += 1 # Move forward by one character if no match is found

return result


def get_repeat_str(
sequence: str,
other_seq: str,
primary_repeat_unit: list[RepeatUnit],
other_repeat_unit: list[RepeatUnit],
reverse: bool = False
) -> str:
if len(primary_repeat_unit) == 0 and len(other_repeat_unit) == 0:
return None

if len(primary_repeat_unit) == 0 and len(other_repeat_unit) == 1 and sequence == other_repeat_unit[0].repeat_unit:
return f"{sequence}[1]"
elif len(primary_repeat_unit) == 0 and len(other_repeat_unit) == 1 and sequence != other_repeat_unit[0].repeat_unit:
return None

if len(primary_repeat_unit) > 0 and len(other_repeat_unit) > 0:
return_str = assemble_repeat_string(sequence, primary_repeat_unit, reverse=reverse)
return return_str

if len(other_repeat_unit) == 0 and len(other_seq) > 0:
return_str = assemble_repeat_string(sequence, primary_repeat_unit, reverse=reverse)
if len(return_str) > 0:
return return_str

return None

def assemble_repeat_string(sequence: str, repeat_units: list[RepeatUnit], reverse: bool = False) -> str:
return_str = ""
primary_repeat_unit = repeat_units.copy()
seq = sequence

if reverse:
while len(seq) > 0:
found_unit = None
for ru in primary_repeat_unit:
if seq.endswith(ru.block):
return_str = f"{ru.repeat_unit}[{ru.repeat_count}]" + return_str
seq = seq[:-len(ru.block)]
found_unit = ru
break
if not found_unit:
# remove one character if no matching repeat unit is found
seq_char = seq[0]
seq = seq[:-1]
return_str = f"{seq_char}[1]" + return_str

else: # forward direction
while len(seq) > 0:
found_unit = None
for ru in primary_repeat_unit:
if seq.startswith(ru.block):
return_str += f"{ru.repeat_unit}[{ru.repeat_count}]"
seq = seq[len(ru.block):]
found_unit = ru
break
if not found_unit:
# remove one character if no matching repeat unit is found
seq_char = seq[0]
seq = seq[1:]
return_str += f"{seq_char}[1]"


return return_str

Loading
Loading