Skip to content

Commit

Permalink
update GCoverageSet
Browse files Browse the repository at this point in the history
  • Loading branch information
chaochungkuo committed Mar 5, 2024
1 parent 3217aa7 commit f2e3f3d
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 103 deletions.
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,10 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.vscode/
.vscode/

tests/local_test/*.bed
tests/local_test/*.fasta
tests/local_test/*.bed

.DS_Store
57 changes: 51 additions & 6 deletions genomkit/coverages/gcoverages.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ def __init__(self, bin_size: int = 1, load: str = "", windows=None):
self.load_coverage_from_bigwig(load, windows=windows)
elif load.lower().endswith(".bam"):
self.calculate_coverage_from_bam(load, windows=windows)
elif load.lower().endswith(".bed"):
regions = GRegions(name=load, load=load)
self.calculate_coverage_GRegions(windows=windows,
scores=regions)
elif load.lower().endswith(".bed") or \
load.lower().endswith(".bedgraph"):
regions = GRegions(name=os.path.basename(load), load=load)
self.calculate_coverage_GRegions2(windows=windows,
scores=regions)

def load_coverage_from_bigwig(self, filename: str, windows=None):
"""Load coverage data from a bigwig file.
Expand Down Expand Up @@ -114,6 +115,51 @@ def calculate_coverage_from_bam(self, filename: str, windows=None):
len(self.coverage[r]),
self.bin_size)]

def calculate_coverage_GRegions2(self, scores, windows=None,
strandness: bool = False):
"""Calculate the coverage from two GRegions. `windows` defines the loci
for the coverage `scores` contains the scores loaded into the coverage.
:param windows: Define the windows and the length of the coverage
:type windows: GRegions
:param scores: Provide the scores for calculating the coverage
:type scores: GRegions
:param strandness: Make this operation strandness specific, defaults
to False
:type strandness: bool, optional
"""
from genomkit import GRegions
assert isinstance(windows, GRegions)
assert isinstance(scores, GRegions)
print("1")
filtered_scores = scores.intersect(target=windows, mode="ORIGINAL")
print("2")
# Preallocate coverage arrays
# num_windows = len(windows)
num_bins = len(windows[0]) // self.bin_size
self.coverage = {region: np.zeros(shape=num_bins)
for region in windows}
for target in tqdm(filtered_scores, desc=scores.name,
total=len(filtered_scores)):
overlap_regions = [region for region in windows
if region.overlap(target,
strandness=strandness)]
start_inds = np.maximum(0,
target.start -
np.array([region.start
for region in overlap_regions]))
start_inds = start_inds // self.bin_size
end_inds = np.minimum([len(region)
for region in overlap_regions],
target.end -
np.array([region.start
for region in overlap_regions]))
end_inds = end_inds // self.bin_size
for region, start_ind, end_ind in zip(overlap_regions,
start_inds,
end_inds):
self.coverage[region][start_ind:end_ind] += target.score

def calculate_coverage_GRegions(self, scores, windows=None,
strandness: bool = False):
"""Calculate the coverage from two GRegions. `windows` defines the loci
Expand All @@ -132,8 +178,7 @@ def calculate_coverage_GRegions(self, scores, windows=None,
assert isinstance(scores, GRegions)
filtered_scores = scores.intersect(target=windows,
mode="ORIGINAL")
for region in tqdm(windows, desc="Calculating coverage",
total=len(windows)):
for region in tqdm(windows, desc=scores.name, total=len(windows)):
self.coverage[region] = np.zeros(shape=len(region) //
self.bin_size)
for target in filtered_scores:
Expand Down
196 changes: 101 additions & 95 deletions genomkit/regions/gregions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import sys
from copy import deepcopy
from tqdm import tqdm


###########################################################################
Expand Down Expand Up @@ -292,13 +293,13 @@ def intersect(self, target, mode: str = "OVERLAP",
else:
a = copy.deepcopy(self)
b = copy.deepcopy(target)
if not a.sorted:
a.sort()
if not b.sorted:
b.sort()
if mode == "OVERLAP":
a.merge()
b.merge()
# if not a.sorted:
# a.sort()
# if not b.sorted:
# b.sort()
# if mode == "OVERLAP":
# a.merge()
# b.merge()

iter_a = iter(a)
s = next(iter_a)
Expand All @@ -309,111 +310,116 @@ def intersect(self, target, mode: str = "OVERLAP",
cont_overlap = False
# OVERLAP ###############################
if mode == "OVERLAP":
while cont_loop:
# When the regions overlap
if s.overlap(b[j]):
new_regions.add(GRegion(sequence=s.sequence,
start=max(s.start, b[j].start),
end=min(s.end, b[j].end),
name=s.name,
orientation=s.orientation,
data=s.data))
if not cont_overlap:
pre_inter = j
if j == last_j:
with tqdm(total=len(b)) as pbar:
while cont_loop:
# When the regions overlap
if s.overlap(b[j]):
new_regions.add(GRegion(sequence=s.sequence,
start=max(s.start,
b[j].start),
end=min(s.end, b[j].end),
name=s.name,
orientation=s.orientation,
data=s.data))
if not cont_overlap:
pre_inter = j
if j == last_j:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
else:
j += 1
cont_overlap = True

elif s < b[j]:
try:
s = next(iter_a)
if s.sequence == b[j].sequence and pre_inter > 0:
j = pre_inter
cont_overlap = False
except StopIteration:
cont_loop = False
else:
j += 1
cont_overlap = True

elif s < b[j]:
try:
s = next(iter_a)
if s.sequence == b[j].sequence and pre_inter > 0:
j = pre_inter
cont_overlap = False
except StopIteration:
cont_loop = False

elif s > b[j]:
if j == last_j:
cont_loop = False
elif s > b[j]:
if j == last_j:
cont_loop = False
else:
j += 1
cont_overlap = False
else:
j += 1
cont_overlap = False
else:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False

try:
s = next(iter_a)
except StopIteration:
cont_loop = False
pbar.update(1)
# ORIGINAL ###############################
if mode == "ORIGINAL":
while cont_loop:
# When the regions overlap
if s.overlap(b[j]):
new_regions.add(s)
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
elif s < b[j]:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
elif s > b[j]:
if j == last_j:
cont_loop = False
else:
j += 1
else:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
# COMP_INCL ###############################
if mode == "COMP_INCL":
while cont_loop:
# When the regions overlap
if s.overlap(b[j]):
if s.start >= b[j].start and s.end <= b[j].end:
with tqdm(total=len(b)) as pbar:
while cont_loop:
# When the regions overlap
if s.overlap(b[j]):
new_regions.add(s)
if not cont_overlap:
pre_inter = j
if j == last_j:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
elif s < b[j]:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
elif s > b[j]:
if j == last_j:
cont_loop = False
else:
j += 1
else:
j += 1
cont_overlap = True
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
pbar.update(1)
# COMP_INCL ###############################
if mode == "COMP_INCL":
with tqdm(total=len(b)) as pbar:
while cont_loop:
# When the regions overlap
if s.overlap(b[j]):
if s.start >= b[j].start and s.end <= b[j].end:
new_regions.add(s)
if not cont_overlap:
pre_inter = j
if j == last_j:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False
else:
j += 1
cont_overlap = True

elif s < b[j]:
try:
s = next(iter_a)
if s.sequence == b[j].sequence and pre_inter > 0:
j = pre_inter
cont_overlap = False
except StopIteration:
cont_loop = False
elif s < b[j]:
try:
s = next(iter_a)
if s.sequence == b[j].sequence and pre_inter > 0:
j = pre_inter
cont_overlap = False
except StopIteration:
cont_loop = False

elif s > b[j]:
if j == last_j:
cont_loop = False
elif s > b[j]:
if j == last_j:
cont_loop = False
else:
j += 1
cont_overlap = False
else:
j += 1
cont_overlap = False
else:
try:
s = next(iter_a)
except StopIteration:
cont_loop = False

try:
s = next(iter_a)
except StopIteration:
cont_loop = False
pbar.update(1)
# if rm_duplicates:
new_regions.remove_duplicates()
# new_regions.sort()
Expand Down
5 changes: 4 additions & 1 deletion genomkit/regions/io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from tqdm import tqdm


###########################################################################
Expand All @@ -17,7 +18,9 @@ def get_score(score):
else:
res = GRegions()
with open(filename, 'r') as file:
for line in file:
for line in tqdm(file,
desc=os.path.basename(filename),
unit=" lines"):
if not line.startswith("#"):
infos = line.strip().split()
if len(infos) == 3:
Expand Down

0 comments on commit f2e3f3d

Please sign in to comment.