Skip to content

Commit

Permalink
update data
Browse files Browse the repository at this point in the history
  • Loading branch information
chaochungkuo committed Feb 27, 2024
1 parent c841d52 commit 4aa014b
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 9 deletions.
3 changes: 2 additions & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ sphinx-rtd-theme==1.3.0rc1
numpy
pysam==0.22.0
tqdm
pyBigWig==0.3.22
pyBigWig==0.3.22
pandas
33 changes: 26 additions & 7 deletions genomkit/regions/gregions.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ def subtract(self, regions, whole_region: bool = False,
if counts == len(regions.elements):
finished = 1
else:
# region.sequence <
# region.sequence <
# target.sequences[counts].sequence:
small_self.add(r)
else:
Expand Down Expand Up @@ -805,8 +805,9 @@ def subtract(self, regions, whole_region: bool = False,
return self
if not self.sorted:
self.sort()
# if there is overlap within self or y, and the `merge` option is set,
# we merge any overlapping sequence and create two different GRegions
# if there is overlap within self or y, and the `merge` option
# is set, we merge any overlapping sequence and create two
# different GRegions
if merge:
a = self.merge(inplace=False)
b = regions.merge(inplace=False)
Expand Down Expand Up @@ -846,10 +847,18 @@ def subtract(self, regions, whole_region: bool = False,
# ------ ----- -------
# ------ -- ---
if s.end > b[j].end:
s1 = GRegion(sequence=s.sequence, start=s.start, end=b[j].start,
name=s.name, orientation=s.orientation, data=s.data)
s2 = GRegion(sequence=s.sequence, start=b[j].end, end=s.end,
name=s.name, orientation=s.orientation, data=s.data)
s1 = GRegion(sequence=s.sequence,
start=s.start,
end=b[j].start,
name=s.name,
orientation=s.orientation,
data=s.data)
s2 = GRegion(sequence=s.sequence,
start=b[j].end,
end=s.end,
name=s.name,
orientation=s.orientation,
data=s.data)
res.add(s1)
s = s2
if j < last_j:
Expand Down Expand Up @@ -983,3 +992,13 @@ def cluster(self, max_distance):
previous = deepcopy(s)
z.add(previous)
return z

def total_coverage(self):
    """Return the total coverage (bp) of all the regions.

    The regions are first merged through ``merge(inplace=False)``; the
    lengths of the merged regions are then summed.

    :return: Total coverage (bp)
    :rtype: int
    """
    non_overlapping = self.merge(inplace=False)
    return sum(len(region) for region in non_overlapping)
56 changes: 56 additions & 0 deletions genomkit/regions/gregions_set.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from genomkit import GRegions
from collections import OrderedDict
import numpy as np
import pandas as pd


class GRegionsSet:
Expand All @@ -10,6 +12,15 @@ class GRegionsSet:
multiple GRegions.
"""
def __init__(self, name: str = "", load_dict=None):
"""Initiate a GRegionsSet object which can contain multiple GRegions.
:param name: Define the name, defaults to ""
:type name: str, optional
:param load_dict: Given the file paths of multiple GRegions as a
dictionary with names as keys and values as file
paths, defaults to None
:type load_dict: dict, optional
"""
self.collection = OrderedDict()
if load_dict:
for name, filename in load_dict.items():
Expand All @@ -18,9 +29,21 @@ def __init__(self, name: str = "", load_dict=None):
load=filename))

def add(self, name: str, regions):
    """Store a GRegions object in this set under the given name.

    The object is written into ``self.collection`` keyed by ``name``;
    assigning to a name that is already present replaces its value.

    :param name: Name used as the key for the regions
    :type name: str
    :param regions: The GRegions to store
    :type regions: GRegions
    """
    registry = self.collection
    registry[name] = regions

def __len__(self):
    """Return how many GRegions are stored in this set.

    :return: Number of entries in ``self.collection``
    :rtype: int
    """
    n_members = len(self.collection)
    return n_members

def __getattr__(self, key):
Expand All @@ -36,10 +59,43 @@ def __setattr__(self, key, value):
self.collection[key] = value

def get_names(self):
    """Return the names of all GRegions in this set.

    The names are the keys of ``self.collection``, in the order the
    mapping yields them.

    :return: Names
    :rtype: list
    """
    # Iterating a mapping yields its keys, so no explicit ``.keys()``.
    return list(self.collection)

def get_lengths(self):
    """Return the number of regions in every GRegions of this set.

    :return: Mapping from each GRegions name to ``len()`` of that
             GRegions, in the iteration order of ``self.collection``
    :rtype: OrderedDict
    """
    return OrderedDict(
        (name, len(regions)) for name, regions in self.collection.items()
    )

def count_overlaps(self, query_set):
    """Return a pandas dataframe of the numbers of overlapping regions
    between the reference GRegionsSet (self) and the query GRegionsSet.

    Rows are labelled with the names of the reference GRegions and
    columns with the names of the query GRegions. Cell ``(i, j)`` holds
    the value of ``ref.overlap_count(target=query)`` for the i-th
    reference and the j-th query.

    :param query_set: Query GRegionsSet
    :type query_set: GRegionsSet
    :return: Matrix of numbers of overlaps (float dtype, since it is
             filled into an ``np.zeros`` array)
    :rtype: pandas.DataFrame
    """
    # Read the query's underlying mapping the same way the reference
    # side is read; a plain name -> GRegions mapping is accepted as well.
    queries = getattr(query_set, "collection", query_set)
    row_names = list(self.collection)
    # Column labels are gathered once, not once per reference row, so
    # their number always matches the width of the matrix.
    col_names = list(queries)
    res = np.zeros((len(row_names), len(col_names)))
    for i, ref in enumerate(self.collection.values()):
        for j, query in enumerate(queries.values()):
            res[i, j] = ref.overlap_count(target=query)
    return pd.DataFrame(res, index=row_names, columns=col_names)
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
numpy
pysam==0.22.0
tqdm
pyBigWig==0.3.22
pyBigWig==0.3.22
pandas
Empty file.
13 changes: 13 additions & 0 deletions tests/test_files/wig/example.wig
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
track type=wiggle_0 name="Example Track" description="Example Wiggle Track"
variableStep chrom=chr1
1000 0.5
2000 0.7
3000 0.9
4000 1.0
5000 0.8
variableStep chrom=chr2
1000 0.3
2000 0.4
3000 0.6
4000 0.7
5000 0.5

0 comments on commit 4aa014b

Please sign in to comment.