Skip to content

Commit

Permalink
update GCoverages for calculating coverages from BEDs
Browse files Browse the repository at this point in the history
  • Loading branch information
chaochungkuo committed Feb 29, 2024
1 parent f76f937 commit 0845059
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 23 deletions.
6 changes: 6 additions & 0 deletions docs/source/examples_bed.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,9 @@ Often you want to find the interaction between two BED files, but not simply by
TFBSs = GRegions(name="TFBSs", load="TFBSs.bed")
close_TFBSs = TFBSs.intersect(target=promoters, mode="ORIGINAL")
close_TFBSs.write(filename="TFBSs_close_to_promoters.bed")
Generate a heapmap from two BED files: one BED file is used as windows and the other used as the signal
-------------------------

Sometimes your signals (scores) are stored in a BED file (column 5), instead of BEDGraph or BigWig. And now you want to visualize the interactions between these two BED files. For example, `DMSs.bed` contains the differential methylated CpGs with the score for hypermethylation or hypomethylation. And `TSSs.bed` include all transcription start sites with a window of 2000 bp. Now you want to visualize their interaction with a heatmap.

2 changes: 1 addition & 1 deletion genomkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .sequences.gsequences import GSequences
from .annotation.gannotation import GAnnotation
from .alignments.galignments import GAlignments
from .coverages.gcoverages import GCoverage
from .coverages.gcoverages import GCoverages
from .variants.gvariant import GVariant
from .variants.gvariants import GVariants

Expand Down
2 changes: 2 additions & 0 deletions genomkit/annotation/gannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,11 @@ def load_data(self):
elif feature_type == 'exon':
exon_id = attributes['exon_id'].strip('"')
transcript_id = attributes['transcript_id'].strip('"')
gene_id = attributes['gene_id'].strip('"')
self.exons[exon_id] = {
'id': exon_id,
'transcript_id': transcript_id,
'gene_id': gene_id,
'chr': fields[0],
'start': int(fields[3]),
'end': int(fields[4]),
Expand Down
54 changes: 50 additions & 4 deletions genomkit/coverages/gcoverages.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
import pyBigWig
import numpy as np
import pandas as pd
import pysam
from tqdm import tqdm


class GCoverage:
class GCoverages:
"""
GCoverage module
GCoverages module
This module contains functions and classes for working with a collection of
genomic coverages. It provides utilities for handling and analyzing the
interactions of many genomic coverages.
"""
def __init__(self, bin_size: int = 1):
"""Initialize GCoverage object.
"""Initialize GCoverages object.
:param bin_size: Size of the bin for coverage calculation.
Defaults to 1 (single nucleotide resolution).
Expand Down Expand Up @@ -60,6 +62,34 @@ def calculate_coverage_from_bam(self, filename: str):
len(self.coverage[chrom]),
self.bin_size)]

def calculate_coverage_GRegions(self, regions, scores,
strandness: bool = False):
"""Calculate the coverage from two GRegions. `regions` defines the loci for the coverage
`scores` contains the scores loaded into the coverage.
:param regions: Define the loci and the length of the coverage
:type regions: GRegions
:param scores: Provide the scores for calculating the coverage
:type scores: GRegions
:param strandness: Make this operation strandness specific, defaults to False
:type strandness: bool, optional
"""
from genomkit import GRegions
assert isinstance(regions, GRegions)
assert isinstance(scores, GRegions)
filtered_scores = scores.intersect(target=regions,
mode="ORIGIN")
for region in tqdm(regions, desc="Calculating coverage",
total=len(regions)):
self.coverage[region] = np.zeros(shape=len(region) //
self.bin_size)
for target in filtered_scores:
if region.overlap(target, strandness=strandness):
start_ind = (target.start - region.start) // self.bin_size
end_ind = (target.end - region.end) // self.bin_size
for i in range(start_ind, end_ind):
self.coverage[region][i] += target.score

def get_coverage(self, seq_name: str):
"""Get coverage data for a specific sequence by name. This sequence
can be a chromosome or a genomic region.
Expand Down Expand Up @@ -110,4 +140,20 @@ def scale_coverage(self, coefficient):
# Replace NaN values with 0 before scaling
self.coverage[chrom][np.isnan(self.coverage[chrom])] = 0
# Scale the non-NaN values
self.coverage[chrom] = self.coverage[chrom] * coefficient
self.coverage[chrom] = self.coverage[chrom] * coefficient

def flip_negative_regions(self):
"""Flip the coverage arrays which are on the negative strands. If the
coverage arrays are calculated by the whole chromosomes, it won't
work.
"""
for region, cov in self.coverage.items():
if region.orientation == "-":
self.coverage[region] = cov[::-1]

def get_dataframe(self):
"""Return a pandas dataframe concatenating all coverage arrays."""
df = pd.DataFrame(self.coverage)
df = df.transpose()
df.index.name = None
return df
62 changes: 59 additions & 3 deletions genomkit/regions/gregions.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ def resize(self, extend_upstream: int, extend_downstream: int,
else:
return res

def intersect_python(self, target, mode: str = "OVERLAP",
rm_duplicates: bool = False):
def intersect(self, target, mode: str = "OVERLAP",
rm_duplicates: bool = False):
"""Return a GRegions for the intersections between the two given
GRegions objects. There are three modes for overlapping:
Expand Down Expand Up @@ -652,7 +652,7 @@ def make_array_same_length(array_1, array_2):
return res

def overlap_count(self, target):
intersect = self.intersect_python(target, mode="ORIGINAL")
intersect = self.intersect(target, mode="ORIGINAL")
return len(intersect)

def subtract(self, regions, whole_region: bool = False,
Expand Down Expand Up @@ -937,6 +937,14 @@ def subtract(self, regions, whole_region: bool = False,
return res

def get_GSequences(self, FASTA_file):
"""Return a GSequences object according to the loci on the given
reference FASTQ file.
:param FASTA_file: Path to the FASTA file
:type FASTA_file: str
:return: A GSequences.
:rtype: GSequences
"""
from genomkit import GSequences
if os.path.exists(FASTA_file):
fasta = GSequences(load=FASTA_file)
Expand Down Expand Up @@ -1002,3 +1010,51 @@ def total_coverage(self):
merged_regions = self.merge(inplace=False)
cov = sum([len(r) for r in merged_regions])
return cov

def filter_by_names(self, names, inplace=False):
"""Filter the elements by the given list of names
:param names: A list of names for filtering
:type names: list
:param inplace: Define whether this operation will be applied on the
same object (True) or return a new object.
:type inplace: bool, default to True
:return: A GRegions with filtered regions
:rtype: GRegions
"""
res = GRegions(name=self.name)
for r in self.elements:
if r.name in names:
res.add(r)
if inplace:
self.elements = res.elements
else:
return res

def filter_by_score(self, larger_than=0, smaller_than=0, inplace=False):
"""Filter the elements by the given list of names
:param larger_than: Define the minimal cutoff. Any region with the
score larger than this value will be returned.
:type larger_than: float
:param smaller_than: Define the maximal cutoff. Any region with the
score smaller than this value will be returned.
:type smaller_than: float
:param inplace: Define whether this operation will be applied on the
same object (True) or return a new object.
:type inplace: bool, default to True
:return: A GRegions with filtered regions
:rtype: GRegions
"""
res = GRegions(name=self.name)
for r in self.elements:
if r.score > larger_than:
res.add(r)
elif r.score < smaller_than:
res.add(r)
else:
continue
if inplace:
self.elements = res.elements
else:
return res
2 changes: 1 addition & 1 deletion tests/evaluate/evaluate_intersect.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

@profile # noqa
def time_intersect_python():
intersect = peaks.intersect_python(genes)
intersect = peaks.intersect(genes)


@profile # noqa
Expand Down
30 changes: 30 additions & 0 deletions tests/test_gannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,33 @@ def test_get_transcript(self):
file_format="gff")
self.assertEqual(gff.get_transcript("ENST00000456328.2")["gene_id"],
"ENSG00000290825.1")

def test_get_exon(self):
gtf = GAnnotation(file_path=gtf_file,
file_format="gtf")
self.assertEqual(gtf.get_exon("ENSE00002234944.1")["gene_id"],
"ENSG00000290825.1")
gff = GAnnotation(file_path=gff_file,
file_format="gff")
self.assertEqual(gff.get_exon("ENSE00002234944.1")["gene_id"],
"ENSG00000290825.1")

def test_get_transcript_ids(self):
gtf = GAnnotation(file_path=gtf_file,
file_format="gtf")
self.assertEqual(gtf.get_transcript_ids()[0],
"ENST00000456328.2")
gff = GAnnotation(file_path=gff_file,
file_format="gff")
self.assertEqual(gff.get_transcript_ids()[0],
"ENST00000456328.2")

def test_get_exon_ids(self):
gtf = GAnnotation(file_path=gtf_file,
file_format="gtf")
self.assertEqual(gtf.get_exon_ids()[0],
"ENSE00002234944.1")
gff = GAnnotation(file_path=gff_file,
file_format="gff")
self.assertEqual(gff.get_exon_ids()[0],
"ENSE00002234944.1")
20 changes: 10 additions & 10 deletions tests/test_gcoverages.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import unittest
from genomkit import GCoverage, GRegions, GRegion
from genomkit import GCoverages, GRegions, GRegion
# from genomkit.sequences.io import load_FASTA, load_FASTQ
import os

script_path = os.path.dirname(__file__)


class TestGCoverage(unittest.TestCase):
class TestGCoverages(unittest.TestCase):

def test_load_coverage_from_bigwig(self):
cov = GCoverage()
cov = GCoverages()
cov.load_coverage_from_bigwig(
filename=os.path.join(script_path,
"test_files/bigwig/test.bw")
Expand All @@ -19,15 +19,15 @@ def test_load_coverage_from_bigwig(self):
self.assertEqual(len(cov.coverage), 2)
self.assertEqual(list(cov.coverage.keys())[0], "1")
self.assertEqual(list(cov.coverage.keys())[1], "10")
cov = GCoverage()
cov = GCoverages()
cov.load_coverage_from_bigwig(
filename=os.path.join(script_path,
"test_files/bigwig/test.bigwig")
)
self.assertEqual(len(cov.coverage.keys()), 2)

def test_calculate_coverage_from_bam(self):
cov = GCoverage()
cov = GCoverages()
cov.calculate_coverage_from_bam(
filename=os.path.join(script_path,
"test_files/bam/Col0_C1.100k.bam")
Expand All @@ -37,14 +37,14 @@ def test_calculate_coverage_from_bam(self):
self.assertEqual(len(cov.coverage.keys()), 1)

def test_get_coverage(self):
cov = GCoverage()
cov = GCoverages()
cov.load_coverage_from_bigwig(
filename=os.path.join(script_path,
"test_files/bigwig/test.bw")
)
res = cov.get_coverage("10")
self.assertEqual(len(res), 130694993)
cov = GCoverage()
cov = GCoverages()
cov.calculate_coverage_from_bam(
filename=os.path.join(script_path,
"test_files/bam/Col0_C1.100k.bam")
Expand All @@ -53,7 +53,7 @@ def test_get_coverage(self):
self.assertEqual(len(res), 30427671)

def test_filter_regions_coverage(self):
cov = GCoverage()
cov = GCoverages()
cov.load_coverage_from_bigwig(
filename=os.path.join(script_path,
"test_files/bigwig/test.bw")
Expand All @@ -67,7 +67,7 @@ def test_filter_regions_coverage(self):
GRegion(sequence="1", start=100, end=500))

def test_total_sequencing_depth(self):
cov = GCoverage()
cov = GCoverages()
cov.load_coverage_from_bigwig(
filename=os.path.join(script_path,
"test_files/bigwig/test.bw")
Expand All @@ -76,7 +76,7 @@ def test_total_sequencing_depth(self):
self.assertEqual(int(res), 272)

def test_scale_coverage(self):
cov = GCoverage()
cov = GCoverages()
cov.load_coverage_from_bigwig(
filename=os.path.join(script_path,
"test_files/bigwig/test.bw")
Expand Down
8 changes: 4 additions & 4 deletions tests/test_gregions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,26 +62,26 @@ def test_get_names(self):
self.assertEqual(regions.get_names(unique=True),
["Feature1", "Feature2", "Feature3", "Feature4"])

def test_intersect_python(self):
def test_intersect(self):
regions1 = GRegions(name="test")
regions1.load(filename=os.path.join(script_path,
"test_files/bed/example.bed"))
regions2 = GRegions(name="test")
regions2.load(filename=os.path.join(script_path,
"test_files/bed/example2.bed"))
intersect = regions1.intersect_python(regions2, mode='OVERLAP')
intersect = regions1.intersect(regions2, mode='OVERLAP')
self.assertEqual(len(intersect), 4)
self.assertEqual(len(intersect[0]), 500)
self.assertEqual(len(intersect[1]), 500)
self.assertEqual(len(intersect[2]), 500)
self.assertEqual(len(intersect[3]), 500)
intersect = regions1.intersect_python(regions2, mode='ORIGINAL')
intersect = regions1.intersect(regions2, mode='ORIGINAL')
self.assertEqual(len(intersect), 4)
self.assertEqual(len(intersect[0]), 1000)
self.assertEqual(len(intersect[1]), 1000)
self.assertEqual(len(intersect[2]), 1000)
self.assertEqual(len(intersect[3]), 1000)
intersect = regions1.intersect_python(regions2, mode='COMP_INCL')
intersect = regions1.intersect(regions2, mode='COMP_INCL')
self.assertEqual(len(intersect), 0)

def test_intersect_array(self):
Expand Down

0 comments on commit 0845059

Please sign in to comment.