update GCoverages for calculating coverages from BEDs

chaochungkuo · Feb 29, 2024 · 0845059 · 0845059
1 parent f76f937
commit 0845059
Show file tree

Hide file tree

Showing 9 changed files with 163 additions and 23 deletions.
diff --git a/docs/source/examples_bed.rst b/docs/source/examples_bed.rst
@@ -80,3 +80,9 @@ Often you want to find the interaction between two BED files, but not simply by
     TFBSs = GRegions(name="TFBSs", load="TFBSs.bed")
     close_TFBSs = TFBSs.intersect(target=promoters, mode="ORIGINAL")
     close_TFBSs.write(filename="TFBSs_close_to_promoters.bed")
+
+Generate a heapmap from two BED files: one BED file is used as windows and the other used as the signal
+-------------------------
+
+Sometimes your signals (scores) are stored in a BED file (column 5), instead of BEDGraph or BigWig. And now you want to visualize the interactions between these two BED files. For example, `DMSs.bed` contains the differential methylated CpGs with the score for hypermethylation or hypomethylation. And `TSSs.bed` include all transcription start sites with a window of 2000 bp. Now you want to visualize their interaction with a heatmap.
+
diff --git a/genomkit/__init__.py b/genomkit/__init__.py
@@ -4,7 +4,7 @@
 from .sequences.gsequences import GSequences
 from .annotation.gannotation import GAnnotation
 from .alignments.galignments import GAlignments
-from .coverages.gcoverages import GCoverage
+from .coverages.gcoverages import GCoverages
 from .variants.gvariant import GVariant
 from .variants.gvariants import GVariants
 

diff --git a/genomkit/annotation/gannotation.py b/genomkit/annotation/gannotation.py
@@ -78,9 +78,11 @@ def load_data(self):
                 elif feature_type == 'exon':
                     exon_id = attributes['exon_id'].strip('"')
                     transcript_id = attributes['transcript_id'].strip('"')
+                    gene_id = attributes['gene_id'].strip('"')
                     self.exons[exon_id] = {
                         'id': exon_id,
                         'transcript_id': transcript_id,
+                        'gene_id': gene_id,
                         'chr': fields[0],
                         'start': int(fields[3]),
                         'end': int(fields[4]),

diff --git a/genomkit/coverages/gcoverages.py b/genomkit/coverages/gcoverages.py
@@ -1,18 +1,20 @@
 import pyBigWig
 import numpy as np
+import pandas as pd
 import pysam
+from tqdm import tqdm
 
 
-class GCoverage:
+class GCoverages:
     """
-    GCoverage module
+    GCoverages module
 
     This module contains functions and classes for working with a collection of
     genomic coverages. It provides utilities for handling and analyzing the
     interactions of many genomic coverages.
     """
     def __init__(self, bin_size: int = 1):
-        """Initialize GCoverage object.
+        """Initialize GCoverages object.
 
         :param bin_size: Size of the bin for coverage calculation.
                          Defaults to 1 (single nucleotide resolution).
@@ -60,6 +62,34 @@ def calculate_coverage_from_bam(self, filename: str):
                                         len(self.coverage[chrom]),
                                         self.bin_size)]
 
+    def calculate_coverage_GRegions(self, regions, scores,
+                                    strandness: bool = False):
+        """Calculate the coverage from two GRegions. `regions` defines the loci for the coverage
+        `scores` contains the scores loaded into the coverage.
+
+        :param regions: Define the loci and the length of the coverage
+        :type regions: GRegions
+        :param scores: Provide the scores for calculating the coverage
+        :type scores: GRegions
+        :param strandness: Make this operation strandness specific, defaults to False
+        :type strandness: bool, optional
+        """
+        from genomkit import GRegions
+        assert isinstance(regions, GRegions)
+        assert isinstance(scores, GRegions)
+        filtered_scores = scores.intersect(target=regions,
+                                           mode="ORIGIN")
+        for region in tqdm(regions, desc="Calculating coverage",
+                           total=len(regions)):
+            self.coverage[region] = np.zeros(shape=len(region) //
+                                             self.bin_size)
+            for target in filtered_scores:
+                if region.overlap(target, strandness=strandness):
+                    start_ind = (target.start - region.start) // self.bin_size
+                    end_ind = (target.end - region.end) // self.bin_size
+                    for i in range(start_ind, end_ind):
+                        self.coverage[region][i] += target.score
+
     def get_coverage(self, seq_name: str):
         """Get coverage data for a specific sequence by name. This sequence
         can be a chromosome or a genomic region.
@@ -110,4 +140,20 @@ def scale_coverage(self, coefficient):
             # Replace NaN values with 0 before scaling
             self.coverage[chrom][np.isnan(self.coverage[chrom])] = 0
             # Scale the non-NaN values
-            self.coverage[chrom] = self.coverage[chrom] * coefficient
+            self.coverage[chrom] = self.coverage[chrom] * coefficient
+
+    def flip_negative_regions(self):
+        """Flip the coverage arrays which are on the negative strands. If the
+        coverage arrays are calculated by the whole chromosomes, it won't
+        work.
+        """
+        for region, cov in self.coverage.items():
+            if region.orientation == "-":
+                self.coverage[region] = cov[::-1]
+
+    def get_dataframe(self):
+        """Return a pandas dataframe concatenating all coverage arrays."""
+        df = pd.DataFrame(self.coverage)
+        df = df.transpose()
+        df.index.name = None
+        return df
diff --git a/genomkit/regions/gregions.py b/genomkit/regions/gregions.py
@@ -236,8 +236,8 @@ def resize(self, extend_upstream: int, extend_downstream: int,
         else:
             return res
 
-    def intersect_python(self, target, mode: str = "OVERLAP",
-                         rm_duplicates: bool = False):
+    def intersect(self, target, mode: str = "OVERLAP",
+                  rm_duplicates: bool = False):
         """Return a GRegions for the intersections between the two given
         GRegions objects. There are three modes for overlapping:
 
@@ -652,7 +652,7 @@ def make_array_same_length(array_1, array_2):
         return res
 
     def overlap_count(self, target):
-        intersect = self.intersect_python(target, mode="ORIGINAL")
+        intersect = self.intersect(target, mode="ORIGINAL")
         return len(intersect)
 
     def subtract(self, regions, whole_region: bool = False,
@@ -937,6 +937,14 @@ def subtract(self, regions, whole_region: bool = False,
                 return res
 
     def get_GSequences(self, FASTA_file):
+        """Return a GSequences object according to the loci on the given
+        reference FASTQ file.
+
+        :param FASTA_file: Path to the FASTA file
+        :type FASTA_file: str
+        :return: A GSequences.
+        :rtype: GSequences
+        """
         from genomkit import GSequences
         if os.path.exists(FASTA_file):
             fasta = GSequences(load=FASTA_file)
@@ -1002,3 +1010,51 @@ def total_coverage(self):
         merged_regions = self.merge(inplace=False)
         cov = sum([len(r) for r in merged_regions])
         return cov
+
+    def filter_by_names(self, names, inplace=False):
+        """Filter the elements by the given list of names
+
+        :param names: A list of names for filtering
+        :type names: list
+        :param inplace: Define whether this operation will be applied on the
+                        same object (True) or return a new object.
+        :type inplace: bool, default to True
+        :return: A GRegions with filtered regions
+        :rtype: GRegions
+        """
+        res = GRegions(name=self.name)
+        for r in self.elements:
+            if r.name in names:
+                res.add(r)
+        if inplace:
+            self.elements = res.elements
+        else:
+            return res
+
+    def filter_by_score(self, larger_than=0, smaller_than=0, inplace=False):
+        """Filter the elements by the given list of names
+
+        :param larger_than: Define the minimal cutoff. Any region with the
+                            score larger than this value will be returned.
+        :type larger_than: float
+        :param smaller_than: Define the maximal cutoff. Any region with the
+                            score smaller than this value will be returned.
+        :type smaller_than: float
+        :param inplace: Define whether this operation will be applied on the
+                        same object (True) or return a new object.
+        :type inplace: bool, default to True
+        :return: A GRegions with filtered regions
+        :rtype: GRegions
+        """
+        res = GRegions(name=self.name)
+        for r in self.elements:
+            if r.score > larger_than:
+                res.add(r)
+            elif r.score < smaller_than:
+                res.add(r)
+            else:
+                continue
+        if inplace:
+            self.elements = res.elements
+        else:
+            return res
diff --git a/tests/evaluate/evaluate_intersect.py b/tests/evaluate/evaluate_intersect.py
@@ -23,7 +23,7 @@
 
 @profile # noqa
 def time_intersect_python():
-    intersect = peaks.intersect_python(genes)
+    intersect = peaks.intersect(genes)
 
 
 @profile # noqa

diff --git a/tests/test_gannotation.py b/tests/test_gannotation.py
@@ -66,3 +66,33 @@ def test_get_transcript(self):
                           file_format="gff")
         self.assertEqual(gff.get_transcript("ENST00000456328.2")["gene_id"],
                          "ENSG00000290825.1")
+
+    def test_get_exon(self):
+        gtf = GAnnotation(file_path=gtf_file,
+                          file_format="gtf")
+        self.assertEqual(gtf.get_exon("ENSE00002234944.1")["gene_id"],
+                         "ENSG00000290825.1")
+        gff = GAnnotation(file_path=gff_file,
+                          file_format="gff")
+        self.assertEqual(gff.get_exon("ENSE00002234944.1")["gene_id"],
+                         "ENSG00000290825.1")
+
+    def test_get_transcript_ids(self):
+        gtf = GAnnotation(file_path=gtf_file,
+                          file_format="gtf")
+        self.assertEqual(gtf.get_transcript_ids()[0],
+                         "ENST00000456328.2")
+        gff = GAnnotation(file_path=gff_file,
+                          file_format="gff")
+        self.assertEqual(gff.get_transcript_ids()[0],
+                         "ENST00000456328.2")
+
+    def test_get_exon_ids(self):
+        gtf = GAnnotation(file_path=gtf_file,
+                          file_format="gtf")
+        self.assertEqual(gtf.get_exon_ids()[0],
+                         "ENSE00002234944.1")
+        gff = GAnnotation(file_path=gff_file,
+                          file_format="gff")
+        self.assertEqual(gff.get_exon_ids()[0],
+                         "ENSE00002234944.1")
diff --git a/tests/test_gcoverages.py b/tests/test_gcoverages.py
@@ -1,15 +1,15 @@
 import unittest
-from genomkit import GCoverage, GRegions, GRegion
+from genomkit import GCoverages, GRegions, GRegion
 # from genomkit.sequences.io import load_FASTA, load_FASTQ
 import os
 
 script_path = os.path.dirname(__file__)
 
 
-class TestGCoverage(unittest.TestCase):
+class TestGCoverages(unittest.TestCase):
 
     def test_load_coverage_from_bigwig(self):
-        cov = GCoverage()
+        cov = GCoverages()
         cov.load_coverage_from_bigwig(
             filename=os.path.join(script_path,
                                   "test_files/bigwig/test.bw")
@@ -19,15 +19,15 @@ def test_load_coverage_from_bigwig(self):
         self.assertEqual(len(cov.coverage), 2)
         self.assertEqual(list(cov.coverage.keys())[0], "1")
         self.assertEqual(list(cov.coverage.keys())[1], "10")
-        cov = GCoverage()
+        cov = GCoverages()
         cov.load_coverage_from_bigwig(
             filename=os.path.join(script_path,
                                   "test_files/bigwig/test.bigwig")
             )
         self.assertEqual(len(cov.coverage.keys()), 2)
 
     def test_calculate_coverage_from_bam(self):
-        cov = GCoverage()
+        cov = GCoverages()
         cov.calculate_coverage_from_bam(
             filename=os.path.join(script_path,
                                   "test_files/bam/Col0_C1.100k.bam")
@@ -37,14 +37,14 @@ def test_calculate_coverage_from_bam(self):
         self.assertEqual(len(cov.coverage.keys()), 1)
 
     def test_get_coverage(self):
-        cov = GCoverage()
+        cov = GCoverages()
         cov.load_coverage_from_bigwig(
             filename=os.path.join(script_path,
                                   "test_files/bigwig/test.bw")
             )
         res = cov.get_coverage("10")
         self.assertEqual(len(res), 130694993)
-        cov = GCoverage()
+        cov = GCoverages()
         cov.calculate_coverage_from_bam(
             filename=os.path.join(script_path,
                                   "test_files/bam/Col0_C1.100k.bam")
@@ -53,7 +53,7 @@ def test_get_coverage(self):
         self.assertEqual(len(res), 30427671)
 
     def test_filter_regions_coverage(self):
-        cov = GCoverage()
+        cov = GCoverages()
         cov.load_coverage_from_bigwig(
             filename=os.path.join(script_path,
                                   "test_files/bigwig/test.bw")
@@ -67,7 +67,7 @@ def test_filter_regions_coverage(self):
                          GRegion(sequence="1", start=100, end=500))
 
     def test_total_sequencing_depth(self):
-        cov = GCoverage()
+        cov = GCoverages()
         cov.load_coverage_from_bigwig(
             filename=os.path.join(script_path,
                                   "test_files/bigwig/test.bw")
@@ -76,7 +76,7 @@ def test_total_sequencing_depth(self):
         self.assertEqual(int(res), 272)
 
     def test_scale_coverage(self):
-        cov = GCoverage()
+        cov = GCoverages()
         cov.load_coverage_from_bigwig(
             filename=os.path.join(script_path,
                                   "test_files/bigwig/test.bw")

diff --git a/tests/test_gregions.py b/tests/test_gregions.py
@@ -62,26 +62,26 @@ def test_get_names(self):
         self.assertEqual(regions.get_names(unique=True),
                          ["Feature1", "Feature2", "Feature3", "Feature4"])
 
-    def test_intersect_python(self):
+    def test_intersect(self):
         regions1 = GRegions(name="test")
         regions1.load(filename=os.path.join(script_path,
                                             "test_files/bed/example.bed"))
         regions2 = GRegions(name="test")
         regions2.load(filename=os.path.join(script_path,
                                             "test_files/bed/example2.bed"))
-        intersect = regions1.intersect_python(regions2, mode='OVERLAP')
+        intersect = regions1.intersect(regions2, mode='OVERLAP')
         self.assertEqual(len(intersect), 4)
         self.assertEqual(len(intersect[0]), 500)
         self.assertEqual(len(intersect[1]), 500)
         self.assertEqual(len(intersect[2]), 500)
         self.assertEqual(len(intersect[3]), 500)
-        intersect = regions1.intersect_python(regions2, mode='ORIGINAL')
+        intersect = regions1.intersect(regions2, mode='ORIGINAL')
         self.assertEqual(len(intersect), 4)
         self.assertEqual(len(intersect[0]), 1000)
         self.assertEqual(len(intersect[1]), 1000)
         self.assertEqual(len(intersect[2]), 1000)
         self.assertEqual(len(intersect[3]), 1000)
-        intersect = regions1.intersect_python(regions2, mode='COMP_INCL')
+        intersect = regions1.intersect(regions2, mode='COMP_INCL')
         self.assertEqual(len(intersect), 0)
 
     def test_intersect_array(self):