update local test

chaochungkuo · Mar 2, 2024 · cfd8751 · cfd8751
1 parent c0fc22d
commit cfd8751
Show file tree

Hide file tree

Showing 6 changed files with 467 additions and 1 deletion.
diff --git a/docs/source/examples_bed.rst b/docs/source/examples_bed.rst
@@ -86,3 +86,51 @@ Generate a heapmap from two BED files: one BED file is used as windows and the o
 
 Sometimes your signals (scores) are stored in a BED file (column 5), instead of BEDGraph or BigWig. And now you want to visualize the interactions between these two BED files. For example, `DMSs.bed` contains the differential methylated CpGs with the score for hypermethylation or hypomethylation. And `TSSs.bed` include all transcription start sites with a window of 2000 bp. Now you want to visualize their interaction with a heatmap.
 
+
+============================
+Starting with many BED files
+============================
+
+
+When you have multiple BED files and want to investigate the interactions among those sets of genomic elements, you need to use `GRegionsSet` class. Below are some usage cases.
+
+Test the relevance of multiple peaks in BED files to different biotypes in GTF
+---------------------------
+
+For example, you have 4 BED files for different peaks:
+
+- `peaks_A.bed`
+- `peaks_B.bed`
+- `peaks_C.bed`
+- `peaks_D.bed`
+
+And you have generated some BED files for different biotypes in human (:ref:`Please refer to this tutorial <_gtf_genes_biotypes>`).
+
+- `hg38_protein_coding_genes.bed`
+- `hg38_lncRNA_genes.bed`
+- `hg38_snRNA_genes.bed`
+- `hg38_miRNA_genes.bed`
+
+Now you want to check the overlaps of the peaks with the genes.
+
+.. code-block:: python
+
+    from genomkit import GRegionsSet
+    # load peaks
+    peaks_dict = {"A": "./peaks_A.bed",
+                  "B": "./peaks_B.bed",
+                  "C": "./peaks_C.bed",
+                  "D": "./peaks_D.bed",}
+    peaks = GRegionsSet(name="peaks", load_dict=peaks_dict)
+    # load biotypes
+    biotypes_dict = {"protein_coding": "./hg38_protein_coding_genes.bed",
+                     "lncRNA": "./hg38_lncRNA_genes.bed",
+                     "snRNA": "./hg38_snRNA_genes.bed",
+                     "miRNA": "./hg38_miRNA_genes.bed",}
+    biotypes = GRegionsSet(name="biotypes", load_dict=biotypes_dict)
+    # Get a dataframe for overlapping counts
+    overlaps = peaks.count_overlaps(query_set=biotypes)
+    # Visualization
+
+    # Testing the association
+    peaks.test_
diff --git a/docs/source/examples_gtf.rst b/docs/source/examples_gtf.rst
@@ -67,6 +67,8 @@ Get all promoter regions in BED format from a GTF file
                             center="5prime", inplace=False)
     promoters.write(filename="hg38_promoters.bed")
 
+.. _gtf_genes_biotypes:
+
 Extract the genes by their biotypes from a GTF file
 ---------------------------------------------------
 

diff --git a/genomkit/regions/gregions_set.py b/genomkit/regions/gregions_set.py
@@ -77,12 +77,15 @@ def get_lengths(self):
             res[name] = len(regions)
         return res
 
-    def count_overlaps(self, query_set):
+    def count_overlaps(self, query_set, percentage: bool = False):
         """Return a pandas dataframe of the numbers of overlapping regions
         between the reference GRegionsSet (self) and the query GRegionsSet.
 
         :param query_set: Query GRegionsSet
         :type query_set: GRegionsSet
+        :param percentage: Convert the contingency table into percentage. The
+                           sum per row (reference) is 100%, defaults to False
+        :type percentage: bool, optional
         :return: Matrix of numbers of overlaps
         :rtype: dataframe
         """
@@ -98,4 +101,6 @@ def count_overlaps(self, query_set):
         df = pd.DataFrame(res,
                           index=row_names,
                           columns=col_names)
+        if percentage:
+            df = df.div(df.sum(axis=1), axis=0) * 100
         return df
diff --git a/tests/local_test/.DS_Store b/tests/local_test/.DS_Store