Skip to content

Commit

Permalink
update examples
Browse files Browse the repository at this point in the history
  • Loading branch information
chaochungkuo committed Feb 20, 2024
1 parent d2ab326 commit f70a5db
Show file tree
Hide file tree
Showing 12 changed files with 53 additions and 4 deletions.
21 changes: 21 additions & 0 deletions docs/source/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,28 @@ Extract exon, intron, and intergenetic regions in BED format from a GTF file

``GTF`` ``BED``

``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate 3 BED files as below:

- hg38_exons.bed
- hg38_introns.bed
- hg38_intergenic_regions.bed

.. code-block:: python
from genomkit import GRegions
from genomkit import GAnnotation
gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
genes = gtf.get_regions(element_type="gene")
exons = gtf.get_regions(element_type="exon")
introns = genes.subtract(exons, inplace=False)
chromosomes = GRegions(name="chromosomes")
chromosomes.get_chromosomes(organism="hg38")
intergenic_regions = chromosomes.subtract(genes, inplace=False)
exons.write(filename="hg38_exons.bed")
introns.write(filename="hg38_introns.bed")
intergenic_regions.write(filename="hg38_intergenic_regions.bed")
Get all promoter regions in BED format from a GTF file
Expand Down
14 changes: 11 additions & 3 deletions genomkit/annotation/gannotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def load_data(self):
if feature_type == 'gene':
gene_id = attributes['gene_id'].strip('"')
self.genes[gene_id] = {
'id': gene_id,
'chr': fields[0],
'start': int(fields[3]),
'end': int(fields[4]),
Expand All @@ -62,6 +63,7 @@ def load_data(self):
transcript_id = attributes['transcript_id'].strip('"')
gene_id = attributes['gene_id'].strip('"')
self.transcripts[transcript_id] = {
'id': transcript_id,
'gene_id': gene_id,
'chr': fields[0],
'start': int(fields[3]),
Expand All @@ -77,6 +79,7 @@ def load_data(self):
exon_id = attributes['exon_id'].strip('"')
transcript_id = attributes['transcript_id'].strip('"')
self.exons[exon_id] = {
'id': exon_id,
'transcript_id': transcript_id,
'chr': fields[0],
'start': int(fields[3]),
Expand Down Expand Up @@ -166,7 +169,6 @@ def filter_elements(self, element_type, attribute=None, value=None):
:param value: Value of the attribute to filter on.
:return: List of filtered elements.
"""
filtered_elements = []
if element_type == 'gene':
elements = self.genes.values()
elif element_type == 'transcript':
Expand All @@ -176,6 +178,7 @@ def filter_elements(self, element_type, attribute=None, value=None):
else:
raise ValueError("Invalid element type. Supported types are"
"'gene', 'transcript', 'exon'.")
filtered_elements = []
for element in elements:
if attribute and value:
if attribute in element and element[attribute] == value:
Expand Down Expand Up @@ -205,11 +208,16 @@ def get_regions(self, element_type: str,
element_type, attribute=attribute, value=value)
res = GRegions()
for element in filtered_elements:
extra_data = [":".join([attribute,
",".join(list(element[attribute]))])
for attribute in element.keys()
if attribute not in ["id", "chr", "start", "end",
"strand"]]
region = GRegion(sequence=element["chr"],
start=element["start"],
end=element["end"],
orientation=element["strand"],
name=element["gene_name"],
data=[element["gene_type"]])
name=element["id"],
data=extra_data)
res.add(region)
return res
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
20 changes: 20 additions & 0 deletions genomkit/regions/gregions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import copy
import numpy as np
from .io import load_BED
import os


###########################################################################
Expand Down Expand Up @@ -183,6 +184,25 @@ def extend_fold(self, upstream: float = 0.0, downstream: float = 0.0,
output.add(r)
return output

def load_chrom_size_file(self, file_path):
with open(file_path, 'r') as file:
for line in file:
parts = line.strip().split('\t')
self.add(GRegion(sequence=parts[0], start=0,
end=int(parts[1])))

def get_chromosomes(self, organism: str):
import pkg_resources
chrome_size_file = 'data/chrom_size/chrom.sizes.' + organism
file_path = pkg_resources.resource_filename('genomkit',
chrome_size_file)
if os.path.exists(file_path):
self.load_chrom_size_file(file_path)
elif os.path.exists(organism):
self.load_chrom_size_file(file_path)
else:
print(organism + " chromosome size file does not exist")

def intersect_python(self, target, mode: str = "OVERLAP",
rm_duplicates: bool = False):
"""Return a GRegions for the intersections between the two given
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def find_version(*file_paths):
long_description_content_type='text/markdown',
url='https://github.com/chaochungkuo/genomkit',
packages=find_packages(),
package_data={'genomkit': ['data/*']},
package_data={'genomkit': ['data/chrom_size/*']},
install_requires=requirements,
# entry_points={
# 'console_scripts': [
Expand Down

0 comments on commit f70a5db

Please sign in to comment.