From f70a5db9d72a8cac4396fe864daa9c846332ba8f Mon Sep 17 00:00:00 2001 From: Joseph Kuo Date: Tue, 20 Feb 2024 17:03:17 +0100 Subject: [PATCH] update examples --- docs/source/examples.rst | 21 +++++++++++++++++++ genomkit/annotation/gannotation.py | 14 ++++++++++--- .../data}/chrom_size/chrom.sizes.hg19 | 0 .../data}/chrom_size/chrom.sizes.hg38 | 0 .../data}/chrom_size/chrom.sizes.mm10 | 0 .../data}/chrom_size/chrom.sizes.mm39 | 0 .../data}/chrom_size/chrom.sizes.mm9 | 0 .../data}/chrom_size/chrom.sizes.tair10 | 0 .../data}/chrom_size/chrom.sizes.zv10 | 0 .../data}/chrom_size/chrom.sizes.zv9 | 0 genomkit/regions/gregions.py | 20 ++++++++++++++++++ setup.py | 2 +- 12 files changed, 53 insertions(+), 4 deletions(-) rename {data => genomkit/data}/chrom_size/chrom.sizes.hg19 (100%) rename {data => genomkit/data}/chrom_size/chrom.sizes.hg38 (100%) rename {data => genomkit/data}/chrom_size/chrom.sizes.mm10 (100%) rename {data => genomkit/data}/chrom_size/chrom.sizes.mm39 (100%) rename {data => genomkit/data}/chrom_size/chrom.sizes.mm9 (100%) rename {data => genomkit/data}/chrom_size/chrom.sizes.tair10 (100%) rename {data => genomkit/data}/chrom_size/chrom.sizes.zv10 (100%) rename {data => genomkit/data}/chrom_size/chrom.sizes.zv9 (100%) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 8c3aa14..6a35aa8 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -7,7 +7,28 @@ Extract exon, intron, and intergenetic regions in BED format from a GTF file ``GTF`` ``BED`` +``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate 3 BED files as below: +- hg38_exons.bed +- hg38_introns.bed +- hg38_intergenic_regions.bed + +.. code-block:: python + + from genomkit import GRegions + from genomkit import GAnnotation + + gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf") + genes = gtf.get_regions(element_type="gene") + exons = gtf.get_regions(element_type="exon") + introns = genes.subtract(exons, inplace=False) + + chromosomes = GRegions(name="chromosomes") + chromosomes.get_chromosomes(organism="hg38") + intergenic_regions = chromosomes.subtract(genes, inplace=False) + exons.write(filename="hg38_exons.bed") + introns.write(filename="hg38_introns.bed") + intergenic_regions.write(filename="hg38_intergenic_regions.bed") Get all promoter regions in BED format from a GTF file diff --git a/genomkit/annotation/gannotation.py b/genomkit/annotation/gannotation.py index 28c4cee..cdf8e1a 100644 --- a/genomkit/annotation/gannotation.py +++ b/genomkit/annotation/gannotation.py @@ -49,6 +49,7 @@ def load_data(self): if feature_type == 'gene': gene_id = attributes['gene_id'].strip('"') self.genes[gene_id] = { + 'id': gene_id, 'chr': fields[0], 'start': int(fields[3]), 'end': int(fields[4]), @@ -62,6 +63,7 @@ def load_data(self): transcript_id = attributes['transcript_id'].strip('"') gene_id = attributes['gene_id'].strip('"') self.transcripts[transcript_id] = { + 'id': transcript_id, 'gene_id': gene_id, 'chr': fields[0], 'start': int(fields[3]), @@ -77,6 +79,7 @@ def load_data(self): exon_id = attributes['exon_id'].strip('"') transcript_id = attributes['transcript_id'].strip('"') self.exons[exon_id] = { + 'id': exon_id, 'transcript_id': transcript_id, 'chr': fields[0], 'start': int(fields[3]), @@ -166,7 +169,6 @@ def filter_elements(self, element_type, attribute=None, value=None): :param value: Value of the attribute to filter on. :return: List of filtered elements. """ - filtered_elements = [] if element_type == 'gene': elements = self.genes.values() elif element_type == 'transcript': @@ -176,6 +178,7 @@ def filter_elements(self, element_type, attribute=None, value=None): else: raise ValueError("Invalid element type. Supported types are" "'gene', 'transcript', 'exon'.") + filtered_elements = [] for element in elements: if attribute and value: if attribute in element and element[attribute] == value: @@ -205,11 +208,16 @@ def get_regions(self, element_type: str, element_type, attribute=attribute, value=value) res = GRegions() for element in filtered_elements: + extra_data = [":".join([attribute, + ",".join(list(element[attribute]))]) + for attribute in element.keys() + if attribute not in ["id", "chr", "start", "end", + "strand"]] region = GRegion(sequence=element["chr"], start=element["start"], end=element["end"], orientation=element["strand"], - name=element["gene_name"], - data=[element["gene_type"]]) + name=element["id"], + data=extra_data) res.add(region) return res diff --git a/data/chrom_size/chrom.sizes.hg19 b/genomkit/data/chrom_size/chrom.sizes.hg19 similarity index 100% rename from data/chrom_size/chrom.sizes.hg19 rename to genomkit/data/chrom_size/chrom.sizes.hg19 diff --git a/data/chrom_size/chrom.sizes.hg38 b/genomkit/data/chrom_size/chrom.sizes.hg38 similarity index 100% rename from data/chrom_size/chrom.sizes.hg38 rename to genomkit/data/chrom_size/chrom.sizes.hg38 diff --git a/data/chrom_size/chrom.sizes.mm10 b/genomkit/data/chrom_size/chrom.sizes.mm10 similarity index 100% rename from data/chrom_size/chrom.sizes.mm10 rename to genomkit/data/chrom_size/chrom.sizes.mm10 diff --git a/data/chrom_size/chrom.sizes.mm39 b/genomkit/data/chrom_size/chrom.sizes.mm39 similarity index 100% rename from data/chrom_size/chrom.sizes.mm39 rename to genomkit/data/chrom_size/chrom.sizes.mm39 diff --git a/data/chrom_size/chrom.sizes.mm9 b/genomkit/data/chrom_size/chrom.sizes.mm9 similarity index 100% rename from data/chrom_size/chrom.sizes.mm9 rename to genomkit/data/chrom_size/chrom.sizes.mm9 diff --git a/data/chrom_size/chrom.sizes.tair10 b/genomkit/data/chrom_size/chrom.sizes.tair10 similarity index 100% rename from data/chrom_size/chrom.sizes.tair10 rename to genomkit/data/chrom_size/chrom.sizes.tair10 diff --git a/data/chrom_size/chrom.sizes.zv10 b/genomkit/data/chrom_size/chrom.sizes.zv10 similarity index 100% rename from data/chrom_size/chrom.sizes.zv10 rename to genomkit/data/chrom_size/chrom.sizes.zv10 diff --git a/data/chrom_size/chrom.sizes.zv9 b/genomkit/data/chrom_size/chrom.sizes.zv9 similarity index 100% rename from data/chrom_size/chrom.sizes.zv9 rename to genomkit/data/chrom_size/chrom.sizes.zv9 diff --git a/genomkit/regions/gregions.py b/genomkit/regions/gregions.py index d313946..6b81ac9 100644 --- a/genomkit/regions/gregions.py +++ b/genomkit/regions/gregions.py @@ -3,6 +3,7 @@ import copy import numpy as np from .io import load_BED +import os ########################################################################### @@ -183,6 +184,25 @@ def extend_fold(self, upstream: float = 0.0, downstream: float = 0.0, output.add(r) return output + def load_chrom_size_file(self, file_path): + with open(file_path, 'r') as file: + for line in file: + parts = line.strip().split('\t') + self.add(GRegion(sequence=parts[0], start=0, + end=int(parts[1]))) + + def get_chromosomes(self, organism: str): + import pkg_resources + chrome_size_file = 'data/chrom_size/chrom.sizes.' + organism + file_path = pkg_resources.resource_filename('genomkit', + chrome_size_file) + if os.path.exists(file_path): + self.load_chrom_size_file(file_path) + elif os.path.exists(organism): + self.load_chrom_size_file(file_path) + else: + print(organism + " chromosome size file does not exist") + def intersect_python(self, target, mode: str = "OVERLAP", rm_duplicates: bool = False): """Return a GRegions for the intersections between the two given diff --git a/setup.py b/setup.py index f5d0fcc..983e48a 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def find_version(*file_paths): long_description_content_type='text/markdown', url='https://github.com/chaochungkuo/genomkit', packages=find_packages(), - package_data={'genomkit': ['data/*']}, + package_data={'genomkit': ['data/chrom_size/*']}, install_requires=requirements, # entry_points={ # 'console_scripts': [