Skip to content

Commit

Permalink
fix the gff issue with exon
Browse files Browse the repository at this point in the history
  • Loading branch information
Runsheng committed Sep 4, 2022
1 parent 508fe6a commit 473ff99
Show file tree
Hide file tree
Showing 13 changed files with 343 additions and 129 deletions.
21 changes: 18 additions & 3 deletions Readme.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# TrackCluster
![PyPI](https://img.shields.io/pypi/v/trackcluster?color=green)

Trackcluster is an isoform calling and quantification pipeline for Nanopore direct-RNA long reads.

## Table of Contents
Expand Down Expand Up @@ -41,6 +43,15 @@ pip install ./trackcluster
# test if all dependencies are installed
trackrun.py test --install

# prepare the reference annotation bed file from gff file
# tested on Ensembl, WormBase and Arapost gff
gff2bigg.py -i ensemblxxxx.gff3 -o ref.bed
# WormBase full gff contains too many information, need to extract the lines from WormBase only
cat c_elegans.PRJNA13758.WS266.annotations.gff3 |grep WormBase > ws266.gff
gff2bigg.py -i ws266.gff -o ref.bed
# the ref.bed can be sorted to speed up the analysis
bedtools sort -i ref.bed > refs.bed # refs.bed contains the sorted, know transcripts from gff annotation

# generate the read track from minimap2 bam file
bam2bigg.py -b group1.bam -o group1.bed
bam2bigg.py -b group2.bam -o group2.bed
Expand All @@ -50,12 +61,16 @@ cat group1.bed group2.bed > read.bed
bedtools sort -i read.bed > reads.bed

# Examples for running commands:
trackrun.py clusterj -s reads.bed -r refs.bed -t 40 # run in junction mode, generate the isoform.bed
trackrun.py clusterj -s reads.bed -r refs.bed -t 40 # run in junction mode, will generate the isoform.bed
trackrun.py count -s reads.bed -r refs.bed -i isoform.bed # generate the csv file for isoform expression
# alternative for cluster
trackrun.py cluster -s reads.bed -r refs.bed -t 40 # run in exon/intron intersection mode, slower, will generate the isoform.bed

# the post analysis could include the classification of novel isoforms
trackrun.py desc --isoform isoform.bed --reference ref.bed > desc.bed # generate the description for each novel isoform
# this part can be run directly on reads, to count the frequency of splicing events in reads, like intron_retention
trackrun.py desc --isoform reads.bed --reference ref.bed > reads_desc.bed

# alternative for cluster
trackrun.py cluster -s reads.bed -r refs.bed -t 40 # run in exon/intron intersection mode, slower
```


Expand Down
20 changes: 9 additions & 11 deletions script/gff2bigg.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,26 @@
parentdir = os.path.dirname(os.path.dirname(currentdir))
sys.path.insert(0,parentdir)

from trackcluster import convert
from trackcluster.convert import gff_to_bigGenePred
from trackcluster.gff import GFF
from trackcluster.tracklist import write_bigg

parser=argparse.ArgumentParser()
parser.add_argument("-i", "--gff",
help="the sorted gff file")
parser.add_argument("-o", "--out", default="bigg.bed",
help="the output bigGenePred file name")

parser.add_argument("-k", "--key", default="Name",
help="The key used as gene name in gff line, like Name=let-1")
parser.add_argument("-k", "--key", default="ID",
help="The key used as gene name in gff line attr column, default is ID, which is used in most Ensembl gff."
"like ID=gene:ENSMUSG00000064842;Name=let-1")

args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])

# make a file using the functions

fw=open(args.out, "w")
gff = GFF(args.gff)
bigg_list = gff_to_bigGenePred(gff, indicator=args.key)

gff = convert.GFF(args.gff, args.key)
bigg_list = convert.gff_to_bigGenePred(gff)
write_bigg(bigg_list, args.out)

for bigg in bigg_list:
fw.write(bigg.to_str())
fw.write("\n")

fw.close()
22 changes: 11 additions & 11 deletions script/trackrun.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,17 @@ def cluster(self):
#args = parser.parse_args(args=None if sys.argv[2:] else ['--help'])

flow_cluster_all_gene_novel(wkdir=args.folder,
prefix=args.prefix,
nano_bed=args.sample,
gff_bed=args.reference,
core=args.thread,
f1=args.intersect1,
f2=args.intersect2,
count_cutoff=args.count,
batchsize=args.batchsize,
intronweight=args.intronweight,
cutoff1=args.cutoff1,
cutoff2=args.cutoff2)
prefix=args.prefix,
nano_bed=args.sample,
gff_bed=args.reference,
core=args.thread,
f1=args.intersect1,
f2=args.intersect2,
count_cutoff=args.count,
batchsize=args.batchsize,
intronweight=args.intronweight,
cutoff1=args.cutoff1,
cutoff2=args.cutoff2)


def clusterj(self):
Expand Down
23 changes: 23 additions & 0 deletions test/convert_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,29 @@ def test_cigar_count(self):

print((cigar_count(cigar_tuple)))

def test_ensembl(self):
ensembl_file = "/data/reference/mouse/Mus_musculus.GRCm39.107.chr.gff3"
gff=GFF(ensembl_file)
bigglist=gff_to_bigGenePred(gff)
print(len(bigglist))

def test_tair(self):
# 52059 mRNA
gff_file = "/data/reference/tair/GCF_000001735.4_TAIR10.1_genomic.gff"
gff=GFF(gff_file)
bigglist=gff_to_bigGenePred(gff)
print(len(bigglist))

def test_wormbase(self):
gff_file = "/data/reference/cel/ws266.gff"
gff=GFF(gff_file)
gff.transcript_format("ID")
#print(gff.transcript_to_gene["T"])
bigglist=gff_to_bigGenePred(gff)
print(len(bigglist))




def tearDown(self):
self.bigg = None
71 changes: 68 additions & 3 deletions test/gff_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,33 @@ class GFFTest(unittest.TestCase):
def setUp(self):
#self.gff = GFF("/home/zhaolab1/reference/ce10_ws266.gff")
self.gff=GFF("./genes/unc52/unc52.gff")
self.attr_str="ID=Transcript:ZC101.2e;Parent=Gene:WBGene00006787;Name=ZC101.2e;wormpep=WP:CE18424;locus=unc-52"

def test_format_write(self):
self.gff.gene_format()
print((len(list(self.gff.gene_d.keys()))))
print((list(self.gff.gene_d.keys())[0:10]))
self.gff.gff_write("./genes/unc52/unc52_write.gff", keys=["ZC101.2"])
self.gff.gff_write("./genes/unc52/unc52_write_2.gff")
print("test gff IO done")

def test_parser(self):
print((parse_attributes("ID=Transcript:ZC101.2e;Parent=Gene:WBGene00006787;Name=ZC101.2e;wormpep=WP:CE18424;locus=unc-52")))
def test_parser_attr(self):
print(parse_attributes(self.attr_str))

def test_attr2str(self):
attr_dic=parse_attributes(self.attr_str)
attr_str_new=attr_dic_to_str(attr_dic)
print(attr_str_new)
print(self.attr_str)

def test_parser_gff_and_reverse(self):
self.gff.to_list()
gff_one=self.gff.gff_string_list[1]
gff_record=parse_gff_line(gff_one)
print(gff_record)
gff_str=gff_record_to_str(gff_record)
print(gff_one)
print(gff_str)


def test_to_transcript(self):
self.gff.transcript_format()
Expand All @@ -31,6 +48,54 @@ def test_to_transcript(self):
print((self.gff.transcript_d))
print("Finish to_transcript test")

def test_ensembl(self):
"""
test the gff performace in ensembl gff format
:return:
"""
ensembl_file="/data/reference/mouse/Mus_musculus.GRCm39.107.chr.gff3"
gff=GFF(ensembl_file)
gff.to_list()
gff.gene_format()
gff.transcript_format()
print(len(gff.gene_d), len(gff.transcript_d))

def __test_gene():
"""
test code for ensembl mouse gff
:return:
"""
keys=["gene:ENSMUSG00000114538","gene:ENSMUSG00000103506" ]
for k in keys:
print(k)
print(gene2line[k])
for i in gene2line[k]:
print(self.gff_records[i])



def test_ucsc(self):
"""
todo: this need a gtg parser
test the gff convert from ucsc gtf or gff
:return:
"""
pass


def test_wormbase(self):
pass

def test_genbank(self):
"""
test the gff format from genbank bioproject
:return:
"""
pass





def tearDown(self):
self.bigg = None
Expand Down
9 changes: 5 additions & 4 deletions test/track_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class Test(unittest.TestCase):
def setUp(self):
genes=["unc52", "AT1G06860", "AT2G02100", "AT2G43410"]
gene=genes[-1]
gene=genes[0]

bigg_nano=[]
with open("./genes/{gene}/{gene}_nano.bed".format(gene=gene)) as f:
Expand Down Expand Up @@ -131,7 +131,7 @@ def __test_bindseq_reverse(self):
def test_bindseq_forward(self):
#ref_dict = fasta2dic("/home/li/reference/tair/tair10.fa")
mrna_dict = fasta2dic("./genes/AT2G02100/AT2G02100.fasta")
pass


def test_mrna_pos_to_chro(self):
sample = self.bigg[0]
Expand All @@ -143,11 +143,12 @@ def test_mrna_pos_to_chro(self):
### check if the nucl of the same pos is indentical
### chro is 0 based and mRNA is 1 based

def __test_orfs(self):
def test_orfs(self):
"""
can only run with ce10 ref
"""
ref_dict=fasta2dic("/home/zhaolab1/reference/ce10_ucsc.fa")
# ref_dic is a large external file
ref_dict=fasta2dic("/data/reference/ce10_ucsc.fa")
bigg_one=self.bigg[30]
bigg_one.bind_chroseq(ref_dict, gap=0, intron=False)
print((bigg_one.seq_chro))
Expand Down
3 changes: 3 additions & 0 deletions test/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ def test_is_bin_in(self):

myexe("/bin/bash -i -c minimap2")

def test_is_package_installed(self):
print(is_package_installed("Bio"))

def test_summary(self):
logger = log_summary()
logger.info("This is just a test, print number 100 as float" + "," + str(float(1) * 100))
Expand Down
2 changes: 1 addition & 1 deletion trackcluster/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# For relative imports to work in Python 3.6
import os, sys
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
__version__="0.1.1"
__version__="0.1.2"
__all__ = []
10 changes: 7 additions & 3 deletions trackcluster/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def cigar_count(cigar_tuple):

#### another function

def gff_to_bigGenePred(gff):
def gff_to_bigGenePred(gff, indicator="ID"):
"""
Note gff is 1 start and bigGenePred is 0
:param gff: a GFF class with only one gene inside
Expand All @@ -127,11 +127,15 @@ def gff_to_bigGenePred(gff):
bigg_list=[]

if gff.transcript_d is None:
gff.transcript_format()
gff.transcript_format(indicator=indicator)
#print(list(gff.transcript_d.keys()))

for key in list(gff.transcript_d.keys()):
bigg = bigGenePred()
gene=gff.transcript_to_gene[key]
try:
gene=gff.transcript_to_gene[key]
except KeyError:
print("KeyERROR", key)

for n, record in enumerate(gff.transcript_d[key]):
#use the first line to get basic infor
Expand Down
23 changes: 22 additions & 1 deletion trackcluster/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,4 +431,25 @@ def flow_cluster_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,f1=0.01
########################################################################################################

def flow_annotation(wkdir, ):
pass
pass


############
def flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
f1=0.01, f2=0.01, count_cutoff=5, batchsize=2000):
"""
The overall full run pipeline for impatient people
:param wkdir:
:param prefix:
:param ref_fasta:
:param fastq_l:
:param nano_bed:
:param gff_bed:
:param core:
:param f1:
:param f2:
:param count_cutoff:
:param batchsize:
:return:
"""
pass
Loading

0 comments on commit 473ff99

Please sign in to comment.