diff --git a/Readme.md b/Readme.md index 31075c1..3883ef2 100644 --- a/Readme.md +++ b/Readme.md @@ -1,4 +1,6 @@ # TrackCluster +![PyPI](https://img.shields.io/pypi/v/trackcluster?color=green) + Trackcluster is an isoform calling and quantification pipeline for Nanopore direct-RNA long reads. ## Table of Contents @@ -41,6 +43,15 @@ pip install ./trackcluster # test if all dependencies are installed trackrun.py test --install +# prepare the reference annotation bed file from gff file +# tested on Ensembl, WormBase and Arapost gff +gff2bigg.py -i ensemblxxxx.gff3 -o ref.bed +# WormBase full gff contains too many information, need to extract the lines from WormBase only +cat c_elegans.PRJNA13758.WS266.annotations.gff3 |grep WormBase > ws266.gff +gff2bigg.py -i ws266.gff -o ref.bed +# the ref.bed can be sorted to speed up the analysis +bedtools sort -i ref.bed > refs.bed # refs.bed contains the sorted, know transcripts from gff annotation + # generate the read track from minimap2 bam file bam2bigg.py -b group1.bam -o group1.bed bam2bigg.py -b group2.bam -o group2.bed @@ -50,12 +61,16 @@ cat group1.bed group2.bed > read.bed bedtools sort -i read.bed > reads.bed # Examples for running commands: -trackrun.py clusterj -s reads.bed -r refs.bed -t 40 # run in junction mode, generate the isoform.bed +trackrun.py clusterj -s reads.bed -r refs.bed -t 40 # run in junction mode, will generate the isoform.bed trackrun.py count -s reads.bed -r refs.bed -i isoform.bed # generate the csv file for isoform expression +# alternative for cluster +trackrun.py cluster -s reads.bed -r refs.bed -t 40 # run in exon/intron intersection mode, slower, will generate the isoform.bed + +# the post analysis could include the classification of novel isoforms trackrun.py desc --isoform isoform.bed --reference ref.bed > desc.bed # generate the description for each novel isoform +# this part can be run directly on reads, to count the frequency of splicing events in reads, like intron_retention +trackrun.py desc --isoform reads.bed --reference ref.bed > reads_desc.bed -# alternative for cluster -trackrun.py cluster -s reads.bed -r refs.bed -t 40 # run in exon/intron intersection mode, slower ``` diff --git a/script/gff2bigg.py b/script/gff2bigg.py index 08489a9..9386118 100755 --- a/script/gff2bigg.py +++ b/script/gff2bigg.py @@ -7,7 +7,9 @@ parentdir = os.path.dirname(os.path.dirname(currentdir)) sys.path.insert(0,parentdir) -from trackcluster import convert +from trackcluster.convert import gff_to_bigGenePred +from trackcluster.gff import GFF +from trackcluster.tracklist import write_bigg parser=argparse.ArgumentParser() parser.add_argument("-i", "--gff", @@ -15,20 +17,16 @@ parser.add_argument("-o", "--out", default="bigg.bed", help="the output bigGenePred file name") -parser.add_argument("-k", "--key", default="Name", - help="The key used as gene name in gff line, like Name=let-1") +parser.add_argument("-k", "--key", default="ID", + help="The key used as gene name in gff line attr column, default is ID, which is used in most Ensembl gff." + "like ID=gene:ENSMUSG00000064842;Name=let-1") args = parser.parse_args(args=None if sys.argv[1:] else ['--help']) # make a file using the functions -fw=open(args.out, "w") +gff = GFF(args.gff) +bigg_list = gff_to_bigGenePred(gff, indicator=args.key) -gff = convert.GFF(args.gff, args.key) -bigg_list = convert.gff_to_bigGenePred(gff) +write_bigg(bigg_list, args.out) -for bigg in bigg_list: - fw.write(bigg.to_str()) - fw.write("\n") - -fw.close() diff --git a/script/trackrun.py b/script/trackrun.py index afbde8f..0a5608a 100644 --- a/script/trackrun.py +++ b/script/trackrun.py @@ -106,17 +106,17 @@ def cluster(self): #args = parser.parse_args(args=None if sys.argv[2:] else ['--help']) flow_cluster_all_gene_novel(wkdir=args.folder, - prefix=args.prefix, - nano_bed=args.sample, - gff_bed=args.reference, - core=args.thread, - f1=args.intersect1, - f2=args.intersect2, - count_cutoff=args.count, - batchsize=args.batchsize, - intronweight=args.intronweight, - cutoff1=args.cutoff1, - cutoff2=args.cutoff2) + prefix=args.prefix, + nano_bed=args.sample, + gff_bed=args.reference, + core=args.thread, + f1=args.intersect1, + f2=args.intersect2, + count_cutoff=args.count, + batchsize=args.batchsize, + intronweight=args.intronweight, + cutoff1=args.cutoff1, + cutoff2=args.cutoff2) def clusterj(self): diff --git a/test/convert_test.py b/test/convert_test.py index d10ea56..b872fc9 100644 --- a/test/convert_test.py +++ b/test/convert_test.py @@ -36,6 +36,29 @@ def test_cigar_count(self): print((cigar_count(cigar_tuple))) + def test_ensembl(self): + ensembl_file = "/data/reference/mouse/Mus_musculus.GRCm39.107.chr.gff3" + gff=GFF(ensembl_file) + bigglist=gff_to_bigGenePred(gff) + print(len(bigglist)) + + def test_tair(self): + # 52059 mRNA + gff_file = "/data/reference/tair/GCF_000001735.4_TAIR10.1_genomic.gff" + gff=GFF(gff_file) + bigglist=gff_to_bigGenePred(gff) + print(len(bigglist)) + + def test_wormbase(self): + gff_file = "/data/reference/cel/ws266.gff" + gff=GFF(gff_file) + gff.transcript_format("ID") + #print(gff.transcript_to_gene["T"]) + bigglist=gff_to_bigGenePred(gff) + print(len(bigglist)) + + + def tearDown(self): self.bigg = None \ No newline at end of file diff --git a/test/gff_test.py b/test/gff_test.py index 887977e..a7c8503 100644 --- a/test/gff_test.py +++ b/test/gff_test.py @@ -13,16 +13,33 @@ class GFFTest(unittest.TestCase): def setUp(self): #self.gff = GFF("/home/zhaolab1/reference/ce10_ws266.gff") self.gff=GFF("./genes/unc52/unc52.gff") + self.attr_str="ID=Transcript:ZC101.2e;Parent=Gene:WBGene00006787;Name=ZC101.2e;wormpep=WP:CE18424;locus=unc-52" def test_format_write(self): self.gff.gene_format() print((len(list(self.gff.gene_d.keys())))) print((list(self.gff.gene_d.keys())[0:10])) - self.gff.gff_write("./genes/unc52/unc52_write.gff", keys=["ZC101.2"]) + self.gff.gff_write("./genes/unc52/unc52_write_2.gff") print("test gff IO done") - def test_parser(self): - print((parse_attributes("ID=Transcript:ZC101.2e;Parent=Gene:WBGene00006787;Name=ZC101.2e;wormpep=WP:CE18424;locus=unc-52"))) + def test_parser_attr(self): + print(parse_attributes(self.attr_str)) + + def test_attr2str(self): + attr_dic=parse_attributes(self.attr_str) + attr_str_new=attr_dic_to_str(attr_dic) + print(attr_str_new) + print(self.attr_str) + + def test_parser_gff_and_reverse(self): + self.gff.to_list() + gff_one=self.gff.gff_string_list[1] + gff_record=parse_gff_line(gff_one) + print(gff_record) + gff_str=gff_record_to_str(gff_record) + print(gff_one) + print(gff_str) + def test_to_transcript(self): self.gff.transcript_format() @@ -31,6 +48,54 @@ def test_to_transcript(self): print((self.gff.transcript_d)) print("Finish to_transcript test") + def test_ensembl(self): + """ + test the gff performace in ensembl gff format + :return: + """ + ensembl_file="/data/reference/mouse/Mus_musculus.GRCm39.107.chr.gff3" + gff=GFF(ensembl_file) + gff.to_list() + gff.gene_format() + gff.transcript_format() + print(len(gff.gene_d), len(gff.transcript_d)) + + def __test_gene(): + """ + test code for ensembl mouse gff + :return: + """ + keys=["gene:ENSMUSG00000114538","gene:ENSMUSG00000103506" ] + for k in keys: + print(k) + print(gene2line[k]) + for i in gene2line[k]: + print(self.gff_records[i]) + + + + def test_ucsc(self): + """ + todo: this need a gtg parser + test the gff convert from ucsc gtf or gff + :return: + """ + pass + + + def test_wormbase(self): + pass + + def test_genbank(self): + """ + test the gff format from genbank bioproject + :return: + """ + pass + + + + def tearDown(self): self.bigg = None diff --git a/test/track_test.py b/test/track_test.py index 3c1889d..5f00a06 100644 --- a/test/track_test.py +++ b/test/track_test.py @@ -13,7 +13,7 @@ class Test(unittest.TestCase): def setUp(self): genes=["unc52", "AT1G06860", "AT2G02100", "AT2G43410"] - gene=genes[-1] + gene=genes[0] bigg_nano=[] with open("./genes/{gene}/{gene}_nano.bed".format(gene=gene)) as f: @@ -131,7 +131,7 @@ def __test_bindseq_reverse(self): def test_bindseq_forward(self): #ref_dict = fasta2dic("/home/li/reference/tair/tair10.fa") mrna_dict = fasta2dic("./genes/AT2G02100/AT2G02100.fasta") - pass + def test_mrna_pos_to_chro(self): sample = self.bigg[0] @@ -143,11 +143,12 @@ def test_mrna_pos_to_chro(self): ### check if the nucl of the same pos is indentical ### chro is 0 based and mRNA is 1 based - def __test_orfs(self): + def test_orfs(self): """ can only run with ce10 ref """ - ref_dict=fasta2dic("/home/zhaolab1/reference/ce10_ucsc.fa") + # ref_dic is a large external file + ref_dict=fasta2dic("/data/reference/ce10_ucsc.fa") bigg_one=self.bigg[30] bigg_one.bind_chroseq(ref_dict, gap=0, intron=False) print((bigg_one.seq_chro)) diff --git a/test/utils_test.py b/test/utils_test.py index 1cea3a2..58973f0 100644 --- a/test/utils_test.py +++ b/test/utils_test.py @@ -36,6 +36,9 @@ def test_is_bin_in(self): myexe("/bin/bash -i -c minimap2") + def test_is_package_installed(self): + print(is_package_installed("Bio")) + def test_summary(self): logger = log_summary() logger.info("This is just a test, print number 100 as float" + "," + str(float(1) * 100)) diff --git a/trackcluster/__init__.py b/trackcluster/__init__.py index 0a75a58..d899c04 100644 --- a/trackcluster/__init__.py +++ b/trackcluster/__init__.py @@ -1,5 +1,5 @@ # For relative imports to work in Python 3.6 import os, sys sys.path.append(os.path.dirname(os.path.realpath(__file__))) -__version__="0.1.1" +__version__="0.1.2" __all__ = [] diff --git a/trackcluster/convert.py b/trackcluster/convert.py index a8f0690..f55c21a 100644 --- a/trackcluster/convert.py +++ b/trackcluster/convert.py @@ -117,7 +117,7 @@ def cigar_count(cigar_tuple): #### another function -def gff_to_bigGenePred(gff): +def gff_to_bigGenePred(gff, indicator="ID"): """ Note gff is 1 start and bigGenePred is 0 :param gff: a GFF class with only one gene inside @@ -127,11 +127,15 @@ def gff_to_bigGenePred(gff): bigg_list=[] if gff.transcript_d is None: - gff.transcript_format() + gff.transcript_format(indicator=indicator) + #print(list(gff.transcript_d.keys())) for key in list(gff.transcript_d.keys()): bigg = bigGenePred() - gene=gff.transcript_to_gene[key] + try: + gene=gff.transcript_to_gene[key] + except KeyError: + print("KeyERROR", key) for n, record in enumerate(gff.transcript_d[key]): #use the first line to get basic infor diff --git a/trackcluster/flow.py b/trackcluster/flow.py index 926a6ae..a06b035 100644 --- a/trackcluster/flow.py +++ b/trackcluster/flow.py @@ -431,4 +431,25 @@ def flow_cluster_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,f1=0.01 ######################################################################################################## def flow_annotation(wkdir, ): - pass \ No newline at end of file + pass + + +############ +def flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30, + f1=0.01, f2=0.01, count_cutoff=5, batchsize=2000): + """ + The overall full run pipeline for impatient people + :param wkdir: + :param prefix: + :param ref_fasta: + :param fastq_l: + :param nano_bed: + :param gff_bed: + :param core: + :param f1: + :param f2: + :param count_cutoff: + :param batchsize: + :return: + """ + pass diff --git a/trackcluster/gff.py b/trackcluster/gff.py index c641902..0a4556c 100644 --- a/trackcluster/gff.py +++ b/trackcluster/gff.py @@ -46,7 +46,19 @@ def parse_gff_line(line): 'attributes': parse_attributes(line_l[8]) } - return GFF_record(**data) + return GFF_record(**data) + +def gff_record_to_str(record): + """ + reverse function for parse_gff_line + :param record: + :return: + """ + attr_str=attr_dic_to_str(record.attributes) + line_l=list(record) + line_l[-1]=attr_str + line_str=["." if x is None else str(x) for x in line_l] # change the None back to . + return "\t".join(line_str) def parse_attributes(attr_string): @@ -64,30 +76,54 @@ def parse_attributes(attr_string): else: attr_dic = OrderedDict() for attribute in attr_string.strip().split(";"): - if len(attribute) > 0: - elems = attribute.strip().split('=') - key = elems[0] - value = ' '.join(elems[1:]) - if value[0] == '"': - value = value[1:] - if value[-1] == '"': - value = value[0:-1] - attr_dic[key] = value + if len(attribute) > 0 : + if "=" in attribute.strip(): + elems = attribute.strip().split('=') + key = elems[0] + value = ' '.join(elems[1:]) + if value[0] == '"': + value = value[1:] + if value[-1] == '"': + value = value[0:-1] + attr_dic[key] = value return attr_dic +def attr_dic_to_str(attr_dic): + """ + reverse function of parser_attr + :param attr_dic: + :return: string str for a valid gff file + """ + if len(attr_dic)==0: + return "" + else: + line_l=[] + for k, v in attr_dic.items(): + str_one=k+"="+v + line_l.append(str_one) + return ";".join(line_l) + + class GFF(object): """ A general class to parse the gff file For speed issue, just read all lines into mem + + parser the gff using + """ - def __init__(self, gfffile, indicator="sequence_name"): + def __init__(self, gfffile): + + """ + :param gfffile: + """ # init in file reading self.filename=gfffile self.gff_string_list = [] - self.gff_to_list() # here use list for the flexibility instead of generator + self.to_list() # here use list for the flexibility instead of generator self.gff_records=[] # hold multiple GFF_record in a list @@ -96,9 +132,8 @@ def __init__(self, gfffile, indicator="sequence_name"): self.transcript_to_gene=None self.transcript_d=None - self.indicator=indicator - def gff_to_list(self): + def to_list(self): """ :return: gfflines list """ @@ -106,73 +141,135 @@ def gff_to_list(self): lines = f.readlines() for line in lines: if line[0] != "#" and len(line) > 18: # ignore the # and small lines - self.gff_string_list.append(line) + self.gff_string_list.append(line.strip()) + + def parser(self): + if len(self.gff_string_list)==0: + self.to_list() + else: + for line in self.gff_string_list: + self.gff_records.append(parse_gff_line(line)) - def gene_format(self): + def gene_format(self, indicator="ID"): """ format the gff line to blocks, {gene name, [mRNAline, cdsline...]} + :param indicator: usually be "ID", "Name", "sequence_name", the name used in gene line, which will be further used in exon line and/or cds line + the gff need to be sorted and all gene/mRNA/exon/cds/UTR is together - """ - indicator=self.indicator - gene_l=self.gff_string_list + select the gene lines, including the + gene, mRNA, exon,CDS, UTR + exclude the other lines, may need to add the other lines like pseudogene, + should try to include all keys containing an exon + switch to bottom-up method to avoid non-exon information + :return: + """ + if len(self.gff_records)==0: + self.parser() + + #### + parent2line_dic={} # dic for {namexxx:[1,2,3...]}. collect the level 1 ID and parent information + nontop_s=set() # set to store all lines numbers, which contains parent in attr + id2pos={} # {namexxx:2} + pos2id={} + + # collect all three dic + for n, record in enumerate(self.gff_records): + try: + id = record.attributes[indicator] + id2pos[id] = n + pos2id[n]=id + except KeyError: + pass + + try: # record all parent + p1=record.attributes["Parent"] + try: + parent2line_dic[p1].append(n) + except KeyError: + parent2line_dic[p1]=[n] + nontop_s.add(n) + except KeyError: + pass + + #print(parent2line_dic) + #print(nontop_s) + #print(id2pos) + + # add the top key to in id2line to the gene2line + # for ensembl gff, the top line with indicator="ID" will keep only genes + # usually the line is one mRNA line + gene2line={} + for id, pos_l in parent2line_dic.items(): + try: + if id2pos[id] not in nontop_s or len(pos_l)==0: # some gff will have missing parent line, it is wrong but it happens + gene2line[id]=pos_l + except KeyError: + pass + #print("gene2line,",len(gene2line), "parent2line", len(parent2line_dic)) + #print_dic(gene2line) + + # reverse the line2gene + line2gene={} + for gene, pos_l in gene2line.items(): + for pos in pos_l: + try: + record=self.gff_records[pos] + name=record.attributes[indicator] + line2gene[name]=gene + except KeyError: + pass + + #add the nontop ones to top keys + # use id2line to glue all untop line and their subline to gene2line + # the value in gene2line need to be expaned + for id, pos_l in parent2line_dic.items(): + try: + if id2pos[id] in nontop_s: # not top key, has parent, need to add to gene2line + try: + gene=line2gene[id] # + #print("in", gene) + gene2line[gene].extend(pos_l) + except KeyError: + pass + except KeyError: + pass + # create the gene_d gene_d = OrderedDict() - for line in gene_l: - line_l = line.split("\t") - - gff_type = line_l[2] - attributes = line_l[-1].strip() - - if gff_type == "gene": - for attribute in attributes.split(";"): - if indicator in attribute: # the key "sequence_name" only for wormbase gff - kk = attribute.split("=")[1] - gene_d[kk] = [] - gene_d[kk].append(line) - - else: - for attribute in attributes.split(";"): - if "ID" in attribute: - parent = attribute.split("=")[1] - if kk in parent: - gene_d[kk].append(line) - break - if "Parent" in attribute: - parent = attribute.split("=")[1] - if kk in parent: - gene_d[kk].append(line) - break + for k, v in gene2line.items(): + gene_d[k]=[] + line_gene=id2pos[k] + gene_d[k].append(self.gff_records[line_gene]) + + line_other=v + for i in line_other: + gene_d[k].append(self.gff_records[i]) self.gene_d=gene_d - def transcript_format(self, keys=None): + def transcript_format(self, indicator="ID", keys=None): transcript_to_gene={} # {mrna_name: genename} transcript_d=OrderedDict() # {mrna_name: gff_line} if self.gene_d is None: - self.gene_format() + self.gene_format(indicator=indicator) if keys is None: # if no chose ones, use all mrna keys=list(self.gene_d.keys()) for k in keys: - gff_list=self.gene_d[k] + record_list=self.gene_d[k] # use record instead of gff # first test the type of the gene by test if CDS is in the gff_line - type_set=set([x.split("\t")[2] for x in gff_list]) + type_set=set([x.type for x in record_list]) is_protein_coding=True if "CDS" in type_set else False exon_junction=set() - for line in gff_list: - record=parse_gff_line(line) - + for record in record_list: if record.type == "exon": - try: - transcript_name = record.attributes["Parent"].split(":")[1] # only for wormbase - except IndexError: - transcript_name = record.attributes["Parent"] # general + transcript_name = record.attributes["Parent"] # general transcript_to_gene[transcript_name] = k try: @@ -184,23 +281,6 @@ def transcript_format(self, keys=None): exon_junction.add(record.start) exon_junction.add(record.end) - for line in gff_list: - # exon can be used to record genes from non-coding genes - if is_protein_coding: # need to add the not in exon cds to the key - record=parse_gff_line(line) - if record.type=="CDS" and (record.start not in exon_junction) and (record.end not in exon_junction): # cds can have multiple parents - parent_l= record.attributes["Parent"].split(",") - for parent in parent_l: - try: - transcript_name=parent.split(":")[1] # only for wormbase - except IndexError: - transcript_name=parent # only for wormbase - #transcript_to_gene[transcript_name]=k - try: - transcript_d[transcript_name].append(record) - except KeyError: - transcript_d[transcript_name]=[] - transcript_d[transcript_name].append(record) for k, v in list(transcript_d.items()): v.sort(key=attrgetter('start')) @@ -209,9 +289,17 @@ def transcript_format(self, keys=None): self.transcript_d=transcript_d def gff_write(self, out, keys=None): + """ + write the formatted or the original gff to a new file + :param out: + :param keys: + :return: + """ - def write_line(): - fw.write(line) + def write_line(gff_record): + gff_str = gff_record_to_str(gff_record) + fw.write(gff_str) + fw.write("\n") fw=open(out,"w") @@ -220,22 +308,10 @@ def write_line(): for k in keys: print(k) - v=self.gene_d[k] - for line in v: - if "gene" in line.split("\t")[2]: - write_line() - for line in v: - if "mRNA" in line.split("\t")[2]: - write_line() - for line in v: - if "exon" in line.split("\t")[2]: - write_line() - for line in v: - if "CDS" in line.split("\t")[2]: - write_line() - for line in v: - if "UTR" in line.split("\t")[2]: - write_line() + record_l=self.gene_d[k] + for record in record_l: + write_line(record) + fw.close() diff --git a/trackcluster/track.py b/trackcluster/track.py index dc8a429..f4622bd 100644 --- a/trackcluster/track.py +++ b/trackcluster/track.py @@ -404,7 +404,6 @@ def find_orfs_with_trans(self, trans_table=1, min_protein_length=10): return None else: from Bio.Seq import Seq - from Bio.Alphabet import generic_dna seq=Seq(self.seq_chro) seq0=seq.translate(table=1) diff --git a/trackcluster/utils.py b/trackcluster/utils.py index 6b6d3c2..e23dce5 100644 --- a/trackcluster/utils.py +++ b/trackcluster/utils.py @@ -252,6 +252,7 @@ def get_file_location(filepath): def list2file(ll, genename_file): """ + IO function write a list to a file :param ll: :param genename_file: @@ -265,6 +266,7 @@ def list2file(ll, genename_file): def file2list(genename_file): """ + IO function reverse of file2list :param genename_file: :return: @@ -273,4 +275,11 @@ def file2list(genename_file): with open(genename_file, "r") as f: for line in f.readlines(): gene_l.append(line.strip()) - return gene_l \ No newline at end of file + return gene_l + +def print_dic(dic, n=100): + """ + test code, print the first 100 line of dic + """ + for k in list(dic.keys())[:n]: + print(k, dic[k]) \ No newline at end of file