fix the gff issue with exon

Runsheng · Sep 4, 2022 · 473ff99 · 473ff99
1 parent 508fe6a
commit 473ff99
Show file tree

Hide file tree

Showing 13 changed files with 343 additions and 129 deletions.
diff --git a/Readme.md b/Readme.md
@@ -1,4 +1,6 @@
 # TrackCluster
+![PyPI](https://img.shields.io/pypi/v/trackcluster?color=green)
+
 Trackcluster is an isoform calling and quantification pipeline for Nanopore direct-RNA long reads.
 
 ## Table of Contents
@@ -41,6 +43,15 @@ pip install ./trackcluster
 # test if all dependencies are installed
 trackrun.py test --install
 
+# prepare the reference annotation bed file from gff file
+# tested on Ensembl, WormBase and Arapost gff
+gff2bigg.py -i ensemblxxxx.gff3 -o ref.bed 
+# WormBase full gff contains too many information, need to extract the lines from WormBase only
+cat c_elegans.PRJNA13758.WS266.annotations.gff3 |grep WormBase > ws266.gff
+gff2bigg.py -i ws266.gff -o ref.bed
+# the ref.bed can be sorted to speed up the analysis
+bedtools sort -i ref.bed > refs.bed # refs.bed contains the sorted, know transcripts from gff annotation
+
 # generate the read track from minimap2 bam file
 bam2bigg.py -b group1.bam -o group1.bed
 bam2bigg.py -b group2.bam -o group2.bed
@@ -50,12 +61,16 @@ cat group1.bed group2.bed > read.bed
 bedtools sort -i read.bed > reads.bed
 
 # Examples for running commands:
-trackrun.py clusterj -s reads.bed -r refs.bed -t 40 # run in junction mode, generate the isoform.bed
+trackrun.py clusterj -s reads.bed -r refs.bed -t 40 # run in junction mode, will generate the isoform.bed
 trackrun.py count -s reads.bed -r refs.bed -i isoform.bed # generate the csv file for isoform expression
+# alternative for cluster
+trackrun.py cluster -s reads.bed -r refs.bed -t 40 # run in exon/intron intersection mode， slower, will generate the isoform.bed
+
+# the post analysis could include the classification of novel isoforms
 trackrun.py desc --isoform isoform.bed --reference ref.bed > desc.bed  # generate the description for each novel isoform
+# this part can be run directly on reads, to count the frequency of splicing events in reads, like intron_retention
+trackrun.py desc --isoform reads.bed --reference ref.bed > reads_desc.bed 
 
-# alternative for cluster
-trackrun.py cluster -s reads.bed -r refs.bed -t 40 # run in exon/intron intersection mode， slower
 ```
 
 

diff --git a/script/gff2bigg.py b/script/gff2bigg.py
@@ -7,28 +7,26 @@
 parentdir = os.path.dirname(os.path.dirname(currentdir))
 sys.path.insert(0,parentdir)
 
-from trackcluster import convert
+from trackcluster.convert import gff_to_bigGenePred
+from trackcluster.gff import GFF
+from trackcluster.tracklist import write_bigg
 
 parser=argparse.ArgumentParser()
 parser.add_argument("-i", "--gff",
                     help="the sorted gff file")
 parser.add_argument("-o", "--out", default="bigg.bed",
                     help="the output bigGenePred file name")
 
-parser.add_argument("-k", "--key", default="Name",
-                    help="The key used as gene name in gff line, like Name=let-1")
+parser.add_argument("-k", "--key", default="ID",
+                    help="The key used as gene name in gff line attr column, default is ID, which is used in most Ensembl gff."
+                         "like ID=gene:ENSMUSG00000064842;Name=let-1")
 
 args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
 
 # make a file using the functions
 
-fw=open(args.out, "w")
+gff = GFF(args.gff)
+bigg_list = gff_to_bigGenePred(gff, indicator=args.key)
 
-gff = convert.GFF(args.gff, args.key)
-bigg_list = convert.gff_to_bigGenePred(gff)
+write_bigg(bigg_list, args.out)
 
-for bigg in bigg_list:
-    fw.write(bigg.to_str())
-    fw.write("\n")
-
-fw.close()
diff --git a/script/trackrun.py b/script/trackrun.py
@@ -106,17 +106,17 @@ def cluster(self):
         #args = parser.parse_args(args=None if sys.argv[2:] else ['--help'])
 
         flow_cluster_all_gene_novel(wkdir=args.folder,
-                                     prefix=args.prefix,
-                                     nano_bed=args.sample,
-                                     gff_bed=args.reference,
-                                     core=args.thread,
-                                     f1=args.intersect1,
-                                     f2=args.intersect2,
-                                     count_cutoff=args.count,
-                                     batchsize=args.batchsize,
-                                    intronweight=args.intronweight,
-                                    cutoff1=args.cutoff1,
-                                    cutoff2=args.cutoff2)
+                                 prefix=args.prefix,
+                                 nano_bed=args.sample,
+                                 gff_bed=args.reference,
+                                 core=args.thread,
+                                 f1=args.intersect1,
+                                 f2=args.intersect2,
+                                 count_cutoff=args.count,
+                                 batchsize=args.batchsize,
+                                 intronweight=args.intronweight,
+                                 cutoff1=args.cutoff1,
+                                 cutoff2=args.cutoff2)
 
 
     def clusterj(self):

diff --git a/test/convert_test.py b/test/convert_test.py
@@ -36,6 +36,29 @@ def test_cigar_count(self):
 
         print((cigar_count(cigar_tuple)))
 
+    def test_ensembl(self):
+        ensembl_file = "/data/reference/mouse/Mus_musculus.GRCm39.107.chr.gff3"
+        gff=GFF(ensembl_file)
+        bigglist=gff_to_bigGenePred(gff)
+        print(len(bigglist))
+
+    def test_tair(self):
+        # 52059 mRNA
+        gff_file = "/data/reference/tair/GCF_000001735.4_TAIR10.1_genomic.gff"
+        gff=GFF(gff_file)
+        bigglist=gff_to_bigGenePred(gff)
+        print(len(bigglist))
+
+    def test_wormbase(self):
+        gff_file = "/data/reference/cel/ws266.gff"
+        gff=GFF(gff_file)
+        gff.transcript_format("ID")
+        #print(gff.transcript_to_gene["T"])
+        bigglist=gff_to_bigGenePred(gff)
+        print(len(bigglist))
+
+
+
 
     def tearDown(self):
         self.bigg = None
diff --git a/test/gff_test.py b/test/gff_test.py
@@ -13,16 +13,33 @@ class GFFTest(unittest.TestCase):
     def setUp(self):
         #self.gff = GFF("/home/zhaolab1/reference/ce10_ws266.gff")
         self.gff=GFF("./genes/unc52/unc52.gff")
+        self.attr_str="ID=Transcript:ZC101.2e;Parent=Gene:WBGene00006787;Name=ZC101.2e;wormpep=WP:CE18424;locus=unc-52"
 
     def test_format_write(self):
         self.gff.gene_format()
         print((len(list(self.gff.gene_d.keys()))))
         print((list(self.gff.gene_d.keys())[0:10]))
-        self.gff.gff_write("./genes/unc52/unc52_write.gff", keys=["ZC101.2"])
+        self.gff.gff_write("./genes/unc52/unc52_write_2.gff")
         print("test gff IO done")
 
-    def test_parser(self):
-        print((parse_attributes("ID=Transcript:ZC101.2e;Parent=Gene:WBGene00006787;Name=ZC101.2e;wormpep=WP:CE18424;locus=unc-52")))
+    def test_parser_attr(self):
+        print(parse_attributes(self.attr_str))
+
+    def test_attr2str(self):
+        attr_dic=parse_attributes(self.attr_str)
+        attr_str_new=attr_dic_to_str(attr_dic)
+        print(attr_str_new)
+        print(self.attr_str)
+
+    def test_parser_gff_and_reverse(self):
+        self.gff.to_list()
+        gff_one=self.gff.gff_string_list[1]
+        gff_record=parse_gff_line(gff_one)
+        print(gff_record)
+        gff_str=gff_record_to_str(gff_record)
+        print(gff_one)
+        print(gff_str)
+
 
     def test_to_transcript(self):
         self.gff.transcript_format()
@@ -31,6 +48,54 @@ def test_to_transcript(self):
         print((self.gff.transcript_d))
         print("Finish to_transcript test")
 
+    def test_ensembl(self):
+        """
+        test the gff performace in ensembl gff format
+        :return:
+        """
+        ensembl_file="/data/reference/mouse/Mus_musculus.GRCm39.107.chr.gff3"
+        gff=GFF(ensembl_file)
+        gff.to_list()
+        gff.gene_format()
+        gff.transcript_format()
+        print(len(gff.gene_d), len(gff.transcript_d))
+
+        def __test_gene():
+            """
+            test code for ensembl mouse gff
+            :return:
+            """
+            keys=["gene:ENSMUSG00000114538","gene:ENSMUSG00000103506" ]
+            for k in keys:
+                print(k)
+                print(gene2line[k])
+                for i in gene2line[k]:
+                    print(self.gff_records[i])
+
+
+
+    def test_ucsc(self):
+        """
+        todo: this need a gtg parser
+        test the gff convert from ucsc gtf or gff
+        :return:
+        """
+        pass
+
+
+    def test_wormbase(self):
+        pass
+
+    def test_genbank(self):
+        """
+        test the gff format from genbank bioproject
+        :return:
+        """
+        pass
+
+
+
+
 
     def tearDown(self):
         self.bigg = None

diff --git a/test/track_test.py b/test/track_test.py
@@ -13,7 +13,7 @@
 class Test(unittest.TestCase):
     def setUp(self):
         genes=["unc52", "AT1G06860", "AT2G02100", "AT2G43410"]
-        gene=genes[-1]
+        gene=genes[0]
 
         bigg_nano=[]
         with open("./genes/{gene}/{gene}_nano.bed".format(gene=gene)) as f:
@@ -131,7 +131,7 @@ def __test_bindseq_reverse(self):
     def test_bindseq_forward(self):
         #ref_dict = fasta2dic("/home/li/reference/tair/tair10.fa")
         mrna_dict = fasta2dic("./genes/AT2G02100/AT2G02100.fasta")
-        pass
+
 
     def test_mrna_pos_to_chro(self):
         sample = self.bigg[0]
@@ -143,11 +143,12 @@ def test_mrna_pos_to_chro(self):
         ### check if the nucl of the same pos is indentical
         ### chro is 0 based and mRNA is 1 based
 
-    def __test_orfs(self):
+    def test_orfs(self):
         """
         can only run with ce10 ref
         """
-        ref_dict=fasta2dic("/home/zhaolab1/reference/ce10_ucsc.fa")
+        # ref_dic is a large external file
+        ref_dict=fasta2dic("/data/reference/ce10_ucsc.fa")
         bigg_one=self.bigg[30]
         bigg_one.bind_chroseq(ref_dict, gap=0, intron=False)
         print((bigg_one.seq_chro))

diff --git a/test/utils_test.py b/test/utils_test.py
@@ -36,6 +36,9 @@ def test_is_bin_in(self):
 
         myexe("/bin/bash -i -c minimap2")
 
+    def test_is_package_installed(self):
+        print(is_package_installed("Bio"))
+
     def test_summary(self):
         logger = log_summary()
         logger.info("This is just a test, print number 100 as float" + "," + str(float(1) * 100))

diff --git a/trackcluster/__init__.py b/trackcluster/__init__.py
@@ -1,5 +1,5 @@
 # For relative imports to work in Python 3.6
 import os, sys
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-__version__="0.1.1"
+__version__="0.1.2"
 __all__ = []
diff --git a/trackcluster/convert.py b/trackcluster/convert.py
@@ -117,7 +117,7 @@ def cigar_count(cigar_tuple):
 
 #### another function
 
-def gff_to_bigGenePred(gff):
+def gff_to_bigGenePred(gff, indicator="ID"):
     """
     Note gff is 1 start and bigGenePred is 0
     :param gff: a GFF class with only one gene inside
@@ -127,11 +127,15 @@ def gff_to_bigGenePred(gff):
     bigg_list=[]
 
     if gff.transcript_d is None:
-        gff.transcript_format()
+        gff.transcript_format(indicator=indicator)
+    #print(list(gff.transcript_d.keys()))
 
     for key in list(gff.transcript_d.keys()):
         bigg = bigGenePred()
-        gene=gff.transcript_to_gene[key]
+        try:
+            gene=gff.transcript_to_gene[key]
+        except KeyError:
+            print("KeyERROR", key)
 
         for n, record in enumerate(gff.transcript_d[key]):
             #use the first line to get basic infor

diff --git a/trackcluster/flow.py b/trackcluster/flow.py
@@ -431,4 +431,25 @@ def flow_cluster_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,f1=0.01
 ########################################################################################################
 
 def flow_annotation(wkdir, ):
-    pass
+    pass
+
+
+############
+def flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
+                                 f1=0.01, f2=0.01, count_cutoff=5, batchsize=2000):
+    """
+    The overall full run pipeline for impatient people
+    :param wkdir:
+    :param prefix:
+    :param ref_fasta:
+    :param fastq_l:
+    :param nano_bed:
+    :param gff_bed:
+    :param core:
+    :param f1:
+    :param f2:
+    :param count_cutoff:
+    :param batchsize:
+    :return:
+    """
+    pass