From 70f61aa53f61445c9ec3f2d8417dee2d5651bb76 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Fri, 23 Mar 2018 14:04:23 -0400 Subject: [PATCH 1/3] :feet: on 0.4.7 : dummy species contains biotype of gene --- celseq2/dummy_species.py | 62 +++++++++++++++-------------- celseq2/prepare_annotation_model.py | 58 ++++++++++++++++++++------- celseq2/template/config.yaml | 2 + celseq2/version.py | 2 +- celseq2/workflow/celseq2.snakemake | 49 ++++++++++++++++++++++- 5 files changed, 127 insertions(+), 46 deletions(-) diff --git a/celseq2/dummy_species.py b/celseq2/dummy_species.py index 61a533c..5b0c60c 100644 --- a/celseq2/dummy_species.py +++ b/celseq2/dummy_species.py @@ -11,7 +11,7 @@ import argparse -def gtf_attr_str(gene_id=None, gene_name=None, +def gtf_attr_str(gene_id=None, gene_name=None, gene_biotype=None, transcript_id=None, exon_num=None): ''' Simple writer to fill 'attribute' column of GTF @@ -26,6 +26,8 @@ def gtf_attr_str(gene_id=None, gene_name=None, return(None) out = "gene_id \"{gid}\"; gene_name \"{gname}\";".format(gid=gene_id, gname=gene_name) + if gene_biotype: + out += " gene_biotype \"{biotype}\";".format(biotype=gene_biotype) if transcript_id: out += " transcript_id \"{txid}\";".format(txid=transcript_id) if exon_num: @@ -62,7 +64,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): frame = '.' out = [] # g0 - g_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0') + g_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -73,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', + tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding', transcript_id='tx0') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -86,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', + exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding', transcript_id='tx0', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -103,7 +105,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): s_stream, e_stream = e_stream - 2 * len_exon - 1 * len_intron + 1, e_stream # g8 - g_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8') + g_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', gene_biotype='lincRNA') g = gtf_str(chrm='chr2', src=src, feature='gene', start=s_stream, @@ -115,7 +117,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(g)) out.append(g) tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', - transcript_id='tx8') + transcript_id='tx8', gene_biotype='lincRNA') tx = gtf_str(chrm='chr2', src=src, feature='transcript', start=s_stream, @@ -127,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', + exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', gene_biotype='lincRNA', transcript_id='tx8', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr2', src=src, @@ -144,7 +146,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): s_stream = e_stream + 1 + len_intergenic # g1 - g_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1') + g_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -155,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', + tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA', transcript_id='tx1') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -168,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(3, 0, -1): - exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', + exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA', transcript_id='tx1', exon_num=i) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -185,7 +187,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): s_stream = e_stream + 1 + len_intergenic # g2 - g_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2') + g_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -196,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', + tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', transcript_id='tx2.1') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -209,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2, 0, -1): - exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', + exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', transcript_id='tx2', exon_num=i) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -224,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): out.append(exon) s_stream = e_stream + 1 + len_intron - tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', + tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', transcript_id='tx2.2') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -236,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=tx_attr) fout.write('{}\n'.format(tx)) out.append(tx) - exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', + exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', transcript_id='tx2.2', exon_num=1) exon = gtf_str(chrm='chr1', src=src, feature='exon', @@ -251,7 +253,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): s_stream = e_stream + 1 + len_intergenic # g3 - g_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3') + g_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -262,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', + tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding', transcript_id='tx3') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -275,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', + exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding', transcript_id='tx3', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -293,7 +295,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): s_stream = e_stream + 1 + len_intergenic # g4 - g_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4') + g_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -304,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', + tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding', transcript_id='tx4') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -317,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', + exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding', transcript_id='tx4', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -334,7 +336,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): # g5 s_stream, e_stream = e_stream - len_exon + 1, e_stream - g_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5') + g_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -345,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', + tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding', transcript_id='tx5') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -358,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(1, 0, -1): - exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', + exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding', transcript_id='tx5', exon_num=i) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -375,7 +377,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): s_stream = e_stream + 1 + len_intergenic # g6 - g_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6') + g_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -386,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', + tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding', transcript_id='tx6') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -398,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=tx_attr) fout.write('{}\n'.format(tx)) out.append(tx) - exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', + exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding', transcript_id='tx6', exon_num=1) exon = gtf_str(chrm='chr1', src=src, feature='exon', @@ -411,7 +413,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(exon)) out.append(exon) # g7 - g_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7') + g_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding') g = gtf_str(chrm='chr1', src=src, feature='gene', start=s_stream, @@ -422,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', + tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding', transcript_id='tx7') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -435,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', + exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding', transcript_id='tx7', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, diff --git a/celseq2/prepare_annotation_model.py b/celseq2/prepare_annotation_model.py index 70a8e60..fa4c792 100644 --- a/celseq2/prepare_annotation_model.py +++ b/celseq2/prepare_annotation_model.py @@ -8,30 +8,54 @@ def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon', + gene_types = (), stranded=True, dumpto=None, verbose=False): + ''' + Prepare a feature model. + + Output: (features, exported_genes) where: + - features: HTSeq.GenomicArrayOfSets() + - exported_genes: a sorted list + + For example, feature_atrr = 'gene_name', feature_type = 'exon', + gene_types = ('protein_coding', 'lincRNA'): + - features: all exons ~ all gnames mapping and ready for counting + - exported_genes: only protein_coding and lincRNA gnames are visible + Quantification used the full genes but only the selected genes are reported. + ''' features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded) fh_gff = HTSeq.GFF_Reader(gff_fpath) - all_genes = set() + exported_genes = set() i = 0 for gff in fh_gff: if verbose and i % 100000 == 0: print_logger('Processing {:,} lines of GFF...'.format(i)) i += 1 - if gff.type == feature_type: - features[gff.iv] += gff.attr[feature_atrr].strip() - all_genes.add(gff.attr[feature_atrr].strip()) - else: - pass + if gff.type != feature_type: + continue + + features[gff.iv] += gff.attr[feature_atrr].strip() + + if not feature_atrr.startswith('gene'): + exported_genes.add(gff.attr[feature_atrr].strip()) + continue + + if not gene_types: + exported_genes.add(gff.attr[feature_atrr].strip()) + continue + + if gff.attr.get('gene_type', None) in gene_types: + exported_genes.add(gff.attr[feature_atrr].strip()) - # if i == 5000: - # break ## test mode print_logger('Processed {:,} lines of GFF...'.format(i)) + if exported_genes: + exported_genes = sorted(exported_genes) if dumpto: with open(dumpto, 'wb') as fh: - pickle.dump((features, all_genes), fh) - return((features, all_genes)) + pickle.dump((features, exported_genes), fh) + return((features, exported_genes)) def main(): @@ -41,12 +65,17 @@ def main(): help='File path to GFF') parser.add_argument('--feature-atrr', type=str, default='gene_id', - help=('Reserved word for feature_atrr in GFF to be ' - 'called a \'gene\'.')) + help=('Reserved word for feature_atrr in GTF/GFF to be' + ' considered a Gene, e.g., \"gene_id\".')) parser.add_argument('--feature-type', type=str, default='exon', - help=('Reserved word for feature type in GFF to be ' - 'the components of \'gene\'.')) + help=('Reserved word for feature type in 3rd column' + ' in GTF/GFF to be the components of \'Gene\',' + ' e.g., \"exon\".')) + parser.add_argument('--gene-types', type=tuple, + default=(), + help=('Reserved words for the attrbute \"gene_biotype\"' + ' (Gene type), e.g., protein_coding, lincRNA')) parser.add_argument('--strandless', dest='stranded', action='store_false') parser.set_defaults(stranded=True) parser.add_argument('--dumpto', type=str, metavar='FILENAME', @@ -59,6 +88,7 @@ def main(): _ = cook_anno_model(gff_fpath=args.gff_file, feature_atrr=args.feature_atrr, feature_type=args.feature_type, + gene_types=args.gene_types, stranded=args.stranded, dumpto=args.dumpto, verbose=args.verbose) diff --git a/celseq2/template/config.yaml b/celseq2/template/config.yaml index 043723c..9bb96fb 100644 --- a/celseq2/template/config.yaml +++ b/celseq2/template/config.yaml @@ -21,6 +21,8 @@ GFF: '/absolute/path/to/wonderful.gtf.gz' # Refer: http://htseq.readthedocs.io/en/master/count.html FEATURE_ID: 'gene_name' FEATURE_CONTENT: 'exon' +# GENE_BIOTYPE: ['protein_coding', 'lincRNA'] +GENE_BIOTYPE: ## Demultiplexing ## FASTQ_QUAL_MIN_OF_BC: 10 diff --git a/celseq2/version.py b/celseq2/version.py index ab45471..1e4826d 100644 --- a/celseq2/version.py +++ b/celseq2/version.py @@ -1 +1 @@ -__version__ = '0.4.6' +__version__ = '0.4.7' diff --git a/celseq2/workflow/celseq2.snakemake b/celseq2/workflow/celseq2.snakemake index 29547cc..727642e 100644 --- a/celseq2/workflow/celseq2.snakemake +++ b/celseq2/workflow/celseq2.snakemake @@ -47,6 +47,8 @@ STAR = config.get('STAR', None) GFF = config.get('GFF', None) FEATURE_ID = config.get('FEATURE_ID', 'gene_name') FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon') +GENE_BIOTYPE = config.get('GENE_BIOTYPE', None) +if not GENE_BIOTYPE: GENE_BIOTYPE = (); ## Demultiplexing ## FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10 @@ -222,7 +224,7 @@ rule setup_dir: mkfolder(d) ## HT-seq Count UMI ## -rule cook_annotation: +rule COOK_ANNOTATION: input: '_done_setupdir', output: @@ -233,12 +235,57 @@ rule cook_annotation: run: _ = cook_anno_model(GFF, feature_atrr=FEATURE_ID, feature_type=FEATURE_CONTENT, + gene_types=GENE_BIOTYPE, stranded=True, dumpto=output.anno, verbose=verbose) shell('touch {output.flag}') # Combo-demultiplexing +rule COMBO_DEMULTIPLEXING: + input: + flag2 = '_done_annotation', + flag1 = '_done_setupdir', + output: + '_done_combodemultiplex', # dynamic() is not allowed if task being called on terminal + message: 'Performing combo-demultiplexing' + threads: len(item_names) + run: + # Demultiplx fastq in Process pool + p = Pool(threads) + for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2): + itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid) + try: + os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz')) + os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz')) + except OSError: + pass + itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid) + itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid, + 'demultiplexing.log') + print_logger('Demultiplexing {}'.format(itemid)) + cmd = " ".join(["bc_demultiplex", + itemr1, + itemr2, + "--bc-index {}".format(BC_INDEX_FPATH), + "--bc-seq-column {}".format(BC_SEQ_COLUMN), + "--bc-index-used {}".format(itembc), + "--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC), + "--umi-start-position {}".format( + UMI_START_POSITION), + "--bc-start-position {}".format(BC_START_POSITION), + "--umi-length {}".format(UMI_LENGTH), + "--bc-length {}".format(BC_LENGTH), + "--cut-length {}".format(CUT_LENGTH), + "--out-dir {}".format(itemid_fqs_dir), + "--is-gzip ", + "--stats-file {}".format(itemid_log)]) + p.apply_async(shell, args=(cmd,)) + p.close() + p.join() + shell('touch {output} ') + + rule combo_demultiplexing: input: flag2 = '_done_annotation', From 0f74ed8490783d9bf368c726683420e2a55b7654 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Fri, 23 Mar 2018 14:26:13 -0400 Subject: [PATCH 2/3] :feet: 0.4.7 : remove unnecessary sort --- celseq2/prepare_annotation_model.py | 2 +- celseq2/workflow/celseq2.snakemake | 23 ++++++++++------------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/celseq2/prepare_annotation_model.py b/celseq2/prepare_annotation_model.py index fa4c792..549dfc2 100644 --- a/celseq2/prepare_annotation_model.py +++ b/celseq2/prepare_annotation_model.py @@ -45,7 +45,7 @@ def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon', exported_genes.add(gff.attr[feature_atrr].strip()) continue - if gff.attr.get('gene_type', None) in gene_types: + if gff.attr.get('gene_biotype', None) in gene_types: exported_genes.add(gff.attr[feature_atrr].strip()) print_logger('Processed {:,} lines of GFF...'.format(i)) diff --git a/celseq2/workflow/celseq2.snakemake b/celseq2/workflow/celseq2.snakemake index 727642e..4c095fd 100644 --- a/celseq2/workflow/celseq2.snakemake +++ b/celseq2/workflow/celseq2.snakemake @@ -439,8 +439,7 @@ rule count_umi: aln_diag = join_path(DIR_PROJ, SUBDIR_DIAG, '{itemid}', '{bc}.pkl'), message: 'Counting {input.sam}' run: - features_f, all_genes = pickle.load(open(input.gff, 'rb')) - all_genes = sorted(all_genes) + features_f, _ = pickle.load(open(input.gff, 'rb')) umi_cnt, umi_set, aln_cnt = count_umi(sam_fpath=input.sam, features=features_f, len_umi=UMI_LENGTH, @@ -469,8 +468,7 @@ rule summarize_umi_matrix_per_item: expid=sample_list, itemid=item_names), flag = '_done_umimatrix_per_item', run: - _, all_genes = pickle.load(open(input.gff, 'rb')) - all_genes = sorted(all_genes) + _, export_genes = pickle.load(open(input.gff, 'rb')) # item -> dict(cell_bc -> Counter(umi_vector)) item_expr_matrix = defaultdict(dict) @@ -484,10 +482,10 @@ rule summarize_umi_matrix_per_item: exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME'] # E1 for bc, cnt in expr_dict.items(): - expr_dict[bc] = pd.Series([cnt[x] for x in all_genes], - index=all_genes) + expr_dict[bc] = pd.Series([cnt[x] for x in export_genes], + index=export_genes) - expr_df = pd.DataFrame(expr_dict, index=all_genes).fillna(0) + expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0) expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR, exp_id, item_id, 'expr.csv')) expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR, @@ -511,8 +509,7 @@ rule summarize_umi_matrix_per_experiment: expid=list(set(sample_list))), flag = '_done_umimatrix_per_experiment', run: - _, all_genes = pickle.load(open(input.gff, 'rb')) - all_genes = sorted(all_genes) + _, export_genes = pickle.load(open(input.gff, 'rb')) sample_list = SAMPLE_TABLE['SAMPLE_NAME'].values @@ -532,7 +529,7 @@ rule summarize_umi_matrix_per_experiment: exp_expr_matrix[exp_id][bc_name] = umiset_stream continue umiset_cached = exp_expr_matrix[exp_id][bc_name] - for x in all_genes: + for x in export_genes: y1 = exp_expr_matrix[exp_id][bc_name].get(x, set()) y2 = umiset_stream.get(x, set()) exp_expr_matrix[exp_id][bc_name][x] = y1 | y2 @@ -540,9 +537,9 @@ rule summarize_umi_matrix_per_experiment: for exp_id, expr_dict in exp_expr_matrix.items(): for bc, cnt in expr_dict.items(): cnt = _flatten_umi_set(cnt) - expr_dict[bc] = pd.Series([cnt[x] for x in all_genes], - index=all_genes) - expr_df = pd.DataFrame(expr_dict, index=all_genes).fillna(0) + expr_dict[bc] = pd.Series([cnt[x] for x in export_genes], + index=export_genes) + expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0) expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR, exp_id, 'expr.csv')) expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR, From 1d19ff98d1f6c146254de144fd6c75fe4e5214cd Mon Sep 17 00:00:00 2001 From: yy1533 Date: Fri, 23 Mar 2018 14:32:31 -0400 Subject: [PATCH 3/3] :cn: release 0.4.7 --- docs/about/release_note.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/about/release_note.md b/docs/about/release_note.md index bdd93fc..b8193c6 100644 --- a/docs/about/release_note.md +++ b/docs/about/release_note.md @@ -10,6 +10,20 @@ --- +## :fa-flag-checkered: **v0.4.7** + +:fa-calendar: **2018-03-23** + +:fa-star: **Features** + +- Support "gene_biotype" selection. +- Better support SpatialTranscriptome data. + - `celseq2-to-st` understands `expr.csv` file + - `celseq2-to-st` recognizes "(1 out of many)" annotation inside GTF/GFF. +- Robust design of work flow. + +--- + ## :fa-flag-checkered: **v0.4.4** :fa-calendar: **2018-02-13**