diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py index 4170a83..dbd7538 100644 --- a/celseq2/demultiplex.py +++ b/celseq2/demultiplex.py @@ -77,6 +77,7 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq, len_umi=6, len_bc=6, len_tx=35, bc_qual_min=10, is_gzip=True, + save_unknown_bc_fastq=False, do_bc_rev_complement=False, do_tx_rev_complement=False, verbose=False): @@ -153,12 +154,13 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq, try: fhout = bc_fhout[cell_bc] except KeyError: - fhout = bc_fhout['UNKNOWNBC_R1'] - fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq, - "+", umibc_qualstr)) - fhout = bc_fhout['UNKNOWNBC_R2'] - fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq, - "+", tx_qualstr)) + if save_unknown_bc_fastq: + fhout = bc_fhout['UNKNOWNBC_R1'] + fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq, + "+", umibc_qualstr)) + fhout = bc_fhout['UNKNOWNBC_R2'] + fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq, + "+", tx_qualstr)) sample_counter['unknown'] += 1 continue @@ -184,20 +186,20 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq, def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath): if stats_fpath is None: - stats_fpath = 'demultiplexing.log' + stats_fpath = 'demultiplexing.csv' try: fh_stats = open(stats_fpath, 'w') except Exception as e: raise Exception(e) - fh_stats.write('BC\tReads(#)\tReads(%)\n') + fh_stats.write('BC,Reads(#),Reads(%)\n') for bc_id, bc_seq in dict_bc_id2seq.items(): # bc_id = '[{:04d}]'.format('-'.join(map(str, bc_id))) - formatter = '{:04d}-{}\t{}\t{:07.3f}\n' + formatter = '{:04d}-{},{},{:07.3f}\n' fh_stats.write(formatter.format(bc_id, bc_seq, stats[bc_seq], stats[bc_seq] / stats['total'] * 100)) - formatter = '{}\t{}\t{:07.3f}\n' + formatter = '{},{},{:07.3f}\n' fh_stats.write(formatter.format('saved', stats['saved'], stats['saved'] / stats['total'] * 100)) fh_stats.write(formatter.format('unknown', stats['unknown'], @@ -247,6 +249,9 @@ def main(): help='Length of CELSeq barcode (default=6)') parser.add_argument('--cut-length', metavar='N', type=int, default=35, help='Length of read on R2 to be mapped. (default=35)') + parser.add_argument('--save-unknown-bc-fastq', + dest='save_unknown_bc_fastq', action='store_true') + parser.set_defaults(save_unknown_bc_fastq=False) parser.add_argument('--verbose', dest='verbose', action='store_true') parser.set_defaults(verbose=False) @@ -269,6 +274,7 @@ def main(): len_tx=args.cut_length, bc_qual_min=args.min_bc_quality, is_gzip=args.is_gzip, + save_unknown_bc_fastq=args.save_unknown_bc_fastq, do_bc_rev_complement=False, do_tx_rev_complement=False, verbose=args.verbose) diff --git a/celseq2/dummy_species.py b/celseq2/dummy_species.py index 5b0c60c..292f38b 100644 --- a/celseq2/dummy_species.py +++ b/celseq2/dummy_species.py @@ -75,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding', transcript_id='tx0') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -88,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding', transcript_id='tx0', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -116,7 +116,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', + tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', transcript_id='tx8', gene_biotype='lincRNA') tx = gtf_str(chrm='chr2', src=src, feature='transcript', @@ -129,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', gene_biotype='lincRNA', + exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', gene_biotype='lincRNA', transcript_id='tx8', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr2', src=src, @@ -157,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA', + tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA', transcript_id='tx1') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -170,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(3, 0, -1): - exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA', + exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA', transcript_id='tx1', exon_num=i) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -198,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding', transcript_id='tx2.1') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -211,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2, 0, -1): - exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding', transcript_id='tx2', exon_num=i) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -226,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): out.append(exon) s_stream = e_stream + 1 + len_intron - tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding', transcript_id='tx2.2') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -238,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=tx_attr) fout.write('{}\n'.format(tx)) out.append(tx) - exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding', transcript_id='tx2.2', exon_num=1) exon = gtf_str(chrm='chr1', src=src, feature='exon', @@ -264,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding', transcript_id='tx3') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -277,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding', transcript_id='tx3', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -306,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding', transcript_id='tx4') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -319,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding', transcript_id='tx4', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -347,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding', transcript_id='tx5') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -360,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(1, 0, -1): - exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding', transcript_id='tx5', exon_num=i) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, @@ -388,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding', transcript_id='tx6') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -400,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=tx_attr) fout.write('{}\n'.format(tx)) out.append(tx) - exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding', transcript_id='tx6', exon_num=1) exon = gtf_str(chrm='chr1', src=src, feature='exon', @@ -424,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): attr=g_attr) fout.write('{}\n'.format(g)) out.append(g) - tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding', + tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding', transcript_id='tx7') tx = gtf_str(chrm='chr1', src=src, feature='transcript', @@ -437,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300): fout.write('{}\n'.format(tx)) out.append(tx) for i in range(2): - exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding', + exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding', transcript_id='tx7', exon_num=i + 1) s_stream, e_stream = s_stream, s_stream + len_exon - 1 exon = gtf_str(chrm='chr1', src=src, diff --git a/celseq2/helper.py b/celseq2/helper.py index ecfd99e..a39f26a 100644 --- a/celseq2/helper.py +++ b/celseq2/helper.py @@ -35,7 +35,7 @@ def ymdhms(): def print_logger(msg): localtime = time.asctime(time.localtime(time.time())) # sys.stderr.write("[ {} ] {}\n".format(localtime, msg)) - print("[ {} ] {}\n".format(localtime, msg)) + print("[ {} ] {}\n".format(localtime, msg), flush=True) def mkfolder(dirpath): diff --git a/celseq2/prepare_annotation_model.py b/celseq2/prepare_annotation_model.py index 549dfc2..fb552b9 100644 --- a/celseq2/prepare_annotation_model.py +++ b/celseq2/prepare_annotation_model.py @@ -5,6 +5,7 @@ import pickle from celseq2.helper import print_logger +# from genometools.ensembl.annotations import get_genes def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon', @@ -50,8 +51,13 @@ def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon', print_logger('Processed {:,} lines of GFF...'.format(i)) + # Use genometools to select exported_genes + # if gene_types: + # exported_genes = get_genes(gff_fpath, valid_biotypes=set(gene_types)) + # exported_genes = list(exported_genes['name'].values) + if exported_genes: - exported_genes = sorted(exported_genes) + exported_genes = tuple(sorted(exported_genes)) if dumpto: with open(dumpto, 'wb') as fh: pickle.dump((features, exported_genes), fh) diff --git a/celseq2/template/config.yaml b/celseq2/template/config.yaml index 9bb96fb..6f5d11d 100644 --- a/celseq2/template/config.yaml +++ b/celseq2/template/config.yaml @@ -1,38 +1,59 @@ -## CEL-seq2 Tech Setting ## +#################################### +## Tech Setting +#################################### BC_INDEX_FPATH: '/absolute/path/to/wonderful_umi.tab' -BC_SEQ_COLUMN: 1 +BC_SEQ_COLUMN: 0 BC_IDs_DEFAULT: '1-96' UMI_START_POSITION: 0 UMI_LENGTH: 6 BC_START_POSITION: 6 BC_LENGTH: 6 -## Tools ## -# 'bowtie2' 'star' +#################################### +## Alignment Tools +#################################### +## Which RNA-seq aligner to use? 'bowtie2' or 'star' ALIGNER: 'bowtie2' -BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index' +## What is the absolute path to command bowtie2? BOWTIE2: '/absolute/path/to/bowtie2' -STAR_INDEX_DIR: '/absolute/path/to/star/folder' +## What is the sharef prefix of bowtie2 index? +BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index' +## What is the absolute path to command STAR? STAR: '/absolute/path/to/star' +## Whare is the directory to save STAR index? +STAR_INDEX_DIR: '/absolute/path/to/star/folder/' +#################################### ## Annotations ## -# Path to GTF/GFF file +#################################### +## Where is the GTF.gz/GFF.gz/GTF/GFF file? GFF: '/absolute/path/to/wonderful.gtf.gz' -# Refer: http://htseq.readthedocs.io/en/master/count.html +## Refer: http://htseq.readthedocs.io/en/master/count.html +## What is considered the "gene"? 'gene_name' or 'gene_id'? FEATURE_ID: 'gene_name' +## What is the content of the "gene"? Mostly 'exon'. FEATURE_CONTENT: 'exon' -# GENE_BIOTYPE: ['protein_coding', 'lincRNA'] +## Which type of genes to report? GENE_BIOTYPE: + - 'protein_coding' + - 'lincRNA' +## If nothing set as below, all genes are reported. +## GENE_BIOTYPE: -## Demultiplexing ## +#################################### +## Demultiplexing +#################################### FASTQ_QUAL_MIN_OF_BC: 10 CUT_LENGTH: 35 +SAVE_UNKNOWN_BC_FASTQ: false -## Alignment ## - -## UMI Count ## +#################################### +## UMI Count +#################################### ALN_QUAL_MIN: 0 -## Running Parameters ## +#################################### +## Running Parameters +#################################### num_threads: 15 verbose: True diff --git a/celseq2/version.py b/celseq2/version.py index 1e4826d..5bf52d5 100644 --- a/celseq2/version.py +++ b/celseq2/version.py @@ -1 +1 @@ -__version__ = '0.4.7' +__version__ = '0.4.8' diff --git a/celseq2/workflow/celseq2.snakemake b/celseq2/workflow/celseq2.snakemake index 4c095fd..fd85f68 100644 --- a/celseq2/workflow/celseq2.snakemake +++ b/celseq2/workflow/celseq2.snakemake @@ -18,6 +18,7 @@ import tempfile import shutil import os + ## Inforamtion ## # '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2' DIR_PROJ = config.get('output_dir', None) @@ -53,6 +54,7 @@ if not GENE_BIOTYPE: GENE_BIOTYPE = (); ## Demultiplexing ## FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10 CUT_LENGTH = config.get('CUT_LENGTH', None) # 35 +SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False ## Alignment ## ALIGNER = config.get('ALIGNER', None) # 'bowtie2', 'star' assert (ALIGNER), 'Error: Specify aligner.' @@ -65,8 +67,8 @@ STRANDED = config.get('stranded', 'yes') ## Running Parameters ## # bc_ids_used=config.get('bc_ids_used', None) # '1,2,3,4-5' -num_threads = config.get('num_threads', None) # 5 -verbose = config.get('verbose', None) # True +num_threads = config.get('num_threads', 16) # 5 +verbose = config.get('verbose', True) # True ####################### @@ -107,10 +109,10 @@ SUBDIRS = [SUBDIR_INPUT, SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_DIAG, SUBDIR_ANNO ] -aln_diagnose_item = ["_unmapped", - "_low_map_qual", '_multimapped', "_uniquemapped", - "_no_feature", "_ambiguous", - "_total"] +# aln_diagnose_item = ["_unmapped", +# "_low_map_qual", '_multimapped', "_uniquemapped", +# "_no_feature", "_ambiguous", +# "_total"] ## Dev ## ##################### @@ -121,39 +123,16 @@ workdir: DIR_PROJ rule all: input: + '_done_annotation', '_done_UMI', '_done_report', - '_done_annotation', - '_done_umimatrix_per_experiment', + output: + touch('_DONE'), run: if glob.glob('celseq2_job*.sh*'): + mkfolder(SUBDIR_QSUB) shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB)) - - -rule COUNT_MATRIX: - input: - csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'), - expid=list(set(sample_list))), - hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'), - expid=list(set(sample_list))), - - output: - touch('_done_UMI') - message: 'Finished counting UMI-count matrix.' - run: - if ALIGNER == 'star': - shell('rm {}'.format('_done_star_genome_loaded')) - print('Free memory loaded by STAR', flush=True) - with tempfile.TemporaryDirectory() as tmpdirname: - cmd = 'STAR ' - cmd += '--genomeLoad Remove ' - cmd += '--genomeDir {STAR_INDEX_DIR} ' - cmd += '--outFileNamePrefix ' - cmd += join_path(tmpdirname, '') - shell(cmd) - print_logger('UMI-count matrix is saved at {}'.format(input.csv)) - ''' rule celseq2: message: 'Finished Entire Pipeline.' @@ -201,112 +180,87 @@ rule celseq2: print_logger('Expression UMI matrix is saved at {}'.format(input.csv)) ''' -rule setup_dir: - input: SAMPLE_TABLE_FPATH - output: - touch('_done_setupdir'), - dir1 = SUBDIRS, - dir2 = expand(join_path('{subdir}', '{itemid}'), - subdir=[SUBDIR_INPUT, - SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG, - SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG], - itemid=item_names), - dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'), - zip, expid=sample_list, itemid=item_names), - - message: 'Setting up project directory.' - run: - for d in output.dir1: - mkfolder(d) - for d in output.dir2: - mkfolder(d) - for d in output.dir3: - mkfolder(d) +# rule setup_dir: +# input: SAMPLE_TABLE_FPATH +# output: +# touch('_done_setupdir'), +# dir1 = SUBDIRS, +# dir2 = expand(join_path('{subdir}', '{itemid}'), +# subdir=[SUBDIR_INPUT, +# SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_DIAG, +# SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_LOG], +# itemid=item_names), +# dir3 = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', '{itemid}'), +# zip, expid=sample_list, itemid=item_names), + +# message: 'Setting up project directory.' +# run: +# for d in output.dir1: +# mkfolder(d) +# for d in output.dir2: +# mkfolder(d) +# for d in output.dir3: +# mkfolder(d) -## HT-seq Count UMI ## -rule COOK_ANNOTATION: + +rule COUNT_MATRIX: input: - '_done_setupdir', + csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'), + expid=list(set(sample_list))), + hdf = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.h5'), + expid=list(set(sample_list))), output: - anno = join_path(DIR_PROJ, SUBDIR_ANNO, - base_name(GFF) + '.pickle'), - flag = '_done_annotation', - message: 'Cooking Annotation' + touch('_done_UMI') + message: 'Finished counting UMI-count matrix.' run: - _ = cook_anno_model(GFF, feature_atrr=FEATURE_ID, - feature_type=FEATURE_CONTENT, - gene_types=GENE_BIOTYPE, - stranded=True, - dumpto=output.anno, - verbose=verbose) - shell('touch {output.flag}') + if ALIGNER == 'star': + shell('rm {}'.format('_done_star_genome_loaded')) + print('Free memory loaded by STAR', flush=True) + with tempfile.TemporaryDirectory() as tmpdirname: + cmd = 'STAR ' + cmd += '--genomeLoad Remove ' + cmd += '--genomeDir {STAR_INDEX_DIR} ' + cmd += '--outFileNamePrefix ' + cmd += join_path(tmpdirname, '') + shell(cmd) + print_logger('UMI-count matrix is saved at {}'.format(input.csv)) -# Combo-demultiplexing -rule COMBO_DEMULTIPLEXING: +rule REPORT_ALIGNMENT_LOG: input: - flag2 = '_done_annotation', - flag1 = '_done_setupdir', + report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}', + 'alignment_'+ALIGNER+'.csv'), + itemid=item_names), output: - '_done_combodemultiplex', # dynamic() is not allowed if task being called on terminal - message: 'Performing combo-demultiplexing' - threads: len(item_names) - run: - # Demultiplx fastq in Process pool - p = Pool(threads) - for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2): - itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid) - try: - os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz')) - os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz')) - except OSError: - pass - itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid) - itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid, - 'demultiplexing.log') - print_logger('Demultiplexing {}'.format(itemid)) - cmd = " ".join(["bc_demultiplex", - itemr1, - itemr2, - "--bc-index {}".format(BC_INDEX_FPATH), - "--bc-seq-column {}".format(BC_SEQ_COLUMN), - "--bc-index-used {}".format(itembc), - "--min-bc-quality {}".format(FASTQ_QUAL_MIN_OF_BC), - "--umi-start-position {}".format( - UMI_START_POSITION), - "--bc-start-position {}".format(BC_START_POSITION), - "--umi-length {}".format(UMI_LENGTH), - "--bc-length {}".format(BC_LENGTH), - "--cut-length {}".format(CUT_LENGTH), - "--out-dir {}".format(itemid_fqs_dir), - "--is-gzip ", - "--stats-file {}".format(itemid_log)]) - p.apply_async(shell, args=(cmd,)) - p.close() - p.join() - shell('touch {output} ') + touch('_done_report') +# Combo-demultiplexing rule combo_demultiplexing: - input: - flag2 = '_done_annotation', - flag1 = '_done_setupdir', + input: SAMPLE_TABLE_FPATH, + # flag2 = '_done_annotation', + # flag1 = '_done_setupdir', output: fq = dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemid}', '{bc}.fastq')), message: 'Performing combo-demultiplexing' - threads: len(item_names) + params: + jobs = len(item_names), + save_unknown_bc_fastq = SAVE_UNKNOWN_BC_FASTQ, run: # Demultiplx fastq in Process pool - p = Pool(threads) + p = Pool(params.jobs) for itemid, itembc, itemr1, itemr2 in zip(item_names, bc_used, R1, R2): itemid_in = join_path(DIR_PROJ, SUBDIR_INPUT, itemid) + mkfolder(itemid_in) try: os.symlink(itemr1, join_path(itemid_in, 'R1.fastq.gz')) os.symlink(itemr2, join_path(itemid_in, 'R2.fastq.gz')) except OSError: pass itemid_fqs_dir = join_path(DIR_PROJ, SUBDIR_FASTQ, itemid) - itemid_log = join_path(DIR_PROJ, SUBDIR_DIAG, itemid, - 'demultiplexing.log') + + mkfolder(join_path(DIR_PROJ, SUBDIR_REPORT, itemid)) + itemid_log = join_path(DIR_PROJ, SUBDIR_REPORT, itemid, + 'demultiplexing.csv') print_logger('Demultiplexing {}'.format(itemid)) cmd = " ".join(["bc_demultiplex", itemr1, @@ -324,6 +278,9 @@ rule combo_demultiplexing: "--out-dir {}".format(itemid_fqs_dir), "--is-gzip ", "--stats-file {}".format(itemid_log)]) + if params.save_unknown_bc_fastq: + cmd += ' --save-unknown-bc-fastq ' + p.apply_async(shell, args=(cmd,)) p.close() p.join() @@ -427,11 +384,52 @@ if ALIGNER == 'star': pickle.dump(df, open(output.report, 'wb')) -rule count_umi: +## HT-seq Count UMI ## +rule COOK_ANNOTATION: input: - gff = join_path(DIR_PROJ, SUBDIR_ANNO, - base_name(GFF) + '.pickle'), + SAMPLE_TABLE_FPATH, + output: + anno_pkl = join_path(DIR_PROJ, SUBDIR_ANNO, base_name(GFF) + '.pickle'), + anno_csv = join_path(DIR_PROJ, SUBDIR_ANNO, base_name(GFF) + '.csv'), flag = '_done_annotation', + params: + gene_type = GENE_BIOTYPE + priority: 100 + message: 'Cooking Annotation' + run: + from genometools.ensembl.annotations import get_genes + + features, exported_genes = cook_anno_model( + GFF, feature_atrr=FEATURE_ID, + feature_type=FEATURE_CONTENT, + gene_types=[], # defautl to all genes + stranded=True, + dumpto=None, # manual export + verbose=verbose) + + if params.gene_type: + print_logger('Types of reported gene: {}.'.format( + ', '.join(params.gene_type))) + gene_df = get_genes(GFF, valid_biotypes = set(params.gene_type)) + exported_genes = sorted(gene_df['name'].values) + print_logger('Number of reported genes: {}.'.format( + len(exported_genes))) + gene_df.to_csv(output.anno_csv) + else: + shell('touch {output.anno_csv}') + + with open(output.anno_pkl, 'wb') as fh: + pickle.dump( (features, exported_genes), fh) + + shell('touch {output.flag}') + + +rule count_umi: + input: + # gff = join_path(DIR_PROJ, SUBDIR_ANNO, + # base_name(GFF) + '.pickle'), + # flag = '_done_annotation', + gff = rules.COOK_ANNOTATION.output.anno_pkl, sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemid}', '{bc}.sam'), output: umicnt = join_path(DIR_PROJ, SUBDIR_UMI_CNT, '{itemid}', '{bc}.pkl'), @@ -454,8 +452,9 @@ rule count_umi: # - regular umi-count using *_umicnt.pkl -> umi_count_matrix replicates/lanes per plate rule summarize_umi_matrix_per_item: input: - gff = join_path(DIR_PROJ, SUBDIR_ANNO, - base_name(GFF) + '.pickle'), + # gff = join_path(DIR_PROJ, SUBDIR_ANNO, + # base_name(GFF) + '.pickle'), + gff = rules.COOK_ANNOTATION.output.anno_pkl, umicnt = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_CNT, '{itemid}', '{bc}.pkl')), output: @@ -497,8 +496,9 @@ rule summarize_umi_matrix_per_item: # - merge umi-count using *_umiset.pkl -> correct umi count per experiment/plate rule summarize_umi_matrix_per_experiment: input: - gff = join_path(DIR_PROJ, SUBDIR_ANNO, - base_name(GFF) + '.pickle'), + # gff = join_path(DIR_PROJ, SUBDIR_ANNO, + # base_name(GFF) + '.pickle'), + gff = rules.COOK_ANNOTATION.output.anno_pkl, umiset = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_SET, '{itemid}', '{bc}.pkl')), output: @@ -548,12 +548,6 @@ rule summarize_umi_matrix_per_experiment: shell('touch {output.flag} ') -rule REPORT_ALIGNMENT_LOG: - input: - report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}', - 'alignment_'+ALIGNER+'.csv'), - itemid=item_names), - rule report_alignment_log: input: df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER, @@ -561,7 +555,6 @@ rule report_alignment_log: output: report = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemid}', 'alignment_'+ALIGNER+'.csv'), itemid=item_names), - flag = '_done_report', run: for item in item_names: @@ -582,7 +575,6 @@ rule report_alignment_log: report_names=logs_name_item, aligner_name=ALIGNER, savetocsv=report_item) - shell('touch {output.flag} ') # rule summarize_alignment_diagnose: # input: diff --git a/setup.py b/setup.py index 528acd1..bb075e5 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ 'pandas>=0.20.0', 'numpy>=1.12.0', 'tables>=3.4.2', + 'genometools', ] # do not require installation if built by ReadTheDocs