Skip to content

Commit

Permalink
Merge pull request #16 from Puriney/master
Browse files Browse the repository at this point in the history
0.4.8
  • Loading branch information
Puriney authored Apr 3, 2018
2 parents ec5d06b + 9e4ced0 commit 51c02b7
Show file tree
Hide file tree
Showing 8 changed files with 204 additions and 178 deletions.
26 changes: 16 additions & 10 deletions celseq2/demultiplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
len_umi=6, len_bc=6, len_tx=35,
bc_qual_min=10,
is_gzip=True,
save_unknown_bc_fastq=False,
do_bc_rev_complement=False,
do_tx_rev_complement=False,
verbose=False):
Expand Down Expand Up @@ -153,12 +154,13 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,
try:
fhout = bc_fhout[cell_bc]
except KeyError:
fhout = bc_fhout['UNKNOWNBC_R1']
fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
"+", umibc_qualstr))
fhout = bc_fhout['UNKNOWNBC_R2']
fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq,
"+", tx_qualstr))
if save_unknown_bc_fastq:
fhout = bc_fhout['UNKNOWNBC_R1']
fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
"+", umibc_qualstr))
fhout = bc_fhout['UNKNOWNBC_R2']
fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq,
"+", tx_qualstr))
sample_counter['unknown'] += 1
continue

Expand All @@ -184,20 +186,20 @@ def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq,

def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath):
if stats_fpath is None:
stats_fpath = 'demultiplexing.log'
stats_fpath = 'demultiplexing.csv'
try:
fh_stats = open(stats_fpath, 'w')
except Exception as e:
raise Exception(e)
fh_stats.write('BC\tReads(#)\tReads(%)\n')
fh_stats.write('BC,Reads(#),Reads(%)\n')

for bc_id, bc_seq in dict_bc_id2seq.items():
# bc_id = '[{:04d}]'.format('-'.join(map(str, bc_id)))
formatter = '{:04d}-{}\t{}\t{:07.3f}\n'
formatter = '{:04d}-{},{},{:07.3f}\n'
fh_stats.write(formatter.format(bc_id, bc_seq, stats[bc_seq],
stats[bc_seq] / stats['total'] * 100))

formatter = '{}\t{}\t{:07.3f}\n'
formatter = '{},{},{:07.3f}\n'
fh_stats.write(formatter.format('saved', stats['saved'],
stats['saved'] / stats['total'] * 100))
fh_stats.write(formatter.format('unknown', stats['unknown'],
Expand Down Expand Up @@ -247,6 +249,9 @@ def main():
help='Length of CELSeq barcode (default=6)')
parser.add_argument('--cut-length', metavar='N', type=int, default=35,
help='Length of read on R2 to be mapped. (default=35)')
parser.add_argument('--save-unknown-bc-fastq',
dest='save_unknown_bc_fastq', action='store_true')
parser.set_defaults(save_unknown_bc_fastq=False)
parser.add_argument('--verbose', dest='verbose', action='store_true')
parser.set_defaults(verbose=False)

Expand All @@ -269,6 +274,7 @@ def main():
len_tx=args.cut_length,
bc_qual_min=args.min_bc_quality,
is_gzip=args.is_gzip,
save_unknown_bc_fastq=args.save_unknown_bc_fastq,
do_bc_rev_complement=False,
do_tx_rev_complement=False,
verbose=args.verbose)
Expand Down
40 changes: 20 additions & 20 deletions celseq2/dummy_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding',
transcript_id='tx0')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -88,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding',
transcript_id='tx0', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand Down Expand Up @@ -116,7 +116,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8',
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8',
transcript_id='tx8', gene_biotype='lincRNA')
tx = gtf_str(chrm='chr2', src=src,
feature='transcript',
Expand All @@ -129,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', gene_biotype='lincRNA',
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', gene_biotype='lincRNA',
transcript_id='tx8', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr2', src=src,
Expand Down Expand Up @@ -157,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA',
transcript_id='tx1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -170,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(3, 0, -1):
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA',
transcript_id='tx1', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand Down Expand Up @@ -198,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -211,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2, 0, -1):
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand All @@ -226,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
out.append(exon)
s_stream = e_stream + 1 + len_intron

tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -238,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
Expand All @@ -264,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding',
transcript_id='tx3')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -277,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding',
transcript_id='tx3', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand Down Expand Up @@ -306,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding',
transcript_id='tx4')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -319,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding',
transcript_id='tx4', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand Down Expand Up @@ -347,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding',
transcript_id='tx5')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -360,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(1, 0, -1):
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding',
transcript_id='tx5', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand Down Expand Up @@ -388,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding',
transcript_id='tx6')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -400,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding',
transcript_id='tx6', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
Expand All @@ -424,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding',
transcript_id='tx7')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -437,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding',
transcript_id='tx7', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand Down
2 changes: 1 addition & 1 deletion celseq2/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def ymdhms():
def print_logger(msg):
localtime = time.asctime(time.localtime(time.time()))
# sys.stderr.write("[ {} ] {}\n".format(localtime, msg))
print("[ {} ] {}\n".format(localtime, msg))
print("[ {} ] {}\n".format(localtime, msg), flush=True)


def mkfolder(dirpath):
Expand Down
8 changes: 7 additions & 1 deletion celseq2/prepare_annotation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pickle

from celseq2.helper import print_logger
# from genometools.ensembl.annotations import get_genes


def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
Expand Down Expand Up @@ -50,8 +51,13 @@ def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',

print_logger('Processed {:,} lines of GFF...'.format(i))

# Use genometools to select exported_genes
# if gene_types:
# exported_genes = get_genes(gff_fpath, valid_biotypes=set(gene_types))
# exported_genes = list(exported_genes['name'].values)

if exported_genes:
exported_genes = sorted(exported_genes)
exported_genes = tuple(sorted(exported_genes))
if dumpto:
with open(dumpto, 'wb') as fh:
pickle.dump((features, exported_genes), fh)
Expand Down
49 changes: 35 additions & 14 deletions celseq2/template/config.yaml
Original file line number Diff line number Diff line change
@@ -1,38 +1,59 @@
## CEL-seq2 Tech Setting ##
####################################
## Tech Setting
####################################
BC_INDEX_FPATH: '/absolute/path/to/wonderful_umi.tab'
BC_SEQ_COLUMN: 1
BC_SEQ_COLUMN: 0
BC_IDs_DEFAULT: '1-96'
UMI_START_POSITION: 0
UMI_LENGTH: 6
BC_START_POSITION: 6
BC_LENGTH: 6

## Tools ##
# 'bowtie2' 'star'
####################################
## Alignment Tools
####################################
## Which RNA-seq aligner to use? 'bowtie2' or 'star'
ALIGNER: 'bowtie2'
BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index'
## What is the absolute path to command bowtie2?
BOWTIE2: '/absolute/path/to/bowtie2'
STAR_INDEX_DIR: '/absolute/path/to/star/folder'
## What is the sharef prefix of bowtie2 index?
BOWTIE2_INDEX_PREFIX: '/absolute/path/to/bowtie2_index'
## What is the absolute path to command STAR?
STAR: '/absolute/path/to/star'
## Whare is the directory to save STAR index?
STAR_INDEX_DIR: '/absolute/path/to/star/folder/'

####################################
## Annotations ##
# Path to GTF/GFF file
####################################
## Where is the GTF.gz/GFF.gz/GTF/GFF file?
GFF: '/absolute/path/to/wonderful.gtf.gz'
# Refer: http://htseq.readthedocs.io/en/master/count.html
## Refer: http://htseq.readthedocs.io/en/master/count.html
## What is considered the "gene"? 'gene_name' or 'gene_id'?
FEATURE_ID: 'gene_name'
## What is the content of the "gene"? Mostly 'exon'.
FEATURE_CONTENT: 'exon'
# GENE_BIOTYPE: ['protein_coding', 'lincRNA']
## Which type of genes to report?
GENE_BIOTYPE:
- 'protein_coding'
- 'lincRNA'
## If nothing set as below, all genes are reported.
## GENE_BIOTYPE:

## Demultiplexing ##
####################################
## Demultiplexing
####################################
FASTQ_QUAL_MIN_OF_BC: 10
CUT_LENGTH: 35
SAVE_UNKNOWN_BC_FASTQ: false

## Alignment ##

## UMI Count ##
####################################
## UMI Count
####################################
ALN_QUAL_MIN: 0

## Running Parameters ##
####################################
## Running Parameters
####################################
num_threads: 15
verbose: True
2 changes: 1 addition & 1 deletion celseq2/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.4.7'
__version__ = '0.4.8'
Loading

0 comments on commit 51c02b7

Please sign in to comment.