Skip to content

Commit

Permalink
Merge pull request #15 from Puriney/master
Browse files Browse the repository at this point in the history
Release 0.4.7
  • Loading branch information
Puriney authored Mar 23, 2018
2 parents 4296672 + 243e6b6 commit ec5d06b
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 59 deletions.
62 changes: 32 additions & 30 deletions celseq2/dummy_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import argparse


def gtf_attr_str(gene_id=None, gene_name=None,
def gtf_attr_str(gene_id=None, gene_name=None, gene_biotype=None,
transcript_id=None, exon_num=None):
'''
Simple writer to fill 'attribute' column of GTF
Expand All @@ -26,6 +26,8 @@ def gtf_attr_str(gene_id=None, gene_name=None,
return(None)
out = "gene_id \"{gid}\"; gene_name \"{gname}\";".format(gid=gene_id,
gname=gene_name)
if gene_biotype:
out += " gene_biotype \"{biotype}\";".format(biotype=gene_biotype)
if transcript_id:
out += " transcript_id \"{txid}\";".format(txid=transcript_id)
if exon_num:
Expand Down Expand Up @@ -62,7 +64,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
frame = '.'
out = []
# g0
g_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0')
g_attr = gtf_attr_str(gene_id='g0', gene_name='celseq2_gene-0', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -73,7 +75,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0',
tx_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
transcript_id='tx0')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -86,7 +88,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0',
exon_attr = gtf_attr_str(gene_id='g0', gene_name='celseq_gene-0', gene_biotype='protein_coding',
transcript_id='tx0', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand All @@ -103,7 +105,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):

s_stream, e_stream = e_stream - 2 * len_exon - 1 * len_intron + 1, e_stream
# g8
g_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8')
g_attr = gtf_attr_str(gene_id='g8', gene_name='celseq2_gene-8', gene_biotype='lincRNA')
g = gtf_str(chrm='chr2', src=src,
feature='gene',
start=s_stream,
Expand All @@ -115,7 +117,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8',
transcript_id='tx8')
transcript_id='tx8', gene_biotype='lincRNA')
tx = gtf_str(chrm='chr2', src=src,
feature='transcript',
start=s_stream,
Expand All @@ -127,7 +129,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8',
exon_attr = gtf_attr_str(gene_id='g8', gene_name='celseq_gene-8', gene_biotype='lincRNA',
transcript_id='tx8', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr2', src=src,
Expand All @@ -144,7 +146,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):

s_stream = e_stream + 1 + len_intergenic
# g1
g_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1')
g_attr = gtf_attr_str(gene_id='g1', gene_name='celseq2_gene-1', gene_biotype='miRNA')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -155,7 +157,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1',
tx_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
transcript_id='tx1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -168,7 +170,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(3, 0, -1):
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1',
exon_attr = gtf_attr_str(gene_id='g1', gene_name='celseq_gene-1', gene_biotype='miRNA',
transcript_id='tx1', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand All @@ -185,7 +187,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):

s_stream = e_stream + 1 + len_intergenic
# g2
g_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2')
g_attr = gtf_attr_str(gene_id='g2', gene_name='celseq2_gene-2', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -196,7 +198,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.1')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -209,7 +211,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2, 0, -1):
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand All @@ -224,7 +226,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
out.append(exon)
s_stream = e_stream + 1 + len_intron

tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
tx_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -236,7 +238,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2',
exon_attr = gtf_attr_str(gene_id='g2', gene_name='celseq_gene-2', gene_biotype='protein_coding',
transcript_id='tx2.2', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
Expand All @@ -251,7 +253,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):

s_stream = e_stream + 1 + len_intergenic
# g3
g_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3')
g_attr = gtf_attr_str(gene_id='g3', gene_name='celseq2_gene-3', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -262,7 +264,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3',
tx_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
transcript_id='tx3')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -275,7 +277,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3',
exon_attr = gtf_attr_str(gene_id='g3', gene_name='celseq_gene-3', gene_biotype='protein_coding',
transcript_id='tx3', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand All @@ -293,7 +295,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
s_stream = e_stream + 1 + len_intergenic

# g4
g_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4')
g_attr = gtf_attr_str(gene_id='g4', gene_name='celseq2_gene-4', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -304,7 +306,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4',
tx_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
transcript_id='tx4')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -317,7 +319,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4',
exon_attr = gtf_attr_str(gene_id='g4', gene_name='celseq_gene-4', gene_biotype='protein_coding',
transcript_id='tx4', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand All @@ -334,7 +336,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):

# g5
s_stream, e_stream = e_stream - len_exon + 1, e_stream
g_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5')
g_attr = gtf_attr_str(gene_id='g5', gene_name='celseq2_gene-5', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -345,7 +347,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5',
tx_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
transcript_id='tx5')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -358,7 +360,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(1, 0, -1):
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5',
exon_attr = gtf_attr_str(gene_id='g5', gene_name='celseq_gene-5', gene_biotype='protein_coding',
transcript_id='tx5', exon_num=i)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand All @@ -375,7 +377,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):

s_stream = e_stream + 1 + len_intergenic
# g6
g_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6')
g_attr = gtf_attr_str(gene_id='g6', gene_name='celseq2_gene-6', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -386,7 +388,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6',
tx_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
transcript_id='tx6')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -398,7 +400,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=tx_attr)
fout.write('{}\n'.format(tx))
out.append(tx)
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6',
exon_attr = gtf_attr_str(gene_id='g6', gene_name='celseq_gene-6', gene_biotype='protein_coding',
transcript_id='tx6', exon_num=1)
exon = gtf_str(chrm='chr1', src=src,
feature='exon',
Expand All @@ -411,7 +413,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(exon))
out.append(exon)
# g7
g_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7')
g_attr = gtf_attr_str(gene_id='g7', gene_name='celseq2_gene-7', gene_biotype='protein_coding')
g = gtf_str(chrm='chr1', src=src,
feature='gene',
start=s_stream,
Expand All @@ -422,7 +424,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
attr=g_attr)
fout.write('{}\n'.format(g))
out.append(g)
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7',
tx_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
transcript_id='tx7')
tx = gtf_str(chrm='chr1', src=src,
feature='transcript',
Expand All @@ -435,7 +437,7 @@ def dummy_gtf(saveto=None, len_exon=100, len_intron=200, len_intergenic=300):
fout.write('{}\n'.format(tx))
out.append(tx)
for i in range(2):
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7',
exon_attr = gtf_attr_str(gene_id='g7', gene_name='celseq_gene-7', gene_biotype='protein_coding',
transcript_id='tx7', exon_num=i + 1)
s_stream, e_stream = s_stream, s_stream + len_exon - 1
exon = gtf_str(chrm='chr1', src=src,
Expand Down
58 changes: 44 additions & 14 deletions celseq2/prepare_annotation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,54 @@


def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
gene_types = (),
stranded=True, dumpto=None, verbose=False):
'''
Prepare a feature model.
Output: (features, exported_genes) where:
- features: HTSeq.GenomicArrayOfSets()
- exported_genes: a sorted list
For example, feature_atrr = 'gene_name', feature_type = 'exon',
gene_types = ('protein_coding', 'lincRNA'):
- features: all exons ~ all gnames mapping and ready for counting
- exported_genes: only protein_coding and lincRNA gnames are visible
Quantification used the full genes but only the selected genes are reported.
'''
features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
fh_gff = HTSeq.GFF_Reader(gff_fpath)
all_genes = set()
exported_genes = set()
i = 0
for gff in fh_gff:
if verbose and i % 100000 == 0:
print_logger('Processing {:,} lines of GFF...'.format(i))
i += 1

if gff.type == feature_type:
features[gff.iv] += gff.attr[feature_atrr].strip()
all_genes.add(gff.attr[feature_atrr].strip())
else:
pass
if gff.type != feature_type:
continue

features[gff.iv] += gff.attr[feature_atrr].strip()

if not feature_atrr.startswith('gene'):
exported_genes.add(gff.attr[feature_atrr].strip())
continue

if not gene_types:
exported_genes.add(gff.attr[feature_atrr].strip())
continue

if gff.attr.get('gene_biotype', None) in gene_types:
exported_genes.add(gff.attr[feature_atrr].strip())

# if i == 5000:
# break ## test mode
print_logger('Processed {:,} lines of GFF...'.format(i))

if exported_genes:
exported_genes = sorted(exported_genes)
if dumpto:
with open(dumpto, 'wb') as fh:
pickle.dump((features, all_genes), fh)
return((features, all_genes))
pickle.dump((features, exported_genes), fh)
return((features, exported_genes))


def main():
Expand All @@ -41,12 +65,17 @@ def main():
help='File path to GFF')
parser.add_argument('--feature-atrr', type=str,
default='gene_id',
help=('Reserved word for feature_atrr in GFF to be '
'called a \'gene\'.'))
help=('Reserved word for feature_atrr in GTF/GFF to be'
' considered a Gene, e.g., \"gene_id\".'))
parser.add_argument('--feature-type', type=str,
default='exon',
help=('Reserved word for feature type in GFF to be '
'the components of \'gene\'.'))
help=('Reserved word for feature type in 3rd column'
' in GTF/GFF to be the components of \'Gene\','
' e.g., \"exon\".'))
parser.add_argument('--gene-types', type=tuple,
default=(),
help=('Reserved words for the attrbute \"gene_biotype\"'
' (Gene type), e.g., protein_coding, lincRNA'))
parser.add_argument('--strandless', dest='stranded', action='store_false')
parser.set_defaults(stranded=True)
parser.add_argument('--dumpto', type=str, metavar='FILENAME',
Expand All @@ -59,6 +88,7 @@ def main():
_ = cook_anno_model(gff_fpath=args.gff_file,
feature_atrr=args.feature_atrr,
feature_type=args.feature_type,
gene_types=args.gene_types,
stranded=args.stranded,
dumpto=args.dumpto,
verbose=args.verbose)
Expand Down
2 changes: 2 additions & 0 deletions celseq2/template/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ GFF: '/absolute/path/to/wonderful.gtf.gz'
# Refer: http://htseq.readthedocs.io/en/master/count.html
FEATURE_ID: 'gene_name'
FEATURE_CONTENT: 'exon'
# GENE_BIOTYPE: ['protein_coding', 'lincRNA']
GENE_BIOTYPE:

## Demultiplexing ##
FASTQ_QUAL_MIN_OF_BC: 10
Expand Down
2 changes: 1 addition & 1 deletion celseq2/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.4.6'
__version__ = '0.4.7'
Loading

0 comments on commit ec5d06b

Please sign in to comment.