-
Notifications
You must be signed in to change notification settings - Fork 31
/
barcode2fqname.py
executable file
·46 lines (42 loc) · 1.73 KB
/
barcode2fqname.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python2
# Process PE reads and insert barcode of given size (size) into fq name.
# New files are written in current directory!
# USAGE: barcode2fqname.py size reads_?.fq.gz
import os, sys, gzip, subprocess
from itertools import izip as zip
def fastq_iterator(handle):
"""Return FastQ fields"""
data = []
for i, l in enumerate(handle):
if not i%4:
if len(data)==4: yield data
data = []
data.append(l[:-1])
if len(data)==4: yield data
def barcode2fqname(size, fnames):
"""Process barcoded FastQ and insert barcode into name field"""
if len(fnames)!=2:
sys.stderr.write("Provide 2 FastQ files!\n")
return
barcodes = {}
i = 0
ofnames = map(os.path.basename, fnames)
for fn in ofnames:
if os.path.isfile(fn):
sys.stderr.write("File exist: %s\n"%fn)
return
handle1, handle2 = [subprocess.Popen(['zcat', fn], stdout=subprocess.PIPE).stdout for fn in fnames]
#handle1, handle2 = [gzip.open(fn) for fn in fnames]
out1, out2 = [gzip.open(fn, "w") for fn in ofnames]
for i, (fq1, fq2) in enumerate(zip(fastq_iterator(handle1), fastq_iterator(handle2))):
if not i%1e3: sys.stderr.write(' %s %s\r'%(i, len(barcodes)))
barcode = fq1[1][:size]
if barcode not in barcodes:
barcodes[barcode] = 1
else:
barcodes[barcode] += 1
out1.write("@%s.%s/1\n%s\n+\n%s\n"%(barcode, barcodes[barcode], fq1[1][size:], fq1[3][size:]))
out2.write("@%s.%s/2\n%s\n+\n%s\n"%(barcode, barcodes[barcode], fq2[1], fq2[3]))
sys.stdout.write("%s barcodes in %s sequences\n"%(len(barcodes), i))
if __name__=="__main__":
barcode2fqname(int(sys.argv[1]), sys.argv[2:4])