-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathremove_Hauturu_barcodes_from_BOLD.py
30 lines (27 loc) · 1.08 KB
/
remove_Hauturu_barcodes_from_BOLD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 21 16:01:34 2018
@author: dopheidea
"""
import os, glob, re
from Bio import SeqIO
# Exclude any Hauturu barcode sequences from downloaded BOLD database sequences
# Hauturu barcodes have GenBank IDs in the range KP420745 - KP422464
# http://gamon.webfactional.com/regexnumericrangegenerator/
# Pattern for the Hauturu GenBank accessions KP420745-KP422464.  Each branch
# of the alternation covers one digit-aligned slice of 20745-22464, and the
# trailing "$" requires the accession to be the last thing in the string that
# is searched.
reg = r"KP4(2074[5-9]|207[5-9][0-9]|20[89][0-9]{2}|21[0-9]{3}|22[0-3][0-9]{2}|224[0-5][0-9]|2246[0-4])$"
# Compile once; the pattern is applied to every record of every file.
hauturu_accession = re.compile(reg)

# NOTE: the working directory is hard-coded to the original author's machine;
# adjust it before running elsewhere.
os.chdir('G:/Documents/GitHub/Barcoding_invertebrate_biodiversity/BOLD_NZ_seqs_2018/')
files = glob.glob("*.fas")

for f in files:
    # IDs of the records written to / dropped from the current file's output.
    keep = []
    exclude = []
    # Strip only the final extension so that names which contain ".fas"
    # earlier on (e.g. "x.fasta_v2.fas") keep their full stem and cannot
    # collide with another input's output file.
    label = os.path.splitext(f)[0]
    # Open the output in write mode: with append mode a second run of the
    # script would duplicate every kept record in the existing *_keep.fasta.
    with open(f, "r") as infile, \
            open("{0}_keep.fasta".format(label), "w") as outfile:
        for seq in SeqIO.parse(infile, "fasta"):
            # NOTE(review): for FASTA input SeqIO sets seq.id to the header
            # text up to the first whitespace.  If the BOLD headers contain
            # spaces before the accession, the accession ends up in
            # seq.description instead and will not be matched here - confirm
            # against the downloaded files.
            if hauturu_accession.search(seq.id) is not None:
                exclude.append(seq.id)
                print("excluded:{0}".format(seq.id))
            else:
                keep.append(seq.id)
                SeqIO.write(seq, outfile, "fasta")