-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
39 lines (34 loc) · 1.12 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
# Lookup tables keyed by nucleotide letter. parse() only tests whether a
# character is a key of `bases`; every value is the placeholder 1.
# `bases` holds both the lower- and upper-case forms of G, A, T, C and U.
bases = dict.fromkeys('gatcuGATCU', 1)
# `uppercase` holds upper-case G, A, T and C only.
# NOTE(review): 'U' is absent here although it is present in `bases`, and no
# code visible in this part of the file reads `uppercase` -- confirm intent.
uppercase = dict.fromkeys('GATC', 1)
"""
Takes in a path to a directory, then opens each file in the
directory (presumed to be in .dp format) and parses out the RNA
sequence. At most numRNA files are read. Returns a list of
(sequence, file path) tuples, one per file read.
"""
def parse(path, numRNA=None):
    """Collect the nucleotide sequence from each file directly inside *path*.

    Only regular files directly inside ``path`` are read; sub-directories are
    skipped.  Files are visited in sorted name order so that the ``numRNA``
    cut-off always selects the same files (``os.listdir`` on its own gives no
    ordering guarantee).

    Within a file, a line contributes to the sequence only when its first
    character is a key of the module-level ``bases`` table.  From such a line
    every character found in ``bases`` is kept and upper-cased; everything
    else (newline, digits, spaces, ...) is dropped.  The pieces from all
    contributing lines are concatenated in file order.

    Args:
        path: Directory containing the sequence files (presumably in .dp
            format -- the parser itself does not check this).
        numRNA: Maximum number of files to read.  It is applied as a slice
            bound, so ``None`` (the default) reads every file.

    Returns:
        A list of ``(sequence, file_path)`` tuples, one per file read.
        ``sequence`` is ``''`` when no line of the file starts with a base.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
    """
    # Find all regular files in the directory.  Sorting makes the selection
    # reproducible when numRNA truncates the list.
    file_paths = []
    for entry_name in sorted(os.listdir(path)):
        entry_path = os.path.join(path, entry_name)
        if os.path.isfile(entry_path):
            file_paths.append(entry_path)

    # Open each selected file and pull out its sequence.
    seqList = []
    for file_path in file_paths[:numRNA]:
        pieces = []
        # The context manager closes the handle even if reading fails.
        with open(file_path) as handle:
            for line in handle:
                # Lines yielded by file iteration are never empty, so
                # indexing the first character is safe.
                # NOTE(review): any line that merely *starts* with a base
                # letter (e.g. a header beginning with "A") is treated as
                # sequence data -- confirm this matches the input format.
                if line[0] not in bases:
                    continue
                pieces.append(''.join(ch.upper() for ch in line if ch in bases))
        seqList.append((''.join(pieces), file_path))
    return seqList
##parse('RNAseqs', 10)