-
Notifications
You must be signed in to change notification settings - Fork 0
/
loci2gphocs_klp_scrapped.py
79 lines (64 loc) · 3.22 KB
/
loci2gphocs_klp_scrapped.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# pyrad .loci to gphocs format conversion
#
# This is a very simple conversion because the formats are very similar.
#
# Usage: python loci2gphocs -f <my_input.loci> [-o <my_output>] [-v True]
#
# Isaac Overcast
# March 21, 2015
## import libraries
import sys, os
import argparse
#parser = argparse.ArgumentParser(prog='PROG')
#parser.add_argument("-f", dest="locifile", required=True, help="name of the loci input file being converted")
#parser.add_argument("-o", dest="outfile", default='output.gphocs', help="gphocs output file name")
#parser.add_argument("-v", dest="verbose", help="Set verbosity. Dump tons of info to the screen", default=False)
#options = parser.parse_args()
#infilename = options.locifile
#outfilename = '%s' %(options.outfile)
#infile = open( infilename )
#outfile = open( outfilename, 'w' )
infile = open(r"C:/Users/Kaiya/Documents/Columbia/Thesis/THESIS DATA/cardcard.loci")
outfile = open("C:/Users/Kaiya/Documents/Columbia/Thesis/THESIS DATA/cardcard_klp.gphocs","w")
## parse the loci
## Each set of reads at a locus is appended with a |,
## so after this call 'loci' will contain an array
## of sets of each read per locus.
loci = infile.read().strip().split("//")
## only examine loci w/ at least 1 outgroup sample
## and at least two inds from each focal populations
#if(options.verbose): print "Got nloci:", len(loci)
# Print the header, the number of loci in this file
print >>outfile, len(loci)-1
# iterate through each locus, print out the header for each locus:
# <locus_name> <n_samples> <locus_length>
# Then print the data for each sample in this format:
# <individual_name> <sequence>
for i, loc in enumerate(loci):
# The enumerate is going to return is a [] as the last element
# so if this is the last locus then we're done.
if i == len(loci)-1:
break
# Get the length of the sequence read at this locus so we can print the
# header. Basically you get the first line, then you take the 2nd element
first_sequence = loc.strip().split("\n")[0]
print("first",first_sequence)
sequence_length = len(first_sequence.strip().split()[1])
print("length",sequence_length)
#if(options.verbose): print "Length of locus", i, "=", sequence_length
# Separate out each sequence within the loc block. 'sequences'
# will now be a list strings containing name/sequence pairs.
# This is confusing, the [:-1] thing at the end means "all but the last
# line, since this is not a read, but is the line that contains info
# about positions of snps.
sequences = loc.strip().split("\n")[:-1]
# Print out the header for this locus
print >>outfile, 'locus'+ str(i), len(sequences), str( sequence_length )
# Iterate through each sequence read at this locus and write it to the file.
for sequence in sequences:
# Clean up the sequence data to make gphocs happy. Only accepts UPPER
# case chars for bases, and only accepts 'N' for missing data. Also,
# the .loci format prepends a '>' on to the individual names, so we have
# to clean this up which is what the [1:] is doing.
sequence = sequence[1:].upper().replace( '-', 'N' )
print >>outfile, sequence