-
Notifications
You must be signed in to change notification settings - Fork 0
/
sandpi.py
81 lines (72 loc) · 4.51 KB
/
sandpi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
##sandpi.py
##Answer for Class
##DETERMINE S and PI
##input number of samples and a list of genera (separated by ',') to include for the genus
##EXAMPLE python sandpi.py 5 Homo,Gorilla
import sys
import pysam
import numpy as np
import random
pD="/home/melinda_yang/greatapes/" ##Set this to whatever directory your data is in
samplecount,genera=int(sys.argv[1]),sys.argv[2].split(',') ##Read input options into variables
bedfile=open(pD+"Intersect_filtered_cov8_chr21_rand1000.bed",'r') ##BED FILE
vcfin=pysam.VariantFile(pD+"greatapes.fixedchr21.vcf.gz") ##VCF FILE
header = list(vcfin.header.samples) ##Get list of all IDs in header
##### MAKE GENOTYPE ARRAY FOR ALL INDIVIDUALS
seqary=np.zeros((1,len(header))) ##EACH ROW IS A SNP, EACH COLUMN IS AN INDIVIDUAL
num=0 ##Index to count number of SNPs that we wanted to keep for analysis
for ind,line in enumerate(bedfile): ##Loop over every set of SNPs in BED file
x=line.split()
##if ind%100==0: print ind ##A way I check to see how long it's taking to go through bed file
for pos in range(int(x[1]),int(x[2])):
for entry in vcfin.fetch(x[0],pos-1,pos): ##Use .fetch (pysam method) to run through and grab the line for each position in the VCF
if len(entry.alleles) != 2: continue ##Check if biallelic; continue if not
ref,alt=tuple(entry.alleles) ##Not necessary for computation, but way to call alleles in vcf
##Grab genotypes for all individuals, marking them as 0 or 1 depending on genotype
genos=[]
for geno in str(entry).split('\t')[9:]:
vcfgeno=geno.split(':')[0]
if vcfgeno in ['1/1']: genos.append(1)
elif vcfgeno in ['0/0','./.']: genos.append(0) ##Treat ./. as fixed for reference allele
elif vcfgeno in ['1/0','0/1']: genos.append(random.choice([0,1])) ##Randomly choose allele for heterozygotes
else: print 'what is vcfgeno?',vcfgeno ##Just a check to make sure I don't miss anything
genos=np.array(genos)
if num==0: seqary=genos ##Create array of genotypes at first line with information
else: seqary=np.vstack((seqary,genos)) ##Add new genotypes for next line to array of genotypes
num+=1 ##Keeping track of every SNP we used in analysis
##Save all genotype information into an array using np.savetxt (use np.loadtxt to retrieve)
np.savetxt(pD+"greatapes_genos.txt",seqary,fmt="%.1f",comments="#".join(header))
#####
##If you do not want to make the genotype array every run, you can comment out everything
##included in the ##### above
seqary=np.loadtxt(pD+"greatapes_genos.txt") ##Loads genotype array if you ran the part in #####
##Find random set of indices (with count based on "samplecount") for each genera included.
##Add these indices to dictionary with keys as each genera
myapeinds={}
for ind,speciesind in enumerate(header):
genusname=speciesind.split('_')[0] ##Grab the genus from the entire individual's ID
if genusname not in myapeinds: myapeinds[genusname]=[ind] ##Initialize each dictionary entry
else: myapeinds[genusname].append(ind) ##Add new indices to correct dictionary entry
##Get random sample of each genus
for mygenus in genera: myapeinds[mygenus]=sorted(random.sample(myapeinds[mygenus],samplecount))
print myapeinds ##Dictionary of indices for each genus wanted
##This for loop goes through the genotype array and picks out the columns
##including the individuals I want to calculate S and pi
for myape in myapeinds:
myary=seqary[:,myapeinds[myape]] ##Get array of genotypes for individuals you want
##Find S: Add 1 for each site that is not fixed reference or fixed alternative
S=0
for i in np.sum(myary,axis=1):
if i not in [0,myary.shape[1]]: S+=1
#Calculate pi: For every pair of individuals, subtract their array of sites and find all
##non-zero entries (where the two individuals are dissimilar). These are given a value of 1
##using np.count_nonzero. Thus, summing across the new array of differences, gives the total
##number of differences between those two individuals, which are added to a list, for which
##calculate the average.
pairdiff=[]
for myind1 in range(myary.shape[1]):
for myind2 in range(myary.shape[1]):
if myind1>=myind2: continue ##Don't accidentally include a pair more than once!
else: pairdiff.append(np.sum(np.count_nonzero(myary[:,myind1]-myary[:,myind2])))
pi = sum(pairdiff)/float(len(pairdiff)) #Get average for pi
print myape, S, pi