-
Notifications
You must be signed in to change notification settings - Fork 5
/
fst_stats.pl
executable file
·330 lines (262 loc) · 9.54 KB
/
fst_stats.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/perl
#fst_stats.pl by Megan Supple
#created 11 Mar 2012
#last modified 16 August 2013
#
#script to calculate sliding window Fst and baseline Fst with optional jackknife or bootstrap confidence intervals from per position Fst
#usage fst_stats.pl [-jack] [-boot] InputFstFile.txt WindowSize StepSize MinPropSites MinPropIndivs [BootReps]
# InputFstFile.txt is the fst output file from fasta2popgen.pl
# contig, position, pop1#genoed, pop2#genoed, s1, s2, fst
# WindowSize is the size of the sliding window
# StepSize is the size of the step when moving the sliding window
# MinPropSites is the minimum proportion of markers needed to calculate Fst at a window
# MinPropIndivs is the minimum proportion of individuals of each population needed to use a site
# BootReps is the number of bootstrap replicates to do (required only with -boot)
use lib $ENV{PERL5LIB};
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
#usage message; printed whenever the command line cannot be interpreted
my $usage = "Usage: fst_stats.pl [options] <InputFstFile.txt> <WindowSize> <StepSize> <MinPropSites> <MinPropIndivs> <BootReps>
arguments (required):
<InputFstFile.txt> is per position fst from fasta2popgen.pl
<WindowSize> is the size of the sliding window
<StepSize> is the size of the step when moving the sliding window
<MinPropSites> is the minimum proportion of markers needed to calculate Fst at a window
<MinPropIndivs> is the minimum proportion of individuals of each population needed to use a site
<BootReps> is the number of bootstrap replicates to do (required only for boot option)
options:
-jack calculate a jackknife n-1 confidence interval for the contig
-boot calculate a bootstrap confidence interval for the contig
";
#option flags; each stays undefined unless its switch is present
my $jackknife;
my $bootstrap;
#GetOptions returns false on an unrecognised switch; treat that as a usage error
GetOptions ( "jack" => \$jackknife,
             "boot" => \$bootstrap, ) or die "$usage";
die "$usage" unless (@ARGV >= 5);
#read in command line arguments
my ($infst,$windowSize,$stepSize,$minProp, $minPropInds, $BootReps)=@ARGV;
#the window can only advance with a positive step, and a window needs a positive size
my $positive_number=qr/\A(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?\z/;
for my $param (["WindowSize", $windowSize], ["StepSize", $stepSize])
{
    my ($name, $value)=@$param;
    die "$name must be a positive number (got '$value')\n$usage"
        unless ($value=~$positive_number && $value>0);
}
#the replicate count is only needed (and only checked) when bootstrapping was requested
if ($bootstrap)
{
    die "BootReps must be a positive number when -boot is given\n$usage"
        unless (defined $BootReps && $BootReps=~$positive_number && $BootReps>0);
}
#open input file and output file for sliding window fst and baseline fst
#three-argument open keeps characters in the file name from being read as an open mode
open(INFST, '<', $infst) or die "can't open input fst file. $!\n";
#the sliding window output file name records the parameters of the run
my $outfile="slidingFst_" . $windowSize . "x" . $stepSize . "x" . $minProp . "x" . $minPropInds .".txt";
open(OUTFST, '>', $outfile) or die "can't open output file $outfile. $!\n";
#baseline results always go to baseline.txt in the current directory
open(BASE, '>', "baseline.txt") or die "can't open output file baseline.txt. $!\n";
#print input parameters to output file
print OUTFST "####################\n";
print OUTFST "#input fst file=$infst\n";
print OUTFST "#window size=$windowSize\n";
print OUTFST "#step size=$stepSize\n";
print OUTFST "#minimum proportion of sites=$minProp\n";
print OUTFST "#minimum proportion of individual=$minPropInds\n";
print BASE "#input fst file=$infst\n";
#read header info from input file and write to output file
#the header is read as seven lines: border, pop1 size, pop1 samples, pop2 size, pop2 samples, border, column names
my $pop1n; my $pop2n;
my $line;
my @entry;
#fetch the next header line, stopping with a clear message if the file ends early
my $read_header_line=sub
{
    my ($what)=@_;
    my $text=<INFST>;
    die "input fst file ended before the $what line of the header\n" unless (defined $text);
    return $text;
};
#pull the value after the first '=' out of a sample size line and make sure it starts with a number
my $parse_sample_size=sub
{
    my ($text, $what)=@_;
    chomp $text;
    @entry=split("=",$text);
    die "can't read a sample size from the $what line of the header: $text\n"
        unless (defined $entry[1] && $entry[1]=~/\A\s*\d/);
    return $entry[1];
};
$line=$read_header_line->("first border"); print OUTFST "$line"; #read in header border
$line=$read_header_line->("population 1 sample size"); print OUTFST "$line"; #read in population 1 sample size
$pop1n=$parse_sample_size->($line, "population 1 sample size");
$line=$read_header_line->("population 1 sample list"); print OUTFST "$line"; #read in population 1 sample list
$line=$read_header_line->("population 2 sample size"); print OUTFST "$line"; #read in population 2 sample size
$pop2n=$parse_sample_size->($line, "population 2 sample size");
$line=$read_header_line->("population 2 sample list"); print OUTFST "$line"; #read in population 2 sample list
$line=$read_header_line->("second border"); print OUTFST "$line"; #read in header border
$line=$read_header_line->("column header"); #read column header (not copied to the output)
print "n1=$pop1n and n2=$pop2n\n";
#column headers for the two output files
print OUTFST "contig\tposition\ts1\ts2\tFst\n";
print BASE "contig\ts1\ts2\tlowCI\tbaseline_Fst\thighCI\n";
print "calculating sliding window Fst\n";
#declare variables to track window (both bounds are inclusive)
my $window_start=1;
my $window_end=$windowSize;
#declare other variable
my $contig; #contig name from the most recent data line
my $fst; #Fst of the final window, set after the loop
my $mid_pos; #position reported for a window: int(start + (size-1)/2)
my @current_set=(); #an array of arrays holding the usable markers inside the current window
my $contig_s1=0;
my $contig_s2=0; #running sums of s1 and s2 over every usable site (for baseline Fst)
my @jack_set=(); #set of all usable sites, kept for resampling to get CI
my $warned_out_of_order=0; #so the out-of-order warning is printed only once
#the window only ever moves forward by the step, so a non-positive step would never reach later markers
die "step size must be greater than zero\n" unless ($stepSize>0);
#process input fst file until EOF
while($line=<INFST>)
{
    #break up line into component parts
    #columns used: 0 contig, 1 position, 2 pop1 genotyped, 3 pop2 genotyped, 4 s1, 5 s2
    my @entry=split(" ", $line);
    #blank or truncated lines carry no usable position, so skip them
    next if (@entry<6);
    $contig=$entry[0];
    my $pos=$entry[1];
    #a site is usable when enough individuals of each population were genotyped
    my $usable=($entry[2]>=$minPropInds*$pop1n && $entry[3]>=$minPropInds*$pop2n);
    #add usable sites to baseline s1 and s2 and to the resampling set
    if ($usable)
    {
        $contig_s1+=$entry[4];
        $contig_s2+=$entry[5];
        push (@jack_set, [@entry]);
    }
    #calculate sliding window Fst
    #the window never moves backwards, so a marker before the window start can't be placed in any window
    if ($pos<$window_start)
    {
        warn "warning: position $pos on $contig lies before the current window start ($window_start); such sites are left out of the sliding windows (is the input sorted by position?)\n"
            unless ($warned_out_of_order++);
        next;
    }
    #marker is past the window: write out windows and step forward until the window end reaches it
    while ($pos>$window_end)
    {
        $mid_pos=int($window_start+.5*($windowSize-1));
        #only sum s1 and s2 when the window holds enough markers
        my $enough_markers=!(@current_set<$windowSize*$minProp);
        my $sum_s1=0;
        my $sum_s2=0;
        if ($enough_markers)
        {
            for my $site (@current_set)
            {
                $sum_s1+=$site->[4];
                $sum_s2+=$site->[5];
            }
        }
        #fst is sum of s1 divided by sum of s2; too few markers or a zero s2 sum gives NAs
        if ($enough_markers && $sum_s2!=0)
        {
            my $window_fst=$sum_s1/$sum_s2;
            print OUTFST "$contig\t$mid_pos\t$sum_s1\t$sum_s2\t$window_fst\n";
        }
        else
        {
            print OUTFST "$contig\t$mid_pos\tNA\tNA\tNA\n";
        }
        #move the window forward one step
        $window_start+=$stepSize;
        $window_end=$window_start+$windowSize-1;
        #keep only the markers that are still inside the new window
        @current_set=grep { $_->[1]>=$window_start && $_->[1]<=$window_end } @current_set;
    }
    #when the step is larger than the window a marker can fall in the gap between windows;
    #only markers inside the window are kept, and only if enough samples
    if ($usable && $pos>=$window_start)
    {
        push (@current_set, [@entry]);
    }
}
#input is exhausted: write out the window that was still being filled
#its reported position is the same midpoint formula used for every window
$mid_pos=int($window_start+.5*($windowSize-1));
#start from the NA row and replace it only if an Fst can be calculated
my $final_row="$contig\t$mid_pos\tNA\tNA\tNA\n";
unless (@current_set<$windowSize*$minProp)
{
    #enough markers: fst is the sum of s1 divided by the sum of s2
    my ($sum_s1, $sum_s2)=(0, 0);
    foreach my $site (@current_set)
    {
        $sum_s1+=$site->[4];
        $sum_s2+=$site->[5];
    }
    #eval leaves fst undefined when the division fails (s2 sum of zero)
    $fst=eval{$sum_s1/$sum_s2};
    $final_row="$contig\t$mid_pos\t$sum_s1\t$sum_s2\t$fst\n" if (defined $fst);
}
#print results to outfile
print OUTFST $final_row;
#after EOF calculate baseline fst plus confidence intervals
#baseline fst is the sum of s1 over the sum of s2 for all usable sites; undefined when the s2 sum is zero
my $baseline=($contig_s2!=0) ? $contig_s1/$contig_s2 : undef;
#an undefined baseline is reported as NA
my $baseline_out=(defined $baseline) ? $baseline : "NA";
#calculate CI
my @resample_set; #one [s1, s2] pair per variable site
my @fst_dist; #resampled fst values; undefined where the s2 sum was zero
my @fst_sort; #the defined values of @fst_dist in ascending numeric order
my $i_low;
my $i_high;
#return the (low, high) bounds of the central 95% of @fst_dist, or NA for both when no value is defined
my $ci_bounds=sub
{
    #fst values are numbers (possibly negative or in exponent notation), so the sort must be numeric
    @fst_sort=sort { $a <=> $b } grep { defined } @fst_dist;
    return ("NA", "NA") unless (@fst_sort);
    $i_low=int(@fst_sort*0.025);
    $i_high=int(@fst_sort*0.975);
    return ($fst_sort[$i_low], $fst_sort[$i_high]);
};
if ($jackknife || $bootstrap)
{
    #iterate over sites
    for my $site (@jack_set)
    {
        #if position is variable (non-zero s2) put it in array for resampling
        next if ($site->[5]==0);
        push (@resample_set, [$site->[4], $site->[5]]);
    }
    my $locus=@resample_set;
    #print the number of variable sites
    print "$locus variable sites\n";
    if($jackknife)
    {
        print "jackknifing\n";
        @fst_dist=();
        #iterate over loci
        for (my $i=0;$i<$locus;$i++)
        {
            #calculate fst for all sites but the current and add it to the distribution of fsts
            my $rest_s1=$contig_s1-$resample_set[$i][0];
            my $rest_s2=$contig_s2-$resample_set[$i][1];
            #with a single variable site nothing is left after removing it, so the value stays undefined
            $fst_dist[$i]=($rest_s2!=0) ? $rest_s1/$rest_s2 : undef;
        }
        my ($low, $high)=$ci_bounds->();
        print BASE "$contig\t$contig_s1\t$contig_s2\t$low\t$baseline_out\t$high\t(jackknife)\n";
    }
    if($bootstrap)
    {
        print "bootstrapping\n";
        @fst_dist=();
        #without a replicate count no replicates are drawn and the interval is reported as NA
        my $reps=(defined $BootReps) ? $BootReps : 0;
        warn "warning: -boot given without BootReps; bootstrap confidence interval reported as NA\n" unless ($reps);
        #iterate over bootstrap replicates
        for (my $rep=0;$rep<$reps;$rep++)
        {
            my $s1=0; my $s2=0;
            #resample same amount of loci, with replacement
            for (my $i=0;$i<$locus;$i++)
            {
                #randomly sample loci and record s1 and s2
                my $rand_num=int(rand($locus));
                $s1+=$resample_set[$rand_num][0];
                $s2+=$resample_set[$rand_num][1];
            }
            #calculate fst; undefined when there were no variable sites to draw from
            $fst_dist[$rep]=($s2!=0) ? $s1/$s2 : undef;
        }
        my ($low, $high)=$ci_bounds->();
        print BASE "$contig\t$contig_s1\t$contig_s2\t$low\t$baseline_out\t$high\t(bootstrap)\n";
    }
}
else {print BASE "$contig\t$contig_s1\t$contig_s2\tNA\t$baseline_out\tNA\n";}
#close the file handles
close INFST;
#buffered write errors only show up at close, so check the two output handles
close(OUTFST) or die "error closing sliding window fst output file. $!\n";
close(BASE) or die "error closing baseline.txt. $!\n";
print "done!\n";