-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathfilter_homopolymer_nucleotides.pl
executable file
·124 lines (95 loc) · 3.89 KB
/
filter_homopolymer_nucleotides.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
use Getopt::Long;
use FindBin;
use lib "$FindBin::Bin/../";
use snpir::config;
################################################################################
################################################################################
# $Revision: $
# Authors: Robert Piskol ( [email protected] ), Gokul Ramaswami ( [email protected] )
# Last modification $Author: piskol $
# perl script that screens if the variant is located in a homopolymer
#inputs: input file of edits, output file name
my ($INPUTFILE,$OUTPUTFILE,$REFERENCEGENOME,$HELP);
parse_command_line();
sub parse_command_line {
my $help;
usage() if (scalar @ARGV==0);
&GetOptions(
"infile=s" => \$INPUTFILE,
"outfile=s" => \$OUTPUTFILE,
"refgenome=s" => \$REFERENCEGENOME,
"help" => \$HELP
);
usage() if ($help);
if(!$INPUTFILE || !$OUTPUTFILE || !$REFERENCEGENOME){
usage();
}
}
my ($leftbuffer, $rightbuffer) = (4,4); #sequence buffer length on both sides
my $tmpfile = join '', $INPUTFILE, '.nuctmp';
my $tmpfafile = join '', $INPUTFILE, '.tmpnuctmp';
open (SITES, "<", $INPUTFILE ) or die "error opening inputfile: $!\n"; #Read INPUTFILE
open (OUTPUT, ">", $OUTPUTFILE) or die "error opening outputfile: $!\n";
open (OUTPUT_failed, ">", $OUTPUTFILE."_failed") or die "error opening outputfile: $!\n";
while(<SITES>) { #Read INPUTFILE
chomp;
my $line = $_;
my @fields = split;
my ($chromname, $position, $editnuc) = ($fields[0], $fields[1], $fields[4]); #### CAN CHANGE $editnuc to $fields[X] for your input file
my $homopol = 0;
open (TMP, ">", $tmpfile);
my ($startpos, $endpos) = ($position-$leftbuffer,$position+$rightbuffer+1);
print TMP "$chromname\t$startpos\t$endpos\n";
#print "$chromname\t$startpos\t$endpos\n";
close TMP;
system("$FASTAFROMBED -fi $REFERENCEGENOME -bed $tmpfile -fo $tmpfafile"); #get sequence
#print ("/srv/gs1/projects/li/shared-data/software/BEDTools-Version-2.12.0/bin/fastaFromBed -fi /srv/gs1/projects/li/shared-data/reference-genomes/$refspec/softmasked/chr/$chromname.fa -bed $tmpfile -fo $tmpfafile"); #get sequence
my $seq = '';
open (TMPFAFILE, "<", $tmpfafile);
while(<TMPFAFILE>) {
chomp;
next if ($_ =~ m/\>/);
$seq = join '', $seq, $_; #get sequence flanking variant
}
close TMPFAFILE;
system("rm $tmpfafile");
system("rm $tmpfile");
my @splitseq = split(//,$seq);
$editnuc = uc $editnuc;
for (my $i = 0; $i < @splitseq; $i++) {
$splitseq[$i] = uc $splitseq[$i];
}
$homopol = 1 if ($editnuc =~ $splitseq[0] && $editnuc =~ $splitseq[1] && $editnuc =~ $splitseq[2] && $editnuc =~ $splitseq[3] );
$homopol = 1 if ($editnuc =~ $splitseq[1] && $editnuc =~ $splitseq[2] && $editnuc =~ $splitseq[3] && $editnuc =~ $splitseq[5] );
$homopol = 1 if ($editnuc =~ $splitseq[2] && $editnuc =~ $splitseq[3] && $editnuc =~ $splitseq[5] && $editnuc =~ $splitseq[6] );
$homopol = 1 if ($editnuc =~ $splitseq[3] && $editnuc =~ $splitseq[5] && $editnuc =~ $splitseq[6] && $editnuc =~ $splitseq[7] );
$homopol = 1 if ($editnuc =~ $splitseq[5] && $editnuc =~ $splitseq[6] && $editnuc =~ $splitseq[7] && $editnuc =~ $splitseq[8] );
if(!$homopol){
print OUTPUT "$line\n"; #check if homopolymer
}
else{
print OUTPUT_failed "$line\n";
}
#print "homopol\n" if $homopol;
}
close SITES;
close OUTPUT;
close OUTPUT_failed;
sub usage(){
print<<EOF;
Homopolymer filter, by Robert Piskol (piskol\@stanford.edu) & Gokul Ramaswami (gokulr\@stanford.edu) 07/25/2013
This program takes a variant file, and a reference genome and removes variants that are located
whtin homopolymers.
usage: $0 -infile FILE -outfile FILE -refgenome FILE
Arguments:
-infile FILE - File containing a list of variants to be filtered
-outfile FILE - Output file for filtered variants
-refgenome FILE - File in FASTA format containing the reference genome of the species
-help - Show this help screen
EOF
exit 1;
}