-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathpbcluster_upgma.pl
executable file
·120 lines (106 loc) · 2.34 KB
/
pbcluster_upgma.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/perl -w
#
# Author: Jue Ruan
#
use strict;
my @seqs = ();
while(<>){
my @ts = split;
push(@seqs, [$ts[0], uc $ts[1]]);
}
die("No sequences") if(@seqs < 1);
my $N = @seqs;
my $M = length $seqs[0][1];
my @weights = ();
for(my $i=0;$i<$M;$i++){
my %bases = ();
for(my $j=0;$j<$N;$j++){
my $c = substr($seqs[$j][1], $i, 1);
next if($c eq '-');
$bases{$c} ++;
}
my $sum = 0;
my $max = 0;
foreach my $c (keys %bases){
$sum += $bases{$c};
$max = $bases{$c} if($bases{$c} > $max);
}
my $weight = 1.0;
if($sum >= 4){
$weight = ($sum - $max) / $max * 2.0;
}
push(@weights, $weight);
}
my @mats = ();
# init matches matrix
for(my $i=1;$i<$N;$i++){
for(my $j=0;$j<$i;$j++){
my $mat = 0;
for(my $m=0;$m<$M;$m++){
my $a = substr($seqs[$i][1], $m, 1);
my $b = substr($seqs[$j][1], $m, 1);
$mat += $weights[$m] if($a eq $b and $a ne '-');
}
push(@mats, $mat);
}
}
my @tree = ();
#init nodes
for(my $i=0;$i<$N;$i++){
push(@tree, [[], -1, 0]);
}
#clustering
for(my $iter=0;$iter<$N-1;$iter++){
my $max = 0;
my $mi = 0;
my $mj = 0;
for(my $i=0;$i<@tree;$i++){
next if($tree[$i][1] != -1);
for(my $j=$i+1;$j<@tree;$j++){
next if($tree[$j][1] != -1);
my $mat = get_upgma_matrix($i, $j);
if($mat > $max){ $max = $mat; $mi = $i; $mj = $j; }
}
}
$tree[$mi][1] = @tree;
$tree[$mi][2] = sprintf("%0.2f", $max / 2);
$tree[$mj][1] = @tree;
$tree[$mj][2] = sprintf("%0.2f", $max / 2);
push(@tree, [[$mi, $mj], -1, 0]);
#update matches matrix
for(my $i=0;$i<@tree-1;$i++){
if($tree[$i][1] == -1){
push(@mats, (get_upgma_matrix($i, $mi) + get_upgma_matrix($i, $mj)) / 2);
} else {
push(@mats, 0);
}
}
}
#print results
print_tree(scalar(@tree) - 1, 0);
print "\n"; print STDERR "\n";
1;
sub get_upgma_matrix {
my ($i, $j) = @_;
my $idx = ($i < $j)? ($j * ($j - 1)) / 2 + $i : ($i * ($i - 1)) / 2 + $j;
return $mats[$idx];
}
sub print_tree {
my $idx = shift;
my $lv = shift;
my $node = $tree[$idx];
if(@{$node->[0]}){
print " " x $lv;
print "(\n"; print STDERR "(";
print_tree($node->[0][0], $lv + 1);
print ",\n"; print STDERR ",";
print_tree($node->[0][1], $lv + 1);
print "\n";
print " " x $lv;
print "):", $node->[2]; print STDERR "):", $node->[2];
} else {
#print " " x $lv;
print $seqs[$idx][0], ":", $seqs[$idx][1], ":", $node->[2];
print STDERR $seqs[$idx][0], ":", $node->[2];
}
}