-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuniprobe2meme
187 lines (163 loc) · 5.79 KB
/
uniprobe2meme
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/perl -w
#
# FILE: uniprobe2meme
# AUTHOR: Timothy L. Bailey
# CREATE DATE: 3/19/2009
# DESCRIPTION: Convert a file containing list of TFBS motif matrices from
# UNIPROBE to MEME output format.
use warnings;
use strict;
use lib qw(/home/jiang/meme/meme_4.12.0/lib/perl);
use Alphabet qw(dna rna);
use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file);
use Fcntl qw(O_RDONLY);
use Getopt::Long;
use Pod::Usage;
=head1 SYNOPSIS
uniprobe2meme [options]
Options:
-rna output an RNA database instead of a DNA database.
-skip <ID> skip this ID (may be repeated)
-numseqs <numseqs> assume frequencies based on this many
sequences; default: 20
-truncate_names truncate motif names at first underscore
-numbers use numbers instead of strings as motif names
-bg <background file> file with background frequencies of letters;
default: uniform background
-pseudo <total pseudocounts> add <total pseudocounts> times letter
background to each frequency; default: 0
-logodds print log-odds matrix, too;
default: print frequency matrix only
-url <website> website for the motif; The untruncated ID is
substituted for MOTIF_NAME; default: no url
-sg <sg_file_name> TSV file with motif name, ID and source publication
in columns 1, 2 & 6 (+1 to column# if first blank)
-sp <source_pub> only consider lines in <sg_file_name> that match
this source publication; default: use all lines
-h print usage message
Read a UNIPROBE (concatenated) matrix file and convert to MEME format.
Reads standard input.
Writes standard output
=cut
# Set option defaults
my %skips;
my $num_seqs = 20;
my $truncate_names = 0;
my $use_numbers = 0;
my $bg_file;
my $pseudo_total = 0; # default total pseudocounts
my $print_logodds = 0;
my $url_pattern = '';
my $sg_file_name = '';
my $help;
my $is_rna = 0;
my $source_pub = "";
GetOptions("rna" => \$is_rna,
"skip=s" => sub {$skips{$_[1]} = 1}, "numseqs=i" => \$num_seqs,
"truncate_names" => \$truncate_names, "numbers" => \$use_numbers,
"bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds,
"url=s" => \$url_pattern, "sg=s" => \$sg_file_name, "sp=s" => \$source_pub,
"h" => \$help) or pod2usage(2);
#check num seqs
pod2usage("Option -numseqs must have a positive value.") if ($num_seqs < 0);
#check pseudo total
pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0);
# output help if requested
pod2usage(1) if $help;
# Get the background model.
my %bg = &read_background_file($is_rna ? &rna() : &dna(), $bg_file); #DNA background file
# Get a dictionary of UniPROBE IDs indexed by motif name.
my %name_id_dict = get_name_id_dict($sg_file_name, $source_pub);
my %id_count = ();
# Read the input file.
my $num_skipped = 0;
my $num_motifs = 0;
my $num_bad = 0;
my (%motifs, $motif_name, $motif_freqs);
my %rows = ();
my $name;
while (my $line = <>) {
chomp($line);
next if ($line =~ /^\s*$/); # skip blank lines
# Split the line into name and everything else.
my ($first, $rest) = split('\s+', $line, 2);
if ($first =~ /^\s*([ACGT]):$/i) { # frequency line
my $base = uc($1);
$rows{$base} = $rest;
} else { # found start of next motif
#check if there was a previous motif
conditional_motif_output();
#record information for new motif
chomp($first);
$name = $first;
%rows = ();
}
}
#check if there was a previous motif
conditional_motif_output();
print(STDERR "Converted $num_motifs motifs.\n");
print(STDERR "Skipped $num_skipped motifs.\n");
sub conditional_motif_output {
if (%rows) {
if ($skips{$name}) {
$num_skipped++;
} else {
my $trunc_name = $name;
$trunc_name =~ s/^([^_]+)_.*$/$1/;
my $id = $name_id_dict{$trunc_name}; # look up ID
my $uniprobe;
if (! defined $id) {
$id = $name;
$uniprobe = $name;
} else {
$uniprobe = $id;
$uniprobe =~ s/^UP//;
# make the ID unique
my $ic = $id_count{$id};
if (! $ic) {
$ic = $id_count{$id} = 1;
}
$id_count{$id}++;
$id = $id . "_$ic";
}
my $url = $url_pattern;
$url =~ s/MOTIF_NAME/$uniprobe/g;
$name = $trunc_name if $truncate_names;
if ($use_numbers) {
$name = $num_motifs + 1;
}
my $matrix = $rows{A} . "\n" . $rows{C} . "\n" . $rows{G} . "\n" . $rows{T};
my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', $num_seqs, $pseudo_total, id => $id, alt => $name, url => $url);
print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
$num_bad++ unless $motif;
print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif;
}
}
}
sub trim {
my $s = shift;
$s =~ s/^\s+|\s+$//g;
return $s;
}
# Read a UniPROBE screen-grab and get the motif name and uniprobe ID from the first two columns.
# If source publication name is provided, only consider lines from screen grab for that paper.
sub get_name_id_dict {
my ($screen_grab_file_name, $source_pub) = @_;
my %dict = ();
if ($screen_grab_file_name) {
open(SG, "<$screen_grab_file_name") || die("Can't open screen-grab file: $!\n");
while(<SG>) {
my @words = split '\t';
shift @words if ($words[0] eq ""); # new format has an empty first field
my $name = trim($words[0]);
my $id = trim($words[1]);
if (int(@words) >= 5) {
my $pub = trim($words[5]);
next if ($source_pub && $pub ne $source_pub);
}
$dict{$name} = $id;
#print stderr "$name $id\n";
}
}
return(%dict);
}