uniprobe2meme

#!/usr/bin/perl -w
#
# FILE: uniprobe2meme
# AUTHOR: Timothy L. Bailey
# CREATE DATE: 3/19/2009
# DESCRIPTION: Convert a file containing list of TFBS motif matrices from 
# UNIPROBE to MEME output format. 

use warnings;
use strict;

use lib qw(/home/jiang/meme/meme_4.12.0/lib/perl);

use Alphabet qw(dna rna);
use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file);

use Fcntl qw(O_RDONLY);
use Getopt::Long;
use Pod::Usage;

=head1 SYNOPSIS

uniprobe2meme [options]

 Options: 
  -rna                          output an RNA database instead of a DNA database.
  -skip <ID>                    skip this ID (may be repeated)
  -numseqs <numseqs>            assume frequencies based on this many
                                sequences; default: 20
  -truncate_names               truncate motif names at first underscore
  -numbers                      use numbers instead of strings as motif names
  -bg <background file>         file with background frequencies of letters;
                                default: uniform background
  -pseudo <total pseudocounts>  add <total pseudocounts> times letter
                                background to each frequency; default: 0
  -logodds                      print log-odds matrix, too;
                                default: print frequency matrix only
  -url <website>                website for the motif; The untruncated ID is
                                substituted for MOTIF_NAME; default: no url
  -sg <sg_file_name>		TSV file with motif name, ID and source publication 
				in columns 1, 2 & 6 (+1 to column# if first blank)
  -sp <source_pub>		only consider lines in <sg_file_name> that match
				this source publication; default: use all lines
  -h                            print usage message

  Read a UNIPROBE (concatenated) matrix file and convert to MEME format.

  Reads standard input.
  Writes standard output

=cut

# Set option defaults
my %skips;
my $num_seqs = 20;
my $truncate_names = 0;
my $use_numbers = 0;
my $bg_file;
my $pseudo_total = 0; # default total pseudocounts
my $print_logodds = 0;
my $url_pattern = '';
my $sg_file_name = '';
my $help;
my $is_rna = 0;
my $source_pub = "";

GetOptions("rna" => \$is_rna, 
  "skip=s" => sub {$skips{$_[1]} = 1}, "numseqs=i" => \$num_seqs,
  "truncate_names" => \$truncate_names, "numbers" => \$use_numbers, 
  "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds,
  "url=s" => \$url_pattern, "sg=s" => \$sg_file_name, "sp=s" => \$source_pub, 
  "h" => \$help) or pod2usage(2);
#check num seqs
pod2usage("Option -numseqs must have a positive value.") if ($num_seqs < 0);
#check pseudo total
pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0);

# output help if requested
pod2usage(1) if $help;

# Get the background model.
my %bg = &read_background_file($is_rna ? &rna() : &dna(), $bg_file); #DNA background file

# Get a dictionary of UniPROBE IDs indexed by motif name.
my %name_id_dict = get_name_id_dict($sg_file_name, $source_pub);
my %id_count = ();

# Read the input file.
my $num_skipped = 0;
my $num_motifs = 0;
my $num_bad = 0;
my (%motifs, $motif_name, $motif_freqs);
my %rows = ();
my $name;
while (my $line = <>) {
  chomp($line);
  next if ($line =~ /^\s*$/); # skip blank lines

  # Split the line into name and everything else.
  my ($first, $rest) = split('\s+', $line, 2);
  if ($first =~ /^\s*([ACGT]):$/i) { # frequency line
    my $base = uc($1);
    $rows{$base} = $rest;
  } else { # found start of next motif
    #check if there was a previous motif
    conditional_motif_output();
    #record information for new motif
    chomp($first);
    $name = $first;
    %rows = ();
  }
}
#check if there was a previous motif
conditional_motif_output();


print(STDERR "Converted $num_motifs motifs.\n");
print(STDERR "Skipped $num_skipped motifs.\n");


sub conditional_motif_output {
  if (%rows) {
    if ($skips{$name}) {
      $num_skipped++;
    } else {
      my $trunc_name = $name;
      $trunc_name =~ s/^([^_]+)_.*$/$1/;
      my $id = $name_id_dict{$trunc_name};	# look up ID
      my $uniprobe;
      if (! defined $id) {
        $id = $name;
        $uniprobe = $name;
      } else {
        $uniprobe = $id;
        $uniprobe =~ s/^UP//;
        # make the ID unique
        my $ic = $id_count{$id}; 
        if (! $ic) {
          $ic = $id_count{$id} = 1;
        }
        $id_count{$id}++;
        $id = $id . "_$ic";
      }
      my $url = $url_pattern;
      $url =~ s/MOTIF_NAME/$uniprobe/g;
      $name = $trunc_name if $truncate_names;
      if ($use_numbers) {
        $name = $num_motifs + 1;
      }
      my $matrix = $rows{A} . "\n" . $rows{C} . "\n" . $rows{G} . "\n" . $rows{T};
      my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', $num_seqs, $pseudo_total, id => $id, alt => $name, url => $url);
      print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
      $num_bad++ unless $motif;
      print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif;
    }
  }
}

sub trim { 
  my $s = shift; 
  $s =~ s/^\s+|\s+$//g;
  return $s;
}

# Read a UniPROBE screen-grab and get the motif name and uniprobe ID from the first two columns.
# If source publication name is provided, only consider lines from screen grab for that paper. 
sub get_name_id_dict {
  my ($screen_grab_file_name, $source_pub) = @_;
  my %dict = ();
  if ($screen_grab_file_name) {
    open(SG, "<$screen_grab_file_name") || die("Can't open screen-grab file: $!\n");
    while(<SG>) {
      my @words = split '\t';
      shift @words if ($words[0] eq "");	# new format has an empty first field
      my $name = trim($words[0]);
      my $id = trim($words[1]);
      if (int(@words) >= 5) {
	my $pub = trim($words[5]);
	next if ($source_pub && $pub ne $source_pub);
      }
      $dict{$name} = $id;
      #print stderr "$name $id\n";
    }
  }
  return(%dict);
}