Skip to content

Commit

Permalink
various RHEL7 fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
adkinsrs committed Jun 27, 2017
1 parent 86d20a0 commit 7eab628
Show file tree
Hide file tree
Showing 5 changed files with 218 additions and 9 deletions.
2 changes: 1 addition & 1 deletion components/jaccard/jaccard.i1.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
<type>RunUnixCommand</type>
<name>jaccard2fasta</name>
<state>incomplete</state>
<executable>$;BIN_DIR$;/CogProteinFasta</executable>
<executable>$;BIN_DIR$;/CogProteinFastaNoMap</executable>
<param>
<key>--cogFile</key>
<value>$;OUTPUT_DIRECTORY$;/$;ITERATOR_NAME$;/g$;GROUP_NUMBER$;/$;I_GENOME$;.jkcluster.out</value>
Expand Down
2 changes: 2 additions & 0 deletions doc/CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
next
---
- FIX - Fixed an issue with JOC pipeline where CogProteinFasta needed a map file
- This is only true for mugsy-comparative, so I split the two script versions and named this CogProteinFastaNoMap
- NEW - Added fastortho component
- NEW - Added LGT Phylogenetic pipeline
- NEW - lgt_bwa will now use scratch as the TMP_DIR env variable instead of /tmp
Expand Down
215 changes: 215 additions & 0 deletions src/perl/CogProteinFastaNoMap.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#!/usr/bin/perl

use strict;
use BSML::BsmlParserSerialSearch;
use Ergatis::Logger;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);

my %options = ();
my $results = GetOptions( \%options, 'cogFile|c=s', 'bsmlModelList|m=s', 'outputDir|o=s', 'outputToken=s', 'maxCogSeqCount|s=s', 'extension|e=s', 'use_feature_ids_in_fasta=i', 'log|l=s', 'debug=s');

my $cogFile = $options{'cogFile'};
my $bsmlModelList = $options{'bsmlModelList'};
my $outDir = $options{'outputDir'};
my $maxCogSeqCount = $options{'maxCogSeqCount'};
my $outputToken = $options{'outputToken'};
$options{'outputToken'} .= "_$$";
if($options{'extension'} eq ''){
$options{'extension'} = 'fsa';
}
my $use_feature_ids_in_fasta = defined($options{'use_feature_ids_in_fasta'}) && ($options{'use_feature_ids_in_fasta'} > 0);

## logging
my $logfile = $options{'log'} || Ergatis::Logger::get_default_logfilename();
my $logger = new Ergatis::Logger('LOG_FILE'=>$logfile,
'LOG_LEVEL'=>$options{'debug'});
$logger = Ergatis::Logger::get_logger();

# mapping from Seq.id and/or Seq-data-import.identifier to polypeptide sequence
my $Prot = {};
# mapping from polypeptide Feature.id to Seq.id
my $Feat = {};


my %seqParserArgs = ( ReadFeatureTables => 0, SequenceCallBack => \&createPolypeptideLookup );
if ($use_feature_ids_in_fasta) {
$seqParserArgs{FeatureCallBack} = \&createFeatureLookup;
$seqParserArgs{ReadFeatureTables} = 1;
}

# set up a serial parser to parse sequence elements, bypassing feature tables for efficiency if possible
my $seqParser = new BSML::BsmlParserSerialSearch(%seqParserArgs);

#Get rid of trailing slashes in directory names

$bsmlModelList =~ s/\/+$//;
$outDir =~ s/\/+$//;

if( !$bsmlModelList )
{
$logger->logdie("no Bsml Directory specified");
}
else
{
if( ! -e $bsmlModelList )
{
$logger->logdie("could not open list file: $bsmlModelList");
}
}

if( !$cogFile )
{
$logger->logdie("cog file not specified");
}

if( !$outDir )
{
$logger->logdie("output directory not specified");
}
else
{
if( ! -d $outDir )
{
mkdir( $outDir );
}
}

open FILE, $bsmlModelList or $logger->logdie("Can't open file $bsmlModelList");
while(my $bsmlFile=<FILE>)
{
chomp $bsmlFile;
if (!(-e $bsmlFile) && -e "$bsmlFile.gz") {
$bsmlFile .= ".gz";
}
if(-e $bsmlFile){
my $ifh;
if ($bsmlFile =~ /\.(gz|gzip)$/) {
open ($ifh, "<:gzip", $bsmlFile) || die "can't read input file $bsmlFile: $!";
} else {
open ($ifh, "<$bsmlFile") || die "can't read input file $bsmlFile: $!";
}
$seqParser->parse( $ifh );
close $ifh;
}
else{
$logger->logdie("Can't read bsml file $bsmlFile");
}
}

open( INPUTCOGS, "<$cogFile" ) or $logger->logdie("could not open $cogFile.");

my $cog = undef;
my $list = [];

while( my $line = <INPUTCOGS> )
{
if( $line =~ /^\t([\S]*)/ )
{
# A new sequence has been found and added to the current cog.
$logger->debug("read line $line");
push( @{$list}, $1 );
}

if( $line =~ /^COG\s+=\s+([^,\s]+)/ )
{
# A new cog has been encountered, flush the previous to disk if present
my $newcog = $1;
$logger->debug("read cog $line $newcog");
&outputCog($cog, $list) if (defined($cog));
$cog = $newcog;
$list = [];
}
}
outputCog($cog,$list) if (defined($cog));
exit(0);

## subroutines

sub outputCog {
my($cog, $list) = @_;
$logger->debug("outputting cog $cog");
if(scalar(@{$list})>1){
if(@{$list} <= $maxCogSeqCount){
open( OUTFILE, ">$outDir/$cog.$options{'outputToken'}.$options{'extension'}" ) or $logger->logdie("could not open $cog.$options{'extension'}");
foreach my $seq ( @{$list} )
{
print OUTFILE ">$seq\n";
my $residues = undef;
if ($use_feature_ids_in_fasta) {
my $seq_id = $Feat->{$seq};
# Not sure why we would need to die here if we aren't dying below.
#$logger->logdie("no sequence id found for feature with id=$seq") if (!defined($seq_id));
if($seq_id) {
$residues = $Prot->{$seq_id};
}
} else {
$residues = $Prot->{$seq};
}
if (!defined($residues)) {
print STDERR "no sequence data found for seq=$seq";
$residues = 'X';
}
print OUTFILE $residues."\n";
}
close( OUTFILE );
}
else{
open( OUTFILE, ">$outDir/$cog.$options{'outputToken'}.$options{'extension'}" ) or $logger->logdie("could not open $cog.$options{'extension'}");
foreach my $seq ( @{$list} )
{
print OUTFILE ">$seq\n";
print OUTFILE "X\n";
}
close( OUTFILE );
}
}
}

sub createPolypeptideLookup
{
my $seqRef = shift;

# We're only interested in polypeptide sequences for all-vs-all and pblast

if( ($seqRef->returnattr( 'molecule' ) eq 'aa') || ($seqRef->returnattr('class') eq 'polypeptide'))
{
my $seq = $seqRef->subSequence(-1,0,0);

my $identifier;
if((defined $seqRef->{'BsmlSeqDataImport'}) && (!$use_feature_ids_in_fasta)){
# don't think this is right, but trying to maintain reverse compatibility:
$identifier = $seqRef->{'BsmlSeqDataImport'}->{'identifier'};
}
else{
$identifier = $seqRef->returnattr( 'id' );
}

if( $identifier && $seq )
{
$Prot->{$identifier} = $seq;
}
}
}

sub createFeatureLookup
{
my $featRef = shift;
my $class = $featRef->returnattr('class');
return unless ($class eq 'polypeptide');
my $feat_id = $featRef->returnattr('id');
my $links = $featRef->returnBsmlLinkListR();
my @seqlinks = grep { $_->{'rel'} eq 'sequence' } @$links;
my $nsl = scalar(@seqlinks);

if ($nsl == 1) {
my $rel = $seqlinks[0]->{'rel'};
my $seq_id = $seqlinks[0]->{'href'};
$seq_id =~ s/^\#//;
if ($rel eq 'sequence') {
$Feat->{$feat_id} = $seq_id;
}
} elsif ($nsl > 1) {
$logger->logdie("feature $feat_id has $nsl sequence Links");
}
}

4 changes: 0 additions & 4 deletions src/perl/blast2btab.pl
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
#!/usr/bin/perl
BEGIN{foreach (@INC) {s/\/usr\/local\/packages/\/local\/platform/}};
use lib (@INC,$ENV{"PERL_MOD_DIR"});
no lib "$ENV{PERL_MOD_DIR}/i686-linux";
no lib ".";

=head1 NAME
Expand Down
4 changes: 0 additions & 4 deletions src/perl/clusterBsmlPairwiseAlignments.pl
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
#!/usr/bin/perl

BEGIN{foreach (@INC) {s/\/usr\/local\/packages/\/local\/platform/}};
use lib (@INC,$ENV{"PERL_MOD_DIR"});
no lib "$ENV{PERL_MOD_DIR}/i686-linux";
no lib ".";
=head1 NAME
clusterBsmlPairwiseAlignments.pl -
Expand Down

0 comments on commit 7eab628

Please sign in to comment.