diff --git a/docker/Dockerfile b/docker/Dockerfile index 81ac08c13930527c2648f2ed78c7a4836815b2d7..db4c3f628dd850993c445c3469b2f95d4cebc116 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -49,6 +49,12 @@ ENV VERSION_BLAST="2.2.26" \ VERSION_HHBLITS="3.3.0" \ VERSION_PSIPRED="3.5" \ VERSION_SSPRO="4.03" +# +# Copy patch files for SSPro/ ACCPro +# +COPY getACCAligns.pl.patch /tmp/ +COPY homology_sa.pl.patch /tmp/ +COPY predict_ss_sa.pl.patch /tmp/ RUN set -eo pipefail; \ # \ # Install dependencies for building software \ @@ -57,8 +63,9 @@ RUN set -eo pipefail; \ /usr/bin/yum -y install epel-release; \ /usr/bin/yum update --assumeyes; \ /usr/bin/yum install --assumeyes bzip2 gcc-c++ compat-libstdc++-33.i686 \ - glibc.i686 make openssl-devel pcre-devel \ - perl python3 python3-devel zlib-static; \ + glibc.i686 make openssl-devel patch \ + pcre-devel perl python3 python3-devel \ + zlib-static; \ /usr/bin/yum --assumeyes clean all; \ /usr/bin/rm -rf /var/cache/yum; \ # \ @@ -156,8 +163,6 @@ RUN set -eo pipefail; \ /usr/bin/sed -i 's/\/cluster\/toolkit\/production\/bioprogs\/psipred\/bin/\/usr\/local\/bin/g' scripts/HHPaths.pm; \ /usr/bin/sed -i 's/\/cluster\/toolkit\/production\/bioprogs\/psipred\/data/\/usr\/local\/data/g' scripts/HHPaths.pm; \ /usr/bin/sed -i 's/\/cluster\/toolkit\/production\/bioprogs\/blast\/bin/\/usr\/local\/bin/g' scripts/HHPaths.pm; \ - #/usr/bin/sed -i 's/\/cluster\/databases\/pdb\/all/\/scicore\/data\/databases\/PDB/g' scripts/HHPaths.pm; \ - #/usr/bin/sed -i 's/\/cluster\/databases\/dssp\/bin\/dsspcmbi//g' scripts/HHPaths.pm; \ /usr/local/bin/cmake . -DCMAKE_INSTALL_PREFIX=/usr/local; \ /usr/bin/make -j ${CPUS_FOR_MAKE} NO_PNG=1; \ /usr/bin/make install INSTALL_DIR=/usr/local; \ @@ -183,13 +188,21 @@ RUN set -eo pipefail; \ /usr/local/sspro/test \ /usr/local/sspro/configure.pl~; \ /usr/bin/chmod -R og+rx /usr/local/sspro/; \ + # apply patch files \ + /usr/bin/patch /usr/local/sspro/script/getACCAligns.pl < /tmp/getACCAligns.pl.patch; \ + /usr/bin/rm -f /tmp/getACCAligns.pl.patch; \ + /usr/bin/patch /usr/local/sspro/script/homology_sa.pl < /tmp/homology_sa.pl.patch; \ + /usr/bin/rm -f /tmp/homology_sa.pl.patch; \ + /usr/bin/patch /usr/local/sspro/script/predict_ss_sa.pl < /tmp/predict_ss_sa.pl.patch; \ + /usr/bin/rm -f /tmp/predict_ss_sa.pl.patch; \ # \ # Cleanup packages only needed to compile/ install things to keep the \ # image size low. \ # \ /usr/bin/yum --assumeyes remove bzip2 gcc gcc-c++ glibc-devel make \ - pcre-devel openssl-devel python3-devel \ - python3-pip zlib-devel zlib-static; \ + patch pcre-devel openssl-devel \ + python3-devel python3-pip zlib-devel \ + zlib-static; \ /usr/bin/yum --assumeyes autoremove; \ /usr/bin/yum --assumeyes clean all; \ /usr/bin/rm -rf /var/cache/yum; \ @@ -344,15 +357,15 @@ RUN groupadd --gid $QMEAN_GROUP_ID $QMEAN_GROUP; \ useradd --no-log-init -u $QMEAN_USER_ID -r -ms /bin/bash -g $QMEAN_GROUP \ $QMEAN_USER; \ /usr/bin/mkdir /qmean; \ + /usr/bin/cp /usr/local/sspro/script/getACCAligns.pl /qmean/; \ + /usr/bin/cp /usr/local/sspro/script/homology_sa.pl /qmean/; \ + /usr/bin/cp /usr/local/sspro/script/predict_ss_sa.pl /qmean/; \ /usr/bin/mkdir /data; \ /usr/bin/chgrp -R $QMEAN_GROUP /data; \ /usr/bin/chown -R $QMEAN_USER /data # Copy script to run QMEANDisCo COPY run_qmean.py /qmean/ -COPY predict_ss_sa.pl /qmean/ -COPY homology_sa.pl /qmean/ -COPY getACCAligns.pl /qmean/ ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 diff --git a/docker/getACCAligns.pl b/docker/getACCAligns.pl deleted file mode 100755 index 4faeb27909ae713bbcf1ab9caf6b6bb1f2af3f1a..0000000000000000000000000000000000000000 --- a/docker/getACCAligns.pl +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env perl -################################################### -# getACCAligns -# - finds alignments to AA sequence and outputs -# predicted SS, using - for non-matches -# -# Useage: getACCAligns.pl $temp_file $dir < seq_file -# -# Author: Michael Sweredoski -# Date: 1/7/04 -# -# Modified by Jianlin cheng: change interface for integration with PSpro -# change name of temporary file, make it robust in multi-process environment. -# -# Modified by Michael Sweredoski: adapted from getSSAligns -################################################### - -$temp_file = shift @ARGV; -$install_dir = shift @ARGV; -my $blast_file = shift @ARGV; #SB: blast output added -$dir = $install_dir . "script"; - -$threshold = .95; -#$threshold = .50; - -#SB: not needed with external blast; $query_file = $temp_file; -#SB: not needed with external blast; $NCBI_PATH = "/scicore/soft/app/NCBI/blast/2.2.16/Linux/bin/";#"${install_dir}blast2.2.8"; -$data_dir = "${install_dir}data/pdb_large"; -#$data_dir = "${install_dir}data/pdb_small"; -#SB: not needed with external blast; $query_header = <>; -#SB: not needed with external blast; $aa = <>; -#SB: not needed with external blast; open(TEMP_QUERY,">$query_file") or die "Couldn't open $temp_file for writing\n"; -#add > to convert to fasta format -#SB: not needed with external blast; print TEMP_QUERY ">$query_header$aa"; -#SB: not needed with external blast; close(TEMP_QUERY); - -#SB: we now get blast output from above `$NCBI_PATH/blastall -i $query_file -d $data_dir/dataset -p blastp -F F -g F -o ${temp_file}foob.txt`; - -# SB: using blast output from outside -#`$dir/blast2index.pl ${temp_file}foob.txt > ${temp_file}fooi.bai`; -`$dir/blast2index.pl $blast_file > ${temp_file}fooi.bai`; - -$aligns_file = "$data_dir/acc_data"; -$index_file = "${temp_file}fooi.bai"; - -sub getACC { - $name = shift @_; - open(ALIGNS,"<$aligns_file") or die "Couldn't open aligns file $aligns_file\n"; - while($line = <ALIGNS>) { - chomp($line); - if($line eq $name) { - $acc = <ALIGNS>; - chomp($acc); - my @temp = split /\s+/,$acc; - $acc = ""; - for(my $i = 0; $i <= $#temp; $i++) { - $acc .= ($temp[$i] > 25?"e":"-"); - } - close(ALIGNS); - return $acc; - } else { - <ALIGNS>; - } - } - close(ALIGNS); - return ""; -} - -open(INDEX,$index_file) or die "Couldn't open index file\n"; - -$line = <INDEX>; -($query_name,$query_len) = split " ",$line; - -@aligns = (); - -while($line = <INDEX>) { - ($aligned,$gaps,$eval,$ident,$posi,$subj,$subj_len,$subj_start,$subj_end,$query_start,$query_end) = split "\t",$line; -# if($subj ne $query_name) { - if($subj_ACC = getACC($subj)) { - $subj_ACC =~ s/\s+//g; - $subj_section = substr($subj_ACC,$subj_start-1,$aligned+$gaps); - - $pred_score = 0.3230 + .0260*log($aligned) + .1078*log($ident); - - if($pred_score > 1) { - $pred_score = 1; - } elsif($pred_score < 0) { - $pred_score = 0; - } - - @subj_CHARS = split "",$subj_section; - - push @aligns, { - p_score => $pred_score, - s_section => [@subj_CHARS], - q_start => $query_start-1, - q_len => $aligned+$gaps - }; - } -# } -} - -@sorted_i = sort { $aligns[$b]{"p_score"} <=> $aligns[$a]{"p_score"} } (0..$#aligns); -@sorted_aligns = (); -for($i = 0; $i <= $#aligns; $i++) { - $sorted_aligns[$sorted_i[$i]] = $aligns[$i]; -} - -for($i = 0; $i < $query_len; $i++) { - $found = 0; - for($j = 0; $j <= $#sorted_aligns; $j++) { - if($sorted_aligns[$j]{"q_start"} <= $i && $sorted_aligns[$j]{"q_len"}+$sorted_aligns[$j]{"q_start"} > $i && $sorted_aligns[$j]{"p_score"} > $threshold) { - if($found == 0) { - print $sorted_aligns[$j]{"s_section"}[$i-$sorted_aligns[$j]{"q_start"}]; - $found = 1; - } - } - } - if($found == 0) { - print "n"; - } -} - -print "\n"; -#`rm -f ${temp_file}foob.txt ${temp_file}fooi.bai $query_file`; -`rm -f ${temp_file}fooi.bai`; diff --git a/docker/getACCAligns.pl.patch b/docker/getACCAligns.pl.patch new file mode 100644 index 0000000000000000000000000000000000000000..808f097bb1e44c736933597fe54f0d6d2c4114c5 --- /dev/null +++ b/docker/getACCAligns.pl.patch @@ -0,0 +1,57 @@ +--- getACCAligns.pl 2021-04-29 10:21:51.000000000 +0200 ++++ getACCAligns.pl 2021-03-31 10:35:36.000000000 +0200 +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl ++#!/usr/bin/env perl + ################################################### + # getACCAligns + # - finds alignments to AA sequence and outputs +@@ -17,25 +17,28 @@ + + $temp_file = shift @ARGV; + $install_dir = shift @ARGV; ++my $blast_file = shift @ARGV; #SB: blast output added + $dir = $install_dir . "script"; + + $threshold = .95; + #$threshold = .50; + +-$query_file = $temp_file; +-$NCBI_PATH = "${install_dir}blast2.2.8"; ++#SB: not needed with external blast; $query_file = $temp_file; ++#SB: not needed with external blast; $NCBI_PATH = "/scicore/soft/app/NCBI/blast/2.2.16/Linux/bin/";#"${install_dir}blast2.2.8"; + $data_dir = "${install_dir}data/pdb_large"; + #$data_dir = "${install_dir}data/pdb_small"; +-$query_header = <>; +-$aa = <>; +-open(TEMP_QUERY,">$query_file") or die "Couldn't open $temp_file for writing\n"; ++#SB: not needed with external blast; $query_header = <>; ++#SB: not needed with external blast; $aa = <>; ++#SB: not needed with external blast; open(TEMP_QUERY,">$query_file") or die "Couldn't open $temp_file for writing\n"; + #add > to convert to fasta format +-print TEMP_QUERY ">$query_header$aa"; +-close(TEMP_QUERY); ++#SB: not needed with external blast; print TEMP_QUERY ">$query_header$aa"; ++#SB: not needed with external blast; close(TEMP_QUERY); + +-`$NCBI_PATH/blastall -i $query_file -d $data_dir/dataset -p blastp -F F -g F -o ${temp_file}foob.txt`; ++#SB: we now get blast output from above `$NCBI_PATH/blastall -i $query_file -d $data_dir/dataset -p blastp -F F -g F -o ${temp_file}foob.txt`; + +-`$dir/blast2index.pl ${temp_file}foob.txt > ${temp_file}fooi.bai`; ++# SB: using blast output from outside ++#`$dir/blast2index.pl ${temp_file}foob.txt > ${temp_file}fooi.bai`; ++`$dir/blast2index.pl $blast_file > ${temp_file}fooi.bai`; + + $aligns_file = "$data_dir/acc_data"; + $index_file = "${temp_file}fooi.bai"; +@@ -119,8 +122,5 @@ + } + + print "\n"; +-`rm -f ${temp_file}foob.txt ${temp_file}fooi.bai $query_file`; +- +- +- +- ++#`rm -f ${temp_file}foob.txt ${temp_file}fooi.bai $query_file`; ++`rm -f ${temp_file}fooi.bai`; diff --git a/docker/homology_sa.pl b/docker/homology_sa.pl deleted file mode 100755 index 0a70fb87434cccc15df874f9ddc04b0e687a34ed..0000000000000000000000000000000000000000 --- a/docker/homology_sa.pl +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env perl -################################################### -# homology_sa.pl -# - makes a ACC prediction for a protein using -# ACCpro and homology -# Useage: homology.pl install_dir seq_file(name, sequence(in compact format)) align_file output_file -# -# Author: Michael Sweredoski -# Date: 1/8/04 -# -# Modified by Jianlin Cheng, 5/1/2004 -# Integration with PSpro, change inteface, add error checking -# Output(chaged): name, seq, predicted_ss -# -# Modified by Mike Sweredoski: adapted from homology_ss -################################################### - -use File::Basename; -use warnings; - -#SB: extended from 4 to 5 to pass blast output in -if (@ARGV != 5) -{ - die "need 5 params: install dir, seq file(name, sequence in compact format), alignment file, output file, blast output\n"; -} - -$install_dir = shift @ARGV; -$seq_file = shift @ARGV; -$align_file = shift @ARGV; -$output_file = shift @ARGV; -my $blast_file = shift @ARGV; - -if (! -d $install_dir) -{ - die "can't find install directory.\n"; -} -if ( substr($install_dir, length($install_dir) - 1, 1) ne "/" ) -{ - $install_dir .= "/"; -} -if (! -f $seq_file) -{ - die "input file doesn't exist.\n"; -} -if (! -f $align_file) -{ - warn "alignment file doesn't exist.\n"; -} -$temp_file = "${seq_file}.homo" . $$; -$dir = "${install_dir}script"; - -open(SEQ_FILE, "$seq_file") || die " can't open seq file.\n"; -$seq_name = <SEQ_FILE>; -$seq_seq = <SEQ_FILE>; -close SEQ_FILE; - -open(OUTPUT,">$output_file"); - -$accpro_pred = `cat $seq_file | $dir/getACCproPred.pl $align_file $temp_file $install_dir`; -$pymod_dir = dirname(__FILE__); -# SB: by passing BLAST output in, we do not need sequence data, anymore -#$acc_homology_pred = `cat $seq_file | $pymod_dir/getACCAligns.pl $temp_file $install_dir $blast_file`; -$acc_homology_pred = `$pymod_dir/getACCAligns.pl $temp_file $install_dir $blast_file`; - -chomp($accpro_pred); -chomp($acc_homology_pred); - -@ACC_pred = split "", $accpro_pred; -@ACC_HOM_pred = split "", $acc_homology_pred; -print OUTPUT $seq_name; -print OUTPUT $seq_seq; -for($i = 0; $i <= $#ACC_pred; $i++) { - if($ACC_HOM_pred[$i] ne "n") { - print OUTPUT $ACC_HOM_pred[$i]; - } else { - print OUTPUT $ACC_pred[$i]; - } -} -print OUTPUT "\n"; -close OUTPUT; diff --git a/docker/homology_sa.pl.patch b/docker/homology_sa.pl.patch new file mode 100644 index 0000000000000000000000000000000000000000..e7a96a613fa703215ed6f1850a765220e514d5d2 --- /dev/null +++ b/docker/homology_sa.pl.patch @@ -0,0 +1,44 @@ +--- homology_sa.pl 2021-04-29 12:50:44.000000000 +0200 ++++ homology_sa.pl 2021-03-31 15:31:42.000000000 +0200 +@@ -1,4 +1,4 @@ +-#!/usr/bin/perl -w ++#!/usr/bin/env perl + ################################################### + # homology_sa.pl + # - makes a ACC prediction for a protein using +@@ -15,16 +15,20 @@ + # Modified by Mike Sweredoski: adapted from homology_ss + ################################################### + +-if (@ARGV != 4) ++use File::Basename; ++use warnings; ++ ++#SB: extended from 4 to 5 to pass blast output in ++if (@ARGV != 5) + { +- die "need 4 params: install dir, seq file(name, sequence in compact format), alignment file, output file\n"; ++ die "need 5 params: install dir, seq file(name, sequence in compact format), alignment file, output file, blast output\n"; + } + + $install_dir = shift @ARGV; + $seq_file = shift @ARGV; + $align_file = shift @ARGV; + $output_file = shift @ARGV; +- ++my $blast_file = shift @ARGV; + + if (! -d $install_dir) + { +@@ -53,7 +57,10 @@ + open(OUTPUT,">$output_file"); + + $accpro_pred = `cat $seq_file | $dir/getACCproPred.pl $align_file $temp_file $install_dir`; +-$acc_homology_pred = `cat $seq_file | $dir/getACCAligns.pl $temp_file $install_dir`; ++$pymod_dir = dirname(__FILE__); ++# SB: by passing BLAST output in, we do not need sequence data, anymore ++#$acc_homology_pred = `cat $seq_file | $pymod_dir/getACCAligns.pl $temp_file $install_dir $blast_file`; ++$acc_homology_pred = `$pymod_dir/getACCAligns.pl $temp_file $install_dir $blast_file`; + + chomp($accpro_pred); + chomp($acc_homology_pred); diff --git a/docker/predict_ss_sa.pl b/docker/predict_ss_sa.pl deleted file mode 100755 index d50d4470c0b4d8ef5bdc6c77a6772ac46063f211..0000000000000000000000000000000000000000 --- a/docker/predict_ss_sa.pl +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env perl - -use File::Basename; -use warnings; - -#predict SS, SA for a single sequence, SS, SA is a combination of neural network and homology - -#June 23, 2004, Jianlin Cheng - -#################################### -### MODIFIED to work along with QMEAN -### MODIFIED to eat our own MSA -#################################### - -#input - # 1: sequence file* - # 2: output file - # 3: msa file* - -# * Important: filenames with a '.' in them make the whole thing fail! - -#Output: pxml, casp, eva format contact map for both 12 and 8 A. -#Notice: the white space or . in seq_file are replaced by _. - -#Oct 28, 2013, SB -# Modified to create query file for get[SS|ACC]Aligns.pl here - -if (@ARGV != 3) -{ - die "Usage: seq_file(fasta) output_file alignment_file\n"; -} - -$seq_file = shift @ARGV; -$output_file = shift @ARGV; -$alignment_file = shift @ARGV; -$output_dir = "/tmp"; - -if (! -f $seq_file) -{ - die "can't find file: $seq_file.\n"; -} - -if (! -d $output_dir) -{ - die "the output directory doesn't exists.\n"; -} - -if ( substr($output_dir, length($output_dir) - 1, 1) ne "/" ) -{ - $output_dir .= "/"; -} - -#extract sequence file name -$slash_pos = rindex($seq_file, "/"); -if ($slash_pos != -1) -{ - $seq_filename = substr($seq_file, $slash_pos + 1, length($seq_file) - $slash_pos - 1); -} -else -{ - $seq_filename = $seq_file; -} -if (length($seq_filename) <= 0) -{ - die "sequence file name shouldn't be less or equal 0.\n"; -} - -#non-char and . is not allowed for ouput file name -$seq_filename =~ s/\s/_/g; -$seq_filename =~ s/\./_/g; - -#output prefix is used as the prefix name of output files -$output_prefix = $output_dir . $$ . $seq_filename; -#set alignment file name -$seq_filename .= "alg"; - -open(SEQ_FILE, "$seq_file") || die "can't open sequence file.\n"; -@content = <SEQ_FILE>; -close(SEQ_FILE); -$name = shift @content; -chomp $name; -$sequence = shift @content; - -#remove unseen dos format (cause the program fail) -$name =~ s/\s//g; -$sequence =~ s/\s//g; - -#check the sequence format -if (substr($name, 0, 1) ne ">") -{ - die "sequence file: $seq_file is not in fasta format.\n"; -} -$name = substr($name, 1, length($name) - 1); -$target_name = $name; -if (length($target_name) == 0) -{ - $target_name = "unknown"; -} -if (length($sequence) < 1) -{ - die "sequence is empty. \n"; -} -$install_dir = "/usr/local/sspro/"; - -# SB: create BLAST alignments here, hand down to follow up scripts -# this should prevent calling BLAST twice -$NCBI_PATH = "/usr/local/bin"; -# SB: create query file -open(TEMP_QUERY,">$output_prefix.tmp.homo.$$") - or die "Couldn't open $output_prefix.tmp.homo.$$ for writing\n"; -#add > to convert to fasta format -print TEMP_QUERY ">$seq_filename\n$sequence\n"; -close(TEMP_QUERY); -# run BLAST -my $blast_file = "${output_prefix}.tmp.homo.$$.foob.txt"; -`$NCBI_PATH/blastall -i $output_prefix.tmp.homo.$$ -d ${install_dir}data/pdb_large/dataset -p blastp -F F -g F -o ${blast_file}`; - -#Predict Solvent Accessibility -#create input file for accpro -open(TMP, ">$output_prefix.tmp") || die "can't create temporary file for accpro.\n"; -#a little bit ugly here: for accpro: alignment_file = align_dir + seq_filename -print TMP "$seq_filename\n"; -print TMP "$sequence\n"; -close(TMP); - -$pymod_dir = dirname(__FILE__); - -print "Predict solvent accessibility..."; -`${pymod_dir}/homology_sa.pl $install_dir $output_prefix.tmp $alignment_file $output_prefix.accpro $blast_file`; -print "done\n"; - -`mv $output_prefix.accpro $output_file`; - -#remove the intermediate files -`rm $output_prefix.tmp`; -`rm $blast_file`; -`rm $output_prefix.tmp.homo.$$`; diff --git a/docker/predict_ss_sa.pl.patch b/docker/predict_ss_sa.pl.patch new file mode 100644 index 0000000000000000000000000000000000000000..15037081c58136e4b8bbbf370a373b24653fadef --- /dev/null +++ b/docker/predict_ss_sa.pl.patch @@ -0,0 +1,207 @@ +--- predict_ss_sa.pl 2021-04-29 12:51:42.000000000 +0200 ++++ predict_ss_sa.pl 2021-03-31 15:29:38.000000000 +0200 +@@ -1,90 +1,39 @@ +-#!/usr/bin/perl -w ++#!/usr/bin/env perl ++ ++use File::Basename; ++use warnings; + + #predict SS, SA for a single sequence, SS, SA is a combination of neural network and homology + + #June 23, 2004, Jianlin Cheng + ++#################################### ++### MODIFIED to work along with QMEAN ++### MODIFIED to eat our own MSA ++#################################### ++ + #input +- # 1: blast_dir +- # 2: big_db +- # 3: nr_db +- # 4: ss_predictor +- # 5: sa_predictor +- # 6: script_dir +- # 7: sequence file(seq_file): one sequence in fasta format, +- # 8: output file ++ # 1: sequence file* ++ # 2: output file ++ # 3: msa file* ++ ++# * Important: filenames with a '.' in them make the whole thing fail! + + #Output: pxml, casp, eva format contact map for both 12 and 8 A. + #Notice: the white space or . in seq_file are replaced by _. + +-if (@ARGV != 8) +-{ +- die "Usage: blast_dir big_db nr_db ss_predictor sa_predictor script_dir seq_file(fasta) output_file\n"; +-} +- +-$blast_dir = shift @ARGV; +-$big_db = shift @ARGV; +-$nr_db = shift @ARGV; +-$ss_predictor = shift @ARGV; +-$sa_predictor = shift @ARGV; +-$script_dir = shift @ARGV; +- +-################verify if all the things are there######################### +-if (! -d $blast_dir) +-{ +- die "can't find blast directory.\n"; +-} +-if ( substr($blast_dir, length($blast_dir) - 1, 1) ne "/" ) +-{ +- $blast_dir .= "/"; +-} +- +-if (! -d $script_dir) +-{ +- die "can't find perl script directory.\n"; +-} +- +-if ( substr($script_dir, length($script_dir) - 1, 1) ne "/" ) +-{ +- $script_dir .= "/"; +-} +- +-if (! -f "${blast_dir}blastpgp") +-{ +- die "can't find blastpgp.\n"; +-} +- +-if (! -f "${big_db}.pal") { +-if (! -f "${big_db}.phr" || ! -f "${big_db}.pin" || ! -f "${big_db}.psq") +-{ +- die "can't find the big coil database.\n"; +-} +-} +- +-if (! -f "${nr_db}.pal") { +-if (! -f "${nr_db}.phr" || ! -f "${nr_db}.pin" || ! -f "${nr_db}.psq" || ! -f "${nr_db}.pnd" +- || ! -f "${nr_db}.pni" || ! -f "${nr_db}.psd" || ! -f "${nr_db}.psi" ) +-{ +- die "can't find the non-redundant database.\n"; +-} +-} +- +-if (! -f $ss_predictor) +-{ +- die "can't find secondary structure predictor.\n"; +-} ++#Oct 28, 2013, SB ++# Modified to create query file for get[SS|ACC]Aligns.pl here + +-if (! -f $sa_predictor) ++if (@ARGV != 3) + { +- die "can't find solvent accessibility predictor.\n"; ++ die "Usage: seq_file(fasta) output_file alignment_file\n"; + } +-#############################End of Verification####################################### +- +- + + $seq_file = shift @ARGV; +-$output_file = shift @ARGV; +-$output_dir = "./"; ++$output_file = shift @ARGV; ++$alignment_file = shift @ARGV; ++$output_dir = "/tmp"; + + if (! -f $seq_file) + { +@@ -121,10 +70,9 @@ + $seq_filename =~ s/\./_/g; + + #output prefix is used as the prefix name of output files +-$output_prefix = $output_dir . $seq_filename; ++$output_prefix = $output_dir . $$ . $seq_filename; + #set alignment file name + $seq_filename .= "alg"; +-$output_prefix_alg = $output_dir . $seq_filename; + + open(SEQ_FILE, "$seq_file") || die "can't open sequence file.\n"; + @content = <SEQ_FILE>; +@@ -150,52 +98,40 @@ + } + if (length($sequence) < 1) + { +- die "seqeunce is empty. \n"; ++ die "sequence is empty. \n"; + } +-$script_dir =~ /(.+)script\/$/; +-$install_dir = $1; ++$install_dir = "/usr/local/sspro/"; + +-#Generate Alignments for sequence +-print "Generate alignment...\n"; +-#the output alignment file name is: $output_prefix +-`${script_dir}generate_flatblast.pl $blast_dir $script_dir $big_db $nr_db $seq_file $output_prefix_alg >/dev/null`; +-print "done\n"; +- +-#Predict Secondary Stx +-print "Predict secondary structure..."; +-#create input file for sspro +-open(TMP, ">$output_prefix.tmp") || die "can't create temporary file for sspro.\n"; +-#a little bit ugly here: for sspro: alignment_file = align_dir + seq_filename +-print TMP "$seq_filename\n"; +-print TMP "$sequence\n"; +-close(TMP); +-`${script_dir}homology_ss.pl $install_dir $output_prefix.tmp $output_prefix_alg $output_prefix.sspro`; +-print "done\n"; ++# SB: create BLAST alignments here, hand down to follow up scripts ++# this should prevent calling BLAST twice ++$NCBI_PATH = "/usr/local/bin"; ++# SB: create query file ++open(TEMP_QUERY,">$output_prefix.tmp.homo.$$") ++ or die "Couldn't open $output_prefix.tmp.homo.$$ for writing\n"; ++#add > to convert to fasta format ++print TEMP_QUERY ">$seq_filename\n$sequence\n"; ++close(TEMP_QUERY); ++# run BLAST ++my $blast_file = "${output_prefix}.tmp.homo.$$.foob.txt"; ++`$NCBI_PATH/blastall -i $output_prefix.tmp.homo.$$ -d ${install_dir}data/pdb_large/dataset -p blastp -F F -g F -o ${blast_file}`; + + #Predict Solvent Accessibility ++#create input file for accpro ++open(TMP, ">$output_prefix.tmp") || die "can't create temporary file for accpro.\n"; ++#a little bit ugly here: for accpro: alignment_file = align_dir + seq_filename ++print TMP "$seq_filename\n"; ++print TMP "$sequence\n"; ++close(TMP); ++ ++$pymod_dir = dirname(__FILE__); ++ + print "Predict solvent accessibility..."; +-`${script_dir}homology_sa.pl $install_dir $output_prefix.tmp $output_prefix_alg $output_prefix.accpro`; ++`${pymod_dir}/homology_sa.pl $install_dir $output_prefix.tmp $alignment_file $output_prefix.accpro $blast_file`; + print "done\n"; + +-open(TMP, ">$output_file") || die "can't create temporary file for conpro.\n"; +-open(SSPRO, "$output_prefix.sspro") || die "can't open the ss result file.\n"; +-open(ACCPRO, "$output_prefix.accpro") || die "can't open the sa result file.\n"; +-@sspro = <SSPRO>; +-@accpro = <ACCPRO>; +-$name = shift @sspro; +-$seq = shift @sspro; +-$sstx = shift @sspro; +-shift @accpro; +-shift @accpro; +-$acc = shift @accpro; +-$acc =~ s/-/b/g; +-print TMP "$seq_filename\n$seq$sstx$acc"; +-close TMP; +- +-#rename the alignment file +-`mv $output_prefix_alg ${output_file}align`; +-#remove pssm file +-#`rm $output_prefix_alg.pssm`; ++`mv $output_prefix.accpro $output_file`; ++ + #remove the intermediate files +-`rm $output_prefix.accpro $output_prefix.sspro`; + `rm $output_prefix.tmp`; ++`rm $blast_file`; ++`rm $output_prefix.tmp.homo.$$`;