diff --git a/modules/bindings/src/tmalign/HwRMSD.cpp b/modules/bindings/src/tmalign/HwRMSD.cpp index d50607f5bd31299248f66711ba31efc1f134c45b..651d82456462f0e1ced7970fd2d27c77d7dd3981 100644 --- a/modules/bindings/src/tmalign/HwRMSD.cpp +++ b/modules/bindings/src/tmalign/HwRMSD.cpp @@ -62,7 +62,16 @@ void print_extra_help() " 2: glocal-both alignment\n" " 3: Smith-Waterman algorithm for local alignment\n" "\n" -" -iter ALignment-superposition iterations. Default is 1\n" +" -iter Alignment-superposition iterations. Default is 10\n" +"\n" +" -seq Type of sequence used to make initial alignment\n" +" 1: amino acid/nucleotide sequence\n" +" 2: secondary structure\n" +" 3: (default) sequence + secondary structure\n" +"\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" "\n" " -infmt1 Input format for chain1\n" " -infmt2 Input format for chain2\n" @@ -91,7 +100,7 @@ void print_help(bool h_opt=false) "\n" " -i Start with an alignment specified in fasta file 'align.txt'\n" "\n" -" -I Stick to the alignment 'align.txt'\n" +" -I Stick to the alignment specified in 'align.txt'\n" "\n" " -m Output HwRMSD rotation matrix\n" "\n" @@ -135,8 +144,7 @@ int main(int argc, char *argv[]) bool h_opt = false; // print full help message bool m_opt = false; // flag for -m, output rotation matrix - bool i_opt = false; // flag for -i, with user given initial alignment - bool I_opt = false; // flag for -I, stick to user given alignment + int i_opt = 0; // 0 for -i, 3 for -I bool o_opt = false; // flag for -o, output superposed structure bool a_opt = false; // flag for -a, normalized by average length bool u_opt = false; // flag for -u, normalized by user specified length @@ -147,6 +155,7 @@ int main(int argc, char *argv[]) int ter_opt =3; // TER, END, or different chainID int split_opt =0; // do not split chain int outfmt_opt=0; // set -outfmt to full output + int 
het_opt=0; // do not read HETATM residues string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA string mol_opt ="auto";// auto-detect the molecule type as protein/RNA string suffix_opt=""; // set -suffix to empty @@ -157,7 +166,9 @@ int main(int argc, char *argv[]) vector<string> chain1_list; // only when -dir1 is set vector<string> chain2_list; // only when -dir2 is set int glocal =0; - int iter_opt =1; + int iter_opt =10; + double early_opt =0.01; + int seq_opt =3; for(int i = 1; i < argc; i++) { @@ -187,16 +198,20 @@ int main(int argc, char *argv[]) } else if ( !strcmp(argv[i],"-i") && i < (argc-1) ) { - fname_lign = argv[i + 1]; i_opt = true; i++; + if (i_opt==3) + PrintErrorAndQuit("ERROR! -i and -I cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 1; i++; + } + else if (!strcmp(argv[i], "-I") && i < (argc-1) ) + { + if (i_opt==1) + PrintErrorAndQuit("ERROR! -I and -i cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 3; i++; } else if (!strcmp(argv[i], "-m") && i < (argc-1) ) { fname_matrix = argv[i + 1]; m_opt = true; i++; }// get filename for rotation matrix - else if (!strcmp(argv[i], "-I") && i < (argc-1) ) - { - fname_lign = argv[i + 1]; I_opt = true; i++; - } else if ( !strcmp(argv[i],"-infmt1") && i < (argc-1) ) { infmt1_opt=atoi(argv[i + 1]); i++; @@ -253,6 +268,18 @@ int main(int argc, char *argv[]) { iter_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-early") && i < (argc-1) ) + { + early_opt=atof(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-seq") && i < (argc-1) ) + { + seq_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } else if (xname.size() == 0) xname=argv[i]; else if (yname.size() == 0) yname=argv[i]; else PrintErrorAndQuit(string("ERROR! 
Undefined option ")+argv[i]); @@ -280,16 +307,14 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); } if (atom_opt.size()!=4) - PrintErrorAndQuit("ERROR! atom name must have 4 characters, including space."); + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") - PrintErrorAndQuit("ERROR! molecule type must be either RNA or protein."); + PrintErrorAndQuit("ERROR! Molecule type must be either RNA or protein."); else if (mol_opt=="protein" && atom_opt=="auto") atom_opt=" CA "; else if (mol_opt=="RNA" && atom_opt=="auto") atom_opt=" C3'"; - if (i_opt && I_opt) - PrintErrorAndQuit("ERROR! -I and -i cannot be used together"); if (u_opt && Lnorm_ass<=0) PrintErrorAndQuit("Wrong value for option -u! It should be >0"); if (d_opt && d0_scale<=0) @@ -298,7 +323,7 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("-outfmt 2 cannot be used with -a, -u, -L, -d"); if (byresi_opt!=0) { - if (i_opt || I_opt) + if (i_opt) PrintErrorAndQuit("-byresi >=1 cannot be used with -i or -I"); if (byresi_opt<0 || byresi_opt>3) PrintErrorAndQuit("-byresi can only be 0, 1, 2 or 3"); @@ -312,11 +337,13 @@ int main(int argc, char *argv[]) if (split_opt<0 || split_opt>2) PrintErrorAndQuit("-split can only be 0, 1 or 2"); if (iter_opt<=0) PrintErrorAndQuit("-iter must be >0"); + if (seq_opt!=1 && seq_opt!=2 && seq_opt!=3) + PrintErrorAndQuit("-seq must be 1, 2 or 3"); /* read initial alignment file from 'align.txt' */ - if (i_opt || I_opt) read_user_alignment(sequence, fname_lign, I_opt); + if (i_opt) read_user_alignment(sequence, fname_lign, i_opt); - if (byresi_opt) I_opt=true; + if (byresi_opt) i_opt=3; if (m_opt && fname_matrix == "") // Output rotation matrix: matrix.txt PrintErrorAndQuit("ERROR! 
Please provide a file name for option -m!"); @@ -347,7 +374,7 @@ int main(int argc, char *argv[]) int xlen, ylen; // chain length int xchainnum,ychainnum;// number of chains in a PDB file char *seqx, *seqy; // for the protein sequence - int *secx, *secy; // for the secondary structure + char *secx, *secy; // for the secondary structure double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and // ya[0...ylen-1][0..2], in general, // ya is regarded as native structure @@ -361,7 +388,7 @@ int main(int argc, char *argv[]) /* parse chain 1 */ xname=chain1_list[i]; xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt); + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -379,18 +406,13 @@ int main(int argc, char *argv[]) <<". Chain length 0."<<endl; continue; } - else if (xlen<=5) - { - cerr<<"Sequence is too short <=5!: "<<xname<<endl; - continue; - } NewArray(&xa, xlen, 3); seqx = new char[xlen + 1]; xlen = read_PDB(PDB_lines1[chain_i], xa, seqx, resi_vec1, byresi_opt); - if (iter_opt>=2) // secondary structure assignment + if (seq_opt==2 || (seq_opt==3 && iter_opt>=2)) // SS assignment { - secx = new int[xlen]; + secx = new char[xlen+1]; if (mol_vec1[chain_i]>0) make_sec(seqx, xa, xlen, secx,atom_opt); else make_sec(xa, xlen, secx); @@ -403,7 +425,8 @@ int main(int argc, char *argv[]) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, + het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname @@ -422,18 +445,13 @@ int main(int argc, char *argv[]) <<". 
Chain length 0."<<endl; continue; } - else if (ylen<=5) - { - cerr<<"Sequence is too short <=5!: "<<yname<<endl; - continue; - } NewArray(&ya, ylen, 3); seqy = new char[ylen + 1]; ylen = read_PDB(PDB_lines2[chain_j], ya, seqy, resi_vec2, byresi_opt); - if (iter_opt>=2) + if (seq_opt==2 || (seq_opt==3 && iter_opt>=2)) // SS assignment { - secy = new int[ylen]; + secy = new char[ylen+1]; if (mol_vec2[chain_j]>0) make_sec(seqy, ya, ylen, secy, atom_opt); else make_sec(ya, ylen, secy); @@ -456,6 +474,7 @@ int main(int argc, char *argv[]) double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore int n_ali=0; int n_ali8=0; + int *invmap = new int[ylen+1]; /* entry function for structure alignment */ HwRMSD_main(xa, ya, seqx, seqy, secx, secy, t0, u0, @@ -463,25 +482,30 @@ int main(int argc, char *argv[]) d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, xlen, ylen, sequence, - Lnorm_ass, d0_scale, i_opt, I_opt, a_opt, u_opt, d_opt, - mol_vec1[chain_i]+mol_vec2[chain_j], glocal, iter_opt); + Lnorm_ass, d0_scale, i_opt, a_opt, u_opt, d_opt, + mol_vec1[chain_i]+mol_vec2[chain_j], + invmap, glocal, iter_opt, seq_opt, early_opt); + + if (outfmt_opt>=2) + get_seqID(invmap, seqx, seqy, ylen, Liden, n_ali8); /* print result */ output_results( - xname.substr(dir1_opt.size()), - yname.substr(dir2_opt.size()), + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), chainID_list1[chain_i].c_str(), chainID_list2[chain_j].c_str(), xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, - n_ali8, n_ali, L_ali, TM_ali, rmsd_ali, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, (m_opt?fname_matrix+chainID_list1[chain_i]:"").c_str(), - outfmt_opt, ter_opt, + outfmt_opt, ter_opt, false, split_opt, o_opt, (o_opt?fname_super+chainID_list1[chain_i]:"").c_str(), - false, false, a_opt, u_opt, d_opt); + 
false, a_opt, u_opt, d_opt, 0, + resi_vec1, resi_vec2); /* Done! Free memory */ seqM.clear(); @@ -489,6 +513,7 @@ int main(int argc, char *argv[]) seqyA.clear(); DeleteArray(&ya, ylen); delete [] seqy; + delete [] invmap; if (iter_opt>=2) delete [] secy; resi_vec2.clear(); } // chain_j diff --git a/modules/bindings/src/tmalign/HwRMSD.h b/modules/bindings/src/tmalign/HwRMSD.h index 312477af1e93cf339f32b76fcc187ff5dc79f4de..8a29399cdfaf9c8b55e3767508f3ff6d78a4cad4 100644 --- a/modules/bindings/src/tmalign/HwRMSD.h +++ b/modules/bindings/src/tmalign/HwRMSD.h @@ -3,9 +3,6 @@ #include "NWalign.h" #include "se.h" -const char* HwRMSD_SSmapProtein=" CHTE"; -const char* HwRMSD_SSmapRNA =" .<> "; - double Kabsch_Superpose(double **r1, double **r2, double **xt, double **xa, double **ya, int xlen, int ylen, int invmap[], int& L_ali, double t[3], double u[3][3], const int mol_type) @@ -44,17 +41,41 @@ double Kabsch_Superpose(double **r1, double **r2, double **xt, return RMSD; } +void parse_alignment_into_invmap(const string seqxA_tmp, + const string seqyA_tmp, const int xlen, const int ylen, int *invmap_tmp) +{ + if (seqxA_tmp.size()==0) return; + int i1=-1; + int i2=-1; + int j = 0; + int L = min(seqxA_tmp.size(), seqyA_tmp.size()); + for (j = 0; j < ylen; j++) invmap_tmp[j] = -1; + for (j = 0; j<L; j++) + { + if (seqxA_tmp[j] != '-') i1++; + if (seqyA_tmp[j] != '-') + { + i2++; + if (i2 >= ylen || i1 >= xlen) j = L; + else if (seqxA_tmp[j] != '-') invmap_tmp[i2] = i1; + } + } + return; +} + +/* outfmt_opt is disabled for alignment consistency */ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, - const int *secx, const int *secy, double t0[3], double u0[3][3], + const char *secx, const char *secy, double t0[3], double u0[3][3], double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, double &d0_0, double &TM_0, double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, string &seqM, string &seqxA, string &seqyA, double &rmsd0, int 
&L_ali, double &Liden, double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, const int xlen, const int ylen, const vector<string>&sequence, const double Lnorm_ass, - const double d0_scale, const bool i_opt, const bool I_opt, - const int a_opt, const bool u_opt, const bool d_opt, - const int mol_type, const int glocal=0, const int iter_opt=1) + const double d0_scale, const int i_opt, + const int a_opt, const bool u_opt, const bool d_opt, const int mol_type, + int *invmap, const int glocal=0, const int iter_opt=10, + const int seq_opt=3, const double early_opt=0.01) { /***********************/ /* allocate memory */ @@ -66,9 +87,7 @@ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, NewArray(&xt, xlen, 3); NewArray(&r1, minlen, 3); NewArray(&r2, minlen, 3); - int *invmap = new int[ylen+1]; - char *ssx; - char *ssy; + int *invmap_tmp = new int[ylen+1]; int i, j, i1, i2, L; double TM1_tmp,TM2_tmp,TM3_tmp,TM4_tmp,TM5_tmp,TM_ali_tmp; @@ -77,72 +96,80 @@ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, int L_ali_tmp,n_ali_tmp,n_ali8_tmp; double Liden_tmp; double rmsd_ali_tmp; + double max_TM=0; + double cur_TM=0; /* initialize alignment */ TM1=TM2=TM1_tmp=TM2_tmp=L_ali=-1; - if (I_opt || i_opt) + + if (i_opt) { seqxA_tmp=sequence[0]; seqyA_tmp=sequence[1]; } - else - NWalign(seqx, seqy, xlen, ylen, seqxA_tmp, seqyA_tmp, mol_type, glocal); - int total_iter=(I_opt || iter_opt<1)?1:iter_opt; + else if (seq_opt==2) NWalign_main(secx, secy, xlen, ylen, + seqxA_tmp, seqyA_tmp, mol_type, invmap_tmp, 1, glocal); + else NWalign_main(seqx, seqy, xlen, ylen, + seqxA_tmp, seqyA_tmp, mol_type, invmap_tmp, 1, glocal); + int total_iter=(i_opt==3 || iter_opt<1)?1:iter_opt; /*******************************/ /* perform iterative alignment */ /*******************************/ for (int iter=0;iter<total_iter;iter++) { + n_ali_tmp=n_ali8_tmp=0; /* get ss alignment for the second iteration */ - if (iter==1 && !i_opt) - { - ssx=new 
char[xlen+1]; - ssy=new char[ylen+1]; - for (i=0;i<xlen;i++) - { - if (mol_type>0) ssx[i]=HwRMSD_SSmapRNA[secx[i]]; - else ssx[i]=HwRMSD_SSmapProtein[secx[i]]; - } - for (i=0;i<ylen;i++) - { - if (mol_type>0) ssy[i]=HwRMSD_SSmapRNA[secy[i]]; - else ssy[i]=HwRMSD_SSmapProtein[secy[i]]; - } - ssx[xlen]=0; - ssy[ylen]=0; - NWalign(ssx, ssy, xlen, ylen, seqxA_tmp, seqyA_tmp, - mol_type, glocal); - delete [] ssx; - delete [] ssy; - } + if (iter==1 && !i_opt && seq_opt==3) NWalign_main(secx, secy, xlen, + ylen, seqxA_tmp, seqyA_tmp, mol_type, invmap_tmp, 1, glocal); /* parse initial alignment */ - for (j = 0; j < ylen; j++) invmap[j] = -1; - i1 = -1; - i2 = -1; - L = min(seqxA_tmp.size(), seqyA_tmp.size()); - for (j = 0; j<L; j++) - { - if (seqxA_tmp[j] != '-') i1++; - if (seqyA_tmp[j] != '-') - { - i2++; - if (i2 >= ylen || i1 >= xlen) j = L; - else if (seqxA_tmp[j] != '-') invmap[i2] = i1; - } - } + parse_alignment_into_invmap(seqxA_tmp, seqyA_tmp, xlen, ylen, invmap_tmp); /* superpose */ - Kabsch_Superpose(r1, r2, xt, xa, ya, xlen, ylen, invmap, + Kabsch_Superpose(r1, r2, xt, xa, ya, xlen, ylen, invmap_tmp, L_ali, t, u, mol_type); /* derive new alignment */ - se_main(xt, ya, seqx, seqy, TM1_tmp, TM2_tmp, TM3_tmp, TM4_tmp, TM5_tmp, - d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + se_main(xt, ya, seqx, seqy, TM1_tmp, TM2_tmp, TM3_tmp, TM4_tmp, + TM5_tmp, d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM_tmp, seqxA_tmp, seqyA_tmp, rmsd0_tmp, L_ali_tmp, Liden_tmp, - TM_ali_tmp, rmsd_ali_tmp, n_ali_tmp, n_ali8_tmp, xlen, ylen, sequence, - Lnorm_ass, d0_scale, I_opt, a_opt, u_opt, d_opt, mol_type); + TM_ali_tmp, rmsd_ali_tmp, n_ali_tmp, n_ali8_tmp, xlen, ylen, + sequence, Lnorm_ass, d0_scale, i_opt==3, a_opt, u_opt, d_opt, + mol_type, 1, invmap_tmp); + + if (n_ali8_tmp==0) + { + cerr<<"WARNING! 
zero aligned residue in iteration "<<iter<<endl; + if (xlen>=ylen) seqxA_tmp=(string)(seqx); + if (xlen<=ylen) seqyA_tmp=(string)(seqy); + if (xlen<ylen) + { + seqxA_tmp.clear(); + for (i1=0;i1<(int)((ylen-xlen)/2);i1++) seqxA_tmp+='-'; + seqxA_tmp+=(string)(seqx); + for (i1=seqxA_tmp.size();i1<ylen;i1++) seqxA_tmp+='-'; + } + if (xlen>ylen) + { + seqyA_tmp.clear(); + for (i1=0;i1<(int)((xlen-ylen)/2);i1++) seqyA_tmp+='-'; + seqyA_tmp+=(string)(seqy); + for (i1=seqyA_tmp.size();i1<xlen;i1++) seqyA_tmp+='-'; + } + + parse_alignment_into_invmap(seqxA_tmp, seqyA_tmp, xlen, ylen, invmap_tmp); + + Kabsch_Superpose(r1, r2, xt, xa, ya, xlen, ylen, invmap_tmp, + L_ali, t, u, mol_type); + + se_main(xt, ya, seqx, seqy, TM1_tmp, TM2_tmp, TM3_tmp, TM4_tmp, + TM5_tmp, d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM_tmp, seqxA_tmp, seqyA_tmp, rmsd0_tmp, L_ali_tmp, Liden_tmp, + TM_ali_tmp, rmsd_ali_tmp, n_ali_tmp, n_ali8_tmp, xlen, ylen, + sequence, Lnorm_ass, d0_scale, i_opt==3, a_opt, u_opt, d_opt, + mol_type, 1, invmap_tmp); + } /* accept new alignment */ if (TM1_tmp>TM1 && TM2_tmp>TM2) @@ -167,6 +194,7 @@ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, seqxA =seqxA_tmp; seqM =seqM_tmp; seqyA =seqyA_tmp; + for (j=0; j<ylen; j++) invmap[j]=invmap_tmp[j]; rmsd0 =rmsd0_tmp; Liden =Liden_tmp; @@ -174,7 +202,7 @@ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, n_ali8=n_ali8_tmp; /* user specified initial alignment parameters */ - if ((i_opt || I_opt) && L_ali==-1) + if (i_opt && L_ali==-1) { L_ali=L_ali_tmp; TM_ali=TM_ali_tmp; @@ -184,8 +212,20 @@ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, else { if (iter>=2) break; - seqxA_tmp = seqxA; - seqyA_tmp = seqyA; + seqxA_tmp = seqxA; + seqyA_tmp = seqyA; + for (j=0; j<ylen; j++) invmap_tmp[j]=invmap[j]; + rmsd0_tmp = 0; + Liden_tmp = 0; + n_ali_tmp = 0; + n_ali8_tmp = 0; + } + + if (iter>=2 && early_opt>0) + { + cur_TM=(TM1+TM2)/2; + if 
(cur_TM-max_TM<early_opt) break; + max_TM=cur_TM; } } @@ -195,7 +235,7 @@ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, seqxA_tmp.clear(); seqM_tmp.clear(); seqyA_tmp.clear(); - delete [] invmap; + delete [] invmap_tmp; DeleteArray(&xt, xlen); DeleteArray(&r1, minlen); DeleteArray(&r2, minlen); diff --git a/modules/bindings/src/tmalign/Kabsch.h b/modules/bindings/src/tmalign/Kabsch.h index a4c5d6f7e549eca28189c8bb534042ef62f80609..a12296d45b4e92cee5e7898e205f9b255b499a0c 100644 --- a/modules/bindings/src/tmalign/Kabsch.h +++ b/modules/bindings/src/tmalign/Kabsch.h @@ -26,7 +26,7 @@ bool Kabsch(double **x, double **y, int n, int mode, double *rms, int a_failed = 0, b_failed = 0; double epsilon = 0.00000001; - //initializtation + //initialization *rms = 0; rms1 = 0; e0 = 0; @@ -99,7 +99,7 @@ bool Kabsch(double **x, double **y, int n, int mode, double *rms, r[j][2] = sz[j] - s1[2] * s2[j] / n; } - //compute determinat of matrix r + //compute determinant of matrix r det = r[0][0] * (r[1][1] * r[2][2] - r[1][2] * r[2][1])\ - r[0][1] * (r[1][0] * r[2][2] - r[1][2] * r[2][0])\ + r[0][2] * (r[1][0] * r[2][1] - r[1][1] * r[2][0]); diff --git a/modules/bindings/src/tmalign/MMalign.cpp b/modules/bindings/src/tmalign/MMalign.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6cc485647ec95d9f6c581148fa9dd059981d6518 --- /dev/null +++ b/modules/bindings/src/tmalign/MMalign.cpp @@ -0,0 +1,680 @@ +/* command line argument parsing and document of MMalign main program */ + +#include "MMalign.h" + +using namespace std; + +void print_version() +{ + cout << +"\n" +" **********************************************************************\n" +" * MM-align (Version 20200519): complex structure alignment *\n" +" * References: S Mukherjee, Y Zhang. 
Nucl Acids Res 37(11):e83 (2009) *\n" +" * Please email comments and suggestions to yangzhanglab@umich.edu *\n" +" **********************************************************************" + << endl; +} + +void print_extra_help() +{ + cout << +"Additional options:\n" +" -fast Fast but slightly inaccurate alignment\n" +"\n" +" -dir1 Use a list of PDB chains listed by 'chain1_list' under\n" +" 'chain1_folder' as all chains for the first complex.\n" +" Note that the slash is necessary.\n" +" $ MMalign -dir1 chain1_folder/ chain1_list complex2\n" +"\n" +" -dir2 Use a list of PDB chains listed by'chain2_list'\n" +" under 'chain2_folder' as all chains for the second complex.\n" +" $ MMalign complex1 -dir2 chain2_folder/ chain2_list\n" +"\n" +" -suffix (Only when -dir1 and/or -dir2 are set, default is empty)\n" +" add file name suffix to files listed by chain1_list or chain2_list\n" +"\n" +" -atom 4-character atom name used to represent a residue.\n" +" Default is \" C3'\" for RNA/DNA and \" CA \" for proteins\n" +" (note the spaces before and after CA).\n" +"\n" +" -mol Types of molecules to align\n""Molecule type: RNA or protein\n" +" auto : (default) align both proteins and nucleic acids\n" +" protein: only align proteins\n" +" RNA : only align nucleic acids (RNA and DNA)\n" +"\n" +" -split Whether to split PDB file into multiple chains\n" +" 2: (default) treat each chain as a seperate chain (-ter should be <=1)\n" +" 1: treat each MODEL as a separate chain (-ter should be 0)\n" +" and joins all chains in a MODEL into a single chain.\n" +"\n" +" -outfmt Output format\n" +" 0: (default) full output\n" +" 1: fasta format compact output\n" +" 2: tabular format very compact output\n" +" -1: full output, but without version or citation information\n" +"\n" +" -TMcut -1: (default) do not consider TMcut\n" +" Values in [0.5,1): Do not proceed with TM-align for this\n" +" structure pair if TM-score is unlikely to reach TMcut.\n" +" TMcut is normalized is set by -a option:\n" +" 
-2: normalized by longer structure length\n" +" -1: normalized by shorter structure length\n" +" 0: (default, same as F) normalized by second structure\n" +" 1: same as T, normalized by average structure length\n" +"\n" +" -mirror Whether to align the mirror image of input structure\n" +" 0: (default) do not align mirrored structure\n" +" 1: align mirror of chain1 to origin chain2\n" +"\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" +" -infmt1 Input format for complex1\n" +" -infmt2 Input format for complex2\n" +" -1: (default) automatically detect PDB or PDBx/mmCIF format\n" +" 0: PDB format\n" +" 1: SPICKER format\n" +" 2: xyz format\n" +" 3: PDBx/mmCIF format\n" + <<endl; +} + +void print_help(bool h_opt=false) +{ + print_version(); + cout << +"\n" +"Usage: MMalign complex1.pdb complex2.pdb [Options]\n" +"\n" +"Options:\n" +" -a TM-score normalized by the average length of two structures\n" +" T or F, (default F)\n" +"\n" +" -m Output MM-align rotation matrix\n" +"\n" +" -d TM-score scaled by an assigned d0, e.g. 5 Angstroms\n" +"\n" +" -o Output the superposition of complex1.pdb to MM_sup.pdb\n" +" $ MMalign complex1.pdb complex2.pdb -o MM_sup.pdb\n" +" To view superposed full-atom structures:\n" +" $ pymol MM_sup.pdb complex2.pdb\n" +"\n" +" -full Whether to show full alignment result, including alignment of\n" +" individual chains. 
T or F, (default F)\n" +"\n" +" -ter Whether to read all MODELs in a multi-model structure file\n" +" 1: (default) only read the first model, recommended for alignment\n" +" of asymetric units.\n" +" 0: read all MODEL, recomended for alignment of biological\n" +" assemblies, i.e., biological units (biounits).\n" +"\n" +" -v Print the version of MM-align\n" +"\n" +" -h Print the full help message\n" +"\n" +" (Options -a, -d, -m, -o won't change the final structure alignment)\n\n" +"Example usages:\n" +" MMalign complex1.pdb complex2.pdb\n" +" MMalign complex1.pdb complex2.pdb -d 5.0\n" +" MMalign complex1.pdb complex2.pdb -a T -o complex1.sup\n" +" MMalign complex1.pdb complex2.pdb -m matrix.txt\n" + <<endl; + + if (h_opt) print_extra_help(); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) print_help(); + + + clock_t t1, t2; + t1 = clock(); + + /**********************/ + /* get argument */ + /**********************/ + string xname = ""; + string yname = ""; + string fname_super = ""; // file name for superposed structure + string fname_lign = ""; // file name for user alignment + string fname_matrix= ""; // file name for output matrix + vector<string> sequence; // get value from alignment file + double d0_scale =0; + + bool h_opt = false; // print full help message + bool v_opt = false; // print version + bool m_opt = false; // flag for -m, output rotation matrix + bool o_opt = false; // flag for -o, output superposed structure + int a_opt = 0; // flag for -a, do not normalized by average length + bool d_opt = false; // flag for -d, user specified d0 + + bool full_opt = false;// do not show chain level alignment + double TMcut =-1; + int infmt1_opt=-1; // PDB or PDBx/mmCIF format for chain_1 + int infmt2_opt=-1; // PDB or PDBx/mmCIF format for chain_2 + int ter_opt =1; // ENDMDL or END + int split_opt =2; // split by chain + int outfmt_opt=0; // set -outfmt to full output + bool fast_opt =false; // flags for -fast, fTM-align 
algorithm + int mirror_opt=0; // do not align mirror + int het_opt =0; // do not read HETATM residues + string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA + string mol_opt ="auto";// auto-detect the molecule type as protein/RNA + string suffix_opt=""; // set -suffix to empty + string dir1_opt =""; // set -dir1 to empty + string dir2_opt =""; // set -dir2 to empty + vector<string> chain1_list; // only when -dir1 is set + vector<string> chain2_list; // only when -dir2 is set + + for(int i = 1; i < argc; i++) + { + if ( !strcmp(argv[i],"-o") && i < (argc-1) ) + { + fname_super = argv[i + 1]; o_opt = true; i++; + } + else if ( !strcmp(argv[i],"-a") && i < (argc-1) ) + { + if (!strcmp(argv[i + 1], "T")) a_opt=true; + else if (!strcmp(argv[i + 1], "F")) a_opt=false; + else + { + a_opt=atoi(argv[i + 1]); + if (a_opt!=-2 && a_opt!=-1 && a_opt!=1) + PrintErrorAndQuit("-a must be -2, -1, 1, T or F"); + } + i++; + } + else if ( !strcmp(argv[i],"-full") && i < (argc-1) ) + { + if (!strcmp(argv[i + 1], "T")) full_opt=true; + else if (!strcmp(argv[i + 1], "F")) full_opt=false; + else PrintErrorAndQuit("-full must be T or F"); + i++; + } + else if ( !strcmp(argv[i],"-d") && i < (argc-1) ) + { + d0_scale = atof(argv[i + 1]); d_opt = true; i++; + } + else if ( !strcmp(argv[i],"-v") ) + { + v_opt = true; + } + else if ( !strcmp(argv[i],"-h") ) + { + h_opt = true; + } + else if (!strcmp(argv[i], "-m") && i < (argc-1) ) + { + fname_matrix = argv[i + 1]; m_opt = true; i++; + }// get filename for rotation matrix + else if (!strcmp(argv[i], "-fast")) + { + fast_opt = true; + } + else if ( !strcmp(argv[i],"-infmt1") && i < (argc-1) ) + { + infmt1_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-infmt2") && i < (argc-1) ) + { + infmt2_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-ter") && i < (argc-1) ) + { + ter_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-split") && i < (argc-1) ) + { + split_opt=atoi(argv[i + 1]); i++; + 
} + else if ( !strcmp(argv[i],"-atom") && i < (argc-1) ) + { + atom_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-mol") && i < (argc-1) ) + { + mol_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir1") && i < (argc-1) ) + { + dir1_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir2") && i < (argc-1) ) + { + dir2_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-suffix") && i < (argc-1) ) + { + suffix_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-outfmt") && i < (argc-1) ) + { + outfmt_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-TMcut") && i < (argc-1) ) + { + TMcut=atof(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } + else if (xname.size() == 0) xname=argv[i]; + else if (yname.size() == 0) yname=argv[i]; + else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); + } + + if(yname.size()==0) + { + if (h_opt) print_help(h_opt); + if (v_opt) + { + print_version(); + exit(EXIT_FAILURE); + } + if (xname.size()==0) + PrintErrorAndQuit("Please provide input structures"); + PrintErrorAndQuit("Please provide the second input structure"); + } + + if (suffix_opt.size() && dir1_opt.size()+dir2_opt.size()==0) + PrintErrorAndQuit("-suffix is only valid if -dir1 or -dir2 is set"); + if ((dir1_opt.size() || dir2_opt.size()) && (m_opt || o_opt)) + PrintErrorAndQuit("-m or -o cannot be set with -dir1 or -dir2"); + if (atom_opt.size()!=4) + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); + if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") + PrintErrorAndQuit("ERROR! Molecule type must be either RNA or protein."); + else if (mol_opt=="protein" && atom_opt=="auto") + atom_opt=" CA "; + else if (mol_opt=="RNA" && atom_opt=="auto") + atom_opt=" C3'"; + + if (d_opt && d0_scale<=0) + PrintErrorAndQuit("Wrong value for option -d! 
It should be >0"); + if (outfmt_opt>=2 && (a_opt || d_opt)) + PrintErrorAndQuit("-outfmt 2 cannot be used with -a, -d"); + if (ter_opt!=0 && ter_opt!=1) + PrintErrorAndQuit("-ter should be 1 or 0"); + if (split_opt!=1 && split_opt!=2) + PrintErrorAndQuit("-split should be 1 or 2"); + else if (split_opt==1 && ter_opt!=0) + PrintErrorAndQuit("-split 1 should be used with -ter 0"); + + if (m_opt && fname_matrix == "") // Output rotation matrix: matrix.txt + PrintErrorAndQuit("ERROR! Please provide a file name for option -m!"); + + /* parse file list */ + if (dir1_opt.size()==0) chain1_list.push_back(xname); + else file2chainlist(chain1_list, xname, dir1_opt, suffix_opt); + + if (dir2_opt.size()==0) chain2_list.push_back(yname); + else file2chainlist(chain2_list, yname, dir2_opt, suffix_opt); + + if (outfmt_opt==2) + cout<<"#PDBchain1\tPDBchain2\tTM1\tTM2\t" + <<"RMSD\tID1\tID2\tIDali\tL1\tL2\tLali"<<endl; + + /* declare previously global variables */ + vector<vector<vector<double> > > xa_vec; // structure of complex1 + vector<vector<vector<double> > > ya_vec; // structure of complex2 + vector<vector<char> >seqx_vec; // sequence of complex1 + vector<vector<char> >seqy_vec; // sequence of complex2 + vector<vector<char> >secx_vec; // secondary structure of complex1 + vector<vector<char> >secy_vec; // secondary structure of complex2 + vector<int> mol_vec1; // molecule type of complex1, RNA if >0 + vector<int> mol_vec2; // molecule type of complex2, RNA if >0 + vector<string> chainID_list1; // list of chainID1 + vector<string> chainID_list2; // list of chainID2 + vector<int> xlen_vec; // length of complex1 + vector<int> ylen_vec; // length of complex2 + int i,j; // chain index + int xlen, ylen; // chain length + double **xa, **ya; // structure of single chain + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int xlen_aa,ylen_aa; // total length of protein + int xlen_na,ylen_na; // total length of RNA/DNA + vector<string> 
resi_vec1; // residue index for chain1 + vector<string> resi_vec2; // residue index for chain2 + + /* parse complex */ + parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, + xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, + atom_opt, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, resi_vec1); + if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 1"); + parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, + ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, + atom_opt, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); + if (ya_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 2"); + int len_aa=getmin(xlen_aa,ylen_aa); + int len_na=getmin(xlen_na,ylen_na); + if (a_opt) + { + len_aa=(xlen_aa+ylen_aa)/2; + len_na=(xlen_na+ylen_na)/2; + } + + /* perform monomer alignment if there is only one chain */ + if (xa_vec.size()==1 && ya_vec.size()==1) + { + xlen = xlen_vec[0]; + ylen = ylen_vec[0]; + seqx = new char[xlen+1]; + seqy = new char[ylen+1]; + secx = new char[xlen+1]; + secy = new char[ylen+1]; + NewArray(&xa, xlen, 3); + NewArray(&ya, ylen, 3); + copy_chain_data(xa_vec[0],seqx_vec[0],secx_vec[0], xlen,xa,seqx,secx); + copy_chain_data(ya_vec[0],seqy_vec[0],secy_vec[0], ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, 
n_ali, n_ali8, + xlen, ylen, sequence, 0, d0_scale, + 0, a_opt, false, d_opt, fast_opt, + mol_vec1[0]+mol_vec2[0],TMcut); + + /* print result */ + output_results( + xname.substr(dir1_opt.size()), + yname.substr(dir2_opt.size()), + chainID_list1[0], chainID_list2[0], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, + 0, d0_scale, d0a, d0u, (m_opt?fname_matrix:"").c_str(), + outfmt_opt, ter_opt, true, split_opt, o_opt, fname_super, + 0, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete[]seqx; + delete[]seqy; + delete[]secx; + delete[]secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + chain1_list.clear(); + chain2_list.clear(); + sequence.clear(); + + vector<vector<vector<double> > >().swap(xa_vec); // structure of complex1 + vector<vector<vector<double> > >().swap(ya_vec); // structure of complex2 + vector<vector<char> >().swap(seqx_vec); // sequence of complex1 + vector<vector<char> >().swap(seqy_vec); // sequence of complex2 + vector<vector<char> >().swap(secx_vec); // secondary structure of complex1 + vector<vector<char> >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + chainID_list1.clear(); // list of chainID1 + chainID_list2.clear(); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + + t2 = clock(); + float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; + printf("Total CPU time is %5.2f seconds\n", diff); + return 0; + } + + /* declare TM-score tables */ + int chain1_num=xa_vec.size(); + int chain2_num=ya_vec.size(); + double **TM1_mat; + double **TM2_mat; + double **TMave_mat; + double **ut_mat; // rotation matrices for all-against-all alignment + int ui,uj,ut_idx; + 
NewArray(&TM1_mat,chain1_num,chain2_num); + NewArray(&TM2_mat,chain1_num,chain2_num); + NewArray(&TMave_mat,chain1_num,chain2_num); + NewArray(&ut_mat,chain1_num*chain2_num,4*3); + vector<string> tmp_str_vec(chain2_num,""); + vector<vector<string> >seqxA_mat(chain1_num,tmp_str_vec); + vector<vector<string> > seqM_mat(chain1_num,tmp_str_vec); + vector<vector<string> >seqyA_mat(chain1_num,tmp_str_vec); + tmp_str_vec.clear(); + + /* get all-against-all alignment */ + for (i=0;i<chain1_num;i++) + { + xlen=xlen_vec[i]; + if (xlen<3) + { + for (j=0;j<chain2_num;j++) + TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + continue; + } + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + for (j=0;j<chain2_num;j++) + { + ut_idx=i*chain2_num+j; + for (ui=0;ui<4;ui++) + for (uj=0;uj<3;uj++) ut_mat[ut_idx][ui*3+uj]=0; + ut_mat[ut_idx][0]=1; + ut_mat[ut_idx][4]=1; + ut_mat[ut_idx][8]=1; + + if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment + { + TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + continue; + } + + ylen=ylen_vec[j]; + if (ylen<3) + { + TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + int Lnorm_tmp=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_tmp=len_na; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, 
seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + 0, false, true, false, true, + mol_vec1[i]+mol_vec2[j],TMcut); + + /* store result */ + for (ui=0;ui<3;ui++) + for (uj=0;uj<3;uj++) ut_mat[ut_idx][ui*3+uj]=u0[ui][uj]; + for (uj=0;uj<3;uj++) ut_mat[ut_idx][9+uj]=t0[uj]; + TM1_mat[i][j]=TM2; // normalized by chain1 + TM2_mat[i][j]=TM1; // normalized by chain2 + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + TMave_mat[i][j]=TM4*Lnorm_tmp; + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + } + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + + /* calculate initial chain-chain assignment */ + int *assign1_list; // value is index of assigned chain2 + int *assign2_list; // value is index of assigned chain1 + assign1_list=new int[chain1_num]; + assign2_list=new int[chain2_num]; + double total_score=enhanced_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num); + if (total_score<=0) PrintErrorAndQuit("ERROR! 
No assignable chain"); + + /* refine alignment for large oligomers */ + int aln_chain_num=0; + for (i=0;i<chain1_num;i++) aln_chain_num+=(assign1_list[i]>=0); + bool is_oligomer=(aln_chain_num>=3); + if (aln_chain_num==2) // dimer alignment + { + int na_chain_num1,na_chain_num2,aa_chain_num1,aa_chain_num2; + count_na_aa_chain_num(na_chain_num1,aa_chain_num1,mol_vec1); + count_na_aa_chain_num(na_chain_num2,aa_chain_num2,mol_vec2); + + /* align protein-RNA hybrid dimer to another hybrid dimer */ + if (na_chain_num1==1 && na_chain_num2==1 && + aa_chain_num1==1 && aa_chain_num2==1) is_oligomer=false; + /* align pure protein dimer or pure RNA dimer */ + else if ((getmin(na_chain_num1,na_chain_num2)==0 && + aa_chain_num1==2 && aa_chain_num2==2) || + (getmin(aa_chain_num1,aa_chain_num2)==0 && + na_chain_num1==2 && na_chain_num2==2)) + { + adjust_dimer_assignment(xa_vec,ya_vec,xlen_vec,ylen_vec,mol_vec1, + mol_vec2,assign1_list,assign2_list,seqxA_mat,seqyA_mat); + is_oligomer=false; // cannot refiner further + } + else is_oligomer=true; /* align oligomers to dimer */ + } + + if (aln_chain_num>=3 || is_oligomer) // oligomer alignment + { + /* extract centroid coordinates */ + double **xcentroids; + double **ycentroids; + NewArray(&xcentroids, chain1_num, 3); + NewArray(&ycentroids, chain2_num, 3); + double d0MM=getmin( + calculate_centroids(xa_vec, chain1_num, xcentroids), + calculate_centroids(ya_vec, chain2_num, ycentroids)); + + /* refine enhanced greedy search with centroid superposition */ + //double het_deg=check_heterooligomer(TMave_mat, chain1_num, chain2_num); + homo_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na, ut_mat); + hetero_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na); + + /* clean up */ + DeleteArray(&xcentroids, chain1_num); + DeleteArray(&ycentroids, chain2_num); + } + if 
(len_aa+len_na>1000) fast_opt=true; + + /* perform iterative alignment */ + for (int iter=0;iter<1;iter++) + { + total_score=MMalign_search(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TM1_mat, TM2_mat, TMave_mat, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, true); + total_score=enhanced_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num); + if (total_score<=0) PrintErrorAndQuit("ERROR! No assignable chain"); + } + + /* final alignment */ + if (outfmt_opt==0) print_version(); + MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TM1_mat, TM2_mat, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, m_opt, o_opt, outfmt_opt, ter_opt, split_opt, + a_opt, d_opt, fast_opt, full_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up everything */ + delete [] assign1_list; + delete [] assign2_list; + DeleteArray(&TM1_mat, chain1_num); + DeleteArray(&TM2_mat, chain1_num); + DeleteArray(&TMave_mat,chain1_num); + DeleteArray(&ut_mat, chain1_num*chain2_num); + vector<vector<string> >().swap(seqxA_mat); + vector<vector<string> >().swap(seqM_mat); + vector<vector<string> >().swap(seqyA_mat); + + vector<vector<vector<double> > >().swap(xa_vec); // structure of complex1 + vector<vector<vector<double> > >().swap(ya_vec); // structure of complex2 + vector<vector<char> >().swap(seqx_vec); // sequence of complex1 + vector<vector<char> >().swap(seqy_vec); // sequence of complex2 + vector<vector<char> >().swap(secx_vec); // secondary structure of complex1 + vector<vector<char> 
>().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + vector<string>().swap(chainID_list1); // list of chainID1 + vector<string>().swap(chainID_list2); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + vector<string>().swap(chain1_list); + vector<string>().swap(chain2_list); + vector<string>().swap(sequence); + + t2 = clock(); + float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; + printf("Total CPU time is %5.2f seconds\n", diff); + return 0; +} diff --git a/modules/bindings/src/tmalign/MMalign.h b/modules/bindings/src/tmalign/MMalign.h new file mode 100644 index 0000000000000000000000000000000000000000..af9920a8cdc8087982310c94dfba08378f46b2d7 --- /dev/null +++ b/modules/bindings/src/tmalign/MMalign.h @@ -0,0 +1,1194 @@ +#include "se.h" + +/* count the number of nucleic acid chains (na_chain_num) and + * protein chains (aa_chain_num) in a complex */ +int count_na_aa_chain_num(int &na_chain_num,int &aa_chain_num, + const vector<int>&mol_vec) +{ + na_chain_num=0; + aa_chain_num=0; + for (size_t i=0;i<mol_vec.size();i++) + { + if (mol_vec[i]>0) na_chain_num++; + else aa_chain_num++; + } + return na_chain_num+aa_chain_num; +} + +/* adjust chain assignment for dimer-dimer alignment + * return true if assignment is adjusted */ +bool adjust_dimer_assignment( + const vector<vector<vector<double> > >&xa_vec, + const vector<vector<vector<double> > >&ya_vec, + const vector<int>&xlen_vec, const vector<int>&ylen_vec, + const vector<int>&mol_vec1, const vector<int>&mol_vec2, + int *assign1_list, int *assign2_list, + const vector<vector<string> >&seqxA_mat, + const vector<vector<string> >&seqyA_mat) +{ + /* check currently assigned chains */ + int i1,i2,j1,j2; + i1=i2=j1=j2=-1; + int chain1_num=xa_vec.size(); + int i,j; + for (i=0;i<chain1_num;i++) + { + if (assign1_list[i]>=0) + { + if (i1<0) + { + 
i1=i; + j1=assign1_list[i1]; + } + else + { + i2=i; + j2=assign1_list[i2]; + } + } + } + + /* normalize d0 by L */ + int xlen=xlen_vec[i1]+xlen_vec[i2]; + int ylen=ylen_vec[j1]+ylen_vec[j2]; + int mol_type=mol_vec1[i1]+mol_vec1[i2]+ + mol_vec2[j1]+mol_vec2[j2]; + double D0_MIN, d0, d0_search; + double Lnorm=getmin(xlen,ylen); + parameter_set4final(getmin(xlen,ylen), D0_MIN, Lnorm, d0, + d0_search, mol_type); + + double **xa,**ya, **xt; + NewArray(&xa, xlen, 3); + NewArray(&ya, ylen, 3); + NewArray(&xt, xlen, 3); + + double RMSD = 0; + double dd = 0; + double t[3]; + double u[3][3]; + size_t L_ali=0; // index of residue in aligned region + size_t r=0; // index of residue in full alignment + + /* total score using current assignment */ + L_ali=0; + i=j=-1; + for (r=0;r<seqxA_mat[i1][j1].size();r++) + { + i+=(seqxA_mat[i1][j1][r]!='-'); + j+=(seqyA_mat[i1][j1][r]!='-'); + if (seqxA_mat[i1][j1][r]=='-' || seqyA_mat[i1][j1][r]=='-') continue; + xa[L_ali][0]=xa_vec[i1][i][0]; + xa[L_ali][1]=xa_vec[i1][i][1]; + xa[L_ali][2]=xa_vec[i1][i][2]; + ya[L_ali][0]=ya_vec[j1][j][0]; + ya[L_ali][1]=ya_vec[j1][j][1]; + ya[L_ali][2]=ya_vec[j1][j][2]; + L_ali++; + } + i=j=-1; + for (r=0;r<seqxA_mat[i2][j2].size();r++) + { + i+=(seqxA_mat[i2][j2][r]!='-'); + j+=(seqyA_mat[i2][j2][r]!='-'); + if (seqxA_mat[i2][j2][r]=='-' || seqyA_mat[i2][j2][r]=='-') continue; + xa[L_ali][0]=xa_vec[i2][i][0]; + xa[L_ali][1]=xa_vec[i2][i][1]; + xa[L_ali][2]=xa_vec[i2][i][2]; + ya[L_ali][0]=ya_vec[j2][j][0]; + ya[L_ali][1]=ya_vec[j2][j][1]; + ya[L_ali][2]=ya_vec[j2][j][2]; + L_ali++; + } + + Kabsch(xa, ya, L_ali, 1, &RMSD, t, u); + do_rotation(xa, xt, L_ali, t, u); + + double total_score1=0; + for (r=0;r<L_ali;r++) + { + dd=dist(xt[r],ya[r]); + total_score1+=1/(1+dd/d0*d0); + } + total_score1/=Lnorm; + + /* total score using reversed assignment */ + L_ali=0; + i=j=-1; + for (r=0;r<seqxA_mat[i1][j2].size();r++) + { + i+=(seqxA_mat[i1][j2][r]!='-'); + j+=(seqyA_mat[i1][j2][r]!='-'); + if 
(seqxA_mat[i1][j2][r]=='-' || seqyA_mat[i1][j2][r]=='-') continue; + xa[L_ali][0]=xa_vec[i1][i][0]; + xa[L_ali][1]=xa_vec[i1][i][1]; + xa[L_ali][2]=xa_vec[i1][i][2]; + ya[L_ali][0]=ya_vec[j2][j][0]; + ya[L_ali][1]=ya_vec[j2][j][1]; + ya[L_ali][2]=ya_vec[j2][j][2]; + L_ali++; + } + i=j=-1; + for (r=0;r<seqxA_mat[i2][j1].size();r++) + { + i+=(seqxA_mat[i2][j1][r]!='-'); + j+=(seqyA_mat[i2][j1][r]!='-'); + if (seqxA_mat[i2][j1][r]=='-' || seqyA_mat[i2][j1][r]=='-') continue; + xa[L_ali][0]=xa_vec[i2][i][0]; + xa[L_ali][1]=xa_vec[i2][i][1]; + xa[L_ali][2]=xa_vec[i2][i][2]; + ya[L_ali][0]=ya_vec[j1][j][0]; + ya[L_ali][1]=ya_vec[j1][j][1]; + ya[L_ali][2]=ya_vec[j1][j][2]; + L_ali++; + } + + Kabsch(xa, ya, L_ali, 1, &RMSD, t, u); + do_rotation(xa, xt, L_ali, t, u); + + double total_score2=0; + for (r=0;r<L_ali;r++) + { + dd=dist(xt[r],ya[r]); + total_score2+=1/(1+dd/d0*d0); + } + total_score2/=Lnorm; + + /* swap chain assignment */ + if (total_score1<total_score2) + { + assign1_list[i1]=j2; + assign1_list[i2]=j1; + assign2_list[j1]=i2; + assign2_list[j2]=i1; + } + + /* clean up */ + DeleteArray(&xa, xlen); + DeleteArray(&ya, ylen); + DeleteArray(&xt, xlen); + return total_score1<total_score2; +} + +/* assign chain-chain correspondence */ +double enhanced_greedy_search(double **TMave_mat,int *assign1_list, + int *assign2_list, const int chain1_num, const int chain2_num) +{ + double total_score=0; + double tmp_score=0; + int i,j; + int maxi=0; + int maxj=0; + + /* initialize parameters */ + for (i=0;i<chain1_num;i++) assign1_list[i]=-1; + for (j=0;j<chain2_num;j++) assign2_list[j]=-1; + + /* greedy assignment: in each iteration, the highest chain pair is + * assigned, until no assignable chain is left */ + while(1) + { + tmp_score=-1; + for (i=0;i<chain1_num;i++) + { + if (assign1_list[i]>=0) continue; + for (j=0;j<chain2_num;j++) + { + if (assign2_list[j]>=0 || TMave_mat[i][j]<=0) continue; + if (TMave_mat[i][j]>tmp_score) + { + maxi=i; + maxj=j; + 
tmp_score=TMave_mat[i][j]; + } + } + } + if (tmp_score<=0) break; // error: no assignable chain + assign1_list[maxi]=maxj; + assign2_list[maxj]=maxi; + total_score+=tmp_score; + } + if (total_score<=0) return total_score; // error: no assignable chain + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; + + /* iterative refinemnt */ + double delta_score; + int *assign1_tmp=new int [chain1_num]; + int *assign2_tmp=new int [chain2_num]; + for (i=0;i<chain1_num;i++) assign1_tmp[i]=assign1_list[i]; + for (j=0;j<chain2_num;j++) assign2_tmp[j]=assign2_list[j]; + int old_i=-1; + int old_j=-1; + + for (int iter=0;iter<getmin(chain1_num,chain2_num)*5;iter++) + { + delta_score=-1; + for (i=0;i<chain1_num;i++) + { + old_j=assign1_list[i]; + for (j=0;j<chain2_num;j++) + { + // attempt to swap (i,old_j=assign1_list[i]) with (i,j) + if (j==assign1_list[i] || TMave_mat[i][j]<=0) continue; + old_i=assign2_list[j]; + + assign1_tmp[i]=j; + if (old_i>=0) assign1_tmp[old_i]=old_j; + assign2_tmp[j]=i; + if (old_j>=0) assign2_tmp[old_j]=old_i; + + delta_score=TMave_mat[i][j]; + if (old_j>=0) delta_score-=TMave_mat[i][old_j]; + if (old_i>=0) delta_score-=TMave_mat[old_i][j]; + if (old_i>=0 && old_j>=0) delta_score+=TMave_mat[old_i][old_j]; + + if (delta_score>0) // successful swap + { + assign1_list[i]=j; + if (old_i>=0) assign1_list[old_i]=old_j; + assign2_list[j]=i; + if (old_j>=0) assign2_list[old_j]=old_i; + total_score+=delta_score; + break; + } + else + { + assign1_tmp[i]=assign1_list[i]; + if (old_i>=0) assign1_tmp[old_i]=assign1_list[old_i]; + assign2_tmp[j]=assign2_list[j]; + if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; + } + } + if (delta_score>0) break; + } + if (delta_score<=0) break; // cannot swap any chain pair + } + + /* clean up */ + delete[]assign1_tmp; + delete[]assign2_tmp; + return total_score; +} + 
+double calculate_centroids(const vector<vector<vector<double> > >&a_vec, + const int chain_num, double ** centroids) +{ + int L=0; + int c,r; // index of chain and residue + for (c=0; c<chain_num; c++) + { + centroids[c][0]=0; + centroids[c][1]=0; + centroids[c][2]=0; + L=a_vec[c].size(); + for (r=0; r<L; r++) + { + centroids[c][0]+=a_vec[c][r][0]; + centroids[c][1]+=a_vec[c][r][1]; + centroids[c][2]+=a_vec[c][r][2]; + } + centroids[c][0]/=L; + centroids[c][1]/=L; + centroids[c][2]/=L; + //cout<<centroids[c][0]<<'\t' + //<<centroids[c][1]<<'\t' + //<<centroids[c][2]<<endl; + } + + vector<double> d0_vec(chain_num,-1); + int c2=0; + double d0MM=0; + for (c=0; c<chain_num; c++) + { + for (c2=0; c2<chain_num; c2++) + { + if (c2==c) continue; + d0MM=sqrt(dist(centroids[c],centroids[c2])); + if (d0_vec[c]<=0) d0_vec[c]=d0MM; + else d0_vec[c]=getmin(d0_vec[c], d0MM); + } + } + d0MM=0; + for (c=0; c<chain_num; c++) d0MM+=d0_vec[c]; + d0MM/=chain_num; + d0_vec.clear(); + //cout<<d0MM<<endl; + return d0MM; +} + +/* calculate MMscore of aligned chains + * MMscore = sum(TMave_mat[i][j]) * sum(1/(1+dij^2/d0MM^2)) + * / (L* getmin(chain1_num,chain2_num)) + * dij is the centroid distance between chain pair i and j + * d0MM is scaling factor. 
TMave_mat[i][j] is the TM-score between + * chain pair i and j multiple by getmin(Li*Lj) */ +double calMMscore(double **TMave_mat,int *assign1_list, + const int chain1_num, const int chain2_num, double **xcentroids, + double **ycentroids, const double d0MM, double **r1, double **r2, + double **xt, double t[3], double u[3][3], const int L) +{ + int Nali=0; // number of aligned chain + int i,j; + double MMscore=0; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + + r1[Nali][0]=xcentroids[i][0]; + r1[Nali][1]=xcentroids[i][1]; + r1[Nali][2]=xcentroids[i][2]; + + r2[Nali][0]=ycentroids[j][0]; + r2[Nali][1]=ycentroids[j][1]; + r2[Nali][2]=ycentroids[j][2]; + + Nali++; + MMscore+=TMave_mat[i][j]; + } + MMscore/=L; + + double RMSD = 0; + double TMscore=0; + if (Nali>=3) + { + /* Kabsch superposition */ + Kabsch(r1, r2, Nali, 1, &RMSD, t, u); + do_rotation(r1, xt, Nali, t, u); + + /* calculate pseudo-TMscore */ + double dd=0; + for (i=0;i<Nali;i++) + { + dd=dist(xt[i], r2[i]); + TMscore+=1/(1+dd/(d0MM*d0MM)); + } + } + else if (Nali==2) + { + double dd=dist(r1[0],r2[0]); + TMscore=1/(1+dd/(d0MM*d0MM)); + } + else TMscore=1; // only one aligned chain. + TMscore/=getmin(chain1_num,chain2_num); + MMscore*=TMscore; + return MMscore; +} + +/* check if this is alignment of heterooligomer or homooligomer + * return het_deg, which ranges from 0 to 1. 
+ * The larger the value, the more "hetero"; + * Tthe smaller the value, the more "homo" */ +double check_heterooligomer(double **TMave_mat, const int chain1_num, + const int chain2_num) +{ + double het_deg=0; + double min_TM=-1; + double max_TM=-1; + int i,j; + for (i=0;i<chain1_num;i++) + { + for (j=0;j<chain2_num;j++) + { + if (min_TM<0 || TMave_mat[i][j] <min_TM) min_TM=TMave_mat[i][j]; + if (max_TM<0 || TMave_mat[i][j]>=max_TM) max_TM=TMave_mat[i][j]; + } + } + het_deg=(max_TM-min_TM)/max_TM; + //cout<<"min_TM="<<min_TM<<endl; + //cout<<"max_TM="<<max_TM<<endl; + return het_deg; +} + +/* reassign chain-chain correspondence, specific for homooligomer */ +double homo_refined_greedy_search(double **TMave_mat,int *assign1_list, + int *assign2_list, const int chain1_num, const int chain2_num, + double **xcentroids, double **ycentroids, const double d0MM, + const int L, double **ut_mat) +{ + double MMscore_max=0; + double MMscore=0; + int i,j; + int c1,c2; + int max_i=-1; // the chain pair whose monomer u t yields highest MMscore + int max_j=-1; + + int chain_num=getmin(chain1_num,chain2_num); + int *assign1_tmp=new int [chain1_num]; + int *assign2_tmp=new int [chain2_num]; + double **xt; + NewArray(&xt, chain1_num, 3); + double t[3]; + double u[3][3]; + int ui,uj,ut_idx; + double TMscore=0; // pseudo TM-score + double TMsum =0; + double TMnow =0; + double TMmax =0; + double dd=0; + + size_t total_pair=chain1_num*chain2_num; // total pair + double *ut_tmc_mat=new double [total_pair]; // chain level TM-score + vector<pair<double,int> > ut_tm_vec(total_pair,make_pair(0.0,0)); // product of both + + for (c1=0;c1<chain1_num;c1++) + { + for (c2=0;c2<chain2_num;c2++) + { + if (TMave_mat[c1][c2]<=0) continue; + ut_idx=c1*chain2_num+c2; + for (ui=0;ui<3;ui++) + for (uj=0;uj<3;uj++) u[ui][uj]=ut_mat[ut_idx][ui*3+uj]; + for (uj=0;uj<3;uj++) t[uj]=ut_mat[ut_idx][9+uj]; + + do_rotation(xcentroids, xt, chain1_num, t, u); + + for (i=0;i<chain1_num;i++) assign1_tmp[i]=-1; + for 
(j=0;j<chain2_num;j++) assign2_tmp[j]=-1; + + + for (i=0;i<chain1_num;i++) + { + for (j=0;j<chain2_num;j++) + { + ut_idx=i*chain2_num+j; + ut_tmc_mat[ut_idx]=0; + ut_tm_vec[ut_idx].first=-1; + ut_tm_vec[ut_idx].second=ut_idx; + if (TMave_mat[i][j]<=0) continue; + dd=dist(xt[i],ycentroids[j]); + ut_tmc_mat[ut_idx]=1/(1+dd/(d0MM*d0MM)); + ut_tm_vec[ut_idx].first= + ut_tmc_mat[ut_idx]*TMave_mat[i][j]; + //cout<<"TM["<<ut_idx<<"]="<<ut_tm_vec[ut_idx].first<<endl; + } + } + //cout<<"sorting "<<total_pair<<" chain pairs"<<endl; + + /* initial assignment */ + assign1_tmp[c1]=c2; + assign2_tmp[c2]=c1; + TMsum=TMave_mat[c1][c2]; + TMscore=ut_tmc_mat[c1*chain2_num+c2]; + + /* further assignment */ + sort(ut_tm_vec.begin(), ut_tm_vec.end()); // sort in ascending order + for (ut_idx=total_pair-1;ut_idx>=0;ut_idx--) + { + j=ut_tm_vec[ut_idx].second % chain2_num; + i=int(ut_tm_vec[ut_idx].second / chain2_num); + if (TMave_mat[i][j]<=0) break; + if (assign1_tmp[i]>=0 || assign2_tmp[j]>=0) continue; + assign1_tmp[i]=j; + assign2_tmp[j]=i; + TMsum+=TMave_mat[i][j]; + TMscore+=ut_tmc_mat[i*chain2_num+j]; + //cout<<"ut_idx="<<ut_tm_vec[ut_idx].second + //<<"\ti="<<i<<"\tj="<<j<<"\ttm="<<ut_tm_vec[ut_idx].first<<endl; + } + + /* final MMscore */ + MMscore=(TMsum/L)*(TMscore/chain_num); + if (max_i<0 || max_j<0 || MMscore>MMscore_max) + { + max_i=c1; + max_j=c2; + MMscore_max=MMscore; + for (i=0;i<chain1_num;i++) assign1_list[i]=assign1_tmp[i]; + for (j=0;j<chain2_num;j++) assign2_list[j]=assign2_tmp[j]; + //cout<<"TMsum/L="<<TMsum/L<<endl; + //cout<<"TMscore/chain_num="<<TMscore/chain_num<<endl; + //cout<<"MMscore="<<MMscore<<endl; + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) + //cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) + //cout<<assign2_list[j]<<","; cout<<"}"<<endl; + } + } + } + + /* clean up */ + delete[]assign1_tmp; + delete[]assign2_tmp; + delete[]ut_tmc_mat; + ut_tm_vec.clear(); + DeleteArray(&xt, 
chain1_num); + return MMscore; +} + +/* reassign chain-chain correspondence, specific for heterooligomer */ +double hetero_refined_greedy_search(double **TMave_mat,int *assign1_list, + int *assign2_list, const int chain1_num, const int chain2_num, + double **xcentroids, double **ycentroids, const double d0MM, const int L) +{ + double MMscore_old=0; + double MMscore=0; + int i,j; + + double **r1; + double **r2; + double **xt; + int chain_num=getmin(chain1_num,chain2_num); + NewArray(&r1, chain_num, 3); + NewArray(&r2, chain_num, 3); + NewArray(&xt, chain_num, 3); + double t[3]; + double u[3][3]; + + /* calculate MMscore */ + MMscore=MMscore_old=calMMscore(TMave_mat, assign1_list, chain1_num, + chain2_num, xcentroids, ycentroids, d0MM, r1, r2, xt, t, u, L); + //cout<<"MMscore="<<MMscore<<endl; + //cout<<"TMave_mat="<<endl; + //for (i=0;i<chain1_num;i++) + //{ + //for (j=0; j<chain2_num; j++) + //{ + //if (j<chain2_num-1) cout<<TMave_mat[i][j]<<'\t'; + //else cout<<TMave_mat[i][j]<<endl; + //} + //} + + /* iteratively refine chain assignment. 
in each iteration, attempt + * to swap (i,old_j=assign1_list[i]) with (i,j) */ + double delta_score=-1; + int *assign1_tmp=new int [chain1_num]; + int *assign2_tmp=new int [chain2_num]; + for (i=0;i<chain1_num;i++) assign1_tmp[i]=assign1_list[i]; + for (j=0;j<chain2_num;j++) assign2_tmp[j]=assign2_list[j]; + int old_i=-1; + int old_j=-1; + + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; + + for (int iter=0;iter<chain1_num*chain2_num;iter++) + { + delta_score=-1; + for (i=0;i<chain1_num;i++) + { + old_j=assign1_list[i]; + for (j=0;j<chain2_num;j++) + { + if (j==assign1_list[i] || TMave_mat[i][j]<=0) continue; + old_i=assign2_list[j]; + + assign1_tmp[i]=j; + if (old_i>=0) assign1_tmp[old_i]=old_j; + assign2_tmp[j]=i; + if (old_j>=0) assign2_tmp[old_j]=old_i; + + MMscore=calMMscore(TMave_mat, assign1_tmp, chain1_num, + chain2_num, xcentroids, ycentroids, d0MM, + r1, r2, xt, t, u, L); + + //cout<<"(i,j,old_i,old_j,MMscore)=("<<i<<","<<j<<"," + //<<old_i<<","<<old_j<<","<<MMscore<<")"<<endl; + + if (MMscore>MMscore_old) // successful swap + { + assign1_list[i]=j; + if (old_i>=0) assign1_list[old_i]=old_j; + assign2_list[j]=i; + if (old_j>=0) assign2_list[old_j]=old_i; + delta_score=(MMscore-MMscore_old); + MMscore_old=MMscore; + //cout<<"MMscore="<<MMscore<<endl; + break; + } + else + { + assign1_tmp[i]=assign1_list[i]; + if (old_i>=0) assign1_tmp[old_i]=assign1_list[old_i]; + assign2_tmp[j]=assign2_list[j]; + if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; + } + } + } + //cout<<"iter="<<iter<<endl; + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; + if (delta_score<=0) break; // cannot swap any chain pair + } + MMscore=MMscore_old; + 
//cout<<"MMscore="<<MMscore<<endl; + + /* clean up */ + delete[]assign1_tmp; + delete[]assign2_tmp; + DeleteArray(&r1, chain_num); + DeleteArray(&r2, chain_num); + DeleteArray(&xt, chain_num); + return MMscore; +} + +void copy_chain_data(const vector<vector<double> >&a_vec_i, + const vector<char>&seq_vec_i,const vector<char>&sec_vec_i, + const int len,double **a,char *seq,char *sec) +{ + int r; + for (r=0;r<len;r++) + { + a[r][0]=a_vec_i[r][0]; + a[r][1]=a_vec_i[r][1]; + a[r][2]=a_vec_i[r][2]; + seq[r]=seq_vec_i[r]; + sec[r]=sec_vec_i[r]; + } + seq[len]=0; + sec[len]=0; +} + +void parse_chain_list(const vector<string>&chain_list, + vector<vector<vector<double> > >&a_vec, vector<vector<char> >&seq_vec, + vector<vector<char> >&sec_vec, vector<int>&mol_vec, vector<int>&len_vec, + vector<string>&chainID_list, const int ter_opt, const int split_opt, + const string mol_opt, const int infmt_opt, const string atom_opt, + const int mirror_opt, const int het_opt, int &len_aa, int &len_na, + const int o_opt, vector<string>&resi_vec) +{ + size_t i; + int chain_i,r; + string name; + int chainnum; + double **xa; + int len; + char *seq,*sec; + + vector<vector<string> >PDB_lines; + vector<double> tmp_atom_array(3,0); + vector<vector<double> > tmp_chain_array; + vector<char>tmp_seq_array; + vector<char>tmp_sec_array; + //vector<string> resi_vec; + int read_resi=0; + if (o_opt) read_resi=2; + + for (i=0;i<chain_list.size();i++) + { + name=chain_list[i]; + chainnum=get_PDB_lines(name, PDB_lines, chainID_list, + mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); + if (!chainnum) + { + cerr<<"Warning! Cannot parse file: "<<name + <<". Chain number 0."<<endl; + continue; + } + for (chain_i=0;chain_i<chainnum;chain_i++) + { + len=PDB_lines[chain_i].size(); + if (!len) + { + cerr<<"Warning! Cannot parse file: "<<name + <<". 
Chain length 0."<<endl; + continue; + } + else if (len<3) + { + cerr<<"Sequence is too short <3!: "<<name<<endl; + continue; + } + NewArray(&xa, len, 3); + seq = new char[len + 1]; + sec = new char[len + 1]; + len = read_PDB(PDB_lines[chain_i], xa, seq, resi_vec, read_resi); + if (mirror_opt) for (r=0;r<len;r++) xa[r][2]=-xa[r][2]; + if (mol_vec[chain_i]>0 || mol_opt=="RNA") + make_sec(seq, xa, len, sec,atom_opt); + else make_sec(xa, len, sec); // secondary structure assignment + + /* store in vector */ + tmp_chain_array.assign(len,tmp_atom_array); + vector<char>tmp_seq_array(len+1,0); + vector<char>tmp_sec_array(len+1,0); + for (r=0;r<len;r++) + { + tmp_chain_array[r][0]=xa[r][0]; + tmp_chain_array[r][1]=xa[r][1]; + tmp_chain_array[r][2]=xa[r][2]; + tmp_seq_array[r]=seq[r]; + tmp_sec_array[r]=sec[r]; + } + a_vec.push_back(tmp_chain_array); + seq_vec.push_back(tmp_seq_array); + sec_vec.push_back(tmp_sec_array); + len_vec.push_back(len); + + /* clean up */ + tmp_chain_array.clear(); + tmp_seq_array.clear(); + tmp_sec_array.clear(); + PDB_lines[chain_i].clear(); + DeleteArray(&xa, len); + delete [] seq; + delete [] sec; + } // chain_i + name.clear(); + PDB_lines.clear(); + mol_vec.clear(); + } // i + tmp_atom_array.clear(); + + if (mol_opt=="RNA") mol_vec.assign(a_vec.size(),1); + else if (mol_opt=="protein") mol_vec.assign(a_vec.size(),-1); + else + { + mol_vec.assign(a_vec.size(),0); + for (i=0;i<a_vec.size();i++) + { + for (r=0;r<len_vec[i];r++) + { + if (seq_vec[i][r]>='a' && seq_vec[i][r]<='z') mol_vec[i]++; + else mol_vec[i]--; + } + } + } + + len_aa=0; + len_na=0; + for (i=0;i<a_vec.size();i++) + { + if (mol_vec[i]>0) len_na+=len_vec[i]; + else len_aa+=len_vec[i]; + } +} + +int copy_chain_pair_data( + const vector<vector<vector<double> > >&xa_vec, + const vector<vector<vector<double> > >&ya_vec, + const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, + const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, + const 
vector<int> &mol_vec1, const vector<int> &mol_vec2, + const vector<int> &xlen_vec, const vector<int> &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int chain1_num, int chain2_num, + vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat, + int *assign1_list, int *assign2_list, vector<string>&sequence) +{ + int i,j,r; + sequence.clear(); + sequence.push_back(""); + sequence.push_back(""); + int mol_type=0; + int xlen=0; + int ylen=0; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + for (r=0;r<xlen_vec[i];r++) + { + seqx[xlen]=seqx_vec[i][r]; + secx[xlen]=secx_vec[i][r]; + xa[xlen][0]= xa_vec[i][r][0]; + xa[xlen][1]= xa_vec[i][r][1]; + xa[xlen][2]= xa_vec[i][r][2]; + xlen++; + } + sequence[0]+=seqxA_mat[i][j]; + for (r=0;r<ylen_vec[j];r++) + { + seqy[ylen]=seqy_vec[j][r]; + secy[ylen]=secy_vec[j][r]; + ya[ylen][0]= ya_vec[j][r][0]; + ya[ylen][1]= ya_vec[j][r][1]; + ya[ylen][2]= ya_vec[j][r][2]; + ylen++; + } + sequence[1]+=seqyA_mat[i][j]; + mol_type+=mol_vec1[i]+mol_vec2[j]; + } + seqx[xlen]=0; + secx[xlen]=0; + seqy[ylen]=0; + secy[ylen]=0; + return mol_type; +} + +double MMalign_search( + const vector<vector<vector<double> > >&xa_vec, + const vector<vector<vector<double> > >&ya_vec, + const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, + const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, + const vector<int> &mol_vec1, const vector<int> &mol_vec2, + const vector<int> &xlen_vec, const vector<int> &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, + double **TM1_mat, double **TM2_mat, double **TMave_mat, + vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat, + int *assign1_list, int *assign2_list, vector<string>&sequence, + double d0_scale, bool fast_opt) +{ + double total_score=0; + int i,j; + int xlen=0; + int ylen=0; + for 
(i=0;i<chain1_num;i++) + { + if (assign1_list[i]<0) continue; + xlen+=xlen_vec[i]; + ylen+=ylen_vec[assign1_list[i]]; + } + if (xlen<=3 || ylen<=3) return total_score; + + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + + int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + double Lnorm_ass=len_aa+len_na; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 3, false, true, false, fast_opt, mol_type, -1); + + /* clean up */ + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + + /* re-compute chain level alignment */ + for (i=0;i<chain1_num;i++) + { + xlen=xlen_vec[i]; + if (xlen<3) + { + for (j=0;j<chain2_num;j++) + TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + continue; + } + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + double **xt; + NewArray(&xt, xlen, 3); + 
do_rotation(xa, xt, xlen, t0, u0); + + for (j=0;j<chain2_num;j++) + { + if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment + { + TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + continue; + } + + ylen=ylen_vec[j]; + if (ylen<3) + { + TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + d0_out=5.0; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + rmsd0 = 0.0; + Liden=0; + int *invmap = new int[ylen+1]; + + double Lnorm_ass=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, false, true, false, + mol_vec1[i]+mol_vec2[j], 1, invmap); + + /* print result */ + TM1_mat[i][j]=TM2; // normalized by chain1 + TM2_mat[i][j]=TM1; // normalized by chain2 + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + + TMave_mat[i][j]=TM4*Lnorm_ass; + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + return total_score; +} + +void MMalign_final( + const string xname, const string yname, + const vector<string> chainID_list1, const vector<string> chainID_list2, + string fname_super, string fname_lign, string fname_matrix, + const vector<vector<vector<double> > >&xa_vec, + const vector<vector<vector<double> > >&ya_vec, + const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, + const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, + const vector<int> &mol_vec1, const vector<int> 
&mol_vec2, + const vector<int> &xlen_vec, const vector<int> &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, + double **TM1_mat, double **TM2_mat, double **TMave_mat, + vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqM_mat, + vector<vector<string> >&seqyA_mat, int *assign1_list, int *assign2_list, + vector<string>&sequence, const double d0_scale, const bool m_opt, + const int o_opt, const int outfmt_opt, const int ter_opt, + const int split_opt, const bool a_opt, const bool d_opt, + const bool fast_opt, const bool full_opt, const int mirror_opt, + const vector<string>&resi_vec1, const vector<string>&resi_vec2) +{ + int i,j; + int xlen=0; + int ylen=0; + for (i=0;i<chain1_num;i++) xlen+=xlen_vec[i]; + for (j=0;j<chain2_num;j++) ylen+=ylen_vec[j]; + if (xlen<=3 || ylen<=3) return; + + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + + int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + double Lnorm_ass=len_aa+len_na; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, 
seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 3, a_opt, false, d_opt, fast_opt, mol_type, -1); + + /* prepare full complex alignment */ + string chainID1=""; + string chainID2=""; + sequence.clear(); + sequence.push_back(""); // seqxA + sequence.push_back(""); // seqyA + sequence.push_back(""); // seqM + int aln_start=0; + int aln_end=0; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + chainID1+=chainID_list1[i]; + chainID2+=chainID_list2[j]; + sequence[0]+=seqxA_mat[i][j]+'*'; + sequence[1]+=seqyA_mat[i][j]+'*'; + + aln_end+=seqxA_mat[i][j].size(); + seqM_mat[i][j]=seqM.substr(aln_start,aln_end-aln_start); + sequence[2]+=seqM_mat[i][j]+'*'; + aln_start=aln_end; + } + + /* prepare unaligned region */ + for (i=0;i<chain1_num;i++) + { + if (assign1_list[i]>=0) continue; + chainID1+=chainID_list1[i]; + chainID2+=':'; + string s(seqx_vec[i].begin(),seqx_vec[i].end()); + sequence[0]+=s.substr(0,xlen_vec[i])+'*'; + sequence[1]+=string(xlen_vec[i],'-')+'*'; + s.clear(); + sequence[2]+=string(xlen_vec[i],' ')+'*'; + } + for (j=0;j<chain2_num;j++) + { + if (assign2_list[j]>=0) continue; + chainID1+=':'; + chainID2+=chainID_list2[j]; + string s(seqy_vec[j].begin(),seqy_vec[j].end()); + sequence[0]+=string(ylen_vec[j],'-')+'*'; + sequence[1]+=s.substr(0,ylen_vec[j])+'*'; + s.clear(); + sequence[2]+=string(ylen_vec[j],' ')+'*'; + } + + /* print alignment */ + output_results(xname, yname, chainID1.c_str(), chainID2.c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + sequence[2].c_str(), sequence[0].c_str(), sequence[1].c_str(), + Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, 0, d0_scale, d0a, d0u, + (m_opt?fname_matrix:"").c_str(), outfmt_opt, ter_opt, true, + split_opt, o_opt, fname_super, + false, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete [] 
seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + sequence[0].clear(); + sequence[1].clear(); + sequence[2].clear(); + + if (!full_opt) return; + + cout<<"# End of alignment for full complex. The following blocks list alignments for individual chains."<<endl; + + /* re-compute chain level alignment */ + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + xlen=xlen_vec[i]; + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + double **xt; + NewArray(&xt, xlen, 3); + do_rotation(xa, xt, xlen, t0, u0); + + ylen=ylen_vec[j]; + if (ylen<3) + { + TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + d0_out=5.0; + rmsd0 = 0.0; + Liden=0; + int *invmap = new int[ylen+1]; + seqM=""; + seqxA=""; + seqyA=""; + double Lnorm_ass=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; + sequence[0]=seqxA_mat[i][j]; + sequence[1]=seqyA_mat[i][j]; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 1, a_opt, true, d_opt, mol_vec1[i]+mol_vec2[j], 1, invmap); + + //TM2=TM4*Lnorm_ass/xlen; + //TM1=TM4*Lnorm_ass/ylen; + //d0A=d0u; + //d0B=d0u; + + /* print result */ + output_results(xname, yname, + chainID_list1[i].c_str(), chainID_list2[j].c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM_mat[i][j].c_str(), seqxA_mat[i][j].c_str(), + seqyA_mat[i][j].c_str(), Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, Lnorm_ass, 
d0_scale, d0a, d0u, + "", outfmt_opt, ter_opt, false, split_opt, 0, + "", false, a_opt, false, d_opt, 0, resi_vec1, resi_vec2); + + /* clean up */ + seqxA.clear(); + seqM.clear(); + seqyA.clear(); + sequence[0].clear(); + sequence[1].clear(); + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + sequence.clear(); + return; +} diff --git a/modules/bindings/src/tmalign/NW.h b/modules/bindings/src/tmalign/NW.h index a9dd6a51927cee4af67ce2341f6f4b93dde40359..4c9984853687c47fdf093f246d27ffe1c711416e 100644 --- a/modules/bindings/src/tmalign/NW.h +++ b/modules/bindings/src/tmalign/NW.h @@ -1,10 +1,10 @@ -/* Partial implementation of Needleman-Wunsch (NW) dymanamic programming for +/* Partial implementation of Needleman-Wunsch (NW) dynamic programming for * global alignment. The three NWDP_TM functions below are not complete * implementation of NW algorithm because gap jumping in the standard Gotoh * algorithm is not considered. Since the gap opening and gap extension is * the same, this is not a problem. This code was exploited in TM-align * because it is about 1.5 times faster than a complete NW implementation. - * Nevertheless, if gap openning != gap extension shall be implemented in + * Nevertheless, if gap opening != gap extension shall be implemented in * the future, the Gotoh algorithm must be implemented. In rare scenarios, * it is also possible to have asymmetric alignment (i.e. 
* TMalign A.pdb B.pdb and TMalign B.pdb A.pdb have different TM_A and TM_B @@ -24,15 +24,15 @@ void NWDP_TM(double **score, bool **path, double **val, //initialization for(i=0; i<=len1; i++) { - //val[i][0]=0; - val[i][0]=i*gap_open; + val[i][0]=0; + //val[i][0]=i*gap_open; path[i][0]=false; //not from diagonal } for(j=0; j<=len2; j++) { - //val[0][j]=0; - val[0][j]=j*gap_open; + val[0][j]=0; + //val[0][j]=j*gap_open; path[0][j]=false; //not from diagonal j2i[j]=-1; //all are not aligned, only use j2i[1:len2] } @@ -179,11 +179,91 @@ void NWDP_TM(bool **path, double **val, double **x, double **y, } } +/* This is the same as the previous NWDP_TM, except for the lack of rotation + * Input: vectors x, y, scale factor d02, and gap_open + * Output: j2i[1:len2] \in {1:len1} U {-1} + * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ +void NWDP_SE(bool **path, double **val, double **x, double **y, + int len1, int len2, double d02, double gap_open, int j2i[]) +{ + int i, j; + double h, v, d; + + for(i=0; i<=len1; i++) + { + val[i][0]=0; + path[i][0]=false; //not from diagonal + } + + for(j=0; j<=len2; j++) + { + val[0][j]=0; + path[0][j]=false; //not from diagonal + j2i[j]=-1; //all are not aligned, only use j2i[1:len2] + } + double dij; + + //decide matrix and path + for(i=1; i<=len1; i++) + { + for(j=1; j<=len2; j++) + { + dij=dist(&x[i-1][0], &y[j-1][0]); + d=val[i-1][j-1] + 1.0/(1+dij/d02); + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + + if(d>=h && d>=v) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + 
{ + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + /* +ss * Input: secondary structure secx, secy, and gap_open * Output: j2i[1:len2] \in {1:len1} U {-1} * path[0:len1, 0:len2]=1,2,3, from diagonal, horizontal, vertical */ -void NWDP_TM(bool **path, double **val, const int *secx, const int *secy, +void NWDP_TM(bool **path, double **val, const char *secx, const char *secy, const int len1, const int len2, const double gap_open, int j2i[]) { @@ -193,15 +273,15 @@ void NWDP_TM(bool **path, double **val, const int *secx, const int *secy, //initialization for(i=0; i<=len1; i++) { - //val[i][0]=0; - val[i][0]=i*gap_open; + val[i][0]=0; + //val[i][0]=i*gap_open; path[i][0]=false; //not from diagonal } for(j=0; j<=len2; j++) { - //val[0][j]=0; - val[0][j]=j*gap_open; + val[0][j]=0; + //val[0][j]=j*gap_open; path[0][j]=false; //not from diagonal j2i[j]=-1; //all are not aligned, only use j2i[1:len2] } diff --git a/modules/bindings/src/tmalign/NWalign.cpp b/modules/bindings/src/tmalign/NWalign.cpp index 269e26315a5b84321488cdf4baaabb620841cf04..6b7b86c2db202c7338b3d7b576d49337e0e4ec4b 100644 --- a/modules/bindings/src/tmalign/NWalign.cpp +++ b/modules/bindings/src/tmalign/NWalign.cpp @@ -47,6 +47,10 @@ void print_extra_help() " one read all sequence; -split >=1 means each sequence is an\n" " individual entry." 
"\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" " -outfmt Output format\n" " 0: (default) full output\n" " 1: fasta format compact output\n" @@ -103,6 +107,7 @@ int main(int argc, char *argv[]) int ter_opt =3; // TER, END, or different chainID int split_opt =0; // do not split chain int outfmt_opt=0; // set -outfmt to full output + int het_opt=0; // do not read HETATM residues string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA string mol_opt ="auto";// auto-detect the molecule type as protein/RNA string suffix_opt=""; // set -suffix to empty @@ -167,6 +172,10 @@ int main(int argc, char *argv[]) { glocal=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } else if (xname.size() == 0) xname=argv[i]; else if (yname.size() == 0) yname=argv[i]; else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); @@ -189,9 +198,9 @@ int main(int argc, char *argv[]) if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); if (atom_opt.size()!=4) - PrintErrorAndQuit("ERROR! atom name must have 4 characters, including space."); + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") - PrintErrorAndQuit("ERROR! molecule type must be either RNA or protein."); + PrintErrorAndQuit("ERROR! 
Molecule type must be either RNA or protein."); else if (mol_opt=="protein" && atom_opt=="auto") atom_opt=" CA "; else if (mol_opt=="RNA" && atom_opt=="auto") @@ -224,12 +233,12 @@ int main(int argc, char *argv[]) vector<int> mol_vec2; // molecule type of chain2, RNA if >0 vector<string> chainID_list1; // list of chainID1 vector<string> chainID_list2; // list of chainID2 - int i,j; // file index - int chain_i,chain_j; // chain index - int xlen, ylen; // chain length - int xchainnum,ychainnum;// number of chains in a PDB file - char *seqx, *seqy; // for the protein sequence - int l; // residue index + int i,j; // file index + int chain_i,chain_j; // chain index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // for the protein sequence + int l; // residue index /* loop over file names */ for (i=0;i<chain1_list.size();i++) @@ -239,7 +248,7 @@ int main(int argc, char *argv[]) if (infmt1_opt>=4) xchainnum=get_FASTA_lines(xname, PDB_lines1, chainID_list1, mol_vec1, ter_opt, split_opt); else xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt); + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -274,8 +283,8 @@ int main(int argc, char *argv[]) ychainnum=get_FASTA_lines(yname, PDB_lines2, chainID_list2, mol_vec2, ter_opt, split_opt); else ychainnum=get_PDB_lines(yname, PDB_lines2, - chainID_list2, mol_vec2, ter_opt, - infmt2_opt, atom_opt, split_opt); + chainID_list2, mol_vec2, ter_opt, infmt2_opt, + atom_opt, split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! 
Cannot parse file: "<<yname @@ -305,15 +314,19 @@ int main(int argc, char *argv[]) int L_ali; // Aligned length double Liden=0; string seqM, seqxA, seqyA;// for output alignment + int *invmap = new int[ylen+1]; - int aln_score=NWalign(seqx, seqy, xlen, ylen, seqxA, seqyA, - mol_vec1[chain_i]+mol_vec2[chain_j], glocal); + int aln_score=NWalign_main(seqx, seqy, xlen, ylen, + seqxA, seqyA, mol_vec1[chain_i]+mol_vec2[chain_j], + invmap, (outfmt_opt>=2)?1:0, glocal); - get_seqID(seqxA, seqyA, seqM, Liden, L_ali); + if (outfmt_opt>=2) get_seqID(invmap, seqx, seqy, + ylen, Liden, L_ali); + else get_seqID(seqxA, seqyA, seqM, Liden, L_ali); output_NWalign_results( - xname.substr(dir1_opt.size()), - yname.substr(dir2_opt.size()), + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), chainID_list1[chain_i].c_str(), chainID_list2[chain_j].c_str(), xlen, ylen, seqM.c_str(), seqxA.c_str(), @@ -324,6 +337,7 @@ int main(int argc, char *argv[]) seqxA.clear(); seqyA.clear(); delete [] seqy; + delete [] invmap; } // chain_j if (chain2_list.size()>1) { diff --git a/modules/bindings/src/tmalign/NWalign.h b/modules/bindings/src/tmalign/NWalign.h index e0125bb61113925a2eaa92a138d67991b9afb126..2c7e36a1112f8c3a4a7cbb9d813e40e39698ae6e 100644 --- a/modules/bindings/src/tmalign/NWalign.h +++ b/modules/bindings/src/tmalign/NWalign.h @@ -12,8 +12,8 @@ using namespace std; const int gapopen_blosum62=-11; const int gapext_blosum62=-1; -const int gapopen_blastn=-5; -const int gapext_blastn=-2; +const int gapopen_blastn=-15; //-5; +const int gapext_blastn =-4; //-2; /* initialize matrix in gotoh algorithm */ void init_gotoh_mat(int **S, int **JumpH, int **JumpV, int **P, @@ -191,14 +191,18 @@ int calculate_score_gotoh(const int xlen,const int ylen, int **S, /* trace back dynamic programming path to diciper pairwise alignment */ void trace_back_gotoh(const char *seqx, const char *seqy, - int ** JumpH, int ** JumpV, int ** P, - string& seqxA, string& seqyA, 
const int xlen, const int ylen) + int ** JumpH, int ** JumpV, int ** P, string& seqxA, string& seqyA, + const int xlen, const int ylen, int *invmap, const int invmap_only=1) { - int i=xlen; - int j=ylen; + int i,j; int gaplen,p; - char *buf=new char [MAX(xlen,ylen)+1]; + char *buf=NULL; + + if (invmap_only) for (j = 0; j < ylen; j++) invmap[j] = -1; + if (invmap_only!=1) buf=new char [MAX(xlen,ylen)+1]; + i=xlen; + j=ylen; while(i+j) { gaplen=0; @@ -206,6 +210,7 @@ void trace_back_gotoh(const char *seqx, const char *seqy, { gaplen=JumpH[i][j]; j-=gaplen; + if (invmap_only==1) continue; strncpy(buf,seqy+j,gaplen); buf[gaplen]=0; seqyA=buf+seqyA; @@ -217,6 +222,7 @@ void trace_back_gotoh(const char *seqx, const char *seqy, { gaplen=JumpV[i][j]; i-=gaplen; + if (invmap_only==1) continue; strncpy(buf,seqx+i,gaplen); buf[gaplen]=0; seqxA=buf+seqxA; @@ -246,10 +252,14 @@ void trace_back_gotoh(const char *seqx, const char *seqy, } i--; j--; - seqxA=seqx[i]+seqxA; - seqyA=seqy[j]+seqyA; + if (invmap_only) invmap[j]=i; + if (invmap_only!=1) + { + seqxA=seqx[i]+seqxA; + seqyA=seqy[j]+seqyA; + } } - } + } delete [] buf; } @@ -257,16 +267,20 @@ void trace_back_gotoh(const char *seqx, const char *seqy, /* trace back Smith-Waterman dynamic programming path to diciper * pairwise local alignment */ void trace_back_sw(const char *seqx, const char *seqy, - int **JumpH, int **JumpV, int **P, - string& seqxA, string& seqyA, const int xlen, const int ylen) + int **JumpH, int **JumpV, int **P, string& seqxA, string& seqyA, + const int xlen, const int ylen, int *invmap, const int invmap_only=1) { - int i=xlen; - int j=ylen; + int i; + int j; int gaplen,p; - char *buf=new char [xlen+ylen+1]; + bool found_start_cell=false; // find the first non-zero cell in P + char *buf=NULL; + + if (invmap_only) for (j = 0; j < ylen; j++) invmap[j] = -1; + if (invmap_only!=1) buf=new char [MAX(xlen,ylen)+1]; - // find the first non-zero cell in P - bool found_start_cell=false; + i=xlen; + j=ylen; for 
(i=xlen;i>=0;i--) { for (j=ylen;j>=0;j--) @@ -281,19 +295,22 @@ void trace_back_sw(const char *seqx, const char *seqy, } /* copy C terminal sequence */ - for (p=0;p<ylen-j;p++) buf[p]='-'; - buf[ylen-j]=0; - seqxA=buf; - strncpy(buf,seqx+i,xlen-i); - buf[xlen-i]=0; - seqxA+=buf; - - strncpy(buf,seqy+j,ylen-j); - buf[ylen-j]=0; - seqyA+=buf; - for (p=0;p<xlen-i;p++) buf[p]='-'; - buf[xlen-i]=0; - seqyA+=buf; + if (invmap_only!=1) + { + for (p=0;p<ylen-j;p++) buf[p]='-'; + buf[ylen-j]=0; + seqxA=buf; + strncpy(buf,seqx+i,xlen-i); + buf[xlen-i]=0; + seqxA+=buf; + + strncpy(buf,seqy+j,ylen-j); + buf[ylen-j]=0; + seqyA+=buf; + for (p=0;p<xlen-i;p++) buf[p]='-'; + buf[xlen-i]=0; + seqyA+=buf; + } if (i<0||j<0) { @@ -309,6 +326,7 @@ void trace_back_sw(const char *seqx, const char *seqy, { gaplen=JumpH[i][j]; j-=gaplen; + if (invmap_only==1) continue; strncpy(buf,seqy+j,gaplen); buf[gaplen]=0; seqyA=buf+seqyA; @@ -320,6 +338,7 @@ void trace_back_sw(const char *seqx, const char *seqy, { gaplen=JumpV[i][j]; i-=gaplen; + if (invmap_only==1) continue; strncpy(buf,seqx+i,gaplen); buf[gaplen]=0; seqxA=buf+seqxA; @@ -331,26 +350,38 @@ void trace_back_sw(const char *seqx, const char *seqy, { i--; j--; - seqxA=seqx[i]+seqxA; - seqyA=seqy[j]+seqyA; + if (invmap_only) invmap[j]=i; + if (invmap_only!=1) + { + seqxA=seqx[i]+seqxA; + seqyA=seqy[j]+seqyA; + } } } /* copy N terminal sequence */ - for (p=0;p<j;p++) buf[p]='-'; - strncpy(buf+j,seqx,i); - buf[i+j]=0; - seqxA=buf+seqxA; - - strncpy(buf,seqy,j); - for (p=j;p<j+i;p++) buf[p]='-'; - buf[i+j]=0; - seqyA=buf+seqyA; + if (invmap_only!=1) + { + for (p=0;p<j;p++) buf[p]='-'; + strncpy(buf+j,seqx,i); + buf[i+j]=0; + seqxA=buf+seqxA; + + strncpy(buf,seqy,j); + for (p=j;p<j+i;p++) buf[p]='-'; + buf[i+j]=0; + seqyA=buf+seqyA; + } delete [] buf; } -/* entry function for NWalign */ -int NWalign(const char *seqx, const char *seqy, const int xlen, const int ylen, - string & seqxA,string & seqyA, const int mol_type, const int glocal=0) +/* 
entry function for NWalign + * invmap_only - whether to return seqxA and seqyA or to return invmap + * 0: only return seqxA and seqyA + * 1: only return invmap + * 2: return seqxA, seqyA and invmap */ +int NWalign_main(const char *seqx, const char *seqy, const int xlen, + const int ylen, string & seqxA, string & seqyA, const int mol_type, + int *invmap, const int invmap_only=0, const int glocal=0) { int **JumpH; int **JumpV; @@ -369,6 +400,11 @@ int NWalign(const char *seqx, const char *seqy, const int xlen, const int ylen, { gapopen=gapopen_blastn; gapext =gapext_blastn; + if (glocal==3) + { + gapopen=-5; + gapext =-2; + } } for (i=0;i<xlen+1;i++) @@ -383,9 +419,14 @@ int NWalign(const char *seqx, const char *seqy, const int xlen, const int ylen, aln_score=calculate_score_gotoh(xlen, ylen, S, JumpH, JumpV, P, gapopen, gapext, glocal); - if (glocal<3) trace_back_gotoh(seqx,seqy,JumpH,JumpV,P,seqxA,seqyA,xlen,ylen); - else trace_back_sw(seqx,seqy,JumpH,JumpV,P,seqxA,seqyA,xlen,ylen); - + seqxA.clear(); + seqyA.clear(); + + if (glocal<3) trace_back_gotoh(seqx, seqy, JumpH, JumpV, P, + seqxA, seqyA, xlen, ylen, invmap, invmap_only); + else trace_back_sw(seqx, seqy, JumpH, JumpV, P, seqxA, seqyA, + xlen, ylen, invmap, invmap_only); + DeleteArray(&JumpH, xlen+1); DeleteArray(&JumpV, xlen+1); DeleteArray(&P, xlen+1); @@ -393,7 +434,23 @@ int NWalign(const char *seqx, const char *seqy, const int xlen, const int ylen, return aln_score; // aligment score } -double get_seqID(const string& seqxA, const string& seqyA, +void get_seqID(int *invmap, const char *seqx, const char *seqy, + const int ylen, double &Liden,int &L_ali) +{ + Liden=0; + L_ali=0; + int i,j; + for (j=0;j<ylen;j++) + { + i=invmap[j]; + if (i<0) continue; + L_ali+=1; + Liden+=(seqx[i]==seqy[j]); + } + //return L_ali?1.*Liden/L_ali:0; +} + +void get_seqID(const string& seqxA, const string& seqyA, string &seqM,double &Liden,int &L_ali) { Liden=0; @@ -408,10 +465,9 @@ double get_seqID(const string& seqxA, const 
string& seqyA, else seqM+=' '; L_ali+=(seqxA[i]!='-' && seqyA[i]!='-'); } - return 1.*Liden/L_ali; + //return L_ali?1.*Liden/L_ali:0; } - void output_NWalign_results( const string xname, const string yname, const char *chainID1, const char *chainID2, diff --git a/modules/bindings/src/tmalign/OST_INFO b/modules/bindings/src/tmalign/OST_INFO index f0a86222755c880d5a188a88124bdfdf7d3de488..16ce115699e3266a2a8d6dd7eecfafc01758aa6e 100644 --- a/modules/bindings/src/tmalign/OST_INFO +++ b/modules/bindings/src/tmalign/OST_INFO @@ -1,7 +1,7 @@ -Source code has been cloned February 17 2019 from: +Source code has been cloned August 2 2022 from: https://github.com/kad-ecoli/TMalign last commit: -2ea5b61c6b0c8ded05ff0aea09546d45902b3741 +f0824499d8ab4fa84b2e75d253de80ab2c894c56 diff --git a/modules/bindings/src/tmalign/TMalign.cpp b/modules/bindings/src/tmalign/TMalign.cpp index f62fc45848fe54f518bfc373d7a39738c53371a0..7ea33e1a72155157b385a45a1710352813e52a1e 100644 --- a/modules/bindings/src/tmalign/TMalign.cpp +++ b/modules/bindings/src/tmalign/TMalign.cpp @@ -8,12 +8,12 @@ void print_version() { cout << "\n" -" *********************************************************************\n" -" * TM-align (Version 20190209): protein and RNA structure alignment *\n" -" * References: Y Zhang, J Skolnick. Nucl Acids Res 33, 2302-9 (2005) *\n" -" * S Gong, C Zhang, Y Zhang. Bioinformatics (2019) *\n" -" * Please email comments and suggestions to yangzhanglab@umich.edu *\n" -" *********************************************************************" +" **********************************************************************\n" +" * TM-align (Version 20210520): protein and RNA structure alignment *\n" +" * References: Y Zhang, J Skolnick. Nucl Acids Res 33, 2302-9 (2005) *\n" +" * S Gong, C Zhang, Y Zhang. 
Bioinformatics, bz282 (2019) *\n" +" * Please email comments and suggestions to yangzhanglab@umich.edu *\n" +" **********************************************************************" << endl; } @@ -36,6 +36,9 @@ void print_extra_help() " under 'chain2_folder'\n" " $ TMalign chain1 -dir2 chain2_folder/ chain2_list\n" "\n" +" -pair (Only when -dir1 and -dir2 are set, default is no) whether to\n" +" perform pair alignment rather than all-against-all alignment\n" +"\n" " -suffix (Only when -dir1 and/or -dir2 are set, default is empty)\n" " add file name suffix to files listed by chain1_list or chain2_list\n" "\n" @@ -82,6 +85,16 @@ void print_extra_help() " 0: (default, same as F) normalized by second structure\n" " 1: same as T, normalized by average structure length\n" "\n" +" -cp ALignment with circular permutation\n" +"\n" +" -mirror Whether to align the mirror image of input structure\n" +" 0: (default) do not align mirrored structure\n" +" 1: align mirror of chain1 to origin chain2\n" +"\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" " -infmt1 Input format for chain1\n" " -infmt2 Input format for chain2\n" " -1: (default) automatically detect PDB or PDBx/mmCIF format\n" @@ -109,7 +122,7 @@ void print_help(bool h_opt=false) "\n" " -i Start with an alignment specified in fasta file 'align.txt'\n" "\n" -" -I Stick to the alignment 'align.txt'\n" +" -I Stick to the alignment specified in 'align.txt'\n" "\n" " -m Output TM-align rotation matrix\n" "\n" @@ -161,8 +174,7 @@ int main(int argc, char *argv[]) bool h_opt = false; // print full help message bool v_opt = false; // print version bool m_opt = false; // flag for -m, output rotation matrix - bool i_opt = false; // flag for -i, with user given initial alignment - bool I_opt = false; // flag for -I, stick to user given alignment + int i_opt = 0; // 1 for -i, 3 for -I bool o_opt = 
false; // flag for -o, output superposed structure int a_opt = 0; // flag for -a, do not normalized by average length bool u_opt = false; // flag for -u, normalized by user specified length @@ -175,12 +187,16 @@ int main(int argc, char *argv[]) int split_opt =0; // do not split chain int outfmt_opt=0; // set -outfmt to full output bool fast_opt =false; // flags for -fast, fTM-align algorithm + int cp_opt =0; // do not check circular permutation + int mirror_opt=0; // do not align mirror + int het_opt =0; // do not read HETATM residues string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA string mol_opt ="auto";// auto-detect the molecule type as protein/RNA string suffix_opt=""; // set -suffix to empty string dir_opt =""; // set -dir to empty string dir1_opt =""; // set -dir1 to empty string dir2_opt =""; // set -dir2 to empty + bool pair_opt=false; // pair alignment int byresi_opt=0; // set -byresi to 0 vector<string> chain1_list; // only when -dir1 is set vector<string> chain2_list; // only when -dir2 is set @@ -222,16 +238,20 @@ int main(int argc, char *argv[]) } else if ( !strcmp(argv[i],"-i") && i < (argc-1) ) { - fname_lign = argv[i + 1]; i_opt = true; i++; + if (i_opt==3) + PrintErrorAndQuit("ERROR! -i and -I cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 1; i++; + } + else if (!strcmp(argv[i], "-I") && i < (argc-1) ) + { + if (i_opt==1) + PrintErrorAndQuit("ERROR! 
-I and -i cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 3; i++; } else if (!strcmp(argv[i], "-m") && i < (argc-1) ) { fname_matrix = argv[i + 1]; m_opt = true; i++; }// get filename for rotation matrix - else if (!strcmp(argv[i], "-I") && i < (argc-1) ) - { - fname_lign = argv[i + 1]; I_opt = true; i++; - } else if (!strcmp(argv[i], "-fast")) { fast_opt = true; @@ -272,6 +292,10 @@ int main(int argc, char *argv[]) { dir2_opt=argv[i + 1]; i++; } + else if ( !strcmp(argv[i],"-pair") ) + { + pair_opt=true; + } else if ( !strcmp(argv[i],"-suffix") && i < (argc-1) ) { suffix_opt=argv[i + 1]; i++; @@ -288,6 +312,18 @@ int main(int argc, char *argv[]) { byresi_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-cp") ) + { + cp_opt=1; + } + else if ( !strcmp(argv[i],"-mirror") && i < (argc-1) ) + { + mirror_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } else if (xname.size() == 0) xname=argv[i]; else if (yname.size() == 0) yname=argv[i]; else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); @@ -320,16 +356,14 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); } if (atom_opt.size()!=4) - PrintErrorAndQuit("ERROR! atom name must have 4 characters, including space."); + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") - PrintErrorAndQuit("ERROR! molecule type must be either RNA or protein."); + PrintErrorAndQuit("ERROR! Molecule type must be either RNA or protein."); else if (mol_opt=="protein" && atom_opt=="auto") atom_opt=" CA "; else if (mol_opt=="RNA" && atom_opt=="auto") atom_opt=" C3'"; - if (i_opt && I_opt) - PrintErrorAndQuit("ERROR! -I and -i cannot be used together"); if (u_opt && Lnorm_ass<=0) PrintErrorAndQuit("Wrong value for option -u! 
It should be >0"); if (d_opt && d0_scale<=0) @@ -338,7 +372,7 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("-outfmt 2 cannot be used with -a, -u, -L, -d"); if (byresi_opt!=0) { - if (i_opt || I_opt) + if (i_opt) PrintErrorAndQuit("-byresi >=1 cannot be used with -i or -I"); if (byresi_opt<0 || byresi_opt>3) PrintErrorAndQuit("-byresi can only be 0, 1, 2 or 3"); @@ -351,11 +385,15 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("-split 2 should be used with -ter 0 or 1"); if (split_opt<0 || split_opt>2) PrintErrorAndQuit("-split can only be 0, 1 or 2"); + if (cp_opt!=0 && cp_opt!=1) + PrintErrorAndQuit("-cp can only be 0 or 1"); + if (cp_opt && i_opt) + PrintErrorAndQuit("-cp cannot be used with -i or -I"); /* read initial alignment file from 'align.txt' */ - if (i_opt || I_opt) read_user_alignment(sequence, fname_lign, I_opt); + if (i_opt) read_user_alignment(sequence, fname_lign, i_opt); - if (byresi_opt) I_opt=true; + if (byresi_opt) i_opt=3; if (m_opt && fname_matrix == "") // Output rotation matrix: matrix.txt PrintErrorAndQuit("ERROR! 
Please provide a file name for option -m!"); @@ -383,16 +421,19 @@ int main(int argc, char *argv[]) vector<string> chainID_list2; // list of chainID2 int i,j; // file index int chain_i,chain_j; // chain index + int r; // residue index int xlen, ylen; // chain length int xchainnum,ychainnum;// number of chains in a PDB file char *seqx, *seqy; // for the protein sequence - int *secx, *secy; // for the secondary structure + char *secx, *secy; // for the secondary structure double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and // ya[0...ylen-1][0..2], in general, // ya is regarded as native structure // --> superpose xa onto ya vector<string> resi_vec1; // residue index for chain1 vector<string> resi_vec2; // residue index for chain2 + int read_resi=byresi_opt; // whether to read residue index + if (byresi_opt==0 && o_opt) read_resi=2; /* loop over file names */ for (i=0;i<chain1_list.size();i++) @@ -400,7 +441,7 @@ int main(int argc, char *argv[]) /* parse chain 1 */ xname=chain1_list[i]; xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt); + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -418,27 +459,30 @@ int main(int argc, char *argv[]) <<". 
Chain length 0."<<endl; continue; } - else if (xlen<=5) + else if (xlen<3) { - cerr<<"Sequence is too short <=5!: "<<xname<<endl; + cerr<<"Sequence is too short <3!: "<<xname<<endl; continue; } NewArray(&xa, xlen, 3); seqx = new char[xlen + 1]; - secx = new int[xlen]; + secx = new char[xlen + 1]; xlen = read_PDB(PDB_lines1[chain_i], xa, seqx, - resi_vec1, byresi_opt); + resi_vec1, read_resi); + if (mirror_opt) for (r=0;r<xlen;r++) xa[r][2]=-xa[r][2]; if (mol_vec1[chain_i]>0) make_sec(seqx,xa, xlen, secx,atom_opt); else make_sec(xa, xlen, secx); // secondary structure assignment for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) { + if (pair_opt && j!=i) continue; /* parse chain 2 */ if (PDB_lines2.size()==0) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, + het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname @@ -457,16 +501,16 @@ int main(int argc, char *argv[]) <<". 
Chain length 0."<<endl; continue; } - else if (ylen<=5) + else if (ylen<3) { - cerr<<"Sequence is too short <=5!: "<<yname<<endl; + cerr<<"Sequence is too short <3!: "<<yname<<endl; continue; } NewArray(&ya, ylen, 3); seqy = new char[ylen + 1]; - secy = new int[ylen]; + secy = new char[ylen + 1]; ylen = read_PDB(PDB_lines2[chain_j], ya, seqy, - resi_vec2, byresi_opt); + resi_vec2, read_resi); if (mol_vec2[chain_j]>0) make_sec(seqy, ya, ylen, secy, atom_opt); else make_sec(ya, ylen, secy); @@ -490,33 +534,43 @@ int main(int argc, char *argv[]) int n_ali8=0; /* entry function for structure alignment */ - TMalign_main( + if (cp_opt) CPalign_main( + xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],TMcut); + else TMalign_main( xa, ya, seqx, seqy, secx, secy, t0, u0, TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, xlen, ylen, sequence, Lnorm_ass, d0_scale, - i_opt, I_opt, a_opt, u_opt, d_opt, fast_opt, + i_opt, a_opt, u_opt, d_opt, fast_opt, mol_vec1[chain_i]+mol_vec2[chain_j],TMcut); /* print result */ if (outfmt_opt==0) print_version(); output_results( - xname.substr(dir1_opt.size()), - yname.substr(dir2_opt.size()), + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), chainID_list1[chain_i].c_str(), chainID_list2[chain_j].c_str(), xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, - n_ali8, n_ali, L_ali, TM_ali, rmsd_ali, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, (m_opt?fname_matrix+chainID_list1[chain_i]:"").c_str(), - outfmt_opt, ter_opt, + outfmt_opt, ter_opt, 0, split_opt, 
o_opt, (o_opt?fname_super+chainID_list1[chain_i]:"").c_str(), - i_opt, I_opt, a_opt, u_opt, d_opt); + i_opt, a_opt, u_opt, d_opt,mirror_opt, + resi_vec1, resi_vec2 ); /* Done! Free memory */ seqM.clear(); diff --git a/modules/bindings/src/tmalign/TMalign.h b/modules/bindings/src/tmalign/TMalign.h index 08caeec3a2b97f6cdb49a1a7d22b660b2c60b2eb..9187ad3cbd170541a4c72baf44895c86185183d2 100644 --- a/modules/bindings/src/tmalign/TMalign.h +++ b/modules/bindings/src/tmalign/TMalign.h @@ -36,7 +36,7 @@ int score_fun8( double **xa, double **ya, int n_ali, double d, int i_ali[], } else score_sum += 1/(1+di/d02); } - //there are not enough feasible pairs, reliefe the threshold + //there are not enough feasible pairs, relieve the threshold if(n_cut<3 && n_ali>3) { inc++; @@ -81,7 +81,7 @@ int score_fun8_standard(double **xa, double **ya, int n_ali, double d, score_sum += 1 / (1 + di / d02); } } - //there are not enough feasible pairs, reliefe the threshold + //there are not enough feasible pairs, relieve the threshold if (n_cut<3 && n_ali>3) { inc++; @@ -137,7 +137,7 @@ double TMscore8_search(double **r1, double **r2, double **xtm, double **ytm, //find the maximum score starting from local structures superposition int i_ali[kmax], n_cut; int L_frag; //fragment length - int iL_max; //maximum starting postion for the fragment + int iL_max; //maximum starting position for the fragment for(i_init=0; i_init<n_init; i_init++) { @@ -291,7 +291,7 @@ double TMscore8_search_standard( double **r1, double **r2, //find the maximum score starting from local structures superposition int i_ali[kmax], n_cut; int L_frag; //fragment length - int iL_max; //maximum starting postion for the fragment + int iL_max; //maximum starting position for the fragment for (i_init = 0; i_init<n_init; i_init++) { @@ -635,7 +635,7 @@ double get_initial(double **r1, double **r2, double **xtm, double **ytm, double t[3], double u[3][3]) { int min_len=getmin(xlen, ylen); - if(min_len<=5) 
PrintErrorAndQuit("Sequence is too short <=5!\n"); + if(min_len<3) PrintErrorAndQuit("Sequence is too short <3!\n"); int min_ali= min_len/2; //minimum size of considered fragment if(min_ali<=5) min_ali=5; @@ -724,17 +724,17 @@ void smooth(int *sec, int len) } -int sec_str(double dis13, double dis14, double dis15, +char sec_str(double dis13, double dis14, double dis15, double dis24, double dis25, double dis35) { - int s=1; + char s='C'; double delta=2.1; if (fabs(dis15-6.37)<delta && fabs(dis14-5.18)<delta && fabs(dis25-5.18)<delta && fabs(dis13-5.45)<delta && fabs(dis24-5.45)<delta && fabs(dis35-5.45)<delta) { - s=2; //helix + s='H'; //helix return s; } @@ -743,24 +743,24 @@ int sec_str(double dis13, double dis14, double dis15, fabs(dis25-10.4)<delta && fabs(dis13-6.1 )<delta && fabs(dis24-6.1 )<delta && fabs(dis35-6.1 )<delta) { - s=4; //strand + s='E'; //strand return s; } - if (dis15 < 8) s=3; //turn + if (dis15 < 8) s='T'; //turn return s; } -/* secondary stucture assignment for protein: +/* secondary structure assignment for protein: * 1->coil, 2->helix, 3->turn, 4->strand */ -void make_sec(double **x, int len, int *sec) +void make_sec(double **x, int len, char *sec) { int j1, j2, j3, j4, j5; double d13, d14, d15, d24, d25, d35; for(int i=0; i<len; i++) { - sec[i]=1; + sec[i]='C'; j1=i-2; j2=i-1; j3=i; @@ -778,6 +778,7 @@ void make_sec(double **x, int len, int *sec) sec[i]=sec_str(d13, d14, d15, d24, d25, d35); } } + sec[len]=0; } /* a c d b: a paired to b, c paired to d */ @@ -809,10 +810,10 @@ void sec_str(int len,char *seq, const vector<vector<bool> >&bp, /* secondary structure assignment for RNA: * 1->unpair, 2->paired with upstream, 3->paired with downstream */ -void make_sec(char *seq, double **x, int len, int *sec,const string atom_opt) +void make_sec(char *seq, double **x, int len, char *sec,const string atom_opt) { - int ii,jj,j; - unsigned int i; + int ii,jj,i,j; + float lb=12.5; // lower bound for " C3'" float ub=15.0; // upper bound for " C3'" if 
(atom_opt==" C4'") {lb=14.0;ub=16.0;} @@ -825,9 +826,9 @@ void make_sec(char *seq, double **x, int len, int *sec,const string atom_opt) vector<bool> bp_tmp(len,false); vector<vector<bool> > bp(len,bp_tmp); bp_tmp.clear(); - for (i=0;(int) i<len; i++) + for (i=0; i<len; i++) { - sec[i]=1; + sec[i]='.'; for (j=i+1; j<len; j++) { if (((seq[i]=='u'||seq[i]=='t')&&(seq[j]=='a' ))|| @@ -843,7 +844,7 @@ void make_sec(char *seq, double **x, int len, int *sec,const string atom_opt) // From 5' to 3': A0 C0 D0 B0: A0 paired to B0, C0 paired to D0 vector<int> A0,B0,C0,D0; - for (i=0;(int) i<len-2; i++) + for (i=0; i<len-2; i++) { for (j=i+3; j<len; j++) { @@ -859,7 +860,7 @@ void make_sec(char *seq, double **x, int len, int *sec,const string atom_opt) } //int sign; - for (i=0; i<A0.size();i++) + for (i=0;i<A0.size();i++) { /* sign=0; @@ -889,10 +890,11 @@ void make_sec(char *seq, double **x, int len, int *sec,const string atom_opt) for (j=0;;j++) { if(A0[i]+j>C0[i]) break; - sec[A0[i]+j]=2; - sec[D0[i]+j]=3; + sec[A0[i]+j]='<'; + sec[D0[i]+j]='>'; } } + sec[len]=0; /* clean up */ A0.clear(); @@ -909,7 +911,7 @@ void make_sec(char *seq, double **x, int len, int *sec,const string atom_opt) //the jth element in y is aligned to the ith element in x if i>=0 //the jth element in y is aligned to a gap in x if i==-1 void get_initial_ss(bool **path, double **val, - const int *secx, const int *secy, int xlen, int ylen, int *y2x) + const char *secx, const char *secy, int xlen, int ylen, int *y2x) { double gap_open=-1.0; NWDP_TM(path, val, secx, secy, xlen, ylen, gap_open, y2x); @@ -1022,10 +1024,9 @@ bool get_initial5( double **r1, double **r2, double **xtm, double **ytm, return flag; } -void score_matrix_rmsd_sec( double **r1, double **r2, - double **score, const int *secx, const int *secy, - double **x, double **y, int xlen, int ylen, - int *y2x, const double D0_MIN, double d0) +void score_matrix_rmsd_sec( double **r1, double **r2, double **score, + const char *secx, const char *secy, 
double **x, double **y, + int xlen, int ylen, int *y2x, const double D0_MIN, double d0) { double t[3], u[3][3]; double rmsd, dij; @@ -1076,7 +1077,7 @@ void score_matrix_rmsd_sec( double **r1, double **r2, //the jth element in y is aligned to the ith element in x if i>=0 //the jth element in y is aligned to a gap in x if i==-1 void get_initial_ssplus(double **r1, double **r2, double **score, bool **path, - double **val, const int *secx, const int *secy, double **x, double **y, + double **val, const char *secx, const char *secy, double **x, double **y, int xlen, int ylen, int *y2x0, int *y2x, const double D0_MIN, double d0) { //create score matrix for DP @@ -1457,35 +1458,41 @@ double DP_iter(double **r1, double **r2, double **xtm, double **ytm, } -void output_superpose(const string filename, const char *fname_super, - double t[3], double u[3][3], const int ter_opt=3) +void output_pymol(const string xname, const string yname, + const string fname_super, double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector<string>&resi_vec1, const vector<string>&resi_vec2, + const string chainID1, const string chainID2) { int compress_type=0; // uncompressed file ifstream fin; redi::ipstream fin_gz; // if file is compressed - if (filename.size()>=3 && - filename.substr(filename.size()-3,3)==".gz") + if (xname.size()>=3 && + xname.substr(xname.size()-3,3)==".gz") { - fin_gz.open("zcat "+filename); + fin_gz.open("zcat "+xname); compress_type=1; } - else if (filename.size()>=4 && - filename.substr(filename.size()-4,4)==".bz2") + else if (xname.size()>=4 && + xname.substr(xname.size()-4,4)==".bz2") { - fin_gz.open("bzcat "+filename); + fin_gz.open("bzcat "+xname); compress_type=2; } - else fin.open(filename.c_str()); + else fin.open(xname.c_str()); stringstream buf; + stringstream buf_pymol; string line; double x[3]; // before transform double x1[3]; // after 
transform /* for PDBx/mmCIF only */ - map<string,unsigned int> _atom_site; - unsigned int atom_site_pos; + map<string,int> _atom_site; + size_t atom_site_pos; vector<string> line_vec; + int infmt=-1; // 0 - PDB, 3 - PDBx/mmCIF while (compress_type?fin_gz.good():fin.good()) { @@ -1494,9 +1501,11 @@ void output_superpose(const string filename, const char *fname_super, if (line.compare(0, 6, "ATOM ")==0 || line.compare(0, 6, "HETATM")==0) // PDB format { + infmt=0; x[0]=atof(line.substr(30,8).c_str()); x[1]=atof(line.substr(38,8).c_str()); x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; transform(t, u, x, x1); buf<<line.substr(0,30)<<setiosflags(ios::fixed) <<setprecision(3) @@ -1505,9 +1514,22 @@ void output_superpose(const string filename, const char *fname_super, } else if (line.compare(0,5,"loop_")==0) // PDBx/mmCIF { + infmt=3; buf<<line<<'\n'; - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + while(1) + { + if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + } + if (line.size()) break; + } buf<<line<<'\n'; if (line.compare(0,11,"_atom_site.")) continue; _atom_site.clear(); @@ -1515,8 +1537,20 @@ void output_superpose(const string filename, const char *fname_super, _atom_site[line.substr(11,line.size()-12)]=atom_site_pos; while(1) { - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + while(1) + { + if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! 
Unexpected end of "+xname); + } + if (line.size()) break; + } if (line.compare(0,11,"_atom_site.")) break; _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; buf<<line<<'\n'; @@ -1542,10 +1576,11 @@ void output_superpose(const string filename, const char *fname_super, x[0]=atof(line_vec[_atom_site["Cartn_x"]].c_str()); x[1]=atof(line_vec[_atom_site["Cartn_y"]].c_str()); x[2]=atof(line_vec[_atom_site["Cartn_z"]].c_str()); + if (mirror_opt) x[2]=-x[2]; transform(t, u, x, x1); for (atom_site_pos=0; atom_site_pos<_atom_site.size(); atom_site_pos++) - { + { if (atom_site_pos==_atom_site["Cartn_x"]) buf<<setiosflags(ios::fixed)<<setprecision(3) <<setw(8)<<x1[0]<<' '; @@ -1574,10 +1609,829 @@ void output_superpose(const string filename, const char *fname_super, if (compress_type) fin_gz.close(); else fin.close(); - ofstream fp(fname_super); + string fname_super_full=fname_super; + if (infmt==0) fname_super_full+=".pdb"; + else if (infmt==3) fname_super_full+=".cif"; + ofstream fp; + fp.open(fname_super_full.c_str()); fp<<buf.str(); fp.close(); buf.str(string()); // clear stream + + string chain1_sele; + string chain2_sele; + int i; + if (!mm_opt) + { + if (split_opt==2 && ter_opt>=1) // align one chain from model 1 + { + chain1_sele=" and c. "+chainID1.substr(1); + chain2_sele=" and c. "+chainID2.substr(1); + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i<chainID1.size();i++) if (chainID1[i]==',') break; + chain1_sele=" and c. "+chainID1.substr(i+1); + for (i=1;i<chainID2.size();i++) if (chainID2[i]==',') break; + chain2_sele=" and c. 
"+chainID2.substr(i+1); + } + } + + /* extract aligned region */ + int i1=-1; + int i2=-1; + string resi1_sele; + string resi2_sele; + string resi1_bond; + string resi2_bond; + string prev_resi1; + string prev_resi2; + string curr_resi1; + string curr_resi2; + if (mm_opt) + { + ; + } + else + { + for (i=0;i<strlen(seqM);i++) + { + i1+=(seqxA[i]!='-' && seqxA[i]!='*'); + i2+=(seqyA[i]!='-'); + if (seqM[i]==' ' || seqxA[i]=='*') continue; + curr_resi1=resi_vec1[i1].substr(0,4); + curr_resi2=resi_vec2[i2].substr(0,4); + if (resi1_sele.size()==0) + resi1_sele = "i. "+curr_resi1; + else + { + resi1_sele+=" or i. "+curr_resi1; + resi1_bond+="bond structure1 and i. "+prev_resi1+ + ", i. "+curr_resi1+"\n"; + } + if (resi2_sele.size()==0) + resi2_sele = "i. "+curr_resi2; + else + { + resi2_sele+=" or i. "+curr_resi2; + resi2_bond+="bond structure2 and i. "+prev_resi2+ + ", i. "+curr_resi2+"\n"; + } + prev_resi1=curr_resi1; + prev_resi2=curr_resi2; + //if (seqM[i]!=':') continue; + } + if (resi1_sele.size()) resi1_sele=" and ( "+resi1_sele+")"; + if (resi2_sele.size()) resi2_sele=" and ( "+resi2_sele+")"; + } + + /* write pymol script */ + vector<string> pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + + for (int p=0;p<pml_list.size();p++) + { + if (mm_opt && p<=1) continue; + buf_pymol + <<"#!/usr/bin/env pymol\n" + <<"cmd.load(\""<<fname_super_full<<"\", \"structure1\")\n" + <<"cmd.load(\""<<yname<<"\", \"structure2\")\n" + <<"hide all\n" + <<"set all_states, "<<((ter_opt==0)?"on":"off")<<'\n'; + if (p==0) // .pml + { + if (chain1_sele.size()) buf_pymol + <<"remove structure1 and not "<<chain1_sele.substr(4)<<"\n"; + if (chain2_sele.size()) buf_pymol + <<"remove structure2 and not "<<chain2_sele.substr(4)<<"\n"; + buf_pymol + <<"remove not n. CA and not n. 
C3'\n" + <<resi1_bond + <<resi2_bond + <<"show stick, structure1"<<chain1_sele<<resi1_sele<<"\n" + <<"show stick, structure2"<<chain2_sele<<resi2_sele<<"\n"; + } + else if (p==1) // _atm.pml + { + buf_pymol + <<"show cartoon, structure1"<<chain1_sele<<resi1_sele<<"\n" + <<"show cartoon, structure2"<<chain2_sele<<resi2_sele<<"\n"; + } + else if (p==2) // _all.pml + { + buf_pymol + <<"show ribbon, structure1"<<chain1_sele<<"\n" + <<"show ribbon, structure2"<<chain2_sele<<"\n"; + } + else if (p==3) // _all_atm.pml + { + buf_pymol + <<"show cartoon, structure1"<<chain1_sele<<"\n" + <<"show cartoon, structure2"<<chain2_sele<<"\n"; + } + else if (p==4) // _all_atm_lig.pml + { + buf_pymol + <<"show cartoon, structure1\n" + <<"show cartoon, structure2\n" + <<"show stick, not polymer\n" + <<"show sphere, not polymer\n"; + } + buf_pymol + <<"color blue, structure1\n" + <<"color red, structure2\n" + <<"set ribbon_width, 6\n" + <<"set stick_radius, 0.3\n" + <<"set sphere_scale, 0.25\n" + <<"set ray_shadow, 0\n" + <<"bg_color white\n" + <<"set transparency=0.2\n" + <<"zoom polymer and ((structure1"<<chain1_sele + <<") or (structure2"<<chain2_sele<<"))\n" + <<endl; + + fp.open((pml_list[p]+".pml").c_str()); + fp<<buf_pymol.str(); + fp.close(); + buf_pymol.str(string()); + } + + /* clean up */ + pml_list.clear(); + + resi1_sele.clear(); + resi2_sele.clear(); + + resi1_bond.clear(); + resi2_bond.clear(); + + prev_resi1.clear(); + prev_resi2.clear(); + + curr_resi1.clear(); + curr_resi2.clear(); + + chain1_sele.clear(); + chain2_sele.clear(); +} + +/* Write superposition files that RasMol can load ("load inline" scripts + * followed by PDB-format coordinates). Structure 1 (xname) is transformed + * by translation t and rotation u, with z negated first when mirror_opt is + * set, and written as chain A; structure 2 (yname) keeps its coordinates + * and is written as chain B. Five buffers are built and written to + * fname_super (aligned CA/C3' atoms), fname_super+"_all" (all CA/C3' atoms), + * fname_super+"_atm" (all atoms of aligned residues), fname_super+"_all_atm" + * (all ATOM records) and fname_super+"_all_atm_lig" (ATOM plus HETATM). + * The two aligned-only files are skipped when mm_opt is nonzero. seqM, seqxA + * and seqyA are the alignment strings; the first 4 characters of each + * resi_vec1/resi_vec2 entry are matched against the residue number of every + * atom record. xlen, ylen, d0A, n_ali8, rmsd, TM1 and Liden are only used + * for the REMARK header. Both input files are parsed as PDB or PDBx/mmCIF. */ +void output_rasmol(const string xname, const string yname, + const string fname_super, double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector<string>&resi_vec1, const vector<string>&resi_vec2, + const string chainID1, const string chainID2, + const int xlen, const int ylen, const double d0A, const int n_ali8, +
const double rmsd, const double TM1, const double Liden) +{ + stringstream buf; + stringstream buf_all; + stringstream buf_atm; + stringstream buf_all_atm; + stringstream buf_all_atm_lig; + //stringstream buf_pdb; + stringstream buf_tm; + string line; + double x[3]; // before transform + double x1[3]; // after transform + bool after_ter; // true if passed the "TER" line in PDB + string asym_id; // chain ID + + buf_tm<<"REMARK US-align" + <<"\nREMARK Structure 1:"<<setw(11)<<left<<xname+chainID1<<" Size= "<<xlen + <<"\nREMARK Structure 2:"<<setw(11)<<yname+chainID2<<right<<" Size= "<<ylen + <<" (TM-score is normalized by "<<setw(4)<<ylen<<", d0=" + <<setiosflags(ios::fixed)<<setprecision(2)<<setw(6)<<d0A<<")" + <<"\nREMARK Aligned length="<<setw(4)<<n_ali8<<", RMSD=" + <<setw(6)<<setiosflags(ios::fixed)<<setprecision(2)<<rmsd + <<", TM-score="<<setw(7)<<setiosflags(ios::fixed)<<setprecision(5)<<TM1 + <<", ID="<<setw(5)<<setiosflags(ios::fixed)<<setprecision(3) + <<((n_ali8>0)?Liden/n_ali8:0)<<endl; + string rasmol_CA_header="load inline\nselect *A\nwireframe .45\nselect *B\nwireframe .20\nselect all\ncolor white\n"; + string rasmol_cartoon_header="load inline\nselect all\ncartoon\nselect *A\ncolor blue\nselect *B\ncolor red\nselect ligand\nwireframe 0.25\nselect solvent\nspacefill 0.25\nselect all\nexit\n"+buf_tm.str(); + if (!mm_opt) buf<<rasmol_CA_header; + buf_all<<rasmol_CA_header; + if (!mm_opt) buf_atm<<rasmol_cartoon_header; + buf_all_atm<<rasmol_cartoon_header; + buf_all_atm_lig<<rasmol_cartoon_header; + + /* selecting chains for -mol */ + string chain1_sele; + string chain2_sele; + int i; + if (!mm_opt) + { + if (split_opt==2 && ter_opt>=1) // align one chain from model 1 + { + chain1_sele=chainID1.substr(1); + chain2_sele=chainID2.substr(1); + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i<chainID1.size();i++) if (chainID1[i]==',') break; + chain1_sele=chainID1.substr(i+1); + for (i=1;i<chainID2.size();i++) if
(chainID2[i]==',') break; + chain2_sele=chainID2.substr(i+1); + } + } + + + /* for PDBx/mmCIF only */ + map<string,int> _atom_site; + int atom_site_pos; + vector<string> line_vec; + string atom; // 4-character atom name + string AA; // 3-character residue name + string resi; // 4-character residue sequence number + string inscode; // 1-character insertion code + string model_index; // model index + bool is_mmcif=false; + + /* used for CONECT record of chain1 */ + int ca_idx1=0; // all CA atoms + int lig_idx1=0; // all atoms + vector <int> idx_vec; + + /* used for CONECT record of chain2 */ + int ca_idx2=0; // all CA atoms + int lig_idx2=0; // all atoms + + /* extract aligned region */ + vector<string> resi_aln1; + vector<string> resi_aln2; + int i1=-1; + int i2=-1; + if (!mm_opt) + { + for (i=0;i<strlen(seqM);i++) + { + i1+=(seqxA[i]!='-'); + i2+=(seqyA[i]!='-'); + if (seqM[i]==' ') continue; + resi_aln1.push_back(resi_vec1[i1].substr(0,4)); + resi_aln2.push_back(resi_vec2[i2].substr(0,4)); + if (seqM[i]!=':') continue; + buf <<"select "<<resi_aln1.back()<<":A," + <<resi_aln2.back()<<":B\ncolor red\n"; + buf_all<<"select "<<resi_aln1.back()<<":A," + <<resi_aln2.back()<<":B\ncolor red\n"; + } + buf<<"select all\nexit\n"<<buf_tm.str(); + } + buf_all<<"select all\nexit\n"<<buf_tm.str(); + + ifstream fin; + /* read first file */ + after_ter=false; + asym_id=""; + fin.open(xname.c_str()); + while (fin.good()) + { + getline(fin, line); + if (ter_opt>=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (is_mmcif==false && line.size()>=54 && + (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' && line[16]!=' ') continue; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + transform(t, u, x, x1); + //buf_pdb<<line.substr(0,30)<<setiosflags(ios::fixed) + //<<setprecision(3) + //<<setw(8)<<x1[0] <<setw(8)<<x1[1]
<<setw(8)<<x1[2] + //<<line.substr(54)<<'\n'; + + if (after_ter && line.compare(0,6,"ATOM ")==0) continue; + lig_idx1++; + buf_all_atm_lig<<line.substr(0,6)<<setw(5)<<lig_idx1 + <<line.substr(11,9)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1] <<setw(8)<<x1[2]<<'\n'; + if (chain1_sele.size() && line[21]!=chain1_sele[0]) continue; + if (after_ter || line.compare(0,6,"ATOM ")) continue; + if (ter_opt>=2) + { + if (ca_idx1 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<<setw(5)<<lig_idx1 + <<line.substr(11,9)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1] <<setw(8)<<x1[2]<<'\n'; + if (!mm_opt && find(resi_aln1.begin(),resi_aln1.end(), + line.substr(22,4))!=resi_aln1.end()) + { + buf_atm<<"ATOM "<<setw(5)<<lig_idx1 + <<line.substr(11,9)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1] <<setw(8)<<x1[2]<<'\n'; + } + if (line.substr(12,4)!=" CA " && line.substr(12,4)!=" C3'") continue; + ca_idx1++; + buf_all<<"ATOM "<<setw(5)<<ca_idx1<<' ' + <<line.substr(12,4)<<' '<<line.substr(17,3)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1]<<setw(8)<<x1[2]<<'\n'; + if (find(resi_aln1.begin(),resi_aln1.end(), + line.substr(22,4))==resi_aln1.end()) continue; + if (!mm_opt) buf<<"ATOM "<<setw(5)<<ca_idx1<<' ' + <<line.substr(12,4)<<' '<<line.substr(17,3)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1]<<setw(8)<<x1[2]<<'\n'; + idx_vec.push_back(ca_idx1); + } + else if (line.compare(0,5,"loop_")==0) // PDBx/mmCIF + { + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR!
Unexpected end of "+xname); + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[line.substr(11,line.size()-12)]=atom_site_pos; + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; + } + + if (is_mmcif==false) + { + //buf_pdb.str(string()); + is_mmcif=true; + } + + while(1) + { + line_vec.clear(); + split(line,line_vec); + if (line_vec[_atom_site["group_PDB"]]!="ATOM" && + line_vec[_atom_site["group_PDB"]]!="HETATM") break; + if (_atom_site.count("pdbx_PDB_model_num")) + { + if (model_index.size() && model_index!= + line_vec[_atom_site["pdbx_PDB_model_num"]]) + break; + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + } + + x[0]=atof(line_vec[_atom_site["Cartn_x"]].c_str()); + x[1]=atof(line_vec[_atom_site["Cartn_y"]].c_str()); + x[2]=atof(line_vec[_atom_site["Cartn_z"]].c_str()); + if (mirror_opt) x[2]=-x[2]; + transform(t, u, x, x1); + + if (_atom_site.count("label_alt_id")==0 || + line_vec[_atom_site["label_alt_id"]]=="."
|| + line_vec[_atom_site["label_alt_id"]]=="A") + { + atom=line_vec[_atom_site["label_atom_id"]]; + if (atom[0]=='"') atom=atom.substr(1); + if (atom.size() && atom[atom.size()-1]=='"') + atom=atom.substr(0,atom.size()-1); + if (atom.size()==0) atom=" "; + else if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + else if (atom.size()>=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if (_atom_site.count("auth_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["auth_asym_id"]]!=chain1_sele; + else if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain1_sele; + if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + //buf_pdb<<left<<setw(6) + //<<line_vec[_atom_site["group_PDB"]]<<right + //<<setw(5)<<lig_idx1%100000<<' '<<atom<<' ' + //<<AA<<" "<<asym_id[asym_id.size()-1] + //<<resi<<inscode<<" " + //<<setiosflags(ios::fixed)<<setprecision(3) + //<<setw(8)<<x1[0] + //<<setw(8)<<x1[1] + //<<setw(8)<<x1[2]<<'\n'; + + /* the key must be spelled "group_PDB": map::operator[] on a + * misspelled key inserts it with index 0 instead of failing */ + if (after_ter==false || + line_vec[_atom_site["group_PDB"]]=="HETATM")
+ { + lig_idx1++; + buf_all_atm_lig<<left<<setw(6) + <<line_vec[_atom_site["group_PDB"]]<<right + <<setw(5)<<lig_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + if (after_ter==false && + line_vec[_atom_site["group_PDB"]]=="ATOM") + { + buf_all_atm<<"ATOM "<<setw(6) + <<setw(5)<<lig_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + if (!mm_opt && find(resi_aln1.begin(), + resi_aln1.end(),resi)!=resi_aln1.end()) + { + buf_atm<<"ATOM "<<setw(6) + <<setw(5)<<lig_idx1%100000<<' ' + <<atom<<' '<<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + } + if (atom==" CA " || atom==" C3'") + { + ca_idx1++; + //mm_opt, split_opt, mirror_opt, chainID1,chainID2); + buf_all<<"ATOM "<<setw(6) + <<setw(5)<<ca_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + if (!mm_opt && find(resi_aln1.begin(), + resi_aln1.end(),resi)!=resi_aln1.end()) + { + buf<<"ATOM "<<setw(6) + <<setw(5)<<ca_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + idx_vec.push_back(ca_idx1); + } + } + } + } + } + + while(1) + { + if (fin.good()) getline(fin, line); + else break; + if (line.size()) break; + } + } + } + else if (line.size() && is_mmcif==false) + { + //buf_pdb<<line<<'\n'; + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=1;i<ca_idx1;i++) buf_all<<"CONECT" +
<<setw(5)<<i%100000<<setw(5)<<(i+1)%100000<<'\n'; + if (!mm_opt) for (i=1;i<idx_vec.size();i++) buf<<"CONECT" + <<setw(5)<<idx_vec[i-1]%100000<<setw(5)<<idx_vec[i]%100000<<'\n'; + idx_vec.clear(); + + /* read second file */ + after_ter=false; + asym_id=""; + model_index.clear(); /* do not carry the model number of structure 1 over to structure 2 */ + fin.open(yname.c_str()); + while (fin.good()) + { + getline(fin, line); + if (ter_opt>=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (line.size()>=54 && (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' && line[16]!=' ') continue; + if (after_ter && line.compare(0,6,"ATOM ")==0) continue; + lig_idx2++; + buf_all_atm_lig<<line.substr(0,6)<<setw(5)<<lig_idx1+lig_idx2 + <<line.substr(11,9)<<" B"<<line.substr(22,32)<<'\n'; + /* structure 2 is filtered by its own chain selection */ + if (chain2_sele.size() && line[21]!=chain2_sele[0]) continue; + if (after_ter || line.compare(0,6,"ATOM ")) continue; + if (ter_opt>=2) + { + if (ca_idx2 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<<setw(5)<<lig_idx1+lig_idx2 + <<line.substr(11,9)<<" B"<<line.substr(22,32)<<'\n'; + if (!mm_opt && find(resi_aln2.begin(),resi_aln2.end(), + line.substr(22,4))!=resi_aln2.end()) + { + buf_atm<<"ATOM "<<setw(5)<<lig_idx1+lig_idx2 + <<line.substr(11,9)<<" B"<<line.substr(22,32)<<'\n'; + } + if (line.substr(12,4)!=" CA " && line.substr(12,4)!=" C3'") continue; + ca_idx2++; + buf_all<<"ATOM "<<setw(5)<<ca_idx1+ca_idx2<<' '<<line.substr(12,4) + <<' '<<line.substr(17,3)<<" B"<<line.substr(22,32)<<'\n'; + if (find(resi_aln2.begin(),resi_aln2.end(),line.substr(22,4) + )==resi_aln2.end()) continue; + if (!mm_opt) buf<<"ATOM "<<setw(5)<<ca_idx1+ca_idx2<<' ' + <<line.substr(12,4)<<' '<<line.substr(17,3)<<" B" + <<line.substr(22,32)<<'\n'; + idx_vec.push_back(ca_idx1+ca_idx2); + } + else if (line.compare(0,5,"loop_")==0) // PDBx/mmCIF + { + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR!
Unexpected end of "+yname); + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[line.substr(11,line.size()-12)]=atom_site_pos; + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+yname); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; + } + + while(1) + { + line_vec.clear(); + split(line,line_vec); + if (line_vec[_atom_site["group_PDB"]]!="ATOM" && + line_vec[_atom_site["group_PDB"]]!="HETATM") break; + if (_atom_site.count("pdbx_PDB_model_num")) + { + if (model_index.size() && model_index!= + line_vec[_atom_site["pdbx_PDB_model_num"]]) + break; + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + } + + if (_atom_site.count("label_alt_id")==0 || + line_vec[_atom_site["label_alt_id"]]=="." || + line_vec[_atom_site["label_alt_id"]]=="A") + { + atom=line_vec[_atom_site["label_atom_id"]]; + if (atom[0]=='"') atom=atom.substr(1); + if (atom.size() && atom[atom.size()-1]=='"') + atom=atom.substr(0,atom.size()-1); + if (atom.size()==0) atom=" "; + else if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + else if (atom.size()>=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if (_atom_site.count("auth_asym_id")) + { + if
(chain2_sele.size()) after_ter + =line_vec[_atom_site["auth_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain2_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + if (after_ter==false || + line_vec[_atom_site["group_PDB"]]=="HETATM") + { + lig_idx2++; + buf_all_atm_lig<<left<<setw(6) + <<line_vec[_atom_site["group_PDB"]]<<right + <<setw(5)<<(lig_idx1+lig_idx2)%100000<<' ' + <<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + if (after_ter==false && + line_vec[_atom_site["group_PDB"]]=="ATOM") + { + buf_all_atm<<"ATOM "<<setw(6) + <<setw(5)<<(lig_idx1+lig_idx2)%100000<<' ' + <<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + if (!mm_opt && find(resi_aln2.begin(), + resi_aln2.end(),resi)!=resi_aln2.end()) + { + buf_atm<<"ATOM "<<setw(6) + <<setw(5)<<(lig_idx1+lig_idx2)%100000<<' ' + <<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + } + if (atom==" CA " || atom==" C3'") + { + ca_idx2++; + buf_all<<"ATOM "<<setw(6) + <<setw(5)<<(ca_idx1+ca_idx2)%100000 + <<' '<<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + if (!mm_opt &&
find(resi_aln2.begin(), + resi_aln2.end(),resi)!=resi_aln2.end()) + { + buf<<"ATOM "<<setw(6) + <<setw(5)<<(ca_idx1+ca_idx2)%100000 + <<' '<<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + idx_vec.push_back(ca_idx1+ca_idx2); + } + } + } + } + } + + if (fin.good()) getline(fin, line); + else break; + } + } + else if (line.size()) + { + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=ca_idx1+1;i<ca_idx1+ca_idx2;i++) buf_all<<"CONECT" + <<setw(5)<<i%100000<<setw(5)<<(i+1)%100000<<'\n'; + for (i=1;i<idx_vec.size();i++) buf<<"CONECT" + <<setw(5)<<idx_vec[i-1]%100000<<setw(5)<<idx_vec[i]%100000<<'\n'; + idx_vec.clear(); + + /* write pymol script */ + ofstream fp; + /* + stringstream buf_pymol; + vector<string> pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + for (i=0;i<pml_list.size();i++) + { + buf_pymol<<"#!/usr/bin/env pymol\n" + <<"load "<<pml_list[i]<<"\n" + <<"hide all\n" + <<((i==0 || i==2)?("show stick\n"):("show cartoon\n")) + <<"color blue, chain A\n" + <<"color red, chain B\n" + <<"set ray_shadow, 0\n" + <<"set stick_radius, 0.3\n" + <<"set sphere_scale, 0.25\n" + <<"show stick, not polymer\n" + <<"show sphere, not polymer\n" + <<"bg_color white\n" + <<"set transparency=0.2\n" + <<"zoom polymer\n" + <<endl; + fp.open((pml_list[i]+".pml").c_str()); + fp<<buf_pymol.str(); + fp.close(); + buf_pymol.str(string()); + pml_list[i].clear(); + } + pml_list.clear(); + */ + + /* write rasmol script */ + if (!mm_opt) + { + fp.open((fname_super).c_str()); + fp<<buf.str(); + fp.close(); + } +
fp.open((fname_super+"_all").c_str()); + fp<<buf_all.str(); + fp.close(); + if (!mm_opt) + { + fp.open((fname_super+"_atm").c_str()); + fp<<buf_atm.str(); + fp.close(); + } + fp.open((fname_super+"_all_atm").c_str()); + fp<<buf_all_atm.str(); + fp.close(); + fp.open((fname_super+"_all_atm_lig").c_str()); + fp<<buf_all_atm_lig.str(); + fp.close(); + //fp.open((fname_super+".pdb").c_str()); + //fp<<buf_pdb.str(); + //fp.close(); + + /* clear stream */ + buf.str(string()); + buf_all.str(string()); + buf_atm.str(string()); + buf_all_atm.str(string()); + buf_all_atm_lig.str(string()); + //buf_pdb.str(string()); + buf_tm.str(string()); + resi_aln1.clear(); + resi_aln2.clear(); + asym_id.clear(); + line_vec.clear(); + atom.clear(); + AA.clear(); + resi.clear(); + inscode.clear(); + model_index.clear(); } /* extract rotation matrix based on TMscore8 */ @@ -1588,7 +2442,7 @@ void output_rotation_matrix(const char* fname_matrix, fout.open(fname_matrix, ios::out | ios::trunc); if (fout)// succeed { - fout << "------ The rotation matrix to rotate Chain_1 to Chain_2 ------\n"; + fout << "------ The rotation matrix to rotate Structure_1 to Structure_2 ------\n"; char dest[1000]; sprintf(dest, "m %18s %14s %14s %14s\n", "t[m]", "u[m][0]", "u[m][1]", "u[m][2]"); fout << string(dest); @@ -1597,12 +2451,12 @@ void output_rotation_matrix(const char* fname_matrix, sprintf(dest, "%d %18.10f %14.10f %14.10f %14.10f\n", k, t[k], u[k][0], u[k][1], u[k][2]); fout << string(dest); } - fout << "\nCode for rotating Structure A from (x,y,z) to (X,Y,Z):\n" + fout << "\nCode for rotating Structure 1 from (x,y,z) to (X,Y,Z):\n" "for(i=0; i<L; i++)\n" "{\n" - " X[i] = t[0] + u[0][0]*x[i] + u[0][1]*y[i] + u[0][2]*z[i]\n" - " Y[i] = t[1] + u[1][0]*x[i] + u[1][1]*y[i] + u[1][2]*z[i]\n" - " Z[i] = t[2] + u[2][0]*x[i] + u[2][1]*y[i] + u[2][2]*z[i]\n" + " X[i] = t[0] + u[0][0]*x[i] + u[0][1]*y[i] + u[0][2]*z[i];\n" + " Y[i] = t[1] + u[1][0]*x[i] + u[1][1]*y[i] + u[1][2]*z[i];\n" + " Z[i] = t[2] +
u[2][0]*x[i] + u[2][1]*y[i] + u[2][2]*z[i];\n" "}\n"; fout.close(); } @@ -1611,48 +2465,48 @@ void output_rotation_matrix(const char* fname_matrix, } //output the final results -void output_results( - const string xname, const string yname, - const char *chainID1, const char *chainID2, +void output_results(const string xname, const string yname, + const string chainID1, const string chainID2, const int xlen, const int ylen, double t[3], double u[3][3], const double TM1, const double TM2, const double TM3, const double TM4, const double TM5, - const double rmsd, const double d0_out, - const char *seqM, const char *seqxA, const char *seqyA, const double Liden, - const int n_ali8, const int n_ali, const int L_ali, - const double TM_ali, const double rmsd_ali, const double TM_0, - const double d0_0, const double d0A, const double d0B, - const double Lnorm_ass, const double d0_scale, - const double d0a, const double d0u, const char* fname_matrix, - const int outfmt_opt, const int ter_opt, const char *fname_super, - const bool i_opt, const bool I_opt, const int a_opt, - const bool u_opt, const bool d_opt) + const double rmsd, const double d0_out, const char *seqM, + const char *seqxA, const char *seqyA, const double Liden, + const int n_ali8, const int L_ali, const double TM_ali, + const double rmsd_ali, const double TM_0, const double d0_0, + const double d0A, const double d0B, const double Lnorm_ass, + const double d0_scale, const double d0a, const double d0u, + const char* fname_matrix, const int outfmt_opt, const int ter_opt, + const int mm_opt, const int split_opt, const int o_opt, + const string fname_super, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const int mirror_opt, + const vector<string>&resi_vec1, const vector<string>&resi_vec2) { if (outfmt_opt<=0) { - printf("\nName of Chain_1: %s%s (to be superimposed onto Chain_2)\n", - xname.c_str(), chainID1); - printf("Name of Chain_2: %s%s\n", yname.c_str(), chainID2); - printf("Length 
of Chain_1: %d residues\n", xlen); - printf("Length of Chain_2: %d residues\n\n", ylen); + printf("\nName of Structure_1: %s%s (to be superimposed onto Structure_2)\n", + xname.c_str(), chainID1.c_str()); + printf("Name of Structure_2: %s%s\n", yname.c_str(), chainID2.c_str()); + printf("Length of Structure_1: %d residues\n", xlen); + printf("Length of Structure_2: %d residues\n\n", ylen); - if (i_opt || I_opt) + if (i_opt) printf("User-specified initial alignment: TM/Lali/rmsd = %7.5lf, %4d, %6.3lf\n", TM_ali, L_ali, rmsd_ali); - printf("Aligned length= %d, RMSD= %6.2f, Seq_ID=n_identical/n_aligned= %4.3f\n", n_ali8, rmsd, Liden/(n_ali8+0.00000001)); - printf("TM-score= %6.5f (if normalized by length of Chain_1, i.e., LN=%d, d0=%.2f)\n", TM2, xlen, d0B); - printf("TM-score= %6.5f (if normalized by length of Chain_2, i.e., LN=%d, d0=%.2f)\n", TM1, ylen, d0A); + printf("Aligned length= %d, RMSD= %6.2f, Seq_ID=n_identical/n_aligned= %4.3f\n", n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + printf("TM-score= %6.5f (normalized by length of Structure_1: L=%d, d0=%.2f)\n", TM2, xlen, d0B); + printf("TM-score= %6.5f (normalized by length of Structure_2: L=%d, d0=%.2f)\n", TM1, ylen, d0A); if (a_opt==1) - printf("TM-score= %6.5f (if normalized by average length of two structures, i.e., LN= %.1f, d0= %.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + printf("TM-score= %6.5f (if normalized by average length of two structures: L=%.1f, d0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); if (u_opt) - printf("TM-score= %6.5f (if normalized by user-specified LN=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); + printf("TM-score= %6.5f (normalized by user-specified L=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); if (d_opt) - printf("TM-score= %6.5f (if scaled by user-specified d0= %.2f, and LN= %d)\n", TM5, d0_scale, ylen); + printf("TM-score= %6.5f (scaled by user-specified d0=%.2f, and L=%d)\n", TM5, d0_scale, ylen); printf("(You should use TM-score normalized by length of the reference structure)\n"); //output 
alignment - printf("\n(\":\" denotes residue pairs of d < %4.1f Angstrom, ", d0_out); + printf("\n(\":\" denotes residue pairs of d <%4.1f Angstrom, ", d0_out); printf("\".\" denotes other aligned residues)\n"); printf("%s\n", seqxA); printf("%s\n", seqM); @@ -1661,16 +2515,16 @@ void output_results( else if (outfmt_opt==1) { printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", - xname.c_str(), chainID1, xlen, d0B, Liden/xlen, TM2); + xname.c_str(), chainID1.c_str(), xlen, d0B, Liden/xlen, TM2); printf("%s\n", seqxA); printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", - yname.c_str(), chainID2, ylen, d0A, Liden/ylen, TM1); + yname.c_str(), chainID2.c_str(), ylen, d0A, Liden/ylen, TM1); printf("%s\n", seqyA); printf("# Lali=%d\tRMSD=%.2f\tseqID_ali=%.3f\n", - n_ali8, rmsd, Liden/(n_ali8+0.00000001)); + n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); - if (i_opt || I_opt) + if (i_opt) printf("# User-specified initial alignment: TM=%.5lf\tLali=%4d\trmsd=%.3lf\n", TM_ali, L_ali, rmsd_ali); if(a_opt) @@ -1687,16 +2541,23 @@ void output_results( else if (outfmt_opt==2) { printf("%s%s\t%s%s\t%.4f\t%.4f\t%.2f\t%4.3f\t%4.3f\t%4.3f\t%d\t%d\t%d", - xname.c_str(), chainID1, yname.c_str(), chainID2, TM2, TM1, rmsd, - Liden/xlen, Liden/ylen, Liden/(n_ali8+0.00000001), + xname.c_str(), chainID1.c_str(), yname.c_str(), chainID2.c_str(), + TM2, TM1, rmsd, Liden/xlen, Liden/ylen, (n_ali8>0)?Liden/n_ali8:0, xlen, ylen, n_ali8); } cout << endl; - if (strlen(fname_matrix)) - output_rotation_matrix(fname_matrix, t, u); - if (strlen(fname_super)) - output_superpose(xname, fname_super, t, u, ter_opt); + if (strlen(fname_matrix)) output_rotation_matrix(fname_matrix, t, u); + + if (o_opt==1) + output_pymol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2); + else if (o_opt==2) + output_rasmol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + 
resi_vec1, resi_vec2, chainID1, chainID2, + xlen, ylen, d0A, n_ali8, rmsd, TM1, Liden); } double standard_TMscore(double **r1, double **r2, double **xtm, double **ytm, @@ -1836,7 +2697,7 @@ void clean_up_after_approx_TM(int *invmap0, int *invmap, * 1 - terminated due to exception * 2-7 - pre-terminated due to low TM-score */ int TMalign_main(double **xa, double **ya, - const char *seqx, const char *seqy, const int *secx, const int *secy, + const char *seqx, const char *seqy, const char *secx, const char *secy, double t0[3], double u0[3][3], double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, double &d0_0, double &TM_0, @@ -1846,8 +2707,7 @@ int TMalign_main(double **xa, double **ya, double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, const int xlen, const int ylen, const vector<string> sequence, const double Lnorm_ass, - const double d0_scale, - const bool i_opt, const bool I_opt, const int a_opt, + const double d0_scale, const int i_opt, const int a_opt, const bool u_opt, const bool d_opt, const bool fast_opt, const int mol_type, const double TMcut=-1) { @@ -1880,7 +2740,7 @@ int TMalign_main(double **xa, double **ya, /***********************/ parameter_set4search(xlen, ylen, D0_MIN, Lnorm, score_d8, d0, d0_search, dcu0); - int simplify_step = 40; //for similified search engine + int simplify_step = 40; //for simplified search engine int score_sum_method = 8; //for scoring method, whether only sum over pairs with dis<score_d8 int i; @@ -1898,7 +2758,7 @@ int TMalign_main(double **xa, double **ya, // Stick to the initial alignment // //************************************************// bool bAlignStick = false; - if (I_opt)// if input has set parameter for "-I" + if (i_opt==3)// if input has set parameter for "-I" { // In the original code, this loop starts from 1, which is // incorrect. Fortran starts from 1 but C++ should starts from 0. 
@@ -2151,7 +3011,7 @@ int TMalign_main(double **xa, double **ya, //************************************************// // get initial alignment from user's input: // //************************************************// - if (i_opt)// if input has set parameter for "-i" + if (i_opt==1)// if input has set parameter for "-i" { for (int j = 0; j < ylen; j++)// Set aligned position to be "-1" invmap[j] = -1; @@ -2209,7 +3069,7 @@ int TMalign_main(double **xa, double **ya, //*******************************************************************// // The alignment will not be changed any more in the following // //*******************************************************************// - //check if the initial alignment is generated approriately + //check if the initial alignment is generated appropriately bool flag=false; for(i=0; i<ylen; i++) { @@ -2221,8 +3081,9 @@ int TMalign_main(double **xa, double **ya, } if(!flag) { - cout << "There is no alignment between the two proteins!" << endl; - cout << "Program stop with no result!" << endl; + cout << "There is no alignment between the two proteins! " + << "Program stop with no result!" 
<< endl; + TM1=TM2=TM3=TM4=TM5=0; return 1; } @@ -2245,7 +3106,7 @@ int TMalign_main(double **xa, double **ya, // Detailed TMscore search engine --> prepare for final TMscore // //********************************************************************// //run detailed TMscore search engine for the best alignment, and - //extract the best rotation matrix (t, u) for the best alginment + //extract the best rotation matrix (t, u) for the best alignment simplify_step=1; if (fast_opt) simplify_step=40; score_sum_method=8; @@ -2268,7 +3129,7 @@ int TMalign_main(double **xa, double **ya, { n_ali++; d=sqrt(dist(&xt[i][0], &ya[j][0])); - if (d <= score_d8 || (I_opt == true)) + if (d <= score_d8 || (i_opt == 3)) { m1[k]=i; m2[k]=j; @@ -2324,6 +3185,7 @@ int TMalign_main(double **xa, double **ya, TM2 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t, u, simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + double Lnorm_d0; if (a_opt>0) { //normalized by average length of structures A, B @@ -2359,6 +3221,7 @@ int TMalign_main(double **xa, double **ya, d0_out=d0_scale; d0_0=d0_scale; //Lnorm_0=ylen; + Lnorm_d0=Lnorm_0; local_d0_search = d0_search; TM5 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, @@ -2372,7 +3235,8 @@ int TMalign_main(double **xa, double **ya, seqM.assign( ali_len,' '); seqyA.assign(ali_len,'-'); - do_rotation(xa, xt, xlen, t, u); + //do_rotation(xa, xt, xlen, t, u); + do_rotation(xa, xt, xlen, t0, u0); int kk=0, i_old=0, j_old=0; d=0; @@ -2435,3 +3299,163 @@ int TMalign_main(double **xa, double **ya, delete [] m2; return 0; // zero for no exception } + +/* entry function for TM-align with circular permutation + * i_opt, a_opt, u_opt, d_opt, TMcut are not implemented yet */ +int CPalign_main(double **xa, double **ya, + const char *seqx, const char *seqy, const char *secx, const char *secy, + double t0[3], double u0[3][3], + double &TM1, double &TM2, double &TM3, 
double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const vector<string> sequence, const double Lnorm_ass, + const double d0_scale, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, const double TMcut=-1) +{ + char *seqx_cp; // for the protein sequence + char *secx_cp; // for the secondary structure + double **xa_cp; // coordinates + string seqxA_cp,seqyA_cp; // alignment + int i,r; + int cp_point=0; // position of circular permutation + int cp_aln_best=0; // amount of aligned residue in sliding window + int cp_aln_current;// amount of aligned residue in sliding window + + /* duplicate structure */ + NewArray(&xa_cp, xlen*2, 3); + seqx_cp = new char[xlen*2 + 1]; + secx_cp = new char[xlen*2 + 1]; + for (r=0;r<xlen;r++) + { + xa_cp[r+xlen][0]=xa_cp[r][0]=xa[r][0]; + xa_cp[r+xlen][1]=xa_cp[r][1]=xa[r][1]; + xa_cp[r+xlen][2]=xa_cp[r][2]=xa[r][2]; + seqx_cp[r+xlen]=seqx_cp[r]=seqx[r]; + secx_cp[r+xlen]=secx_cp[r]=secx[r]; + } + seqx_cp[2*xlen]=0; + secx_cp[2*xlen]=0; + + /* fTM-align alignment */ + double TM1_cp,TM2_cp; + TMalign_main(xa_cp, ya, seqx_cp, seqy, secx_cp, secy, + t0, u0, TM1_cp, TM2_cp, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA_cp, seqyA_cp, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen*2, ylen, sequence, Lnorm_ass, d0_scale, + 0, false, false, false, true, mol_type, -1); + + /* delete gap in seqxA_cp */ + r=0; + seqxA=seqxA_cp; + seqyA=seqyA_cp; + for (i=0;i<seqxA_cp.size();i++) + { + if (seqxA_cp[i]!='-') + { + seqxA[r]=seqxA_cp[i]; + seqyA[r]=seqyA_cp[i]; + r++; + } + } + seqxA=seqxA.substr(0,r); + seqyA=seqyA.substr(0,r); + + /* count the number of aligned residues in each window + * r - 
residue index in the original unaligned sequence + * i - position in the alignment */ + for (r=0;r<xlen-1;r++) + { + cp_aln_current=0; + for (i=r;i<r+xlen;i++) cp_aln_current+=(seqyA[i]!='-'); + + if (cp_aln_current>cp_aln_best) + { + cp_aln_best=cp_aln_current; + cp_point=r; + } + } + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + seqxA_cp.clear(); + seqyA_cp.clear(); + rmsd0=Liden=n_ali=n_ali8=0; + + /* fTM-align alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, false, false, false, true, mol_type, -1); + + /* do not use cricular permutation of number of aligned residues is not + * larger than sequence-order dependent alignment */ + if (n_ali8>cp_aln_best) cp_point=0; + + /* prepare structure for final alignment */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + rmsd0=Liden=n_ali=n_ali8=0; + if (cp_point!=0) + { + for (r=0;r<xlen;r++) + { + xa_cp[r][0]=xa_cp[r+cp_point][0]; + xa_cp[r][1]=xa_cp[r+cp_point][1]; + xa_cp[r][2]=xa_cp[r+cp_point][2]; + seqx_cp[r]=seqx_cp[r+cp_point]; + secx_cp[r]=secx_cp[r+cp_point]; + } + } + seqx_cp[xlen]=0; + secx_cp[xlen]=0; + + /* full TM-align */ + TMalign_main(xa_cp, ya, seqx_cp, seqy, secx_cp, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA_cp, seqyA_cp, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, fast_opt, mol_type, TMcut); + + /* correct alignment + * r - residue index in the original unaligned sequence + * i - position in the alignment */ + if (cp_point>0) + { + r=0; + for (i=0;i<seqxA_cp.size();i++) + { + r+=(seqxA_cp[i]!='-'); + if (r>=(xlen-cp_point)) + { + i++; + break; + } + } + seqxA=seqxA_cp.substr(0,i)+'*'+seqxA_cp.substr(i); + seqM =seqM.substr(0,i) +' 
'+seqM.substr(i); + seqyA=seqyA_cp.substr(0,i)+'-'+seqyA_cp.substr(i); + } + else + { + seqxA=seqxA_cp; + seqyA=seqyA_cp; + } + + /* clean up */ + delete[]seqx_cp; + delete[]secx_cp; + DeleteArray(&xa_cp,xlen*2); + seqxA_cp.clear(); + seqyA_cp.clear(); + return cp_point; +} diff --git a/modules/bindings/src/tmalign/TMscore.cpp b/modules/bindings/src/tmalign/TMscore.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c2ca9958a1ceabb2caf0b914aafa13bb87490635 --- /dev/null +++ b/modules/bindings/src/tmalign/TMscore.cpp @@ -0,0 +1,525 @@ +#include "TMscore.h" + +using namespace std; + +void print_version() +{ + cout << +"\n" +" *************************************************************************\n" +" * TM-SCORE *\n" +" * A scoring function to assess the similarity of protein structures *\n" +" * Based on statistics: *\n" +" * 0.0 < TM-score < 0.17, random structural similarity *\n" +" * 0.5 < TM-score < 1.00, in about the same fold *\n" +" * Reference: Yang Zhang and Jeffrey Skolnick, Proteins 2004 57: 702-710 *\n" +" * For comments, please email to: yangzhanglab@umich.edu *\n" +" *************************************************************************" + << endl; +} + +void print_extra_help() +{ + cout << +"Additional options:\n" +" -a TM-score normalized by the average length of two structures\n" +" T or F, (default F)\n" +"\n" +" -m Output TM-score rotation matrix\n" +"\n" +" -d TM-score scaled by an assigned d0, e.g. 5 Angstroms\n" +"\n" +" -fast Fast but slightly inaccurate alignment\n" +"\n" +" -dir Perform all-against-all alignment among the list of PDB\n" +" chains listed by 'chain_list' under 'chain_folder'. Note\n" +" that the slash is necessary.\n" +" $ TMalign -dir chain_folder/ chain_list\n" +"\n" +" -dir1 Use chain2 to search a list of PDB chains listed by 'chain1_list'\n" +" under 'chain1_folder'. 
Note that the slash is necessary.\n" +" $ TMalign -dir1 chain1_folder/ chain1_list chain2\n" +"\n" +" -dir2 Use chain1 to search a list of PDB chains listed by 'chain2_list'\n" +" under 'chain2_folder'\n" +" $ TMalign chain1 -dir2 chain2_folder/ chain2_list\n" +"\n" +" -suffix (Only when -dir1 and/or -dir2 are set, default is empty)\n" +" add file name suffix to files listed by chain1_list or chain2_list\n" +"\n" +" -atom 4-character atom name used to represent a residue.\n" +" Default is \" C3'\" for RNA/DNA and \" CA \" for proteins\n" +" (note the spaces before and after CA).\n" +"\n" +" -mol Molecule type: RNA or protein\n" +" Default is detect molecule type automatically\n" +"\n" +" -ter Strings to mark the end of a chain\n" +" 3: (default) TER, ENDMDL, END or different chain ID\n" +" 2: ENDMDL, END, or different chain ID\n" +" 1: ENDMDL or END\n" +" 0: (default in the first C++ TMalign) end of file\n" +"\n" +" -split Whether to split PDB file into multiple chains\n" +" 0: (default) treat the whole structure as one single chain\n" +" 1: treat each MODEL as a separate chain (-ter should be 0)\n" +" 2: treat each chain as a seperate chain (-ter should be <=1)\n" +"\n" +" -outfmt Output format\n" +" 0: (default) full output\n" +" 1: fasta format compact output\n" +" 2: tabular format very compact output\n" +" -1: full output, but without version or citation information\n" +"\n" +" -mirror Whether to align the mirror image of input structure\n" +" 0: (default) do not align mirrored structure\n" +" 1: align mirror of chain1 to origin chain2\n" +"\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" +" -infmt1 Input format for chain1\n" +" -infmt2 Input format for chain2\n" +" -1: (default) automatically detect PDB or PDBx/mmCIF format\n" +" 0: PDB format\n" +" 1: SPICKER format\n" +" 2: xyz format\n" +" 3: PDBx/mmCIF format\n" + 
<<endl; +} + +void print_help(bool h_opt=false) +{ + //print_version(); + cout << +"\n" +" Brief instruction for running TM-score program:\n" +" (For detail: Zhang & Skolnick, Proteins, 2004 57:702-10)\n" +"\n" +" 1. Run TM-score to compare 'model' and 'native':\n" +" $ TMscore model.pdb native.pdb\n" +"\n" +" 2. Run TM-score to compare two complex structures with multiple chains\n" +" $ TMscore -c model.pdb native.pdb\n" +"\n" +" 2. TM-score normalized with an assigned scale d0 e.g. 5 A:\n" +" $ TMscore model.pdb native.pdb -d 5\n" +"\n" +" 3. TM-score normalized by a specific length, e.g. 120 AA:\n" +" $ TMscore model.pdb native.pdv -l 120\n" +"\n" +" 4. TM-score with superposition output, e.g. 'TM_sup.pdb':\n" +" $ TMscore model.pdb native.pdb -o TM_sup.pdb\n" +" To view superimposed atomic model by PyMOL:\n" +" $ pymol TM_sup.pdb native.pdb\n" + <<endl; + + if (h_opt) print_extra_help(); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) print_help(); + + /**********************/ + /* get argument */ + /**********************/ + string xname = ""; + string yname = ""; + string fname_super = ""; // file name for superposed structure + string fname_lign = ""; // file name for user alignment + string fname_matrix= ""; // file name for output matrix + vector<string> sequence; // get value from alignment file + double Lnorm_ass, d0_scale; + + bool h_opt = false; // print full help message + bool v_opt = false; // print version + bool m_opt = false; // flag for -m, output rotation matrix + bool o_opt = false; // flag for -o, output superposed structure + int a_opt = 0; // flag for -a, do not normalized by average length + bool u_opt = false; // flag for -u, normalized by user specified length + bool d_opt = false; // flag for -d, user specified d0 + + double TMcut =-1; + int infmt1_opt=-1; // PDB or PDBx/mmCIF format for chain_1 + int infmt2_opt=-1; // PDB or PDBx/mmCIF format for chain_2 + int ter_opt =3; // TER, END, or different 
chainID + int split_opt =0; // do not split chain + int outfmt_opt=0; // set -outfmt to full output + bool fast_opt =false; // flags for -fast, fTM-align algorithm + int mirror_opt=0; // do not align mirror + int het_opt=0; // do not read HETATM residues + string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA + string mol_opt ="auto";// auto-detect the molecule type as protein/RNA + string suffix_opt=""; // set -suffix to empty + string dir_opt =""; // set -dir to empty + string dir1_opt =""; // set -dir1 to empty + string dir2_opt =""; // set -dir2 to empty + int byresi_opt=1; // TM-score without -c + vector<string> chain1_list; // only when -dir1 is set + vector<string> chain2_list; // only when -dir2 is set + + for(int i = 1; i < argc; i++) + { + if ( !strcmp(argv[i],"-o") && i < (argc-1) ) + { + fname_super = argv[i + 1]; o_opt = true; i++; + } + else if ( (!strcmp(argv[i],"-u") || !strcmp(argv[i],"-l") || + !strcmp(argv[i],"-L")) && i < (argc-1) ) + { + Lnorm_ass = atof(argv[i + 1]); u_opt = true; i++; + } + else if ( !strcmp(argv[i],"-a") && i < (argc-1) ) + { + if (!strcmp(argv[i + 1], "T")) a_opt=true; + else if (!strcmp(argv[i + 1], "F")) a_opt=false; + else + { + a_opt=atoi(argv[i + 1]); + if (a_opt!=-2 && a_opt!=-1 && a_opt!=1) + PrintErrorAndQuit("-a must be -2, -1, 1, T or F"); + } + i++; + } + else if ( !strcmp(argv[i],"-d") && i < (argc-1) ) + { + d0_scale = atof(argv[i + 1]); d_opt = true; i++; + } + else if ( !strcmp(argv[i],"-v") ) + { + v_opt = true; + } + else if ( !strcmp(argv[i],"-h") ) + { + h_opt = true; + } + else if (!strcmp(argv[i], "-m") && i < (argc-1) ) + { + fname_matrix = argv[i + 1]; m_opt = true; i++; + }// get filename for rotation matrix + else if (!strcmp(argv[i], "-fast")) + { + fast_opt = true; + } + else if ( !strcmp(argv[i],"-infmt1") && i < (argc-1) ) + { + infmt1_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-infmt2") && i < (argc-1) ) + { + infmt2_opt=atoi(argv[i + 1]); i++; + } + else if 
( !strcmp(argv[i],"-ter") && i < (argc-1) ) + { + ter_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-split") && i < (argc-1) ) + { + split_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-atom") && i < (argc-1) ) + { + atom_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-mol") && i < (argc-1) ) + { + mol_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir") && i < (argc-1) ) + { + dir_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir1") && i < (argc-1) ) + { + dir1_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir2") && i < (argc-1) ) + { + dir2_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-suffix") && i < (argc-1) ) + { + suffix_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-outfmt") && i < (argc-1) ) + { + outfmt_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-c") ) + { + byresi_opt=2; + } + else if ( !strcmp(argv[i],"-mirror") && i < (argc-1) ) + { + mirror_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } + else if (xname.size() == 0) xname=argv[i]; + else if (yname.size() == 0) yname=argv[i]; + else PrintErrorAndQuit(string("ERROR! 
Undefined option ")+argv[i]); + } + + if(xname.size()==0 || (yname.size()==0 && dir_opt.size()==0) || + (yname.size() && dir_opt.size())) + { + if (h_opt) print_help(h_opt); + if (v_opt) + { + print_version(); + exit(EXIT_FAILURE); + } + if (xname.size()==0) + PrintErrorAndQuit("Please provide input structures"); + else if (yname.size()==0 && dir_opt.size()==0) + PrintErrorAndQuit("Please provide structure B"); + else if (yname.size() && dir_opt.size()) + PrintErrorAndQuit("Please provide only one file name if -dir is set"); + } + + if (suffix_opt.size() && dir_opt.size()+dir1_opt.size()+dir2_opt.size()==0) + PrintErrorAndQuit("-suffix is only valid if -dir, -dir1 or -dir2 is set"); + if ((dir_opt.size() || dir1_opt.size() || dir2_opt.size())) + { + if (m_opt || o_opt) + PrintErrorAndQuit("-m or -o cannot be set with -dir, -dir1 or -dir2"); + else if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) + PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); + } + if (atom_opt.size()!=4) + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); + if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") + PrintErrorAndQuit("ERROR! Molecule type must be either RNA or protein."); + else if (mol_opt=="protein" && atom_opt=="auto") + atom_opt=" CA "; + else if (mol_opt=="RNA" && atom_opt=="auto") + atom_opt=" C3'"; + + if (u_opt && Lnorm_ass<=0) + PrintErrorAndQuit("Wrong value for option -u! It should be >0"); + if (d_opt && d0_scale<=0) + PrintErrorAndQuit("Wrong value for option -d! 
It should be >0"); + if (outfmt_opt>=2 && (a_opt || u_opt || d_opt)) + PrintErrorAndQuit("-outfmt 2 cannot be used with -a, -u, -L, -d"); + if (byresi_opt>=2 && ter_opt>=2) + PrintErrorAndQuit("-byresi >=2 should be used with -ter <=1"); + if (split_opt==1 && ter_opt!=0) + PrintErrorAndQuit("-split 1 should be used with -ter 0"); + else if (split_opt==2 && ter_opt!=0 && ter_opt!=1) + PrintErrorAndQuit("-split 2 should be used with -ter 0 or 1"); + if (split_opt<0 || split_opt>2) + PrintErrorAndQuit("-split can only be 0, 1 or 2"); + + if (m_opt && fname_matrix == "") // Output rotation matrix: matrix.txt + PrintErrorAndQuit("ERROR! Please provide a file name for option -m!"); + + /* parse file list */ + if (dir1_opt.size()+dir_opt.size()==0) chain1_list.push_back(xname); + else file2chainlist(chain1_list, xname, dir_opt+dir1_opt, suffix_opt); + + if (dir_opt.size()) + for (int i=0;i<chain1_list.size();i++) + chain2_list.push_back(chain1_list[i]); + else if (dir2_opt.size()==0) chain2_list.push_back(yname); + else file2chainlist(chain2_list, yname, dir2_opt, suffix_opt); + + if (outfmt_opt==2) + cout<<"#PDBchain1\tPDBchain2\tTM1\tTM2\t" + <<"RMSD\tID1\tID2\tIDali\tL1\tL2\tLali"<<endl; + + /* declare previously global variables */ + vector<vector<string> >PDB_lines1; // text of chain1 + vector<vector<string> >PDB_lines2; // text of chain2 + vector<int> mol_vec1; // molecule type of chain1, RNA if >0 + vector<int> mol_vec2; // molecule type of chain2, RNA if >0 + vector<string> chainID_list1; // list of chainID1 + vector<string> chainID_list2; // list of chainID2 + int i,j; // file index + int chain_i,chain_j; // chain index + int r; // residue index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // for the protein sequence + double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and + // ya[0...ylen-1][0..2], in general, + // ya is regarded as native structure + // --> superpose xa onto ya + 
vector<string> resi_vec1; // residue index for chain1 + vector<string> resi_vec2; // residue index for chain2 + + /* loop over file names */ + for (i=0;i<chain1_list.size();i++) + { + /* parse chain 1 */ + xname=chain1_list[i]; + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + if (!xchainnum) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". Chain number 0."<<endl; + continue; + } + for (chain_i=0;chain_i<xchainnum;chain_i++) + { + xlen=PDB_lines1[chain_i].size(); + if (mol_opt=="RNA") mol_vec1[chain_i]=1; + else if (mol_opt=="protein") mol_vec1[chain_i]=-1; + if (!xlen) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". Chain length 0."<<endl; + continue; + } + else if (xlen<3) + { + cerr<<"Sequence is too short <3!: "<<xname<<endl; + continue; + } + NewArray(&xa, xlen, 3); + seqx = new char[xlen + 1]; + xlen = read_PDB(PDB_lines1[chain_i], xa, seqx, + resi_vec1, byresi_opt); + if (mirror_opt) for (r=0;r<xlen;r++) xa[r][2]=-xa[r][2]; + + for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) + { + /* parse chain 2 */ + if (PDB_lines2.size()==0) + { + yname=chain2_list[j]; + ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, + mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, + het_opt); + if (!ychainnum) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". Chain number 0."<<endl; + continue; + } + } + for (chain_j=0;chain_j<ychainnum;chain_j++) + { + ylen=PDB_lines2[chain_j].size(); + if (mol_opt=="RNA") mol_vec2[chain_j]=1; + else if (mol_opt=="protein") mol_vec2[chain_j]=-1; + if (!ylen) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". 
Chain length 0."<<endl; + continue; + } + else if (ylen<3) + { + cerr<<"Sequence is too short <3!: "<<yname<<endl; + continue; + } + NewArray(&ya, ylen, 3); + seqy = new char[ylen + 1]; + ylen = read_PDB(PDB_lines2[chain_j], ya, seqy, + resi_vec2, byresi_opt); + + if (byresi_opt) extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,byresi_opt); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + double rmsd_d0_out=0; + int L_lt_d=0; + double GDT_list[5]={0,0,0,0,0}; // 0.5, 1, 2, 4, 8 + double maxsub=0; + + /* entry function for structure alignment */ + TMscore_main( + xa, ya, seqx, seqy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + a_opt, u_opt, d_opt, fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j], + GDT_list,maxsub,TMcut); + + /* print result */ + if (outfmt_opt==0) print_version(); + output_TMscore_results( + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), + chainID_list1[chain_i], + chainID_list2[chain_j], + xlen, ylen, t0, u0, TM1, TM2, + TM3, TM4, TM5, rmsd0, d0_out, + seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, + Lnorm_ass, d0_scale, d0a, d0u, + (m_opt?fname_matrix+chainID_list1[chain_i]:"").c_str(), + outfmt_opt, ter_opt, + (o_opt?fname_super+chainID_list1[chain_i]:"").c_str(), + a_opt, u_opt, d_opt, mirror_opt, + L_lt_d, rmsd_d0_out, GDT_list, maxsub, + split_opt, 
resi_vec1, resi_vec2); + + /* Done! Free memory */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + DeleteArray(&ya, ylen); + delete [] seqy; + resi_vec2.clear(); + } // chain_j + if (chain2_list.size()>1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + } // j + PDB_lines1[chain_i].clear(); + DeleteArray(&xa, xlen); + delete [] seqx; + resi_vec1.clear(); + } // chain_i + xname.clear(); + PDB_lines1.clear(); + chainID_list1.clear(); + mol_vec1.clear(); + } // i + if (chain2_list.size()==1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + resi_vec2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + chain1_list.clear(); + chain2_list.clear(); + sequence.clear(); + return 0; +} diff --git a/modules/bindings/src/tmalign/TMscore.h b/modules/bindings/src/tmalign/TMscore.h new file mode 100644 index 0000000000000000000000000000000000000000..445335c79e9f08561d0adef17addfeb2cff79830 --- /dev/null +++ b/modules/bindings/src/tmalign/TMscore.h @@ -0,0 +1,958 @@ +#include "TMalign.h" + +int score_fun8( double **xa, double **ya, int n_ali, double d, int i_ali[], + double *score1, int score_sum_method, const double Lnorm, + const double score_d8, const double d0, + double GDT_list_tmp[5], double &maxsub_tmp) +{ + double score_sum=0, di; + double d_tmp=d*d; + double d02=d0*d0; + double score_d8_cut = score_d8*score_d8; + + int i, n_cut, inc=0; + + while(1) + { + for (i=0;i<5;i++) GDT_list_tmp[i]=0; + maxsub_tmp=0; + + n_cut=0; + score_sum=0; + for(i=0; i<n_ali; i++) + { + di = dist(xa[i], ya[i]); + if(di<d_tmp) + { + i_ali[n_cut]=i; + n_cut++; + } + if(score_sum_method==8) + { + if(di<=score_d8_cut) score_sum += 1/(1+di/d02); + } + else score_sum += 1/(1+di/d02); + + /* for maxsub score */ + //maxsub_tmp+=1/(1+di/12.25); + if (di<64) // 8*8=64 + { + GDT_list_tmp[4]+=1; + if 
(di<16) // 4*4=16 + { + GDT_list_tmp[3]+=1; + if (di<12.25) // 3.5^2=12.25 + { + maxsub_tmp+=1/(1+di/12.25); + if (di<4) // 2*2=4 + { + GDT_list_tmp[2]+=1; + if (di<1) // 1*1=1 + { + GDT_list_tmp[1]+=1; + if (di<0.25) // 0.5*0.5=0.25 + GDT_list_tmp[0]+=1; + } + } + } + } + } + } + //there are not enough feasible pairs, reliefe the threshold + if(n_cut<3 && n_ali>3) + { + inc++; + double dinc=(d+inc*0.5); + d_tmp = dinc * dinc; + } + else break; + } + + *score1=score_sum/Lnorm; + return n_cut; +} + +int score_fun8_standard(double **xa, double **ya, int n_ali, double d, + int i_ali[], double *score1, int score_sum_method, + double score_d8, double d0, double GDT_list_tmp[5], double &maxsub_tmp) +{ + double score_sum = 0, di; + double d_tmp = d*d; + double d02 = d0*d0; + double score_d8_cut = score_d8*score_d8; + + int i, n_cut, inc = 0; + while (1) + { + for (i=0;i<5;i++) GDT_list_tmp[i]=0; + maxsub_tmp=0; + n_cut = 0; + score_sum = 0; + for (i = 0; i<n_ali; i++) + { + di = dist(xa[i], ya[i]); + if (di<d_tmp) + { + i_ali[n_cut] = i; + n_cut++; + } + if (score_sum_method == 8) + { + if (di <= score_d8_cut) score_sum += 1 / (1 + di / d02); + } + else + { + score_sum += 1 / (1 + di / d02); + } + + /* for maxsub score */ + //maxsub_tmp+=1/(1+di/12.25); + if (di<64) // 8*8=64 + { + GDT_list_tmp[4]+=1; + if (di<16) // 4*4=16 + { + GDT_list_tmp[3]+=1; + if (di<12.25) // 3.5^2=12.25 + { + maxsub_tmp+=1/(1+di/12.25); + if (di<4) // 2*2=4 + { + GDT_list_tmp[2]+=1; + if (di<1) // 1*1=1 + { + GDT_list_tmp[1]+=1; + if (di<0.25) // 0.5*0.5=0.25 + GDT_list_tmp[0]+=1; + } + } + } + } + } + } + //there are not enough feasible pairs, reliefe the threshold + if (n_cut<3 && n_ali>3) + { + inc++; + double dinc = (d + inc*0.5); + d_tmp = dinc * dinc; + } + else break; + } + + *score1 = score_sum / n_ali; + return n_cut; +} + +double TMscore8_search(double **r1, double **r2, double **xtm, double **ytm, + double **xt, int Lali, double t0[3], double u0[3][3], int simplify_step, + int 
score_sum_method, double *Rcomm, double local_d0_search, double Lnorm, + double score_d8, double d0, double GDT_list[5], double &maxsub) +{ + double GDT_list_tmp[5]={0,0,0,0,0}; + double maxsub_tmp=0; + int i, m; + double score_max, score, rmsd; + const int kmax=Lali; + int k_ali[kmax], ka, k; + double t[3]; + double u[3][3]; + double d; + + + //iterative parameters + int n_it=20; //maximum number of iterations + int n_init_max=6; //maximum number of different fragment length + int L_ini[n_init_max]; //fragment lengths, Lali, Lali/2, Lali/4 ... 4 + int L_ini_min=4; + if(Lali<L_ini_min) L_ini_min=Lali; + + int n_init=0, i_init; + for(i=0; i<n_init_max-1; i++) + { + n_init++; + L_ini[i]=(int) (Lali/pow(2.0, (double) i)); + if(L_ini[i]<=L_ini_min) + { + L_ini[i]=L_ini_min; + break; + } + } + if(i==n_init_max-1) + { + n_init++; + L_ini[i]=L_ini_min; + } + + score_max=-1; + //find the maximum score starting from local structures superposition + int i_ali[kmax], n_cut; + int L_frag; //fragment length + int iL_max; //maximum starting postion for the fragment + + for(i_init=0; i_init<n_init; i_init++) + { + L_frag=L_ini[i_init]; + iL_max=Lali-L_frag; + + i=0; + while(1) + { + //extract the fragment starting from position i + ka=0; + for(k=0; k<L_frag; k++) + { + int kk=k+i; + r1[k][0]=xtm[kk][0]; + r1[k][1]=xtm[kk][1]; + r1[k][2]=xtm[kk][2]; + + r2[k][0]=ytm[kk][0]; + r2[k][1]=ytm[kk][1]; + r2[k][2]=ytm[kk][2]; + + k_ali[ka]=kk; + ka++; + } + + //extract rotation matrix based on the fragment + Kabsch(r1, r2, L_frag, 1, &rmsd, t, u); + if (simplify_step != 1) + *Rcomm = 0; + do_rotation(xtm, xt, Lali, t, u); + + //get subsegment of this fragment + d = local_d0_search - 1; + n_cut=score_fun8(xt, ytm, Lali, d, i_ali, &score, + score_sum_method, Lnorm, score_d8, d0, + GDT_list_tmp, maxsub_tmp); + if(score>score_max) + { + score_max=score; + + //save the rotation matrix + for(k=0; k<3; k++) + { + t0[k]=t[k]; + u0[k][0]=u[k][0]; + u0[k][1]=u[k][1]; + u0[k][2]=u[k][2]; + } + } + 
if (maxsub_tmp>maxsub) maxsub=maxsub_tmp; + for (k=0;k<5;k++) + if (GDT_list_tmp[k]>GDT_list[k]) + GDT_list[k]=GDT_list_tmp[k]; + + //try to extend the alignment iteratively + d = local_d0_search + 1; + for(int it=0; it<n_it; it++) + { + ka=0; + for(k=0; k<n_cut; k++) + { + m=i_ali[k]; + r1[k][0]=xtm[m][0]; + r1[k][1]=xtm[m][1]; + r1[k][2]=xtm[m][2]; + + r2[k][0]=ytm[m][0]; + r2[k][1]=ytm[m][1]; + r2[k][2]=ytm[m][2]; + + k_ali[ka]=m; + ka++; + } + //extract rotation matrix based on the fragment + Kabsch(r1, r2, n_cut, 1, &rmsd, t, u); + do_rotation(xtm, xt, Lali, t, u); + n_cut=score_fun8(xt, ytm, Lali, d, i_ali, &score, + score_sum_method, Lnorm, score_d8, d0); + if(score>score_max) + { + score_max=score; + + //save the rotation matrix + for(k=0; k<3; k++) + { + t0[k]=t[k]; + u0[k][0]=u[k][0]; + u0[k][1]=u[k][1]; + u0[k][2]=u[k][2]; + } + } + if (maxsub_tmp>maxsub) maxsub=maxsub_tmp; + for (k=0;k<5;k++) + if (GDT_list_tmp[k]>GDT_list[k]) + GDT_list[k]=GDT_list_tmp[k]; + + //check if it converges + if(n_cut==ka) + { + for(k=0; k<n_cut; k++) + { + if(i_ali[k]!=k_ali[k]) break; + } + if(k==n_cut) break; + } + } //for iteration + + if(i<iL_max) + { + i=i+simplify_step; //shift the fragment + if(i>iL_max) i=iL_max; //do this to use the last missed fragment + } + else if(i>=iL_max) break; + }//while(1) + //end of one fragment + }//for(i_init + return score_max; +} + +double TMscore8_search_standard( double **r1, double **r2, + double **xtm, double **ytm, double **xt, int Lali, + double t0[3], double u0[3][3], int simplify_step, int score_sum_method, + double *Rcomm, double local_d0_search, double score_d8, double d0, + double GDT_list[5], double &maxsub) +{ + double GDT_list_tmp[5]={0,0,0,0,0}; + double maxsub_tmp=0; + int i, m; + double score_max, score, rmsd; + const int kmax = Lali; + int k_ali[kmax], ka, k; + double t[3]; + double u[3][3]; + double d; + + //iterative parameters + int n_it = 20; //maximum number of iterations + int n_init_max = 6; //maximum number of 
different fragment length + int L_ini[n_init_max]; //fragment lengths, Lali, Lali/2, Lali/4 ... 4 + int L_ini_min = 4; + if (Lali<L_ini_min) L_ini_min = Lali; + + int n_init = 0, i_init; + for (i = 0; i<n_init_max - 1; i++) + { + n_init++; + L_ini[i] = (int)(Lali / pow(2.0, (double)i)); + if (L_ini[i] <= L_ini_min) + { + L_ini[i] = L_ini_min; + break; + } + } + if (i == n_init_max - 1) + { + n_init++; + L_ini[i] = L_ini_min; + } + + score_max = -1; + //find the maximum score starting from local structures superposition + int i_ali[kmax], n_cut; + int L_frag; //fragment length + int iL_max; //maximum starting postion for the fragment + + for (i_init = 0; i_init<n_init; i_init++) + { + L_frag = L_ini[i_init]; + iL_max = Lali - L_frag; + + i = 0; + while (1) + { + //extract the fragment starting from position i + ka = 0; + for (k = 0; k<L_frag; k++) + { + int kk = k + i; + r1[k][0] = xtm[kk][0]; + r1[k][1] = xtm[kk][1]; + r1[k][2] = xtm[kk][2]; + + r2[k][0] = ytm[kk][0]; + r2[k][1] = ytm[kk][1]; + r2[k][2] = ytm[kk][2]; + + k_ali[ka] = kk; + ka++; + } + //extract rotation matrix based on the fragment + Kabsch(r1, r2, L_frag, 1, &rmsd, t, u); + if (simplify_step != 1) + *Rcomm = 0; + do_rotation(xtm, xt, Lali, t, u); + + //get subsegment of this fragment + d = local_d0_search - 1; + n_cut = score_fun8_standard(xt, ytm, Lali, d, i_ali, &score, + score_sum_method, score_d8, d0, GDT_list_tmp, maxsub_tmp); + + if (score>score_max) + { + score_max = score; + + //save the rotation matrix + for (k = 0; k<3; k++) + { + t0[k] = t[k]; + u0[k][0] = u[k][0]; + u0[k][1] = u[k][1]; + u0[k][2] = u[k][2]; + } + } + if (maxsub_tmp>maxsub) maxsub=maxsub_tmp; + for (k=0;k<5;k++) + if (GDT_list_tmp[k]>GDT_list[k]) + GDT_list[k]=GDT_list_tmp[k]; + + //try to extend the alignment iteratively + d = local_d0_search + 1; + for (int it = 0; it<n_it; it++) + { + ka = 0; + for (k = 0; k<n_cut; k++) + { + m = i_ali[k]; + r1[k][0] = xtm[m][0]; + r1[k][1] = xtm[m][1]; + r1[k][2] = xtm[m][2]; + + 
r2[k][0] = ytm[m][0]; + r2[k][1] = ytm[m][1]; + r2[k][2] = ytm[m][2]; + + k_ali[ka] = m; + ka++; + } + //extract rotation matrix based on the fragment + Kabsch(r1, r2, n_cut, 1, &rmsd, t, u); + do_rotation(xtm, xt, Lali, t, u); + n_cut = score_fun8_standard(xt, ytm, Lali, d, i_ali, &score, + score_sum_method, score_d8, d0, GDT_list_tmp, maxsub_tmp); + if (score>score_max) + { + score_max = score; + + //save the rotation matrix + for (k = 0; k<3; k++) + { + t0[k] = t[k]; + u0[k][0] = u[k][0]; + u0[k][1] = u[k][1]; + u0[k][2] = u[k][2]; + } + } + if (maxsub_tmp>maxsub) maxsub=maxsub_tmp; + for (k=0;k<5;k++) + if (GDT_list_tmp[k]>GDT_list[k]) + GDT_list[k]=GDT_list_tmp[k]; + + //check if it converges + if (n_cut == ka) + { + for (k = 0; k<n_cut; k++) + { + if (i_ali[k] != k_ali[k]) break; + } + if (k == n_cut) break; + } + } //for iteration + + if (i<iL_max) + { + i = i + simplify_step; //shift the fragment + if (i>iL_max) i = iL_max; //do this to use the last missed fragment + } + else if (i >= iL_max) break; + }//while(1) + //end of one fragment + }//for(i_init + return score_max; +} + +double detailed_search_standard( double **r1, double **r2, + double **xtm, double **ytm, double **xt, double **x, double **y, + int xlen, int ylen, int invmap0[], double t[3], double u[3][3], + int simplify_step, int score_sum_method, double local_d0_search, + const bool& bNormalize, double Lnorm, double score_d8, double d0, + double GDT_list[5], double &maxsub) +{ + //x is model, y is template, try to superpose onto y + int i, j, k; + double tmscore; + double rmsd; + + k=0; + for(i=0; i<ylen; i++) + { + j=invmap0[i]; + if(j>=0) //aligned + { + xtm[k][0]=x[j][0]; + xtm[k][1]=x[j][1]; + xtm[k][2]=x[j][2]; + + ytm[k][0]=y[i][0]; + ytm[k][1]=y[i][1]; + ytm[k][2]=y[i][2]; + k++; + } + } + + //detailed search 40-->1 + tmscore = TMscore8_search_standard( r1, r2, xtm, ytm, xt, k, t, u, + simplify_step, score_sum_method, &rmsd, local_d0_search, score_d8, d0, + GDT_list, maxsub); + if 
(bNormalize)// "-i", to use standard_TMscore, then bNormalize=true, else bNormalize=false; + tmscore = tmscore * k / Lnorm; + + return tmscore; +} + +/* Entry function for TM-score. Return TM-score calculation status: + * 0 - full TM-score calculation + * 1 - terminated due to exception + * 2-7 - pre-terminated due to low TM-score */ +int TMscore_main(double **xa, double **ya, + const char *seqx, const char *seqy, double t0[3], double u0[3][3], + double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const vector<string> sequence, const double Lnorm_ass, + const double d0_scale, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, double GDT_list[5], double &maxsub, + const double TMcut=-1) +{ + double D0_MIN; //for d0 + double Lnorm; //normalization length + double score_d8,d0,d0_search,dcu0;//for TMscore search + double t[3], u[3][3]; //Kabsch translation vector and rotation matrix + double **score; // Input score table for dynamic programming + bool **path; // for dynamic programming + double **val; // for dynamic programming + double **xtm, **ytm; // for TMscore search engine + double **xt; //for saving the superposed version of r_1 or xtm + double **r1, **r2; // for Kabsch rotation + + /***********************/ + /* allocate memory */ + /***********************/ + int minlen = min(xlen, ylen); + NewArray(&score, xlen+1, ylen+1); + NewArray(&path, xlen+1, ylen+1); + NewArray(&val, xlen+1, ylen+1); + NewArray(&xtm, minlen, 3); + NewArray(&ytm, minlen, 3); + NewArray(&xt, xlen, 3); + NewArray(&r1, minlen, 3); + NewArray(&r2, minlen, 3); + + /***********************/ + /* parameter set */ + /***********************/ + 
parameter_set4search(xlen, ylen, D0_MIN, Lnorm, + score_d8, d0, d0_search, dcu0); + int simplify_step = 40; //for similified search engine + int score_sum_method = 8; //for scoring method, whether only sum over pairs with dis<score_d8 + + int i; + int *invmap0 = new int[ylen+1]; + int *invmap = new int[ylen+1]; + double TM, TMmax=-1; + for(i=0; i<ylen; i++) invmap0[i]=-1; + + double ddcc=0.4; + if (Lnorm <= 40) ddcc=0.1; //Lnorm was setted in parameter_set4search + double local_d0_search = d0_search; + + //************************************************// + // Stick to the initial alignment // + //************************************************// + for (int j = 0; j < ylen; j++)// Set aligned position to be "-1" + invmap[j] = -1; + + int i1 = -1;// in C version, index starts from zero, not from one + int i2 = -1; + int L1 = sequence[0].size(); + int L2 = sequence[1].size(); + int L = min(L1, L2);// Get positions for aligned residues + for (int kk1 = 0; kk1 < L; kk1++) + { + if (sequence[0][kk1] != '-') i1++; + if (sequence[1][kk1] != '-') + { + i2++; + if (i2 >= ylen || i1 >= xlen) kk1 = L; + else if (sequence[0][kk1] != '-') invmap[i2] = i1; + } + } + + //--------------- 2. 
Align proteins from original alignment + double prevD0_MIN = D0_MIN;// stored for later use + int prevLnorm = Lnorm; + double prevd0 = d0; + TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, d0_search, score_d8, + t, u, mol_type); + D0_MIN = prevD0_MIN; + Lnorm = prevLnorm; + d0 = prevd0; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap, t, u, 40, 8, local_d0_search, true, Lnorm, score_d8, d0); + if (TM > TMmax) + { + TMmax = TM; + for (i = 0; i<ylen; i++) invmap0[i] = invmap[i]; + } + + //*******************************************************************// + // The alignment will not be changed any more in the following // + //*******************************************************************// + //check if the initial alignment is generated approriately + bool flag=false; + for(i=0; i<ylen; i++) + { + if(invmap0[i]>=0) + { + flag=true; + break; + } + } + if(!flag) + { + cout << "There is no alignment between the two proteins!" << endl; + cout << "Program stop with no result!" 
<< endl; + return 1; + } + + /* last TM-score pre-termination */ + if (TMcut>0) + { + double TMtmp=approx_TM(xlen, ylen, a_opt, + xa, ya, t0, u0, invmap0, mol_type); + + if (TMtmp<0.6*TMcut) + { + TM1=TM2=TM3=TM4=TM5=TMtmp; + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + return 7; + } + } + + //********************************************************************// + // Detailed TMscore search engine --> prepare for final TMscore // + //********************************************************************// + //run detailed TMscore search engine for the best alignment, and + //extract the best rotation matrix (t, u) for the best alginment + simplify_step=1; + if (fast_opt) simplify_step=40; + score_sum_method=8; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap0, t, u, simplify_step, score_sum_method, local_d0_search, + false, Lnorm, score_d8, d0, + GDT_list, maxsub); + + //select pairs with dis<d8 for final TMscore computation and output alignment + int k=0; + int *m1, *m2; + double d; + m1=new int[xlen]; //alignd index in x + m2=new int[ylen]; //alignd index in y + do_rotation(xa, xt, xlen, t, u); + k=0; + for(int j=0; j<ylen; j++) + { + i=invmap0[j]; + if(i>=0)//aligned + { + n_ali++; + d=sqrt(dist(&xt[i][0], &ya[j][0])); + m1[k]=i; + m2[k]=j; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + + r1[k][0] = xt[i][0]; + r1[k][1] = xt[i][1]; + r1[k][2] = xt[i][2]; + r2[k][0] = ya[j][0]; + r2[k][1] = ya[j][1]; + r2[k][2] = ya[j][2]; + + k++; + } + } + n_ali8=k; + + Kabsch(r1, r2, n_ali8, 0, &rmsd0, t, u);// rmsd0 is used for final output, only recalculate rmsd0, not t & u + rmsd0 = sqrt(rmsd0 / n_ali8); + + + //****************************************// + // Final TMscore // + // Please set parameters for output // + //****************************************// + double rmsd; + simplify_step=1; + 
score_sum_method=0; + double Lnorm_0=ylen; + + + //normalized by length of structure A + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0A=d0; + d0_0=d0A; + local_d0_search = d0_search; + TM1 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0, + GDT_list, maxsub); + TM_0 = TM1; + + double Lnorm_d0; + if (a_opt>0) + { + //normalized by average length of structures A, B + Lnorm_0=(xlen+ylen)*0.5; + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0a=d0; + d0_0=d0a; + local_d0_search = d0_search; + + TM3 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM3; + } + if (u_opt) + { + //normalized by user assigned length + parameter_set4final(Lnorm_ass, D0_MIN, Lnorm, + d0, d0_search, mol_type); + d0u=d0; + d0_0=d0u; + Lnorm_0=Lnorm_ass; + local_d0_search = d0_search; + TM4 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM4; + } + if (d_opt) + { + //scaled by user assigned d0 + parameter_set4scale(ylen, d0_scale, Lnorm, d0, d0_search); + d0_out=d0_scale; + d0_0=d0_scale; + //Lnorm_0=ylen; + Lnorm_d0=Lnorm_0; + local_d0_search = d0_search; + TM5 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM5; + } + + /* derive alignment from superposition */ + int ali_len=xlen+ylen; //maximum length of alignment + seqxA.assign(ali_len,'-'); + seqM.assign( ali_len,' '); + seqyA.assign(ali_len,'-'); + + //do_rotation(xa, xt, xlen, t, u); + do_rotation(xa, xt, xlen, t0, u0); + + int kk=0, i_old=0, j_old=0; + d=0; + for(int k=0; k<n_ali8; k++) + { + for(int i=i_old; i<m1[k]; i++) + { + //align x to gap + seqxA[kk]=seqx[i]; + seqyA[kk]='-'; + seqM[kk]=' '; + kk++; + } + 
+ for(int j=j_old; j<m2[k]; j++) + { + //align y to gap + seqxA[kk]='-'; + seqyA[kk]=seqy[j]; + seqM[kk]=' '; + kk++; + } + + seqxA[kk]=seqx[m1[k]]; + seqyA[kk]=seqy[m2[k]]; + Liden+=(seqxA[kk]==seqyA[kk]); + d=sqrt(dist(&xt[m1[k]][0], &ya[m2[k]][0])); + //if(d<d0_out) seqM[kk]=':'; + //else seqM[kk]='.'; + if(d<5) seqM[kk]=':'; + kk++; + i_old=m1[k]+1; + j_old=m2[k]+1; + } + + //tail + for(int i=i_old; i<xlen; i++) + { + //align x to gap + seqxA[kk]=seqx[i]; + seqyA[kk]='-'; + seqM[kk]=' '; + kk++; + } + for(int j=j_old; j<ylen; j++) + { + //align y to gap + seqxA[kk]='-'; + seqyA[kk]=seqy[j]; + seqM[kk]=' '; + kk++; + } + seqxA=seqxA.substr(0,kk); + seqyA=seqyA.substr(0,kk); + seqM =seqM.substr(0,kk); + + /* free memory */ + clean_up_after_approx_TM(invmap0, invmap, score, path, val, + xtm, ytm, xt, r1, r2, xlen, minlen); + delete [] m1; + delete [] m2; + return 0; // zero for no exception +} + +void output_TMscore_results( + const string xname, const string yname, + const string chainID1, const string chainID2, + const int xlen, const int ylen, double t[3], double u[3][3], + const double TM1, const double TM2, + const double TM3, const double TM4, const double TM5, + const double rmsd, const double d0_out, + const char *seqM, const char *seqxA, const char *seqyA, const double Liden, + const int n_ali8, const int L_ali, + const double TM_ali, const double rmsd_ali, const double TM_0, + const double d0_0, const double d0A, const double d0B, + const double Lnorm_ass, const double d0_scale, + const double d0a, const double d0u, const char* fname_matrix, + const int outfmt_opt, const int ter_opt, const char *fname_super, + const int a_opt, const bool u_opt, const bool d_opt, const int mirror_opt, + int L_lt_d, const double rmsd_d0_out, + double GDT_list[5], double maxsub, const int split_opt, + const vector<string>&resi_vec1, const vector<string>&resi_vec2) +{ + if (outfmt_opt<=0) + { + printf("\nStructure1: %s%s Length=%5d\n", + xname.c_str(), chainID1.c_str(), 
xlen); + printf("Structure2: %s%s Length=%5d (by which all scores are normalized)\n", + yname.c_str(), chainID2.c_str(), ylen); + + printf("Number of residues in common=%5d\n", n_ali8); + printf("RMSD of the common residues=%9.3f\n\n", rmsd); + printf("TM-score = %6.4f (d0= %.2f)\n", TM1, d0A); + printf("MaxSub-score= %6.4f (d0= 3.50)\n", maxsub/ylen); + + double gdt_ts_score=0; + double gdt_ha_score=0; + int i; + for (i=0;i<4;i++) + { + gdt_ts_score+=GDT_list[i+1]; + gdt_ha_score+=GDT_list[i]; + } + gdt_ts_score/=(4*ylen); + gdt_ha_score/=(4*ylen); + printf("GDT-TS-score= %6.4f %%(d<1)=%6.4f %%(d<2)=%6.4f %%(d<4)=%6.4f %%(d<8)=%6.4f\n", + gdt_ts_score, GDT_list[1]/ylen, GDT_list[2]/ylen, + GDT_list[3]/ylen, GDT_list[4]/ylen); + printf("GDT-HA-score= %6.4f %%(d<0.5)=%6.4f %%(d<1)=%6.4f %%(d<2)=%6.4f %%(d<4)=%6.4f\n", + gdt_ha_score, GDT_list[0]/ylen, GDT_list[1]/ylen, + GDT_list[2]/ylen, GDT_list[3]/ylen); + + if (a_opt==1) + printf("TM-score = %5.4f (if normalized by average length of two structures, i.e., LN= %.1f, d0= %.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + if (u_opt) + printf("TM-score = %5.4f (if normalized by user-specified LN=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); + if (d_opt) + printf("TM-score = %5.5f (if scaled by user-specified d0= %.2f, and LN= %d)\n", TM5, d0_scale, ylen); + + + printf("\n -------- rotation matrix to rotate Chain-1 to Chain-2 ------\n"); + printf(" i t(i) u(i,1) u(i,2) u(i,3)\n"); + printf(" 1 %17.10f %14.10f %14.10f %14.10f\n",t[0],u[0][0],u[0][1],u[0][2]); + printf(" 2 %17.10f %14.10f %14.10f %14.10f\n",t[1],u[1][0],u[1][1],u[1][2]); + printf(" 3 %17.10f %14.10f %14.10f %14.10f\n",t[2],u[2][0],u[2][1],u[2][2]); + + //output alignment + string seq_scale=seqM; + for (i=0;i<strlen(seqM);i++) + { + L_lt_d+=seqM[i]==':'; + seq_scale[i]=(i+1)%10+'0'; + } + printf("\nSuperposition in the TM-score: Length(d<%3.1f)= %d\n", d0_out, L_lt_d); + //printf("\nSuperposition in the TM-score: Length(d<%3.1f)= %d RMSD=%6.2f\n", d0_out, L_lt_d, 
rmsd_d0_out); + printf("(\":\" denotes the residue pairs of distance <%4.1f Angstrom)\n", d0_out); + printf("%s\n", seqxA); + printf("%s\n", seqM); + printf("%s\n", seqyA); + printf("%s\n", seq_scale.c_str()); + seq_scale.clear(); + } + else if (outfmt_opt==1) + { + printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + xname.c_str(), chainID1.c_str(), xlen, d0B, Liden/xlen, TM2); + printf("%s\n", seqxA); + printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + yname.c_str(), chainID2.c_str(), ylen, d0A, Liden/ylen, TM1); + printf("%s\n", seqyA); + + printf("# Lali=%d\tRMSD=%.2f\tseqID_ali=%.3f\n", + n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + + if(a_opt) + printf("# TM-score=%.5f (normalized by average length of two structures: L=%.1f\td0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + + if(u_opt) + printf("# TM-score=%.5f (normalized by user-specified L=%.2f\td0=%.2f)\n", TM4, Lnorm_ass, d0u); + + if(d_opt) + printf("# TM-score=%.5f (scaled by user-specified d0=%.2f\tL=%d)\n", TM5, d0_scale, ylen); + + printf("$$$$\n"); + } + else if (outfmt_opt==2) + { + printf("%s%s\t%s%s\t%.4f\t%.4f\t%.2f\t%4.3f\t%4.3f\t%4.3f\t%d\t%d\t%d", + xname.c_str(), chainID1.c_str(), yname.c_str(), chainID2.c_str(), + TM2, TM1, rmsd, Liden/xlen, Liden/ylen, (n_ali8>0)?Liden/n_ali8:0, + xlen, ylen, n_ali8); + } + cout << endl; + + if (strlen(fname_matrix)) + output_rotation_matrix(fname_matrix, t, u); + if (strlen(fname_super)) + output_pymol(xname, yname, fname_super, t, u, ter_opt, + 0, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2); +} diff --git a/modules/bindings/src/tmalign/basic_fun.h b/modules/bindings/src/tmalign/basic_fun.h index 3dadccc30faf0dde5403b9adc123d2cf376bb867..0e8ae307d81a045f12998f90afdd4d0d00c628cb 100644 --- a/modules/bindings/src/tmalign/basic_fun.h +++ b/modules/bindings/src/tmalign/basic_fun.h @@ -7,11 +7,7 @@ #include <math.h> #include <time.h> #include <string.h> -// OST-NOTE: ifdef was added here since malloc.h 
isn't required for Linux/Mac -// and for some compilers (clang, gcc8) it isn't available -#ifdef _WIN32 -#include <malloc.h> -#endif +//#include <malloc.h> #include <sstream> #include <iostream> @@ -80,35 +76,36 @@ string AAmap(char A) if (A=='W') return "TRP"; if (A=='Y') return "TYR"; if (A=='Z') return "GLX"; - if ('a'<=A && A<='z') return " "+toupper(A); + if ('a'<=A && A<='z') return " "+string(1,char(toupper(A))); return "UNK"; } char AAmap(const string &AA) { - if (AA.compare("ALA")==0) return 'A'; + if (AA.compare("ALA")==0 || AA.compare("DAL")==0) return 'A'; if (AA.compare("ASX")==0) return 'B'; - if (AA.compare("CYS")==0) return 'C'; - if (AA.compare("ASP")==0) return 'D'; - if (AA.compare("GLU")==0) return 'E'; - if (AA.compare("PHE")==0) return 'F'; + if (AA.compare("CYS")==0 || AA.compare("DCY")==0) return 'C'; + if (AA.compare("ASP")==0 || AA.compare("DAS")==0) return 'D'; + if (AA.compare("GLU")==0 || AA.compare("DGL")==0) return 'E'; + if (AA.compare("PHE")==0 || AA.compare("DPN")==0) return 'F'; if (AA.compare("GLY")==0) return 'G'; - if (AA.compare("HIS")==0) return 'H'; - if (AA.compare("ILE")==0) return 'I'; - if (AA.compare("LYS")==0) return 'K'; - if (AA.compare("LEU")==0) return 'L'; - if (AA.compare("MET")==0 || AA.compare("MSE")==0) return 'M'; - if (AA.compare("ASN")==0) return 'N'; + if (AA.compare("HIS")==0 || AA.compare("DHI")==0) return 'H'; + if (AA.compare("ILE")==0 || AA.compare("DIL")==0) return 'I'; + if (AA.compare("LYS")==0 || AA.compare("DLY")==0) return 'K'; + if (AA.compare("LEU")==0 || AA.compare("DLE")==0) return 'L'; + if (AA.compare("MET")==0 || AA.compare("MED")==0 || + AA.compare("MSE")==0) return 'M'; + if (AA.compare("ASN")==0 || AA.compare("DSG")==0) return 'N'; if (AA.compare("PYL")==0) return 'O'; - if (AA.compare("PRO")==0) return 'P'; - if (AA.compare("GLN")==0) return 'Q'; - if (AA.compare("ARG")==0) return 'R'; - if (AA.compare("SER")==0) return 'S'; - if (AA.compare("THR")==0) return 'T'; + if 
(AA.compare("PRO")==0 || AA.compare("DPR")==0) return 'P'; + if (AA.compare("GLN")==0 || AA.compare("DGN")==0) return 'Q'; + if (AA.compare("ARG")==0 || AA.compare("DAR")==0) return 'R'; + if (AA.compare("SER")==0 || AA.compare("DSN")==0) return 'S'; + if (AA.compare("THR")==0 || AA.compare("DTH")==0) return 'T'; if (AA.compare("SEC")==0) return 'U'; - if (AA.compare("VAL")==0) return 'V'; - if (AA.compare("TRP")==0) return 'W'; - if (AA.compare("TYR")==0) return 'Y'; + if (AA.compare("VAL")==0 || AA.compare("DVA")==0) return 'V'; + if (AA.compare("TRP")==0 || AA.compare("DTR")==0) return 'W'; + if (AA.compare("TYR")==0 || AA.compare("DTY")==0) return 'Y'; if (AA.compare("GLX")==0) return 'Z'; if (AA.compare(0,2," D")==0) return tolower(AA[2]); @@ -124,7 +121,7 @@ void split(const string &line, vector<string> &line_vec, const char delimiter=' ') { bool within_word = false; - for (unsigned int pos=0;pos<line.size();pos++) + for (size_t pos=0;pos<line.size();pos++) { if (line[pos]==delimiter) { @@ -142,8 +139,8 @@ void split(const string &line, vector<string> &line_vec, size_t get_PDB_lines(const string filename, vector<vector<string> >&PDB_lines, vector<string> &chainID_list, - vector<int> &mol_vec, const int ter_opt=3, const int infmt_opt=-1, - const string atom_opt="auto", const int split_opt=0) + vector<int> &mol_vec, const int ter_opt, const int infmt_opt, + const string atom_opt, const int split_opt, const int het_opt) { size_t i=0; // resi i.e. 
atom index string line; @@ -159,13 +156,13 @@ size_t get_PDB_lines(const string filename, if (filename.size()>=3 && filename.substr(filename.size()-3,3)==".gz") { - fin_gz.open("zcat "+filename); + fin_gz.open("zcat '"+filename+"'"); compress_type=1; } else if (filename.size()>=4 && filename.substr(filename.size()-4,4)==".bz2") { - fin_gz.open("bzcat "+filename); + fin_gz.open("bzcat '"+filename+"'"); compress_type=2; } else fin.open(filename.c_str()); @@ -178,15 +175,18 @@ size_t get_PDB_lines(const string filename, else getline(fin, line); if (infmt_opt==-1 && line.compare(0,5,"loop_")==0) // PDBx/mmCIF return get_PDB_lines(filename,PDB_lines,chainID_list, - mol_vec, ter_opt, 3, atom_opt, split_opt); + mol_vec, ter_opt, 3, atom_opt, split_opt,het_opt); if (i > 0) { if (ter_opt>=1 && line.compare(0,3,"END")==0) break; else if (ter_opt>=3 && line.compare(0,3,"TER")==0) break; } if (split_opt && line.compare(0,3,"END")==0) chainID=0; - if (line.compare(0, 6, "ATOM ")==0 && line.size()>=54 && - (line[16]==' ' || line[16]=='A')) + if (line.size()>=54 && (line[16]==' ' || line[16]=='A') && ( + (line.compare(0, 6, "ATOM ")==0) || + (line.compare(0, 6, "HETATM")==0 && het_opt==1) || + (line.compare(0, 6, "HETATM")==0 && het_opt==2 && + line.compare(17,3, "MSE")==0))) { if (atom_opt=="auto") { @@ -208,12 +208,12 @@ size_t get_PDB_lines(const string filename, if (chainID==' ') { if (ter_opt>=1) i8_stream << ":_"; - else i8_stream<<':'<<model_idx<<":_"; + else i8_stream<<':'<<model_idx<<",_"; } else { if (ter_opt>=1) i8_stream << ':' << chainID; - else i8_stream<<':'<<model_idx<<':'<<chainID; + else i8_stream<<':'<<model_idx<<','<<chainID; } chainID_list.push_back(i8_stream.str()); } @@ -234,12 +234,12 @@ size_t get_PDB_lines(const string filename, if (chainID==' ') { if (ter_opt>=1) i8_stream << ":_"; - else i8_stream<<':'<<model_idx<<":_"; + else i8_stream<<':'<<model_idx<<",_"; } else { if (ter_opt>=1) i8_stream << ':' << chainID; - else 
i8_stream<<':'<<model_idx<<':'<<chainID; + else i8_stream<<':'<<model_idx<<','<<chainID; } chainID_list.push_back(i8_stream.str()); PDB_lines.push_back(tmp_str_vec); @@ -260,7 +260,7 @@ size_t get_PDB_lines(const string filename, } else if (infmt_opt==1) // SPICKER format { - int L=0; + size_t L=0; float x,y,z; stringstream i8_stream; while (compress_type?fin_gz.good():fin.good()) @@ -276,7 +276,7 @@ size_t get_PDB_lines(const string filename, chainID_list.push_back(i8_stream.str()); PDB_lines.push_back(tmp_str_vec); mol_vec.push_back(0); - for (i=0;(int) i<L;i++) + for (i=0;i<L;i++) { if (compress_type) fin_gz>>x>>y>>z; else fin >>x>>y>>z; @@ -293,7 +293,7 @@ size_t get_PDB_lines(const string filename, } else if (infmt_opt==2) // xyz format { - int L=0; + size_t L=0; stringstream i8_stream; while (compress_type?fin_gz.good():fin.good()) { @@ -308,7 +308,7 @@ size_t get_PDB_lines(const string filename, chainID_list.push_back(':'+line.substr(0,i)); PDB_lines.push_back(tmp_str_vec); mol_vec.push_back(0); - for (i=0;(int) i<L;i++) + for (i=0;i<L;i++) { if (compress_type) getline(fin_gz, line); else getline(fin, line); @@ -343,12 +343,25 @@ size_t get_PDB_lines(const string filename, { if (compress_type) getline(fin_gz, line); else getline(fin, line); + if (line.size()==0) continue; if (loop_) loop_ = line.compare(0,2,"# "); if (!loop_) { if (line.compare(0,5,"loop_")) continue; - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + while(1) + { + if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! 
Unexpected end of "+filename); + } + if (line.size()) break; + } if (line.compare(0,11,"_atom_site.")) continue; loop_=true; @@ -360,6 +373,7 @@ size_t get_PDB_lines(const string filename, { if (compress_type) getline(fin_gz, line); else getline(fin, line); + if (line.size()==0) continue; if (line.compare(0,11,"_atom_site.")) break; _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; } @@ -377,14 +391,19 @@ size_t get_PDB_lines(const string filename, _atom_site.count("Cartn_z")==0) { loop_ = false; - cerr<<"Warning! Missing one of the following _atom_site data items: group_PDB, label_atom_id, label_atom_id, auth_asym_id/label_asym_id, auth_seq_id/label_seq_id, Cartn_x, Cartn_y, Cartn_z"<<endl; + cerr<<"Warning! Missing one of the following _atom_site data items: group_PDB, label_atom_id, label_comp_id, auth_asym_id/label_asym_id, auth_seq_id/label_seq_id, Cartn_x, Cartn_y, Cartn_z"<<endl; continue; } } line_vec.clear(); split(line,line_vec); - if (line_vec[_atom_site["group_PDB"]]!="ATOM") continue; + if ((line_vec[_atom_site["group_PDB"]]!="ATOM" && + line_vec[_atom_site["group_PDB"]]!="HETATM") || + (line_vec[_atom_site["group_PDB"]]=="HETATM" && + (het_opt==0 || + (het_opt==2 && line_vec[_atom_site["label_comp_id"]]!="MSE"))) + ) continue; alt_id="."; if (_atom_site.count("label_alt_id")) // in 39.4 % of entries @@ -435,9 +454,11 @@ size_t get_PDB_lines(const string filename, if (split_opt==1 && ter_opt==0) chainID_list.push_back( ':'+model_index); else if (split_opt==2 && ter_opt==0) - chainID_list.push_back(':'+model_index+':'+asym_id); - else if (split_opt==2 && ter_opt==1) + chainID_list.push_back(':'+model_index+','+asym_id); + else //if (split_opt==2 && ter_opt==1) chainID_list.push_back(':'+asym_id); + //else + //chainID_list.push_back(""); } } @@ -452,9 +473,11 @@ size_t get_PDB_lines(const string filename, if (split_opt==1 && ter_opt==0) chainID_list.push_back( ':'+model_index); else if (split_opt==2 && ter_opt==0) - 
chainID_list.push_back(':'+model_index+':'+asym_id); - else if (split_opt==2 && ter_opt==1) + chainID_list.push_back(':'+model_index+','+asym_id); + else //if (split_opt==2 && ter_opt==1) chainID_list.push_back(':'+asym_id); + //else + //chainID_list.push_back(""); } } if (prev_asym_id!=asym_id) prev_asym_id=asym_id; @@ -478,9 +501,9 @@ size_t get_PDB_lines(const string filename, i8_stream<<"ATOM " <<setw(5)<<i<<" "<<atom<<" "<<AA<<" "<<asym_id[0] <<setw(5)<<resi.substr(0,5)<<" " - <<setw(8)<<line_vec[_atom_site["Cartn_x"]] - <<setw(8)<<line_vec[_atom_site["Cartn_y"]] - <<setw(8)<<line_vec[_atom_site["Cartn_z"]]; + <<setw(8)<<line_vec[_atom_site["Cartn_x"]].substr(0,8) + <<setw(8)<<line_vec[_atom_site["Cartn_y"]].substr(0,8) + <<setw(8)<<line_vec[_atom_site["Cartn_z"]].substr(0,8); PDB_lines.back().push_back(i8_stream.str()); i8_stream.str(string()); } @@ -510,7 +533,7 @@ size_t get_FASTA_lines(const string filename, { string line; vector<string> tmp_str_vec; - unsigned int l; + size_t l; int compress_type=0; // uncompressed file ifstream fin; @@ -518,13 +541,13 @@ size_t get_FASTA_lines(const string filename, if (filename.size()>=3 && filename.substr(filename.size()-3,3)==".gz") { - fin_gz.open("zcat "+filename); + fin_gz.open("zcat '"+filename+"'"); compress_type=1; } else if (filename.size()>=4 && filename.substr(filename.size()-4,4)==".bz2") { - fin_gz.open("bzcat "+filename); + fin_gz.open("bzcat '"+filename+"'"); compress_type=2; } else fin.open(filename.c_str()); @@ -582,64 +605,115 @@ int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, int i2=0; // positions in resi_vec2 int xlen=resi_vec1.size(); int ylen=resi_vec2.size(); - map<char,int> chainID_map1; - map<char,int> chainID_map2; + map<string,string> chainID_map1; + map<string,string> chainID_map2; if (byresi_opt==3) { - vector<char> chainID_vec; - char chainID; + vector<string> chainID_vec; + string chainID; + stringstream ss; int i; for (i=0;i<xlen;i++) { - 
chainID=resi_vec1[i][5]; + chainID=resi_vec1[i].substr(5); if (!chainID_vec.size()|| chainID_vec.back()!=chainID) { chainID_vec.push_back(chainID); - chainID_map1[chainID]=chainID_vec.size(); + ss<<chainID_vec.size(); + chainID_map1[chainID]=ss.str(); + ss.str(""); } } chainID_vec.clear(); for (i=0;i<ylen;i++) { - chainID=resi_vec2[i][5]; + chainID=resi_vec2[i].substr(5); if (!chainID_vec.size()|| chainID_vec.back()!=chainID) { chainID_vec.push_back(chainID); - chainID_map2[chainID]=chainID_vec.size(); + ss<<chainID_vec.size(); + chainID_map2[chainID]=ss.str(); + ss.str(""); } } - chainID_vec.clear(); + vector<string>().swap(chainID_vec); } + string chainID1=""; + string chainID2=""; + string chainID1_prev=""; + string chainID2_prev=""; while(i1<xlen && i2<ylen) { - if ((byresi_opt<=2 && resi_vec1[i1]==resi_vec2[i2]) || (byresi_opt==3 - && resi_vec1[i1].substr(0,5)==resi_vec2[i2].substr(0,5) - && chainID_map1[resi_vec1[i1][5]]==chainID_map2[resi_vec2[i2][5]])) + if (byresi_opt==2) + { + chainID1=resi_vec1[i1].substr(5); + chainID2=resi_vec2[i2].substr(5); + } + else if (byresi_opt==3) { - sequence[0]+=seqx[i1++]; - sequence[1]+=seqy[i2++]; + chainID1=chainID_map1[resi_vec1[i1].substr(5)]; + chainID2=chainID_map2[resi_vec2[i2].substr(5)]; } - else if (atoi(resi_vec1[i1].substr(0,4).c_str())<= - atoi(resi_vec2[i2].substr(0,4).c_str())) + + if (chainID1==chainID2) { - sequence[0]+=seqx[i1++]; - sequence[1]+='-'; + if (atoi(resi_vec1[i1].substr(0,4).c_str())< + atoi(resi_vec2[i2].substr(0,4).c_str())) + { + sequence[0]+=seqx[i1++]; + sequence[1]+='-'; + } + else if (atoi(resi_vec1[i1].substr(0,4).c_str())> + atoi(resi_vec2[i2].substr(0,4).c_str())) + { + sequence[0]+='-'; + sequence[1]+=seqy[i2++]; + } + else + { + sequence[0]+=seqx[i1++]; + sequence[1]+=seqy[i2++]; + } + chainID1_prev=chainID1; + chainID2_prev=chainID2; } else { - sequence[0]+='-'; - sequence[1]+=seqy[i2++]; + if (chainID1_prev==chainID1 && chainID2_prev!=chainID2) + { + sequence[0]+=seqx[i1++]; + 
sequence[1]+='-'; + chainID1_prev=chainID1; + } + else if (chainID1_prev!=chainID1 && chainID2_prev==chainID2) + { + sequence[0]+='-'; + sequence[1]+=seqy[i2++]; + chainID2_prev=chainID2; + } + else + { + sequence[0]+=seqx[i1++]; + sequence[1]+=seqy[i2++]; + chainID1_prev=chainID1; + chainID2_prev=chainID2; + } } + } - chainID_map1.clear(); - chainID_map2.clear(); + map<string,string>().swap(chainID_map1); + map<string,string>().swap(chainID_map2); + chainID1.clear(); + chainID2.clear(); + chainID1_prev.clear(); + chainID2_prev.clear(); return sequence[0].size(); } int read_PDB(const vector<string> &PDB_lines, double **a, char *seq, - vector<string> &resi_vec, const int byresi_opt) + vector<string> &resi_vec, const int read_resi) { - unsigned int i; + size_t i; for (i=0;i<PDB_lines.size();i++) { a[i][0] = atof(PDB_lines[i].substr(30, 8).c_str()); @@ -647,9 +721,9 @@ int read_PDB(const vector<string> &PDB_lines, double **a, char *seq, a[i][2] = atof(PDB_lines[i].substr(46, 8).c_str()); seq[i] = AAmap(PDB_lines[i].substr(17, 3)); - if (byresi_opt>=2) resi_vec.push_back(PDB_lines[i].substr(22,5)+ - PDB_lines[i][21]); - if (byresi_opt==1) resi_vec.push_back(PDB_lines[i].substr(22,5)); + if (read_resi>=2) resi_vec.push_back(PDB_lines[i].substr(22,5)+ + PDB_lines[i][21]); + if (read_resi==1) resi_vec.push_back(PDB_lines[i].substr(22,5)); } seq[i]='\0'; return i; @@ -699,7 +773,7 @@ string Trim(const string &inputString) * This function should only be called by main function, as it will * terminate a program if wrong alignment is given */ void read_user_alignment(vector<string>&sequence, const string &fname_lign, - const bool I_opt) + const int i_opt) { if (fname_lign == "") PrintErrorAndQuit("Please provide a file name for option -i!"); @@ -729,10 +803,10 @@ void read_user_alignment(vector<string>&sequence, const string &fname_lign, PrintErrorAndQuit("ERROR: Fasta format is wrong, two proteins should be included."); if (sequence[0].size() != sequence[1].size()) 
PrintErrorAndQuit("ERROR! FASTA file is wrong. The length in alignment should be equal for the two aligned proteins."); - if (I_opt) + if (i_opt==3) { int aligned_resNum=0; - for (unsigned int i=0;i<sequence[0].size();i++) + for (size_t i=0;i<sequence[0].size();i++) aligned_resNum+=(sequence[0][i]!='-' && sequence[1][i]!='-'); if (aligned_resNum<3) PrintErrorAndQuit("ERROR! Superposition is undefined for <3 aligned residues."); diff --git a/modules/bindings/src/tmalign/param_set.h b/modules/bindings/src/tmalign/param_set.h index 31ac268669606ac121a7082e46d63e16b695585a..9300404a4137f1b24daf2c8a0e4f6ade82ab7f06 100644 --- a/modules/bindings/src/tmalign/param_set.h +++ b/modules/bindings/src/tmalign/param_set.h @@ -10,11 +10,11 @@ void parameter_set4search(const int xlen, const int ylen, double &D0_MIN, double &Lnorm, double &score_d8, double &d0, double &d0_search, double &dcu0) { - //parameter initilization for searching: D0_MIN, Lnorm, d0, d0_search, score_d8 + //parameter initialization for searching: D0_MIN, Lnorm, d0, d0_search, score_d8 D0_MIN=0.5; dcu0=4.25; //update 3.85-->4.25 - Lnorm=getmin(xlen, ylen); //normaliz TMscore by this in searching + Lnorm=getmin(xlen, ylen); //normalize TMscore by this in searching if (Lnorm<=19) //update 15-->19 d0=0.168; //update 0.5-->0.168 else d0=(1.24*pow((Lnorm*1.0-15), 1.0/3)-1.8); @@ -33,7 +33,7 @@ void parameter_set4final_C3prime(const double len, double &D0_MIN, { D0_MIN=0.3; - Lnorm=len; //normaliz TMscore by this in searching + Lnorm=len; //normalize TMscore by this in searching if(Lnorm<=11) d0=0.3; else if(Lnorm>11&&Lnorm<=15) d0=0.4; else if(Lnorm>15&&Lnorm<=19) d0=0.5; @@ -57,7 +57,7 @@ void parameter_set4final(const double len, double &D0_MIN, double &Lnorm, } D0_MIN=0.5; - Lnorm=len; //normaliz TMscore by this in searching + Lnorm=len; //normalize TMscore by this in searching if (Lnorm<=21) d0=0.5; else d0=(1.24*pow((Lnorm*1.0-15), 1.0/3)-1.8); if (d0<D0_MIN) d0=D0_MIN; @@ -70,7 +70,7 @@ void 
parameter_set4scale(const int len, const double d_s, double &Lnorm, double &d0, double &d0_search) { d0=d_s; - Lnorm=len; //normaliz TMscore by this in searching + Lnorm=len; //normalize TMscore by this in searching d0_search=d0; if (d0_search>8) d0_search=8; if (d0_search<4.5) d0_search=4.5; diff --git a/modules/bindings/src/tmalign/pdb2fasta.cpp b/modules/bindings/src/tmalign/pdb2fasta.cpp index 420514205a22ce86d4cb0b9ddd1dbf4eda72e50d..7c94206ffebee9e6f6847318e001496e53ce64c7 100644 --- a/modules/bindings/src/tmalign/pdb2fasta.cpp +++ b/modules/bindings/src/tmalign/pdb2fasta.cpp @@ -31,6 +31,10 @@ void print_help() " 1: treat each MODEL as a separate chain (-ter should be 0)\n" " 2: treat each chain as a seperate chain (-ter should be <=1)\n" "\n" +" -het Whether to read residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" " -infmt Input format for chain\n" " -1: (default) automatically detect PDB or PDBx/mmCIF format\n" " 0: PDB format\n" @@ -52,6 +56,7 @@ int main(int argc, char *argv[]) int ter_opt =3; // TER, END, or different chainID int infmt_opt =-1; // PDB or PDBx/mmCIF format int split_opt =0; // do not split chain + int het_opt=0; // do not read HETATM residues string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA string suffix_opt=""; // set -suffix to empty string dir_opt =""; // set -dir to empty @@ -84,6 +89,10 @@ int main(int argc, char *argv[]) { infmt_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } else xname=argv[i]; } @@ -92,7 +101,7 @@ int main(int argc, char *argv[]) if (suffix_opt.size() && dir_opt.size()==0) PrintErrorAndQuit("-suffix is only valid if -dir is set"); if (atom_opt.size()!=4) - PrintErrorAndQuit("ERROR! atom name must have 4 characters, including space."); + PrintErrorAndQuit("ERROR! 
Atom name must have 4 characters, including space."); if (split_opt==1 && ter_opt!=0) PrintErrorAndQuit("-split 1 should be used with -ter 0"); else if (split_opt==2 && ter_opt!=0 && ter_opt!=1) @@ -140,7 +149,7 @@ int main(int argc, char *argv[]) { xname=chain_list[i]; xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt); + mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname diff --git a/modules/bindings/src/tmalign/pdb2ss.cpp b/modules/bindings/src/tmalign/pdb2ss.cpp index a346cad36fee720d72c7067280d830d59a7e2f06..d0732803d16652eb312ea337812998a761bd092a 100644 --- a/modules/bindings/src/tmalign/pdb2ss.cpp +++ b/modules/bindings/src/tmalign/pdb2ss.cpp @@ -3,8 +3,8 @@ using namespace std; // secondary structure 01234 -const char* SSmapProtein=" CHTE"; -const char* SSmapRNA =" .<>"; +//const char* SSmapProtein=" CHTE"; +//const char* SSmapRNA =" .<>"; void print_help() { @@ -45,6 +45,10 @@ void print_help() " 0: PDB format\n" " 2: xyz format\n" " 3: PDBx/mmCIF format\n" +" -het Whether to read residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" <<endl; exit(EXIT_SUCCESS); } @@ -61,6 +65,7 @@ int main(int argc, char *argv[]) int ter_opt =3; // TER, END, or different chainID int infmt_opt =-1; // PDB format int split_opt =0; // do not split chain + int het_opt=0; // do not read HETATM residues string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA string mol_opt ="auto";// auto-detect the molecule type as protein/RNA string suffix_opt=""; // set -suffix to empty @@ -98,6 +103,10 @@ int main(int argc, char *argv[]) { infmt_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } else xname=argv[i]; } @@ -106,9 +115,9 @@ int main(int argc, char *argv[]) if 
(suffix_opt.size() && dir_opt.size()==0) PrintErrorAndQuit("-suffix is only valid if -dir is set"); if (atom_opt.size()!=4) - PrintErrorAndQuit("ERROR! atom name must have 4 characters, including space."); + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") - PrintErrorAndQuit("ERROR! molecule type must be either RNA or protein."); + PrintErrorAndQuit("ERROR! Molecule type must be either RNA or protein."); else if (mol_opt=="protein" && atom_opt=="auto") atom_opt=" CA "; else if (mol_opt=="RNA" && atom_opt=="auto") @@ -153,17 +162,16 @@ int main(int argc, char *argv[]) int xlen; // chain length int xchainnum; // number of chains in a PDB file char *seqx; // for the protein sequence - int *secx; // for the secondary structure + char *secx; // for the secondary structure double **xa; // for input vectors xa[0...xlen-1][0..2] and vector<string> resi_vec; // residue index for chain - string sequence; // secondary structure sequence /* loop over file names */ for (i=0;i<chain_list.size();i++) { xname=chain_list[i]; xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt); + mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! 
Cannot parse file: "<<xname @@ -183,26 +191,15 @@ int main(int argc, char *argv[]) } NewArray(&xa, xlen, 3); seqx = new char[xlen + 1]; - secx = new int[xlen]; + secx = new char[xlen + 1]; xlen = read_PDB(PDB_lines[chain_i], xa, seqx, resi_vec, 0); - if (mol_vec[chain_i]>0) // RNA - { - make_sec(seqx,xa, xlen, secx,atom_opt); - for (l=0;l<PDB_lines[chain_i].size();l++) - sequence+=SSmapRNA[secx[l]]; - } - else //protein - { - make_sec(xa, xlen, secx); - for (l=0;l<PDB_lines[chain_i].size();l++) - sequence+=SSmapProtein[secx[l]]; - } + if (mol_vec[chain_i]>0) make_sec(seqx,xa, xlen, secx,atom_opt); + else make_sec(xa, xlen, secx); // protein cout<<'>'<<xname.substr(dir_opt.size(), xname.size()-dir_opt.size()-suffix_opt.size()) - <<chainID_list[chain_i]<<'\t'<<xlen<<'\n'<<sequence<<endl; + <<chainID_list[chain_i]<<'\t'<<xlen<<'\n'<<secx<<endl; - sequence.clear(); PDB_lines[chain_i].clear(); DeleteArray(&xa, xlen); delete [] seqx; diff --git a/modules/bindings/src/tmalign/pdb2xyz.cpp b/modules/bindings/src/tmalign/pdb2xyz.cpp index 6fd235f6e3941acfe934fa0aa0fe0490ba406d36..d151f1e741b1a419482618d6dc08ac01197be2db 100644 --- a/modules/bindings/src/tmalign/pdb2xyz.cpp +++ b/modules/bindings/src/tmalign/pdb2xyz.cpp @@ -31,6 +31,10 @@ void print_help() " 1: treat each MODEL as a separate chain (-ter should be 0)\n" " 2: treat each chain as a seperate chain (-ter should be <=1)\n" "\n" +" -het Whether to read residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" " -infmt Input format for chain2\n" " -1: (default) automatically detect PDB or PDBx/mmCIF format\n" " 3: PDBx/mmCIF format\n" @@ -50,6 +54,7 @@ int main(int argc, char *argv[]) int ter_opt =3; // TER, END, or different chainID int infmt_opt =-1; // PDB or PDBx/mmCIF format int split_opt =0; // do not split chain + int het_opt=0; // do not read HETATM residues string atom_opt ="auto";// use C alpha atom for 
protein and C3' for RNA string suffix_opt=""; // set -suffix to empty string dir_opt =""; // set -dir to empty @@ -82,6 +87,10 @@ int main(int argc, char *argv[]) { infmt_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } else xname=argv[i]; } @@ -90,7 +99,7 @@ int main(int argc, char *argv[]) if (suffix_opt.size() && dir_opt.size()==0) PrintErrorAndQuit("-suffix is only valid if -dir is set"); if (atom_opt.size()!=4) - PrintErrorAndQuit("ERROR! atom name must have 4 characters, including space."); + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (split_opt==1 && ter_opt!=0) PrintErrorAndQuit("-split 1 should be used with -ter 0"); else if (split_opt==2 && ter_opt!=0 && ter_opt!=1) @@ -135,7 +144,7 @@ int main(int argc, char *argv[]) { xname=chain_list[i]; xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt); + mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname diff --git a/modules/bindings/src/tmalign/readme.txt b/modules/bindings/src/tmalign/readme.txt index ea276da42ab2272e0897ddd3dba0f6a7b688c92c..3249215e8e3a65fa56cb3661df7ed3a24638c18b 100644 --- a/modules/bindings/src/tmalign/readme.txt +++ b/modules/bindings/src/tmalign/readme.txt @@ -8,7 +8,7 @@ Please report issues to yangzhanglab@umich.edu References to cite: - S Gong, C Zhang, Y Zhang. Bioinformatics (2019) + S Gong, C Zhang, Y Zhang. Bioinformatics, btz282 (2019) Y Zhang, J Skolnick. Nucl Acids Res 33, 2302-9 (2005) DISCLAIMER: @@ -50,6 +50,17 @@ (3) automatic detection of molecule type (protein vs RNA). 2019/01/07: C Zhang added support for PDBx/mmCIF format. 2019/02/09: Fixed asymmetric alignment bug. 
+ 2019/03/17: Added the -cp option for circular permutation + 2019/03/27: Added the -mirror option for mirror structure alignment + 2019/04/25: The RNA-align algorithm was published by Bioinformatics + 2019/07/24: Fixed bug in displaying matching residues. + Added GDT and MaxSub to TMscore program. + 2019/08/18: Prevent excessive circular permutation alignment by -cp. + 2020/05/19: Add back rasmol output + 2020/12/12: Fixed bug in double precision coordinate mmcif alignment + 2021/01/07: Fixed bug in TMscore -c + 2021/05/29: Remove unnecessary dependency on malloc.h, which prevents + compilation on Mac OS =============================================================================== ========================= @@ -84,4 +95,4 @@ fortran version, including RNA alignment and batch alignment of multiple structures. A full list of available options can be explored by: ./TMalign -h -02/09/2019 +2021/05/20 diff --git a/modules/bindings/src/tmalign/se.cpp b/modules/bindings/src/tmalign/se.cpp index aa22b07f744578acaac523df25e0a401b8ad2923..c4d7606816f84beceb4b569f0b14d1d082ddc119 100644 --- a/modules/bindings/src/tmalign/se.cpp +++ b/modules/bindings/src/tmalign/se.cpp @@ -1,4 +1,5 @@ #include "se.h" +#include "NWalign.h" using namespace std; @@ -54,6 +55,10 @@ void print_extra_help() " 3: (similar to TMscore -c, should be used with -ter <=1)\n" " align by residue index and order of chain\n" "\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +"\n" " -infmt1 Input format for chain1\n" " -infmt2 Input format for chain2\n" " -1: (default) automatically detect PDB or PDBx/mmCIF format\n" @@ -122,6 +127,7 @@ int main(int argc, char *argv[]) int ter_opt =3; // TER, END, or different chainID int split_opt =0; // do not split chain int outfmt_opt=0; // set -outfmt to full output + int het_opt=0; // do not read HETATM residues string atom_opt ="auto";// use
C alpha atom for protein and C3' for RNA string mol_opt ="auto";// auto-detect the molecule type as protein/RNA string suffix_opt=""; // set -suffix to empty @@ -206,6 +212,10 @@ int main(int argc, char *argv[]) { byresi_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } else if (xname.size() == 0) xname=argv[i]; else if (yname.size() == 0) yname=argv[i]; else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); @@ -228,9 +238,9 @@ int main(int argc, char *argv[]) if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); if (atom_opt.size()!=4) - PrintErrorAndQuit("ERROR! atom name must have 4 characters, including space."); + PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") - PrintErrorAndQuit("ERROR! molecule type must be either RNA or protein."); + PrintErrorAndQuit("ERROR! Molecule type must be either RNA or protein."); else if (mol_opt=="protein" && atom_opt=="auto") atom_opt=" CA "; else if (mol_opt=="RNA" && atom_opt=="auto") @@ -303,7 +313,7 @@ int main(int argc, char *argv[]) /* parse chain 1 */ xname=chain1_list[i]; xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt); + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -333,7 +343,8 @@ int main(int argc, char *argv[]) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, + het_opt); if (!ychainnum) { cerr<<"Warning! 
Cannot parse file: "<<yname @@ -373,6 +384,7 @@ int main(int argc, char *argv[]) double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore int n_ali=0; int n_ali8=0; + int *invmap = new int[ylen+1]; /* entry function for structure alignment */ se_main( @@ -382,24 +394,30 @@ int main(int argc, char *argv[]) rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, a_opt, u_opt, d_opt, - mol_vec1[chain_i]+mol_vec2[chain_j]); + mol_vec1[chain_i]+mol_vec2[chain_j], + outfmt_opt, invmap); + + if (outfmt_opt>=2) + get_seqID(invmap, seqx, seqy, ylen, Liden, n_ali8); /* print result */ output_results( - xname.substr(dir1_opt.size()).c_str(), - yname.substr(dir2_opt.size()).c_str(), + xname.substr(dir1_opt.size()+dir_opt.size()).c_str(), + yname.substr(dir2_opt.size()+dir_opt.size()).c_str(), chainID_list1[chain_i].c_str(), chainID_list2[chain_j].c_str(), xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, - n_ali8, n_ali, L_ali, TM_ali, rmsd_ali, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, - "", outfmt_opt, ter_opt, "", - false, false, a_opt, u_opt, d_opt); + "", outfmt_opt, ter_opt, 0, split_opt, + 0, "", false, a_opt, u_opt, d_opt, 0, + resi_vec1, resi_vec2); /* Done! 
Free memory */ + delete [] invmap; seqM.clear(); seqxA.clear(); seqyA.clear(); diff --git a/modules/bindings/src/tmalign/se.h b/modules/bindings/src/tmalign/se.h index 0021dd6d4b52ccff4f16ffbcb9b9a08ab4726818..6ccc84132d02b9e54179d9b6378937af38ce35a0 100644 --- a/modules/bindings/src/tmalign/se.h +++ b/modules/bindings/src/tmalign/se.h @@ -1,6 +1,7 @@ #include "TMalign.h" -/* entry function for se */ +/* entry function for se + * outfmt_opt>=2 should not parse sequence alignment */ int se_main( double **xa, double **ya, const char *seqx, const char *seqy, double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, @@ -11,31 +12,32 @@ int se_main( double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, const int xlen, const int ylen, const vector<string> &sequence, const double Lnorm_ass, const double d0_scale, const bool i_opt, - const bool a_opt, const bool u_opt, const bool d_opt, const int mol_type) + const bool a_opt, const bool u_opt, const bool d_opt, const int mol_type, + const int outfmt_opt, int *invmap) { double D0_MIN; //for d0 double Lnorm; //normalization length double score_d8,d0,d0_search,dcu0;//for TMscore search - double t[3]={0,0,0}; // dummy translation vection - double u[3][3]={{1,0,0},{0,1,0},{0,0,1}}; // dummy rotation matrix double **score; // Input score table for dynamic programming bool **path; // for dynamic programming double **val; // for dynamic programming - int *m1, *m2; + int *m1=NULL; + int *m2=NULL; double d; - m1=new int[xlen]; //alignd index in x - m2=new int[ylen]; //alignd index in y + if (outfmt_opt<2) + { + m1=new int[xlen]; //alignd index in x + m2=new int[ylen]; //alignd index in y + } /***********************/ /* allocate memory */ /***********************/ - int minlen = min(xlen, ylen); NewArray(&score, xlen+1, ylen+1); NewArray(&path, xlen+1, ylen+1); NewArray(&val, xlen+1, ylen+1); - int *invmap = new int[ylen+1]; - for(int i=0; i<ylen; i++) invmap[i]=-1; + //int *invmap = new int[ylen+1]; /* set d0 */ 
parameter_set4search(xlen, ylen, D0_MIN, Lnorm, @@ -52,13 +54,10 @@ int se_main( d0u, d0_search, mol_type); // set d0u /* perform alignment */ - if (!i_opt) - NWDP_TM(path, val, xa, ya, xlen, ylen, t, u, d0*d0, 0, invmap); + for(int j=0; j<ylen; j++) invmap[j]=-1; + if (!i_opt) NWDP_SE(path, val, xa, ya, xlen, ylen, d0*d0, 0, invmap); else { - for (int j = 0; j < ylen; j++)// Set aligned position to be "-1" - invmap[j] = -1; - int i1 = -1;// in C version, index starts from zero, not from one int i2 = -1; int L1 = sequence[0].size(); @@ -78,6 +77,8 @@ int se_main( rmsd0=TM1=TM2=TM3=TM4=TM5=0; int k=0; + n_ali=0; + n_ali8=0; for(int i=0,j=0; j<ylen; j++) { i=invmap[j]; @@ -85,10 +86,13 @@ int se_main( { n_ali++; d=sqrt(dist(&xa[i][0], &ya[j][0])); - if (d <= score_d8) + if (d <= score_d8 || i_opt) { - m1[k]=i; - m2[k]=j; + if (outfmt_opt<2) + { + m1[k]=i; + m2[k]=j; + } k++; TM2+=1/(1+(d/d0B)*(d/d0B)); // chain_1 TM1+=1/(1+(d/d0A)*(d/d0A)); // chain_2 @@ -107,6 +111,14 @@ int se_main( TM5/=ylen; if (n_ali8) rmsd0=sqrt(rmsd0/n_ali8); + if (outfmt_opt>=2) + { + DeleteArray(&score, xlen+1); + DeleteArray(&path, xlen+1); + DeleteArray(&val, xlen+1); + return 0; + } + /* extract aligned sequence */ int ali_len=xlen+ylen; //maximum length of alignment seqxA.assign(ali_len,'-'); @@ -115,6 +127,7 @@ int se_main( int kk=0, i_old=0, j_old=0; d=0; + Liden=0; for(int k=0; k<n_ali8; k++) { for(int i=i_old; i<m1[k]; i++) @@ -168,7 +181,7 @@ int se_main( seqM =seqM.substr(0,kk); /* free memory */ - delete [] invmap; + //delete [] invmap; delete [] m1; delete [] m2; DeleteArray(&score, xlen+1); diff --git a/modules/bindings/src/wrap_tmalign.cc b/modules/bindings/src/wrap_tmalign.cc index bbc2eb506a789b4aea3306b90e80b27c061af597..cefbe1a4497372983902f5f5f143a5c21ed1e1eb 100644 --- a/modules/bindings/src/wrap_tmalign.cc +++ b/modules/bindings/src/wrap_tmalign.cc @@ -48,8 +48,8 @@ TMAlignResult WrappedTMAlign(const geom::Vec3List& pos_one, char* seqy = new char[ylen+1]; seqx[xlen] = 
'\0'; seqy[ylen] = '\0'; - int* secx = new int[xlen]; - int* secy = new int[ylen]; + char* secx = new char[xlen]; + char* secy = new char[ylen]; // use TMalign functionality to generate position arrays double** xa; @@ -83,7 +83,6 @@ TMAlignResult WrappedTMAlign(const geom::Vec3List& pos_one, int a_opt = 0; std::vector<String> sequence; bool i_opt = false; - bool I_opt = false; double TMcut = -1; // following variables are copied from the TMAlign source code @@ -104,7 +103,7 @@ TMAlignResult WrappedTMAlign(const geom::Vec3List& pos_one, TMalign_main(xa, ya, seqx, seqy, secx, secy, t0, u0, TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, xlen, ylen, - sequence, Lnorm_ass, d0_scale, i_opt, I_opt, a_opt, u_opt, d_opt, + sequence, Lnorm_ass, d0_scale, i_opt, a_opt, u_opt, d_opt, fast, 0, TMcut); // cleanup diff --git a/modules/bindings/tests/test_tmtools.py b/modules/bindings/tests/test_tmtools.py index 148e6263c39fc06dc6514b91d2c0c76d78f759b2..cddd9ed7aaeb2cb3176828b95df6e80e38b53355 100644 --- a/modules/bindings/tests/test_tmtools.py +++ b/modules/bindings/tests/test_tmtools.py @@ -4,6 +4,7 @@ from ost import settings from ost import testutils from ost.seq.alg import SequenceIdentity from ost.bindings import tmtools +from ost.bindings import WrappedTMAlign class TestTMBindings(unittest.TestCase): @@ -54,6 +55,24 @@ class TestTMBindings(unittest.TestCase): identity = geom.Mat4(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1) self.assertEqual(tm_result.transform, identity) + def testWrappedTMAlign(self): + + tm_result = WrappedTMAlign(self.protein.CreateFullView().chains[0], + self.protein.CreateFullView().chains[0]) + + # model and reference are the same, we expect pretty good results + self.assertAlmostEqual(tm_result.rmsd, 0.0, places=4) + self.assertAlmostEqual(tm_result.tm_score, 1.0, places=4) + self.assertEqual(tm_result.aligned_length, len(self.protein.chains[0].residues)) + 
self.assertEqual(SequenceIdentity(tm_result.alignment), 100.0) + + # transformation should be identity matrix (no transformation at all...) + identity = geom.Mat4(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1) + for i in range(4): + for j in range(4): + self.assertAlmostEqual(tm_result.transform[i,j], identity[i,j]) + + if __name__ == "__main__": testutils.RunTests()