diff --git a/modules/bindings/src/USalign/HwRMSD.cpp b/modules/bindings/src/USalign/HwRMSD.cpp index 651d82456462f0e1ced7970fd2d27c77d7dd3981..df7473e7a343f8a4359fbb02b1642333666e2c19 100644 --- a/modules/bindings/src/USalign/HwRMSD.cpp +++ b/modules/bindings/src/USalign/HwRMSD.cpp @@ -306,6 +306,8 @@ int main(int argc, char *argv[]) else if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); } + + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name if (atom_opt.size()!=4) PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") @@ -387,8 +389,8 @@ int main(int argc, char *argv[]) { /* parse chain 1 */ xname=chain1_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, mol_vec1, + ter_opt, infmt1_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -425,8 +427,8 @@ int main(int argc, char *argv[]) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, - het_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, autojustify, + split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname diff --git a/modules/bindings/src/USalign/LICENSE b/modules/bindings/src/USalign/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..023d1c5b0c1635384026d17b7dc2529a456b8aad --- /dev/null +++ b/modules/bindings/src/USalign/LICENSE @@ -0,0 +1,15 @@ + US-align: universal structure alignment of monomeric and complex proteins + and nucleic acids + + References to cite: + (1) Chengxin Zhang, Morgan Shine, Anna Marie Pyle, Yang Zhang + (2022) Nat Methods. 19(9), 1109-1115. + (2) Chengxin Zhang, Anna Marie Pyle (2022) iScience. 25(10), 105218. + + DISCLAIMER: + Permission to use, copy, modify, and distribute this program for + any purpose, with or without fee, is hereby granted, provided that + the notices on the head, the reference information, and this + copyright notice appear in all copies or substantial portions of + the Software. It is provided "as is" without express or implied + warranty. diff --git a/modules/bindings/src/USalign/MMalign.cpp b/modules/bindings/src/USalign/MMalign.cpp index 816798b248be6b67657bc046865b76e3f79cd814..886cb76000e3d0aa66f41b2ec7b70d0a92647790 100644 --- a/modules/bindings/src/USalign/MMalign.cpp +++ b/modules/bindings/src/USalign/MMalign.cpp @@ -346,11 +346,11 @@ int main(int argc, char *argv[]) /* parse complex */ parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, - atom_opt, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, resi_vec1); + atom_opt, false, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, resi_vec1); if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 1"); parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, - atom_opt, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); + atom_opt, false, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); if (ya_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 2"); int len_aa=getmin(xlen_aa,ylen_aa); int len_na=getmin(xlen_na,ylen_na); diff --git a/modules/bindings/src/USalign/MMalign.h b/modules/bindings/src/USalign/MMalign.h index 4b480da62dd30b3c9d32a3b602d941166b7368df..d9b1fb19f9ffbaafdf303a5f334a9fc6ec7258a5 100644 --- a/modules/bindings/src/USalign/MMalign.h +++ b/modules/bindings/src/USalign/MMalign.h @@ -1041,8 +1041,8 @@ void parse_chain_list(const vector<string>&chain_list, vector<vector<char> >&sec_vec, vector<int>&mol_vec, vector<int>&len_vec, vector<string>&chainID_list, const int ter_opt, const int split_opt, const string mol_opt, const int infmt_opt, const string atom_opt, - const int mirror_opt, const int het_opt, int &len_aa, int &len_na, - const int o_opt, vector<string>&resi_vec) + const bool autojustify, const int mirror_opt, const int het_opt, + int &len_aa, int &len_na, const int o_opt, vector<string>&resi_vec) { size_t i; int chain_i,r; @@ -1063,8 +1063,8 @@ void parse_chain_list(const vector<string>&chain_list, for (i=0;i<chain_list.size();i++) { name=chain_list[i]; - chainnum=get_PDB_lines(name, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); + chainnum=get_PDB_lines(name, PDB_lines, chainID_list, mol_vec, + ter_opt, infmt_opt, atom_opt, autojustify, split_opt, het_opt); if (!chainnum) { cerr<<"Warning! Cannot parse file: "<<name diff --git a/modules/bindings/src/USalign/NWalign.cpp b/modules/bindings/src/USalign/NWalign.cpp index 6b7b86c2db202c7338b3d7b576d49337e0e4ec4b..3e57ce035aa36f99913d9c68da1ba6ea9933bd63 100644 --- a/modules/bindings/src/USalign/NWalign.cpp +++ b/modules/bindings/src/USalign/NWalign.cpp @@ -197,6 +197,8 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("-suffix is only valid if -dir, -dir1 or -dir2 is set"); if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); + + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name if (atom_opt.size()!=4) PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") @@ -247,8 +249,8 @@ int main(int argc, char *argv[]) xname=chain1_list[i]; if (infmt1_opt>=4) xchainnum=get_FASTA_lines(xname, PDB_lines1, chainID_list1, mol_vec1, ter_opt, split_opt); - else xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + else xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, mol_vec1, + ter_opt, infmt1_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -284,7 +286,7 @@ int main(int argc, char *argv[]) chainID_list2, mol_vec2, ter_opt, split_opt); else ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, mol_vec2, ter_opt, infmt2_opt, - atom_opt, split_opt, het_opt); + atom_opt, autojustify, split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname diff --git a/modules/bindings/src/USalign/NWalign.h b/modules/bindings/src/USalign/NWalign.h index 7d6856b98db338b5aa24977baea7585df0006f8e..1ff8256acf8399cc4e6b6017208976774e33dd74 100644 --- a/modules/bindings/src/USalign/NWalign.h +++ b/modules/bindings/src/USalign/NWalign.h @@ -530,17 +530,17 @@ int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, int i2=0; // positions in resi_vec2 int xlen=resi_vec1.size(); int ylen=resi_vec2.size(); - if (byresi_opt==4 || byresi_opt==5) // global or glocal sequence alignment + if (byresi_opt==4 || byresi_opt==5 || byresi_opt==7) // global or glocal sequence alignment { int *invmap; int glocal=0; - if (byresi_opt==5) glocal=2; + if (byresi_opt==5 || byresi_opt==7) glocal=2; int mol_type=0; for (i1=0;i1<xlen;i1++) if ('a'<seqx[i1] && seqx[i1]<'z') mol_type++; else mol_type--; for (i2=0;i2<ylen;i2++) - if ('a'<seqx[i2] && seqx[i2]<'z') mol_type++; + if ('a'<seqy[i2] && seqy[i2]<'z') mol_type++; else mol_type--; NWalign_main(seqx, seqy, xlen, ylen, sequence[0],sequence[1], mol_type, invmap, 0, glocal); @@ -657,7 +657,7 @@ int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, const vector<string> resi_vec1, const vector<string> resi_vec2, const vector<int> xlen_vec, const vector<int> ylen_vec, - const int chain_i, const int chain_j) + const int chain_i, const int chain_j, const int byresi_opt) { sequence.clear(); sequence.push_back(""); @@ -671,6 +671,24 @@ int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, for (i=0;i<chain_i;i++) i1+=xlen_vec[i]; for (j=0;j<chain_j;j++) i2+=ylen_vec[j]; + if (byresi_opt==7) + { + int *invmap; + int glocal=2; + int mol_type=0; + + for (i=0;i<xlen;i++) + if ('a'<seqx[i] && seqx[i]<'z') mol_type++; + else mol_type--; + for (i=0;i<ylen;i++) + if ('a'<seqy[i] && seqy[i]<'z') mol_type++; + else mol_type--; + NWalign_main(seqx, seqy, xlen, ylen, sequence[0],sequence[1], + mol_type, invmap, 0, glocal); + delete [] invmap; + return sequence[0].size(); + } + i=j=0; while(i<xlen && j<ylen) { diff --git a/modules/bindings/src/USalign/OST_INFO b/modules/bindings/src/USalign/OST_INFO index 42124da831fd2d3412bc252f2f34eee5ad17027b..7c8299cdd700e97656d25a6aa9083e0945d15e8c 100644 --- a/modules/bindings/src/USalign/OST_INFO +++ b/modules/bindings/src/USalign/OST_INFO @@ -1,6 +1,6 @@ -Source code has been cloned May 4 2023 from: +Source code has been cloned Nov 14 2023 from: https://github.com/pylelab/USalign last commit: -8d968e0111ca275958f209d76b1cd10598864a34 +58b42af9d58436279c21b4f4074db87f072fcc21 diff --git a/modules/bindings/src/USalign/TMalign.cpp b/modules/bindings/src/USalign/TMalign.cpp index c822d4c3094c0e345c321dfce11711724eb06e51..90682ae459456018412bc78d0e2d35e1e88735c9 100644 --- a/modules/bindings/src/USalign/TMalign.cpp +++ b/modules/bindings/src/USalign/TMalign.cpp @@ -450,8 +450,8 @@ int main(int argc, char *argv[]) { /* parse chain 1 */ xname=chain1_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, mol_vec1, + ter_opt, infmt1_opt, atom_opt, false, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -491,8 +491,8 @@ int main(int argc, char *argv[]) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, - het_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, false, + split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname diff --git a/modules/bindings/src/USalign/TMalign.h b/modules/bindings/src/USalign/TMalign.h index 81196a8071fcec442bba46867e246bb9450facf8..3d9097bc9e63a636322f460c630abad122b3c4d7 100644 --- a/modules/bindings/src/USalign/TMalign.h +++ b/modules/bindings/src/USalign/TMalign.h @@ -852,8 +852,8 @@ void make_sec(char *seq, double **x, int len, char *sec,const string atom_opt) } } - // From 5' to 3': A0 C0 D0 B0: A0 paired to B0, C0 paired to D0 - vector<int> A0,B0,C0,D0; + // From 5' to 3': A0_var C0_var D0_var B0_var: A0_var paired to B0_var, C0_var paired to D0_var + vector<int> A0_var,B0_var,C0_var,D0_var; for (i=0; i<len-2; i++) { for (j=i+3; j<len; j++) @@ -867,32 +867,32 @@ void make_sec(char *seq, double **x, int len, char *sec,const string atom_opt) ii=i; jj=j; } - A0.push_back(i); - B0.push_back(j); - C0.push_back(ii); - D0.push_back(jj); + A0_var.push_back(i); + B0_var.push_back(j); + C0_var.push_back(ii); + D0_var.push_back(jj); } } //int sign; - for (i=0;i<A0.size();i++) + for (i=0;i<A0_var.size();i++) { /* sign=0; - if(C0[i]-A0[i]<=1) + if(C0_var[i]-A0_var[i]<=1) { - for(j=0;j<A0.size();j++) + for(j=0;j<A0_var.size();j++) { if(i==j) continue; - if((A0[j]>=A0[i]&&A0[j]<=C0[i])|| - (C0[j]>=A0[i]&&C0[j]<=C0[i])|| - (D0[j]>=A0[i]&&D0[j]<=C0[i])|| - (B0[j]>=A0[i]&&B0[j]<=C0[i])|| - (A0[j]>=D0[i]&&A0[j]<=B0[i])|| - (C0[j]>=D0[i]&&C0[j]<=B0[i])|| - (D0[j]>=D0[i]&&D0[j]<=B0[i])|| - (B0[j]>=D0[i]&&B0[j]<=B0[i])) + if((A0_var[j]>=A0_var[i]&&A0_var[j]<=C0_var[i])|| + (C0_var[j]>=A0_var[i]&&C0_var[j]<=C0_var[i])|| + (D0_var[j]>=A0_var[i]&&D0_var[j]<=C0_var[i])|| + (B0_var[j]>=A0_var[i]&&B0_var[j]<=C0_var[i])|| + (A0_var[j]>=D0_var[i]&&A0_var[j]<=B0_var[i])|| + (C0_var[j]>=D0_var[i]&&C0_var[j]<=B0_var[i])|| + (D0_var[j]>=D0_var[i]&&D0_var[j]<=B0_var[i])|| + (B0_var[j]>=D0_var[i]&&B0_var[j]<=B0_var[i])) { sign=-1; break; @@ -904,18 +904,18 @@ void make_sec(char *seq, double **x, int len, char *sec,const string atom_opt) for (j=0;;j++) { - if(A0[i]+j>C0[i]) break; - sec[A0[i]+j]='<'; - sec[D0[i]+j]='>'; + if(A0_var[i]+j>C0_var[i]) break; + sec[A0_var[i]+j]='<'; + sec[D0_var[i]+j]='>'; } } sec[len]=0; /* clean up */ - A0.clear(); - B0.clear(); - C0.clear(); - D0.clear(); + A0_var.clear(); + B0_var.clear(); + C0_var.clear(); + D0_var.clear(); bp.clear(); } @@ -1687,7 +1687,7 @@ void output_pymol(const string xname, const string yname, { resi1_sele+=" or i. "+curr_resi1; resi1_bond+="bond structure1 and i. "+prev_resi1+ - ", i. "+curr_resi1+"\n"; + ", structure1 and i. "+curr_resi1+"\n"; } if (resi2_sele.size()==0) resi2_sele = "i. "+curr_resi2; @@ -1695,7 +1695,7 @@ void output_pymol(const string xname, const string yname, { resi2_sele+=" or i. "+curr_resi2; resi2_bond+="bond structure2 and i. "+prev_resi2+ - ", i. "+curr_resi2+"\n"; + ", structure2 and i. "+curr_resi2+"\n"; } prev_resi1=curr_resi1; prev_resi2=curr_resi2; diff --git a/modules/bindings/src/USalign/TMscore.cpp b/modules/bindings/src/USalign/TMscore.cpp index c84d742c1f47c199bc179185e346dffcc2e987f5..0946fb2e33f196fea04226e104182f29b0e59381 100644 --- a/modules/bindings/src/USalign/TMscore.cpp +++ b/modules/bindings/src/USalign/TMscore.cpp @@ -304,6 +304,8 @@ int main(int argc, char *argv[]) else if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); } + + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name if (atom_opt.size()!=4) PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") @@ -375,8 +377,8 @@ int main(int argc, char *argv[]) { /* parse chain 1 */ xname=chain1_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, mol_vec1, + ter_opt, infmt1_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -412,8 +414,8 @@ int main(int argc, char *argv[]) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, - het_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, autojustify, + split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname diff --git a/modules/bindings/src/USalign/USalign.cpp b/modules/bindings/src/USalign/USalign.cpp index fdd1d8b958446fbfcc5eeb331c2caba49b393212..151e3ca48dfabfc95768a058b6791d21e713a7ab 100644 --- a/modules/bindings/src/USalign/USalign.cpp +++ b/modules/bindings/src/USalign/USalign.cpp @@ -11,7 +11,7 @@ void print_version() cout << "\n" " ********************************************************************\n" -" * US-align (Version 20220924) *\n" +" * US-align (Version 20230609) *\n" " * Universal Structure Alignment of Proteins and Nucleic Acids *\n" " * Reference: C Zhang, M Shine, AM Pyle, Y Zhang. (2022) Nat Methods*\n" " * Please email comments and suggestions to zhang@zhanggroup.org *\n" @@ -31,12 +31,16 @@ void print_extra_help() " -fast Fast but slightly inaccurate alignment\n" "\n" " -dir Perform all-against-all alignment among the list of PDB\n" -" chains listed by 'chain_list' under 'chain_folder'. Note\n" -" that the slash is necessary.\n" +" chains listed by 'chain_list' under 'chain_folder'.\n" " $ USalign -dir chain_folder/ chain_list\n" "\n" +//"-dirpair Perform batch alignment for each pair of chains listed by\n" +//" 'chain_pair_list' under 'chain_folder'. Each line consist of\n" +//" two chains, separated by tab or space.\n" +//" $ USalign -dirpair chain_folder/ chain_pair_list\n" +//"\n" " -dir1 Use chain2 to search a list of PDB chains listed by 'chain1_list'\n" -" under 'chain1_folder'. Note that the slash is necessary.\n" +" under 'chain1_folder'.\n" " $ USalign -dir1 chain1_folder/ chain1_list chain2\n" "\n" " -dir2 Use chain1 to search a list of PDB chains listed by 'chain2_list'\n" @@ -92,7 +96,7 @@ void print_extra_help() " -se Do not perform superposition. Useful for extracting alignment from\n" " superposed structure pairs\n" "\n" -" -infmt1 Input format for structure_11\n" +" -infmt1 Input format for structure_1\n" " -infmt2 Input format for structure_2\n" " -1: (default) automatically detect PDB or PDBx/mmCIF format\n" " 0: PDB format\n" @@ -100,6 +104,10 @@ void print_extra_help() //" 2: xyz format\n" " 3: PDBx/mmCIF format\n" "\n" +//"-chainmap (only useful for -mm 1) use the final chain mapping 'chainmap.txt'\n" +//" specified by user. 'chainmap.txt' is a tab-seperated text with two\n" +//" columns, one for each complex\n" +//"\n" "Advanced usage 1 (generate an image for a pair of superposed structures):\n" " USalign 1cpc.pdb 1mba.pdb -o sup\n" " pymol -c -d @sup_all_atm.pml -g sup_all_atm.png\n" @@ -164,6 +172,10 @@ void print_help(bool h_opt=false) " 6: superpose two complex structures by first deriving optimal\n" " chain mapping, followed by TM-score superposition for residues\n" " with the same residue ID\n" +" 7: sequence dependent alignment of two complex structures:\n" +" perform global sequence alignment of each chain pair, derive\n" +" optimal chain mapping, and then superpose two complex\n" +" structures by TM-score\n" "\n" " -I Use the final alignment specified by FASTA file 'align.txt'\n" "\n" @@ -219,10 +231,10 @@ int TMalign(string &xname, string &yname, const string &fname_super, const int infmt1_opt, const int infmt2_opt, const int ter_opt, const int split_opt, const int outfmt_opt, const bool fast_opt, const int cp_opt, const int mirror_opt, const int het_opt, - const string &atom_opt, const string &mol_opt, const string &dir_opt, - const string &dir1_opt, const string &dir2_opt, const int byresi_opt, - const vector<string> &chain1_list, const vector<string> &chain2_list, - const bool se_opt) + const string &atom_opt, const bool autojustify, const string &mol_opt, + const string &dir_opt, const string &dirpair_opt, const string &dir1_opt, + const string &dir2_opt, const int byresi_opt, const vector<string> &chain1_list, + const vector<string> &chain2_list, const bool se_opt) { /* declare previously global variables */ vector<vector<string> >PDB_lines1; // text of chain1 @@ -252,8 +264,8 @@ int TMalign(string &xname, string &yname, const string &fname_super, { /* parse chain 1 */ xname=chain1_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, mol_vec1, + ter_opt, infmt1_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -287,13 +299,14 @@ int TMalign(string &xname, string &yname, const string &fname_super, for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) { + if (dirpair_opt.size() && j!=i) continue; /* parse chain 2 */ if (PDB_lines2.size()==0) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, - het_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, autojustify, + split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname @@ -409,8 +422,8 @@ int TMalign(string &xname, string &yname, const string &fname_super, seqxA,seqyA,outfmt_opt,left_num,right_num, left_aln_num,right_aln_num); output_results( - xname.substr(dir1_opt.size()+dir_opt.size()), - yname.substr(dir2_opt.size()+dir_opt.size()), + xname.substr(dir1_opt.size()+dir_opt.size()+dirpair_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()+dirpair_opt.size()), chainID_list1[chain_i], chainID_list2[chain_j], xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, seqM.c_str(), @@ -498,10 +511,10 @@ int MMalign(const string &xname, const string &yname, const double TMcut, const int infmt1_opt, const int infmt2_opt, const int ter_opt, const int split_opt, const int outfmt_opt, bool fast_opt, const int mirror_opt, const int het_opt, - const string &atom_opt, const string &mol_opt, + const string &atom_opt, const bool autojustify, const string &mol_opt, const string &dir1_opt, const string &dir2_opt, const vector<string> &chain1_list, const vector<string> &chain2_list, - const int byresi_opt) + const int byresi_opt,const string&chainmapfile) { /* declare previously global variables */ vector<vector<vector<double> > > xa_vec; // structure of complex1 @@ -529,11 +542,12 @@ int MMalign(const string &xname, const string &yname, /* parse complex */ parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, - atom_opt, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, resi_vec1); + atom_opt, autojustify, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, + resi_vec1); if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 1"); parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, - atom_opt, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); + atom_opt, autojustify, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); if (ya_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 2"); int len_aa=getmin(xlen_aa,ylen_aa); int len_na=getmin(xlen_na,ylen_na); @@ -545,6 +559,63 @@ int MMalign(const string &xname, const string &yname, int i_opt=0; if (byresi_opt) i_opt=3; + map<int,int> chainmap; + if (chainmapfile.size()) + { + string line; + int chainidx1,chainidx2; + vector<string> line_vec; + ifstream fin; + bool fromStdin=(chainmapfile=="-"); + if (!fromStdin) fin.open(chainmapfile.c_str()); + while (fromStdin?cin.good():fin.good()) + { + if (fromStdin) getline(cin,line); + else getline(fin,line); + if (line.size()==0 || line[0]=='#') continue; + split(line,line_vec,'\t'); + if (line_vec.size()==2) + { + chainidx1=-1; + chainidx2=-1; + + for (i=0;i<chainID_list1.size();i++) + { + if (line_vec[0]==chainID_list1[i] || + ":"+line_vec[0]==chainID_list1[i] || + ":1,"+line_vec[0]==chainID_list1[i]) + { + chainidx1=i; + break; + } + } + for (i=0;i<chainID_list2.size();i++) + { + if (line_vec[1]==chainID_list2[i] || + ":"+line_vec[1]==chainID_list2[i] || + ":1,"+line_vec[1]==chainID_list2[i]) + { + chainidx2=i; + break; + } + } + if (chainidx1>=0 && chainidx2>=0) + { + if (chainmap.count(chainidx1)) + cerr<<"ERROR! "<<line_vec[0]<<" already mapped"<<endl; + chainmap[chainidx1]=chainidx2; + } + else cerr<<"ERROR! Cannot map "<<line<<endl; + } + else cerr<<"ERROR! Cannot map "<<line<<endl; + for (i=0;i<line_vec.size();i++) line_vec[i].clear(); line_vec.clear(); + } + if (!fromStdin) fin.close(); + if (chainmap.size()==0) + cerr<<"ERROR! cannot map any chain pair from "<<chainmapfile<<endl; + } + + /* perform monomer alignment if there is only one chain */ if (xa_vec.size()==1 && ya_vec.size()==1) { @@ -671,6 +742,11 @@ int MMalign(const string &xname, const string &yname, TMave_mat[i][j]=-1; continue; } + if (chainmap.size() && chainmap[i]!=j) + { + TMave_mat[i][j]=-1; + continue; + } ylen=ylen_vec[j]; if (ylen<3) @@ -704,8 +780,8 @@ int MMalign(const string &xname, const string &yname, if (byresi_opt) { - int total_aln=extract_aln_from_resi(sequence, - seqx,seqy,resi_vec1,resi_vec2,xlen_vec,ylen_vec, i, j); + int total_aln=extract_aln_from_resi(sequence, seqx,seqy, + resi_vec1,resi_vec2,xlen_vec,ylen_vec, i, j, byresi_opt); seqxA_mat[i][j]=sequence[0]; seqyA_mat[i][j]=sequence[1]; if (total_aln>xlen+ylen-3) @@ -776,7 +852,7 @@ int MMalign(const string &xname, const string &yname, /* refine alignment for large oligomers */ int aln_chain_num=count_assign_pair(assign1_list,chain1_num); bool is_oligomer=(aln_chain_num>=3); - if (aln_chain_num==2) // dimer alignment + if (aln_chain_num==2 && chainmap.size()==0) // dimer alignment { int na_chain_num1,na_chain_num2,aa_chain_num1,aa_chain_num2; count_na_aa_chain_num(na_chain_num1,aa_chain_num1,mol_vec1); @@ -798,7 +874,7 @@ int MMalign(const string &xname, const string &yname, else is_oligomer=true; /* align oligomers to dimer */ } - if (aln_chain_num>=3 || is_oligomer) // oligomer alignment + if ((aln_chain_num>=3 || is_oligomer) && chainmap.size()==0) // oligomer alignment { /* extract centroid coordinates */ double **xcentroids; @@ -941,6 +1017,7 @@ int MMalign(const string &xname, const string &yname, ylen_vec.clear(); // length of complex2 vector<string> ().swap(resi_vec1); // residue index for chain1 vector<string> ().swap(resi_vec2); // residue index for chain2 + map<int,int> ().swap(chainmap); return 1; } @@ -953,7 +1030,7 @@ int MMdock(const string &xname, const string &yname, const string &fname_super, const double TMcut, const int infmt1_opt, const int infmt2_opt, const int ter_opt, const int split_opt, const int outfmt_opt, bool fast_opt, const int mirror_opt, const int het_opt, - const string &atom_opt, const string &mol_opt, + const string &atom_opt, const bool autojustify, const string &mol_opt, const string &dir1_opt, const string &dir2_opt, const vector<string> &chain1_list, const vector<string> &chain2_list) { @@ -983,11 +1060,12 @@ int MMdock(const string &xname, const string &yname, const string &fname_super, /* parse complex */ parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, - atom_opt, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, resi_vec1); + atom_opt, autojustify, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, + resi_vec1); if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 0 individual chain"); parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, - atom_opt, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); + atom_opt, autojustify, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); if (xa_vec.size()>ya_vec.size()) PrintErrorAndQuit( "ERROR! more individual chains to align than number of chains in complex template"); int len_aa=getmin(xlen_aa,ylen_aa); @@ -1417,9 +1495,8 @@ int mTMalign(string &xname, string &yname, const string &fname_super, bool u_opt, const bool d_opt, const bool full_opt, const double TMcut, const int infmt_opt, const int ter_opt, const int split_opt, const int outfmt_opt, bool fast_opt, - const int het_opt, - const string &atom_opt, const string &mol_opt, const string &dir_opt, - const int byresi_opt, + const int het_opt, const string &atom_opt, const bool autojustify, + const string &mol_opt, const string &dir_opt, const int byresi_opt, const vector<string> &chain_list) { /* declare previously global variables */ @@ -1441,7 +1518,7 @@ int mTMalign(string &xname, string &yname, const string &fname_super, /* parse chain list */ parse_chain_list(chain_list, a_vec, seq_vec, sec_vec, mol_vec, len_vec, chainID_list, ter_opt, split_opt, mol_opt, infmt_opt, - atom_opt, false, het_opt, len_aa, len_na, o_opt, resi_vec); + atom_opt, autojustify, false, het_opt, len_aa, len_na, o_opt, resi_vec); int chain_num=a_vec.size(); if (chain_num<=1) PrintErrorAndQuit("ERROR! <2 chains for multiple alignment"); if (m_opt||o_opt) for (i=0;i<chain_num;i++) ua_vec.push_back(a_vec[i]); @@ -2128,10 +2205,11 @@ int SOIalign(string &xname, string &yname, const string &fname_super, const int infmt1_opt, const int infmt2_opt, const int ter_opt, const int split_opt, const int outfmt_opt, const bool fast_opt, const int cp_opt, const int mirror_opt, const int het_opt, - const string &atom_opt, const string &mol_opt, const string &dir_opt, - const string &dir1_opt, const string &dir2_opt, - const vector<string> &chain1_list, const vector<string> &chain2_list, - const bool se_opt, const int closeK_opt, const int mm_opt) + const string &atom_opt, const bool autojustify, const string &mol_opt, + const string &dir_opt, const string &dirpair_opt, const string &dir1_opt, + const string &dir2_opt, const vector<string> &chain1_list, + const vector<string> &chain2_list, const bool se_opt, + const int closeK_opt, const int mm_opt) { /* declare previously global variables */ vector<vector<string> >PDB_lines1; // text of chain1 @@ -2164,8 +2242,8 @@ int SOIalign(string &xname, string &yname, const string &fname_super, { /* parse chain 1 */ xname=chain1_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, mol_vec1, + ter_opt, infmt1_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -2206,13 +2284,14 @@ int SOIalign(string &xname, string &yname, const string &fname_super, for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) { + if (dirpair_opt.size() && i!=j) continue; /* parse chain 2 */ if (PDB_lines2.size()==0) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, - het_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, autojustify, + split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname @@ -2315,8 +2394,8 @@ int SOIalign(string &xname, string &yname, const string &fname_super, /* print result */ if (outfmt_opt==0) print_version(); output_results( - xname.substr(dir1_opt.size()+dir_opt.size()), - yname.substr(dir2_opt.size()+dir_opt.size()), + xname.substr(dir1_opt.size()+dir_opt.size()+dirpair_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()+dirpair_opt.size()), chainID_list1[chain_i], chainID_list2[chain_j], xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, seqM.c_str(), @@ -2400,11 +2479,11 @@ int flexalign(string &xname, string &yname, const string &fname_super, const bool u_opt, const bool d_opt, const double TMcut, const int infmt1_opt, const int infmt2_opt, const int ter_opt, const int split_opt, const int outfmt_opt, const bool fast_opt, - const int mirror_opt, const int het_opt, - const string &atom_opt, const string &mol_opt, const string &dir_opt, - const string &dir1_opt, const string &dir2_opt, const int byresi_opt, - const vector<string> &chain1_list, const vector<string> &chain2_list, - const int hinge_opt) + const int mirror_opt, const int het_opt, const string &atom_opt, + const bool autojustify, const string &mol_opt, const string &dir_opt, + const string &dirpair_opt, const string &dir1_opt, const string &dir2_opt, + const int byresi_opt, const vector<string> &chain1_list, + const vector<string> &chain2_list, const int hinge_opt) { /* declare previously global variables */ vector<vector<string> >PDB_lines1; // text of chain1 @@ -2435,7 +2514,8 @@ int flexalign(string &xname, string &yname, const string &fname_super, /* parse chain 1 */ xname=chain1_list[i]; xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + mol_vec1, ter_opt, infmt1_opt, atom_opt, autojustify, + split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -2469,13 +2549,14 @@ int flexalign(string &xname, string &yname, const string &fname_super, for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) { + if (dirpair_opt.size() && i!=j) continue; /* parse chain 2 */ if (PDB_lines2.size()==0) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, - het_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, autojustify, + split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname @@ -2602,8 +2683,8 @@ int flexalign(string &xname, string &yname, const string &fname_super, /* print result */ if (outfmt_opt==0) print_version(); output_flexalign_results( - xname.substr(dir1_opt.size()+dir_opt.size()), - yname.substr(dir2_opt.size()+dir_opt.size()), + xname.substr(dir1_opt.size()+dir_opt.size()+dirpair_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()+dirpair_opt.size()), chainID_list1[chain_i], chainID_list2[chain_j], xlen, ylen, t0, u0, tu_vec, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, seqM.c_str(), @@ -2708,11 +2789,14 @@ int main(int argc, char *argv[]) string mol_opt ="auto";// auto-detect the molecule type as protein/RNA string suffix_opt=""; // set -suffix to empty string dir_opt =""; // set -dir to empty + string dirpair_opt=""; // set -dirpair to empty string dir1_opt =""; // set -dir1 to empty string dir2_opt =""; // set -dir2 to empty + string chainmapfile=""; // chain mapping between two complexes int byresi_opt=0; // set -byresi to 0 vector<string> chain1_list; // only when -dir1 is set vector<string> chain2_list; // only when -dir2 is set + vector<pair<string,string> > chain_pair_list; // only when -dirpair is set for(int i = 1; i < argc; i++) { @@ -2815,6 +2899,12 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("ERROR! -I and -i cannot be used together"); fname_lign = argv[i + 1]; i_opt = 3; i++; } + else if (!strcmp(argv[i], "-chainmap") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -chainmap"); + chainmapfile = argv[i + 1]; i++; + } else if (!strcmp(argv[i], "-m") ) { if (i>=(argc-1)) @@ -2862,10 +2952,6 @@ int main(int argc, char *argv[]) if (i>=(argc-1)) PrintErrorAndQuit("ERROR! Missing value for -atom"); atom_opt=argv[i + 1]; i++; - if (atom_opt.size()!=4) PrintErrorAndQuit( - "ERROR! Atom name must have 4 characters, including space.\n" - "For example, C alpha, C3' and P atoms should be specified by\n" - "-atom \" CA \", -atom \" P \" and -atom \" C3'\", respectively."); } else if ( !strcmp(argv[i],"-mol") ) { @@ -2885,6 +2971,12 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("ERROR! Missing value for -dir"); dir_opt=argv[i + 1]; i++; } + else if ( !strcmp(argv[i],"-dirpair") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dirpair"); + dirpair_opt=argv[i + 1]; i++; + } else if ( !strcmp(argv[i],"-dir1") ) { if (i>=(argc-1)) @@ -2956,8 +3048,9 @@ int main(int argc, char *argv[]) else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); } - if(xname.size()==0 || (yname.size()==0 && dir_opt.size()==0) || - (yname.size() && dir_opt.size())) + if (xname.size()==0 || (yname.size() && dir_opt.size()) || + (yname.size() && dirpair_opt.size()) || + (yname.size()==0 && dir_opt.size()==0 && dirpair_opt.size()==0)) { if (h_opt) print_help(h_opt); if (v_opt) @@ -2967,15 +3060,15 @@ int main(int argc, char *argv[]) } if (xname.size()==0) PrintErrorAndQuit("Please provide input structures"); - else if (yname.size()==0 && dir_opt.size()==0 && mm_opt!=4) + else if (yname.size()==0 && dir_opt.size()==0 && dirpair_opt.size()==0 && mm_opt!=4) PrintErrorAndQuit("Please provide structure B"); - else if (yname.size() && dir_opt.size()) + else if (yname.size() && dir_opt.size()+dirpair_opt.size()) PrintErrorAndQuit("Please provide only one file name if -dir is set"); } - if (suffix_opt.size() && dir_opt.size()+dir1_opt.size()+dir2_opt.size()==0) + if (suffix_opt.size() && dir_opt.size()+dirpair_opt.size()+dir1_opt.size()+dir2_opt.size()==0) PrintErrorAndQuit("-suffix is only valid if -dir, -dir1 or -dir2 is set"); - if ((dir_opt.size() || dir1_opt.size() || dir2_opt.size())) + if ((dir_opt.size() || dirpair_opt.size() || dir1_opt.size() || dir2_opt.size())) { if (mm_opt!=2 && mm_opt!=4) { @@ -2984,16 +3077,30 @@ int main(int argc, char *argv[]) if (m_opt && fname_matrix!="-") PrintErrorAndQuit("-m can only be - or unset when using -dir, -dir1 or -dir2"); } - else if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) + else if ((dir_opt.size() || dirpair_opt.size() )&& (dir1_opt.size() || dir2_opt.size())) PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); + else if (dir_opt.size() && dirpair_opt.size()) + PrintErrorAndQuit("-dir cannot be set with -dirpair"); } if (o_opt && (infmt1_opt!=-1 && infmt1_opt!=0 && infmt1_opt!=3)) PrintErrorAndQuit("-o can only be used with -infmt1 -1, 0 or 3"); + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name if (mol_opt=="protein" && atom_opt=="auto") atom_opt=" CA "; else if (mol_opt=="RNA" && atom_opt=="auto") atom_opt=" C3'"; + if (atom_opt.size()!=4) + { + cerr<<"ERROR! Atom name must have 4 characters, including space.\n" + "For example, C alpha, C3' and P atoms should be specified by\n" + "-atom \" CA \", -atom \" P \" and -atom \" C3'\", respectively."<<endl; + if (atom_opt.size()>=5 || atom_opt.size()==0) return 1; + else if (atom_opt.size()==1) atom_opt=" "+atom_opt+" "; + else if (atom_opt.size()==2) atom_opt=" "+atom_opt+" "; + else if (atom_opt.size()==3) atom_opt=" "+atom_opt; + cerr<<"Change -atom to \""<<atom_opt<<"\""<<endl; + } if (d_opt && d0_scale<=0) PrintErrorAndQuit("Wrong value for option -d! It should be >0"); @@ -3003,8 +3110,8 @@ int main(int argc, char *argv[]) { if (i_opt) PrintErrorAndQuit("-byresi >=1 cannot be used with -i or -I"); - if (byresi_opt<0 || byresi_opt>6) - PrintErrorAndQuit("-byresi can only be 0 to 6"); + if (byresi_opt<0 || byresi_opt>7) + PrintErrorAndQuit("-byresi can only be 0 to 7"); if ((byresi_opt==2 || byresi_opt==3 || byresi_opt==6) && ter_opt>=2) PrintErrorAndQuit("-byresi 2 and 6 must be used with -ter <=1"); } @@ -3037,6 +3144,8 @@ int main(int argc, char *argv[]) if (ter_opt>=2 && (mm_opt==1 || mm_opt==2)) PrintErrorAndQuit("-mm 1 or 2 must be used with -ter 0 or -ter 1"); if (mm_opt==4 && (yname.size() || dir2_opt.size())) cerr<<"WARNING! structure_2 is ignored for -mm 4"<<endl; + if (dirpair_opt.size() && (mm_opt==2 || mm_opt==4)) + PrintErrorAndQuit("-mm 2 or 4 cannot be used with -dirpair"); } else if (full_opt) PrintErrorAndQuit("-full can only be used with -mm"); @@ -3058,26 +3167,34 @@ int main(int argc, char *argv[]) if (mm_opt==7 && hinge_opt>=10) PrintErrorAndQuit("ERROR! -hinge must be <10"); + if (chainmapfile.size() && mm_opt!=1) + PrintErrorAndQuit("ERROR! -chainmap must be used with -mm 1"); + /* read initial alignment file from 'align.txt' */ if (i_opt) read_user_alignment(sequence, fname_lign, i_opt); - if (byresi_opt==6) mm_opt=1; + if (byresi_opt==6 || byresi_opt==7) mm_opt=1; else if (byresi_opt) i_opt=3; if (m_opt && fname_matrix == "") // Output rotation matrix: matrix.txt PrintErrorAndQuit("ERROR! Please provide a file name for option -m!"); /* parse file list */ - if (dir1_opt.size()+dir_opt.size()==0) chain1_list.push_back(xname); - else file2chainlist(chain1_list, xname, dir_opt+dir1_opt, suffix_opt); - int i; - if (dir_opt.size()) - for (i=0;i<chain1_list.size();i++) - chain2_list.push_back(chain1_list[i]); - else if (dir2_opt.size()==0) chain2_list.push_back(yname); - else file2chainlist(chain2_list, yname, dir2_opt, suffix_opt); + if (dirpair_opt.size()) + file2chainpairlist(chain1_list,chain2_list, xname, dirpair_opt, suffix_opt); + else + { + if (dir1_opt.size()+dir_opt.size()==0) chain1_list.push_back(xname); + else file2chainlist(chain1_list, xname, dir_opt+dir1_opt, suffix_opt); + + if (dir_opt.size()) + for (i=0;i<chain1_list.size();i++) + chain2_list.push_back(chain1_list[i]); + else if (dir2_opt.size()==0) chain2_list.push_back(yname); + else file2chainlist(chain2_list, yname, dir2_opt, suffix_opt); + } if (outfmt_opt==2) { @@ -3092,43 +3209,69 @@ int main(int argc, char *argv[]) sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, a_opt, u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, split_opt, outfmt_opt, fast_opt, cp_opt, mirror_opt, het_opt, - atom_opt, mol_opt, dir_opt, dir1_opt, dir2_opt, byresi_opt, - chain1_list, chain2_list, se_opt); - else if (mm_opt==1) MMalign(xname, yname, fname_super, fname_lign, - fname_matrix, sequence, d0_scale, m_opt, o_opt, - a_opt, d_opt, full_opt, TMcut, infmt1_opt, infmt2_opt, - ter_opt, split_opt, outfmt_opt, fast_opt, mirror_opt, het_opt, - atom_opt, mol_opt, dir1_opt, dir2_opt, chain1_list, chain2_list, - byresi_opt); + atom_opt, autojustify, mol_opt, dir_opt, dirpair_opt, dir1_opt, + dir2_opt, byresi_opt, chain1_list, chain2_list, se_opt); + else if (mm_opt==1) + { + if (dirpair_opt.size()==0) MMalign(xname, yname, fname_super, + fname_lign, fname_matrix, sequence, d0_scale, m_opt, o_opt, + a_opt, d_opt, full_opt, TMcut, infmt1_opt, infmt2_opt, + ter_opt, split_opt, outfmt_opt, fast_opt, mirror_opt, het_opt, + atom_opt, autojustify, mol_opt, dir1_opt, dir2_opt, chain1_list, + chain2_list, byresi_opt,chainmapfile); + else + { + vector<string> tmp_vec1; + vector<string> tmp_vec2; + for (i=0;i<chain1_list.size();i++) + { + xname=chain1_list[i]; + yname=chain2_list[i]; + tmp_vec1.push_back(xname); + tmp_vec2.push_back(yname); + MMalign(xname, yname, fname_super, fname_lign, fname_matrix, + sequence, d0_scale, m_opt, o_opt, a_opt, d_opt, full_opt, + TMcut, infmt1_opt, infmt2_opt, ter_opt, split_opt, + outfmt_opt, fast_opt, mirror_opt, het_opt, atom_opt, + autojustify, mol_opt, dirpair_opt, dirpair_opt, tmp_vec1, + tmp_vec2, byresi_opt,chainmapfile); + tmp_vec1[0].clear(); tmp_vec1.clear(); + tmp_vec2[0].clear(); tmp_vec2.clear(); + } + } + chainmapfile.clear(); + } else if (mm_opt==2) MMdock(xname, yname, fname_super, fname_matrix, sequence, Lnorm_ass, d0_scale, m_opt, o_opt, a_opt, u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, split_opt, outfmt_opt, fast_opt, mirror_opt, het_opt, - atom_opt, mol_opt, dir1_opt, dir2_opt, chain1_list, chain2_list); + atom_opt, autojustify, mol_opt, dir1_opt, dir2_opt, + chain1_list, chain2_list); else if (mm_opt==3) ; // should be changed to mm_opt=0, cp_opt=true else if (mm_opt==4) mTMalign(xname, yname, fname_super, fname_matrix, sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, a_opt, u_opt, d_opt, full_opt, TMcut, infmt1_opt, ter_opt, split_opt, outfmt_opt, fast_opt, het_opt, - atom_opt, mol_opt, dir_opt, byresi_opt, chain1_list); + atom_opt, autojustify, mol_opt, dir_opt, byresi_opt, chain1_list); else if (mm_opt==5 || mm_opt==6) SOIalign(xname, yname, fname_super, fname_lign, fname_matrix, sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, a_opt, u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, split_opt, outfmt_opt, fast_opt, cp_opt, mirror_opt, het_opt, - atom_opt, mol_opt, dir_opt, dir1_opt, dir2_opt, - chain1_list, chain2_list, se_opt, closeK_opt, mm_opt); + atom_opt, autojustify, mol_opt, dir_opt, dirpair_opt, dir1_opt, + dir2_opt, chain1_list, chain2_list, se_opt, closeK_opt, mm_opt); else if (mm_opt==7) flexalign(xname, yname, fname_super, fname_lign, fname_matrix, sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, a_opt, u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, split_opt, outfmt_opt, fast_opt, mirror_opt, het_opt, - atom_opt, mol_opt, dir_opt, dir1_opt, dir2_opt, byresi_opt, - chain1_list, chain2_list, hinge_opt); + atom_opt, autojustify, mol_opt, dir_opt, dirpair_opt, dir1_opt, + dir2_opt, byresi_opt, chain1_list, chain2_list, hinge_opt); else cerr<<"WARNING! -mm "<<mm_opt<<" not implemented"<<endl; /* clean up */ vector<string>().swap(chain1_list); vector<string>().swap(chain2_list); vector<string>().swap(sequence); + vector<pair<string,string> >().swap(chain_pair_list); t2 = clock(); float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; diff --git a/modules/bindings/src/USalign/addChainID.cpp b/modules/bindings/src/USalign/addChainID.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7fb7ab4ef2d824682b47d7a11f29971e8e56669a --- /dev/null +++ b/modules/bindings/src/USalign/addChainID.cpp @@ -0,0 +1,129 @@ +#include <fstream> +#include <map> +#include <sstream> +#include <iostream> +#include <string> +#include <vector> +#include <cstdlib> +#include "pstream.h" + +using namespace std; + +void print_help() +{ + cout << +"Add chain ID to PDB format file.\n" +"\n" +"Usage: addChainID input.pdb output.pdb chainID\n" + <<endl; + exit(EXIT_SUCCESS); +} + +void splitlines(const string &line, vector<string> &lines, + const char delimiter='\n') +{ + bool within_word = false; + for (size_t pos=0;pos<line.size();pos++) + { + if (line[pos]==delimiter) + { + within_word = false; + continue; + } + if (!within_word) + { + within_word = true; + lines.push_back(""); + } + lines.back()+=line[pos]; + } +} + +void addChainID(const string &infile,const string &outfile, + const string &chainID) +{ + stringstream buf; + if (infile=="-") buf<<cin.rdbuf(); +#if defined(REDI_PSTREAM_H_SEEN) + else if (infile.size()>3 && infile.substr(infile.size()-3)==".gz") + { + redi::ipstream fp_gz; // if file is compressed + fp_gz.open("gunzip -c "+infile); + buf<<fp_gz.rdbuf(); + fp_gz.close(); + } +#endif + else + { + ifstream fp; + fp.open(infile.c_str(),ios::in); //ifstream fp(filename,ios::in); + buf<<fp.rdbuf(); + fp.close(); + } + vector<string> lines; + splitlines(buf.str(),lines); + buf.str(string()); + size_t l; + + for (l=0;l<lines.size();l++) + { + if (lines[l].substr(0,6)=="ATOM " || + lines[l].substr(0,6)=="HETATM") + { + if (lines[l].size()<22) + { + cerr<<"incomplete:"<<lines[l]<<endl; + continue; + } + buf<<lines[l].substr(0,20)<<chainID<<lines[l].substr(22)<<endl; + } + else if (lines[l].size()) + buf<<lines[l]<<endl; + lines[l].clear(); + } + + if (outfile=="-") + cout<<buf.str(); + else + { + ofstream fout; + fout.open(outfile.c_str(),ios::out); + fout<<buf.str(); + fout.close(); + } + buf.str(string()); + vector<string>().swap(lines); + return; +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) print_help(); + + string infile =""; + string outfile=""; + string chainID=""; + + for (int i=1; i<argc; i++) + { + if ( infile.size()==0) infile =argv[i]; + else if (outfile.size()==0) outfile=argv[i]; + else if (chainID.size()==0) chainID=argv[i]; + else + { + cerr<<"ERROR! no such option "<<argv[i]<<endl; + exit(1); + } + } + + if (outfile.size()==0) outfile="-"; + if (chainID.size()==0) chainID=" "; + else if (chainID.size()==1) chainID=" "+chainID; + else if (chainID.size()>2) chainID=chainID.substr(chainID.size()-2,2); + + addChainID(infile,outfile,chainID); + + infile.clear(); + outfile.clear(); + return 0; +} diff --git a/modules/bindings/src/USalign/basic_fun.h b/modules/bindings/src/USalign/basic_fun.h index 0fe0701199465f79f0fc340dc7c85a95f90f1205..624d27a35b141ce6293ed188f6b21d07591b0071 100644 --- a/modules/bindings/src/USalign/basic_fun.h +++ b/modules/bindings/src/USalign/basic_fun.h @@ -151,7 +151,8 @@ string Trim(const string &inputString) size_t get_PDB_lines(const string filename, vector<vector<string> >&PDB_lines, vector<string> &chainID_list, vector<int> &mol_vec, const int ter_opt, const int infmt_opt, - const string atom_opt, const int split_opt, const int het_opt) + const string atom_opt, const bool autojustify, const int split_opt, + const int het_opt) { size_t i=0; // resi i.e. atom index string line; @@ -188,14 +189,49 @@ size_t get_PDB_lines(const string filename, if (infmt_opt==0||infmt_opt==-1) // PDB format { + map<string,char> aa3to1; + aa3to1[" A"]=aa3to1[" DA"]='a'; + aa3to1[" C"]=aa3to1[" DC"]='c'; + aa3to1[" G"]=aa3to1[" DG"]='g'; + aa3to1[" U"]=aa3to1["PSU"]='u'; + aa3to1[" I"]=aa3to1[" DI"]='i'; + aa3to1[" T"]='t'; + aa3to1["ALA"]='A'; + aa3to1["CYS"]='C'; + aa3to1["ASP"]='D'; + aa3to1["GLU"]='E'; + aa3to1["PHE"]='F'; + aa3to1["GLY"]='G'; + aa3to1["HIS"]='H'; + aa3to1["ILE"]='I'; + aa3to1["LYS"]='K'; + aa3to1["LEU"]='L'; + aa3to1["MET"]=aa3to1["MSE"]='M'; + aa3to1["ASN"]='N'; + aa3to1["PRO"]='P'; + aa3to1["GLN"]='Q'; + aa3to1["ARG"]='R'; + aa3to1["SER"]='S'; + aa3to1["THR"]='T'; + aa3to1["VAL"]='V'; + aa3to1["TRP"]='W'; + aa3to1["TYR"]='Y'; + aa3to1["ASX"]='B'; + aa3to1["GLX"]='Z'; + aa3to1["SEC"]='U'; + aa3to1["PYL"]='O'; + + + string atom; + string resn; while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) { if (compress_type==-1) getline(cin, line); else if (compress_type) getline(fin_gz, line); else getline(fin, line); if (infmt_opt==-1 && line.compare(0,5,"loop_")==0) // PDBx/mmCIF - return get_PDB_lines(filename,PDB_lines,chainID_list, - mol_vec, ter_opt, 3, atom_opt, split_opt,het_opt); + return get_PDB_lines(filename,PDB_lines,chainID_list, mol_vec, + ter_opt, 3, atom_opt, autojustify, split_opt,het_opt); if (i > 0) { if (ter_opt>=1 && line.compare(0,3,"END")==0) break; @@ -208,20 +244,36 @@ size_t get_PDB_lines(const string filename, (line.compare(0, 6, "HETATM")==0 && het_opt==2 && line.compare(17,3, "MSE")==0))) { + atom=line.substr(12,4); + if (autojustify) + { + resn=line.substr(17,3); + if (aa3to1.count(resn)) + { + atom=Trim(atom); + if (atom.size()) + { + if (atom.size()>=2 && atom[atom.size()-1]=='*') + atom=atom.substr(0,atom.size()-1)+"'"; + if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + } + } + } if (atom_opt=="auto") { if (line[17]==' ' && (line[18]=='D'||line[18]==' ')) - select_atom=(line.compare(12,4," C3'")==0); - else select_atom=(line.compare(12,4," CA ")==0); + select_atom=(atom==" C3'"); + else select_atom=(atom==" CA "); } else if (atom_opt=="PC4'") { if (line[17]==' ' && (line[18]=='D'||line[18]==' ')) - select_atom=(line.compare(12,4," P ")==0 - )||(line.compare(12,4," C4'")==0); - else select_atom=(line.compare(12,4," CA ")==0); + select_atom=(atom==" P ")||(atom==" C4'"); + else select_atom=(atom==" CA "); } - else select_atom=(line.compare(12,4,atom_opt)==0); + else select_atom=(atom==atom_opt); if (select_atom) { if (!chainID) @@ -284,6 +336,8 @@ size_t get_PDB_lines(const string filename, } } } + + map<string,char>().swap(aa3to1); } else if (infmt_opt==1) // SPICKER format { @@ -752,6 +806,17 @@ void read_user_alignment(vector<string>&sequence, const string &fname_lign, return; } + +inline bool isfile(const string& filename) +{ + if (FILE *fp = fopen(filename.c_str(), "r")) + { + fclose(fp); + return true; + } + else return false; +} + /* read list of entries from 'name' to 'chain_list'. * dir_opt is the folder name (prefix). * suffix_opt is the file name extension (suffix_opt). @@ -764,14 +829,168 @@ void file2chainlist(vector<string>&chain_list, const string &name, if (! fp.is_open()) PrintErrorAndQuit(("Can not open file: "+name+'\n').c_str()); string line; + string filename; + int a,b; + string sep; while (fp.good()) { getline(fp, line); if (! line.size()) continue; - chain_list.push_back(dir_opt+Trim(line)+suffix_opt); + line=Trim(line); + for (a=0;a<=2;a++) + { + if (a==0) sep=""; + else if (a==1) sep="/"; + else if (a==2) sep="\\"; + + filename=dir_opt+sep+line+suffix_opt; + if (isfile(filename)) break; + if (suffix_opt.size()) + { + filename=dir_opt+sep+line; + if (isfile(filename)) break; + } + else + { + filename=dir_opt+sep+line+".pdb"; + if (isfile(filename)) break; + filename=dir_opt+sep+line+".cif"; + if (isfile(filename)) break; + } + filename.clear(); + } + + if (filename.size()==0) + { + filename=dir_opt+line+suffix_opt; + cerr<<"WARNING! "<<filename<<" does not exist"<<endl; + } + else chain_list.push_back(filename); + line.clear(); + filename.clear(); + } + fp.close(); +} + +void file2chainpairlist(vector<string>&chain1_list, vector<string>&chain2_list, + const string &name, const string &dirpair_opt, const string &suffix_opt) +{ + ifstream fp(name.c_str()); + if (! fp.is_open()) + PrintErrorAndQuit(("Can not open file: "+name+'\n').c_str()); + string line; + string filename; + int a,b; + size_t i; + string sep,filename1,filename2; + vector<string> line_vec; + while (fp.good()) + { + getline(fp, line); + if (! line.size()) continue; + line=Trim(line); + split(line, line_vec, '\t'); + if (line_vec.size()==2) + { + filename1=line_vec[0]; + filename2=line_vec[1]; + for (i=0;i<2;i++) line_vec[i].clear(); line_vec.clear(); + } + else + { + for (i=0;i<line_vec.size();i++) line_vec[i].clear(); line_vec.clear(); + split(line, line_vec, ' '); + if (line_vec.size()==2) + { + filename1=line_vec[0]; + filename2=line_vec[1]; + for (i=0;i<2;i++) line_vec[i].clear(); line_vec.clear(); + } + else + { + cerr<<"WARNING! not a chain pair: "<<line<<endl; + for (i=0;i<line_vec.size();i++) line_vec[i].clear(); line_vec.clear(); + continue; + } + } + + filename.clear(); + for (a=0;a<=2;a++) + { + if (a==0) sep=""; + else if (a==1) sep="/"; + else if (a==2) sep="\\"; + + filename=dirpair_opt+sep+filename1+suffix_opt; + if (isfile(filename)) break; + if (suffix_opt.size()) + { + filename=dirpair_opt+sep+line; + if (isfile(filename)) break; + } + else + { + filename=dirpair_opt+sep+line+".pdb"; + if (isfile(filename)) break; + filename=dirpair_opt+sep+line+".cif"; + if (isfile(filename)) break; + } + filename.clear(); + } + + if (filename.size()==0) + { + filename=dirpair_opt+filename1+suffix_opt; + cerr<<"WARNING! "<<filename<<" does not exist"<<endl; + continue; + } + else + { + filename1=filename; + filename.clear(); + } + + for (a=0;a<=2;a++) + { + if (a==0) sep=""; + else if (a==1) sep="/"; + else if (a==2) sep="\\"; + + filename=dirpair_opt+sep+filename2+suffix_opt; + if (isfile(filename)) break; + if (suffix_opt.size()) + { + filename=dirpair_opt+sep+line; + if (isfile(filename)) break; + } + else + { + filename=dirpair_opt+sep+line+".pdb"; + if (isfile(filename)) break; + filename=dirpair_opt+sep+line+".cif"; + if (isfile(filename)) break; + } + filename.clear(); + } + + if (filename.size()==0) + { + filename=dirpair_opt+filename2+suffix_opt; + cerr<<"WARNING! "<<filename<<" does not exist"<<endl; + continue; + } + else + { + filename2=filename; + filename.clear(); + } + + chain1_list.push_back(filename1); + chain2_list.push_back(filename2); + line.clear(); + filename.clear(); } fp.close(); - line.clear(); } #endif diff --git a/modules/bindings/src/USalign/pdb2fasta.cpp b/modules/bindings/src/USalign/pdb2fasta.cpp index e0fc71206d788719907bb275d776baf2f81e2a83..767b1dfe47dab8281c0746f33862bee39e362009 100644 --- a/modules/bindings/src/USalign/pdb2fasta.cpp +++ b/modules/bindings/src/USalign/pdb2fasta.cpp @@ -126,6 +126,8 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("ERROR! Molecule type must be one of the" "following:\nauto, prot (the same as 'protein'), and " "RNA (the same as 'DNA')."); + + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name if (mol_opt=="protein" && atom_opt=="auto") atom_opt=" CA "; else if (mol_opt=="RNA" && atom_opt=="auto") @@ -170,8 +172,8 @@ int main(int argc, char *argv[]) for (i=0;i<chain_list.size();i++) { xname=chain_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, mol_vec, + ter_opt, infmt_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname diff --git a/modules/bindings/src/USalign/pdb2ss.cpp b/modules/bindings/src/USalign/pdb2ss.cpp index d0732803d16652eb312ea337812998a761bd092a..501023a978f669d1844f33b9173871af1012b1fe 100644 --- a/modules/bindings/src/USalign/pdb2ss.cpp +++ b/modules/bindings/src/USalign/pdb2ss.cpp @@ -114,6 +114,8 @@ int main(int argc, char *argv[]) if (suffix_opt.size() && dir_opt.size()==0) PrintErrorAndQuit("-suffix is only valid if -dir is set"); + + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name if (atom_opt.size()!=4) PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") @@ -170,8 +172,8 @@ int main(int argc, char *argv[]) for (i=0;i<chain_list.size();i++) { xname=chain_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, mol_vec, + ter_opt, infmt_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname diff --git a/modules/bindings/src/USalign/pdb2xyz.cpp b/modules/bindings/src/USalign/pdb2xyz.cpp index d151f1e741b1a419482618d6dc08ac01197be2db..988dc9ae7443aa933d578329f405cf0fd9a37c63 100644 --- a/modules/bindings/src/USalign/pdb2xyz.cpp +++ b/modules/bindings/src/USalign/pdb2xyz.cpp @@ -98,6 +98,8 @@ int main(int argc, char *argv[]) if (suffix_opt.size() && dir_opt.size()==0) PrintErrorAndQuit("-suffix is only valid if -dir is set"); + + bool autojustify=(atom_opt=="auto" || atom_opt=="PC4'"); // auto re-pad atom name if (atom_opt.size()!=4) PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space."); if (split_opt==1 && ter_opt!=0) @@ -143,8 +145,8 @@ int main(int argc, char *argv[]) for (i=0;i<chain_list.size();i++) { xname=chain_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, mol_vec, + ter_opt, infmt_opt, atom_opt, autojustify, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname diff --git a/modules/bindings/src/USalign/pdbAtomName.cpp b/modules/bindings/src/USalign/pdbAtomName.cpp index d65c576d2f375343b7d1e95f2e75c926182022a8..2d67eb799dbe228a65a8ab878c5929b48948563d 100644 --- a/modules/bindings/src/USalign/pdbAtomName.cpp +++ b/modules/bindings/src/USalign/pdbAtomName.cpp @@ -4,6 +4,7 @@ #include <iostream> #include <string> #include <vector> +#include <cstdlib> #include "pstream.h" using namespace std; diff --git a/modules/bindings/src/USalign/qTMclust.cpp b/modules/bindings/src/USalign/qTMclust.cpp index 08fc64b68d12fc701cf3eff12f3c83885420fb1c..1cb9ba263763e6b4233be3012d4edbd361d095b4 100644 --- a/modules/bindings/src/USalign/qTMclust.cpp +++ b/modules/bindings/src/USalign/qTMclust.cpp @@ -80,6 +80,8 @@ void print_help(bool h_opt=false) " 1: treat each MODEL as a separate chain (-ter should be 0)\n" " 2: treat each chain as a seperate chain (-ter should be <=1)\n" "\n" +" -init tentative clustering\n" +"\n" " -h Print the full help message, including additional options.\n" "\n" <<endl; @@ -110,6 +112,32 @@ void filter_lower_bound(double &lb_HwRMSD, double &lb_TMfast, return; } +void read_init_cluster(const string&filename, + map<string, map<string,bool> > &init_cluster) +{ + ifstream fin; + string line; + vector<string> line_vec; + map<string, bool> tmp_map; + size_t i,j; + fin.open(filename.c_str()); + while (fin.good()) + { + getline(fin,line); + split(line,line_vec,'\t'); + for (i=0;i<line_vec.size();i++) + { + for (j=0;j<line_vec.size();j++) + if (i!=j) tmp_map[line_vec[j]]=1; + init_cluster[line_vec[i]]=tmp_map; + map<string, bool> ().swap(tmp_map); + } + for (i=0;i<line_vec.size();i++) line_vec[i].clear(); line_vec.clear(); + } + fin.close(); + vector<string>().swap(line_vec); +} + int main(int argc, char *argv[]) { if (argc < 2) print_help(); @@ -124,6 +152,7 @@ int main(int argc, char *argv[]) string xname = ""; double TMcut = 0.5; string fname_clust = ""; // file name for output cluster result + string fname_init = ""; string fname_lign = ""; // file name for user alignment vector<string> sequence; // get value from alignment file double Lnorm_ass, d0_scale; @@ -146,6 +175,7 @@ int main(int argc, char *argv[]) string dir_opt =""; // set -dir to empty int byresi_opt=0; // set -byresi to 0 vector<string> chain_list; + map<string, map<string,bool> > init_cluster; for(int i = 1; i < argc; i++) { @@ -228,6 +258,10 @@ int main(int argc, char *argv[]) { het_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-init") && i < (argc-1) ) + { + read_init_cluster(argv[i+1],init_cluster); i++; + } else if (xname.size() == 0) xname=argv[i]; else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); } @@ -317,7 +351,7 @@ int main(int argc, char *argv[]) { xname=chain_list[i]; newchainnum=get_PDB_lines(xname, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); + mol_vec, ter_opt, infmt_opt, atom_opt, false, split_opt, het_opt); if (!newchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -439,6 +473,7 @@ int main(int argc, char *argv[]) } sizePROT=index_vec.size(); + string key=chainID_list[chain_i]; cout<<'>'<<chainID_list[chain_i]<<'\t'<<xlen<<'\t' <<setiosflags(ios::fixed)<<setprecision(2) <<100.*i/Nstruct<<"%(#"<<i<<")\t" @@ -447,9 +482,14 @@ int main(int argc, char *argv[]) #ifdef TMalign_HwRMSD_h vector<pair<double,size_t> > HwRMSDscore_list; double TM; + size_t init_count=0; for (j=0;j<sizePROT;j++) { chain_j=index_vec[j]; + string value=chainID_list[chain_j]; + if (init_cluster.count(key) && init_count>=2 && + HwRMSDscore_list.size()>=init_cluster[key].size() && !init_cluster[key].count(value)) + continue; ylen=xyz_vec[chain_j].size(); if (mol_vec[chain_i]*mol_vec[chain_j]<0) continue; else if (s_opt==2 && xlen<TMcut*ylen) continue; @@ -461,6 +501,8 @@ int main(int argc, char *argv[]) if (s_opt<=1) filter_lower_bound(lb_HwRMSD, lb_TMfast, TMcut, s_opt, mol_vec[chain_i]+mol_vec[chain_j]); + //cout<<chainID_list[chain_i]<<" => "<<chainID_list[chain_j]<<endl; + NewArray(&ya, ylen, 3); for (r=0;r<ylen;r++) { @@ -509,7 +551,16 @@ int main(int argc, char *argv[]) Lave=sqrt(xlen*ylen); // geometry average because O(L1*L2) if (TM>=lb_HwRMSD || Lave<=fast_lb) - HwRMSDscore_list.push_back(make_pair(TM,index_vec[j])); + { + if (init_cluster.count(key) && init_cluster[key].count(value)) + { + HwRMSDscore_list.push_back(make_pair(TM+1,index_vec[j])); + init_count++; + if (init_count==init_cluster[key].size()) break; + } + else + HwRMSDscore_list.push_back(make_pair(TM,index_vec[j])); + } /* clean up after each HwRMSD */ seqM.clear(); @@ -529,6 +580,7 @@ int main(int argc, char *argv[]) if (xlen<=fast_lb) cur_repr_num_cutoff=max_repr_num; else if (xlen>fast_lb && xlen<fast_ub) cur_repr_num_cutoff+= (fast_ub-xlen)/(fast_ub-fast_lb)*(max_repr_num-min_repr_num); + //if (init_count>=2) cur_repr_num_cutoff=init_count; index_vec.clear(); for (j=0;j<HwRMSDscore_list.size();j++) @@ -715,6 +767,7 @@ int main(int argc, char *argv[]) clust_mem_vec.clear(); chainID_list.clear(); clust_repr_map.clear(); + map<string, map<string,bool> >().swap(init_cluster); t2 = clock(); float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; diff --git a/modules/bindings/src/USalign/readme.txt b/modules/bindings/src/USalign/readme.txt index 2a03302527c578b4e646739b2a2e5331b53091c7..79ae2068bdcf64161398e3bb09e038383895bf92 100644 --- a/modules/bindings/src/USalign/readme.txt +++ b/modules/bindings/src/USalign/readme.txt @@ -4,8 +4,8 @@ References to cite: (1) Chengxin Zhang, Morgan Shine, Anna Marie Pyle, Yang Zhang - (2022) Nat Methods - (2) Chengxin Zhang, Anna Marie Pyle (2022) iScience + (2022) Nat Methods. 19(9), 1109-1115. + (2) Chengxin Zhang, Anna Marie Pyle (2022) iScience. 25(10), 105218. DISCLAIMER: Permission to use, copy, modify, and distribute this program for @@ -72,6 +72,8 @@ 2022/06/23: Fix -m for Windows. Add pymol plugin. 2022/06/26: Add -full option for -mm 2 and 4 2022/09/24: Support -TMscore for complex when the chain order is different + 2023/06/09: Correct atom name justification in PDB file for standard amino + acids and nucleotides =============================================================================== ========================= diff --git a/modules/bindings/src/USalign/se.cpp b/modules/bindings/src/USalign/se.cpp index af24ae78cafc6b5a930522e938cac871409db55d..06492ffa7189b0f7400ad928676398d18c324910 100644 --- a/modules/bindings/src/USalign/se.cpp +++ b/modules/bindings/src/USalign/se.cpp @@ -318,8 +318,8 @@ int main(int argc, char *argv[]) { /* parse chain 1 */ xname=chain1_list[i]; - xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, - mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, mol_vec1, + ter_opt, infmt1_opt, atom_opt, false, split_opt, het_opt); if (!xchainnum) { cerr<<"Warning! Cannot parse file: "<<xname @@ -349,8 +349,8 @@ int main(int argc, char *argv[]) { yname=chain2_list[j]; ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, - mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, - het_opt); + mol_vec2, ter_opt, infmt2_opt, atom_opt, false, + split_opt, het_opt); if (!ychainnum) { cerr<<"Warning! Cannot parse file: "<<yname diff --git a/modules/bindings/src/wrap_tmalign.cc b/modules/bindings/src/wrap_tmalign.cc index 49da68d00061044d2cb4832f09b5907ce1121be4..9be1639a4f601ffddcc4d41d8fa85c9cb9a04ba2 100644 --- a/modules/bindings/src/wrap_tmalign.cc +++ b/modules/bindings/src/wrap_tmalign.cc @@ -430,6 +430,8 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, const std::vector<bool>& rna2, bool fast) { + std::map<int,int> chainmap; + // input checks if(pos_one.empty() || pos_two.empty()) { throw ost::Error("Cannot compute MMAlign on empty chains!"); @@ -722,6 +724,11 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, TMave_mat[i][j]=-1; continue; } + if (chainmap.size() && chainmap[i]!=j) + { + TMave_mat[i][j]=-1; + continue; + } ylen=ylen_vec[j]; if (ylen<3) @@ -755,8 +762,8 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, if (byresi_opt) { - int total_aln=extract_aln_from_resi(sequence, - seqx,seqy,resi_vec1,resi_vec2,xlen_vec,ylen_vec, i, j); + int total_aln=extract_aln_from_resi(sequence, seqx,seqy, + resi_vec1,resi_vec2,xlen_vec,ylen_vec, i, j, byresi_opt); seqxA_mat[i][j]=sequence[0]; seqyA_mat[i][j]=sequence[1]; if (total_aln>xlen+ylen-3) @@ -827,7 +834,7 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, /* refine alignment for large oligomers */ int aln_chain_num=count_assign_pair(assign1_list,chain1_num); bool is_oligomer=(aln_chain_num>=3); - if (aln_chain_num==2) // dimer alignment + if (aln_chain_num==2 && chainmap.size()==0) // dimer alignment { int na_chain_num1,na_chain_num2,aa_chain_num1,aa_chain_num2; count_na_aa_chain_num(na_chain_num1,aa_chain_num1,mol_vec1); @@ -849,7 +856,7 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, else is_oligomer=true; /* align oligomers to dimer */ } - if (aln_chain_num>=3 || is_oligomer) // oligomer alignment + if ((aln_chain_num>=3 || is_oligomer) && chainmap.size()==0) // oligomer alignment { /* extract centroid coordinates */ double **xcentroids; @@ -997,8 +1004,7 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, ylen_vec.clear(); // length of complex2 vector<string> ().swap(resi_vec1); // residue index for chain1 vector<string> ().swap(resi_vec2); // residue index for chain2 - - + map<int,int> ().swap(chainmap); return res; }