From 728729585f47c8f10909db4ede92a869411e265e Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Tue, 6 Feb 2024 09:35:48 +0100 Subject: [PATCH] update external USalign code --- modules/bindings/src/USalign/MMalign.cpp | 54 +++++++-- modules/bindings/src/USalign/MMalign.h | 142 +++++++++++++---------- modules/bindings/src/USalign/OST_INFO | 4 +- modules/bindings/src/USalign/USalign.cpp | 48 +++++++- modules/bindings/src/USalign/readme.txt | 2 + modules/bindings/src/wrap_tmalign.cc | 46 +++++++- 6 files changed, 214 insertions(+), 82 deletions(-) diff --git a/modules/bindings/src/USalign/MMalign.cpp b/modules/bindings/src/USalign/MMalign.cpp index 886cb7600..20f8308ab 100644 --- a/modules/bindings/src/USalign/MMalign.cpp +++ b/modules/bindings/src/USalign/MMalign.cpp @@ -9,7 +9,7 @@ void print_version() cout << "\n" " **********************************************************************\n" -" * MM-align (Version 20220412): complex structure alignment *\n" +" * MM-align (Version 20231222): complex structure alignment *\n" " * References: S Mukherjee, Y Zhang. Nucl Acids Res 37(11):e83 (2009) *\n" " * Please email comments and suggestions to yangzhanglab@umich.edu *\n" " **********************************************************************" @@ -360,6 +360,8 @@ int main(int argc, char *argv[]) len_na=(xlen_na+ylen_na)/2; } + map<int,int> chainmap; + /* perform monomer alignment if there is only one chain */ if (xa_vec.size()==1 && ya_vec.size()==1) { @@ -638,11 +640,49 @@ int main(int argc, char *argv[]) // score was from monomeric chain superpositions int max_iter=5-(int)((len_aa+len_na)/200); if (max_iter<2) max_iter=2; - MMalign_iter(max_total_score, max_iter, xa_vec, ya_vec, seqx_vec, seqy_vec, - secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, - xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, - TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, - d0_scale, fast_opt); + MMalign_iter(max_total_score, max_iter, xa_vec, ya_vec, + seqx_vec, seqy_vec, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, + ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, + chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, + sequence, d0_scale, fast_opt, chainmap); + + if (aln_chain_num>=4 && is_oligomer && chainmap.size()==0) // oligomer alignment + { + MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, 1, 0, 5, ter_opt, split_opt, + 0, 0, true, true, mirror_opt, resi_vec1, resi_vec2); + + + /* extract centroid coordinates */ + double **xcentroids; + double **ycentroids; + NewArray(&xcentroids, chain1_num, 3); + NewArray(&ycentroids, chain2_num, 3); + double d0MM=getmin( + calculate_centroids(xa_vec, chain1_num, xcentroids), + calculate_centroids(ya_vec, chain2_num, ycentroids)); + + /* refine enhanced greedy search with centroid superposition */ + //double het_deg=check_heterooligomer(TMave_mat, chain1_num, chain2_num); + homo_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na, ut_mat); + + hetero_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na); + + /* clean up */ + DeleteArray(&xcentroids, chain1_num); + DeleteArray(&ycentroids, chain2_num); + } /* sometime MMalign_iter is even worse than monomer alignment */ if (max_total_score<maxTMmono) @@ -667,7 +707,7 @@ int main(int argc, char *argv[]) secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, - d0_scale, fast_opt); + d0_scale, fast_opt, chainmap); } /* perform cross chain alignment diff --git a/modules/bindings/src/USalign/MMalign.h b/modules/bindings/src/USalign/MMalign.h index d9b1fb19f..71b1f784a 100644 --- a/modules/bindings/src/USalign/MMalign.h +++ b/modules/bindings/src/USalign/MMalign.h @@ -1495,13 +1495,12 @@ void MMalign_final( if (!full_opt) return; + if (outfmt_opt<=2) cout<<"# End of alignment for full complex. The following blocks list alignments for individual chains."<<endl; /* re-compute chain level alignment */ for (i=0;i<chain1_num;i++) { - j=assign1_list[i]; - if (j<0) continue; xlen=xlen_vec[i]; seqx = new char[xlen+1]; secx = new char[xlen+1]; @@ -1513,67 +1512,71 @@ void MMalign_final( NewArray(&xt, xlen, 3); do_rotation(xa, xt, xlen, t0, u0); - ylen=ylen_vec[j]; - if (ylen<3) + for (j=0;j<chain2_num;j++) { - TMave_mat[i][j]=-1; - continue; + ylen=ylen_vec[j]; + if (ylen<3) + { + TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + d0_out=5.0; + rmsd0 = 0.0; + Liden=0; + int *invmap = new int[ylen+1]; + seqM=""; + seqxA=""; + seqyA=""; + double Lnorm_ass=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; + sequence[0]=seqxA_mat[i][j]; + sequence[1]=seqyA_mat[i][j]; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 1, a_opt, 2, d_opt, mol_vec1[i]+mol_vec2[j], 1, invmap); + + //TM2=TM4*Lnorm_ass/xlen; + //TM1=TM4*Lnorm_ass/ylen; + //d0A=d0u; + //d0B=d0u; + TMave_mat[i][j]=TM4*Lnorm_ass; + + /* print result */ + if (j==assign1_list[i]) output_results(xname, yname, + chainID_list1[i].c_str(), chainID_list2[j].c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM_mat[i][j].c_str(), seqxA_mat[i][j].c_str(), + seqyA_mat[i][j].c_str(), Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + "", outfmt_opt, ter_opt, false, split_opt, 0, + "", false, a_opt, false, d_opt, 0, resi_vec1, resi_vec2); + + /* clean up */ + seqxA.clear(); + seqM.clear(); + seqyA.clear(); + sequence[0].clear(); + sequence[1].clear(); + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]invmap; } - seqy = new char[ylen+1]; - secy = new char[ylen+1]; - NewArray(&ya, ylen, 3); - copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], - ylen,ya,seqy,secy); - - /* declare variable specific to this pair of TMalign */ - d0_out=5.0; - rmsd0 = 0.0; - Liden=0; - int *invmap = new int[ylen+1]; - seqM=""; - seqxA=""; - seqyA=""; - double Lnorm_ass=len_aa; - if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; - sequence[0]=seqxA_mat[i][j]; - sequence[1]=seqyA_mat[i][j]; - - /* entry function for structure alignment */ - se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, - d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, - rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, - xlen, ylen, sequence, Lnorm_ass, d0_scale, - 1, a_opt, 2, d_opt, mol_vec1[i]+mol_vec2[j], 1, invmap); - - //TM2=TM4*Lnorm_ass/xlen; - //TM1=TM4*Lnorm_ass/ylen; - //d0A=d0u; - //d0B=d0u; - - /* print result */ - output_results(xname, yname, - chainID_list1[i].c_str(), chainID_list2[j].c_str(), - xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, - seqM_mat[i][j].c_str(), seqxA_mat[i][j].c_str(), - seqyA_mat[i][j].c_str(), Liden, n_ali8, L_ali, TM_ali, rmsd_ali, - TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, - "", outfmt_opt, ter_opt, false, split_opt, 0, - "", false, a_opt, false, d_opt, 0, resi_vec1, resi_vec2); - - /* clean up */ - seqxA.clear(); - seqM.clear(); - seqyA.clear(); - sequence[0].clear(); - sequence[1].clear(); - delete[]seqy; - delete[]secy; - DeleteArray(&ya,ylen); delete[]seqx; delete[]secx; DeleteArray(&xa,xlen); DeleteArray(&xt,xlen); - delete[]invmap; } sequence.clear(); return; @@ -1621,7 +1624,7 @@ void MMalign_iter(double & max_total_score, const int max_iter, int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat, vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat, int *assign1_list, int *assign2_list, vector<string>&sequence, - double d0_scale, bool fast_opt) + double d0_scale, bool fast_opt, map<int,int> &chainmap) { /* tmp assignment */ double total_score; @@ -1637,7 +1640,7 @@ void MMalign_iter(double & max_total_score, const int max_iter, copy_chain_assign_data(chain1_num, chain2_num, sequence_tmp, seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp); - + for (int iter=0;iter<max_iter;iter++) { total_score=MMalign_search(xa_vec, ya_vec, seqx_vec, seqy_vec, @@ -1646,14 +1649,25 @@ void MMalign_iter(double & max_total_score, const int max_iter, chain1_num, chain2_num, TMave_tmp, seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, sequence, d0_scale, fast_opt); + if (chainmap.size()) + { + int i,j; + for (i=0;i<chain1_num;i++) for (j=0;j<chain2_num;j++) + if (!chainmap.count(i) || chainmap[i]!=j) TMave_tmp[i][j]=-1; + } total_score=enhanced_greedy_search(TMave_tmp, assign1_tmp, assign2_tmp, chain1_num, chain2_num); //if (total_score<=0) PrintErrorAndQuit("ERROR! No assignable chain"); if (total_score<=max_total_score) break; max_total_score=total_score; - copy_chain_assign_data(chain1_num, chain2_num, sequence, - seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp, - seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + if (chainmap.size()) + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_tmp, seqyA_tmp, assign1_list, assign2_list, TMave_tmp, + seqxA_mat, seqyA_mat, assign1_tmp, assign2_tmp, TMave_mat); + else + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); } /* clean up everything */ @@ -2831,7 +2845,7 @@ void MMalign_cross(double & max_total_score, const int max_iter, int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat, vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat, int *assign1_list, int *assign2_list, vector<string>&sequence, - double d0_scale, bool fast_opt) + double d0_scale, bool fast_opt, map<int,int> &chainmap) { /* tmp assignment */ int *assign1_tmp, *assign2_tmp; @@ -2865,7 +2879,7 @@ void MMalign_cross(double & max_total_score, const int max_iter, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, - d0_scale, fast_opt); + d0_scale, fast_opt, chainmap); /* clean up everything */ delete [] assign1_tmp; diff --git a/modules/bindings/src/USalign/OST_INFO b/modules/bindings/src/USalign/OST_INFO index 7c8299cdd..888d01eb0 100644 --- a/modules/bindings/src/USalign/OST_INFO +++ b/modules/bindings/src/USalign/OST_INFO @@ -1,6 +1,6 @@ -Source code has been cloned Nov 14 2023 from: +Source code has been cloned Feb 6 2024 from: https://github.com/pylelab/USalign last commit: -58b42af9d58436279c21b4f4074db87f072fcc21 +444144c4edf0b43b9f573a1a4b8a0692c4c24ab3 diff --git a/modules/bindings/src/USalign/USalign.cpp b/modules/bindings/src/USalign/USalign.cpp index 151e3ca48..5f712642d 100644 --- a/modules/bindings/src/USalign/USalign.cpp +++ b/modules/bindings/src/USalign/USalign.cpp @@ -11,7 +11,7 @@ void print_version() cout << "\n" " ********************************************************************\n" -" * US-align (Version 20230609) *\n" +" * US-align (Version 20231222) *\n" " * Universal Structure Alignment of Proteins and Nucleic Acids *\n" " * Reference: C Zhang, M Shine, AM Pyle, Y Zhang. (2022) Nat Methods*\n" " * Please email comments and suggestions to zhang@zhanggroup.org *\n" @@ -742,7 +742,7 @@ int MMalign(const string &xname, const string &yname, TMave_mat[i][j]=-1; continue; } - if (chainmap.size() && chainmap[i]!=j) + if (chainmap.size() && (!chainmap.count(i) || chainmap[i]!=j)) { TMave_mat[i][j]=-1; continue; @@ -893,7 +893,7 @@ int MMalign(const string &xname, const string &yname, hetero_refined_greedy_search(TMave_mat, assign1_list, assign2_list, chain1_num, chain2_num, xcentroids, ycentroids, d0MM, len_aa+len_na); - + /* clean up */ DeleteArray(&xcentroids, chain1_num); DeleteArray(&ycentroids, chain2_num); @@ -922,7 +922,45 @@ int MMalign(const string &xname, const string &yname, seqx_vec, seqy_vec, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, - sequence, d0_scale, fast_opt); + sequence, d0_scale, fast_opt, chainmap); + + if (byresi_opt && aln_chain_num>=4 && is_oligomer && chainmap.size()==0) // oligomer alignment + { + MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, 1, 0, 5, ter_opt, split_opt, + 0, 0, true, true, mirror_opt, resi_vec1, resi_vec2); + + + /* extract centroid coordinates */ + double **xcentroids; + double **ycentroids; + NewArray(&xcentroids, chain1_num, 3); + NewArray(&ycentroids, chain2_num, 3); + double d0MM=getmin( + calculate_centroids(xa_vec, chain1_num, xcentroids), + calculate_centroids(ya_vec, chain2_num, ycentroids)); + + /* refine enhanced greedy search with centroid superposition */ + //double het_deg=check_heterooligomer(TMave_mat, chain1_num, chain2_num); + homo_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na, ut_mat); + + hetero_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na); + + /* clean up */ + DeleteArray(&xcentroids, chain1_num); + DeleteArray(&ycentroids, chain2_num); + } /* sometime MMalign_iter is even worse than monomer alignment */ if (byresi_opt==0 && max_total_score<maxTMmono) @@ -947,7 +985,7 @@ int MMalign(const string &xname, const string &yname, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, - d0_scale, fast_opt); + d0_scale, fast_opt, chainmap); } /* perform cross chain alignment diff --git a/modules/bindings/src/USalign/readme.txt b/modules/bindings/src/USalign/readme.txt index 79ae2068b..fb5281061 100644 --- a/modules/bindings/src/USalign/readme.txt +++ b/modules/bindings/src/USalign/readme.txt @@ -74,6 +74,8 @@ 2022/09/24: Support -TMscore for complex when the chain order is different 2023/06/09: Correct atom name justification in PDB file for standard amino acids and nucleotides + 2023/12/13: Refine chain assignment for -TMscore >=6 for large complex + 2023/12/22: Forbid chain assignment refinment for -chainmap =============================================================================== ========================= diff --git a/modules/bindings/src/wrap_tmalign.cc b/modules/bindings/src/wrap_tmalign.cc index 91544c06c..bc45f3cc0 100644 --- a/modules/bindings/src/wrap_tmalign.cc +++ b/modules/bindings/src/wrap_tmalign.cc @@ -725,7 +725,7 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, TMave_mat[i][j]=-1; continue; } - if (chainmap.size() && chainmap[i]!=j) + if (chainmap.size() && (!chainmap.count(i) || chainmap[i]!=j)) { TMave_mat[i][j]=-1; continue; @@ -876,7 +876,7 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, hetero_refined_greedy_search(TMave_mat, assign1_list, assign2_list, chain1_num, chain2_num, xcentroids, ycentroids, d0MM, len_aa+len_na); - + /* clean up */ DeleteArray(&xcentroids, chain1_num); DeleteArray(&ycentroids, chain2_num); @@ -905,7 +905,45 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, seqx_vec, seqy_vec, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, - sequence, d0_scale, fast_opt); + sequence, d0_scale, fast_opt, chainmap); + + if (byresi_opt && aln_chain_num>=4 && is_oligomer && chainmap.size()==0) // oligomer alignment + { + MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, 1, 0, 5, ter_opt, split_opt, + 0, 0, true, true, mirror_opt, resi_vec1, resi_vec2); + + + /* extract centroid coordinates */ + double **xcentroids; + double **ycentroids; + NewArray(&xcentroids, chain1_num, 3); + NewArray(&ycentroids, chain2_num, 3); + double d0MM=getmin( + calculate_centroids(xa_vec, chain1_num, xcentroids), + calculate_centroids(ya_vec, chain2_num, ycentroids)); + + /* refine enhanced greedy search with centroid superposition */ + //double het_deg=check_heterooligomer(TMave_mat, chain1_num, chain2_num); + homo_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na, ut_mat); + + hetero_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na); + + /* clean up */ + DeleteArray(&xcentroids, chain1_num); + DeleteArray(&ycentroids, chain2_num); + } /* sometime MMalign_iter is even worse than monomer alignment */ if (byresi_opt==0 && max_total_score<maxTMmono) @@ -930,7 +968,7 @@ MMAlignResult WrappedMMAlign(const std::vector<geom::Vec3List>& pos_one, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, - d0_scale, fast_opt); + d0_scale, fast_opt, chainmap); } /* perform cross chain alignment -- GitLab