From b4d4648a455ad35cd5707f8e63ed516e2fb3644d Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Thu, 4 May 2023 17:31:55 +0200 Subject: [PATCH] Replace older TMalign source code with USalign source code Enables RNA comparison for simple chain by chain comparison. Possible to extend to multichain functionality as implemented in USalign. --- modules/bindings/doc/tmtools.rst | 24 +- modules/bindings/pymod/export_tmalign.cc | 6 +- .../src/{tmalign => USalign}/BLOSUM.h | 0 modules/bindings/src/USalign/Dockerfile | 25 + .../src/{tmalign => USalign}/HwRMSD.cpp | 0 .../src/{tmalign => USalign}/HwRMSD.h | 2 +- .../src/{tmalign => USalign}/Kabsch.h | 0 .../src/{tmalign => USalign}/MMalign.cpp | 138 +- modules/bindings/src/USalign/MMalign.h | 3040 ++++++++++++++++ .../bindings/src/{tmalign => USalign}/NW.h | 94 + .../src/{tmalign => USalign}/NWalign.cpp | 0 .../src/{tmalign => USalign}/NWalign.h | 198 +- modules/bindings/src/USalign/OST_INFO | 6 + .../src/{tmalign => USalign}/PDB1.pdb | 0 .../src/{tmalign => USalign}/PDB2.pdb | 0 modules/bindings/src/USalign/SOIalign.h | 959 +++++ .../src/{tmalign => USalign}/TMalign.cpp | 30 +- .../src/{tmalign => USalign}/TMalign.h | 356 +- .../src/{tmalign => USalign}/TMscore.cpp | 36 +- .../src/{tmalign => USalign}/TMscore.h | 17 +- modules/bindings/src/USalign/USalign.cpp | 3137 +++++++++++++++++ .../src/{tmalign => USalign}/align.txt | 0 .../src/{tmalign => USalign}/basic_fun.h | 287 +- modules/bindings/src/USalign/cif2pdb.cpp | 533 +++ modules/bindings/src/USalign/flexalign.h | 1826 ++++++++++ .../src/{tmalign => USalign}/param_set.h | 0 .../src/{tmalign => USalign}/pdb2fasta.cpp | 34 +- .../src/{tmalign => USalign}/pdb2ss.cpp | 0 .../src/{tmalign => USalign}/pdb2xyz.cpp | 0 modules/bindings/src/USalign/pdbAtomName.cpp | 232 ++ .../src/{tmalign => USalign}/pstream.h | 7 +- modules/bindings/src/USalign/qTMclust.cpp | 723 ++++ .../src/{tmalign => USalign}/readme.txt | 61 +- .../bindings/src/{tmalign => 
USalign}/se.cpp | 18 +- .../bindings/src/{tmalign => USalign}/se.h | 66 +- modules/bindings/src/USalign/usalign.py | 132 + .../src/{tmalign => USalign}/xyz_sfetch.cpp | 28 +- modules/bindings/src/tmalign/.gitignore | 17 - modules/bindings/src/tmalign/MMalign.h | 1194 ------- modules/bindings/src/tmalign/OST_INFO | 7 - modules/bindings/src/wrap_tmalign.cc | 106 +- modules/bindings/src/wrap_tmalign.hh | 3 +- 42 files changed, 11687 insertions(+), 1655 deletions(-) rename modules/bindings/src/{tmalign => USalign}/BLOSUM.h (100%) create mode 100644 modules/bindings/src/USalign/Dockerfile rename modules/bindings/src/{tmalign => USalign}/HwRMSD.cpp (100%) rename modules/bindings/src/{tmalign => USalign}/HwRMSD.h (96%) rename modules/bindings/src/{tmalign => USalign}/Kabsch.h (100%) rename modules/bindings/src/{tmalign => USalign}/MMalign.cpp (81%) create mode 100644 modules/bindings/src/USalign/MMalign.h rename modules/bindings/src/{tmalign => USalign}/NW.h (80%) rename modules/bindings/src/{tmalign => USalign}/NWalign.cpp (100%) rename modules/bindings/src/{tmalign => USalign}/NWalign.h (72%) create mode 100644 modules/bindings/src/USalign/OST_INFO rename modules/bindings/src/{tmalign => USalign}/PDB1.pdb (100%) rename modules/bindings/src/{tmalign => USalign}/PDB2.pdb (100%) create mode 100644 modules/bindings/src/USalign/SOIalign.h rename modules/bindings/src/{tmalign => USalign}/TMalign.cpp (93%) rename modules/bindings/src/{tmalign => USalign}/TMalign.h (88%) rename modules/bindings/src/{tmalign => USalign}/TMscore.cpp (91%) rename modules/bindings/src/{tmalign => USalign}/TMscore.h (95%) create mode 100644 modules/bindings/src/USalign/USalign.cpp rename modules/bindings/src/{tmalign => USalign}/align.txt (100%) rename modules/bindings/src/{tmalign => USalign}/basic_fun.h (78%) create mode 100644 modules/bindings/src/USalign/cif2pdb.cpp create mode 100644 modules/bindings/src/USalign/flexalign.h rename modules/bindings/src/{tmalign => USalign}/param_set.h (100%) 
rename modules/bindings/src/{tmalign => USalign}/pdb2fasta.cpp (79%) rename modules/bindings/src/{tmalign => USalign}/pdb2ss.cpp (100%) rename modules/bindings/src/{tmalign => USalign}/pdb2xyz.cpp (100%) create mode 100644 modules/bindings/src/USalign/pdbAtomName.cpp rename modules/bindings/src/{tmalign => USalign}/pstream.h (99%) create mode 100644 modules/bindings/src/USalign/qTMclust.cpp rename modules/bindings/src/{tmalign => USalign}/readme.txt (66%) rename modules/bindings/src/{tmalign => USalign}/se.cpp (94%) rename modules/bindings/src/{tmalign => USalign}/se.h (73%) create mode 100644 modules/bindings/src/USalign/usalign.py rename modules/bindings/src/{tmalign => USalign}/xyz_sfetch.cpp (83%) delete mode 100644 modules/bindings/src/tmalign/.gitignore delete mode 100644 modules/bindings/src/tmalign/MMalign.h delete mode 100644 modules/bindings/src/tmalign/OST_INFO diff --git a/modules/bindings/doc/tmtools.rst b/modules/bindings/doc/tmtools.rst index 823a35eca..76e3f5d48 100644 --- a/modules/bindings/doc/tmtools.rst +++ b/modules/bindings/doc/tmtools.rst @@ -18,13 +18,14 @@ Citation: Y. Zhang and J. Skolnick, Nucl. Acids Res. 2005 33, 2302-9 Besides using the standalone TM-align program, ost also provides a wrapper -around TM-align as published in: +around USalign as published in: - Sha Gong, Chengxin Zhang, Yang Zhang, Bioinformatics 2019 + Chengxin Zhang, Morgan Shine, Anna Marie Pyle, Yang Zhang + (2022) Nat Methods The advantage is that no intermediate files must be generated, a wrapper on the -c++ layer is used instead. However, only the basic TM-align superposition -functionality is available. +c++ layer is used instead. However, only the basic TM-align superposition between +single chains is available. @@ -122,9 +123,12 @@ generated in order to call the executable. 
The positions and sequences are directly extracted from the chain residues for every residue that fulfills: - * peptide linking + * peptide linking and valid CA atom OR nucleotide linking and valid C3' + atom * valid one letter code(no '?') - * valid CA atom + + The function automatically identifies whether the chains consist of peptide + or RNA residues. An error is raised if the two types are mixed. :param chain1: Chain from which position and sequence are extracted to run TMalign. @@ -137,20 +141,22 @@ generated in order to call the executable. :rtype: :class:`ost.bindings.TMAlignResult` -.. method:: WrappedTMAlign(pos1, pos2, seq1, seq2 [fast=False]) +.. method:: WrappedTMAlign(pos1, pos2, seq1, seq2 [fast=False, rna=False]) Similar as described above, but directly feeding in raw data. - :param pos1: CA positions of the first chain - :param pos2: CA positions of the second chain, this is the reference. + :param pos1: CA/C3' positions of the first chain + :param pos2: CA/C3' positions of the second chain, this is the reference. :param seq1: Sequence of first chain :param seq2: Sequence of second chain :param fast: Whether to apply the *fast* flag to TMAlign + :param rna: Whether to treat as RNA :type pos1: :class:`ost.geom.Vec3List` :type pos2: :class:`ost.geom.Vec3List` :type seq1: :class:`ost.seq.SequenceHandle` :type seq2: :class:`ost.seq.SequenceHandle` :type fast: :class:`bool` + :type rna: :class:`bool` :rtype: :class:`ost.bindings.TMAlignResult` :raises: :class:`ost.Error` if pos1 and seq1, pos2 and seq2 respectively are not consistent in size. 
diff --git a/modules/bindings/pymod/export_tmalign.cc b/modules/bindings/pymod/export_tmalign.cc index aefe33ec7..f6d94a2d7 100644 --- a/modules/bindings/pymod/export_tmalign.cc +++ b/modules/bindings/pymod/export_tmalign.cc @@ -26,9 +26,9 @@ ost::bindings::TMAlignResult WrapTMAlignPos(const geom::Vec3List& pos_one, const geom::Vec3List& pos_two, const ost::seq::SequenceHandle& seq1, const ost::seq::SequenceHandle& seq2, - bool fast) { + bool fast, bool rna) { - return ost::bindings::WrappedTMAlign(pos_one, pos_two, seq1, seq2, fast); + return ost::bindings::WrappedTMAlign(pos_one, pos_two, seq1, seq2, fast, rna); } ost::bindings::TMAlignResult WrapTMAlignView(const ost::mol::ChainView& chain1, @@ -51,7 +51,7 @@ void export_TMAlign() { ; def("WrappedTMAlign", &WrapTMAlignPos, (arg("pos1"), arg("pos2"), arg("seq1"), arg("seq2"), - arg("fast")=false)); + arg("fast")=false, arg("rna")=false)); def("WrappedTMAlign", &WrapTMAlignView, (arg("chain1"), arg("chain2"), arg("fast")=false)); diff --git a/modules/bindings/src/tmalign/BLOSUM.h b/modules/bindings/src/USalign/BLOSUM.h similarity index 100% rename from modules/bindings/src/tmalign/BLOSUM.h rename to modules/bindings/src/USalign/BLOSUM.h diff --git a/modules/bindings/src/USalign/Dockerfile b/modules/bindings/src/USalign/Dockerfile new file mode 100644 index 000000000..26c4bf271 --- /dev/null +++ b/modules/bindings/src/USalign/Dockerfile @@ -0,0 +1,25 @@ +FROM gcc:12.2 as build +COPY . 
/usr/src/usalign +WORKDIR /usr/src/usalign +RUN make -j +RUN strip qTMclust USalign TMalign TMscore MMalign se pdb2xyz xyz_sfetch pdb2fasta pdb2ss NWalign HwRMSD cif2pdb + +# Don't use alpine since we need ubuntu's support +FROM ubuntu:latest +RUN mkdir /usr/bin/usalign +WORKDIR /usr/bin/usalign +COPY --from=build /usr/src/usalign/qTMclust /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/USalign /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/TMalign /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/TMscore /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/MMalign /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/se /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/pdb2xyz /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/xyz_sfetch /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/pdb2fasta /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/pdb2ss /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/NWalign /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/HwRMSD /usr/bin/usalign/ +COPY --from=build /usr/src/usalign/cif2pdb /usr/bin/usalign/ + +CMD "/bin/bash" diff --git a/modules/bindings/src/tmalign/HwRMSD.cpp b/modules/bindings/src/USalign/HwRMSD.cpp similarity index 100% rename from modules/bindings/src/tmalign/HwRMSD.cpp rename to modules/bindings/src/USalign/HwRMSD.cpp diff --git a/modules/bindings/src/tmalign/HwRMSD.h b/modules/bindings/src/USalign/HwRMSD.h similarity index 96% rename from modules/bindings/src/tmalign/HwRMSD.h rename to modules/bindings/src/USalign/HwRMSD.h index 8a29399cd..8e0d0b2e4 100644 --- a/modules/bindings/src/tmalign/HwRMSD.h +++ b/modules/bindings/src/USalign/HwRMSD.h @@ -140,7 +140,7 @@ int HwRMSD_main(double **xa, double **ya, const char *seqx, const char *seqy, if (n_ali8_tmp==0) { - cerr<<"WARNING! zero aligned residue in iteration "<<iter<<endl; + //cerr<<"WARNING! 
zero aligned residue in iteration "<<iter<<endl; if (xlen>=ylen) seqxA_tmp=(string)(seqx); if (xlen<=ylen) seqyA_tmp=(string)(seqy); if (xlen<ylen) diff --git a/modules/bindings/src/tmalign/Kabsch.h b/modules/bindings/src/USalign/Kabsch.h similarity index 100% rename from modules/bindings/src/tmalign/Kabsch.h rename to modules/bindings/src/USalign/Kabsch.h diff --git a/modules/bindings/src/tmalign/MMalign.cpp b/modules/bindings/src/USalign/MMalign.cpp similarity index 81% rename from modules/bindings/src/tmalign/MMalign.cpp rename to modules/bindings/src/USalign/MMalign.cpp index 6cc485647..816798b24 100644 --- a/modules/bindings/src/tmalign/MMalign.cpp +++ b/modules/bindings/src/USalign/MMalign.cpp @@ -9,7 +9,7 @@ void print_version() cout << "\n" " **********************************************************************\n" -" * MM-align (Version 20200519): complex structure alignment *\n" +" * MM-align (Version 20220412): complex structure alignment *\n" " * References: S Mukherjee, Y Zhang. 
Nucl Acids Res 37(11):e83 (2009) *\n" " * Please email comments and suggestions to yangzhanglab@umich.edu *\n" " **********************************************************************" @@ -440,36 +440,34 @@ int main(int argc, char *argv[]) t2 = clock(); float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; - printf("Total CPU time is %5.2f seconds\n", diff); + printf("#Total CPU time is %5.2f seconds\n", diff); return 0; } /* declare TM-score tables */ int chain1_num=xa_vec.size(); int chain2_num=ya_vec.size(); - double **TM1_mat; - double **TM2_mat; + vector<string> tmp_str_vec(chain2_num,""); double **TMave_mat; double **ut_mat; // rotation matrices for all-against-all alignment int ui,uj,ut_idx; - NewArray(&TM1_mat,chain1_num,chain2_num); - NewArray(&TM2_mat,chain1_num,chain2_num); NewArray(&TMave_mat,chain1_num,chain2_num); NewArray(&ut_mat,chain1_num*chain2_num,4*3); - vector<string> tmp_str_vec(chain2_num,""); vector<vector<string> >seqxA_mat(chain1_num,tmp_str_vec); vector<vector<string> > seqM_mat(chain1_num,tmp_str_vec); vector<vector<string> >seqyA_mat(chain1_num,tmp_str_vec); - tmp_str_vec.clear(); + + double maxTMmono=-1; + int maxTMmono_i,maxTMmono_j; /* get all-against-all alignment */ + if (len_aa+len_na>500) fast_opt=true; for (i=0;i<chain1_num;i++) { xlen=xlen_vec[i]; if (xlen<3) { - for (j=0;j<chain2_num;j++) - TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + for (j=0;j<chain2_num;j++) TMave_mat[i][j]=-1; continue; } seqx = new char[xlen+1]; @@ -489,14 +487,14 @@ int main(int argc, char *argv[]) if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment { - TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + TMave_mat[i][j]=-1; continue; } ylen=ylen_vec[j]; if (ylen<3) { - TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; + TMave_mat[i][j]=-1; continue; } seqy = new char[ylen+1]; @@ -530,18 +528,22 @@ int main(int argc, char *argv[]) seqM, seqxA, seqyA, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, xlen, ylen, sequence, Lnorm_tmp, d0_scale, - 
0, false, true, false, true, + 0, false, true, false, fast_opt, mol_vec1[i]+mol_vec2[j],TMcut); /* store result */ for (ui=0;ui<3;ui++) for (uj=0;uj<3;uj++) ut_mat[ut_idx][ui*3+uj]=u0[ui][uj]; for (uj=0;uj<3;uj++) ut_mat[ut_idx][9+uj]=t0[uj]; - TM1_mat[i][j]=TM2; // normalized by chain1 - TM2_mat[i][j]=TM1; // normalized by chain2 seqxA_mat[i][j]=seqxA; seqyA_mat[i][j]=seqyA; TMave_mat[i][j]=TM4*Lnorm_tmp; + if (TMave_mat[i][j]>maxTMmono) + { + maxTMmono=TMave_mat[i][j]; + maxTMmono_i=i; + maxTMmono_j=j; + } /* clean up */ seqM.clear(); @@ -568,8 +570,7 @@ int main(int argc, char *argv[]) if (total_score<=0) PrintErrorAndQuit("ERROR! No assignable chain"); /* refine alignment for large oligomers */ - int aln_chain_num=0; - for (i=0;i<chain1_num;i++) aln_chain_num+=(assign1_list[i]>=0); + int aln_chain_num=count_assign_pair(assign1_list,chain1_num); bool is_oligomer=(aln_chain_num>=3); if (aln_chain_num==2) // dimer alignment { @@ -617,22 +618,90 @@ int main(int argc, char *argv[]) DeleteArray(&xcentroids, chain1_num); DeleteArray(&ycentroids, chain2_num); } - if (len_aa+len_na>1000) fast_opt=true; + + /* store initial assignment */ + int init_pair_num=count_assign_pair(assign1_list,chain1_num); + int *assign1_init, *assign2_init; + assign1_init=new int[chain1_num]; + assign2_init=new int[chain2_num]; + double **TMave_init; + NewArray(&TMave_init,chain1_num,chain2_num); + vector<vector<string> >seqxA_init(chain1_num,tmp_str_vec); + vector<vector<string> >seqyA_init(chain1_num,tmp_str_vec); + vector<string> sequence_init; + copy_chain_assign_data(chain1_num, chain2_num, sequence_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init); /* perform iterative alignment */ - for (int iter=0;iter<1;iter++) + double max_total_score=0; // ignore old total_score because previous + // score was from monomeric chain superpositions + int max_iter=5-(int)((len_aa+len_na)/200); + if (max_iter<2) 
max_iter=2; + MMalign_iter(max_total_score, max_iter, xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, fast_opt); + + /* sometime MMalign_iter is even worse than monomer alignment */ + if (max_total_score<maxTMmono) { - total_score=MMalign_search(xa_vec, ya_vec, seqx_vec, seqy_vec, + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + for (i=0;i<chain1_num;i++) + { + if (i!=maxTMmono_i) assign1_list[i]=-1; + else assign1_list[i]=maxTMmono_j; + } + for (j=0;j<chain2_num;j++) + { + if (j!=maxTMmono_j) assign2_list[j]=-1; + else assign2_list[j]=maxTMmono_i; + } + sequence[0]=seqxA_mat[maxTMmono_i][maxTMmono_j]; + sequence[1]=seqyA_mat[maxTMmono_i][maxTMmono_j]; + max_total_score=maxTMmono; + MMalign_iter(max_total_score, max_iter, xa_vec, ya_vec, seqx_vec, seqy_vec, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, - xa, ya, seqx, seqy, secx, secy, len_aa, len_na, - chain1_num, chain2_num, TM1_mat, TM2_mat, TMave_mat, - seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, - d0_scale, true); - total_score=enhanced_greedy_search(TMave_mat, assign1_list, - assign2_list, chain1_num, chain2_num); - if (total_score<=0) PrintErrorAndQuit("ERROR! 
No assignable chain"); + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, fast_opt); } + /* perform cross chain alignment + * in some cases, this leads to dramatic improvement, esp for homodimer */ + int iter_pair_num=count_assign_pair(assign1_list,chain1_num); + if (iter_pair_num>=init_pair_num) copy_chain_assign_data( + chain1_num, chain2_num, sequence_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init); + double max_total_score_cross=max_total_score; + + //if (init_pair_num!=2 && is_oligomer==false) MMalign_cross( + //max_total_score_cross, max_iter, xa_vec, ya_vec, seqx_vec, seqy_vec, + //secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + //xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + //TMave_init, seqxA_init, seqyA_init, assign1_init, assign2_init, sequence_init, + //d0_scale, true); + //else + if (len_aa+len_na<10000) + { + MMalign_dimer(max_total_score_cross, xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_init, seqxA_init, seqyA_init, assign1_init, assign2_init, + sequence_init, d0_scale, fast_opt); + if (max_total_score_cross>max_total_score) + { + max_total_score=max_total_score_cross; + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + } + } + /* final alignment */ if (outfmt_opt==0) print_version(); MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), @@ -641,7 +710,7 @@ int main(int argc, char *argv[]) xa_vec, ya_vec, seqx_vec, seqy_vec, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, 
len_na, - chain1_num, chain2_num, TM1_mat, TM2_mat, TMave_mat, + chain1_num, chain2_num, TMave_mat, seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, d0_scale, m_opt, o_opt, outfmt_opt, ter_opt, split_opt, a_opt, d_opt, fast_opt, full_opt, mirror_opt, resi_vec1, resi_vec2); @@ -649,13 +718,18 @@ int main(int argc, char *argv[]) /* clean up everything */ delete [] assign1_list; delete [] assign2_list; - DeleteArray(&TM1_mat, chain1_num); - DeleteArray(&TM2_mat, chain1_num); DeleteArray(&TMave_mat,chain1_num); DeleteArray(&ut_mat, chain1_num*chain2_num); vector<vector<string> >().swap(seqxA_mat); vector<vector<string> >().swap(seqM_mat); vector<vector<string> >().swap(seqyA_mat); + vector<string>().swap(tmp_str_vec); + + delete [] assign1_init; + delete [] assign2_init; + DeleteArray(&TMave_init,chain1_num); + vector<vector<string> >().swap(seqxA_init); + vector<vector<string> >().swap(seqyA_init); vector<vector<vector<double> > >().swap(xa_vec); // structure of complex1 vector<vector<vector<double> > >().swap(ya_vec); // structure of complex2 @@ -672,9 +746,11 @@ int main(int argc, char *argv[]) vector<string>().swap(chain1_list); vector<string>().swap(chain2_list); vector<string>().swap(sequence); + vector<string>().swap(resi_vec1); // residue index for chain1 + vector<string>().swap(resi_vec2); // residue index for chain2 t2 = clock(); float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; - printf("Total CPU time is %5.2f seconds\n", diff); + printf("#Total CPU time is %5.2f seconds\n", diff); return 0; } diff --git a/modules/bindings/src/USalign/MMalign.h b/modules/bindings/src/USalign/MMalign.h new file mode 100644 index 000000000..4b480da62 --- /dev/null +++ b/modules/bindings/src/USalign/MMalign.h @@ -0,0 +1,3040 @@ +#include <cfloat> +#include "se.h" + +/* count the number of nucleic acid chains (na_chain_num) and + * protein chains (aa_chain_num) in a complex */ +int count_na_aa_chain_num(int &na_chain_num,int &aa_chain_num, + const 
vector<int>&mol_vec) +{ + na_chain_num=0; + aa_chain_num=0; + for (size_t i=0;i<mol_vec.size();i++) + { + if (mol_vec[i]>0) na_chain_num++; + else aa_chain_num++; + } + return na_chain_num+aa_chain_num; +} + +/* adjust chain assignment for dimer-dimer alignment + * return true if assignment is adjusted */ +bool adjust_dimer_assignment( + const vector<vector<vector<double> > >&xa_vec, + const vector<vector<vector<double> > >&ya_vec, + const vector<int>&xlen_vec, const vector<int>&ylen_vec, + const vector<int>&mol_vec1, const vector<int>&mol_vec2, + int *assign1_list, int *assign2_list, + const vector<vector<string> >&seqxA_mat, + const vector<vector<string> >&seqyA_mat) +{ + /* check currently assigned chains */ + int i1,i2,j1,j2; + i1=i2=j1=j2=-1; + int chain1_num=xa_vec.size(); + int i,j; + for (i=0;i<chain1_num;i++) + { + if (assign1_list[i]>=0) + { + if (i1<0) + { + i1=i; + j1=assign1_list[i1]; + } + else + { + i2=i; + j2=assign1_list[i2]; + } + } + } + + /* normalize d0 by L */ + int xlen=xlen_vec[i1]+xlen_vec[i2]; + int ylen=ylen_vec[j1]+ylen_vec[j2]; + int mol_type=mol_vec1[i1]+mol_vec1[i2]+ + mol_vec2[j1]+mol_vec2[j2]; + double D0_MIN, d0, d0_search; + double Lnorm=getmin(xlen,ylen); + parameter_set4final(getmin(xlen,ylen), D0_MIN, Lnorm, d0, + d0_search, mol_type); + + double **xa,**ya, **xt; + NewArray(&xa, xlen, 3); + NewArray(&ya, ylen, 3); + NewArray(&xt, xlen, 3); + + double RMSD = 0; + double dd = 0; + double t[3]; + double u[3][3]; + size_t L_ali=0; // index of residue in aligned region + size_t r=0; // index of residue in full alignment + + /* total score using current assignment */ + L_ali=0; + i=j=-1; + for (r=0;r<seqxA_mat[i1][j1].size();r++) + { + i+=(seqxA_mat[i1][j1][r]!='-'); + j+=(seqyA_mat[i1][j1][r]!='-'); + if (seqxA_mat[i1][j1][r]=='-' || seqyA_mat[i1][j1][r]=='-') continue; + xa[L_ali][0]=xa_vec[i1][i][0]; + xa[L_ali][1]=xa_vec[i1][i][1]; + xa[L_ali][2]=xa_vec[i1][i][2]; + ya[L_ali][0]=ya_vec[j1][j][0]; + ya[L_ali][1]=ya_vec[j1][j][1]; 
+ ya[L_ali][2]=ya_vec[j1][j][2]; + L_ali++; + } + i=j=-1; + for (r=0;r<seqxA_mat[i2][j2].size();r++) + { + i+=(seqxA_mat[i2][j2][r]!='-'); + j+=(seqyA_mat[i2][j2][r]!='-'); + if (seqxA_mat[i2][j2][r]=='-' || seqyA_mat[i2][j2][r]=='-') continue; + xa[L_ali][0]=xa_vec[i2][i][0]; + xa[L_ali][1]=xa_vec[i2][i][1]; + xa[L_ali][2]=xa_vec[i2][i][2]; + ya[L_ali][0]=ya_vec[j2][j][0]; + ya[L_ali][1]=ya_vec[j2][j][1]; + ya[L_ali][2]=ya_vec[j2][j][2]; + L_ali++; + } + + Kabsch(xa, ya, L_ali, 1, &RMSD, t, u); + do_rotation(xa, xt, L_ali, t, u); + + double total_score1=0; + for (r=0;r<L_ali;r++) + { + dd=dist(xt[r],ya[r]); + total_score1+=1/(1+dd/(d0*d0)); /* scale dd by d0^2, same form as calMMscore; dd/d0*d0 cancelled d0 */ + } + total_score1/=Lnorm; + + /* total score using reversed assignment */ + L_ali=0; + i=j=-1; + for (r=0;r<seqxA_mat[i1][j2].size();r++) + { + i+=(seqxA_mat[i1][j2][r]!='-'); + j+=(seqyA_mat[i1][j2][r]!='-'); + if (seqxA_mat[i1][j2][r]=='-' || seqyA_mat[i1][j2][r]=='-') continue; + xa[L_ali][0]=xa_vec[i1][i][0]; + xa[L_ali][1]=xa_vec[i1][i][1]; + xa[L_ali][2]=xa_vec[i1][i][2]; + ya[L_ali][0]=ya_vec[j2][j][0]; + ya[L_ali][1]=ya_vec[j2][j][1]; + ya[L_ali][2]=ya_vec[j2][j][2]; + L_ali++; + } + i=j=-1; + for (r=0;r<seqxA_mat[i2][j1].size();r++) + { + i+=(seqxA_mat[i2][j1][r]!='-'); + j+=(seqyA_mat[i2][j1][r]!='-'); + if (seqxA_mat[i2][j1][r]=='-' || seqyA_mat[i2][j1][r]=='-') continue; + xa[L_ali][0]=xa_vec[i2][i][0]; + xa[L_ali][1]=xa_vec[i2][i][1]; + xa[L_ali][2]=xa_vec[i2][i][2]; + ya[L_ali][0]=ya_vec[j1][j][0]; + ya[L_ali][1]=ya_vec[j1][j][1]; + ya[L_ali][2]=ya_vec[j1][j][2]; + L_ali++; + } + + Kabsch(xa, ya, L_ali, 1, &RMSD, t, u); + do_rotation(xa, xt, L_ali, t, u); + + double total_score2=0; + for (r=0;r<L_ali;r++) + { + dd=dist(xt[r],ya[r]); + total_score2+=1/(1+dd/(d0*d0)); /* same d0^2 scaling as total_score1 */ + } + total_score2/=Lnorm; + + /* swap chain assignment */ + if (total_score1<total_score2) + { + assign1_list[i1]=j2; + assign1_list[i2]=j1; + 
assign2_list[j1]=i2; + assign2_list[j2]=i1; + } + + /* clean up */ + DeleteArray(&xa, xlen); + DeleteArray(&ya, 
ylen); + DeleteArray(&xt, xlen); + return total_score1<total_score2; +} + +/* count how many chains are paired */ +int count_assign_pair(int *assign1_list,const int chain1_num) +{ + int pair_num=0; + int i; + for (i=0;i<chain1_num;i++) pair_num+=(assign1_list[i]>=0); + return pair_num; +} + + +/* assign chain-chain correspondence */ +double enhanced_greedy_search(double **TMave_mat,int *assign1_list, + int *assign2_list, const int chain1_num, const int chain2_num) +{ + double total_score=0; + double tmp_score=0; + int i,j; + int maxi=0; + int maxj=0; + + /* initialize parameters */ + for (i=0;i<chain1_num;i++) assign1_list[i]=-1; + for (j=0;j<chain2_num;j++) assign2_list[j]=-1; + + /* greedy assignment: in each iteration, the highest chain pair is + * assigned, until no assignable chain is left */ + while(1) + { + tmp_score=-1; + for (i=0;i<chain1_num;i++) + { + if (assign1_list[i]>=0) continue; + for (j=0;j<chain2_num;j++) + { + if (assign2_list[j]>=0 || TMave_mat[i][j]<=0) continue; + if (TMave_mat[i][j]>tmp_score) + { + maxi=i; + maxj=j; + tmp_score=TMave_mat[i][j]; + } + } + } + if (tmp_score<=0) break; // error: no assignable chain + assign1_list[maxi]=maxj; + assign2_list[maxj]=maxi; + total_score+=tmp_score; + } + if (total_score<=0) return total_score; // error: no assignable chain + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; + + /* iterative refinement */ + double delta_score; + int *assign1_tmp=new int [chain1_num]; + int *assign2_tmp=new int [chain2_num]; + for (i=0;i<chain1_num;i++) assign1_tmp[i]=assign1_list[i]; + for (j=0;j<chain2_num;j++) assign2_tmp[j]=assign2_list[j]; + int old_i=-1; + int old_j=-1; + + for (int iter=0;iter<getmin(chain1_num,chain2_num)*5;iter++) + { + delta_score=-1; + for (i=0;i<chain1_num;i++) + { + old_j=assign1_list[i]; + for (j=0;j<chain2_num;j++) + { + // attempt 
to swap (i,old_j=assign1_list[i]) with (i,j) + if (j==assign1_list[i] || TMave_mat[i][j]<=0) continue; + old_i=assign2_list[j]; + + assign1_tmp[i]=j; + if (old_i>=0) assign1_tmp[old_i]=old_j; + assign2_tmp[j]=i; + if (old_j>=0) assign2_tmp[old_j]=old_i; + + delta_score=TMave_mat[i][j]; + if (old_j>=0) delta_score-=TMave_mat[i][old_j]; + if (old_i>=0) delta_score-=TMave_mat[old_i][j]; + if (old_i>=0 && old_j>=0) delta_score+=TMave_mat[old_i][old_j]; + + if (delta_score>0) // successful swap + { + assign1_list[i]=j; + if (old_i>=0) assign1_list[old_i]=old_j; + assign2_list[j]=i; + if (old_j>=0) assign2_list[old_j]=old_i; + total_score+=delta_score; + break; + } + else + { + assign1_tmp[i]=assign1_list[i]; + if (old_i>=0) assign1_tmp[old_i]=assign1_list[old_i]; + assign2_tmp[j]=assign2_list[j]; + if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; + } + } + if (delta_score>0) break; + } + if (delta_score<=0) break; // cannot swap any chain pair + } + + /* clean up */ + delete[]assign1_tmp; + delete[]assign2_tmp; + return total_score; +} + +double calculate_centroids(const vector<vector<vector<double> > >&a_vec, + const int chain_num, double ** centroids) +{ + int L=0; + int c,r; // index of chain and residue + for (c=0; c<chain_num; c++) + { + centroids[c][0]=0; + centroids[c][1]=0; + centroids[c][2]=0; + L=a_vec[c].size(); + for (r=0; r<L; r++) + { + centroids[c][0]+=a_vec[c][r][0]; + centroids[c][1]+=a_vec[c][r][1]; + centroids[c][2]+=a_vec[c][r][2]; + } + centroids[c][0]/=L; + centroids[c][1]/=L; + centroids[c][2]/=L; + //cout<<centroids[c][0]<<'\t' + //<<centroids[c][1]<<'\t' + //<<centroids[c][2]<<endl; + } + + vector<double> d0_vec(chain_num,-1); + int c2=0; + double d0MM=0; + for (c=0; c<chain_num; c++) + { + for (c2=0; c2<chain_num; c2++) + { + if (c2==c) continue; + d0MM=sqrt(dist(centroids[c],centroids[c2])); + if (d0_vec[c]<=0) d0_vec[c]=d0MM; + else d0_vec[c]=getmin(d0_vec[c], d0MM); + } + } + d0MM=0; + for (c=0; c<chain_num; c++) d0MM+=d0_vec[c]; + 
d0MM/=chain_num; + d0_vec.clear(); + //cout<<d0MM<<endl; + return d0MM; +} + +/* calculate MMscore of aligned chains + * MMscore = sum(TMave_mat[i][j]) * sum(1/(1+dij^2/d0MM^2)) + * / (L* getmin(chain1_num,chain2_num)) + * dij is the centroid distance between chain pair i and j + * d0MM is a scaling factor. TMave_mat[i][j] is the TM-score between + * chain pair i and j multiplied by getmin(Li*Lj) */ +double calMMscore(double **TMave_mat,int *assign1_list, + const int chain1_num, const int chain2_num, double **xcentroids, + double **ycentroids, const double d0MM, double **r1, double **r2, + double **xt, double t[3], double u[3][3], const int L) +{ + int Nali=0; // number of aligned chain + int i,j; + double MMscore=0; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + + r1[Nali][0]=xcentroids[i][0]; + r1[Nali][1]=xcentroids[i][1]; + r1[Nali][2]=xcentroids[i][2]; + + r2[Nali][0]=ycentroids[j][0]; + r2[Nali][1]=ycentroids[j][1]; + r2[Nali][2]=ycentroids[j][2]; + + Nali++; + MMscore+=TMave_mat[i][j]; + } + MMscore/=L; + + double RMSD = 0; + double TMscore=0; + if (Nali>=3) + { + /* Kabsch superposition */ + Kabsch(r1, r2, Nali, 1, &RMSD, t, u); + do_rotation(r1, xt, Nali, t, u); + + /* calculate pseudo-TMscore */ + double dd=0; + for (i=0;i<Nali;i++) + { + dd=dist(xt[i], r2[i]); + TMscore+=1/(1+dd/(d0MM*d0MM)); + } + } + else if (Nali==2) + { + double dd=dist(r1[0],r2[0]); + TMscore=1/(1+dd/(d0MM*d0MM)); + } + else TMscore=1; // only one aligned chain. + TMscore/=getmin(chain1_num,chain2_num); + MMscore*=TMscore; + return MMscore; +} + +/* check if this is alignment of heterooligomer or homooligomer + * return het_deg, which ranges from 0 to 1. 
+ * The larger the value, the more "hetero"; + * the smaller the value, the more "homo" */ +double check_heterooligomer(double **TMave_mat, const int chain1_num, + const int chain2_num) +{ + double het_deg=0; + double min_TM=-1; + double max_TM=-1; + int i,j; + for (i=0;i<chain1_num;i++) + { + for (j=0;j<chain2_num;j++) + { + if (min_TM<0 || TMave_mat[i][j] <min_TM) min_TM=TMave_mat[i][j]; + if (max_TM<0 || TMave_mat[i][j]>=max_TM) max_TM=TMave_mat[i][j]; + } + } + het_deg=(max_TM-min_TM)/max_TM; + //cout<<"min_TM="<<min_TM<<endl; + //cout<<"max_TM="<<max_TM<<endl; + return het_deg; +} + +/* reassign chain-chain correspondence, specific for homooligomer */ +double homo_refined_greedy_search(double **TMave_mat,int *assign1_list, + int *assign2_list, const int chain1_num, const int chain2_num, + double **xcentroids, double **ycentroids, const double d0MM, + const int L, double **ut_mat) +{ + double MMscore_max=0; + double MMscore=0; + int i,j; + int c1,c2; + int max_i=-1; // the chain pair whose monomer u t yields highest MMscore + int max_j=-1; + + int chain_num=getmin(chain1_num,chain2_num); + int *assign1_tmp=new int [chain1_num]; + int *assign2_tmp=new int [chain2_num]; + double **xt; + NewArray(&xt, chain1_num, 3); + double t[3]; + double u[3][3]; + int ui,uj,ut_idx; + double TMscore=0; // pseudo TM-score + double TMsum =0; + double TMnow =0; + double TMmax =0; + double dd=0; + + size_t total_pair=chain1_num*chain2_num; // total pair + double *ut_tmc_mat=new double [total_pair]; // chain level TM-score + vector<pair<double,int> > ut_tm_vec(total_pair,make_pair(0.0,0)); // product of both + + for (c1=0;c1<chain1_num;c1++) + { + for (c2=0;c2<chain2_num;c2++) + { + if (TMave_mat[c1][c2]<=0) continue; + ut_idx=c1*chain2_num+c2; + for (ui=0;ui<3;ui++) + for (uj=0;uj<3;uj++) u[ui][uj]=ut_mat[ut_idx][ui*3+uj]; + for (uj=0;uj<3;uj++) t[uj]=ut_mat[ut_idx][9+uj]; + + do_rotation(xcentroids, xt, chain1_num, t, u); + + for (i=0;i<chain1_num;i++) assign1_tmp[i]=-1; + for 
(j=0;j<chain2_num;j++) assign2_tmp[j]=-1; + + + for (i=0;i<chain1_num;i++) + { + for (j=0;j<chain2_num;j++) + { + ut_idx=i*chain2_num+j; + ut_tmc_mat[ut_idx]=0; + ut_tm_vec[ut_idx].first=-1; + ut_tm_vec[ut_idx].second=ut_idx; + if (TMave_mat[i][j]<=0) continue; + dd=dist(xt[i],ycentroids[j]); + ut_tmc_mat[ut_idx]=1/(1+dd/(d0MM*d0MM)); + ut_tm_vec[ut_idx].first= + ut_tmc_mat[ut_idx]*TMave_mat[i][j]; + //cout<<"TM["<<ut_idx<<"]="<<ut_tm_vec[ut_idx].first<<endl; + } + } + //cout<<"sorting "<<total_pair<<" chain pairs"<<endl; + + /* initial assignment */ + assign1_tmp[c1]=c2; + assign2_tmp[c2]=c1; + TMsum=TMave_mat[c1][c2]; + TMscore=ut_tmc_mat[c1*chain2_num+c2]; + + /* further assignment */ + sort(ut_tm_vec.begin(), ut_tm_vec.end()); // sort in ascending order + for (ut_idx=total_pair-1;ut_idx>=0;ut_idx--) + { + j=ut_tm_vec[ut_idx].second % chain2_num; + i=int(ut_tm_vec[ut_idx].second / chain2_num); + if (TMave_mat[i][j]<=0) break; + if (assign1_tmp[i]>=0 || assign2_tmp[j]>=0) continue; + assign1_tmp[i]=j; + assign2_tmp[j]=i; + TMsum+=TMave_mat[i][j]; + TMscore+=ut_tmc_mat[i*chain2_num+j]; + //cout<<"ut_idx="<<ut_tm_vec[ut_idx].second + //<<"\ti="<<i<<"\tj="<<j<<"\ttm="<<ut_tm_vec[ut_idx].first<<endl; + } + + /* final MMscore */ + MMscore=(TMsum/L)*(TMscore/chain_num); + if (max_i<0 || max_j<0 || MMscore>MMscore_max) + { + max_i=c1; + max_j=c2; + MMscore_max=MMscore; + for (i=0;i<chain1_num;i++) assign1_list[i]=assign1_tmp[i]; + for (j=0;j<chain2_num;j++) assign2_list[j]=assign2_tmp[j]; + //cout<<"TMsum/L="<<TMsum/L<<endl; + //cout<<"TMscore/chain_num="<<TMscore/chain_num<<endl; + //cout<<"MMscore="<<MMscore<<endl; + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) + //cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) + //cout<<assign2_list[j]<<","; cout<<"}"<<endl; + } + } + } + + /* clean up */ + delete[]assign1_tmp; + delete[]assign2_tmp; + delete[]ut_tmc_mat; + ut_tm_vec.clear(); + DeleteArray(&xt, 
chain1_num); + return MMscore_max; /* score of the assignment copied into assign1_list/assign2_list, not of the last seed pair tried */ +} + +/* reassign chain-chain correspondence, specific for heterooligomer */ +double hetero_refined_greedy_search(double **TMave_mat,int *assign1_list, + int *assign2_list, const int chain1_num, const int chain2_num, + double **xcentroids, double **ycentroids, const double d0MM, const int L) +{ + double MMscore_old=0; + double MMscore=0; + int i,j; + + double **r1; + double **r2; + double **xt; + int chain_num=getmin(chain1_num,chain2_num); + NewArray(&r1, chain_num, 3); + NewArray(&r2, chain_num, 3); + NewArray(&xt, chain_num, 3); + double t[3]; + double u[3][3]; + + /* calculate MMscore */ + MMscore=MMscore_old=calMMscore(TMave_mat, assign1_list, chain1_num, + chain2_num, xcentroids, ycentroids, d0MM, r1, r2, xt, t, u, L); + //cout<<"MMscore="<<MMscore<<endl; + //cout<<"TMave_mat="<<endl; + //for (i=0;i<chain1_num;i++) + //{ + //for (j=0; j<chain2_num; j++) + //{ + //if (j<chain2_num-1) cout<<TMave_mat[i][j]<<'\t'; + //else cout<<TMave_mat[i][j]<<endl; + //} + //} + + /* iteratively refine chain assignment. 
in each iteration, attempt + * to swap (i,old_j=assign1_list[i]) with (i,j) */ + double delta_score=-1; + int *assign1_tmp=new int [chain1_num]; + int *assign2_tmp=new int [chain2_num]; + for (i=0;i<chain1_num;i++) assign1_tmp[i]=assign1_list[i]; + for (j=0;j<chain2_num;j++) assign2_tmp[j]=assign2_list[j]; + int old_i=-1; + int old_j=-1; + + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; + + for (int iter=0;iter<chain1_num*chain2_num;iter++) + { + delta_score=-1; + for (i=0;i<chain1_num;i++) + { + old_j=assign1_list[i]; + for (j=0;j<chain2_num;j++) + { + if (j==assign1_list[i] || TMave_mat[i][j]<=0) continue; + old_i=assign2_list[j]; + + assign1_tmp[i]=j; + if (old_i>=0) assign1_tmp[old_i]=old_j; + assign2_tmp[j]=i; + if (old_j>=0) assign2_tmp[old_j]=old_i; + + MMscore=calMMscore(TMave_mat, assign1_tmp, chain1_num, + chain2_num, xcentroids, ycentroids, d0MM, + r1, r2, xt, t, u, L); + + //cout<<"(i,j,old_i,old_j,MMscore)=("<<i<<","<<j<<"," + //<<old_i<<","<<old_j<<","<<MMscore<<")"<<endl; + + if (MMscore>MMscore_old) // successful swap + { + assign1_list[i]=j; + if (old_i>=0) assign1_list[old_i]=old_j; + assign2_list[j]=i; + if (old_j>=0) assign2_list[old_j]=old_i; + delta_score=(MMscore-MMscore_old); + MMscore_old=MMscore; + //cout<<"MMscore="<<MMscore<<endl; + break; + } + else + { + assign1_tmp[i]=assign1_list[i]; + if (old_i>=0) assign1_tmp[old_i]=assign1_list[old_i]; + assign2_tmp[j]=assign2_list[j]; + if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; + } + } + } + //cout<<"iter="<<iter<<endl; + //cout<<"assign1_list={"; + //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; + //cout<<"assign2_list={"; + //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; + if (delta_score<=0) break; // cannot swap any chain pair + } + MMscore=MMscore_old; + 
/* Copy one chain from vector storage into caller-allocated C arrays.
 * a_vec_i   xyz coordinates, one vector of at least 3 doubles per residue
 * seq_vec_i residue codes, sec_vec_i secondary structure codes
 * len       number of residues to copy
 * a         output, at least len rows of 3 doubles
 * seq, sec  output, at least len+1 chars each; both are NUL terminated */
void copy_chain_data(const vector<vector<double> >&a_vec_i,
    const vector<char>&seq_vec_i,const vector<char>&sec_vec_i,
    const int len,double **a,char *seq,char *sec)
{
    for (int r=0;r<len;r++)
    {
        for (int k=0;k<3;k++) a[r][k]=a_vec_i[r][k];
        seq[r]=seq_vec_i[r];
        sec[r]=sec_vec_i[r];
    }
    seq[len]=0;
    sec[len]=0;
}

/* clear chains with L<3
 *
 * L is the number of lines of a chain whose atom name (columns 12-15)
 * matches atom_opt. With atom_opt=="auto" the name is " C3'" when the
 * residue name columns 17-18 read "  " or " D", and " CA " otherwise.
 * Chains with fewer than 3 such atoms are emptied but keep their slot in
 * PDB_lines, so chain indices stay valid for the caller.
 *
 * PDB_lines is taken by reference: it used to be passed by value, so the
 * clearing was applied to a copy and the caller never saw it. Lines shorter
 * than 19 characters cannot hold an atom and residue name and are not
 * counted (string::compare would throw for lines shorter than 12). */
void clear_full_PDB_lines(vector<vector<string> > &PDB_lines,const string atom_opt)
{
    const bool auto_atom=(atom_opt=="auto");
    for (size_t chain_i=0;chain_i<PDB_lines.size();chain_i++)
    {
        vector<string> &chain=PDB_lines[chain_i];
        int Lch=0; // number of representative atoms in this chain
        for (size_t a=0;a<chain.size();a++)
        {
            const string &line=chain[a];
            if (line.size()<19) continue;
            bool select_atom;
            if (auto_atom)
            {
                if (line[17]==' ' && (line[18]=='D'||line[18]==' '))
                     select_atom=(line.compare(12,4," C3'")==0);
                else select_atom=(line.compare(12,4," CA ")==0);
            }
            else select_atom=(line.compare(12,4,atom_opt)==0);
            Lch+=select_atom;
        }
        if (Lch<3) chain.clear();
    }
}
atom index + string line; + char chainID=0; + vector<string> tmp_str_vec; + + int compress_type=0; // uncompressed file + ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else + redi::ipstream fin_gz; // if file is compressed + if (filename.size()>=3 && + filename.substr(filename.size()-3,3)==".gz") + { + fin_gz.open("gunzip -c '"+filename+"'"); + compress_type=1; + } + else if (filename.size()>=4 && + filename.substr(filename.size()-4,4)==".bz2") + { + fin_gz.open("bzcat '"+filename+"'"); + compress_type=2; + } + else +#endif + fin.open(filename.c_str()); + + if (infmt_opt==0||infmt_opt==-1) // PDB format + { + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (infmt_opt==-1 && line.compare(0,5,"loop_")==0) // PDBx/mmCIF + return get_full_PDB_lines(filename,PDB_lines, + ter_opt, 3, split_opt,het_opt); + if (i > 0) + { + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + else if (ter_opt>=3 && line.compare(0,3,"TER")==0) break; + } + if (split_opt && line.compare(0,3,"END")==0) chainID=0; + if (line.size()>=54 && (line[16]==' ' || line[16]=='A') && ( + (line.compare(0, 6, "ATOM ")==0) || + (line.compare(0, 6, "HETATM")==0 && het_opt==1) || + (line.compare(0, 6, "HETATM")==0 && het_opt==2 && + line.compare(17,3, "MSE")==0))) + { + if (!chainID) + { + chainID=line[21]; + PDB_lines.push_back(tmp_str_vec); + } + else if (ter_opt>=2 && chainID!=line[21]) break; + if (split_opt==2 && chainID!=line[21]) + { + chainID=line[21]; + PDB_lines.push_back(tmp_str_vec); + } + + PDB_lines.back().push_back(line); + i++; + } + } + } + else if (infmt_opt==1) // SPICKER format + { + size_t L=0; + float x,y,z; + stringstream i8_stream; + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) fin_gz>>L>>x>>y>>z; + else fin >>L>>x>>y>>z; + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (!(compress_type?fin_gz.good():fin.good())) break; + for 
(i=0;i<L;i++) + { + if (compress_type) fin_gz>>x>>y>>z; + else fin >>x>>y>>z; + i8_stream<<"ATOM "<<setw(4)<<i+1<<" CA UNK "<<setw(4) + <<i+1<<" "<<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x<<setw(8)<<y<<setw(8)<<z; + line=i8_stream.str(); + i8_stream.str(string()); + PDB_lines.back().push_back(line); + } + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + } + } + else if (infmt_opt==2) // xyz format + { + size_t L=0; + stringstream i8_stream; + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + L=atoi(line.c_str()); + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + for (i=0;i<line.size();i++) + if (line[i]==' '||line[i]=='\t') break; + if (!(compress_type?fin_gz.good():fin.good())) break; + PDB_lines.push_back(tmp_str_vec); + for (i=0;i<L;i++) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + i8_stream<<"ATOM "<<setw(4)<<i+1<<" CA " + <<AAmap(line[0])<<" "<<setw(4)<<i+1<<" " + <<line.substr(2,8)<<line.substr(11,8)<<line.substr(20,8); + line=i8_stream.str(); + i8_stream.str(string()); + PDB_lines.back().push_back(line); + } + } + } + else if (infmt_opt==3) // PDBx/mmCIF format + { + bool loop_ = false; // not reading following content + map<string,int> _atom_site; + int atom_site_pos; + vector<string> line_vec; + string alt_id="."; // alternative location indicator + string asym_id="."; // this is similar to chainID, except that + // chainID is char while asym_id is a string + // with possibly multiple char + string prev_asym_id=""; + string AA=""; // residue name + string atom=""; + string resi=""; + string model_index=""; // the same as model_idx but type is string + stringstream i8_stream; + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (loop_) loop_ = (line.size()>=2)?(line.compare(0,2,"# 
")):(line.compare(0,1,"#")); + if (!loop_) + { + if (line.compare(0,5,"loop_")) continue; + while(1) + { + if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + + loop_=true; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[Trim(line.substr(11))]=atom_site_pos; + + while(1) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[Trim(line.substr(11))]=++atom_site_pos; + } + + + if (_atom_site.count("group_PDB")* + _atom_site.count("label_atom_id")* + _atom_site.count("label_comp_id")* + (_atom_site.count("auth_asym_id")+ + _atom_site.count("label_asym_id"))* + (_atom_site.count("auth_seq_id")+ + _atom_site.count("label_seq_id"))* + _atom_site.count("Cartn_x")* + _atom_site.count("Cartn_y")* + _atom_site.count("Cartn_z")==0) + { + loop_ = false; + cerr<<"Warning! Missing one of the following _atom_site data items: group_PDB, label_atom_id, label_comp_id, auth_asym_id/label_asym_id, auth_seq_id/label_seq_id, Cartn_x, Cartn_y, Cartn_z"<<endl; + continue; + } + } + + line_vec.clear(); + split(line,line_vec); + if ((line_vec[_atom_site["group_PDB"]]!="ATOM" && + line_vec[_atom_site["group_PDB"]]!="HETATM") || + (line_vec[_atom_site["group_PDB"]]=="HETATM" && + (het_opt==0 || + (het_opt==2 && line_vec[_atom_site["label_comp_id"]]!="MSE"))) + ) continue; + + alt_id="."; + if (_atom_site.count("label_alt_id")) // in 39.4 % of entries + alt_id=line_vec[_atom_site["label_alt_id"]]; + if (alt_id!="." 
&& alt_id!="A") continue; + + atom=line_vec[_atom_site["label_atom_id"]]; + if (atom[0]=='"') atom=atom.substr(1); + if (atom.size() && atom[atom.size()-1]=='"') + atom=atom.substr(0,atom.size()-1); + if (atom.size()==0) continue; + if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; // wrong for sidechain H + else if (atom.size()==3) atom=" "+atom; + else if (atom.size()>=5) continue; + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) continue; + + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + + if (_atom_site.count("pdbx_PDB_model_num") && + model_index!=line_vec[_atom_site["pdbx_PDB_model_num"]]) + { + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + if (PDB_lines.size() && ter_opt>=1) break; + if (PDB_lines.size()==0 || split_opt>=1) + { + PDB_lines.push_back(tmp_str_vec); + prev_asym_id=asym_id; + } + } + + if (prev_asym_id!=asym_id) + { + if (prev_asym_id!="" && ter_opt>=2) break; + if (split_opt>=2) PDB_lines.push_back(tmp_str_vec); + } + if (prev_asym_id!=asym_id) prev_asym_id=asym_id; + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + resi+=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + else resi+=" "; + + i++; + i8_stream<<"ATOM " + <<setw(5)<<i<<" "<<atom<<" "<<AA<<setw(2)<<asym_id.substr(0,2) + <<setw(5)<<resi.substr(0,5)<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]].substr(0,8) + <<setw(8)<<line_vec[_atom_site["Cartn_y"]].substr(0,8) + <<setw(8)<<line_vec[_atom_site["Cartn_z"]].substr(0,8); + PDB_lines.back().push_back(i8_stream.str()); + i8_stream.str(string()); + } + _atom_site.clear(); + 
line_vec.clear(); + alt_id.clear(); + asym_id.clear(); + AA.clear(); + } + + if (compress_type) fin_gz.close(); + else fin.close(); + line.clear(); + return PDB_lines.size(); +} + +void output_dock(const vector<string>&chain_list, const int ter_opt, + const int split_opt, const int infmt_opt, const string atom_opt, + const int mirror_opt, double **ut_mat, const string&fname_super) +{ + size_t i; + int chain_i,a; + string name; + int chainnum; + double x[3]; // before transform + double x1[3]; // after transform + string line; + vector<vector<string> >PDB_lines; + int m=0; + double t[3]; + double u[3][3]; + int ui,uj; + stringstream buf; + string filename; + int het_opt=1; + for (i=0;i<chain_list.size();i++) + { + name=chain_list[i]; + chainnum=get_full_PDB_lines(name, PDB_lines, + ter_opt, infmt_opt, split_opt, het_opt); + if (!chainnum) continue; + clear_full_PDB_lines(PDB_lines, atom_opt); // clear chains with <3 residue + for (chain_i=0;chain_i<chainnum;chain_i++) + { + if (PDB_lines[chain_i].size()<3) continue; + buf<<fname_super<<'.'<<m<<".pdb"; + filename=buf.str(); + buf.str(string()); + for (ui=0;ui<3;ui++) for (uj=0;uj<3;uj++) u[ui][uj]=ut_mat[m][ui*3+uj]; + for (uj=0;uj<3;uj++) t[uj]=ut_mat[m][9+uj]; + for (a=0;a<PDB_lines[chain_i].size();a++) + { + line=PDB_lines[chain_i][a]; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + transform(t, u, x, x1); + buf<<line.substr(0,30)<<setiosflags(ios::fixed) + <<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1]<<setw(8)<<x1[2] + <<line.substr(54)<<'\n'; + } + buf<<"TER"<<endl; + ofstream fp; + fp.open(filename.c_str()); + fp<<buf.str(); + fp.close(); + buf.str(string()); + PDB_lines[chain_i].clear(); + m++; + } // chain_i + name.clear(); + PDB_lines.clear(); + } // i + vector<vector<string> >().swap(PDB_lines); + line.clear(); +} + +void parse_chain_list(const vector<string>&chain_list, + 
vector<vector<vector<double> > >&a_vec, vector<vector<char> >&seq_vec, + vector<vector<char> >&sec_vec, vector<int>&mol_vec, vector<int>&len_vec, + vector<string>&chainID_list, const int ter_opt, const int split_opt, + const string mol_opt, const int infmt_opt, const string atom_opt, + const int mirror_opt, const int het_opt, int &len_aa, int &len_na, + const int o_opt, vector<string>&resi_vec) +{ + size_t i; + int chain_i,r; + string name; + int chainnum; + double **xa; + int len; + char *seq,*sec; + + vector<vector<string> >PDB_lines; + vector<double> tmp_atom_array(3,0); + vector<vector<double> > tmp_chain_array; + vector<char>tmp_seq_array; + vector<char>tmp_sec_array; + //vector<string> resi_vec; + int read_resi=2; + + for (i=0;i<chain_list.size();i++) + { + name=chain_list[i]; + chainnum=get_PDB_lines(name, PDB_lines, chainID_list, + mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); + if (!chainnum) + { + cerr<<"Warning! Cannot parse file: "<<name + <<". Chain number 0."<<endl; + continue; + } + for (chain_i=0;chain_i<chainnum;chain_i++) + { + len=PDB_lines[chain_i].size(); + if (!len) + { + cerr<<"Warning! Cannot parse file: "<<name + <<". 
/* Concatenate all assigned chain pairs into two flat complexes.
 *
 * For every chain i of complex 1 with a partner j=assign1_list[i]>=0, the
 * residues of chain i are appended to xa/seqx/secx and those of chain j to
 * ya/seqy/secy, in the order of i. seqx, secx, seqy and secy are NUL
 * terminated after the last copied residue.
 *
 * sequence is reset to exactly two strings: the concatenation of
 * seqxA_mat[i][j] and of seqyA_mat[i][j] over the assigned pairs.
 *
 * Returns the sum of mol_vec1[i]+mol_vec2[j] over the assigned pairs.
 * chain2_num and assign2_list are not used. */
int copy_chain_pair_data(
    const vector<vector<vector<double> > >&xa_vec,
    const vector<vector<vector<double> > >&ya_vec,
    const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec,
    const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec,
    const vector<int> &mol_vec1, const vector<int> &mol_vec2,
    const vector<int> &xlen_vec, const vector<int> &ylen_vec,
    double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy,
    int chain1_num, int chain2_num,
    vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat,
    int *assign1_list, int *assign2_list, vector<string>&sequence)
{
    sequence.assign(2,string());

    int mol_sum=0;
    int xpos=0; // residues written to the flat x arrays so far
    int ypos=0; // residues written to the flat y arrays so far
    for (int c1=0;c1<chain1_num;c1++)
    {
        const int c2=assign1_list[c1];
        if (c2<0) continue;

        /* chain of complex 1 */
        for (int r=0;r<xlen_vec[c1];r++,xpos++)
        {
            seqx[xpos]=seqx_vec[c1][r];
            secx[xpos]=secx_vec[c1][r];
            for (int k=0;k<3;k++) xa[xpos][k]=xa_vec[c1][r][k];
        }
        sequence[0]+=seqxA_mat[c1][c2];

        /* its partner chain of complex 2 */
        for (int r=0;r<ylen_vec[c2];r++,ypos++)
        {
            seqy[ypos]=seqy_vec[c2][r];
            secy[ypos]=secy_vec[c2][r];
            for (int k=0;k<3;k++) ya[ypos][k]=ya_vec[c2][r][k];
        }
        sequence[1]+=seqyA_mat[c1][c2];

        mol_sum+=mol_vec1[c1]+mol_vec2[c2];
    }
    seqx[xpos]=0;
    secx[xpos]=0;
    seqy[ypos]=0;
    secy[ypos]=0;
    return mol_sum;
}
+ int xlen=0; + int ylen=0; + for (i=0;i<chain1_num;i++) + { + if (assign1_list[i]<0) continue; + xlen+=xlen_vec[i]; + ylen+=ylen_vec[assign1_list[i]]; + } + if (xlen<=3 || ylen<=3) return total_score; + + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + + int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + double Lnorm_ass=len_aa+len_na; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, false, true, false, fast_opt, mol_type, -1); + + /* clean up */ + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + + /* re-compute chain level alignment */ + for (i=0;i<chain1_num;i++) + { + xlen=xlen_vec[i]; + if (xlen<3) + { + for (j=0;j<chain2_num;j++) TMave_mat[i][j]=-1; + continue; + } + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + double **xt; + NewArray(&xt, xlen, 3); + 
do_rotation(xa, xt, xlen, t0, u0); + + for (j=0;j<chain2_num;j++) + { + if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment + { + TMave_mat[i][j]=-1; + continue; + } + + ylen=ylen_vec[j]; + if (ylen<3) + { + TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + d0_out=5.0; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + rmsd0 = 0.0; + Liden=0; + int *invmap = new int[ylen+1]; + + double Lnorm_ass=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, false, 2, false, mol_vec1[i]+mol_vec2[j], 1, invmap); + + /* print result */ + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + + TMave_mat[i][j]=TM4*Lnorm_ass; + if (assign1_list[i]==j) total_score+=TMave_mat[i][j]; + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]invmap; + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + return total_score; +} + +void MMalign_final( + const string xname, const string yname, + const vector<string> chainID_list1, const vector<string> chainID_list2, + string fname_super, string fname_lign, string fname_matrix, + const vector<vector<vector<double> > >&xa_vec, + const vector<vector<vector<double> > >&ya_vec, + const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, + const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, + const vector<int> &mol_vec1, const vector<int> &mol_vec2, + const vector<int> &xlen_vec, const vector<int> &ylen_vec, + 
double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, + double **TMave_mat, + vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqM_mat, + vector<vector<string> >&seqyA_mat, int *assign1_list, int *assign2_list, + vector<string>&sequence, const double d0_scale, const bool m_opt, + const int o_opt, const int outfmt_opt, const int ter_opt, + const int split_opt, const bool a_opt, const bool d_opt, + const bool fast_opt, const bool full_opt, const int mirror_opt, + const vector<string>&resi_vec1, const vector<string>&resi_vec2) +{ + int i,j; + int xlen=0; + int ylen=0; + for (i=0;i<chain1_num;i++) xlen+=xlen_vec[i]; + for (j=0;j<chain2_num;j++) ylen+=ylen_vec[j]; + if (xlen<=3 || ylen<=3) return; + + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + + int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + double Lnorm_ass=len_aa+len_na; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, 
d0_scale, + 3, a_opt, false, d_opt, fast_opt, mol_type, -1); + + /* prepare full complex alignment */ + string chainID1=""; + string chainID2=""; + sequence.clear(); + sequence.push_back(""); // seqxA + sequence.push_back(""); // seqyA + sequence.push_back(""); // seqM + int aln_start=0; + int aln_end=0; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + chainID1+=chainID_list1[i]; + chainID2+=chainID_list2[j]; + sequence[0]+=seqxA_mat[i][j]+'*'; + sequence[1]+=seqyA_mat[i][j]+'*'; + + aln_end+=seqxA_mat[i][j].size(); + seqM_mat[i][j]=seqM.substr(aln_start,aln_end-aln_start); + sequence[2]+=seqM_mat[i][j]+'*'; + aln_start=aln_end; + } + + /* prepare unaligned region */ + for (i=0;i<chain1_num;i++) + { + if (assign1_list[i]>=0) continue; + chainID1+=chainID_list1[i]; + chainID2+=':'; + string s(seqx_vec[i].begin(),seqx_vec[i].end()); + sequence[0]+=s.substr(0,xlen_vec[i])+'*'; + sequence[1]+=string(xlen_vec[i],'-')+'*'; + s.clear(); + sequence[2]+=string(xlen_vec[i],' ')+'*'; + } + for (j=0;j<chain2_num;j++) + { + if (assign2_list[j]>=0) continue; + chainID1+=':'; + chainID2+=chainID_list2[j]; + string s(seqy_vec[j].begin(),seqy_vec[j].end()); + sequence[0]+=string(ylen_vec[j],'-')+'*'; + sequence[1]+=s.substr(0,ylen_vec[j])+'*'; + s.clear(); + sequence[2]+=string(ylen_vec[j],' ')+'*'; + } + + /* print alignment */ + output_results(xname, yname, chainID1.c_str(), chainID2.c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + sequence[2].c_str(), sequence[0].c_str(), sequence[1].c_str(), + Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, 0, d0_scale, d0a, d0u, + (m_opt?fname_matrix:"").c_str(), outfmt_opt, ter_opt, true, + split_opt, o_opt, fname_super, + false, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + 
sequence[0].clear(); + sequence[1].clear(); + sequence[2].clear(); + + if (!full_opt) return; + + cout<<"# End of alignment for full complex. The following blocks list alignments for individual chains."<<endl; + + /* re-compute chain level alignment */ + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + xlen=xlen_vec[i]; + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + double **xt; + NewArray(&xt, xlen, 3); + do_rotation(xa, xt, xlen, t0, u0); + + ylen=ylen_vec[j]; + if (ylen<3) + { + TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + d0_out=5.0; + rmsd0 = 0.0; + Liden=0; + int *invmap = new int[ylen+1]; + seqM=""; + seqxA=""; + seqyA=""; + double Lnorm_ass=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; + sequence[0]=seqxA_mat[i][j]; + sequence[1]=seqyA_mat[i][j]; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 1, a_opt, 2, d_opt, mol_vec1[i]+mol_vec2[j], 1, invmap); + + //TM2=TM4*Lnorm_ass/xlen; + //TM1=TM4*Lnorm_ass/ylen; + //d0A=d0u; + //d0B=d0u; + + /* print result */ + output_results(xname, yname, + chainID_list1[i].c_str(), chainID_list2[j].c_str(), + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM_mat[i][j].c_str(), seqxA_mat[i][j].c_str(), + seqyA_mat[i][j].c_str(), Liden, n_ali8, L_ali, TM_ali, rmsd_ali, + TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + "", outfmt_opt, ter_opt, false, split_opt, 0, + "", false, a_opt, false, d_opt, 0, resi_vec1, resi_vec2); + + /* clean 
up */
+        seqxA.clear();
+        seqM.clear();
+        seqyA.clear();
+        sequence[0].clear();
+        sequence[1].clear();
+        delete[]seqy;
+        delete[]secy;
+        DeleteArray(&ya,ylen);
+        delete[]seqx;
+        delete[]secx;
+        DeleteArray(&xa,xlen);
+        DeleteArray(&xt,xlen);
+        delete[]invmap;
+    }
+    sequence.clear();
+    return;
+}
+
+/* Copy one chain-assignment state into another.
+ *
+ * Source state : assign1_list, assign2_list, seqxA_mat, seqyA_mat, TMave_mat
+ * Target state : assign1_tmp,  assign2_tmp,  seqxA_tmp, seqyA_tmp, TMave_tmp
+ *
+ * All chain1_num x chain2_num entries of the three matrices are copied.
+ * In addition, `sequence` is rebuilt from scratch as exactly two strings:
+ * sequence[0] / sequence[1] are the concatenation, in chain1 order, of
+ * seqxA_mat[i][j] / seqyA_mat[i][j] for every pair with assign1_list[i]==j.
+ * The target containers must already have chain1_num x chain2_num entries. */
+void copy_chain_assign_data(int chain1_num, int chain2_num,
+    vector<string> &sequence,
+    vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat,
+    int *assign1_list, int *assign2_list, double **TMave_mat,
+    vector<vector<string> >&seqxA_tmp, vector<vector<string> >&seqyA_tmp,
+    int *assign1_tmp, int *assign2_tmp, double **TMave_tmp)
+{
+    int i,j;
+    // clear() already destroys every element, so no per-element clear is
+    // needed (the old loop also compared a signed index with size()).
+    sequence.clear();
+    sequence.push_back("");
+    sequence.push_back("");
+    for (i=0;i<chain1_num;i++) assign1_tmp[i]=assign1_list[i];
+    for (i=0;i<chain2_num;i++) assign2_tmp[i]=assign2_list[i];
+    for (i=0;i<chain1_num;i++)
+    {
+        for (j=0;j<chain2_num;j++)
+        {
+            seqxA_tmp[i][j]=seqxA_mat[i][j];
+            seqyA_tmp[i][j]=seqyA_mat[i][j];
+            TMave_tmp[i][j]=TMave_mat[i][j];
+            if (assign1_list[i]==j)
+            {
+                sequence[0]+=seqxA_mat[i][j];
+                sequence[1]+=seqyA_mat[i][j];
+            }
+        }
+    }
+    return;
+}
+
+/* Iteratively refine a chain assignment.
+ *
+ * Works on private copies (*_tmp) of the assignment, the per-pair aligned
+ * sequences and the score matrix.  Each round runs MMalign_search followed
+ * by enhanced_greedy_search on the copies; the round is committed back to
+ * the caller's data (and max_total_score is raised) only when the new total
+ * score is strictly larger than max_total_score, otherwise iteration stops.
+ * At most max_iter rounds are run. */
+void MMalign_iter(double & max_total_score, const int max_iter,
+    const vector<vector<vector<double> > >&xa_vec,
+    const vector<vector<vector<double> > >&ya_vec,
+    const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec,
+    const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec,
+    const vector<int> &mol_vec1, const vector<int> &mol_vec2,
+    const vector<int> &xlen_vec, const vector<int> &ylen_vec,
+    double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy,
+    int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat,
+    vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat,
+    int *assign1_list, int *assign2_list, vector<string>&sequence,
+    double d0_scale, bool fast_opt)
+{
+    /* tmp assignment */
+    double total_score;
+    int *assign1_tmp, *assign2_tmp;
+    assign1_tmp=new int[chain1_num];
+    assign2_tmp=new int[chain2_num];
+    double **TMave_tmp;
+    NewArray(&TMave_tmp,chain1_num,chain2_num);
+    vector<string> tmp_str_vec(chain2_num,"");
+    vector<vector<string> >seqxA_tmp(chain1_num,tmp_str_vec);
+    vector<vector<string> >seqyA_tmp(chain1_num,tmp_str_vec);
+    // sequence_tmp only absorbs the `sequence` output of this first copy;
+    // the caller's `sequence` is left untouched until a round is accepted.
+    vector<string> sequence_tmp;
+    copy_chain_assign_data(chain1_num, chain2_num, sequence_tmp,
+        seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat,
+        seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp);
+
+    for (int iter=0;iter<max_iter;iter++)
+    {
+        // The value returned here is overwritten right below; the call is
+        // kept for its updates to the *_tmp working data.
+        total_score=MMalign_search(xa_vec, ya_vec, seqx_vec, seqy_vec,
+            secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec,
+            xa, ya, seqx, seqy, secx, secy, len_aa, len_na,
+            chain1_num, chain2_num,
+            TMave_tmp, seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp,
+            sequence, d0_scale, fast_opt);
+        total_score=enhanced_greedy_search(TMave_tmp, assign1_tmp,
+            assign2_tmp, chain1_num, chain2_num);
+        //if (total_score<=0) PrintErrorAndQuit("ERROR! No assignable chain");
+        if (total_score<=max_total_score) break; // no improvement: stop
+        max_total_score=total_score;
+        // commit the improved working state back to the caller's data
+        copy_chain_assign_data(chain1_num, chain2_num, sequence,
+            seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp,
+            seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat);
+    }
+
+    /* clean up everything */
+    delete [] assign1_tmp;
+    delete [] assign2_tmp;
+    DeleteArray(&TMave_tmp,chain1_num);
+    vector<string>().swap(tmp_str_vec);
+    vector<vector<string> >().swap(seqxA_tmp);
+    vector<vector<string> >().swap(seqyA_tmp);
+}
+
+
+/* Masked Needleman-Wunsch style DP on superposed coordinates.
+ *
+ * Input:  coordinates x (len1 rows) and y (len2 rows), transformation t, u
+ *         applied to x, scale factor d02, gap penalty gap_open, and
+ *         mask[0:len1][0:len2]; only cells with mask[i][j]==true may be
+ *         aligned (pair x[i-1], y[j-1]).
+ * Output: j2i[0:len2-1] holds the 0-based index in x aligned to each
+ *         position of y, or -1.  j2i[len2] is also written (set to -1),
+ *         so j2i must have room for len2+1 entries.
+ *         path[0:len1][0:len2] is true where the optimum came from the
+ *         diagonal; val holds the DP scores. */
+void NWDP_TM_dimer(bool **path, double **val, double **x, double **y,
+    int len1, int len2, bool **mask,
+    double t[3], double u[3][3], double d02, double gap_open, int j2i[])
+{
+    int i, j;
+    double h, v, d;
+
+    //initialization
+    for(i=0; i<=len1; i++)
+    {
+        //val[i][0]=0;
+        val[i][0]=i*gap_open;
+        path[i][0]=false; //not from diagonal
+    }
+
+    for(j=0; j<=len2; j++)
+    {
+        //val[0][j]=0;
+        val[0][j]=j*gap_open;
+        path[0][j]=false; //not from diagonal
+        j2i[j]=-1;    //all are not aligned
+    }
+    double xx[3], dij;
+
+
+    //decide matrix and path
+    for(i=1; i<=len1; i++)
+    {
+        transform(t, u, &x[i-1][0], xx);
+        for(j=1; j<=len2; j++)
+        {
+            // A masked-out pair must never win the diagonal move.  The
+            // previous sentinel FLT_MIN is the smallest *positive* float,
+            // which is >= any non-positive gap score and therefore let
+            // masked pairs be aligned.  -DBL_MAX comes from the same
+            // <cfloat>/<float.h> header as FLT_MIN.
+            d=-DBL_MAX;
+            if (mask[i][j])
+            {
+                dij=dist(xx, &y[j-1][0]);
+                d=val[i-1][j-1] +  1.0/(1+dij/d02);
+            }
+
+            //symbol insertion in horizontal (= a gap in vertical)
+            h=val[i-1][j];
+            if(path[i-1][j]) h += gap_open; //aligned in last position
+
+            //symbol insertion in vertical
+            v=val[i][j-1];
+            if(path[i][j-1]) v += gap_open; //aligned in last position
+
+
+            if(d>=h && d>=v)
+            {
+                path[i][j]=true; //from diagonal
+                val[i][j]=d;
+            }
+            else
+            {
+                path[i][j]=false; //from horizontal or vertical
+                if(v>=h) val[i][j]=v;
+                else val[i][j]=h;
+            }
+        } //for j
+    } //for i
+
+    //trace back to extract the alignment
+    i=len1;
+    j=len2;
+    while(i>0 && j>0)
+    {
+        if(path[i][j]) //from diagonal
+        {
+            j2i[j-1]=i-1;
+            i--;
+            j--;
+        }
+        else
+        {
+            // recompute the two gap candidates to decide which way to step
+            h=val[i-1][j];
+            if(path[i-1][j]) h +=gap_open;
+
+            v=val[i][j-1];
+            if(path[i][j-1]) v +=gap_open;
+
+            if(v>=h) j--;
+            else i--;
+        }
+    }
+}
+
+/* +ss
+ * Masked DP on secondary structure strings.
+ *
+ * Input:  secondary structure secx (len1 chars), secy (len2 chars),
+ *         gap penalty gap_open and mask[0:len1][0:len2]; a diagonal move
+ *         scores 1 when secx[i-1]==secy[j-1] and 0 otherwise, and is only
+ *         allowed where mask[i][j]==true.
+ * Output: j2i[0:len2-1] holds the 0-based index in x aligned to each
+ *         position of y, or -1.  j2i[len2] is also written (set to -1).
+ *         path[0:len1][0:len2] is true where the optimum came from the
+ *         diagonal; val holds the DP scores. */
+void NWDP_TM_dimer(bool **path, double **val, const char *secx, const char *secy,
+    const int len1, const int len2, bool **mask, const double gap_open, int j2i[])
+{
+
+    int i, j;
+    double h, v, d;
+
+    //initialization
+    for(i=0; i<=len1; i++)
+    {
+        //val[i][0]=0;
+        val[i][0]=i*gap_open;
+        path[i][0]=false; //not from diagonal
+    }
+
+    for(j=0; j<=len2; j++)
+    {
+        //val[0][j]=0;
+        val[0][j]=j*gap_open;
+        path[0][j]=false; //not from diagonal
+        j2i[j]=-1;    //all are not aligned
+    }
+
+    //decide matrix and path
+    for(i=1; i<=len1; i++)
+    {
+        for(j=1; j<=len2; j++)
+        {
+            // Most negative double, not FLT_MIN (smallest positive float):
+            // a masked-out pair must lose against both gap moves.
+            d=-DBL_MAX;
+            if (mask[i][j])
+                d=val[i-1][j-1] + 1.0*(secx[i-1]==secy[j-1]);
+
+            //symbol insertion in horizontal (= a gap in vertical)
+            h=val[i-1][j];
+            if(path[i-1][j]) h += gap_open; //aligned in last position
+
+            //symbol insertion in vertical
+            v=val[i][j-1];
+            if(path[i][j-1]) v += gap_open; //aligned in last position
+
+            if(d>=h && d>=v)
+            {
+                path[i][j]=true; //from diagonal
+                val[i][j]=d;
+            }
+            else
+            {
+                path[i][j]=false; //from horizontal or vertical
+                if(v>=h) val[i][j]=v;
+                else val[i][j]=h;
+            }
+        } //for j
+    } //for i
+
+    //trace back to extract the alignment
+    i=len1;
+    j=len2;
+    while(i>0 && j>0)
+    {
+        if(path[i][j]) //from diagonal
+        {
+            j2i[j-1]=i-1;
+            i--;
+            j--;
+        }
+        else
+        {
+            h=val[i-1][j];
+            if(path[i-1][j]) h +=gap_open;
+
+            v=val[i][j-1];
+            if(path[i][j-1]) v +=gap_open;
+
+            if(v>=h) j--;
+            else i--;
+        }
+    }
+}
+
+//heuristic run of dynamic programing iteratively to find the best alignment
+//input: initial rotation matrix t,
u
+//        vectors x and y, d0
+//output: best alignment that maximizes the TMscore, will be stored in invmap0
+//        (invmap0[0:ylen-1]); t and u are updated by TMscore8_search.
+//        For each gap penalty gap_open[g], g in [g1,g2), the masked DP and
+//        the TM-score search alternate for at most iteration_max rounds, or
+//        until the score changes by less than 1e-6 between rounds.
+//        Returns the best TM-score seen (-1 if no round was run).
+double DP_iter_dimer(double **r1, double **r2, double **xtm, double **ytm,
+    double **xt, bool **path, double **val, double **x, double **y,
+    int xlen, int ylen, bool **mask, double t[3], double u[3][3], int invmap0[],
+    int g1, int g2, int iteration_max, double local_d0_search,
+    double D0_MIN, double Lnorm, double d0, double score_d8)
+{
+    double gap_open[2]={-0.6, 0};
+    double rmsd;
+    int *invmap=new int[ylen+1]; // NWDP_TM_dimer writes ylen+1 entries
+
+    int iteration, i, j, k;
+    double tmscore, tmscore_max, tmscore_old=0;
+    int score_sum_method=8, simplify_step=40;
+    tmscore_max=-1;
+
+    //double d01=d0+1.5;
+    double d02=d0*d0;
+    for(int g=g1; g<g2; g++)
+    {
+        for(iteration=0; iteration<iteration_max; iteration++)
+        {
+            NWDP_TM_dimer(path, val, x, y, xlen, ylen, mask,
+                t, u, d02, gap_open[g], invmap);
+
+            // gather the aligned coordinate pairs for the TM-score search
+            k=0;
+            for(j=0; j<ylen; j++)
+            {
+                i=invmap[j];
+
+                if(i>=0) //aligned
+                {
+                    xtm[k][0]=x[i][0];
+                    xtm[k][1]=x[i][1];
+                    xtm[k][2]=x[i][2];
+
+                    ytm[k][0]=y[j][0];
+                    ytm[k][1]=y[j][1];
+                    ytm[k][2]=y[j][2];
+                    k++;
+                }
+            }
+
+            tmscore = TMscore8_search(r1, r2, xtm, ytm, xt, k, t, u,
+                simplify_step, score_sum_method, &rmsd, local_d0_search,
+                Lnorm, score_d8, d0);
+
+
+            if(tmscore>tmscore_max)
+            {
+                tmscore_max=tmscore;
+                for(i=0; i<ylen; i++) invmap0[i]=invmap[i];
+            }
+
+            if(iteration>0)
+            {
+                if(fabs(tmscore_old-tmscore)<0.000001) break;
+            }
+            tmscore_old=tmscore;
+        }// for iteration
+
+    }//for gapopen
+
+
+    delete []invmap;
+    return tmscore_max;
+}
+
+/* Initial alignment from secondary structure only: runs the masked
+ * secondary-structure DP with a gap penalty of -1.0 and stores the mapping
+ * in y2x (needs room for ylen+1 entries). */
+void get_initial_ss_dimer(bool **path, double **val, const char *secx,
+    const char *secy, int xlen, int ylen, bool **mask, int *y2x)
+{
+    double gap_open=-1.0;
+    NWDP_TM_dimer(path, val, secx, secy, xlen, ylen, mask, gap_open, y2x);
+}
+
+/* Initial alignment from local fragment superposition.
+ *
+ * For two fragment lengths (at most 20 and 100, capped by a third / half of
+ * the shorter structure) a window is slid over x in steps of n_jump1 and
+ * over y in steps of n_jump2.  Each fragment pair is superposed with
+ * Kabsch, the resulting transformation drives a masked DP with zero gap
+ * penalty, and the mapping is scored with get_score_fast.  The mapping
+ * with the highest score is copied to y2x[0:ylen-1].
+ *
+ * Returns true when at least one candidate scored above 0, false otherwise
+ * (y2x is then left untouched). */
+bool get_initial5_dimer( double **r1, double **r2, double **xtm, double **ytm,
+    bool **path, double **val, double **x, double **y, int xlen, int ylen,
+    bool **mask, int *y2x,
+    double d0, double d0_search, const bool fast_opt, const double D0_MIN)
+{
+    double GL, rmsd;
+    double t[3];
+    double u[3][3];
+
+    double d01 = d0 + 1.5;
+    if (d01 < D0_MIN) d01 = D0_MIN;
+    double d02 = d01*d01;
+
+    double GLmax = 0;
+    int aL = getmin(xlen, ylen);
+    int *invmap = new int[ylen + 1];
+
+    // jump on sequence1-------------->
+    int n_jump1 = 0;
+    if (xlen > 250)
+        n_jump1 = 45;
+    else if (xlen > 200)
+        n_jump1 = 35;
+    else if (xlen > 150)
+        n_jump1 = 25;
+    else
+        n_jump1 = 15;
+    if (n_jump1 > (xlen / 3))
+        n_jump1 = xlen / 3;
+    // xlen < 3 would give a step of 0 and a sliding loop that never ends
+    if (n_jump1 < 1)
+        n_jump1 = 1;
+
+    // jump on sequence2-------------->
+    int n_jump2 = 0;
+    if (ylen > 250)
+        n_jump2 = 45;
+    else if (ylen > 200)
+        n_jump2 = 35;
+    else if (ylen > 150)
+        n_jump2 = 25;
+    else
+        n_jump2 = 15;
+    if (n_jump2 > (ylen / 3))
+        n_jump2 = ylen / 3;
+    // same guard as for n_jump1
+    if (n_jump2 < 1)
+        n_jump2 = 1;
+
+    // fragment to superimpose-------------->
+    int n_frag[2] = { 20, 100 };
+    if (n_frag[0] > (aL / 3))
+        n_frag[0] = aL / 3;
+    if (n_frag[1] > (aL / 2))
+        n_frag[1] = aL / 2;
+
+    // start superimpose search-------------->
+    if (fast_opt)
+    {
+        n_jump1*=5;
+        n_jump2*=5;
+    }
+    bool flag = false;
+    for (int i_frag = 0; i_frag < 2; i_frag++)
+    {
+        int m1 = xlen - n_frag[i_frag] + 1;
+        int m2 = ylen - n_frag[i_frag] + 1;
+
+        for (int i = 0; i<m1; i = i + n_jump1) //index starts from 0, different from FORTRAN
+        {
+            for (int j = 0; j<m2; j = j + n_jump2)
+            {
+                for (int k = 0; k<n_frag[i_frag]; k++) //fragment in y
+                {
+                    r1[k][0] = x[k + i][0];
+                    r1[k][1] = x[k + i][1];
+                    r1[k][2] = x[k + i][2];
+
+                    r2[k][0] = y[k + j][0];
+                    r2[k][1] = y[k + j][1];
+                    r2[k][2] = y[k + j][2];
+                }
+
+                // superpose the two structures and rotate it
+                Kabsch(r1, r2, n_frag[i_frag], 1, &rmsd, t, u);
+
+                double gap_open = 0.0;
+                NWDP_TM_dimer(path, val, x, y, xlen, ylen, mask,
+                    t, u, d02, gap_open, invmap);
+                GL = get_score_fast(r1, r2, xtm, ytm, x, y, xlen, ylen,
+                    invmap, d0, d0_search, t, u);
+                if (GL>GLmax)
+                {
+                    GLmax = GL;
+                    for (int ii = 0; ii<ylen; ii++) y2x[ii] = invmap[ii];
+                    flag = true;
+                }
+            }
+        }
+    }
+
+    delete[] invmap;
+    return flag;
+}
+
+/* Initial alignment from the combined superposition + secondary structure
+ * score matrix.
+ *
+ * score_matrix_rmsd_sec fills `score` from the previous mapping y2x0; the
+ * cells excluded by `mask` are then overwritten with a sentinel and the
+ * generic NWDP_TM DP (gap penalty -1.0) writes the new mapping to y2x.
+ *
+ * Previously *every* cell of `score` was overwritten with FLT_MIN and
+ * `mask` was never read, which discarded the matrix that had just been
+ * computed.  Only cells with mask[i][j]==false are overwritten now.
+ * NOTE(review): assumes NWDP_TM adds score[i][j] to the diagonal
+ * predecessor, so that -DBL_MAX makes a masked pair lose against a gap;
+ * confirm against NWDP_TM. */
+void get_initial_ssplus_dimer(double **r1, double **r2, double **score,
+    bool **path, double **val, const char *secx, const char *secy,
+    double **x, double **y, int xlen, int ylen, bool **mask,
+    int *y2x0, int *y2x, const double D0_MIN, double d0)
+{
+    //create score matrix for DP
+    score_matrix_rmsd_sec(r1, r2, score, secx, secy, x, y, xlen, ylen,
+        y2x0, D0_MIN,d0);
+
+    int i,j;
+    for (i=0;i<xlen+1;i++)
+        for (j=0;j<ylen+1;j++)
+            if (!mask[i][j]) score[i][j]=-DBL_MAX;
+
+    double gap_open=-1.0;
+    NWDP_TM(score, path, val, xlen, ylen, gap_open, y2x);
+}
+
+/* Entry function for TM-align. Return TM-score calculation status:
+ * 0   - full TM-score calculation
+ * 1   - terminated due to exception
+ * 2-7 - pre-terminated due to low TM-score
+ *
+ * Dimer variant: `mask` (indexed [0:xlen][0:ylen]) is handed to the masked
+ * DP helpers (DP_iter_dimer, get_initial_ss_dimer, get_initial5_dimer,
+ * get_initial_ssplus_dimer) to restrict which residue pairs may align.
+ * `sequence` supplies a user alignment that is consulted when i_opt is 1
+ * or 3.  Pre-termination (status 2-7) is only attempted when TMcut>0. */
+int TMalign_dimer_main(double **xa, double **ya,
+    const char *seqx, const char *seqy, const char *secx, const char *secy,
+    double t0[3], double u0[3][3],
+    double &TM1, double &TM2, double &TM3, double &TM4, double &TM5,
+    double &d0_0, double &TM_0,
+    double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out,
+    string &seqM, string &seqxA, string &seqyA,
+    double &rmsd0, int &L_ali, double &Liden,
+    double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8,
+    const int xlen, const int ylen,
+    bool **mask,
+    const vector<string> sequence, const double Lnorm_ass,
+    const double d0_scale, const int i_opt, const int a_opt,
+    const bool u_opt, const bool d_opt, const bool fast_opt,
+    const int mol_type, const double TMcut=-1)
+{
+    double D0_MIN;        //for d0
+    double Lnorm;         //normalization length
+    double score_d8,d0,d0_search,dcu0;//for TMscore search
+    double t[3], u[3][3]; //Kabsch translation vector and rotation matrix
+    double **score;       // Input score table for dynamic programming
+    bool   **path;        // for dynamic programming
+    double **val;         // for dynamic programming
+    double **xtm, **ytm;  // for TMscore search engine
+    double **xt;          //for saving the superposed version of r_1 or xtm
+    double **r1, **r2;    // for Kabsch rotation
+
+    
/***********************/
+    /* allocate memory     */
+    /***********************/
+    int minlen = min(xlen, ylen);
+    NewArray(&score, xlen+1, ylen+1);
+    NewArray(&path, xlen+1, ylen+1);
+    NewArray(&val, xlen+1, ylen+1);
+    NewArray(&xtm, minlen, 3);
+    NewArray(&ytm, minlen, 3);
+    NewArray(&xt, xlen, 3);
+    NewArray(&r1, minlen, 3);
+    NewArray(&r2, minlen, 3);
+
+    /***********************/
+    /*    parameter set    */
+    /***********************/
+    parameter_set4search(xlen, ylen, D0_MIN, Lnorm,
+        score_d8, d0, d0_search, dcu0);
+    int simplify_step    = 40; //for simplified search engine
+    int score_sum_method = 8;  //for scoring method, whether only sum over pairs with dis<score_d8
+
+    int i;
+    int *invmap0         = new int[ylen+1]; // best mapping found so far
+    int *invmap          = new int[ylen+1]; // scratch mapping of the current stage
+    double TM, TMmax=-1;
+    for(i=0; i<ylen; i++) invmap0[i]=-1;
+
+    double ddcc=0.4;
+    if (Lnorm <= 40) ddcc=0.1;   //Lnorm was set in parameter_set4search
+    double local_d0_search = d0_search;
+
+    //************************************************//
+    //    get initial alignment from user's input:    //
+    //    Stick to the initial alignment              //
+    //************************************************//
+    bool bAlignStick = false;
+    if (i_opt==3)// if input has set parameter for "-I"
+    {
+        // In the original code, this loop starts from 1, which is
+        // incorrect. Fortran starts from 1 but C++ should start from 0.
+        for (int j = 0; j < ylen; j++)// Set aligned position to be "-1"
+            invmap[j] = -1;
+
+        int i1 = -1;// in C version, index starts from zero, not from one
+        int i2 = -1;
+        int L1 = sequence[0].size();
+        int L2 = sequence[1].size();
+        int L = min(L1, L2);// Get positions for aligned residues
+        for (int kk1 = 0; kk1 < L; kk1++)
+        {
+            if (sequence[0][kk1] != '-') i1++;
+            if (sequence[1][kk1] != '-')
+            {
+                i2++;
+                if (i2 >= ylen || i1 >= xlen) kk1 = L; // alignment longer than structure: stop
+                else if (sequence[0][kk1] != '-') invmap[i2] = i1;
+            }
+        }
+
+        //--------------- 2. Align proteins from original alignment
+        double prevD0_MIN = D0_MIN;// stored for later use
+        // Lnorm is a double; saving it in an int would truncate it when
+        // it is restored below.
+        double prevLnorm = Lnorm;
+        double prevd0 = d0;
+        TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen,
+            invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, d0_search, score_d8,
+            t, u, mol_type);
+        D0_MIN = prevD0_MIN;
+        Lnorm = prevLnorm;
+        d0 = prevd0;
+        TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen,
+            invmap, t, u, 40, 8, local_d0_search, true, Lnorm, score_d8, d0);
+        if (TM > TMmax)
+        {
+            TMmax = TM;
+            for (i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+        }
+        bAlignStick = true;
+    }
+
+    /******************************************************/
+    /*    get initial alignment with gapless threading    */
+    /******************************************************/
+    if (!bAlignStick)
+    {
+        get_initial(r1, r2, xtm, ytm, xa, ya, xlen, ylen, invmap0, d0,
+            d0_search, fast_opt, t, u);
+        TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap0,
+            t, u, simplify_step, score_sum_method, local_d0_search, Lnorm,
+            score_d8, d0);
+        if (TM>TMmax) TMmax = TM;
+        if (TMcut>0) copy_t_u(t, u, t0, u0);
+        //run dynamic programing iteratively to find the best alignment
+        TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya, xlen, ylen,
+             mask, t, u, invmap, 0, 2, (fast_opt)?2:30,
+             local_d0_search, D0_MIN, Lnorm, d0, score_d8);
+        if (TM>TMmax)
+        {
+            TMmax = TM;
+            for (int i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+            if (TMcut>0) copy_t_u(t, u, t0, u0);
+        }
+
+        if (TMcut>0) // pre-terminate if TM-score is too low
+        {
+            double TMtmp=approx_TM(xlen, ylen, a_opt,
+                xa, ya, t0, u0, invmap0, mol_type);
+
+            if (TMtmp<0.5*TMcut)
+            {
+                TM1=TM2=TM3=TM4=TM5=TMtmp;
+                clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+                    xtm, ytm, xt, r1, r2, xlen, minlen);
+                return 2;
+            }
+        }
+
+        /************************************************************/
+        /*    get initial alignment based on secondary structure    */
+        /************************************************************/
+        get_initial_ss_dimer(path, val, secx, secy, xlen, ylen, mask, invmap);
+        TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap,
+            t, u, simplify_step, score_sum_method, local_d0_search, Lnorm,
+            score_d8, d0);
+        if (TM>TMmax)
+        {
+            TMmax = TM;
+            for (int i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+            if (TMcut>0) copy_t_u(t, u, t0, u0);
+        }
+        if (TM > TMmax*0.2)
+        {
+            TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya,
+                xlen, ylen, mask, t, u, invmap, 0, 2,
+                (fast_opt)?2:30, local_d0_search, D0_MIN, Lnorm, d0, score_d8);
+            if (TM>TMmax)
+            {
+                TMmax = TM;
+                for (int i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+                if (TMcut>0) copy_t_u(t, u, t0, u0);
+            }
+        }
+
+        if (TMcut>0) // pre-terminate if TM-score is too low
+        {
+            double TMtmp=approx_TM(xlen, ylen, a_opt,
+                xa, ya, t0, u0, invmap0, mol_type);
+
+            if (TMtmp<0.52*TMcut)
+            {
+                TM1=TM2=TM3=TM4=TM5=TMtmp;
+                clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+                    xtm, ytm, xt, r1, r2, xlen, minlen);
+                return 3;
+            }
+        }
+
+        /************************************************************/
+        /*    get initial alignment based on local superposition    */
+        /************************************************************/
+        //=initial5 in original TM-align
+        if (get_initial5_dimer( r1, r2, xtm, ytm, path, val, xa, ya,
+            xlen, ylen, mask, invmap, d0, d0_search, fast_opt, D0_MIN))
+        {
+            TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen,
+                invmap, t, u, simplify_step, score_sum_method,
+                local_d0_search, Lnorm, score_d8, d0);
+            if (TM>TMmax)
+            {
+                TMmax = TM;
+                for (int i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+                if (TMcut>0) copy_t_u(t, u, t0, u0);
+            }
+            if (TM > TMmax*ddcc)
+            {
+                TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya,
+                    xlen, ylen, mask, t, u, invmap, 0, 2, 2,
+                    local_d0_search, D0_MIN, Lnorm, d0, score_d8);
+                if (TM>TMmax)
+                {
+                    TMmax = TM;
+                    for (int i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+                    if (TMcut>0) copy_t_u(t, u, t0, u0);
+                }
+            }
+        }
+        else
+            cerr << "\n\nWarning: initial alignment from local superposition fail!\n\n" << endl;
+
+        if (TMcut>0) // pre-terminate if TM-score is too low
+        {
+            double TMtmp=approx_TM(xlen, ylen, a_opt,
+                xa, ya, t0, u0, invmap0, mol_type);
+
+            if (TMtmp<0.54*TMcut)
+            {
+                TM1=TM2=TM3=TM4=TM5=TMtmp;
+                clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+                    xtm, ytm, xt, r1, r2, xlen, minlen);
+                return 4;
+            }
+        }
+
+        /********************************************************************/
+        /* get initial alignment by local superposition+secondary structure */
+        /********************************************************************/
+        //=initial3 in original TM-align
+        get_initial_ssplus_dimer(r1, r2, score, path, val, secx, secy, xa, ya,
+            xlen, ylen, mask, invmap0, invmap, D0_MIN, d0);
+        TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap,
+             t, u, simplify_step, score_sum_method, local_d0_search, Lnorm,
+             score_d8, d0);
+        if (TM>TMmax)
+        {
+            TMmax = TM;
+            for (i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+            if (TMcut>0) copy_t_u(t, u, t0, u0);
+        }
+        if (TM > TMmax*ddcc)
+        {
+            TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya,
+                xlen, ylen, mask, t, u, invmap, 0, 2,
+                (fast_opt)?2:30, local_d0_search, D0_MIN, Lnorm, d0, score_d8);
+            if (TM>TMmax)
+            {
+                TMmax = TM;
+                for (i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+                if (TMcut>0) copy_t_u(t, u, t0, u0);
+            }
+        }
+
+        if (TMcut>0) // pre-terminate if TM-score is too low
+        {
+            double TMtmp=approx_TM(xlen, ylen, a_opt,
+                xa, ya, t0, u0, invmap0, mol_type);
+
+            if (TMtmp<0.56*TMcut)
+            {
+                TM1=TM2=TM3=TM4=TM5=TMtmp;
+                clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+                    xtm, ytm, xt, r1, r2, xlen, minlen);
+                return 5;
+            }
+        }
+
+        /*******************************************************************/
+        /*    get initial alignment based on fragment gapless threading    */
+        /*******************************************************************/
+        //=initial4 in original TM-align
+        get_initial_fgt(r1, r2, xtm, ytm, xa, ya, xlen, ylen,
+            invmap, d0, d0_search, dcu0, fast_opt, t, u);
+        TM = detailed_search(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, invmap,
+            t, u, simplify_step, score_sum_method, local_d0_search, Lnorm,
+            score_d8, d0);
+        if (TM>TMmax)
+        {
+            TMmax = TM;
+            for (i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+            if (TMcut>0) copy_t_u(t, u, t0, u0);
+        }
+        if (TM > TMmax*ddcc)
+        {
+            TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya,
+                xlen, ylen, mask, t, u, invmap, 1, 2, 2,
+                local_d0_search, D0_MIN, Lnorm, d0, score_d8);
+            if (TM>TMmax)
+            {
+                TMmax = TM;
+                for (i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+                if (TMcut>0) copy_t_u(t, u, t0, u0);
+            }
+        }
+
+        if (TMcut>0) // pre-terminate if TM-score is too low
+        {
+            double TMtmp=approx_TM(xlen, ylen, a_opt,
+                xa, ya, t0, u0, invmap0, mol_type);
+
+            if (TMtmp<0.58*TMcut)
+            {
+                TM1=TM2=TM3=TM4=TM5=TMtmp;
+                clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+                    xtm, ytm, xt, r1, r2, xlen, minlen);
+                return 6;
+            }
+        }
+
+        //************************************************//
+        //    get initial alignment from user's input:    //
+        //************************************************//
+        if (i_opt==1)// if input has set parameter for "-i"
+        {
+            for (int j = 0; j < ylen; j++)// Set aligned position to be "-1"
+                invmap[j] = -1;
+
+            int i1 = -1;// in C version, index starts from zero, not from one
+            int i2 = -1;
+            int L1 = sequence[0].size();
+            int L2 = sequence[1].size();
+            int L = min(L1, L2);// Get positions for aligned residues
+            for (int kk1 = 0; kk1 < L; kk1++)
+            {
+                if (sequence[0][kk1] != '-')
+                    i1++;
+                if (sequence[1][kk1] != '-')
+                {
+                    i2++;
+                    if (i2 >= ylen || i1 >= xlen) kk1 = L;
+                    else if (sequence[0][kk1] != '-') invmap[i2] = i1;
+                }
+            }
+
+            //--------------- 2. Align proteins from original alignment
+            double prevD0_MIN = D0_MIN;// stored for later use
+            // double, not int: Lnorm must be restored without truncation
+            double prevLnorm = Lnorm;
+            double prevd0 = d0;
+            TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya,
+                xlen, ylen, invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0,
+                d0_search, score_d8, t, u, mol_type);
+            D0_MIN = prevD0_MIN;
+            Lnorm = prevLnorm;
+            d0 = prevd0;
+
+            TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya,
+                xlen, ylen, invmap, t, u, 40, 8, local_d0_search, true, Lnorm,
+                score_d8, d0);
+            if (TM > TMmax)
+            {
+                TMmax = TM;
+                for (i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+            }
+            // Different from get_initial, get_initial_ss and get_initial_ssplus
+            TM = DP_iter_dimer(r1, r2, xtm, ytm, xt, path, val, xa, ya,
+                xlen, ylen, mask, t, u, invmap, 0, 2,
+                (fast_opt)?2:30, local_d0_search, D0_MIN, Lnorm, d0, score_d8);
+            if (TM>TMmax)
+            {
+                TMmax = TM;
+                for (i = 0; i<ylen; i++) invmap0[i] = invmap[i];
+            }
+        }
+    }
+
+
+
+    //*******************************************************************//
+    //    The alignment will not be changed any more in the following    //
+    //*******************************************************************//
+    //check if the initial alignment is generated appropriately
+    bool flag=false;
+    for(i=0; i<ylen; i++)
+    {
+        if(invmap0[i]>=0)
+        {
+            flag=true;
+            break;
+        }
+    }
+    if(!flag)
+    {
+        cout << "There is no alignment between the two structures! "
+             << "Program stop with no result!" << endl;
+        TM1=TM2=TM3=TM4=TM5=0;
+        // Release the work arrays like every other early return does;
+        // this path used to leak all of them.
+        clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+            xtm, ytm, xt, r1, r2, xlen, minlen);
+        return 1;
+    }
+
+    /* last TM-score pre-termination */
+    if (TMcut>0)
+    {
+        double TMtmp=approx_TM(xlen, ylen, a_opt,
+            xa, ya, t0, u0, invmap0, mol_type);
+
+        if (TMtmp<0.6*TMcut)
+        {
+            TM1=TM2=TM3=TM4=TM5=TMtmp;
+            clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+                xtm, ytm, xt, r1, r2, xlen, minlen);
+            return 7;
+        }
+    }
+
+    //********************************************************************//
+    //    Detailed TMscore search engine --> prepare for final TMscore    //
+    //********************************************************************//
+    //run detailed TMscore search engine for the best alignment, and
+    //extract the best rotation matrix (t, u) for the best alignment
+    simplify_step=1;
+    if (fast_opt) simplify_step=40;
+    score_sum_method=8;
+    TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen,
+        invmap0, t, u, simplify_step, score_sum_method, local_d0_search,
+        false, Lnorm, score_d8, d0);
+
+    //select pairs with dis<d8 for final TMscore computation and output alignment
+    int k=0;
+    int *m1, *m2;
+    double d;
+    m1=new int[xlen]; //aligned index in x
+    m2=new int[ylen]; //aligned index in y
+    do_rotation(xa, xt, xlen, t, u);
+    k=0;
+    for(int j=0; j<ylen; j++)
+    {
+        i=invmap0[j];
+        if(i>=0)//aligned
+        {
+            n_ali++;
+            d=sqrt(dist(&xt[i][0], &ya[j][0]));
+            // with "-I" (i_opt==3) every user-aligned pair is kept
+            if (d <= score_d8 || (i_opt == 3))
+            {
+                m1[k]=i;
+                m2[k]=j;
+
+                xtm[k][0]=xa[i][0];
+                xtm[k][1]=xa[i][1];
+                xtm[k][2]=xa[i][2];
+
+                ytm[k][0]=ya[j][0];
+                ytm[k][1]=ya[j][1];
+                ytm[k][2]=ya[j][2];
+
+                r1[k][0] = xt[i][0];
+                r1[k][1] = xt[i][1];
+                r1[k][2] = xt[i][2];
+                r2[k][0] = ya[j][0];
+                r2[k][1] = ya[j][1];
+                r2[k][2] = ya[j][2];
+
+                k++;
+            }
+        }
+    }
+    n_ali8=k;
+
+    Kabsch(r1, r2, n_ali8, 0, &rmsd0, t, u);// rmsd0 is used for final output, only recalculate rmsd0, not t & u
+    // NOTE(review): n_ali8 can be 0 when no aligned pair passes the
+    // distance cut, which makes this a division by zero - confirm whether
+    // callers rely on the resulting value.
+    rmsd0 = sqrt(rmsd0 / n_ali8);
+
+
+    //****************************************//
+    //              Final TMscore             //
+    //    Please set parameters for output    //
+    //****************************************//
+    double rmsd;
+    simplify_step=1;
+    score_sum_method=0;
+    double Lnorm_0=ylen;
+
+
+    //normalized by length of structure A
+    parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type);
+    d0A=d0;
+    d0_0=d0A;
+    local_d0_search = d0_search;
+    TM1 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, simplify_step,
+        score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0);
+    TM_0 = TM1;
+
+    //normalized by length of structure B
+    parameter_set4final(xlen+0.0, D0_MIN, Lnorm, d0, d0_search, mol_type);
+    d0B=d0;
+    local_d0_search = d0_search;
+    TM2 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t, u, simplify_step,
+        score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0);
+
+    double Lnorm_d0;
+    if (a_opt>0)
+    {
+        //normalized by average length of structures A, B
+        Lnorm_0=(xlen+ylen)*0.5;
+        parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type);
+        d0a=d0;
+        d0_0=d0a;
+        local_d0_search = d0_search;
+
+        TM3 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0,
+            simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm,
+            score_d8, d0);
+        TM_0=TM3;
+    }
+    if (u_opt)
+    {
+        //normalized by user assigned length
+        parameter_set4final(Lnorm_ass, D0_MIN, Lnorm,
+            d0, d0_search, mol_type);
+        d0u=d0;
+        d0_0=d0u;
+        Lnorm_0=Lnorm_ass;
+        local_d0_search = d0_search;
+        TM4 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0,
+            simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm,
+            score_d8, d0);
+        TM_0=TM4;
+    }
+    if (d_opt)
+    {
+        //scaled by user assigned d0
+        parameter_set4scale(ylen, d0_scale, Lnorm, d0, d0_search);
+        d0_out=d0_scale;
+        d0_0=d0_scale;
+        //Lnorm_0=ylen;
+        Lnorm_d0=Lnorm_0;
+        local_d0_search = d0_search;
+        TM5 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0,
+            simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm,
+            score_d8, d0);
+        TM_0=TM5;
+    }
+
+    /* derive alignment from superposition */
+    int ali_len=xlen+ylen; //maximum length of alignment
+    seqxA.assign(ali_len,'-');
+    seqM.assign( ali_len,' ');
+    seqyA.assign(ali_len,'-');
+
+    //do_rotation(xa, xt, xlen, t, u);
+    do_rotation(xa, xt, xlen, t0, u0);
+
+    // Walk the n_ali8 kept pairs in order; residues between two kept pairs
+    // are emitted against gaps, first those of x, then those of y.
+    int kk=0, i_old=0, j_old=0;
+    d=0;
+    for(int k=0; k<n_ali8; k++)
+    {
+        for(int i=i_old; i<m1[k]; i++)
+        {
+            //align x to gap
+            seqxA[kk]=seqx[i];
+            seqyA[kk]='-';
+            seqM[kk]=' ';
+            kk++;
+        }
+
+        for(int j=j_old; j<m2[k]; j++)
+        {
+            //align y to gap
+            seqxA[kk]='-';
+            seqyA[kk]=seqy[j];
+            seqM[kk]=' ';
+            kk++;
+        }
+
+        seqxA[kk]=seqx[m1[k]];
+        seqyA[kk]=seqy[m2[k]];
+        Liden+=(seqxA[kk]==seqyA[kk]);
+        d=sqrt(dist(&xt[m1[k]][0], &ya[m2[k]][0]));
+        if(d<d0_out) seqM[kk]=':'; // pair closer than d0_out
+        else         seqM[kk]='.';
+        kk++;
+        i_old=m1[k]+1;
+        j_old=m2[k]+1;
+    }
+
+    //tail
+    for(int i=i_old; i<xlen; i++)
+    {
+        //align x to gap
+        seqxA[kk]=seqx[i];
+        seqyA[kk]='-';
+        seqM[kk]=' ';
+        kk++;
+    }
+    for(int j=j_old; j<ylen; j++)
+    {
+        //align y to gap
+        seqxA[kk]='-';
+        seqyA[kk]=seqy[j];
+        seqM[kk]=' ';
+        kk++;
+    }
+    // trim the three strings to the number of columns actually written
+    seqxA=seqxA.substr(0,kk);
+    seqyA=seqyA.substr(0,kk);
+    seqM =seqM.substr(0,kk);
+
+    /* free memory */
+    clean_up_after_approx_TM(invmap0, invmap, score, path, val,
+        xtm, ytm, xt, r1, r2, xlen, minlen);
+    delete [] m1;
+    delete [] m2;
+    return 0; // zero for no exception
+}
+
+void MMalign_dimer(double & total_score,
+    const vector<vector<vector<double> > >&xa_vec,
+    const vector<vector<vector<double> > >&ya_vec,
+    const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec,
+    const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec,
+    const vector<int> &mol_vec1, const vector<int> &mol_vec2,
+    const vector<int> &xlen_vec, const vector<int> &ylen_vec,
+    double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy,
+    int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat,
+    vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat,
+    int *assign1_list, int *assign2_list, vector<string>&sequence,
+    double d0_scale, bool fast_opt)
+{
+    int i,j;
+    
int xlen=0; + int ylen=0; + vector<int> xlen_dimer; + vector<int> ylen_dimer; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + xlen+=xlen_vec[i]; + ylen+=ylen_vec[j]; + xlen_dimer.push_back(xlen_vec[i]); + ylen_dimer.push_back(ylen_vec[j]); + } + if (xlen<=3 || ylen<=3) return; + + bool **mask; // mask out inter-chain region + NewArray(&mask, xlen+1, ylen+1); + for (i=0;i<xlen+1;i++) for (j=0;j<ylen+1;j++) mask[i][j]=false; + for (i=0;i<xlen_dimer[0]+1;i++) mask[i][0]=true; + for (j=0;j<ylen_dimer[0]+1;j++) mask[0][j]=true; + int c,prev_xlen,prev_ylen; + prev_xlen=1; + prev_ylen=1; + for (c=0;c<xlen_dimer.size();c++) + { + for (i=prev_xlen;i<prev_xlen+xlen_dimer[c];i++) + for (j=prev_ylen;j<prev_ylen+ylen_dimer[c];j++) mask[i][j]=true; + prev_xlen+=xlen_dimer[c]; + prev_ylen+=ylen_dimer[c]; + } + vector<int>().swap(xlen_dimer); + vector<int>().swap(ylen_dimer); + + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + + int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + double Lnorm_ass=len_aa+len_na; + + TMalign_dimer_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, 
TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, mask, sequence, Lnorm_ass, d0_scale, + 1, false, true, false, fast_opt, mol_type, -1); + + /* clean up TM-align */ + delete [] seqx; + delete [] seqy; + delete [] secx; + delete [] secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + DeleteArray(&mask,xlen+1); + + /* re-compute chain level alignment */ + total_score=0; + for (i=0;i<chain1_num;i++) + { + xlen=xlen_vec[i]; + if (xlen<3) + { + for (j=0;j<chain2_num;j++) TMave_mat[i][j]=-1; + continue; + } + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + double **xt; + NewArray(&xt, xlen, 3); + do_rotation(xa, xt, xlen, t0, u0); + + for (j=0;j<chain2_num;j++) + { + if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment + { + TMave_mat[i][j]=-1; + continue; + } + + ylen=ylen_vec[j]; + if (ylen<3) + { + TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + d0_out=5.0; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + rmsd0 = 0.0; + Liden=0; + int *invmap = new int[ylen+1]; + + double Lnorm_ass=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; + + /* entry function for structure alignment */ + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, false, 2, false, mol_vec1[i]+mol_vec2[j], 1, invmap); + + /* print result */ + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + + TMave_mat[i][j]=TM4*Lnorm_ass; + if (assign1_list[i]==j) + { + if (TM4<=0) assign1_list[i]=assign2_list[j]=-1; + else total_score+=TMave_mat[i][j]; + } + + /* clean up */ + seqM.clear(); + seqxA.clear(); + 
seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + delete[]invmap; + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + DeleteArray(&xt,xlen); + } + return; +} + +void MMalign_cross(double & max_total_score, const int max_iter, + const vector<vector<vector<double> > >&xa_vec, + const vector<vector<vector<double> > >&ya_vec, + const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, + const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, + const vector<int> &mol_vec1, const vector<int> &mol_vec2, + const vector<int> &xlen_vec, const vector<int> &ylen_vec, + double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, + int len_aa, int len_na, int chain1_num, int chain2_num, double **TMave_mat, + vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat, + int *assign1_list, int *assign2_list, vector<string>&sequence, + double d0_scale, bool fast_opt) +{ + /* tmp assignment */ + int *assign1_tmp, *assign2_tmp; + assign1_tmp=new int[chain1_num]; + assign2_tmp=new int[chain2_num]; + double **TMave_tmp; + NewArray(&TMave_tmp,chain1_num,chain2_num); + vector<string> tmp_str_vec(chain2_num,""); + vector<vector<string> >seqxA_tmp(chain1_num,tmp_str_vec); + vector<vector<string> >seqyA_tmp(chain1_num,tmp_str_vec); + vector<string> sequence_tmp; + copy_chain_assign_data(chain1_num, chain2_num, sequence_tmp, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, TMave_tmp); + + double total_score=MMalign_search(xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_tmp, seqxA_tmp, seqyA_tmp, assign1_tmp, assign2_tmp, sequence_tmp, + d0_scale, fast_opt, 1); + if (total_score>max_total_score) + { + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_tmp, seqyA_tmp, assign1_tmp, 
assign2_tmp, TMave_tmp, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + max_total_score=total_score; + } + + if (max_iter) MMalign_iter( + max_total_score, max_iter, xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, fast_opt); + + /* clean up everything */ + delete [] assign1_tmp; + delete [] assign2_tmp; + DeleteArray(&TMave_tmp,chain1_num); + vector<string>().swap(tmp_str_vec); + vector<vector<string> >().swap(seqxA_tmp); + vector<vector<string> >().swap(seqyA_tmp); + vector<string>().swap(sequence_tmp); + return; +} + +/* return the number of chains that are trimmed */ +int trimComplex(vector<vector<vector<double> > >&a_trim_vec, + vector<vector<char> >&seq_trim_vec, vector<vector<char> >&sec_trim_vec, + vector<int>&len_trim_vec, + const vector<vector<vector<double> > >&a_vec, + const vector<vector<char> >&seq_vec, const vector<vector<char> >&sec_vec, + const vector<int> &len_vec, const vector<int> &mol_vec, + const int Lchain_aa_max, const int Lchain_na_max) +{ + int trim_chain_count=0; + int chain_num=a_vec.size(); + int i,j; + int r1,r2; + double dinter; + double dinter_min; + vector<pair<double,int> >dinter_vec; + vector<bool> include_vec; + vector<char> seq_empty; + vector<vector<double> > a_empty; + vector<double> xcoor(3,0); + vector<double> ycoor(3,0); + int xlen,ylen; + int Lchain_max; + double expand=2; + for (i=0;i<chain_num;i++) + { + xlen=len_vec[i]; + if (mol_vec[i]>0) Lchain_max=Lchain_na_max*expand; + else Lchain_max=Lchain_aa_max*expand; + if (Lchain_max<3) Lchain_max=3; + if (xlen<=Lchain_max || xlen<=3) + { + a_trim_vec.push_back(a_vec[i]); + seq_trim_vec.push_back(seq_vec[i]); + sec_trim_vec.push_back(sec_vec[i]); + len_trim_vec.push_back(xlen); + continue; + } + trim_chain_count++; + for (r1=0;r1<xlen;r1++) + { + 
xcoor[0]=a_vec[i][r1][0]; + xcoor[1]=a_vec[i][r1][1]; + xcoor[2]=a_vec[i][r1][2]; + dinter_min=FLT_MAX; + for (j=0;j<chain_num;j++) + { + if (i==j) continue; + ylen=len_vec[j]; + for (r2=0;r2<ylen;r2++) + { + ycoor[0]=a_vec[j][r2][0]; + ycoor[1]=a_vec[j][r2][1]; + ycoor[2]=a_vec[j][r2][2]; + dinter=(xcoor[0]-ycoor[0])*(xcoor[0]-ycoor[0])+ + (xcoor[1]-ycoor[1])*(xcoor[1]-ycoor[1])+ + (xcoor[2]-ycoor[2])*(xcoor[2]-ycoor[2]); + if (dinter<dinter_min) dinter_min=dinter; + } + } + dinter_vec.push_back(make_pair(dinter,r1)); + } + sort(dinter_vec.begin(),dinter_vec.end()); + include_vec.assign(xlen,false); + for (r1=0;r1<Lchain_max;r1++) + include_vec[dinter_vec[r1].second]=true; + dinter_vec.clear(); + + a_trim_vec.push_back(a_empty); + seq_trim_vec.push_back(seq_empty); + sec_trim_vec.push_back(seq_empty); + len_trim_vec.push_back(Lchain_max); + for (r1=0;r1<xlen;r1++) + { + if (include_vec[r1]==false) continue; + a_trim_vec[i].push_back(a_vec[i][r1]); + seq_trim_vec[i].push_back(seq_vec[i][r1]); + sec_trim_vec[i].push_back(sec_vec[i][r1]); + } + include_vec.clear(); + } + vector<pair<double,int> >().swap(dinter_vec); + vector<bool>().swap(include_vec); + vector<double> ().swap(xcoor); + vector<double> ().swap(ycoor); + return trim_chain_count; +} + +void writeTrimComplex(vector<vector<vector<double> > >&a_trim_vec, + vector<vector<char> >&seq_trim_vec, vector<int>&len_trim_vec, + vector<string>&chainID_list, vector<int>&mol_vec, + const string &atom_opt, string filename) +{ + int c,r; + int a=0; + string chainID; + string atom; + ofstream fp(filename.c_str()); + for (c=0;c<chainID_list.size();c++) + { + chainID=chainID_list[c]; + if (chainID.size()==1) chainID=" "+chainID; + else if (chainID.size()>2) chainID=chainID.substr(chainID.size()-2,2); + if (chainID[0]==':') chainID=" "+chainID.substr(1); + atom=atom_opt; + if (atom_opt=="auto") + { + if (mol_vec[c]>0) atom=" C3'"; + else atom=" CA "; + } + + for (r=0;r<len_trim_vec[c];r++) + fp<<"ATOM 
"<<resetiosflags(ios::right)<<setw(5)<<++a<<' ' + <<atom<<' '<<AAmap(seq_trim_vec[c][r])<<chainID + <<setw(4)<<r+1<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<a_trim_vec[c][r][0] + <<setw(8)<<a_trim_vec[c][r][1] + <<setw(8)<<a_trim_vec[c][r][2]<<endl; + } + fp.close(); + atom.clear(); + chainID.clear(); + return; +} + +void output_dock_rotation_matrix(const char* fname_matrix, + const vector<string>&xname_vec, const vector<string>&yname_vec, + double ** ut_mat, int *assign1_list) +{ + fstream fout; + fout.open(fname_matrix, ios::out | ios::trunc); + if (fout)// succeed + { + int i,k; + for (i=0;i<xname_vec.size();i++) + { + if (assign1_list[i]<0) continue; + fout << "------ The rotation matrix to rotate " + <<xname_vec[i]<<" to "<<yname_vec[i]<<" ------\n" + << "m t[m] u[m][0] u[m][1] u[m][2]\n"; + for (k = 0; k < 3; k++) + fout<<k<<setiosflags(ios::fixed)<<setprecision(10) + <<' '<<setw(18)<<ut_mat[i][9+k] + <<' '<<setw(14)<<ut_mat[i][3*k+0] + <<' '<<setw(14)<<ut_mat[i][3*k+1] + <<' '<<setw(14)<<ut_mat[i][3*k+2]<<'\n'; + } + fout << "\nCode for rotating Structure 1 from (x,y,z) to (X,Y,Z):\n" + "for(i=0; i<L; i++)\n" + "{\n" + " X[i] = t[0] + u[0][0]*x[i] + u[0][1]*y[i] + u[0][2]*z[i];\n" + " Y[i] = t[1] + u[1][0]*x[i] + u[1][1]*y[i] + u[1][2]*z[i];\n" + " Z[i] = t[2] + u[2][0]*x[i] + u[2][1]*y[i] + u[2][2]*z[i];\n" + "}"<<endl; + fout.close(); + } + else + cout << "Open file to output rotation matrix fail.\n"; +} diff --git a/modules/bindings/src/tmalign/NW.h b/modules/bindings/src/USalign/NW.h similarity index 80% rename from modules/bindings/src/tmalign/NW.h rename to modules/bindings/src/USalign/NW.h index 4c9984853..66e7e94f7 100644 --- a/modules/bindings/src/tmalign/NW.h +++ b/modules/bindings/src/USalign/NW.h @@ -259,6 +259,100 @@ void NWDP_SE(bool **path, double **val, double **x, double **y, } } +void NWDP_SE(bool **path, double **val, double **x, double **y, + int len1, int len2, double d02, double gap_open, int j2i[], + const int 
hinge) +{ + if (hinge==0) + { + NWDP_SE(path, val, x, y, len1, len2, d02, gap_open, j2i); + return; + } + int i, j; + double h, v, d; + + int L=(len2>len1)?len2:len1; + int int_min=L*(gap_open-1); + + for (i=0; i<=len1; i++) + { + for (j=0; j<=len2; j++) + { + val[i][j]=0; + path[i][j]=false; + } + } + + /* fill in old j2i */ + int k=0; + for (j=0; j<len2; j++) + { + i=j2i[j]; + if (i<0) continue; + path[i+1][j+1]=true; + val[i+1][j+1]=0; + } + + double dij; + + //decide matrix and path + for(i=1; i<=len1; i++) + { + for(j=1; j<=len2; j++) + { + dij=0; + if (path[i][j]==false) dij=dist(&x[i-1][0], &y[j-1][0]); + d=val[i-1][j-1] + 1.0/(1+dij/d02); + + //symbol insertion in horizontal (= a gap in vertical) + h=val[i-1][j]; + if(path[i-1][j]) h += gap_open; //aligned in last position + + //symbol insertion in vertical + v=val[i][j-1]; + if(path[i][j-1]) v += gap_open; //aligned in last position + + + if(d>=h && d>=v && val[i][j]==0) + { + path[i][j]=true; //from diagonal + val[i][j]=d; + } + else + { + path[i][j]=false; //from horizontal + if(v>=h) val[i][j]=v; + else val[i][j]=h; + } + } //for i + } //for j + + //trace back to extract the alignment + for (j=0;j<=len2;j++) j2i[j]=-1; + i=len1; + j=len2; + while(i>0 && j>0) + { + if(path[i][j]) //from diagonal + { + j2i[j-1]=i-1; + i--; + j--; + } + else + { + h=val[i-1][j]; + if(path[i-1][j]) h +=gap_open; + + v=val[i][j-1]; + if(path[i][j-1]) v +=gap_open; + + if(v>=h) j--; + else i--; + } + } +} + /* +ss * Input: secondary structure secx, secy, and gap_open * Output: j2i[1:len2] \in {1:len1} U {-1} diff --git a/modules/bindings/src/tmalign/NWalign.cpp b/modules/bindings/src/USalign/NWalign.cpp similarity index 100% rename from modules/bindings/src/tmalign/NWalign.cpp rename to modules/bindings/src/USalign/NWalign.cpp diff --git a/modules/bindings/src/tmalign/NWalign.h b/modules/bindings/src/USalign/NWalign.h similarity index 72% rename from modules/bindings/src/tmalign/NWalign.h rename to 
modules/bindings/src/USalign/NWalign.h index 2c7e36a11..7d6856b98 100644 --- a/modules/bindings/src/tmalign/NWalign.h +++ b/modules/bindings/src/USalign/NWalign.h @@ -502,7 +502,7 @@ void output_NWalign_results( printf(">%s%s\tL=%d\tseqID=%.3f\n", yname.c_str(), chainID2, ylen, Liden/ylen); printf("%s\n", seqyA); - printf("# Lali=%d\tseqID_ali=%.3f\n", L_ali, Liden/L_ali); + printf("#score=%d\tLali=%d\tseqID_ali=%.3f\n", aln_score, L_ali, Liden/L_ali); printf("$$$$\n"); } else if (outfmt_opt==2) @@ -515,4 +515,200 @@ void output_NWalign_results( cout << endl; } +/* extract pairwise sequence alignment from residue index vectors, + * assuming that "sequence" contains two empty strings. + * return length of alignment, including gap. */ +int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, + const vector<string> resi_vec1, const vector<string> resi_vec2, + const int byresi_opt) +{ + sequence.clear(); + sequence.push_back(""); + sequence.push_back(""); + + int i1=0; // positions in resi_vec1 + int i2=0; // positions in resi_vec2 + int xlen=resi_vec1.size(); + int ylen=resi_vec2.size(); + if (byresi_opt==4 || byresi_opt==5) // global or glocal sequence alignment + { + int *invmap; + int glocal=0; + if (byresi_opt==5) glocal=2; + int mol_type=0; + for (i1=0;i1<xlen;i1++) + if ('a'<seqx[i1] && seqx[i1]<'z') mol_type++; + else mol_type--; + for (i2=0;i2<ylen;i2++) + if ('a'<seqx[i2] && seqx[i2]<'z') mol_type++; + else mol_type--; + NWalign_main(seqx, seqy, xlen, ylen, sequence[0],sequence[1], + mol_type, invmap, 0, glocal); + } + + + map<string,string> chainID_map1; + map<string,string> chainID_map2; + if (byresi_opt==3) + { + vector<string> chainID_vec; + string chainID; + stringstream ss; + int i; + for (i=0;i<xlen;i++) + { + chainID=resi_vec1[i].substr(5); + if (!chainID_vec.size()|| chainID_vec.back()!=chainID) + { + chainID_vec.push_back(chainID); + ss<<chainID_vec.size(); + chainID_map1[chainID]=ss.str(); + ss.str(""); + } + } + 
chainID_vec.clear(); + for (i=0;i<ylen;i++) + { + chainID=resi_vec2[i].substr(5); + if (!chainID_vec.size()|| chainID_vec.back()!=chainID) + { + chainID_vec.push_back(chainID); + ss<<chainID_vec.size(); + chainID_map2[chainID]=ss.str(); + ss.str(""); + } + } + vector<string>().swap(chainID_vec); + } + string chainID1=""; + string chainID2=""; + string chainID1_prev=""; + string chainID2_prev=""; + while(i1<xlen && i2<ylen) + { + if (byresi_opt==2) + { + chainID1=resi_vec1[i1].substr(5); + chainID2=resi_vec2[i2].substr(5); + } + else if (byresi_opt==3) + { + chainID1=chainID_map1[resi_vec1[i1].substr(5)]; + chainID2=chainID_map2[resi_vec2[i2].substr(5)]; + } + + if (chainID1==chainID2) + { + if (atoi(resi_vec1[i1].substr(0,4).c_str())< + atoi(resi_vec2[i2].substr(0,4).c_str())) + { + sequence[0]+=seqx[i1++]; + sequence[1]+='-'; + } + else if (atoi(resi_vec1[i1].substr(0,4).c_str())> + atoi(resi_vec2[i2].substr(0,4).c_str())) + { + sequence[0]+='-'; + sequence[1]+=seqy[i2++]; + } + else + { + sequence[0]+=seqx[i1++]; + sequence[1]+=seqy[i2++]; + } + chainID1_prev=chainID1; + chainID2_prev=chainID2; + } + else + { + if (chainID1_prev==chainID1 && chainID2_prev!=chainID2) + { + sequence[0]+=seqx[i1++]; + sequence[1]+='-'; + chainID1_prev=chainID1; + } + else if (chainID1_prev!=chainID1 && chainID2_prev==chainID2) + { + sequence[0]+='-'; + sequence[1]+=seqy[i2++]; + chainID2_prev=chainID2; + } + else + { + sequence[0]+=seqx[i1++]; + sequence[1]+=seqy[i2++]; + chainID1_prev=chainID1; + chainID2_prev=chainID2; + } + } + + } + map<string,string>().swap(chainID_map1); + map<string,string>().swap(chainID_map2); + chainID1.clear(); + chainID2.clear(); + chainID1_prev.clear(); + chainID2_prev.clear(); + return sequence[0].size(); +} + +/* extract pairwise sequence alignment from residue index vectors, + * return length of alignment, including gap. 
*/ +int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, + const vector<string> resi_vec1, const vector<string> resi_vec2, + const vector<int> xlen_vec, const vector<int> ylen_vec, + const int chain_i, const int chain_j) +{ + sequence.clear(); + sequence.push_back(""); + sequence.push_back(""); + + int i1=0; // positions in resi_vec1 + int i2=0; // positions in resi_vec2 + int xlen=xlen_vec[chain_i]; + int ylen=ylen_vec[chain_j]; + int i,j; + for (i=0;i<chain_i;i++) i1+=xlen_vec[i]; + for (j=0;j<chain_j;j++) i2+=ylen_vec[j]; + + i=j=0; + while(i<xlen && j<ylen) + { + if (atoi(resi_vec1[i+i1].substr(0,4).c_str())< + atoi(resi_vec2[j+i2].substr(0,4).c_str())) + { + sequence[0]+=seqx[i++]; + sequence[1]+='-'; + } + else if (atoi(resi_vec1[i+i1].substr(0,4).c_str())> + atoi(resi_vec2[j+i2].substr(0,4).c_str())) + { + sequence[0]+='-'; + sequence[1]+=seqy[j++]; + } + else + { + sequence[0]+=seqx[i++]; + sequence[1]+=seqy[j++]; + } + } + if (i<xlen && j==ylen) + { + for (i;i<xlen;i++) + { + sequence[0]+=seqx[i]; + sequence[1]+='-'; + } + } + else if (i==xlen && j<ylen) + { + for (j;j<ylen;j++) + { + sequence[0]+='-'; + sequence[1]+=seqy[j]; + } + } + return sequence[0].size(); +} + #endif diff --git a/modules/bindings/src/USalign/OST_INFO b/modules/bindings/src/USalign/OST_INFO new file mode 100644 index 000000000..42124da83 --- /dev/null +++ b/modules/bindings/src/USalign/OST_INFO @@ -0,0 +1,6 @@ +Source code has been cloned May 4 2023 from: + +https://github.com/pylelab/USalign + +last commit: +8d968e0111ca275958f209d76b1cd10598864a34 diff --git a/modules/bindings/src/tmalign/PDB1.pdb b/modules/bindings/src/USalign/PDB1.pdb similarity index 100% rename from modules/bindings/src/tmalign/PDB1.pdb rename to modules/bindings/src/USalign/PDB1.pdb diff --git a/modules/bindings/src/tmalign/PDB2.pdb b/modules/bindings/src/USalign/PDB2.pdb similarity index 100% rename from modules/bindings/src/tmalign/PDB2.pdb rename to 
modules/bindings/src/USalign/PDB2.pdb diff --git a/modules/bindings/src/USalign/SOIalign.h b/modules/bindings/src/USalign/SOIalign.h new file mode 100644 index 000000000..716afbaf8 --- /dev/null +++ b/modules/bindings/src/USalign/SOIalign.h @@ -0,0 +1,959 @@ +#ifndef SOIalign_h +#define SOIalign_h 1 + +#include "TMalign.h" + +void print_invmap(int *invmap, const int ylen) +{ + int i,j; + for (j=0;j<ylen;j++) + { + i=invmap[j]; + if (i>=0) cout<<" ("<<i<<","<<j<<")"; + } + cout<<endl; +} + +void assign_sec_bond(int **secx_bond, const char *secx, const int xlen) +{ + int i,j; + int starti=-1; + int endi=-1; + char ss; + char prev_ss=0; + for (i=0; i<xlen; i++) + { + ss=secx[i]; + secx_bond[i][0]=secx_bond[i][1]=-1; + if (ss!=prev_ss && !(ss=='C' && prev_ss=='T') + && !(ss=='T' && prev_ss=='C')) + { + if (starti>=0) // previous SSE end + { + endi=i; + for (j=starti;j<endi;j++) + { + secx_bond[j][0]=starti; + secx_bond[j][1]=endi; + } + } + if (ss=='H' || ss=='E' || ss=='<' || ss=='>') starti=i; + else starti=-1; + } + prev_ss=secx[i]; + } + if (starti>=0) // previous SSE end + { + endi=i; + for (j=starti;j<endi;j++) + { + secx_bond[j][0]=starti; + secx_bond[j][1]=endi; + } + } + for (i=0;i<xlen;i++) if (secx_bond[i][1]-secx_bond[i][0]==1) + secx_bond[i][0]=secx_bond[i][1]=-1; +} + +void getCloseK(double **xa, const int xlen, const int closeK_opt, double **xk) +{ + double **score; + NewArray(&score, xlen+1, xlen+1); + vector<pair<double,int> > close_idx_vec(xlen, make_pair(0,0)); + int i,j,k; + for (i=0;i<xlen;i++) + { + score[i+1][i+1]=0; + for (j=i+1;j<xlen;j++) score[j+1][i+1]=score[i+1][j+1]=dist(xa[i], xa[j]); + } + for (i=0;i<xlen;i++) + { + for (j=0;j<xlen;j++) + { + close_idx_vec[j].first=score[i+1][j+1]; + close_idx_vec[j].second=j; + } + sort(close_idx_vec.begin(), close_idx_vec.end()); + for (k=0;k<closeK_opt;k++) + { + j=close_idx_vec[k % xlen].second; + xk[i*closeK_opt+k][0]=xa[j][0]; + xk[i*closeK_opt+k][1]=xa[j][1]; + xk[i*closeK_opt+k][2]=xa[j][2]; + } 
+ } + + /* clean up */ + vector<pair<double,int> >().swap(close_idx_vec); + DeleteArray(&score, xlen+1); +} + +/* check if pairing i to j conform to sequantiality within the SSE */ +inline bool sec2sq(const int i, const int j, + int **secx_bond, int **secy_bond, int *fwdmap, int *invmap) +{ + if (i<0 || j<0) return true; + int ii,jj; + if (secx_bond[i][0]>=0) + { + for (ii=secx_bond[i][0];ii<secx_bond[i][1];ii++) + { + jj=fwdmap[ii]; + if (jj>=0 && (i-ii)*(j-jj)<=0) return false; + } + } + if (secy_bond[j][0]>=0) + { + for (jj=secy_bond[j][0];jj<secy_bond[j][1];jj++) + { + ii=invmap[jj]; + if (ii>=0 && (i-ii)*(j-jj)<=0) return false; + } + } + return true; +} + +void soi_egs(double **score, const int xlen, const int ylen, int *invmap, + int **secx_bond, int **secy_bond, const int mm_opt) +{ + int i,j; + int *fwdmap=new int[xlen]; // j=fwdmap[i]; + for (i=0; i<xlen; i++) fwdmap[i]=-1; + for (j=0; j<ylen; j++) + { + i=invmap[j]; + if (i>=0) fwdmap[i]=j; + } + + /* stage 1 - make initial assignment, starting from the highest score pair */ + double max_score; + int maxi,maxj; + while(1) + { + max_score=0; + maxi=maxj=-1; + for (i=0;i<xlen;i++) + { + if (fwdmap[i]>=0) continue; + for (j=0;j<ylen;j++) + { + if (invmap[j]>=0 || score[i+1][j+1]<=max_score) continue; + if (mm_opt==6 && !sec2sq(i,j,secx_bond,secy_bond, + fwdmap,invmap)) continue; + maxi=i; + maxj=j; + max_score=score[i+1][j+1]; + } + } + if (maxi<0) break; // no assignment; + invmap[maxj]=maxi; + fwdmap[maxi]=maxj; + } + + double total_score=0; + for (j=0;j<ylen;j++) + { + i=invmap[j]; + if (i>=0) total_score+=score[i+1][j+1]; + } + + /* stage 2 - swap assignment until total score cannot be improved */ + int iter; + int oldi,oldj; + double delta_score; + for (iter=0; iter<getmin(xlen,ylen)*5; iter++) + { + //cout<<"total_score="<<total_score<<".iter="<<iter<<endl; + //print_invmap(invmap,ylen); + delta_score=-1; + for (i=0;i<xlen;i++) + { + oldj=fwdmap[i]; + for (j=0;j<ylen;j++) + { + oldi=invmap[j]; + if 
(score[i+1][j+1]<=0 || oldi==i) continue; + if (mm_opt==6 && (!sec2sq(i,j,secx_bond,secy_bond,fwdmap,invmap) || + !sec2sq(oldi,oldj,secx_bond,secy_bond,fwdmap,invmap))) + continue; + delta_score=score[i+1][j+1]; + if (oldi>=0 && oldj>=0) delta_score+=score[oldi+1][oldj+1]; + if (oldi>=0) delta_score-=score[oldi+1][j+1]; + if (oldj>=0) delta_score-=score[i+1][oldj+1]; + + if (delta_score>0) // successful swap + { + fwdmap[i]=j; + if (oldi>=0) fwdmap[oldi]=oldj; + invmap[j]=i; + if (oldj>=0) invmap[oldj]=oldi; + total_score+=delta_score; + break; + } + } + } + if (delta_score<=0) break; // cannot make further swap + } + + /* clean up */ + delete[]fwdmap; +} + +/* entry function for se + * u_opt corresponds to option -L + * if u_opt==2, use d0 from Lnorm_ass for alignment + * */ +int soi_se_main( + double **xa, double **ya, const char *seqx, const char *seqy, + double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const double Lnorm_ass, const double d0_scale, const bool i_opt, + const bool a_opt, const int u_opt, const bool d_opt, const int mol_type, + const int outfmt_opt, int *invmap, double *dist_list, + int **secx_bond, int **secy_bond, const int mm_opt) +{ + double D0_MIN; //for d0 + double Lnorm; //normalization length + double score_d8,d0,d0_search,dcu0;//for TMscore search + double **score; // score for aligning a residue pair + bool **path; // for dynamic programming + double **val; // for dynamic programming + + int *m1=NULL; + int *m2=NULL; + int i,j; + double d; + if (outfmt_opt<2) + { + m1=new int[xlen]; //alignd index in x + m2=new int[ylen]; //alignd index in y + } + + /***********************/ + /* allocate memory */ + /***********************/ + 
NewArray(&score, xlen+1, ylen+1); + NewArray(&path, xlen+1, ylen+1); + NewArray(&val, xlen+1, ylen+1); + //int *invmap = new int[ylen+1]; + + /* set d0 */ + parameter_set4search(xlen, ylen, D0_MIN, Lnorm, + score_d8, d0, d0_search, dcu0); // set score_d8 + parameter_set4final(xlen, D0_MIN, Lnorm, + d0B, d0_search, mol_type); // set d0B + parameter_set4final(ylen, D0_MIN, Lnorm, + d0A, d0_search, mol_type); // set d0A + if (a_opt) + parameter_set4final((xlen+ylen)*0.5, D0_MIN, Lnorm, + d0a, d0_search, mol_type); // set d0a + if (u_opt) + { + parameter_set4final(Lnorm_ass, D0_MIN, Lnorm, + d0u, d0_search, mol_type); // set d0u + if (u_opt==2) + { + parameter_set4search(Lnorm_ass, Lnorm_ass, D0_MIN, Lnorm, + score_d8, d0, d0_search, dcu0); // set score_d8 + } + } + + /* perform alignment */ + for(j=0; j<ylen; j++) invmap[j]=-1; + double d02=d0*d0; + double score_d82=score_d8*score_d8; + double d2; + for(i=0; i<xlen; i++) + { + for(j=0; j<ylen; j++) + { + d2=dist(xa[i], ya[j]); + if (d2>score_d82) score[i+1][j+1]=0; + else score[i+1][j+1]=1./(1+ d2/d02); + } + } + if (mm_opt==6) NWDP_TM(score, path, val, xlen, ylen, -0.6, invmap); + soi_egs(score, xlen, ylen, invmap, secx_bond, secy_bond, mm_opt); + + rmsd0=TM1=TM2=TM3=TM4=TM5=0; + int k=0; + n_ali=0; + n_ali8=0; + for(j=0; j<ylen; j++) + { + i=invmap[j]; + dist_list[j]=-1; + if(i>=0)//aligned + { + n_ali++; + d=sqrt(dist(&xa[i][0], &ya[j][0])); + dist_list[j]=d; + if (score[i+1][j+1]>0) + { + if (outfmt_opt<2) + { + m1[k]=i; + m2[k]=j; + } + k++; + TM2+=1/(1+(d/d0B)*(d/d0B)); // chain_1 + TM1+=1/(1+(d/d0A)*(d/d0A)); // chain_2 + if (a_opt) TM3+=1/(1+(d/d0a)*(d/d0a)); // -a + if (u_opt) TM4+=1/(1+(d/d0u)*(d/d0u)); // -u + if (d_opt) TM5+=1/(1+(d/d0_scale)*(d/d0_scale)); // -d + rmsd0+=d*d; + } + } + } + n_ali8=k; + TM2/=xlen; + TM1/=ylen; + TM3/=(xlen+ylen)*0.5; + TM4/=Lnorm_ass; + TM5/=ylen; + if (n_ali8) rmsd0=sqrt(rmsd0/n_ali8); + + if (outfmt_opt>=2) + { + DeleteArray(&score, xlen+1); + return 0; + } + + /* extract 
aligned sequence */ + int ali_len=xlen+ylen; + for (j=0;j<ylen;j++) ali_len-=(invmap[j]>=0); + seqxA.assign(ali_len,'-'); + seqM.assign( ali_len,' '); + seqyA.assign(ali_len,'-'); + + int *fwdmap = new int [xlen+1]; + for (i=0;i<xlen;i++) fwdmap[i]=-1; + + for (j=0;j<ylen;j++) + { + seqyA[j]=seqy[j]; + i=invmap[j]; + if (i<0) continue; + if (sqrt(dist(xa[i], ya[j]))<d0_out) seqM[j]=':'; + else seqM[j]='.'; + fwdmap[i]=j; + seqxA[j]=seqx[i]; + Liden+=(seqxA[k]==seqyA[k]); + } + k=0; + for (i=0;i<xlen;i++) + { + j=fwdmap[i]; + if (j>=0) continue; + seqxA[ylen+k]=seqx[i]; + k++; + } + + /* free memory */ + delete [] fwdmap; + delete [] m1; + delete [] m2; + DeleteArray(&score, xlen+1); + DeleteArray(&path, xlen+1); + DeleteArray(&val, xlen+1); + return 0; // zero for no exception +} + +inline void SOI_super2score(double **xt, double **ya, const int xlen, + const int ylen, double **score, double d0, double score_d8) +{ + int i,j; + double d02=d0*d0; + double score_d82=score_d8*score_d8; + double d2; + for (i=0; i<xlen; i++) + { + for(j=0; j<ylen; j++) + { + d2=dist(xt[i], ya[j]); + if (d2>score_d82) score[i+1][j+1]=0; + else score[i+1][j+1]=1./(1+ d2/d02); + } + } +} + +//heuristic run of dynamic programing iteratively to find the best alignment +//input: initial rotation matrix t, u +// vectors x and y, d0 +//output: best alignment that maximizes the TMscore, will be stored in invmap +double SOI_iter(double **r1, double **r2, double **xtm, double **ytm, + double **xt, double **score, bool **path, double **val, double **xa, double **ya, + int xlen, int ylen, double t[3], double u[3][3], int *invmap0, + int iteration_max, double local_d0_search, + double Lnorm, double d0, double score_d8, + int **secx_bond, int **secy_bond, const int mm_opt, const bool init_invmap=false) +{ + double rmsd; + int *invmap=new int[ylen+1]; + + int iteration, i, j, k; + double tmscore, tmscore_max, tmscore_old=0; + tmscore_max=-1; + + //double d01=d0+1.5; + double d02=d0*d0; + double 
score_d82=score_d8*score_d8; + double d2; + for (iteration=0; iteration<iteration_max; iteration++) + { + if (iteration==0 && init_invmap) + for (j=0;j<ylen;j++) invmap[j]=invmap0[j]; + else + { + for (j=0; j<ylen; j++) invmap[j]=-1; + if (mm_opt==6) NWDP_TM(score, path, val, xlen, ylen, -0.6, invmap); + } + soi_egs(score, xlen, ylen, invmap, secx_bond, secy_bond, mm_opt); + + k=0; + for (j=0; j<ylen; j++) + { + i=invmap[j]; + if (i<0) continue; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + k++; + } + + tmscore = TMscore8_search(r1, r2, xtm, ytm, xt, k, t, u, + 40, 8, &rmsd, local_d0_search, Lnorm, score_d8, d0); + + if (tmscore>tmscore_max) + { + tmscore_max=tmscore; + for (j=0; j<ylen; j++) invmap0[j]=invmap[j]; + } + + if (iteration>0 && fabs(tmscore_old-tmscore)<0.000001) break; + tmscore_old=tmscore; + do_rotation(xa, xt, xlen, t, u); + SOI_super2score(xt, ya, xlen, ylen, score, d0, score_d8); + }// for iteration + + delete []invmap; + return tmscore_max; +} + +void get_SOI_initial_assign(double **xk, double **yk, const int closeK_opt, + double **score, bool **path, double **val, const int xlen, const int ylen, + double t[3], double u[3][3], int invmap[], + double local_d0_search, double d0, double score_d8, + int **secx_bond, int **secy_bond, const int mm_opt) +{ + int i,j,k; + double **xfrag; + double **xtran; + double **yfrag; + NewArray(&xfrag, closeK_opt, 3); + NewArray(&xtran, closeK_opt, 3); + NewArray(&yfrag, closeK_opt, 3); + double rmsd; + double d02=d0*d0; + double score_d82=score_d8*score_d8; + double d2; + + /* fill in score */ + for (i=0;i<xlen;i++) + { + for (k=0;k<closeK_opt;k++) + { + xfrag[k][0]=xk[i*closeK_opt+k][0]; + xfrag[k][1]=xk[i*closeK_opt+k][1]; + xfrag[k][2]=xk[i*closeK_opt+k][2]; + } + + for (j=0;j<ylen;j++) + { + for (k=0;k<closeK_opt;k++) + { + yfrag[k][0]=yk[j*closeK_opt+k][0]; + yfrag[k][1]=yk[j*closeK_opt+k][1]; + 
yfrag[k][2]=yk[j*closeK_opt+k][2]; + } + Kabsch(xfrag, yfrag, closeK_opt, 1, &rmsd, t, u); + do_rotation(xfrag, xtran, closeK_opt, t, u); + + //for (k=0; k<closeK_opt; k++) + //{ + //d2=dist(xtran[k], yfrag[k]); + //if (d2>score_d82) score[i+1][j+1]=0; + //else score[i+1][j+1]=1./(1+d2/d02); + //} + k=closeK_opt-1; + d2=dist(xtran[k], yfrag[k]); + if (d2>score_d82) score[i+1][j+1]=0; + else score[i+1][j+1]=1./(1+d2/d02); + } + } + + /* initial assignment */ + for (j=0;j<ylen;j++) invmap[j]=-1; + if (mm_opt==6) NWDP_TM(score, path, val, xlen, ylen, -0.6, invmap); + for (j=0; j<ylen;j++) i=invmap[j]; + soi_egs(score, xlen, ylen, invmap, secx_bond, secy_bond, mm_opt); + + /* clean up */ + DeleteArray(&xfrag, closeK_opt); + DeleteArray(&xtran, closeK_opt); + DeleteArray(&yfrag, closeK_opt); +} + +void SOI_assign2super(double **r1, double **r2, double **xtm, double **ytm, + double **xt, double **xa, double **ya, + const int xlen, const int ylen, double t[3], double u[3][3], int invmap[], + double local_d0_search, double Lnorm, double d0, double score_d8) +{ + int i,j,k; + double rmsd; + double d02=d0*d0; + double score_d82=score_d8*score_d8; + double d2; + + k=0; + for (j=0; j<ylen; j++) + { + i=invmap[j]; + if (i<0) continue; + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + k++; + } + TMscore8_search(r1, r2, xtm, ytm, xt, k, t, u, + 40, 8, &rmsd, local_d0_search, Lnorm, score_d8, d0); + do_rotation(xa, xt, xlen, t, u); +} + +/* entry function for TM-align with circular permutation + * i_opt, a_opt, u_opt, d_opt, TMcut are not implemented yet */ +int SOIalign_main(double **xa, double **ya, + double **xk, double **yk, const int closeK_opt, + const char *seqx, const char *seqy, const char *secx, const char *secy, + double t0[3], double u0[3][3], + double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, 
double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, int *invmap, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const vector<string> sequence, const double Lnorm_ass, + const double d0_scale, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, double *dist_list, + int **secx_bond, int **secy_bond, const int mm_opt) +{ + double D0_MIN; //for d0 + double Lnorm; //normalization length + double score_d8,d0,d0_search,dcu0;//for TMscore search + double t[3], u[3][3]; //Kabsch translation vector and rotation matrix + double **score; // Input score table for enhanced greedy search + double **scoret; // Transposed score table for enhanced greedy search + bool **path; // for dynamic programming + double **val; // for dynamic programming + double **xtm, **ytm; // for TMscore search engine + double **xt; //for saving the superposed version of r_1 or xtm + double **yt; //for saving the superposed version of r_2 or ytm + double **r1, **r2; // for Kabsch rotation + + /***********************/ + /* allocate memory */ + /***********************/ + int minlen = min(xlen, ylen); + int maxlen = (xlen>ylen)?xlen:ylen; + NewArray(&score, xlen+1, ylen+1); + NewArray(&scoret, ylen+1, xlen+1); + NewArray(&path, maxlen+1, maxlen+1); + NewArray(&val, maxlen+1, maxlen+1); + NewArray(&xtm, minlen, 3); + NewArray(&ytm, minlen, 3); + NewArray(&xt, xlen, 3); + NewArray(&yt, ylen, 3); + NewArray(&r1, minlen, 3); + NewArray(&r2, minlen, 3); + + /***********************/ + /* parameter set */ + /***********************/ + parameter_set4search(xlen, ylen, D0_MIN, Lnorm, + score_d8, d0, d0_search, dcu0); + int simplify_step = 40; //for simplified search engine + int score_sum_method = 8; //for scoring method, whether only sum over pairs with dis<score_d8 + + int i,j; + int *fwdmap0 = new int[xlen+1]; + int *invmap0 = new 
int[ylen+1]; + + double TMmax=-1, TM=-1; + for(i=0; i<xlen; i++) fwdmap0[i]=-1; + for(j=0; j<ylen; j++) invmap0[j]=-1; + double local_d0_search = d0_search; + int iteration_max=(fast_opt)?2:30; + //if (mm_opt==6) iteration_max=1; + + /*************************************************************/ + /* initial alignment with sequence order dependent alignment */ + /*************************************************************/ + CPalign_main( + xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, fast_opt, + mol_type,-1); + if (mm_opt==6) + { + i=0; + j=0; + for (int r=0;r<seqxA.size();r++) + { + if (seqxA[r]=='*') // circular permutation point + { + for (int jj=0;jj<j;jj++) if (invmap0[jj]>=0) + invmap0[jj]+=xlen - i; + i=0; + continue; + } + if (seqyA[r]!='-') + { + if (seqxA[r]!='-') invmap0[j]=i; + j++; + } + if (seqxA[r]!='-') i++; + } + for (j=0;j<ylen;j++) + { + i=invmap0[j]; + if (i>=0) fwdmap0[i]=j; + } + } + do_rotation(xa, xt, xlen, t0, u0); + SOI_super2score(xt, ya, xlen, ylen, score, d0, score_d8); + for (i=0;i<xlen;i++) for (j=0;j<ylen;j++) scoret[j+1][i+1]=score[i+1][j+1]; + TMmax=SOI_iter(r1, r2, xtm, ytm, xt, score, path, val, xa, ya, + xlen, ylen, t0, u0, invmap0, iteration_max, + local_d0_search, Lnorm, d0, score_d8, secx_bond, secy_bond, mm_opt, true); + TM =SOI_iter(r2, r1, ytm, xtm, yt,scoret, path, val, ya, xa, + ylen, xlen, t0, u0, fwdmap0, iteration_max, + local_d0_search, Lnorm, d0, score_d8, secy_bond, secx_bond, mm_opt, true); + //cout<<"TM2="<<TM2<<"\tTM1="<<TM1<<"\tTMmax="<<TMmax<<"\tTM="<<TM<<endl; + if (TM>TMmax) + { + TMmax = TM; + for (j=0; j<ylen; j++) invmap0[j]=-1; + for (i=0; i<xlen; i++) + { + j=fwdmap0[i]; + if (j>=0) invmap0[j]=i; + } + } + + /***************************************************************/ + /* initial 
alignment with sequence order independent alignment */ + /***************************************************************/ + if (closeK_opt>=3) + { + get_SOI_initial_assign(xk, yk, closeK_opt, score, path, val, + xlen, ylen, t, u, invmap, local_d0_search, d0, score_d8, + secx_bond, secy_bond, mm_opt); + for (i=0;i<xlen;i++) for (j=0;j<ylen;j++) scoret[j+1][i+1]=score[i+1][j+1]; + + SOI_assign2super(r1, r2, xtm, ytm, xt, xa, ya, + xlen, ylen, t, u, invmap, local_d0_search, Lnorm, d0, score_d8); + TM=SOI_iter(r1, r2, xtm, ytm, xt, score, path, val, xa, ya, + xlen, ylen, t, u, invmap, iteration_max, + local_d0_search, Lnorm, d0, score_d8, secx_bond, secy_bond, mm_opt); + if (TM>TMmax) + { + TMmax = TM; + for (j = 0; j<ylen; j++) invmap0[j] = invmap[j]; + } + + for (i=0;i<xlen;i++) fwdmap0[i]=-1; + if (mm_opt==6) NWDP_TM(scoret, path, val, ylen, xlen, -0.6, fwdmap0); + soi_egs(scoret, ylen, xlen, fwdmap0, secy_bond, secx_bond, mm_opt); + SOI_assign2super(r2, r1, ytm, xtm, yt, ya, xa, + ylen, xlen, t, u, fwdmap0, local_d0_search, Lnorm, d0, score_d8); + TM=SOI_iter(r2, r1, ytm, xtm, yt, scoret, path, val, ya, xa, ylen, xlen, t, u, + fwdmap0, iteration_max, local_d0_search, Lnorm, d0, score_d8,secy_bond, secx_bond, mm_opt); + if (TM>TMmax) + { + TMmax = TM; + for (j=0; j<ylen; j++) invmap0[j]=-1; + for (i=0; i<xlen; i++) + { + j=fwdmap0[i]; + if (j>=0) invmap0[j]=i; + } + } + } + + //*******************************************************************// + // The alignment will not be changed any more in the following // + //*******************************************************************// + //check if the initial alignment is generated appropriately + bool flag=false; + for (i=0; i<xlen; i++) fwdmap0[i]=-1; + for (j=0; j<ylen; j++) + { + i=invmap0[j]; + invmap[j]=i; + if (i>=0) + { + fwdmap0[i]=j; + flag=true; + } + } + if(!flag) + { + cout << "There is no alignment between the two structures! " + << "Program stop with no result!" 
<< endl; + TM1=TM2=TM3=TM4=TM5=0; + /* free every buffer allocated above; this early return used to leak them (m1/m2 are not allocated yet at this point) */ + DeleteArray(&score, xlen+1); + DeleteArray(&scoret,ylen+1); + DeleteArray(&path,maxlen+1); + DeleteArray(&val, maxlen+1); + DeleteArray(&xtm, minlen); + DeleteArray(&ytm, minlen); + DeleteArray(&xt,xlen); + DeleteArray(&yt,ylen); + DeleteArray(&r1, minlen); + DeleteArray(&r2, minlen); + delete[]invmap0; + delete[]fwdmap0; + return 1; + } + + + //********************************************************************// + // Detailed TMscore search engine --> prepare for final TMscore // + //********************************************************************// + //run detailed TMscore search engine for the best alignment, and + //extract the best rotation matrix (t, u) for the best alignment + simplify_step=1; + if (fast_opt) simplify_step=40; + score_sum_method=8; + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, xlen, ylen, + invmap0, t, u, simplify_step, score_sum_method, local_d0_search, + false, Lnorm, score_d8, d0); + + double rmsd; + simplify_step=1; + score_sum_method=0; + double Lnorm_0=ylen; + + //select pairs with dis<d8 for final TMscore computation and output alignment + int k=0; + int *m1, *m2; + double d; + m1=new int[xlen]; //aligned index in x + m2=new int[ylen]; //aligned index in y + copy_t_u(t, u, t0, u0); + + //****************************************// + // Final TMscore 1 // + //****************************************// + + do_rotation(xa, xt, xlen, t, u); + k=0; + n_ali=0; + for (i=0; i<xlen; i++) + { + j=fwdmap0[i]; + if(j>=0)//aligned + { + n_ali++; + d=sqrt(dist(&xt[i][0], &ya[j][0])); + if (d <= score_d8) + { + m1[k]=i; + m2[k]=j; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + + r1[k][0] = xt[i][0]; + r1[k][1] = xt[i][1]; + r1[k][2] = xt[i][2]; + r2[k][0] = ya[j][0]; + r2[k][1] = ya[j][1]; + r2[k][2] = ya[j][2]; + + k++; + } + else fwdmap0[i]=-1; + } + } + n_ali8=k; + + Kabsch(r1, r2, n_ali8, 0, &rmsd0, t, u);// rmsd0 is used for final output, only recalculate rmsd0, not t & u + rmsd0 = sqrt(rmsd0 / n_ali8); + + //normalized by length of structure A + parameter_set4final(xlen+0.0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0B=d0; + local_d0_search = d0_search; + TM2 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t, u,
simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + + //****************************************// + // Final TMscore 2 // + //****************************************// + + do_rotation(xa, xt, xlen, t0, u0); + k=0; + for (j=0; j<ylen; j++) + { + i=invmap0[j]; + if(i>=0)//aligned + { + d=sqrt(dist(&xt[i][0], &ya[j][0])); + if (d <= score_d8) + { + m1[k]=i; + m2[k]=j; + + xtm[k][0]=xa[i][0]; + xtm[k][1]=xa[i][1]; + xtm[k][2]=xa[i][2]; + + ytm[k][0]=ya[j][0]; + ytm[k][1]=ya[j][1]; + ytm[k][2]=ya[j][2]; + + r1[k][0] = xt[i][0]; + r1[k][1] = xt[i][1]; + r1[k][2] = xt[i][2]; + r2[k][0] = ya[j][0]; + r2[k][1] = ya[j][1]; + r2[k][2] = ya[j][2]; + + k++; + } + else invmap[j]=invmap0[j]=-1; + } + } + + //normalized by length of structure B + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0A=d0; + d0_0=d0A; + local_d0_search = d0_search; + TM1 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, simplify_step, + score_sum_method, &rmsd, local_d0_search, Lnorm, score_d8, d0); + TM_0 = TM1; + + if (a_opt>0) + { + //normalized by average length of structures A, B + Lnorm_0=(xlen+ylen)*0.5; + parameter_set4final(Lnorm_0, D0_MIN, Lnorm, d0, d0_search, mol_type); + d0a=d0; + d0_0=d0a; + local_d0_search = d0_search; + + TM3 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM3; + } + if (u_opt) + { + //normalized by user assigned length + parameter_set4final(Lnorm_ass, D0_MIN, Lnorm, + d0, d0_search, mol_type); + d0u=d0; + d0_0=d0u; + Lnorm_0=Lnorm_ass; + local_d0_search = d0_search; + TM4 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM4; + } + if (d_opt) + { + //scaled by user assigned d0 + parameter_set4scale(ylen, d0_scale, Lnorm, d0, d0_search); + d0_out=d0_scale; + d0_0=d0_scale; + //Lnorm_0=ylen; + local_d0_search = 
d0_search; + TM5 = TMscore8_search(r1, r2, xtm, ytm, xt, n_ali8, t0, u0, + simplify_step, score_sum_method, &rmsd, local_d0_search, Lnorm, + score_d8, d0); + TM_0=TM5; + } + + /* derive alignment from superposition */ + int ali_len=xlen+ylen; + for (j=0;j<ylen;j++) ali_len-=(invmap0[j]>=0); + seqxA.assign(ali_len,'-'); + seqM.assign( ali_len,' '); + seqyA.assign(ali_len,'-'); + + //do_rotation(xa, xt, xlen, t, u); + do_rotation(xa, xt, xlen, t0, u0); + + Liden=0; + //double SO=0; + for (j=0;j<ylen;j++) + { + seqyA[j]=seqy[j]; + i=invmap0[j]; + dist_list[j]=-1; + if (i<0) continue; + d=sqrt(dist(xt[i], ya[j])); + if (d<d0_out) seqM[j]=':'; + else seqM[j]='.'; + dist_list[j]=d; + //SO+=(d<3.5); + seqxA[j]=seqx[i]; + Liden+=(seqx[i]==seqy[j]); + } + //SO/=getmin(xlen,ylen); + k=0; + for (i=0;i<xlen;i++) + { + j=fwdmap0[i]; + if (j>=0) continue; + seqxA[ylen+k]=seqx[i]; + k++; + } + //cout<<n_ali8<<'\t' + //<<rmsd0<<'\t' + //<<100.*SO<<endl; + + + /* clean up */ + DeleteArray(&score, xlen+1); + DeleteArray(&scoret,ylen+1); + DeleteArray(&path,maxlen+1); + DeleteArray(&val, maxlen+1); + DeleteArray(&xtm, minlen); + DeleteArray(&ytm, minlen); + DeleteArray(&xt,xlen); + DeleteArray(&yt,ylen); + DeleteArray(&r1, minlen); + DeleteArray(&r2, minlen); + delete[]invmap0; + delete[]fwdmap0; + delete[]m1; + delete[]m2; + return 0; +} +#endif diff --git a/modules/bindings/src/tmalign/TMalign.cpp b/modules/bindings/src/USalign/TMalign.cpp similarity index 93% rename from modules/bindings/src/tmalign/TMalign.cpp rename to modules/bindings/src/USalign/TMalign.cpp index 7ea33e1a7..c822d4c30 100644 --- a/modules/bindings/src/tmalign/TMalign.cpp +++ b/modules/bindings/src/USalign/TMalign.cpp @@ -9,7 +9,7 @@ void print_version() cout << "\n" " **********************************************************************\n" -" * TM-align (Version 20210520): protein and RNA structure alignment *\n" +" * TM-align (Version 20220623): protein and RNA structure alignment *\n" " * References: Y 
Zhang, J Skolnick. Nucl Acids Res 33, 2302-9 (2005) *\n" " * S Gong, C Zhang, Y Zhang. Bioinformatics, bz282 (2019) *\n" " * Please email comments and suggestions to yangzhanglab@umich.edu *\n" @@ -67,7 +67,7 @@ void print_extra_help() " -1: full output, but without version or citation information\n" "\n" " -byresi Whether to assume residue index correspondence between the\n" -" two structures.\n" +" two structures. The same as -TMscore.\n" " 0: (default) sequence independent alignment\n" " 1: (same as TMscore program) sequence-dependent superposition,\n" " i.e. align by residue index\n" @@ -75,6 +75,11 @@ void print_extra_help() " align by residue index and chain ID\n" " 3: (similar to TMscore -c, should be used with -ter <=1)\n" " align by residue index and order of chain\n" +//" 4: sequence dependent alignment: perform Needleman-Wunsch\n" +//" global sequence alignment, followed by TM-score superposition\n" +" 5: sequence dependent alignment: perform glocal sequence\n" +" alignment followed by TM-score superposition.\n" +" -byresi 5 is the same as -seq\n" "\n" " -TMcut -1: (default) do not consider TMcut\n" " Values in [0.5,1): Do not proceed with TM-align for this\n" @@ -308,10 +313,15 @@ int main(int argc, char *argv[]) { TMcut=atof(argv[i + 1]); i++; } - else if ( !strcmp(argv[i],"-byresi") && i < (argc-1) ) + else if ((!strcmp(argv[i],"-byresi") || !strcmp(argv[i],"-tmscore") || + !strcmp(argv[i],"-TMscore")) && i < (argc-1) ) { byresi_opt=atoi(argv[i + 1]); i++; } + else if ( !strcmp(argv[i],"-seq") ) + { + byresi_opt=5; + } else if ( !strcmp(argv[i],"-cp") ) { cp_opt=1; @@ -374,10 +384,10 @@ int main(int argc, char *argv[]) { if (i_opt) PrintErrorAndQuit("-byresi >=1 cannot be used with -i or -I"); - if (byresi_opt<0 || byresi_opt>3) - PrintErrorAndQuit("-byresi can only be 0, 1, 2 or 3"); - if (byresi_opt>=2 && ter_opt>=2) - PrintErrorAndQuit("-byresi >=2 should be used with -ter <=1"); + if (byresi_opt<0 || byresi_opt>5) + PrintErrorAndQuit("-byresi can
only be 0, 1, 2, 3, 4, or 5"); + if (byresi_opt>=2 && byresi_opt<=3 && ter_opt>=2) + PrintErrorAndQuit("-byresi 2 and -byresi 3 should be used with -ter <=1"); } if (split_opt==1 && ter_opt!=0) PrintErrorAndQuit("-split 1 should be used with -ter 0"); @@ -566,9 +576,9 @@ int main(int argc, char *argv[]) n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, - (m_opt?fname_matrix+chainID_list1[chain_i]:"").c_str(), + (m_opt?fname_matrix:"").c_str(), outfmt_opt, ter_opt, 0, split_opt, o_opt, - (o_opt?fname_super+chainID_list1[chain_i]:"").c_str(), + (o_opt?fname_super:"").c_str(), i_opt, a_opt, u_opt, d_opt,mirror_opt, resi_vec1, resi_vec2 ); @@ -618,6 +628,6 @@ int main(int argc, char *argv[]) t2 = clock(); float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; - printf("Total CPU time is %5.2f seconds\n", diff); + printf("#Total CPU time is %5.2f seconds\n", diff); return 0; } diff --git a/modules/bindings/src/tmalign/TMalign.h b/modules/bindings/src/USalign/TMalign.h similarity index 88% rename from modules/bindings/src/tmalign/TMalign.h rename to modules/bindings/src/USalign/TMalign.h index 9187ad3cb..81196a807 100644 --- a/modules/bindings/src/tmalign/TMalign.h +++ b/modules/bindings/src/USalign/TMalign.h @@ -1,9 +1,12 @@ /* Functions for the core TMalign algorithm, including the entry function * TMalign_main */ +#ifndef TMalign_h +#define TMalign_h 1 #include "param_set.h" #include "NW.h" #include "Kabsch.h" +#include "NWalign.h" // 1, collect those residues with dis<d; // 2, calculate TMscore @@ -540,6 +543,10 @@ double get_score_fast( double **r1, double **r2, double **xtm, double **ytm, //second iteration double d002t=d002; + vector<double> dis_vec(dis, dis+n_ali); + sort(dis_vec.begin(), dis_vec.end()); + if (d002t<dis_vec[2]) d002t=dis_vec[2]; + dis_vec.clear(); while(1) { j=0; @@ -577,7 +584,10 @@ double get_score_fast( double **r1, double **r2, double **xtm, double **ytm, //third iteration d002t=d002+1; - + vector<double> 
dis_vec(dis, dis+n_ali); + sort(dis_vec.begin(), dis_vec.end()); + if (d002t<dis_vec[2]) d002t=dis_vec[2]; + dis_vec.clear(); while(1) { j=0; @@ -852,6 +862,11 @@ void make_sec(char *seq, double **x, int len, char *sec,const string atom_opt) if (i>0 && j+1<len && bp[i-1][j+1]) continue; if (!bp[i+1][j-1]) continue; sec_str(len,seq, bp, i,j,ii,jj); + if (jj<i || j<ii) + { + ii=i; + jj=j; + } A0.push_back(i); B0.push_back(j); C0.push_back(ii); @@ -1467,11 +1482,14 @@ void output_pymol(const string xname, const string yname, { int compress_type=0; // uncompressed file ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else redi::ipstream fin_gz; // if file is compressed if (xname.size()>=3 && xname.substr(xname.size()-3,3)==".gz") { - fin_gz.open("zcat "+xname); + fin_gz.open("gunzip -c "+xname); compress_type=1; } else if (xname.size()>=4 && @@ -1480,7 +1498,9 @@ void output_pymol(const string xname, const string yname, fin_gz.open("bzcat "+xname); compress_type=2; } - else fin.open(xname.c_str()); + else +#endif + fin.open(xname.c_str()); stringstream buf; stringstream buf_pymol; @@ -1534,7 +1554,7 @@ void output_pymol(const string xname, const string yname, if (line.compare(0,11,"_atom_site.")) continue; _atom_site.clear(); atom_site_pos=0; - _atom_site[line.substr(11,line.size()-12)]=atom_site_pos; + _atom_site[Trim(line.substr(11))]=atom_site_pos; while(1) { while(1) @@ -1552,7 +1572,7 @@ void output_pymol(const string xname, const string yname, if (line.size()) break; } if (line.compare(0,11,"_atom_site.")) break; - _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; + _atom_site[Trim(line.substr(11))]=++atom_site_pos; buf<<line<<'\n'; } @@ -2438,30 +2458,37 @@ void output_rasmol(const string xname, const string yname, void output_rotation_matrix(const char* fname_matrix, const double t[3], const double u[3][3]) { - fstream fout; - fout.open(fname_matrix, ios::out | ios::trunc); - if (fout)// succeed - { - fout << "------ The rotation matrix 
to rotate Structure_1 to Structure_2 ------\n"; - char dest[1000]; - sprintf(dest, "m %18s %14s %14s %14s\n", "t[m]", "u[m][0]", "u[m][1]", "u[m][2]"); - fout << string(dest); - for (int k = 0; k < 3; k++) - { - sprintf(dest, "%d %18.10f %14.10f %14.10f %14.10f\n", k, t[k], u[k][0], u[k][1], u[k][2]); - fout << string(dest); - } - fout << "\nCode for rotating Structure 1 from (x,y,z) to (X,Y,Z):\n" - "for(i=0; i<L; i++)\n" - "{\n" - " X[i] = t[0] + u[0][0]*x[i] + u[0][1]*y[i] + u[0][2]*z[i];\n" - " Y[i] = t[1] + u[1][0]*x[i] + u[1][1]*y[i] + u[1][2]*z[i];\n" - " Z[i] = t[2] + u[2][0]*x[i] + u[2][1]*y[i] + u[2][2]*z[i];\n" - "}\n"; - fout.close(); - } + stringstream ss; + ss << "------ The rotation matrix to rotate Structure_1 to Structure_2 ------\n"; + char dest[1000]; + sprintf(dest, "m %18s %14s %14s %14s\n", "t[m]", "u[m][0]", "u[m][1]", "u[m][2]"); + ss << string(dest); + for (int k = 0; k < 3; k++) + { + sprintf(dest, "%d %18.10f %14.10f %14.10f %14.10f\n", k, t[k], u[k][0], u[k][1], u[k][2]); + ss << string(dest); + } + ss << "\nCode for rotating Structure 1 from (x,y,z) to (X,Y,Z):\n" + "for(i=0; i<L; i++)\n" + "{\n" + " X[i] = t[0] + u[0][0]*x[i] + u[0][1]*y[i] + u[0][2]*z[i];\n" + " Y[i] = t[1] + u[1][0]*x[i] + u[1][1]*y[i] + u[1][2]*z[i];\n" + " Z[i] = t[2] + u[2][0]*x[i] + u[2][1]*y[i] + u[2][2]*z[i];\n" + "}\n"; + if (strcmp(fname_matrix,(char *)("-"))==0) + cout<<ss.str(); else - cout << "Open file to output rotation matrix fail.\n"; + { + fstream fout; + fout.open(fname_matrix, ios::out | ios::trunc); + if (fout) + { + fout<<ss.str(); + fout.close(); + } + else cout << "Open file to output rotation matrix fail.\n"; + } + ss.str(string()); } //output the final results @@ -2560,6 +2587,82 @@ void output_results(const string xname, const string yname, xlen, ylen, d0A, n_ali8, rmsd, TM1, Liden); } +void output_mTMalign_results(const string xname, const string yname, + const string chainID1, const string chainID2, + const int xlen, const int ylen, double 
t[3], double u[3][3], + const double TM1, const double TM2, + const double TM3, const double TM4, const double TM5, + const double rmsd, const double d0_out, const char *seqM, + const char *seqxA, const char *seqyA, const double Liden, + const int n_ali8, const int L_ali, const double TM_ali, + const double rmsd_ali, const double TM_0, const double d0_0, + const double d0A, const double d0B, const double Lnorm_ass, + const double d0_scale, const double d0a, const double d0u, + const char* fname_matrix, const int outfmt_opt, const int ter_opt, + const int mm_opt, const int split_opt, const int o_opt, + const string fname_super, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const int mirror_opt, + const vector<string>&resi_vec1, const vector<string>&resi_vec2) +{ + if (outfmt_opt<=0) + { + printf("Average aligned length= %d, RMSD= %6.2f, Seq_ID=n_identical/n_aligned= %4.3f\n", n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + printf("Average TM-score= %6.5f (normalized by length of shorter structure: L=%d, d0=%.2f)\n", TM2, xlen, d0B); + printf("Average TM-score= %6.5f (normalized by length of longer structure: L=%d, d0=%.2f)\n", TM1, ylen, d0A); + + if (a_opt==1) + printf("Average TM-score= %6.5f (if normalized by average length of two structures: L=%.1f, d0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + if (u_opt) + printf("Average TM-score= %6.5f (normalized by average L=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); + if (d_opt) + printf("Average TM-score= %6.5f (scaled by user-specified d0=%.2f, and L=%d)\n", TM5, d0_scale, ylen); + + //output alignment + printf("In the following, seqID=n_identical/L.\n\n%s\n", seqM); + } + else if (outfmt_opt==1) + { + printf("%s\n", seqM); + + printf("# Lali=%d\tRMSD=%.2f\tseqID_ali=%.3f\n", + n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + + if (i_opt) + printf("# User-specified initial alignment: TM=%.5lf\tLali=%4d\trmsd=%.3lf\n", TM_ali, L_ali, rmsd_ali); + + if(a_opt) + printf("# TM-score=%.5f (normalized by average 
length of two structures: L=%.1f\td0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + + if(u_opt) + printf("# TM-score=%.5f (normalized by average L=%.2f\td0=%.2f)\n", TM4, Lnorm_ass, d0u); + + if(d_opt) + printf("# TM-score=%.5f (scaled by user-specified d0=%.2f\tL=%d)\n", TM5, d0_scale, ylen); + + printf("$$$$\n"); + } + else if (outfmt_opt==2) + { + printf("%s%s\t%s%s\t%.4f\t%.4f\t%.2f\t%4.3f\t%4.3f\t%4.3f\t%d\t%d\t%d", + xname.c_str(), chainID1.c_str(), yname.c_str(), chainID2.c_str(), + TM2, TM1, rmsd, Liden/xlen, Liden/ylen, (n_ali8>0)?Liden/n_ali8:0, + xlen, ylen, n_ali8); + } + cout << endl; + + if (strlen(fname_matrix)) output_rotation_matrix(fname_matrix, t, u); + + if (o_opt==1) + output_pymol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2); + else if (o_opt==2) + output_rasmol(xname, yname, fname_super, t, u, ter_opt, + mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2, + xlen, ylen, d0A, n_ali8, rmsd, TM1, Liden); +} + double standard_TMscore(double **r1, double **r2, double **xtm, double **ytm, double **xt, double **x, double **y, int xlen, int ylen, int invmap[], int& L_ali, double& RMSD, double D0_MIN, double Lnorm, double d0, @@ -2757,7 +2860,6 @@ int TMalign_main(double **xa, double **ya, // get initial alignment from user's input: // // Stick to the initial alignment // //************************************************// - bool bAlignStick = false; if (i_opt==3)// if input has set parameter for "-I" { // In the original code, this loop starts from 1, which is @@ -2798,13 +2900,12 @@ int TMalign_main(double **xa, double **ya, TMmax = TM; for (i = 0; i<ylen; i++) invmap0[i] = invmap[i]; } - bAlignStick = true; } /******************************************************/ /* get initial alignment with gapless threading */ /******************************************************/ - if (!bAlignStick) + if (i_opt<=1) { 
get_initial(r1, r2, xtm, ytm, xa, ya, xlen, ylen, invmap0, d0, d0_search, fast_opt, t, u); @@ -3007,60 +3108,60 @@ int TMalign_main(double **xa, double **ya, return 6; } } + } - //************************************************// - // get initial alignment from user's input: // - //************************************************// - if (i_opt==1)// if input has set parameter for "-i" - { - for (int j = 0; j < ylen; j++)// Set aligned position to be "-1" - invmap[j] = -1; + //************************************************// + // get initial alignment from user's input: // + //************************************************// + if (i_opt>=1 && i_opt<=2)// if input has set parameter for "-i" + { + for (int j = 0; j < ylen; j++)// Set aligned position to be "-1" + invmap[j] = -1; - int i1 = -1;// in C version, index starts from zero, not from one - int i2 = -1; - int L1 = sequence[0].size(); - int L2 = sequence[1].size(); - int L = min(L1, L2);// Get positions for aligned residues - for (int kk1 = 0; kk1 < L; kk1++) + int i1 = -1;// in C version, index starts from zero, not from one + int i2 = -1; + int L1 = sequence[0].size(); + int L2 = sequence[1].size(); + int L = min(L1, L2);// Get positions for aligned residues + for (int kk1 = 0; kk1 < L; kk1++) + { + if (sequence[0][kk1] != '-') + i1++; + if (sequence[1][kk1] != '-') { - if (sequence[0][kk1] != '-') - i1++; - if (sequence[1][kk1] != '-') - { - i2++; - if (i2 >= ylen || i1 >= xlen) kk1 = L; - else if (sequence[0][kk1] != '-') invmap[i2] = i1; - } + i2++; + if (i2 >= ylen || i1 >= xlen) kk1 = L; + else if (sequence[0][kk1] != '-') invmap[i2] = i1; } + } - //--------------- 2. 
Align proteins from original alignment - double prevD0_MIN = D0_MIN;// stored for later use - int prevLnorm = Lnorm; - double prevd0 = d0; - TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, - xlen, ylen, invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, - d0_search, score_d8, t, u, mol_type); - D0_MIN = prevD0_MIN; - Lnorm = prevLnorm; - d0 = prevd0; + //--------------- 2. Align proteins from original alignment + double prevD0_MIN = D0_MIN;// stored for later use + int prevLnorm = Lnorm; + double prevd0 = d0; + TM_ali = standard_TMscore(r1, r2, xtm, ytm, xt, xa, ya, + xlen, ylen, invmap, L_ali, rmsd_ali, D0_MIN, Lnorm, d0, + d0_search, score_d8, t, u, mol_type); + D0_MIN = prevD0_MIN; + Lnorm = prevLnorm; + d0 = prevd0; - TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, - xlen, ylen, invmap, t, u, 40, 8, local_d0_search, true, Lnorm, - score_d8, d0); - if (TM > TMmax) - { - TMmax = TM; - for (i = 0; i<ylen; i++) invmap0[i] = invmap[i]; - } - // Different from get_initial, get_initial_ss and get_initial_ssplus - TM = DP_iter(r1, r2, xtm, ytm, xt, path, val, xa, ya, - xlen, ylen, t, u, invmap, 0, 2, (fast_opt)?2:30, - local_d0_search, D0_MIN, Lnorm, d0, score_d8); - if (TM>TMmax) - { - TMmax = TM; - for (i = 0; i<ylen; i++) invmap0[i] = invmap[i]; - } + TM = detailed_search_standard(r1, r2, xtm, ytm, xt, xa, ya, + xlen, ylen, invmap, t, u, 40, 8, local_d0_search, true, Lnorm, + score_d8, d0); + if (TM > TMmax) + { + TMmax = TM; + for (i = 0; i<ylen; i++) invmap0[i] = invmap[i]; + } + // Different from get_initial, get_initial_ss and get_initial_ssplus + TM = DP_iter(r1, r2, xtm, ytm, xt, path, val, xa, ya, + xlen, ylen, t, u, invmap, 0, 2, (fast_opt)?2:30, + local_d0_search, D0_MIN, Lnorm, d0, score_d8); + if (TM>TMmax) + { + TMmax = TM; + for (i = 0; i<ylen; i++) invmap0[i] = invmap[i]; } } @@ -3081,7 +3182,7 @@ int TMalign_main(double **xa, double **ya, } if(!flag) { - cout << "There is no alignment between the two proteins! 
" + cout << "There is no alignment between the two structures! " << "Program stop with no result!" << endl; TM1=TM2=TM3=TM4=TM5=0; return 1; @@ -3240,6 +3341,8 @@ int TMalign_main(double **xa, double **ya, int kk=0, i_old=0, j_old=0; d=0; + Liden=0; + //double SO=0; for(int k=0; k<n_ali8; k++) { for(int i=i_old; i<m1[k]; i++) @@ -3266,10 +3369,16 @@ int TMalign_main(double **xa, double **ya, d=sqrt(dist(&xt[m1[k]][0], &ya[m2[k]][0])); if(d<d0_out) seqM[kk]=':'; else seqM[kk]='.'; + //SO+=(d<3.5); kk++; i_old=m1[k]+1; j_old=m2[k]+1; } + //SO/=getmin(xlen,ylen); + //cout<<n_ali8<<'\t' + //<<rmsd0<<'\t' + //<<100.*SO<<endl; + //tail for(int i=i_old; i<xlen; i++) @@ -3342,13 +3451,14 @@ int CPalign_main(double **xa, double **ya, secx_cp[2*xlen]=0; /* fTM-align alignment */ - double TM1_cp,TM2_cp; + double TM1_cp,TM2_cp,TM4_cp; + const double Lnorm_tmp=getmin(xlen,ylen); TMalign_main(xa_cp, ya, seqx_cp, seqy, secx_cp, secy, - t0, u0, TM1_cp, TM2_cp, TM3, TM4, TM5, + t0, u0, TM1_cp, TM2_cp, TM3, TM4_cp, TM5, d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA_cp, seqyA_cp, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, - xlen*2, ylen, sequence, Lnorm_ass, d0_scale, - 0, false, false, false, true, mol_type, -1); + xlen*2, ylen, sequence, Lnorm_tmp, d0_scale, + 0, false, true, false, true, mol_type, -1); /* delete gap in seqxA_cp */ r=0; @@ -3392,12 +3502,14 @@ int CPalign_main(double **xa, double **ya, t0, u0, TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, - xlen, ylen, sequence, Lnorm_ass, d0_scale, - 0, false, false, false, true, mol_type, -1); + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + 0, false, true, false, true, mol_type, -1); - /* do not use cricular permutation of number of aligned residues is not + /* do not use circular permutation of number of aligned residues is not * larger than sequence-order dependent alignment */ - if (n_ali8>cp_aln_best) cp_point=0; + 
//cout<<"cp: aln="<<cp_aln_best<<"\tTM="<<TM4_cp<<endl; + //cout<<"TM: aln="<<n_ali8<<"\tTM="<<TM4<<endl; + if (n_ali8>=cp_aln_best || TM4>=TM4_cp) cp_point=0; /* prepare structure for final alignment */ seqM.clear(); @@ -3418,6 +3530,31 @@ int CPalign_main(double **xa, double **ya, seqx_cp[xlen]=0; secx_cp[xlen]=0; + /* test another round of alignment as concatenated alignment can + * inflate the number of aligned residues and TM-score. e.g. 1yadA 2duaA */ + if (cp_point!=0) + { + TMalign_main(xa_cp, ya, seqx_cp, seqy, secx_cp, secy, + t0, u0, TM1_cp, TM2_cp, TM3, TM4_cp, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA_cp, seqyA_cp, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, cp_aln_best, + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + 0, false, true, false, true, mol_type, -1); + //cout<<"cp: aln="<<cp_aln_best<<"\tTM="<<TM4_cp<<endl; + if (n_ali8>=cp_aln_best || TM4>=TM4_cp) + { + cp_point=0; + for (r=0;r<xlen;r++) + { + xa_cp[r][0]=xa[r][0]; + xa_cp[r][1]=xa[r][1]; + xa_cp[r][2]=xa[r][2]; + seqx_cp[r]=seqx[r]; + secx_cp[r]=secx[r]; + } + } + } + /* full TM-align */ TMalign_main(xa_cp, ya, seqx_cp, seqy, secx_cp, secy, t0, u0, TM1, TM2, TM3, TM4, TM5, @@ -3459,3 +3596,46 @@ int CPalign_main(double **xa, double **ya, seqyA_cp.clear(); return cp_point; } + +bool output_cp(const string&xname, const string&yname, + const string &seqxA, const string &seqyA, const int outfmt_opt, + int &left_num, int &right_num, int &left_aln_num, int &right_aln_num) +{ + int r; + bool after_cp=false; + for (r=0;r<seqxA.size();r++) + { + if (seqxA[r]=='*') after_cp=true; + else + { + if (after_cp) + { + right_aln_num++; + right_num+=(seqxA[r]!='-'); + } + else + { + left_aln_num++; + left_num+=(seqxA[r]!='-'); + } + } + } + if (after_cp==false) + { + if (outfmt_opt<=0) cout<<"No CP"<<endl; + else if (outfmt_opt==1) cout<<"#No CP"<<endl; + else if (outfmt_opt==2) cout<<"@"<<xname<<'\t'<<yname<<'\t'<<"No CP"<<endl; + } + else + { + if (outfmt_opt<=0) cout<<"CP point in 
structure_1 alignment: "<<left_aln_num<<'/'<<right_aln_num<<'\n' + <<"CP point in structure_1: "<<left_num<<'/'<<right_num<<endl; + else if (outfmt_opt==1) + cout<<"#CP_in_aln="<<left_aln_num<<'/'<<right_aln_num + <<"\tCP_in_seq="<<left_num<<'/'<<right_num<<endl; + else if (outfmt_opt==2) cout<<"@"<<xname<<'\t'<<yname<<'\t'<<left_aln_num + <<'/'<<right_aln_num<<'\t'<<left_num<<'/'<<right_num<<endl; + } + return after_cp; +} +#endif diff --git a/modules/bindings/src/tmalign/TMscore.cpp b/modules/bindings/src/USalign/TMscore.cpp similarity index 91% rename from modules/bindings/src/tmalign/TMscore.cpp rename to modules/bindings/src/USalign/TMscore.cpp index c2ca9958a..c84d742c1 100644 --- a/modules/bindings/src/tmalign/TMscore.cpp +++ b/modules/bindings/src/USalign/TMscore.cpp @@ -34,15 +34,15 @@ void print_extra_help() " -dir Perform all-against-all alignment among the list of PDB\n" " chains listed by 'chain_list' under 'chain_folder'. Note\n" " that the slash is necessary.\n" -" $ TMalign -dir chain_folder/ chain_list\n" +" $ TMscore -dir chain_folder/ chain_list\n" "\n" " -dir1 Use chain2 to search a list of PDB chains listed by 'chain1_list'\n" " under 'chain1_folder'. Note that the slash is necessary.\n" -" $ TMalign -dir1 chain1_folder/ chain1_list chain2\n" +" $ TMscore -dir1 chain1_folder/ chain1_list chain2\n" "\n" " -dir2 Use chain1 to search a list of PDB chains listed by 'chain2_list'\n" " under 'chain2_folder'\n" -" $ TMalign chain1 -dir2 chain2_folder/ chain2_list\n" +" $ TMscore chain1 -dir2 chain2_folder/ chain2_list\n" "\n" " -suffix (Only when -dir1 and/or -dir2 are set, default is empty)\n" " add file name suffix to files listed by chain1_list or chain2_list\n" @@ -106,13 +106,21 @@ void print_help(bool h_opt=false) " 2. TM-score normalized with an assigned scale d0 e.g. 5 A:\n" " $ TMscore model.pdb native.pdb -d 5\n" "\n" -" 3. TM-score normalized by a specific length, e.g. 120 AA:\n" -" $ TMscore model.pdb native.pdv -l 120\n" +" 3. 
TM-score normalized by a specific length, e.g. 120 residues:\n" +" $ TMscore model.pdb native.pdb -l 120\n" "\n" " 4. TM-score with superposition output, e.g. 'TM_sup.pdb':\n" " $ TMscore model.pdb native.pdb -o TM_sup.pdb\n" " To view superimposed atomic model by PyMOL:\n" " $ pymol TM_sup.pdb native.pdb\n" +"\n" +" 5. By default, this program assumes that residue pair with the same\n" +" residue index accross the two structure files are equivalent. This\n" +" often requires that the residue index in the input structures are\n" +" renumbered beforehand. Alternatively, residue equivalence can be\n" +" established by sequence alignment:\n" +" $ TMscore model.pdb native.pdb -seq\n" +"\n" <<endl; if (h_opt) print_extra_help(); @@ -253,6 +261,10 @@ int main(int argc, char *argv[]) { byresi_opt=2; } + else if ( !strcmp(argv[i],"-seq") ) + { + byresi_opt=5; + } else if ( !strcmp(argv[i],"-mirror") && i < (argc-1) ) { mirror_opt=atoi(argv[i + 1]); i++; @@ -307,8 +319,8 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("Wrong value for option -d! It should be >0"); if (outfmt_opt>=2 && (a_opt || u_opt || d_opt)) PrintErrorAndQuit("-outfmt 2 cannot be used with -a, -u, -L, -d"); - if (byresi_opt>=2 && ter_opt>=2) - PrintErrorAndQuit("-byresi >=2 should be used with -ter <=1"); + if (byresi_opt>=2 && byresi_opt<=3 && ter_opt>=2) + PrintErrorAndQuit("-c should be used with -ter <=1"); if (split_opt==1 && ter_opt!=0) PrintErrorAndQuit("-split 1 should be used with -ter 0"); else if (split_opt==2 && ter_opt!=0 && ter_opt!=1) @@ -329,6 +341,11 @@ int main(int argc, char *argv[]) else if (dir2_opt.size()==0) chain2_list.push_back(yname); else file2chainlist(chain2_list, yname, dir2_opt, suffix_opt); + if (byresi_opt>=4) + cerr<<"WARNING! The residue correspondence between the two structures" + <<" are automatically established by sequence alignment. 
Results" + <<" may be unreliable."<<endl; + if (outfmt_opt==2) cout<<"#PDBchain1\tPDBchain2\tTM1\tTM2\t" <<"RMSD\tID1\tID2\tIDali\tL1\tL2\tLali"<<endl; @@ -447,6 +464,7 @@ int main(int argc, char *argv[]) int L_lt_d=0; double GDT_list[5]={0,0,0,0,0}; // 0.5, 1, 2, 4, 8 double maxsub=0; + TM1=TM2=TM3=TM4=TM5=0; /* entry function for structure alignment */ TMscore_main( @@ -473,9 +491,9 @@ int main(int argc, char *argv[]) n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, - (m_opt?fname_matrix+chainID_list1[chain_i]:"").c_str(), + (m_opt?fname_matrix:"").c_str(), outfmt_opt, ter_opt, - (o_opt?fname_super+chainID_list1[chain_i]:"").c_str(), + (o_opt?fname_super:"").c_str(), a_opt, u_opt, d_opt, mirror_opt, L_lt_d, rmsd_d0_out, GDT_list, maxsub, split_opt, resi_vec1, resi_vec2); diff --git a/modules/bindings/src/tmalign/TMscore.h b/modules/bindings/src/USalign/TMscore.h similarity index 95% rename from modules/bindings/src/tmalign/TMscore.h rename to modules/bindings/src/USalign/TMscore.h index 445335c79..90ded3c01 100644 --- a/modules/bindings/src/tmalign/TMscore.h +++ b/modules/bindings/src/USalign/TMscore.h @@ -58,7 +58,7 @@ int score_fun8( double **xa, double **ya, int n_ali, double d, int i_ali[], } } } - //there are not enough feasible pairs, reliefe the threshold + //there are not enough feasible pairs, relieve the threshold if(n_cut<3 && n_ali>3) { inc++; @@ -130,7 +130,7 @@ int score_fun8_standard(double **xa, double **ya, int n_ali, double d, } } } - //there are not enough feasible pairs, reliefe the threshold + //there are not enough feasible pairs, relieve the threshold if (n_cut<3 && n_ali>3) { inc++; @@ -309,6 +309,7 @@ double TMscore8_search(double **r1, double **r2, double **xtm, double **ytm, return score_max; } + double TMscore8_search_standard( double **r1, double **r2, double **xtm, double **ytm, double **xt, int Lali, double t0[3], double u0[3][3], int simplify_step, int score_sum_method, @@ -353,7 +354,7 @@ 
double TMscore8_search_standard( double **r1, double **r2, //find the maximum score starting from local structures superposition int i_ali[kmax], n_cut; int L_frag; //fragment length - int iL_max; //maximum starting postion for the fragment + int iL_max; //maximum starting position for the fragment for (i_init = 0; i_init<n_init; i_init++) { @@ -560,7 +561,7 @@ int TMscore_main(double **xa, double **ya, /***********************/ parameter_set4search(xlen, ylen, D0_MIN, Lnorm, score_d8, d0, d0_search, dcu0); - int simplify_step = 40; //for similified search engine + int simplify_step = 40; //for simplified search engine int score_sum_method = 8; //for scoring method, whether only sum over pairs with dis<score_d8 int i; @@ -616,7 +617,7 @@ int TMscore_main(double **xa, double **ya, //*******************************************************************// // The alignment will not be changed any more in the following // //*******************************************************************// - //check if the initial alignment is generated approriately + //check if the initial alignment is generated appropriately bool flag=false; for(i=0; i<ylen; i++) { @@ -628,8 +629,8 @@ int TMscore_main(double **xa, double **ya, } if(!flag) { - cout << "There is no alignment between the two proteins!" << endl; - cout << "Program stop with no result!" << endl; + cout << "There is no alignment between the two structures! " + << "Program stop with no result!" 
<< endl; return 1; } @@ -652,7 +653,7 @@ int TMscore_main(double **xa, double **ya, // Detailed TMscore search engine --> prepare for final TMscore // //********************************************************************// //run detailed TMscore search engine for the best alignment, and - //extract the best rotation matrix (t, u) for the best alginment + //extract the best rotation matrix (t, u) for the best alignment simplify_step=1; if (fast_opt) simplify_step=40; score_sum_method=8; diff --git a/modules/bindings/src/USalign/USalign.cpp b/modules/bindings/src/USalign/USalign.cpp new file mode 100644 index 000000000..fdd1d8b95 --- /dev/null +++ b/modules/bindings/src/USalign/USalign.cpp @@ -0,0 +1,3137 @@ +/* command line argument parsing and document of US-align main program */ + +#include "MMalign.h" +#include "SOIalign.h" +#include "flexalign.h" + +using namespace std; + +void print_version() +{ + cout << +"\n" +" ********************************************************************\n" +" * US-align (Version 20220924) *\n" +" * Universal Structure Alignment of Proteins and Nucleic Acids *\n" +" * Reference: C Zhang, M Shine, AM Pyle, Y Zhang. (2022) Nat Methods*\n" +" * Please email comments and suggestions to zhang@zhanggroup.org *\n" +" ********************************************************************" + << endl; +} + +void print_extra_help() +{ + cout << +"Additional options:\n" +" -v Print the version of US-align\n" +"\n" +" -a TM-score normalized by the average length of two structures\n" +" T or F, (default F). -a does not change the final alignment.\n" +"\n" +" -fast Fast but slightly inaccurate alignment\n" +"\n" +" -dir Perform all-against-all alignment among the list of PDB\n" +" chains listed by 'chain_list' under 'chain_folder'. Note\n" +" that the slash is necessary.\n" +" $ USalign -dir chain_folder/ chain_list\n" +"\n" +" -dir1 Use chain2 to search a list of PDB chains listed by 'chain1_list'\n" +" under 'chain1_folder'. 
Note that the slash is necessary.\n" +" $ USalign -dir1 chain1_folder/ chain1_list chain2\n" +"\n" +" -dir2 Use chain1 to search a list of PDB chains listed by 'chain2_list'\n" +" under 'chain2_folder'\n" +" $ USalign chain1 -dir2 chain2_folder/ chain2_list\n" +"\n" +" -suffix (Only when -dir1 and/or -dir2 are set, default is empty)\n" +" add file name suffix to files listed by chain1_list or chain2_list\n" +"\n" +" -atom 4-character atom name used to represent a residue.\n" +" Default is \" C3'\" for RNA/DNA and \" CA \" for proteins\n" +" (note the spaces before and after CA).\n" +"\n" +" -split Whether to split PDB file into multiple chains\n" +" 0: treat the whole structure as one single chain\n" +" 1: treat each MODEL as a separate chain\n" +" 2: (default) treat each chain as a separate chain\n" +"\n" +" -outfmt Output format\n" +" 0: (default) full output\n" +" 1: fasta format compact output\n" +" 2: tabular format very compact output\n" +" -1: full output, but without version or citation information\n" +"\n" +" -TMcut -1: (default) do not consider TMcut\n" +" Values in [0.5,1): Do not proceed with TM-align for this\n" +" structure pair if TM-score is unlikely to reach TMcut.\n" +" TMcut is normalized as set by -a option:\n" +" -2: normalized by longer structure length\n" +" -1: normalized by shorter structure length\n" +" 0: (default, same as F) normalized by second structure\n" +" 1: same as T, normalized by average structure length\n" +"\n" +" -mirror Whether to align the mirror image of input structure\n" +" 0: (default) do not align mirrored structure\n" +" 1: align mirror of Structure_1 to origin Structure_2,\n" +" which usually requires the '-het 1' option:\n" +" $ USalign 4glu.pdb 3p9w.pdb -mirror 1 -het 1\n" +"\n" +" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" +" 0: (default) only align 'ATOM ' residues\n" +" 1: align both 'ATOM ' and 'HETATM' residues\n" +" 2: align both 'ATOM ' and MSE residues\n" +"\n" +" -full 
Whether to show full pairwise alignment of individual chains for\n" +" -mm 2 or 4. T or F, (default F)\n" +//"\n" +//" -closeK Number of closest atoms used for sequence order independent\n" +//" initial alignment. default: 5\n" +//"\n" +//" -hinge Maximum number of hinge allowed in flexible alignment. default: 9\n" +"\n" +" -se Do not perform superposition. Useful for extracting alignment from\n" +" superposed structure pairs\n" +"\n" +" -infmt1 Input format for structure_11\n" +" -infmt2 Input format for structure_2\n" +" -1: (default) automatically detect PDB or PDBx/mmCIF format\n" +" 0: PDB format\n" +" 1: SPICKER format\n" +//" 2: xyz format\n" +" 3: PDBx/mmCIF format\n" +"\n" +"Advanced usage 1 (generate an image for a pair of superposed structures):\n" +" USalign 1cpc.pdb 1mba.pdb -o sup\n" +" pymol -c -d @sup_all_atm.pml -g sup_all_atm.png\n" +"\n" +"Advanced usage 2 (a quick search of query.pdb against I-TASSER PDB library):\n" +" wget https://zhanggroup.org/library/PDB.tar.bz2\n" +" tar -xjvf PDB.tar.bz2\n" +" USalign query.pdb -dir2 PDB/ PDB/list -suffix .pdb -outfmt 2 -fast\n" + <<endl; +} + +void print_help(bool h_opt=false) +{ + print_version(); + cout << +"\n" +"Usage: USalign PDB1.pdb PDB2.pdb [Options]\n" +"\n" +"Options:\n" +" -mol Type of molecule(s) to align.\n" +" auto: (default) align both protein and nucleic acids.\n" +" prot: only align proteins in a structure.\n" +" RNA : only align RNA and DNA in a structure.\n" +"\n" +" -mm Multimeric alignment option:\n" +" 0: (default) alignment of two monomeric structures\n" +" 1: alignment of two multi-chain oligomeric structures\n" +" 2: alignment of individual chains to an oligomeric structure\n" +" $ USalign -dir1 monomers/ list oligomer.pdb -ter 0 -mm 2\n" +" 3: alignment of circularly permuted structure\n" +" 4: alignment of multiple monomeric chains into a consensus alignment\n" +" $ USalign -dir chains/ list -suffix .pdb -mm 4\n" +" 5: fully non-sequential (fNS) alignment\n" +" 6: 
semi-non-sequential (sNS) alignment\n" +" To use -mm 1 or -mm 2, '-ter' option must be 0 or 1.\n" +"\n" +" -ter Number of chains to align.\n" +" 3: only align the first chain, or the first segment of the\n" +" first chain as marked by the 'TER' string in PDB file\n" +" 2: (default) only align the first chain\n" +" 1: align all chains of the first model (recommended for aligning\n" +" asymmetric units)\n" +" 0: align all chains from all models (recommended for aligning\n" +" biological assemblies, i.e. biounits)\n" +"\n" +" -TMscore Whether to perform TM-score superposition without structure-based\n" +" alignment. The same as -byresi.\n" +" 0: (default) sequence independent structure alignment\n" +" 1: superpose two structures by assuming that a pair of residues\n" +" with the same residue index are equivalent between the two\n" +" structures\n" +" 2: superpose two complex structures, assuming that a pair of\n" +" residues with the same residue index and the same chain ID\n" +" are equivalent between the two structures\n" +//" 3: (similar to TMscore '-c' option; used with -ter 0 or 1)\n" +//" align by residue index and order of chain\n" +//" 4: sequence dependent alignment: perform Needleman-Wunsch\n" +//" global sequence alignment, followed by TM-score superposition\n" +" 5: sequence dependent alignment: perform glocal sequence\n" +" alignment followed by TM-score superposition.\n" +" -byresi 5 is the same as -seq\n" +" 6: superpose two complex structures by first deriving optimal\n" +" chain mapping, followed by TM-score superposition for residues\n" +" with the same residue ID\n" +"\n" +" -I Use the final alignment specified by FASTA file 'align.txt'\n" +"\n" +" -i Use alignment specified by 'align.txt' as an initial alignment\n" +"\n" +" -m Output rotation matrix for superposition\n" +"\n" +" -d TM-score scaled by an assigned d0, e.g., '-d 3.5' reports MaxSub\n" +" score, where d0 is 3.5 Angstrom. 
-d does not change final alignment.\n" +"\n" +" -u TM-score normalized by an assigned length. It should be >= length\n" +" of protein to avoid TM-score >1. -u does not change final alignment.\n" +"\n" +" -o Output superposed structure1 to sup.* for PyMOL viewing.\n" +" $ USalign structure1.pdb structure2.pdb -o sup\n" +" $ pymol -d @sup.pml # C-alpha trace aligned region\n" +" $ pymol -d @sup_all.pml # C-alpha trace whole chain\n" +" $ pymol -d @sup_atm.pml # full-atom aligned region\n" +" $ pymol -d @sup_all_atm.pml # full-atom whole chain\n" +" $ pymol -d @sup_all_atm_lig.pml # full-atom with all molecules\n" +"\n" +" -rasmol Output superposed structure1 to sup.* for RasMol viewing.\n" +" $ USalign structure1.pdb structure2.pdb -rasmol sup\n" +" $ rasmol -script sup # C-alpha trace aligned region\n" +" $ rasmol -script sup_all # C-alpha trace whole chain\n" +" $ rasmol -script sup_atm # full-atom aligned region\n" +" $ rasmol -script sup_all_atm # full-atom whole chain\n" +" $ rasmol -script sup_all_atm_lig # full-atom with all molecules\n" +"\n" +//" -h Print the full help message, including additional options\n" +//"\n" +"Example usages ('gunzip' program is needed to read .gz compressed files):\n" +" USalign 101m.cif.gz 1mba.pdb # pairwise monomeric protein alignment\n" +" USalign 1qf6.cif 5yyn.pdb.gz -mol RNA # pairwise monomeric RNA alignment\n" +" USalign model.pdb native.pdb -TMscore 1 # calculate TM-score between two conformations of a monomer\n" +" USalign 4v4a.cif 4v49.cif -mm 1 -ter 1 # oligomeric alignment for asymmetic units\n" +" USalign 3ksc.pdb1 4lej.pdb1 -mm 1 -ter 0 # oligomeric alignment for biological units\n" +" USalign 1ajk.pdb.gz 2ayh.pdb.gz -mm 3 # circular permutation alignment\n" + <<endl; + + //if (h_opt) + print_extra_help(); + + exit(EXIT_SUCCESS); +} + +/* TMalign, RNAalign, CPalign, TMscore */ +int TMalign(string &xname, string &yname, const string &fname_super, + const string &fname_lign, const string &fname_matrix, + vector<string> 
&sequence, const double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + const bool u_opt, const bool d_opt, const double TMcut, + const int infmt1_opt, const int infmt2_opt, const int ter_opt, + const int split_opt, const int outfmt_opt, const bool fast_opt, + const int cp_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const string &mol_opt, const string &dir_opt, + const string &dir1_opt, const string &dir2_opt, const int byresi_opt, + const vector<string> &chain1_list, const vector<string> &chain2_list, + const bool se_opt) +{ + /* declare previously global variables */ + vector<vector<string> >PDB_lines1; // text of chain1 + vector<vector<string> >PDB_lines2; // text of chain2 + vector<int> mol_vec1; // molecule type of chain1, RNA if >0 + vector<int> mol_vec2; // molecule type of chain2, RNA if >0 + vector<string> chainID_list1; // list of chainID1 + vector<string> chainID_list2; // list of chainID2 + int i,j; // file index + int chain_i,chain_j; // chain index + int r; // residue index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and + // ya[0...ylen-1][0..2], in general, + // ya is regarded as native structure + // --> superpose xa onto ya + vector<string> resi_vec1; // residue index for chain1 + vector<string> resi_vec2; // residue index for chain2 + int read_resi=byresi_opt; // whether to read residue index + if (byresi_opt==0 && o_opt) read_resi=2; + + /* loop over file names */ + for (i=0;i<chain1_list.size();i++) + { + /* parse chain 1 */ + xname=chain1_list[i]; + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + if (!xchainnum) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". 
Chain number 0."<<endl; + continue; + } + for (chain_i=0;chain_i<xchainnum;chain_i++) + { + xlen=PDB_lines1[chain_i].size(); + if (mol_opt=="RNA") mol_vec1[chain_i]=1; + else if (mol_opt=="protein") mol_vec1[chain_i]=-1; + if (!xlen) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". Chain length 0."<<endl; + continue; + } + else if (xlen<3) + { + cerr<<"Sequence is too short <3!: "<<xname<<endl; + continue; + } + NewArray(&xa, xlen, 3); + seqx = new char[xlen + 1]; + secx = new char[xlen + 1]; + xlen = read_PDB(PDB_lines1[chain_i], xa, seqx, + resi_vec1, read_resi); + if (mirror_opt) for (r=0;r<xlen;r++) xa[r][2]=-xa[r][2]; + if (mol_vec1[chain_i]>0) make_sec(seqx,xa, xlen, secx,atom_opt); + else make_sec(xa, xlen, secx); // secondary structure assignment + + for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) + { + /* parse chain 2 */ + if (PDB_lines2.size()==0) + { + yname=chain2_list[j]; + ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, + mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, + het_opt); + if (!ychainnum) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". Chain number 0."<<endl; + continue; + } + } + for (chain_j=0;chain_j<ychainnum;chain_j++) + { + ylen=PDB_lines2[chain_j].size(); + if (mol_opt=="RNA") mol_vec2[chain_j]=1; + else if (mol_opt=="protein") mol_vec2[chain_j]=-1; + if (!ylen) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". 
Chain length 0."<<endl; + continue; + } + else if (ylen<3) + { + cerr<<"Sequence is too short <3!: "<<yname<<endl; + continue; + } + NewArray(&ya, ylen, 3); + seqy = new char[ylen + 1]; + secy = new char[ylen + 1]; + ylen = read_PDB(PDB_lines2[chain_j], ya, seqy, + resi_vec2, read_resi); + if (mol_vec2[chain_j]>0) + make_sec(seqy, ya, ylen, secy, atom_opt); + else make_sec(ya, ylen, secy); + + if (byresi_opt) extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,byresi_opt); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + bool force_fast_opt=(getmin(xlen,ylen)>1500)?true:fast_opt; + + /* entry function for structure alignment */ + if (cp_opt) CPalign_main( + xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],TMcut); + else if (se_opt) + { + int *invmap = new int[ylen+1]; + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + se_main( + xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, + mol_vec1[chain_i]+mol_vec2[chain_j], + outfmt_opt, invmap); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r2<ylen;r2++) 
+ { + r1=invmap[r2]; + if (r1<0) continue; + L_ali+=1; + Liden+=(seqx[r1]==seqy[r2]); + } + } + delete [] invmap; + } + else TMalign_main( + xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],TMcut); + + /* print result */ + if (outfmt_opt==0) print_version(); + int left_num=0; + int right_num=0; + int left_aln_num=0; + int right_aln_num=0; + bool after_cp=false; + if (cp_opt) after_cp=output_cp( + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), + seqxA,seqyA,outfmt_opt,left_num,right_num, + left_aln_num,right_aln_num); + output_results( + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), + chainID_list1[chain_i], chainID_list2[chain_j], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, + rmsd0, d0_out, seqM.c_str(), + seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, + d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + (m_opt?fname_matrix:"").c_str(), + outfmt_opt, ter_opt, false, split_opt, o_opt, + fname_super, i_opt, a_opt, u_opt, d_opt, mirror_opt, + resi_vec1, resi_vec2); + if (cp_opt && outfmt_opt<=0) + { + cout<<"###############\t###############\n" + <<"#Aligned atom 1\tAligned atom 2#\n"; + size_t r1=right_num; + size_t r2=0; + size_t r; + for (r=0;r<seqxA.size();r++) + { + r1+=seqxA[r]!='-'; + r2+=seqyA[r]!='-'; + if (seqxA[r]=='*') + { + cout<<"###### Circular\tPermutation ###\n"; + r1=0; + } + else if (seqxA[r]!='-' && seqyA[r]!='-') + { + cout<<PDB_lines1[chain_i][r1-1].substr(12,15)<<'\t' + <<PDB_lines2[chain_j][r2-1].substr(12,15)<<'\n'; + } + } + cout<<"###############\t###############"<<endl; + } + + /* Done! 
Free memory */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + DeleteArray(&ya, ylen); + delete [] seqy; + delete [] secy; + resi_vec2.clear(); + } // chain_j + if (chain2_list.size()>1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + } // j + PDB_lines1[chain_i].clear(); + DeleteArray(&xa, xlen); + delete [] seqx; + delete [] secx; + resi_vec1.clear(); + } // chain_i + xname.clear(); + PDB_lines1.clear(); + chainID_list1.clear(); + mol_vec1.clear(); + } // i + if (chain2_list.size()==1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + resi_vec2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + return 0; +} + +/* MMalign if more than two chains. TMalign if only one chain */ +int MMalign(const string &xname, const string &yname, + const string &fname_super, const string &fname_lign, + const string &fname_matrix, vector<string> &sequence, + const double d0_scale, const bool m_opt, const int o_opt, + const int a_opt, const bool d_opt, const bool full_opt, + const double TMcut, const int infmt1_opt, const int infmt2_opt, + const int ter_opt, const int split_opt, const int outfmt_opt, + bool fast_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const string &mol_opt, + const string &dir1_opt, const string &dir2_opt, + const vector<string> &chain1_list, const vector<string> &chain2_list, + const int byresi_opt) +{ + /* declare previously global variables */ + vector<vector<vector<double> > > xa_vec; // structure of complex1 + vector<vector<vector<double> > > ya_vec; // structure of complex2 + vector<vector<char> >seqx_vec; // sequence of complex1 + vector<vector<char> >seqy_vec; // sequence of complex2 + vector<vector<char> >secx_vec; // secondary structure of complex1 + vector<vector<char> >secy_vec; // secondary structure of complex2 + 
vector<int> mol_vec1; // molecule type of complex1, RNA if >0 + vector<int> mol_vec2; // molecule type of complex2, RNA if >0 + vector<string> chainID_list1; // list of chainID1 + vector<string> chainID_list2; // list of chainID2 + vector<int> xlen_vec; // length of complex1 + vector<int> ylen_vec; // length of complex2 + int i,j; // chain index + int xlen, ylen; // chain length + double **xa, **ya; // structure of single chain + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int xlen_aa,ylen_aa; // total length of protein + int xlen_na,ylen_na; // total length of RNA/DNA + vector<string> resi_vec1; // residue index for chain1 + vector<string> resi_vec2; // residue index for chain2 + + /* parse complex */ + parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, + xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, + atom_opt, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, resi_vec1); + if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 0 chain in complex 1"); + parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, + ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, + atom_opt, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); + if (ya_vec.size()==0) PrintErrorAndQuit("ERROR! 
0 chain in complex 2"); + int len_aa=getmin(xlen_aa,ylen_aa); + int len_na=getmin(xlen_na,ylen_na); + if (a_opt) + { + len_aa=(xlen_aa+ylen_aa)/2; + len_na=(xlen_na+ylen_na)/2; + } + int i_opt=0; + if (byresi_opt) i_opt=3; + + /* perform monomer alignment if there is only one chain */ + if (xa_vec.size()==1 && ya_vec.size()==1) + { + xlen = xlen_vec[0]; + ylen = ylen_vec[0]; + seqx = new char[xlen+1]; + seqy = new char[ylen+1]; + secx = new char[xlen+1]; + secy = new char[ylen+1]; + NewArray(&xa, xlen, 3); + NewArray(&ya, ylen, 3); + copy_chain_data(xa_vec[0],seqx_vec[0],secx_vec[0], xlen,xa,seqx,secx); + copy_chain_data(ya_vec[0],seqy_vec[0],secy_vec[0], ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + if (byresi_opt) extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,byresi_opt); + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, 0, d0_scale, + i_opt, a_opt, false, d_opt, fast_opt, + mol_vec1[0]+mol_vec2[0],TMcut); + + /* print result */ + output_results( + xname.substr(dir1_opt.size()), + yname.substr(dir2_opt.size()), + chainID_list1[0], chainID_list2[0], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, + 0, d0_scale, d0a, d0u, (m_opt?fname_matrix:"").c_str(), + outfmt_opt, 
ter_opt, true, split_opt, o_opt, fname_super, + 0, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete[]seqx; + delete[]seqy; + delete[]secx; + delete[]secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + + vector<vector<vector<double> > >().swap(xa_vec); // structure of complex1 + vector<vector<vector<double> > >().swap(ya_vec); // structure of complex2 + vector<vector<char> >().swap(seqx_vec); // sequence of complex1 + vector<vector<char> >().swap(seqy_vec); // sequence of complex2 + vector<vector<char> >().swap(secx_vec); // secondary structure of complex1 + vector<vector<char> >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + chainID_list1.clear(); // list of chainID1 + chainID_list2.clear(); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + return 0; + } + + /* declare TM-score tables */ + int chain1_num=xa_vec.size(); + int chain2_num=ya_vec.size(); + vector<string> tmp_str_vec(chain2_num,""); + double **TMave_mat; + double **ut_mat; // rotation matrices for all-against-all alignment + int ui,uj,ut_idx; + NewArray(&TMave_mat,chain1_num,chain2_num); + NewArray(&ut_mat,chain1_num*chain2_num,4*3); + vector<vector<string> >seqxA_mat(chain1_num,tmp_str_vec); + vector<vector<string> > seqM_mat(chain1_num,tmp_str_vec); + vector<vector<string> >seqyA_mat(chain1_num,tmp_str_vec); + + double maxTMmono=-1; + int maxTMmono_i,maxTMmono_j; + + /* get all-against-all alignment */ + if (len_aa+len_na>500) fast_opt=true; + for (i=0;i<chain1_num;i++) + { + xlen=xlen_vec[i]; + if (xlen<3) + { + for (j=0;j<chain2_num;j++) TMave_mat[i][j]=-1; + continue; + } + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + 
for (j=0;j<chain2_num;j++) + { + ut_idx=i*chain2_num+j; + for (ui=0;ui<4;ui++) + for (uj=0;uj<3;uj++) ut_mat[ut_idx][ui*3+uj]=0; + ut_mat[ut_idx][0]=1; + ut_mat[ut_idx][4]=1; + ut_mat[ut_idx][8]=1; + + if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment + { + TMave_mat[i][j]=-1; + continue; + } + + ylen=ylen_vec[j]; + if (ylen<3) + { + TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + int Lnorm_tmp=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_tmp=len_na; + + if (byresi_opt) + { + int total_aln=extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,xlen_vec,ylen_vec, i, j); + seqxA_mat[i][j]=sequence[0]; + seqyA_mat[i][j]=sequence[1]; + if (total_aln>xlen+ylen-3) + { + for (ui=0;ui<3;ui++) for (uj=0;uj<3;uj++) + ut_mat[ut_idx][ui*3+uj]=(ui==uj)?1:0; + for (uj=0;uj<3;uj++) ut_mat[ut_idx][9+uj]=0; + TMave_mat[i][j]=0; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + continue; + } + } + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + i_opt, false, true, false, fast_opt, + mol_vec1[i]+mol_vec2[j],TMcut); + + /* store result */ + for 
(ui=0;ui<3;ui++) + for (uj=0;uj<3;uj++) ut_mat[ut_idx][ui*3+uj]=u0[ui][uj]; + for (uj=0;uj<3;uj++) ut_mat[ut_idx][9+uj]=t0[uj]; + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + TMave_mat[i][j]=TM4*Lnorm_tmp; + if (TMave_mat[i][j]>maxTMmono) + { + maxTMmono=TMave_mat[i][j]; + maxTMmono_i=i; + maxTMmono_j=j; + } + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + } + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + + /* calculate initial chain-chain assignment */ + int *assign1_list; // value is index of assigned chain2 + int *assign2_list; // value is index of assigned chain1 + assign1_list=new int[chain1_num]; + assign2_list=new int[chain2_num]; + double total_score=enhanced_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num); + if (total_score<=0) PrintErrorAndQuit("ERROR! No assignable chain"); + + /* refine alignment for large oligomers */ + int aln_chain_num=count_assign_pair(assign1_list,chain1_num); + bool is_oligomer=(aln_chain_num>=3); + if (aln_chain_num==2) // dimer alignment + { + int na_chain_num1,na_chain_num2,aa_chain_num1,aa_chain_num2; + count_na_aa_chain_num(na_chain_num1,aa_chain_num1,mol_vec1); + count_na_aa_chain_num(na_chain_num2,aa_chain_num2,mol_vec2); + + /* align protein-RNA hybrid dimer to another hybrid dimer */ + if (na_chain_num1==1 && na_chain_num2==1 && + aa_chain_num1==1 && aa_chain_num2==1) is_oligomer=false; + /* align pure protein dimer or pure RNA dimer */ + else if ((getmin(na_chain_num1,na_chain_num2)==0 && + aa_chain_num1==2 && aa_chain_num2==2) || + (getmin(aa_chain_num1,aa_chain_num2)==0 && + na_chain_num1==2 && na_chain_num2==2)) + { + adjust_dimer_assignment(xa_vec,ya_vec,xlen_vec,ylen_vec,mol_vec1, + mol_vec2,assign1_list,assign2_list,seqxA_mat,seqyA_mat); + is_oligomer=false; // cannot refiner further + } + else is_oligomer=true; /* align oligomers to dimer */ + } + + if (aln_chain_num>=3 || 
is_oligomer) // oligomer alignment + { + /* extract centroid coordinates */ + double **xcentroids; + double **ycentroids; + NewArray(&xcentroids, chain1_num, 3); + NewArray(&ycentroids, chain2_num, 3); + double d0MM=getmin( + calculate_centroids(xa_vec, chain1_num, xcentroids), + calculate_centroids(ya_vec, chain2_num, ycentroids)); + + /* refine enhanced greedy search with centroid superposition */ + //double het_deg=check_heterooligomer(TMave_mat, chain1_num, chain2_num); + homo_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na, ut_mat); + hetero_refined_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num, xcentroids, + ycentroids, d0MM, len_aa+len_na); + + /* clean up */ + DeleteArray(&xcentroids, chain1_num); + DeleteArray(&ycentroids, chain2_num); + } + + /* store initial assignment */ + int init_pair_num=count_assign_pair(assign1_list,chain1_num); + int *assign1_init, *assign2_init; + assign1_init=new int[chain1_num]; + assign2_init=new int[chain2_num]; + double **TMave_init; + NewArray(&TMave_init,chain1_num,chain2_num); + vector<vector<string> >seqxA_init(chain1_num,tmp_str_vec); + vector<vector<string> >seqyA_init(chain1_num,tmp_str_vec); + vector<string> sequence_init; + copy_chain_assign_data(chain1_num, chain2_num, sequence_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init); + + /* perform iterative alignment */ + double max_total_score=0; // ignore old total_score because previous + // score was from monomeric chain superpositions + int max_iter=5-(int)((len_aa+len_na)/200); + if (max_iter<2) max_iter=2; + if (byresi_opt==0) MMalign_iter(max_total_score, max_iter, xa_vec, ya_vec, + seqx_vec, seqy_vec, secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, + ylen_vec, xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, + chain2_num, TMave_mat, seqxA_mat, 
seqyA_mat, assign1_list, assign2_list, + sequence, d0_scale, fast_opt); + + /* sometime MMalign_iter is even worse than monomer alignment */ + if (byresi_opt==0 && max_total_score<maxTMmono) + { + copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + for (i=0;i<chain1_num;i++) + { + if (i!=maxTMmono_i) assign1_list[i]=-1; + else assign1_list[i]=maxTMmono_j; + } + for (j=0;j<chain2_num;j++) + { + if (j!=maxTMmono_j) assign2_list[j]=-1; + else assign2_list[j]=maxTMmono_i; + } + sequence[0]=seqxA_mat[maxTMmono_i][maxTMmono_j]; + sequence[1]=seqyA_mat[maxTMmono_i][maxTMmono_j]; + max_total_score=maxTMmono; + MMalign_iter(max_total_score, max_iter, xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_mat, seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, fast_opt); + } + + /* perform cross chain alignment + * in some cases, this leads to dramatic improvement, esp for homodimer */ + int iter_pair_num=count_assign_pair(assign1_list,chain1_num); + if (iter_pair_num>=init_pair_num) copy_chain_assign_data( + chain1_num, chain2_num, sequence_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init); + double max_total_score_cross=max_total_score; + if (byresi_opt==0 && len_aa+len_na<10000) + { + MMalign_dimer(max_total_score_cross, xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, chain1_num, chain2_num, + TMave_init, seqxA_init, seqyA_init, assign1_init, assign2_init, + sequence_init, d0_scale, fast_opt); + if (max_total_score_cross>max_total_score) + { + max_total_score=max_total_score_cross; + 
copy_chain_assign_data(chain1_num, chain2_num, sequence, + seqxA_init, seqyA_init, assign1_init, assign2_init, TMave_init, + seqxA_mat, seqyA_mat, assign1_list, assign2_list, TMave_mat); + } + } + + /* final alignment */ + if (outfmt_opt==0) print_version(); + MMalign_final(xname.substr(dir1_opt.size()), yname.substr(dir2_opt.size()), + chainID_list1, chainID_list2, + fname_super, fname_lign, fname_matrix, + xa_vec, ya_vec, seqx_vec, seqy_vec, + secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, + xa, ya, seqx, seqy, secx, secy, len_aa, len_na, + chain1_num, chain2_num, TMave_mat, + seqxA_mat, seqM_mat, seqyA_mat, assign1_list, assign2_list, sequence, + d0_scale, m_opt, o_opt, outfmt_opt, ter_opt, split_opt, + a_opt, d_opt, fast_opt, full_opt, mirror_opt, resi_vec1, resi_vec2); + + /* clean up everything */ + delete [] assign1_list; + delete [] assign2_list; + DeleteArray(&TMave_mat,chain1_num); + DeleteArray(&ut_mat, chain1_num*chain2_num); + vector<vector<string> >().swap(seqxA_mat); + vector<vector<string> >().swap(seqM_mat); + vector<vector<string> >().swap(seqyA_mat); + vector<string>().swap(tmp_str_vec); + + delete [] assign1_init; + delete [] assign2_init; + DeleteArray(&TMave_init,chain1_num); + vector<vector<string> >().swap(seqxA_init); + vector<vector<string> >().swap(seqyA_init); + + vector<vector<vector<double> > >().swap(xa_vec); // structure of complex1 + vector<vector<vector<double> > >().swap(ya_vec); // structure of complex2 + vector<vector<char> >().swap(seqx_vec); // sequence of complex1 + vector<vector<char> >().swap(seqy_vec); // sequence of complex2 + vector<vector<char> >().swap(secx_vec); // secondary structure of complex1 + vector<vector<char> >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + vector<string>().swap(chainID_list1); // list of chainID1 + vector<string>().swap(chainID_list2); // list of 
chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + vector<string> ().swap(resi_vec1); // residue index for chain1 + vector<string> ().swap(resi_vec2); // residue index for chain2 + return 1; +} + + +/* alignment individual chains to a complex. */ +int MMdock(const string &xname, const string &yname, const string &fname_super, + const string &fname_matrix, vector<string> &sequence, const double Lnorm_ass, + const double d0_scale, const bool m_opt, const int o_opt, + const int a_opt, const bool u_opt, const bool d_opt, + const double TMcut, const int infmt1_opt, const int infmt2_opt, + const int ter_opt, const int split_opt, const int outfmt_opt, + bool fast_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const string &mol_opt, + const string &dir1_opt, const string &dir2_opt, + const vector<string> &chain1_list, const vector<string> &chain2_list) +{ + /* declare previously global variables */ + vector<vector<vector<double> > > xa_vec; // structure of complex1 + vector<vector<vector<double> > > ya_vec; // structure of complex2 + vector<vector<char> >seqx_vec; // sequence of complex1 + vector<vector<char> >seqy_vec; // sequence of complex2 + vector<vector<char> >secx_vec; // secondary structure of complex1 + vector<vector<char> >secy_vec; // secondary structure of complex2 + vector<int> mol_vec1; // molecule type of complex1, RNA if >0 + vector<int> mol_vec2; // molecule type of complex2, RNA if >0 + vector<string> chainID_list1; // list of chainID1 + vector<string> chainID_list2; // list of chainID2 + vector<int> xlen_vec; // length of complex1 + vector<int> ylen_vec; // length of complex2 + int i,j; // chain index + int xlen, ylen; // chain length + double **xa, **ya; // structure of single chain + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int xlen_aa,ylen_aa; // total length of protein + int xlen_na,ylen_na; // total length of RNA/DNA + 
vector<string> resi_vec1; // residue index for chain1 + vector<string> resi_vec2; // residue index for chain2 + + /* parse complex */ + parse_chain_list(chain1_list, xa_vec, seqx_vec, secx_vec, mol_vec1, + xlen_vec, chainID_list1, ter_opt, split_opt, mol_opt, infmt1_opt, + atom_opt, mirror_opt, het_opt, xlen_aa, xlen_na, o_opt, resi_vec1); + if (xa_vec.size()==0) PrintErrorAndQuit("ERROR! 0 individual chain"); + parse_chain_list(chain2_list, ya_vec, seqy_vec, secy_vec, mol_vec2, + ylen_vec, chainID_list2, ter_opt, split_opt, mol_opt, infmt2_opt, + atom_opt, 0, het_opt, ylen_aa, ylen_na, o_opt, resi_vec2); + if (xa_vec.size()>ya_vec.size()) PrintErrorAndQuit( + "ERROR! more individual chains to align than number of chains in complex template"); + int len_aa=getmin(xlen_aa,ylen_aa); + int len_na=getmin(xlen_na,ylen_na); + if (a_opt) + { + len_aa=(xlen_aa+ylen_aa)/2; + len_na=(xlen_na+ylen_na)/2; + } + + /* perform monomer alignment if there is only one chain */ + if (xa_vec.size()==1 && ya_vec.size()==1) + { + xlen = xlen_vec[0]; + ylen = ylen_vec[0]; + seqx = new char[xlen+1]; + seqy = new char[ylen+1]; + secx = new char[xlen+1]; + secy = new char[ylen+1]; + NewArray(&xa, xlen, 3); + NewArray(&ya, ylen, 3); + copy_chain_data(xa_vec[0],seqx_vec[0],secx_vec[0], xlen,xa,seqx,secx); + copy_chain_data(ya_vec[0],seqy_vec[0],secy_vec[0], ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, 
d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, a_opt, u_opt, d_opt, fast_opt, + mol_vec1[0]+mol_vec2[0],TMcut); + + /* print result */ + output_results( + xname.substr(dir1_opt.size()), + yname.substr(dir2_opt.size()), + chainID_list1[0], chainID_list2[0], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, + Lnorm_ass, d0_scale, d0a, d0u, (m_opt?fname_matrix:"").c_str(), + (outfmt_opt==2?outfmt_opt:3), ter_opt, true, split_opt, o_opt, fname_super, + 0, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); + if (outfmt_opt==2) printf("%s%s\t%s%s\t%.4f\n", + xname.substr(dir1_opt.size()).c_str(), chainID_list1[0].c_str(), + yname.substr(dir2_opt.size()).c_str(), chainID_list2[0].c_str(), + sqrt((TM1*TM1+TM2*TM2)/2)); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + delete[]seqx; + delete[]seqy; + delete[]secx; + delete[]secy; + DeleteArray(&xa,xlen); + DeleteArray(&ya,ylen); + + vector<vector<vector<double> > >().swap(xa_vec); // structure of complex1 + vector<vector<vector<double> > >().swap(ya_vec); // structure of complex2 + vector<vector<char> >().swap(seqx_vec); // sequence of complex1 + vector<vector<char> >().swap(seqy_vec); // sequence of complex2 + vector<vector<char> >().swap(secx_vec); // secondary structure of complex1 + vector<vector<char> >().swap(secy_vec); // secondary structure of complex2 + mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + chainID_list1.clear(); // list of chainID1 + chainID_list2.clear(); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + return 0; + } + + /* declare TM-score tables */ + int chain1_num=xa_vec.size(); + int chain2_num=ya_vec.size(); + vector<string> 
tmp_str_vec(chain2_num,""); + double **TMave_mat; + NewArray(&TMave_mat,chain1_num,chain2_num); + vector<vector<string> >seqxA_mat(chain1_num,tmp_str_vec); + vector<vector<string> > seqM_mat(chain1_num,tmp_str_vec); + vector<vector<string> >seqyA_mat(chain1_num,tmp_str_vec); + + /* trimComplex */ + vector<vector<vector<double> > > ya_trim_vec; // structure of complex2 + vector<vector<char> >seqy_trim_vec; // sequence of complex2 + vector<vector<char> >secy_trim_vec; // secondary structure of complex2 + vector<int> ylen_trim_vec; // length of complex2 + int Lchain_aa_max1=0; + int Lchain_na_max1=0; + for (i=0;i<chain1_num;i++) + { + xlen=xlen_vec[i]; + if (mol_vec1[i]>0 && xlen>Lchain_na_max1) Lchain_na_max1=xlen; + else if (mol_vec1[i]<=0 && xlen>Lchain_aa_max1) Lchain_aa_max1=xlen; + } + int trim_chain_count=trimComplex(ya_trim_vec,seqy_trim_vec, + secy_trim_vec,ylen_trim_vec,ya_vec,seqy_vec,secy_vec,ylen_vec, + mol_vec2,Lchain_aa_max1,Lchain_na_max1); + int ylen_trim; // chain length + double **ya_trim; // structure of single chain + char *seqy_trim; // for the protein sequence + char *secy_trim; // for the secondary structure + double **xt; + + /* get all-against-all alignment */ + if (len_aa+len_na>500) fast_opt=true; + for (i=0;i<chain1_num;i++) + { + xlen=xlen_vec[i]; + if (xlen<3) + { + for (j=0;j<chain2_num;j++) TMave_mat[i][j]=-1; + continue; + } + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], + xlen,xa,seqx,secx); + + for (j=0;j<chain2_num;j++) + { + if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment + { + TMave_mat[i][j]=-1; + continue; + } + + ylen=ylen_vec[j]; + if (ylen<3) + { + TMave_mat[i][j]=-1; + continue; + } + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], + ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double 
TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + int Lnorm_tmp=len_aa; + if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_tmp=len_na; + + /* entry function for structure alignment */ + if (trim_chain_count && ylen_trim_vec[j]<ylen) + { + ylen_trim = ylen_trim_vec[j]; + seqy_trim = new char[ylen_trim+1]; + secy_trim = new char[ylen_trim+1]; + NewArray(&ya_trim, ylen_trim, 3); + copy_chain_data(ya_trim_vec[j],seqy_trim_vec[j],secy_trim_vec[j], + ylen_trim,ya_trim,seqy_trim,secy_trim); + TMalign_main(xa, ya_trim, seqx, seqy_trim, secx, secy_trim, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen_trim, sequence, Lnorm_tmp, d0_scale, + 0, false, true, false, fast_opt, + mol_vec1[i]+mol_vec2[j],TMcut); + seqxA.clear(); + seqyA.clear(); + delete[]seqy_trim; + delete[]secy_trim; + DeleteArray(&ya_trim,ylen_trim); + + NewArray(&xt,xlen,3); + do_rotation(xa, xt, xlen, t0, u0); + int *invmap = new int[ylen+1]; + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + 0, false, 2, false, mol_vec1[i]+mol_vec2[j], 1, invmap); + delete[]invmap; + + if (sequence.size()<2) sequence.push_back(""); + if (sequence.size()<2) sequence.push_back(""); + sequence[0]=seqxA; + sequence[1]=seqyA; + TMalign_main(xt, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, 
sequence, Lnorm_tmp, d0_scale, + 2, false, true, false, fast_opt, + mol_vec1[i]+mol_vec2[j],TMcut); + DeleteArray(&xt, xlen); + } + else + { + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_tmp, d0_scale, + 0, false, true, false, fast_opt, + mol_vec1[i]+mol_vec2[j],TMcut); + } + + /* store result */ + seqxA_mat[i][j]=seqxA; + seqyA_mat[i][j]=seqyA; + TMave_mat[i][j]=TM4*Lnorm_tmp; + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + } + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + vector<vector<vector<double> > >().swap(ya_trim_vec); + vector<vector<char> >().swap(seqy_trim_vec); + vector<vector<char> >().swap(secy_trim_vec); + vector<int> ().swap(ylen_trim_vec); + + /* calculate initial chain-chain assignment */ + int *assign1_list; // value is index of assigned chain2 + int *assign2_list; // value is index of assigned chain1 + assign1_list=new int[chain1_num]; + assign2_list=new int[chain2_num]; + enhanced_greedy_search(TMave_mat, assign1_list, + assign2_list, chain1_num, chain2_num); + + /* final alignment */ + if (outfmt_opt==0) print_version(); + double **ut_mat; // rotation matrices for all-against-all alignment + NewArray(&ut_mat,chain1_num,4*3); + int ui,uj; + vector<string>xname_vec; + vector<string>yname_vec; + vector<double>TM_vec; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + xname_vec.push_back(xname+chainID_list1[i]); + if (j<0) + { + cerr<<"Warning! 
"<<chainID_list1[i]<<" cannot be alighed"<<endl; + for (ui=0;ui<3;ui++) + { + for (uj=0;uj<4;uj++) ut_mat[i][ui*3+uj]=0; + ut_mat[i][ui*3+ui]=1; + } + yname_vec.push_back(yname); + continue; + } + yname_vec.push_back(yname+chainID_list2[j]); + + xlen =xlen_vec[i]; + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], xlen,xa,seqx,secx); + + ylen =ylen_vec[j]; + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + int c; + for (c=0; c<sequence.size(); c++) sequence[c].clear(); + sequence.clear(); + sequence.push_back(seqxA_mat[i][j]); + sequence.push_back(seqyA_mat[i][j]); + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 3, a_opt, u_opt, d_opt, fast_opt, + mol_vec1[i]+mol_vec2[j]); + + for (ui=0;ui<3;ui++) for (uj=0;uj<3;uj++) ut_mat[i][ui*3+uj]=u0[ui][uj]; + for (uj=0;uj<3;uj++) ut_mat[i][9+uj]=t0[uj]; + + TM_vec.push_back(TM1); + TM_vec.push_back(TM2); + + if (outfmt_opt<2) output_results( + xname.c_str(), yname.c_str(), + chainID_list1[i], chainID_list2[j], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, + rmsd0, d0_out, seqM.c_str(), + seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, 
L_ali, TM_ali, rmsd_ali, TM_0, d0_0, + d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + "", outfmt_opt, ter_opt, false, split_opt, + false, "",//o_opt, fname_super+chainID_list1[i], + false, a_opt, u_opt, d_opt, mirror_opt, + resi_vec1, resi_vec2); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + if (outfmt_opt==2) + { + double TM=0; + for (i=0;i<TM_vec.size();i++) TM+=TM_vec[i]*TM_vec[i]; + TM=sqrt(TM/TM_vec.size()); + string query_name=xname; + string template_name=yname; + for (i=0;i<chain1_num;i++) + { + j=assign1_list[i]; + if (j<0) continue; + query_name +=chainID_list1[i]; + template_name+=chainID_list2[j]; + } + printf("%s\t%s\t%.4f\n",query_name.c_str(),template_name.c_str(),TM); + query_name.clear(); + template_name.clear(); + } + + if (m_opt) output_dock_rotation_matrix(fname_matrix.c_str(), + xname_vec,yname_vec, ut_mat, assign1_list); + + if (o_opt) output_dock(chain1_list, ter_opt, split_opt, infmt1_opt, + atom_opt, mirror_opt, ut_mat, fname_super); + + + /* clean up everything */ + vector<double>().swap(TM_vec); + vector<string>().swap(xname_vec); + vector<string>().swap(yname_vec); + delete [] assign1_list; + delete [] assign2_list; + DeleteArray(&TMave_mat,chain1_num); + DeleteArray(&ut_mat, chain1_num); + vector<vector<string> >().swap(seqxA_mat); + vector<vector<string> >().swap(seqM_mat); + vector<vector<string> >().swap(seqyA_mat); + vector<string>().swap(tmp_str_vec); + + vector<vector<vector<double> > >().swap(xa_vec); // structure of complex1 + vector<vector<vector<double> > >().swap(ya_vec); // structure of complex2 + vector<vector<char> >().swap(seqx_vec); // sequence of complex1 + vector<vector<char> >().swap(seqy_vec); // sequence of complex2 + vector<vector<char> >().swap(secx_vec); // secondary structure of complex1 + vector<vector<char> >().swap(secy_vec); // secondary structure of complex2 + 
mol_vec1.clear(); // molecule type of complex1, RNA if >0 + mol_vec2.clear(); // molecule type of complex2, RNA if >0 + vector<string>().swap(chainID_list1); // list of chainID1 + vector<string>().swap(chainID_list2); // list of chainID2 + xlen_vec.clear(); // length of complex1 + ylen_vec.clear(); // length of complex2 + return 1; +} + +int mTMalign(string &xname, string &yname, const string &fname_super, + const string &fname_matrix, + vector<string> &sequence, double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + bool u_opt, const bool d_opt, const bool full_opt, const double TMcut, + const int infmt_opt, const int ter_opt, + const int split_opt, const int outfmt_opt, bool fast_opt, + const int het_opt, + const string &atom_opt, const string &mol_opt, const string &dir_opt, + const int byresi_opt, + const vector<string> &chain_list) +{ + /* declare previously global variables */ + vector<vector<vector<double> > >a_vec; // atomic structure + vector<vector<vector<double> > >ua_vec; // unchanged atomic structure + vector<vector<char> >seq_vec; // sequence of complex + vector<vector<char> >sec_vec; // secondary structure of complex + vector<int> mol_vec; // molecule type of complex1, RNA if >0 + vector<string> chainID_list; // list of chainID + vector<int> len_vec; // length of complex + int i,j; // chain index + int xlen, ylen; // chain length + double **xa, **ya; // structure of single chain + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int len_aa,len_na; // total length of protein and RNA/DNA + vector<string> resi_vec; // residue index for chain + + /* parse chain list */ + parse_chain_list(chain_list, a_vec, seq_vec, sec_vec, mol_vec, + len_vec, chainID_list, ter_opt, split_opt, mol_opt, infmt_opt, + atom_opt, false, het_opt, len_aa, len_na, o_opt, resi_vec); + int chain_num=a_vec.size(); + if (chain_num<=1) PrintErrorAndQuit("ERROR! 
<2 chains for multiple alignment"); + if (m_opt||o_opt) for (i=0;i<chain_num;i++) ua_vec.push_back(a_vec[i]); + int mol_type=0; + int total_len=0; + xlen=0; + for (i=0; i<chain_num; i++) + { + if (len_vec[i]>xlen) xlen=len_vec[i]; + total_len+=len_vec[i]; + mol_type+=mol_vec[i]; + } + if (!u_opt) Lnorm_ass=total_len/chain_num; + u_opt=true; + total_len-=xlen; + if (total_len>750) fast_opt=true; + + /* get all-against-all alignment */ + double **TMave_mat; + NewArray(&TMave_mat,chain_num,chain_num); + vector<string> tmp_str_vec(chain_num,""); + vector<vector<string> >seqxA_mat(chain_num,tmp_str_vec); + vector<vector<string> >seqyA_mat(chain_num,tmp_str_vec); + for (i=0;i<chain_num;i++) for (j=0;j<chain_num;j++) TMave_mat[i][j]=0; + for (i=0;i<chain_num;i++) + { + xlen=len_vec[i]; + if (xlen<3) continue; + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(a_vec[i],seq_vec[i],sec_vec[i],xlen,xa,seqx,secx); + seqxA_mat[i][i]=seqyA_mat[i][i]=(string)(seqx); + for (j=i+1;j<chain_num;j++) + { + ylen=len_vec[j]; + if (ylen<3) continue; + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(a_vec[j],seq_vec[j],sec_vec[j],ylen,ya,seqy,secy); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, 
d0_scale, + 0, false, u_opt, false, fast_opt, + mol_type,TMcut); + + /* store result */ + TMave_mat[i][j]=TMave_mat[j][i]=TM4; + seqxA_mat[i][j]=seqyA_mat[j][i]=seqxA; + seqyA_mat[i][j]=seqxA_mat[j][i]=seqyA; + //cout<<chain_list[i]<<':'<<chainID_list[i] + //<<chain_list[j]<<':'<<chainID_list[j]<<"\tTM4="<<TM4<<endl; + if (full_opt) output_results( + chain_list[i],chain_list[j], chainID_list[i], chainID_list[j], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, + seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, d0A, d0B, + Lnorm_ass, d0_scale, d0a, d0u, "", + outfmt_opt, ter_opt, true, split_opt, o_opt, "", + 0, a_opt, false, d_opt, false, resi_vec, resi_vec); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + } + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + + /* representative related variables */ + int r; + int repr_idx=0; + vector<string>xname_vec; + for (i=0;i<chain_num;i++) xname_vec.push_back( + chain_list[i].substr(dir_opt.size())+chainID_list[i]); + vector<string>yname_vec; + double *TMave_list; + TMave_list = new double[chain_num]; + int *assign_list; + assign_list=new int[chain_num]; + vector<string> msa(ylen,""); // row is position along msa; column is sequence + + int compare_num; + double TM1_total, TM2_total; + double TM3_total, TM4_total, TM5_total; // for a_opt, u_opt, d_opt + double d0_0_total, TM_0_total; + double d0A_total, d0B_total, d0u_total, d0a_total; + double d0_out_total; + double rmsd0_total; + int L_ali_total; // Aligned length in standard_TMscore + double Liden_total; + double TM_ali_total, rmsd_ali_total; // TMscore and rmsd in standard_TMscore + int n_ali_total; + int n_ali8_total; + int xlen_total, ylen_total; + double TM4_total_max=0; + + int max_iter=5-(int)(total_len/200); + if (max_iter<2) max_iter=2; + int iter=0; + vector<double> TM_vec(chain_num,0); + vector<double> 
d0_vec(chain_num,0); + vector<double> seqID_vec(chain_num,0); + vector<vector<double> > TM_mat(chain_num,TM_vec); + vector<vector<double> > d0_mat(chain_num,d0_vec); + vector<vector<double> > seqID_mat(chain_num,seqID_vec); + for (iter=0; iter<max_iter; iter++) + { + /* select representative */ + for (j=0; j<chain_num; j++) TMave_list[j]=0; + for (i=0; i<chain_num; i++ ) + { + for (j=0; j<chain_num; j++) + { + //cout<<'\t'<<setprecision(4)<<TMave_mat[i][j]; + TMave_list[j]+=TMave_mat[i][j]; + } + //cout<<'\t'<<chain_list[i]<<':'<<chainID_list[i]<<endl; + } + repr_idx=0; + double repr_TM=0; + for (j=0; j<chain_num; j++) + { + //cout<<chain_list[j]<<'\t'<<len_vec[j]<<'\t'<<TMave_list[j]<<endl; + if (TMave_list[j]<repr_TM) continue; + repr_TM=TMave_list[j]; + repr_idx=j; + } + //cout<<"repr="<<repr_idx<<"; "<<chain_list[repr_idx]<<"; TM="<<repr_TM<<endl; + + /* superpose superpose */ + yname=chain_list[repr_idx].substr(dir_opt.size())+chainID_list[repr_idx]; + double **xt; + vector<pair<double,int> >TM_pair_vec; // TM vs chain + + for (i=0; i<chain_num; i++) assign_list[i]=-1; + assign_list[repr_idx]=repr_idx; + //ylen = len_vec[repr_idx]; + //seqy = new char[ylen+1]; + //secy = new char[ylen+1]; + //NewArray(&ya, ylen, 3); + //copy_chain_data(a_vec[repr_idx],seq_vec[repr_idx],sec_vec[repr_idx], ylen,ya,seqy,secy); + for (r=0;r<sequence.size();r++) sequence[r].clear(); sequence.clear(); + sequence.push_back(""); + sequence.push_back(""); + for (i=0;i<chain_num;i++) + { + yname_vec.push_back(yname); + xlen = len_vec[i]; + if (i==repr_idx || xlen<3) continue; + TM_pair_vec.push_back(make_pair(-TMave_mat[i][repr_idx],i)); + } + sort(TM_pair_vec.begin(),TM_pair_vec.end()); + + int tm_idx; + if (outfmt_opt<0) cout<<"#PDBchain1\tPDBchain2\tTM1\tTM2\t" + <<"RMSD\tID1\tID2\tIDali\tL1\tL2\tLali"<<endl; + for (tm_idx=0; tm_idx<TM_pair_vec.size(); tm_idx++) + { + i=TM_pair_vec[tm_idx].second; + xlen = len_vec[i]; + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + 
NewArray(&xa, xlen, 3); + copy_chain_data(a_vec[i],seq_vec[i],sec_vec[i], xlen,xa,seqx,secx); + + double maxTM=TMave_mat[i][repr_idx]; + int maxj=repr_idx; + for (j=0;j<chain_num;j++) + { + if (i==j || assign_list[j]<0 || TMave_mat[i][j]<=maxTM) continue; + maxj=j; + maxTM=TMave_mat[i][j]; + } + j=maxj; + assign_list[i]=j; + ylen = len_vec[j]; + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(a_vec[j],seq_vec[j],sec_vec[j], ylen,ya,seqy,secy); + + sequence[0]=seqxA_mat[i][j]; + sequence[1]=seqyA_mat[i][j]; + //cout<<"tm_idx="<<tm_idx<<"\ti="<<i<<"\tj="<<j<<endl; + //cout<<"superpose "<<xname_vec[i]<<" to "<<xname_vec[j]<<endl; + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + + /* entry function for structure alignment */ + TMalign_main(xa, ya, seqx, seqy, secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 2, a_opt, u_opt, d_opt, fast_opt, mol_type); + + if (outfmt_opt<0) output_results( + xname_vec[i].c_str(), xname_vec[j].c_str(), "", "", + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, + rmsd0, d0_out, seqM.c_str(), + seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, + d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + "", 2,//outfmt_opt, + ter_opt, false, split_opt, + false, "",//o_opt, fname_super+chainID_list1[i], + false, a_opt, u_opt, d_opt, false, + resi_vec, resi_vec); + + NewArray(&xt,xlen,3); + do_rotation(xa, 
xt, xlen, t0, u0); + for (r=0;r<xlen;r++) + { + a_vec[i][r][0]=xt[r][0]; + a_vec[i][r][1]=xt[r][1]; + a_vec[i][r][2]=xt[r][2]; + } + DeleteArray(&xt, xlen); + + /* clean up */ + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + sequence[0].clear(); + sequence[1].clear(); + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + } + ylen = len_vec[repr_idx]; + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(a_vec[repr_idx],seq_vec[repr_idx],sec_vec[repr_idx], ylen,ya,seqy,secy); + + /* recover alignment */ + int ylen_ext=ylen; // chain length + double **ya_ext; // structure of single chain + char *seqy_ext; // for the protein sequence + char *secy_ext; // for the secondary structure + for (r=0;r<msa.size();r++) msa[r].clear(); msa.clear(); + msa.assign(ylen,""); // row is position along msa; column is sequence + vector<string> msa_ext; // row is position along msa; column is sequence + for (r=0;r<ylen;r++) msa[r]=seqy[r]; + //for (r=0;r<msa.size();r++) cout<<"["<<r<<"]\t"<<msa[r]<<endl; + //cout<<"start recover"<<endl; + assign_list[repr_idx]=0; + for (tm_idx=0; tm_idx<TM_pair_vec.size(); tm_idx++) + { + i=TM_pair_vec[tm_idx].second; + assign_list[i]=tm_idx+1; + + xlen = len_vec[i]; + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(a_vec[i],seq_vec[i],sec_vec[i], xlen,xa,seqx,secx); + + /* declare variable specific to this pair of TMalign */ + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + int *invmap = new int[ylen+1]; + + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + 
d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + 0, a_opt, u_opt, d_opt, mol_type, 1, invmap); + + int rx=0,ry=0; + ylen_ext=seqxA.size(); + NewArray(&ya_ext, ylen_ext, 3); // structure of single chain + seqy_ext= new char[ylen_ext+1]; // for the protein sequence + secy_ext= new char[ylen_ext+1]; // for the secondary structure + string tmp_gap=""; + for (r=0;r<msa[0].size();r++) tmp_gap+='-'; + for (r=msa_ext.size();r<ylen_ext;r++) msa_ext.push_back(""); + //cout<<"x:"<<xname_vec[i]<<'\n'<<seqxA<<endl; + //cout<<"y:"<<xname_vec[repr_idx]<<'\n'<<seqyA<<endl; + for (r=0;r<ylen_ext;r++) + { + if (seqyA[r]=='-') + { + msa_ext[r]=tmp_gap+seqxA[r]; + ya_ext[r][0]=xa[rx][0]; + ya_ext[r][1]=xa[rx][1]; + ya_ext[r][2]=xa[rx][2]; + seqy_ext[r]=seqx[rx]; + secy_ext[r]=secx[rx]; + } + else + { + msa_ext[r]=msa[ry]+seqxA[r]; + ya_ext[r][0]=ya[ry][0]; + ya_ext[r][1]=ya[ry][1]; + ya_ext[r][2]=ya[ry][2]; + seqy_ext[r]=seqy[ry]; + secy_ext[r]=secy[ry]; + } + rx+=(seqxA[r]!='-'); + ry+=(seqyA[r]!='-'); + } + + /* copy ya_ext to ya */ + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + + ylen=ylen_ext; + NewArray(&ya,ylen,3); + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + for (r=0;r<ylen;r++) + { + ya[r][0]=ya_ext[r][0]; + ya[r][1]=ya_ext[r][1]; + ya[r][2]=ya_ext[r][2]; + seqy[r]=seqy_ext[r]; + secy[r]=secy_ext[r]; + } + for (r=0;r<ylen;r++) + { + if (r<msa.size()) msa[r]=msa_ext[r]; + else msa.push_back(msa_ext[r]); + } + //for (r=0;r<ylen_ext;r++) cout<<"["<<r<<"]\t"<<msa_ext[r]<<'\t'<<seqy[r]<<'\t' + //<<ya[r][0]<<'\t'<<ya[r][1]<<'\t'<<ya[r][2]<<'\t'<<secy[r]<<endl; + + /* clean up */ + tmp_gap.clear(); + delete[]invmap; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + + delete[]seqy_ext; + delete[]secy_ext; + DeleteArray(&ya_ext,ylen_ext); + } + vector<string>().swap(msa_ext); + 
vector<pair<double,int> >().swap(TM_pair_vec); + for (i=0; i<chain_num; i++) + { + tm_idx=assign_list[i]; + if (tm_idx<0) continue; + seqyA_mat[i][i]=""; + for (r=0 ;r<ylen ; r++) seqyA_mat[i][i]+=msa[r][tm_idx]; + seqxA_mat[i][i]=seqyA_mat[i][i]; + //cout<<xname_vec[i]<<'\t'<<seqxA_mat[i][i]<<endl; + } + for (i=0;i<chain_num; i++) + { + if (assign_list[i]<0) continue; + string seqxA=seqxA_mat[i][i]; + for (j=0; j<chain_num; j++) + { + if (i==j || assign_list[j]<0) continue; + string seqyA=seqyA_mat[j][j]; + seqxA_mat[i][j]=seqyA_mat[i][j]=""; + for (r=0;r<ylen;r++) + { + if (seqxA[r]=='-' && seqyA[r]=='-') continue; + seqxA_mat[i][j]+=seqxA[r]; + seqyA_mat[i][j]+=seqyA[r]; + } + seqyA.clear(); + } + seqxA.clear(); + } + + /* recover statistics such as TM-score */ + compare_num=0; + TM1_total=0, TM2_total=0; + TM3_total=0, TM4_total=0, TM5_total=0; + d0_0_total=0, TM_0_total=0; + d0A_total=0, d0B_total=0, d0u_total=0, d0a_total=0; + d0_out_total=0; + rmsd0_total = 0.0; + L_ali_total=0; + Liden_total=0; + TM_ali_total=0, rmsd_ali_total=0; + n_ali_total=0; + n_ali8_total=0; + xlen_total=0, ylen_total=0; + for (i=0; i< chain_num; i++) + { + xlen=len_vec[i]; + if (xlen<3) continue; + seqx = new char[xlen+1]; + secx = new char[xlen+1]; + NewArray(&xa, xlen, 3); + copy_chain_data(a_vec[i],seq_vec[i],sec_vec[i], xlen,xa,seqx,secx); + for (j=i+1;j<chain_num;j++) + { + ylen=len_vec[j]; + if (ylen<3) continue; + compare_num++; + seqy = new char[ylen+1]; + secy = new char[ylen+1]; + NewArray(&ya, ylen, 3); + copy_chain_data(a_vec[j],seq_vec[j],sec_vec[j],ylen,ya,seqy,secy); + sequence[0]=seqxA_mat[i][j]; + sequence[1]=seqyA_mat[i][j]; + + /* declare variable specific to this pair of TMalign */ + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali=0; // Aligned length in standard_TMscore + double Liden=0; + 
double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + int *invmap = new int[ylen+1]; + + se_main(xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + true, a_opt, u_opt, d_opt, mol_type, 1, invmap); + + if (xlen<=ylen) + { + xlen_total+=xlen; + ylen_total+=ylen; + TM1_total+=TM1; + TM2_total+=TM2; + d0A_total+=d0A; + d0B_total+=d0B; + } + else + { + xlen_total+=ylen; + ylen_total+=xlen; + TM1_total+=TM2; + TM2_total+=TM1; + d0A_total+=d0B; + d0B_total+=d0A; + } + TM_mat[i][j]=TM2; + TM_mat[j][i]=TM1; + d0_mat[i][j]=d0B; + d0_mat[j][i]=d0A; + seqID_mat[i][j]=1.*Liden/xlen; + seqID_mat[j][i]=1.*Liden/ylen; + + TM3_total+=TM3; + TM4_total+=TM4; + TM5_total+=TM5; + d0_0_total+=d0_0; + TM_0_total+=TM_0; + d0u_total+=d0u; + d0_out_total+=d0_out; + rmsd0_total+=rmsd0; + L_ali_total+=L_ali; // Aligned length in standard_TMscore + Liden_total+=Liden; + TM_ali_total+=TM_ali; + rmsd_ali_total+=rmsd_ali; // TMscore and rmsd in standard_TMscore + n_ali_total+=n_ali; + n_ali8_total+=n_ali8; + + /* clean up */ + delete[]invmap; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + + delete[]seqy; + delete[]secy; + DeleteArray(&ya,ylen); + } + delete[]seqx; + delete[]secx; + DeleteArray(&xa,xlen); + } + if (TM4_total<=TM4_total_max) break; + TM4_total_max=TM4_total; + } + for (i=0;i<chain_num;i++) + { + for (j=0;j<chain_num;j++) + { + if (i==j) continue; + TM_vec[i]+=TM_mat[i][j]; + d0_vec[i]+=d0_mat[i][j]; + seqID_vec[i]+=seqID_mat[i][j]; + } + TM_vec[i]/=(chain_num-1); + d0_vec[i]/=(chain_num-1); + seqID_vec[i]/=(chain_num-1); + } + xlen_total /=compare_num; + ylen_total /=compare_num; + TM1_total /=compare_num; + TM2_total /=compare_num; + d0A_total /=compare_num; + d0B_total /=compare_num; + TM3_total /=compare_num; + TM4_total /=compare_num; + TM5_total /=compare_num; + d0_0_total 
/=compare_num; + TM_0_total /=compare_num; + d0u_total /=compare_num; + d0_out_total /=compare_num; + rmsd0_total /=compare_num; + L_ali_total /=compare_num; + Liden_total /=compare_num; + TM_ali_total /=compare_num; + rmsd_ali_total/=compare_num; + n_ali_total /=compare_num; + n_ali8_total /=compare_num; + xname="shorter"; + yname="longer"; + string seqM=""; + string seqxA=""; + string seqyA=""; + double t0[3]; + double u0[3][3]; + stringstream buf; + for (i=0; i<chain_num; i++) + { + if (assign_list[i]<0) continue; + buf <<">"<<xname_vec[i]<<"\tL="<<len_vec[i] + <<"\td0="<<setiosflags(ios::fixed)<<setprecision(2)<<d0_vec[i] + <<"\tseqID="<<setiosflags(ios::fixed)<<setprecision(3)<<seqID_vec[i] + <<"\tTM-score="<<setiosflags(ios::fixed)<<setprecision(5)<<TM_vec[i]; + if (i==repr_idx) buf<<"\t*"; + buf<<'\n'<<seqxA_mat[i][i]<<endl; + } + seqM=buf.str(); + seqM=seqM.substr(0,seqM.size()-1); + buf.str(string()); + //MergeAlign(seqxA_mat,seqyA_mat,repr_idx,xname_vec,chain_num,seqM); + if (outfmt_opt==0) print_version(); + output_mTMalign_results( xname,yname, "","", + xlen_total, ylen_total, t0, u0, TM1_total, TM2_total, + TM3_total, TM4_total, TM5_total, rmsd0_total, d0_out_total, + seqM.c_str(), seqxA.c_str(), seqyA.c_str(), Liden_total, + n_ali8_total, L_ali_total, TM_ali_total, rmsd_ali_total, + TM_0_total, d0_0_total, d0A_total, d0B_total, + Lnorm_ass, d0_scale, d0a_total, d0u_total, + "", outfmt_opt, ter_opt, 0, split_opt, false, + "", false, a_opt, u_opt, d_opt, false, + resi_vec, resi_vec ); + + if (m_opt || o_opt) + { + double **ut_mat; // rotation matrices for all-against-all alignment + int ui,uj; + double t[3], u[3][3]; + double rmsd; + NewArray(&ut_mat,chain_num,4*3); + for (i=0;i<chain_num;i++) + { + xlen=ylen=a_vec[i].size(); + NewArray(&xa,xlen,3); + NewArray(&ya,xlen,3); + for (r=0;r<xlen;r++) + { + xa[r][0]=ua_vec[i][r][0]; + xa[r][1]=ua_vec[i][r][1]; + xa[r][2]=ua_vec[i][r][2]; + ya[r][0]= a_vec[i][r][0]; + ya[r][1]= a_vec[i][r][1]; + ya[r][2]= 
a_vec[i][r][2]; + } + Kabsch(xa,ya,xlen,1,&rmsd,t,u); + for (ui=0;ui<3;ui++) for (uj=0;uj<3;uj++) ut_mat[i][ui*3+uj]=u[ui][uj]; + for (uj=0;uj<3;uj++) ut_mat[i][9+uj]=t[uj]; + DeleteArray(&xa,xlen); + DeleteArray(&ya,xlen); + } + vector<vector<vector<double> > >().swap(ua_vec); + + if (m_opt) + { + assign_list[repr_idx]=-1; + output_dock_rotation_matrix(fname_matrix.c_str(), + xname_vec,yname_vec, ut_mat, assign_list); + } + + if (o_opt) output_dock(chain_list, ter_opt, split_opt, + infmt_opt, atom_opt, false, ut_mat, fname_super); + + DeleteArray(&ut_mat,chain_num); + } + + /* clean up */ + vector<string>().swap(msa); + vector<string>().swap(tmp_str_vec); + vector<vector<string> >().swap(seqxA_mat); + vector<vector<string> >().swap(seqyA_mat); + vector<string>().swap(xname_vec); + vector<string>().swap(yname_vec); + delete[]TMave_list; + DeleteArray(&TMave_mat,chain_num); + vector<vector<vector<double> > >().swap(a_vec); // structure of complex + vector<vector<char> >().swap(seq_vec); // sequence of complex + vector<vector<char> >().swap(sec_vec); // secondary structure of complex + vector<int>().swap(mol_vec); // molecule type of complex1, RNA if >0 + vector<string>().swap(chainID_list); // list of chainID + vector<int>().swap(len_vec); // length of complex + vector<double>().swap(TM_vec); + vector<double>().swap(d0_vec); + vector<double>().swap(seqID_vec); + vector<vector<double> >().swap(TM_mat); + vector<vector<double> >().swap(d0_mat); + vector<vector<double> >().swap(seqID_mat); + return 1; +} + +/* sequence order independent alignment */ +int SOIalign(string &xname, string &yname, const string &fname_super, + const string &fname_lign, const string &fname_matrix, + vector<string> &sequence, const double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + const bool u_opt, const bool d_opt, const double TMcut, + const int infmt1_opt, const int infmt2_opt, const int ter_opt, + const int split_opt, const 
int outfmt_opt, const bool fast_opt, + const int cp_opt, const int mirror_opt, const int het_opt, + const string &atom_opt, const string &mol_opt, const string &dir_opt, + const string &dir1_opt, const string &dir2_opt, + const vector<string> &chain1_list, const vector<string> &chain2_list, + const bool se_opt, const int closeK_opt, const int mm_opt) +{ + /* declare previously global variables */ + vector<vector<string> >PDB_lines1; // text of chain1 + vector<vector<string> >PDB_lines2; // text of chain2 + vector<int> mol_vec1; // molecule type of chain1, RNA if >0 + vector<int> mol_vec2; // molecule type of chain2, RNA if >0 + vector<string> chainID_list1; // list of chainID1 + vector<string> chainID_list2; // list of chainID2 + int i,j; // file index + int chain_i,chain_j; // chain index + int r; // residue index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + int **secx_bond; // boundary of secondary structure + int **secy_bond; // boundary of secondary structure + double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and + // ya[0...ylen-1][0..2], in general, + // ya is regarded as native structure + // --> superpose xa onto ya + double **xk, **yk; // k closest residues + vector<string> resi_vec1; // residue index for chain1 + vector<string> resi_vec2; // residue index for chain2 + int read_resi=0; // whether to read residue index + if (o_opt) read_resi=2; + + /* loop over file names */ + for (i=0;i<chain1_list.size();i++) + { + /* parse chain 1 */ + xname=chain1_list[i]; + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + if (!xchainnum) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". 
Chain number 0."<<endl; + continue; + } + for (chain_i=0;chain_i<xchainnum;chain_i++) + { + xlen=PDB_lines1[chain_i].size(); + if (mol_opt=="RNA") mol_vec1[chain_i]=1; + else if (mol_opt=="protein") mol_vec1[chain_i]=-1; + if (!xlen) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". Chain length 0."<<endl; + continue; + } + else if (xlen<3) + { + cerr<<"Sequence is too short <3!: "<<xname<<endl; + continue; + } + NewArray(&xa, xlen, 3); + if (closeK_opt>=3) NewArray(&xk, xlen*closeK_opt, 3); + seqx = new char[xlen + 1]; + secx = new char[xlen + 1]; + xlen = read_PDB(PDB_lines1[chain_i], xa, seqx, + resi_vec1, read_resi); + if (mirror_opt) for (r=0;r<xlen;r++) xa[r][2]=-xa[r][2]; + if (mol_vec1[chain_i]>0) make_sec(seqx,xa, xlen, secx,atom_opt); + else make_sec(xa, xlen, secx); // secondary structure assignment + if (closeK_opt>=3) getCloseK(xa, xlen, closeK_opt, xk); + if (mm_opt==6) + { + NewArray(&secx_bond, xlen, 2); + assign_sec_bond(secx_bond, secx, xlen); + } + + for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) + { + /* parse chain 2 */ + if (PDB_lines2.size()==0) + { + yname=chain2_list[j]; + ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, + mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, + het_opt); + if (!ychainnum) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". Chain number 0."<<endl; + continue; + } + } + for (chain_j=0;chain_j<ychainnum;chain_j++) + { + ylen=PDB_lines2[chain_j].size(); + if (mol_opt=="RNA") mol_vec2[chain_j]=1; + else if (mol_opt=="protein") mol_vec2[chain_j]=-1; + if (!ylen) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". 
Chain length 0."<<endl; + continue; + } + else if (ylen<3) + { + cerr<<"Sequence is too short <3!: "<<yname<<endl; + continue; + } + NewArray(&ya, ylen, 3); + if (closeK_opt>=3) NewArray(&yk, ylen*closeK_opt, 3); + seqy = new char[ylen + 1]; + secy = new char[ylen + 1]; + ylen = read_PDB(PDB_lines2[chain_j], ya, seqy, + resi_vec2, read_resi); + if (mol_vec2[chain_j]>0) + make_sec(seqy, ya, ylen, secy, atom_opt); + else make_sec(ya, ylen, secy); + if (closeK_opt>=3) getCloseK(ya, ylen, closeK_opt, yk); + if (mm_opt==6) + { + NewArray(&secy_bond, ylen, 2); + assign_sec_bond(secy_bond, secy, ylen); + } + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + bool force_fast_opt=(getmin(xlen,ylen)>1500)?true:fast_opt; + int *invmap = new int[ylen+1]; + double *dist_list = new double[ylen+1]; + + /* entry function for structure alignment */ + if (se_opt) + { + u0[0][0]=u0[1][1]=u0[2][2]=1; + u0[0][1]= u0[0][2]= + u0[1][0]= u0[1][2]= + u0[2][0]= u0[2][1]= + t0[0] =t0[1] =t0[2] =0; + soi_se_main( + xa, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, + mol_vec1[chain_i]+mol_vec2[chain_j], + outfmt_opt, invmap, dist_list, + secx_bond, secy_bond, mm_opt); + if (outfmt_opt>=2) + { + Liden=L_ali=0; + int r1,r2; + for (r2=0;r2<ylen;r2++) + { + r1=invmap[r2]; + if (r1<0) continue; + L_ali+=1; + Liden+=(seqx[r1]==seqy[r2]); + } + } + } + else SOIalign_main(xa, ya, xk, yk, closeK_opt, + seqx, seqy, 
secx, secy, + t0, u0, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, invmap, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j], dist_list, + secx_bond, secy_bond, mm_opt); + + /* print result */ + if (outfmt_opt==0) print_version(); + output_results( + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), + chainID_list1[chain_i], chainID_list2[chain_j], + xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, + rmsd0, d0_out, seqM.c_str(), + seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, + d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + (m_opt?fname_matrix:"").c_str(), + outfmt_opt, ter_opt, false, split_opt, o_opt, + fname_super, i_opt, a_opt, u_opt, d_opt, mirror_opt, + resi_vec1, resi_vec2); + if (outfmt_opt<=0) + { + cout<<"###############\t###############\t#########"<<endl; + cout<<"#Aligned atom 1\tAligned atom 2 \tDistance#"<<endl; + int r1,r2; + for (r2=0;r2<ylen;r2++) + { + r1=invmap[r2]; + if (r1<0) continue; + cout<<PDB_lines1[chain_i][r1].substr(12,15)<<'\t' + <<PDB_lines2[chain_j][r2].substr(12,15)<<'\t' + <<setw(9)<<setiosflags(ios::fixed)<<setprecision(3) + <<dist_list[r2]<<'\n'; + } + cout<<"###############\t###############\t#########"<<endl; + } + + /* Done! 
Free memory */ + delete [] invmap; + delete [] dist_list; + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + DeleteArray(&ya, ylen); + if (closeK_opt>=3) DeleteArray(&yk, ylen*closeK_opt); + delete [] seqy; + delete [] secy; + resi_vec2.clear(); + if (mm_opt==6) DeleteArray(&secy_bond, ylen); + } // chain_j + if (chain2_list.size()>1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + } // j + PDB_lines1[chain_i].clear(); + DeleteArray(&xa, xlen); + if (closeK_opt>=3) DeleteArray(&xk, xlen*closeK_opt); + delete [] seqx; + delete [] secx; + resi_vec1.clear(); + if (mm_opt==6) DeleteArray(&secx_bond, xlen); + } // chain_i + xname.clear(); + PDB_lines1.clear(); + chainID_list1.clear(); + mol_vec1.clear(); + } // i + if (chain2_list.size()==1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + resi_vec2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + return 0; +} + +int flexalign(string &xname, string &yname, const string &fname_super, + const string &fname_lign, const string &fname_matrix, + vector<string> &sequence, const double Lnorm_ass, const double d0_scale, + const bool m_opt, const int i_opt, const int o_opt, const int a_opt, + const bool u_opt, const bool d_opt, const double TMcut, + const int infmt1_opt, const int infmt2_opt, const int ter_opt, + const int split_opt, const int outfmt_opt, const bool fast_opt, + const int mirror_opt, const int het_opt, + const string &atom_opt, const string &mol_opt, const string &dir_opt, + const string &dir1_opt, const string &dir2_opt, const int byresi_opt, + const vector<string> &chain1_list, const vector<string> &chain2_list, + const int hinge_opt) +{ + /* declare previously global variables */ + vector<vector<string> >PDB_lines1; // text of chain1 + vector<vector<string> >PDB_lines2; // text of chain2 + vector<int> 
mol_vec1; // molecule type of chain1, RNA if >0 + vector<int> mol_vec2; // molecule type of chain2, RNA if >0 + vector<string> chainID_list1; // list of chainID1 + vector<string> chainID_list2; // list of chainID2 + int i,j; // file index + int chain_i,chain_j; // chain index + int r; // residue index + int xlen, ylen; // chain length + int xchainnum,ychainnum;// number of chains in a PDB file + char *seqx, *seqy; // for the protein sequence + char *secx, *secy; // for the secondary structure + double **xa, **ya; // for input vectors xa[0...xlen-1][0..2] and + // ya[0...ylen-1][0..2], in general, + // ya is regarded as native structure + // --> superpose xa onto ya + vector<string> resi_vec1; // residue index for chain1 + vector<string> resi_vec2; // residue index for chain2 + int read_resi=byresi_opt; // whether to read residue index + if (byresi_opt==0 && o_opt) read_resi=2; + + /* loop over file names */ + for (i=0;i<chain1_list.size();i++) + { + /* parse chain 1 */ + xname=chain1_list[i]; + xchainnum=get_PDB_lines(xname, PDB_lines1, chainID_list1, + mol_vec1, ter_opt, infmt1_opt, atom_opt, split_opt, het_opt); + if (!xchainnum) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". Chain number 0."<<endl; + continue; + } + for (chain_i=0;chain_i<xchainnum;chain_i++) + { + xlen=PDB_lines1[chain_i].size(); + if (mol_opt=="RNA") mol_vec1[chain_i]=1; + else if (mol_opt=="protein") mol_vec1[chain_i]=-1; + if (!xlen) + { + cerr<<"Warning! Cannot parse file: "<<xname + <<". 
Chain length 0."<<endl; + continue; + } + else if (xlen<3) + { + cerr<<"Sequence is too short <3!: "<<xname<<endl; + continue; + } + NewArray(&xa, xlen, 3); + seqx = new char[xlen + 1]; + secx = new char[xlen + 1]; + xlen = read_PDB(PDB_lines1[chain_i], xa, seqx, + resi_vec1, read_resi); + if (mirror_opt) for (r=0;r<xlen;r++) xa[r][2]=-xa[r][2]; + if (mol_vec1[chain_i]>0) make_sec(seqx,xa, xlen, secx,atom_opt); + else make_sec(xa, xlen, secx); // secondary structure assignment + + for (j=(dir_opt.size()>0)*(i+1);j<chain2_list.size();j++) + { + /* parse chain 2 */ + if (PDB_lines2.size()==0) + { + yname=chain2_list[j]; + ychainnum=get_PDB_lines(yname, PDB_lines2, chainID_list2, + mol_vec2, ter_opt, infmt2_opt, atom_opt, split_opt, + het_opt); + if (!ychainnum) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". Chain number 0."<<endl; + continue; + } + } + for (chain_j=0;chain_j<ychainnum;chain_j++) + { + ylen=PDB_lines2[chain_j].size(); + if (mol_opt=="RNA") mol_vec2[chain_j]=1; + else if (mol_opt=="protein") mol_vec2[chain_j]=-1; + if (!ylen) + { + cerr<<"Warning! Cannot parse file: "<<yname + <<". 
Chain length 0."<<endl; + continue; + } + else if (ylen<3) + { + cerr<<"Sequence is too short <3!: "<<yname<<endl; + continue; + } + NewArray(&ya, ylen, 3); + seqy = new char[ylen + 1]; + secy = new char[ylen + 1]; + ylen = read_PDB(PDB_lines2[chain_j], ya, seqy, + resi_vec2, read_resi); + if (mol_vec2[chain_j]>0) + make_sec(seqy, ya, ylen, secy, atom_opt); + else make_sec(ya, ylen, secy); + + if (byresi_opt) extract_aln_from_resi(sequence, + seqx,seqy,resi_vec1,resi_vec2,byresi_opt); + + /* declare variable specific to this pair of TMalign */ + double t0[3], u0[3][3]; + double TM1, TM2; + double TM3, TM4, TM5; // for a_opt, u_opt, d_opt + double d0_0, TM_0; + double d0A, d0B, d0u, d0a; + double d0_out=5.0; + string seqM, seqxA, seqyA;// for output alignment + double rmsd0 = 0.0; + int L_ali; // Aligned length in standard_TMscore + double Liden=0; + double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore + int n_ali=0; + int n_ali8=0; + bool force_fast_opt=(getmin(xlen,ylen)>1500)?true:fast_opt; + vector<vector<double> >tu_vec; + + /* entry function for structure alignment */ + int hingeNum=flexalign_main( + xa, ya, seqx, seqy, secx, secy, + t0, u0, tu_vec, TM1, TM2, TM3, TM4, TM5, + d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, + seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, + i_opt, a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],hinge_opt); + + if (hinge_opt && hingeNum<=1 && + n_ali8<0.6*getmin(xlen,ylen)) + { + double t0_h[3], u0_h[3][3]; + double TM1_h, TM2_h; + double TM3_h, TM4_h, TM5_h; + double d0_0_h, TM_0_h; + double d0_out_h=5.0; + string seqM_h, seqxA_h, seqyA_h; + double rmsd0_h = 0.0; + int L_ali_h; + double Liden_h=0; + double TM_ali_h, rmsd_ali_h; + int n_ali_h=0; + int n_ali8_h=0; + vector<vector<double> >tu_vec_h(1,tu_vec[0]); + tu2t_u(tu_vec[0],t0_h,u0_h); + + int hingeNum_h=flexalign_main( + xa, ya, seqx, seqy, secx, secy, + t0_h, u0_h, 
tu_vec_h, + TM1_h, TM2_h, TM3_h, TM4_h, TM5_h, + d0_0_h, TM_0_h, d0A, d0B, d0u, d0a, d0_out_h, + seqM_h, seqxA_h, seqyA_h, rmsd0_h, L_ali_h, + Liden_h, TM_ali_h, rmsd_ali_h, n_ali_h, n_ali8_h, + xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, + a_opt, u_opt, d_opt, force_fast_opt, + mol_vec1[chain_i]+mol_vec2[chain_j],hinge_opt); + + double TM =(TM1 >TM2 )?TM1 :TM2; + double TM_h=(TM1_h>TM2_h)?TM1_h:TM2_h; + if (TM_h>TM) + { + hingeNum=hingeNum_h; + tu2t_u(tu_vec_h[0],t0,u0); + TM1=TM1_h; + TM2=TM2_h; + TM3=TM3_h; + TM4=TM4_h; + TM5=TM5_h; + d0_0=d0_0_h; + TM_0=TM_0_h; + d0_out=d0_out_h; + seqM=seqM_h; + seqxA=seqxA_h; + seqyA=seqyA_h; + rmsd0=rmsd0_h; + L_ali=L_ali_h; + Liden=Liden_h; + TM_ali=TM_ali_h; + rmsd_ali=rmsd_ali_h; + n_ali=n_ali_h; + n_ali8=n_ali8_h; + tu_vec.clear(); + for (int hinge=0;hinge<tu_vec_h.size();hinge++) + tu_vec.push_back(tu_vec_h[hinge]); + } + else tu2t_u(tu_vec[0],t0,u0); + } + + /* print result */ + if (outfmt_opt==0) print_version(); + output_flexalign_results( + xname.substr(dir1_opt.size()+dir_opt.size()), + yname.substr(dir2_opt.size()+dir_opt.size()), + chainID_list1[chain_i], chainID_list2[chain_j], + xlen, ylen, t0, u0, tu_vec, TM1, TM2, TM3, TM4, TM5, + rmsd0, d0_out, seqM.c_str(), + seqxA.c_str(), seqyA.c_str(), Liden, + n_ali8, L_ali, TM_ali, rmsd_ali, TM_0, d0_0, + d0A, d0B, Lnorm_ass, d0_scale, d0a, d0u, + (m_opt?fname_matrix:"").c_str(), + outfmt_opt, ter_opt, false, split_opt, o_opt, + fname_super, i_opt, a_opt, u_opt, d_opt, mirror_opt, + resi_vec1, resi_vec2); + + /* Done! 
Free memory */ + tu_vec.clear(); + seqM.clear(); + seqxA.clear(); + seqyA.clear(); + DeleteArray(&ya, ylen); + delete [] seqy; + delete [] secy; + resi_vec2.clear(); + } // chain_j + if (chain2_list.size()>1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + } // j + PDB_lines1[chain_i].clear(); + DeleteArray(&xa, xlen); + delete [] seqx; + delete [] secx; + resi_vec1.clear(); + } // chain_i + xname.clear(); + PDB_lines1.clear(); + chainID_list1.clear(); + mol_vec1.clear(); + } // i + if (chain2_list.size()==1) + { + yname.clear(); + for (chain_j=0;chain_j<ychainnum;chain_j++) + PDB_lines2[chain_j].clear(); + PDB_lines2.clear(); + resi_vec2.clear(); + chainID_list2.clear(); + mol_vec2.clear(); + } + return 0; +} + + +int main(int argc, char *argv[]) +{ + if (argc < 2) print_help(); + + + clock_t t1, t2; + t1 = clock(); + + /**********************/ + /* get argument */ + /**********************/ + string xname = ""; + string yname = ""; + string fname_super = ""; // file name for superposed structure + string fname_lign = ""; // file name for user alignment + string fname_matrix= ""; // file name for output matrix + vector<string> sequence; // get value from alignment file + double Lnorm_ass, d0_scale; + + bool h_opt = false; // print full help message + bool v_opt = false; // print version + bool m_opt = false; // flag for -m, output rotation matrix + int i_opt = 0; // 1 for -i, 3 for -I + int o_opt = 0; // 1 for -o, 2 for -rasmol + int a_opt = 0; // flag for -a, do not normalized by average length + bool u_opt = false; // flag for -u, normalized by user specified length + bool d_opt = false; // flag for -d, user specified d0 + + bool full_opt = false;// do not show chain level alignment + double TMcut =-1; + bool se_opt =false; + int infmt1_opt=-1; // PDB or PDBx/mmCIF format for chain_1 + int infmt2_opt=-1; // PDB or PDBx/mmCIF format for 
chain_2 + int ter_opt =2; // END, or different chainID + int split_opt =2; // split each chains + int outfmt_opt=0; // set -outfmt to full output + bool fast_opt =false; // flags for -fast, fTM-align algorithm + int cp_opt =0; // do not check circular permutation + int closeK_opt=-1; // number of atoms for SOI initial alignment. + // 5 and 0 for -mm 5 and 6 + int hinge_opt =9; // maximum number of hinge allowed for flexible + int mirror_opt=0; // do not align mirror + int het_opt=0; // do not read HETATM residues + int mm_opt=0; // do not perform MM-align + string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA + string mol_opt ="auto";// auto-detect the molecule type as protein/RNA + string suffix_opt=""; // set -suffix to empty + string dir_opt =""; // set -dir to empty + string dir1_opt =""; // set -dir1 to empty + string dir2_opt =""; // set -dir2 to empty + int byresi_opt=0; // set -byresi to 0 + vector<string> chain1_list; // only when -dir1 is set + vector<string> chain2_list; // only when -dir2 is set + + for(int i = 1; i < argc; i++) + { + if ( !strcmp(argv[i],"-o") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -o"); + if (o_opt==2) + cerr<<"Warning! -rasmol is already set. Ignore -o"<<endl; + else + { + fname_super = argv[i + 1]; + o_opt = 1; + } + i++; + } + else if ( !strcmp(argv[i],"-rasmol") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -rasmol"); + if (o_opt==1) + cerr<<"Warning! -o is already set. Ignore -rasmol"<<endl; + else + { + fname_super = argv[i + 1]; + o_opt = 2; + } + i++; + } + else if ( !strcmp(argv[i],"-u") || !strcmp(argv[i],"-L") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -u or -L"); + Lnorm_ass = atof(argv[i + 1]); u_opt = true; i++; + if (Lnorm_ass<=0) PrintErrorAndQuit( + "ERROR! The value for -u or -L should be >0"); + } + else if ( !strcmp(argv[i],"-a") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! 
Missing value for -a"); + if (!strcmp(argv[i + 1], "T")) a_opt=true; + else if (!strcmp(argv[i + 1], "F")) a_opt=false; + else + { + a_opt=atoi(argv[i + 1]); + if (a_opt!=-2 && a_opt!=-1 && a_opt!=1) + PrintErrorAndQuit("-a must be -2, -1, 1, T or F"); + } + i++; + } + else if ( !strcmp(argv[i],"-full") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -full"); + if (!strcmp(argv[i + 1], "T")) full_opt=true; + else if (!strcmp(argv[i + 1], "F")) full_opt=false; + else PrintErrorAndQuit("-full must be T or F"); + i++; + } + else if ( !strcmp(argv[i],"-d") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -d"); + d0_scale = atof(argv[i + 1]); d_opt = true; i++; + } + else if ( !strcmp(argv[i],"-closeK") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -closeK"); + closeK_opt = atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-hinge") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -hinge"); + hinge_opt = atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-v") ) + { + v_opt = true; + } + else if ( !strcmp(argv[i],"-h") ) + { + h_opt = true; + } + else if ( !strcmp(argv[i],"-i") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -i"); + if (i_opt==3) + PrintErrorAndQuit("ERROR! -i and -I cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 1; i++; + } + else if (!strcmp(argv[i], "-I") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -I"); + if (i_opt==1) + PrintErrorAndQuit("ERROR! -I and -i cannot be used together"); + fname_lign = argv[i + 1]; i_opt = 3; i++; + } + else if (!strcmp(argv[i], "-m") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! 
Missing value for -m"); + fname_matrix = argv[i + 1]; m_opt = true; i++; + }// get filename for rotation matrix + else if (!strcmp(argv[i], "-fast")) + { + fast_opt = true; + } + else if (!strcmp(argv[i], "-se")) + { + se_opt = true; + } + else if ( !strcmp(argv[i],"-infmt1") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -infmt1"); + infmt1_opt=atoi(argv[i + 1]); i++; + if (infmt1_opt<-1 || infmt1_opt>3) + PrintErrorAndQuit("ERROR! -infmt1 can only be -1, 0, 1, 2, or 3"); + } + else if ( !strcmp(argv[i],"-infmt2") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -infmt2"); + infmt2_opt=atoi(argv[i + 1]); i++; + if (infmt2_opt<-1 || infmt2_opt>3) + PrintErrorAndQuit("ERROR! -infmt2 can only be -1, 0, 1, 2, or 3"); + } + else if ( !strcmp(argv[i],"-ter") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -ter"); + ter_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-split") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -split"); + split_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-atom") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -atom"); + atom_opt=argv[i + 1]; i++; + if (atom_opt.size()!=4) PrintErrorAndQuit( + "ERROR! Atom name must have 4 characters, including space.\n" + "For example, C alpha, C3' and P atoms should be specified by\n" + "-atom \" CA \", -atom \" P \" and -atom \" C3'\", respectively."); + } + else if ( !strcmp(argv[i],"-mol") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -mol"); + mol_opt=argv[i + 1]; i++; + if (mol_opt=="prot") mol_opt="protein"; + else if (mol_opt=="DNA") mol_opt="RNA"; + if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") + PrintErrorAndQuit("ERROR! 
Molecule type must be one of the " + "following:\nauto, prot (the same as 'protein'), and " + "RNA (the same as 'DNA')."); + } + else if ( !strcmp(argv[i],"-dir") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dir"); + dir_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir1") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dir1"); + dir1_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-dir2") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -dir2"); + dir2_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-suffix") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -suffix"); + suffix_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-outfmt") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -outfmt"); + outfmt_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-TMcut") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -TMcut"); + TMcut=atof(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-byresi") || + !strcmp(argv[i],"-tmscore") || + !strcmp(argv[i],"-TMscore")) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -byresi"); + byresi_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-seq") ) + { + byresi_opt=5; + } + else if ( !strcmp(argv[i],"-cp") ) + { + mm_opt=3; + } + else if ( !strcmp(argv[i],"-mirror") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -mirror"); + mirror_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-het") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -het"); + het_opt=atoi(argv[i + 1]); i++; + if (het_opt!=0 && het_opt!=1 && het_opt!=2) + PrintErrorAndQuit("-het must be 0, 1, or 2"); + } + else if ( !strcmp(argv[i],"-mm") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! 
Missing value for -mm"); + mm_opt=atoi(argv[i + 1]); i++; + } + else if (xname.size() == 0) xname=argv[i]; + else if (yname.size() == 0) yname=argv[i]; + else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); + } + + if(xname.size()==0 || (yname.size()==0 && dir_opt.size()==0) || + (yname.size() && dir_opt.size())) + { + if (h_opt) print_help(h_opt); + if (v_opt) + { + print_version(); + exit(EXIT_FAILURE); + } + if (xname.size()==0) + PrintErrorAndQuit("Please provide input structures"); + else if (yname.size()==0 && dir_opt.size()==0 && mm_opt!=4) + PrintErrorAndQuit("Please provide structure B"); + else if (yname.size() && dir_opt.size()) + PrintErrorAndQuit("Please provide only one file name if -dir is set"); + } + + if (suffix_opt.size() && dir_opt.size()+dir1_opt.size()+dir2_opt.size()==0) + PrintErrorAndQuit("-suffix is only valid if -dir, -dir1 or -dir2 is set"); + if ((dir_opt.size() || dir1_opt.size() || dir2_opt.size())) + { + if (mm_opt!=2 && mm_opt!=4) + { + if (o_opt) + PrintErrorAndQuit("-o cannot be set with -dir, -dir1 or -dir2"); + if (m_opt && fname_matrix!="-") + PrintErrorAndQuit("-m can only be - or unset when using -dir, -dir1 or -dir2"); + } + else if (dir_opt.size() && (dir1_opt.size() || dir2_opt.size())) + PrintErrorAndQuit("-dir cannot be set with -dir1 or -dir2"); + } + if (o_opt && (infmt1_opt!=-1 && infmt1_opt!=0 && infmt1_opt!=3)) + PrintErrorAndQuit("-o can only be used with -infmt1 -1, 0 or 3"); + + if (mol_opt=="protein" && atom_opt=="auto") + atom_opt=" CA "; + else if (mol_opt=="RNA" && atom_opt=="auto") + atom_opt=" C3'"; + + if (d_opt && d0_scale<=0) + PrintErrorAndQuit("Wrong value for option -d! 
It should be >0"); + if (outfmt_opt>=2 && (a_opt || u_opt || d_opt)) + PrintErrorAndQuit("-outfmt 2 cannot be used with -a, -u, -L, -d"); + if (byresi_opt!=0) + { + if (i_opt) + PrintErrorAndQuit("-byresi >=1 cannot be used with -i or -I"); + if (byresi_opt<0 || byresi_opt>6) + PrintErrorAndQuit("-byresi can only be 0 to 6"); + if ((byresi_opt==2 || byresi_opt==3 || byresi_opt==6) && ter_opt>=2) + PrintErrorAndQuit("-byresi 2 and 6 must be used with -ter <=1"); + } + //if (split_opt==1 && ter_opt!=0) + //PrintErrorAndQuit("-split 1 should be used with -ter 0"); + //else if (split_opt==2 && ter_opt!=0 && ter_opt!=1) + //PrintErrorAndQuit("-split 2 should be used with -ter 0 or 1"); + if (split_opt<0 || split_opt>2) + PrintErrorAndQuit("-split can only be 0, 1 or 2"); + + if (mm_opt==3) + { + cp_opt=true; + mm_opt=0; + } + if (cp_opt && i_opt) + PrintErrorAndQuit("-mm 3 cannot be used with -i or -I"); + + if (mirror_opt && het_opt!=1) + cerr<<"WARNING! -mirror was not used with -het 1. " + <<"D amino acids may not be correctly aligned."<<endl; + + if (mm_opt) + { + if (i_opt) PrintErrorAndQuit("-mm cannot be used with -i or -I"); + if (u_opt) PrintErrorAndQuit("-mm cannot be used with -u or -L"); + //if (cp_opt) PrintErrorAndQuit("-mm cannot be used with -cp"); + if (dir_opt.size() && (mm_opt==1||mm_opt==2)) PrintErrorAndQuit("-mm 1 or 2 cannot be used with -dir"); + if (byresi_opt) PrintErrorAndQuit("-mm cannot be used with -byresi"); + if (ter_opt>=2 && (mm_opt==1 || mm_opt==2)) PrintErrorAndQuit("-mm 1 or 2 must be used with -ter 0 or -ter 1"); + if (mm_opt==4 && (yname.size() || dir2_opt.size())) + cerr<<"WARNING! structure_2 is ignored for -mm 4"<<endl; + } + else if (full_opt) PrintErrorAndQuit("-full can only be used with -mm"); + + if (o_opt && ter_opt<=1 && split_opt==2) + { + if (mm_opt && o_opt==2) cerr<<"WARNING! -mm may generate incorrect" + <<" RasMol output due to limitations in PDB file format. 
" + <<"When -mm is used, -o is recommended over -rasmol"<<endl; + else if (mm_opt==0) cerr<<"WARNING! Only the superposition of the" + <<"last aligned chain pair will be generated"<<endl; + } + + if (closeK_opt<0) + { + if (mm_opt==5) closeK_opt=5; + else closeK_opt=0; + } + + if (mm_opt==7 && hinge_opt>=10) + PrintErrorAndQuit("ERROR! -hinge must be <10"); + + + /* read initial alignment file from 'align.txt' */ + if (i_opt) read_user_alignment(sequence, fname_lign, i_opt); + + if (byresi_opt==6) mm_opt=1; + else if (byresi_opt) i_opt=3; + + if (m_opt && fname_matrix == "") // Output rotation matrix: matrix.txt + PrintErrorAndQuit("ERROR! Please provide a file name for option -m!"); + + /* parse file list */ + if (dir1_opt.size()+dir_opt.size()==0) chain1_list.push_back(xname); + else file2chainlist(chain1_list, xname, dir_opt+dir1_opt, suffix_opt); + + int i; + if (dir_opt.size()) + for (i=0;i<chain1_list.size();i++) + chain2_list.push_back(chain1_list[i]); + else if (dir2_opt.size()==0) chain2_list.push_back(yname); + else file2chainlist(chain2_list, yname, dir2_opt, suffix_opt); + + if (outfmt_opt==2) + { + if (mm_opt==2) cout<<"#Query\tTemplate\tTM"<<endl; + else cout<<"#PDBchain1\tPDBchain2\tTM1\tTM2\t" + <<"RMSD\tID1\tID2\tIDali\tL1\tL2\tLali"<<endl; + } + + /* real alignment. 
entry functions are MMalign_main and + * TMalign_main */ + if (mm_opt==0) TMalign(xname, yname, fname_super, fname_lign, fname_matrix, + sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, a_opt, + u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, + split_opt, outfmt_opt, fast_opt, cp_opt, mirror_opt, het_opt, + atom_opt, mol_opt, dir_opt, dir1_opt, dir2_opt, byresi_opt, + chain1_list, chain2_list, se_opt); + else if (mm_opt==1) MMalign(xname, yname, fname_super, fname_lign, + fname_matrix, sequence, d0_scale, m_opt, o_opt, + a_opt, d_opt, full_opt, TMcut, infmt1_opt, infmt2_opt, + ter_opt, split_opt, outfmt_opt, fast_opt, mirror_opt, het_opt, + atom_opt, mol_opt, dir1_opt, dir2_opt, chain1_list, chain2_list, + byresi_opt); + else if (mm_opt==2) MMdock(xname, yname, fname_super, + fname_matrix, sequence, Lnorm_ass, d0_scale, m_opt, o_opt, a_opt, + u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, + split_opt, outfmt_opt, fast_opt, mirror_opt, het_opt, + atom_opt, mol_opt, dir1_opt, dir2_opt, chain1_list, chain2_list); + else if (mm_opt==3) ; // should be changed to mm_opt=0, cp_opt=true + else if (mm_opt==4) mTMalign(xname, yname, fname_super, fname_matrix, + sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, a_opt, + u_opt, d_opt, full_opt, TMcut, infmt1_opt, ter_opt, + split_opt, outfmt_opt, fast_opt, het_opt, + atom_opt, mol_opt, dir_opt, byresi_opt, chain1_list); + else if (mm_opt==5 || mm_opt==6) SOIalign(xname, yname, fname_super, fname_lign, + fname_matrix, sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, + a_opt, u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, + split_opt, outfmt_opt, fast_opt, cp_opt, mirror_opt, het_opt, + atom_opt, mol_opt, dir_opt, dir1_opt, dir2_opt, + chain1_list, chain2_list, se_opt, closeK_opt, mm_opt); + else if (mm_opt==7) flexalign(xname, yname, fname_super, fname_lign, + fname_matrix, sequence, Lnorm_ass, d0_scale, m_opt, i_opt, o_opt, + a_opt, u_opt, d_opt, TMcut, infmt1_opt, infmt2_opt, ter_opt, + split_opt, 
outfmt_opt, fast_opt, mirror_opt, het_opt, + atom_opt, mol_opt, dir_opt, dir1_opt, dir2_opt, byresi_opt, + chain1_list, chain2_list, hinge_opt); + else cerr<<"WARNING! -mm "<<mm_opt<<" not implemented"<<endl; + + /* clean up */ + vector<string>().swap(chain1_list); + vector<string>().swap(chain2_list); + vector<string>().swap(sequence); + + t2 = clock(); + float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC; + if (outfmt_opt<2) printf("#Total CPU time is %5.2f seconds\n", diff); + return 0; +} diff --git a/modules/bindings/src/tmalign/align.txt b/modules/bindings/src/USalign/align.txt similarity index 100% rename from modules/bindings/src/tmalign/align.txt rename to modules/bindings/src/USalign/align.txt diff --git a/modules/bindings/src/tmalign/basic_fun.h b/modules/bindings/src/USalign/basic_fun.h similarity index 78% rename from modules/bindings/src/tmalign/basic_fun.h rename to modules/bindings/src/USalign/basic_fun.h index 0e8ae307d..0fe070119 100644 --- a/modules/bindings/src/tmalign/basic_fun.h +++ b/modules/bindings/src/USalign/basic_fun.h @@ -137,6 +137,17 @@ void split(const string &line, vector<string> &line_vec, } } +/* strip white space at the begining or end of string */ +string Trim(const string &inputString) +{ + string result = inputString; + int idxBegin = inputString.find_first_not_of(" \n\r\t"); + int idxEnd = inputString.find_last_not_of(" \n\r\t"); + if (idxBegin >= 0 && idxEnd >= 0) + result = inputString.substr(idxBegin, idxEnd + 1 - idxBegin); + return result; +} + size_t get_PDB_lines(const string filename, vector<vector<string> >&PDB_lines, vector<string> &chainID_list, vector<int> &mol_vec, const int ter_opt, const int infmt_opt, @@ -152,11 +163,14 @@ size_t get_PDB_lines(const string filename, int compress_type=0; // uncompressed file ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else redi::ipstream fin_gz; // if file is compressed if (filename.size()>=3 && filename.substr(filename.size()-3,3)==".gz") { - 
fin_gz.open("zcat '"+filename+"'"); + fin_gz.open("gunzip -c '"+filename+"'"); compress_type=1; } else if (filename.size()>=4 && @@ -165,14 +179,20 @@ size_t get_PDB_lines(const string filename, fin_gz.open("bzcat '"+filename+"'"); compress_type=2; } - else fin.open(filename.c_str()); + else +#endif + { + if (filename=="-") compress_type=-1; + else fin.open(filename.c_str()); + } if (infmt_opt==0||infmt_opt==-1) // PDB format { - while (compress_type?fin_gz.good():fin.good()) + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) { - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); if (infmt_opt==-1 && line.compare(0,5,"loop_")==0) // PDBx/mmCIF return get_PDB_lines(filename,PDB_lines,chainID_list, mol_vec, ter_opt, 3, atom_opt, split_opt,het_opt); @@ -194,6 +214,13 @@ size_t get_PDB_lines(const string filename, select_atom=(line.compare(12,4," C3'")==0); else select_atom=(line.compare(12,4," CA ")==0); } + else if (atom_opt=="PC4'") + { + if (line[17]==' ' && (line[18]=='D'||line[18]==' ')) + select_atom=(line.compare(12,4," P ")==0 + )||(line.compare(12,4," C4'")==0); + else select_atom=(line.compare(12,4," CA ")==0); + } else select_atom=(line.compare(12,4,atom_opt)==0); if (select_atom) { @@ -246,7 +273,7 @@ size_t get_PDB_lines(const string filename, mol_vec.push_back(0); } - if (resi==line.substr(22,5)) + if (resi==line.substr(22,5) && atom_opt!="PC4'") cerr<<"Warning! 
Duplicated residue "<<resi<<endl; resi=line.substr(22,5); // including insertion code @@ -263,13 +290,26 @@ size_t get_PDB_lines(const string filename, size_t L=0; float x,y,z; stringstream i8_stream; - while (compress_type?fin_gz.good():fin.good()) + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) { - if (compress_type) fin_gz>>L>>x>>y>>z; - else fin >>L>>x>>y>>z; - if (compress_type) getline(fin_gz, line); - else getline(fin, line); - if (!(compress_type?fin_gz.good():fin.good())) break; + if (compress_type==-1) + { + cin>>L>>x>>y>>z; + getline(cin, line); + if (!cin.good()) break; + } + else if (compress_type) + { + fin_gz>>L>>x>>y>>z; + getline(fin_gz, line); + if (!fin_gz.good()) break; + } + else + { + fin >>L>>x>>y>>z; + getline(fin, line); + if (!fin.good()) break; + } model_idx++; stringstream i8_stream; i8_stream << ':' << model_idx; @@ -278,8 +318,9 @@ size_t get_PDB_lines(const string filename, mol_vec.push_back(0); for (i=0;i<L;i++) { - if (compress_type) fin_gz>>x>>y>>z; - else fin >>x>>y>>z; + if (compress_type==-1) cin>>x>>y>>z; + else if (compress_type) fin_gz>>x>>y>>z; + else fin >>x>>y>>z; i8_stream<<"ATOM "<<setw(4)<<i+1<<" CA UNK "<<setw(4) <<i+1<<" "<<setiosflags(ios::fixed)<<setprecision(3) <<setw(8)<<x<<setw(8)<<y<<setw(8)<<z; @@ -287,31 +328,35 @@ size_t get_PDB_lines(const string filename, i8_stream.str(string()); PDB_lines.back().push_back(line); } - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); } } else if (infmt_opt==2) // xyz format { size_t L=0; stringstream i8_stream; - while (compress_type?fin_gz.good():fin.good()) + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) { - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); 
+ else getline(fin, line); L=atoi(line.c_str()); - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); for (i=0;i<line.size();i++) if (line[i]==' '||line[i]=='\t') break; - if (!(compress_type?fin_gz.good():fin.good())) break; + if (!((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good()))) break; chainID_list.push_back(':'+line.substr(0,i)); PDB_lines.push_back(tmp_str_vec); mol_vec.push_back(0); for (i=0;i<L;i++) { - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); i8_stream<<"ATOM "<<setw(4)<<i+1<<" CA " <<AAmap(line[0])<<" "<<setw(4)<<i+1<<" " <<line.substr(2,8)<<line.substr(11,8)<<line.substr(20,8); @@ -339,18 +384,24 @@ size_t get_PDB_lines(const string filename, string prev_resi=""; string model_index=""; // the same as model_idx but type is string stringstream i8_stream; - while (compress_type?fin_gz.good():fin.good()) + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) { - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); if (line.size()==0) continue; - if (loop_) loop_ = line.compare(0,2,"# "); + if (loop_) loop_ = (line.size()>=2)?(line.compare(0,2,"# ")):(line.compare(0,1,"#")); if (!loop_) { if (line.compare(0,5,"loop_")) continue; while(1) { - if (compress_type) + if (compress_type==-1) + { + if (cin.good()) getline(cin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of -"); + } + else if (compress_type) { if (fin_gz.good()) getline(fin_gz, line); else PrintErrorAndQuit("ERROR! 
Unexpected end of "+filename); @@ -367,15 +418,16 @@ size_t get_PDB_lines(const string filename, loop_=true; _atom_site.clear(); atom_site_pos=0; - _atom_site[line.substr(11,line.size()-12)]=atom_site_pos; + _atom_site[Trim(line.substr(11))]=atom_site_pos; while(1) { - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); if (line.size()==0) continue; if (line.compare(0,11,"_atom_site.")) break; - _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; + _atom_site[Trim(line.substr(11))]=++atom_site_pos; } @@ -431,6 +483,13 @@ size_t get_PDB_lines(const string filename, select_atom=(atom==" C3'"); else select_atom=(atom==" CA "); } + else if (atom_opt=="PC4'") + { + if (line[17]==' ' && (line[18]=='D'||line[18]==' ')) + select_atom=(line.compare(12,4," P ")==0 + )||(line.compare(12,4," C4'")==0); + else select_atom=(line.compare(12,4," CA ")==0); + } else select_atom=(atom==atom_opt); if (!select_atom) continue; @@ -493,7 +552,7 @@ size_t get_PDB_lines(const string filename, resi+=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; else resi+=" "; - if (prev_resi==resi) + if (prev_resi==resi && atom_opt!="PC4'") cerr<<"Warning! 
Duplicated residue "<<resi<<endl; prev_resi=resi; @@ -514,8 +573,8 @@ size_t get_PDB_lines(const string filename, AA.clear(); } - if (compress_type) fin_gz.close(); - else fin.close(); + if (compress_type>=1) fin_gz.close(); + else if (compress_type==0) fin.close(); line.clear(); if (!split_opt) chainID_list.push_back(""); return PDB_lines.size(); @@ -537,11 +596,14 @@ size_t get_FASTA_lines(const string filename, int compress_type=0; // uncompressed file ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else redi::ipstream fin_gz; // if file is compressed if (filename.size()>=3 && filename.substr(filename.size()-3,3)==".gz") { - fin_gz.open("zcat '"+filename+"'"); + fin_gz.open("gunzip -c '"+filename+"'"); compress_type=1; } else if (filename.size()>=4 && @@ -550,12 +612,19 @@ size_t get_FASTA_lines(const string filename, fin_gz.open("bzcat '"+filename+"'"); compress_type=2; } - else fin.open(filename.c_str()); + else +#endif + { + if (filename=="-") compress_type=-1; + else fin.open(filename.c_str()); + } - while (compress_type?fin_gz.good():fin.good()) + while ((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) { - if (compress_type) getline(fin_gz, line); - else getline(fin, line); + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0 || line[0]=='#') continue; if (line[0]=='>') @@ -584,132 +653,11 @@ size_t get_FASTA_lines(const string filename, } line.clear(); - if (compress_type) fin_gz.close(); - else fin.close(); + if (compress_type>=1) fin_gz.close(); + else if (compress_type==0) fin.close(); return FASTA_lines.size(); } - -/* extract pairwise sequence alignment from residue index vectors, - * assuming that "sequence" contains two empty strings. - * return length of alignment, including gap. 
*/ -int extract_aln_from_resi(vector<string> &sequence, char *seqx, char *seqy, - const vector<string> resi_vec1, const vector<string> resi_vec2, - const int byresi_opt) -{ - sequence.clear(); - sequence.push_back(""); - sequence.push_back(""); - - int i1=0; // positions in resi_vec1 - int i2=0; // positions in resi_vec2 - int xlen=resi_vec1.size(); - int ylen=resi_vec2.size(); - map<string,string> chainID_map1; - map<string,string> chainID_map2; - if (byresi_opt==3) - { - vector<string> chainID_vec; - string chainID; - stringstream ss; - int i; - for (i=0;i<xlen;i++) - { - chainID=resi_vec1[i].substr(5); - if (!chainID_vec.size()|| chainID_vec.back()!=chainID) - { - chainID_vec.push_back(chainID); - ss<<chainID_vec.size(); - chainID_map1[chainID]=ss.str(); - ss.str(""); - } - } - chainID_vec.clear(); - for (i=0;i<ylen;i++) - { - chainID=resi_vec2[i].substr(5); - if (!chainID_vec.size()|| chainID_vec.back()!=chainID) - { - chainID_vec.push_back(chainID); - ss<<chainID_vec.size(); - chainID_map2[chainID]=ss.str(); - ss.str(""); - } - } - vector<string>().swap(chainID_vec); - } - string chainID1=""; - string chainID2=""; - string chainID1_prev=""; - string chainID2_prev=""; - while(i1<xlen && i2<ylen) - { - if (byresi_opt==2) - { - chainID1=resi_vec1[i1].substr(5); - chainID2=resi_vec2[i2].substr(5); - } - else if (byresi_opt==3) - { - chainID1=chainID_map1[resi_vec1[i1].substr(5)]; - chainID2=chainID_map2[resi_vec2[i2].substr(5)]; - } - - if (chainID1==chainID2) - { - if (atoi(resi_vec1[i1].substr(0,4).c_str())< - atoi(resi_vec2[i2].substr(0,4).c_str())) - { - sequence[0]+=seqx[i1++]; - sequence[1]+='-'; - } - else if (atoi(resi_vec1[i1].substr(0,4).c_str())> - atoi(resi_vec2[i2].substr(0,4).c_str())) - { - sequence[0]+='-'; - sequence[1]+=seqy[i2++]; - } - else - { - sequence[0]+=seqx[i1++]; - sequence[1]+=seqy[i2++]; - } - chainID1_prev=chainID1; - chainID2_prev=chainID2; - } - else - { - if (chainID1_prev==chainID1 && chainID2_prev!=chainID2) - { - 
sequence[0]+=seqx[i1++]; - sequence[1]+='-'; - chainID1_prev=chainID1; - } - else if (chainID1_prev!=chainID1 && chainID2_prev==chainID2) - { - sequence[0]+='-'; - sequence[1]+=seqy[i2++]; - chainID2_prev=chainID2; - } - else - { - sequence[0]+=seqx[i1++]; - sequence[1]+=seqy[i2++]; - chainID1_prev=chainID1; - chainID2_prev=chainID2; - } - } - - } - map<string,string>().swap(chainID_map1); - map<string,string>().swap(chainID_map2); - chainID1.clear(); - chainID2.clear(); - chainID1_prev.clear(); - chainID2_prev.clear(); - return sequence[0].size(); -} - int read_PDB(const vector<string> &PDB_lines, double **a, char *seq, vector<string> &resi_vec, const int read_resi) { @@ -758,17 +706,6 @@ void do_rotation(double **x, double **x1, int len, double t[3], double u[3][3]) } } -/* strip white space at the begining or end of string */ -string Trim(const string &inputString) -{ - string result = inputString; - int idxBegin = inputString.find_first_not_of(" \n\r\t"); - int idxEnd = inputString.find_last_not_of(" \n\r\t"); - if (idxBegin >= 0 && idxEnd >= 0) - result = inputString.substr(idxBegin, idxEnd + 1 - idxBegin); - return result; -} - /* read user specified pairwise alignment from 'fname_lign' to 'sequence'. 
/* Print the usage message of cif2pdb to stdout and terminate the program
 * with EXIT_SUCCESS. Never returns. */
void print_help()
{
    std::cout <<
"Converting mmCIF file to PDB file(s)\n"
"\n"
"Usage: cif2pdb input.cif output.pdb\n"
"\n"
" -chain Specify auth chain ID to convert:\n"
" $ cif2pdb input.cif output.pdb -chain A\n"
"\n"
" -mol macromolecule type. default is all macromolecules.\n"
" 1: protein only\n"
" 2: RNA only\n"
" 4: DNA only\n"
"\n"
" -split Whether to split PDB file into multiple chains\n"
" 0: (default) do not split; output a single PDB\n"
" 1: output one PDB file per chain\n"
"\n"
" -het Whether to read residues marked as 'HETATM' in addition to 'ATOM '\n"
" 0: only 'ATOM ' residues\n"
" 1: (default) 'ATOM ' and 'HETATM' for MSE\n"
" 2: 'ATOM ' and all 'HETATM', excluding HOH\n"
" 3: 'ATOM ' and all 'HETATM', including HOH\n"
" If -het >=1, MSE will be converted to MET\n"
    <<std::endl;
    exit(EXIT_SUCCESS);
}

/* Print sErrorString followed by a newline to stdout and terminate the
 * program with exit status 1. Never returns. */
void PrintErrorAndQuit(const std::string sErrorString)
{
    std::cout << sErrorString << std::endl;
    exit(1);
}

/* strip white space (space, \n, \r, \t) at the beginning or end of string.
 * A string that is empty or consists only of white space yields an empty
 * string. */
std::string Trim(const std::string &inputString)
{
    static const char *const blanks = " \n\r\t";
    const std::string::size_type first = inputString.find_first_not_of(blanks);
    /* npos: there is no character to keep at all */
    if (first == std::string::npos) return std::string();
    const std::string::size_type last = inputString.find_last_not_of(blanks);
    return inputString.substr(first, last + 1 - first);
}

/* split a long string into vectors by whitespace
 * line      - input string
 * line_vec  - output vector; the words are appended, existing entries
 *             are kept
 * delimiter - delimiter; a run of consecutive delimiters separates two
 *             words only once, so no empty word is ever produced */
void split(const std::string &line, std::vector<std::string> &line_vec,
    const char delimiter=' ')
{
    bool in_word = false;
    for (std::string::size_type pos=0;pos<line.size();pos++)
    {
        const char ch = line[pos];
        if (ch==delimiter)
        {
            in_word = false;
        }
        else if (in_word)
        {
            line_vec.back()+=ch;
        }
        else
        {
            /* first character after a delimiter opens a new word */
            in_word = true;
            line_vec.push_back(std::string(1,ch));
        }
    }
}

/* write the "REMARK cif2pdb" line of one chain: the chain character found
 * at column 22 of its first atom line, followed by the mmCIF chain ID */
static void write_chain_remark(std::ostream &out,
    const std::vector<std::string> &chain_lines, const std::string &chainID)
{
    out<<"REMARK cif2pdb "<<chain_lines[0][21]<<" "<<chainID<<std::endl;
}

/* write all atom lines of one chain as they are, followed by TER */
static void write_chain_atoms(std::ostream &out,
    const std::vector<std::string> &chain_lines)
{
    for (size_t r=0;r<chain_lines.size();r++) out<<chain_lines[r];
    out<<"TER"<<std::endl;
}

/* Write the converted chains in PDB format.
 * filename     - output file, or "-" for stdout. With split_opt it is the
 *                prefix of the per chain files <filename><chainID>.pdb
 * PDB_lines    - atom lines of every chain; empty chains are skipped
 * chainID_list - mmCIF chain ID of every chain, parallel to PDB_lines
 * split_opt    - non-zero: one output (REMARK, atoms, TER) per chain and
 *                the name of every written file is printed to stdout.
 *                zero: a single output with all REMARK lines first, then
 *                the chains, then END.
 * A file that cannot be opened is reported on stderr and skipped. */
void write_mmcif_to_pdb(const std::string filename,
    const std::vector<std::vector<std::string> >&PDB_lines,
    const std::vector<std::string> &chainID_list, const int split_opt)
{
    const bool to_stdout=(filename=="-");
    size_t c;

    if (split_opt)
    {
        for (c=0;c<PDB_lines.size();c++)
        {
            if (PDB_lines[c].size()==0) continue;
            if (to_stdout)
            {
                write_chain_remark(std::cout,PDB_lines[c],chainID_list[c]);
                write_chain_atoms(std::cout,PDB_lines[c]);
                continue;
            }
            const std::string chain_file=filename+Trim(chainID_list[c])+".pdb";
            std::cout<<chain_file<<std::endl;
            std::ofstream fout(chain_file.c_str());
            if (!fout)
            {
                std::cerr<<"ERROR! Cannot write to "<<chain_file<<std::endl;
                continue;
            }
            write_chain_remark(fout,PDB_lines[c],chainID_list[c]);
            write_chain_atoms(fout,PDB_lines[c]);
            fout.close();
        }
        return;
    }

    std::ofstream fout;
    if (!to_stdout)
    {
        fout.open(filename.c_str());
        if (!fout)
        {
            std::cerr<<"ERROR! Cannot write to "<<filename<<std::endl;
            return;
        }
    }
    std::ostream &out=to_stdout?
        static_cast<std::ostream&>(std::cout):
        static_cast<std::ostream&>(fout);

    for (c=0;c<PDB_lines.size();c++)
        if (PDB_lines[c].size())
            write_chain_remark(out,PDB_lines[c],chainID_list[c]);
    for (c=0;c<PDB_lines.size();c++)
        if (PDB_lines[c].size())
            write_chain_atoms(out,PDB_lines[c]);
    out<<"END"<<std::endl;
    if (!to_stdout) fout.close();
}

/* Try to reserve chain character ch.
 * Returns false only if ch is a character of alphabet that is already
 * taken. A free character is marked as taken; a character outside of
 * alphabet is accepted without being tracked. */
static bool claim_chainID(const char ch, const std::string &alphabet,
    std::vector<bool> &taken)
{
    const std::string::size_type pos=alphabet.find(ch);
    if (pos==std::string::npos) return true;
    if (taken[pos]) return false;
    taken[pos]=true;
    return true;
}

/* Make the one character PDB chain IDs (column 22 of the atom lines)
 * unique among all non-empty chains.
 * PDB_lines    - atom lines of every chain, rewritten in place
 * chainID_list - mmCIF chain ID of every chain, parallel to PDB_lines
 *                (at least one entry per chain is required)
 * Chains whose mmCIF chain ID equals their PDB chain character get the
 * first choice, then the remaining chains in order. A chain whose
 * character is already taken is moved to the first free character of
 * A-Z, a-z, 0-9; if none is left the chain is dropped with a warning.
 * Returns the number of chains whose chain ID was changed. */
size_t resolve_chainID_for_mmcif(std::vector<std::vector<std::string> >&PDB_lines,
    const std::vector<std::string> &chainID_list)
{
    const std::string alphabet=
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    std::vector<bool> taken(alphabet.size(),false);
    /* indexed by chain, hence sized by the number of chains */
    std::vector<bool> accepted(PDB_lines.size(),false);
    size_t changed_chains=0;
    size_t c,r;

    /* accept all single character chain ID */
    for (c=0;c<PDB_lines.size();c++)
    {
        if (PDB_lines[c].size()==0) continue;
        const char ch=PDB_lines[c][0][21];
        if (std::string(1,ch)!=chainID_list[c]) continue;
        accepted[c]=claim_chainID(ch,alphabet,taken);
    }

    /* accept all remaining non-conflicting chain ID */
    for (c=0;c<PDB_lines.size();c++)
    {
        if (PDB_lines[c].size()==0 || accepted[c]) continue;
        accepted[c]=claim_chainID(PDB_lines[c][0][21],alphabet,taken);
    }

    /* resolve remaining chain ID */
    for (c=0;c<PDB_lines.size();c++)
    {
        if (PDB_lines[c].size()==0 || accepted[c]) continue;
        std::string::size_type free_pos=0;
        while (free_pos<taken.size() && taken[free_pos]) free_pos++;
        if (free_pos==taken.size())
        {
            std::cerr<<"WARNING! Cannot parse "<<chainID_list[c]<<" with "
                <<PDB_lines[c].size()<<" atoms due to chain ID conflict. "
                <<"Please consider -split 1"<<std::endl;
            std::vector<std::string>().swap(PDB_lines[c]);
            continue;
        }
        taken[free_pos]=true;
        for (r=0;r<PDB_lines[c].size();r++) PDB_lines[c][r]=
            PDB_lines[c][r].substr(0,21)+alphabet[free_pos]+
            PDB_lines[c][r].substr(22);
        changed_chains++;
    }
    if (changed_chains)
        std::cerr<<"WARNING! Changed "<<changed_chains<<" chain ID(s)"<<std::endl;
    return changed_chains;
}
((compress_type==-1)?cin.good():(compress_type?fin_gz.good():fin.good())) + { + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (loop_) loop_ = (line.size()>=2)?(line.compare(0,2,"# ")):(line.compare(0,1,"#")); + if (!loop_) + { + if (line.compare(0,5,"loop_")) continue; + while(1) + { + if (compress_type==-1) + { + if (cin.good()) getline(cin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + else if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+filename); + } + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + + loop_=true; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[Trim(line.substr(11))]=atom_site_pos; + + while(1) + { + if (compress_type==-1) getline(cin, line); + else if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[Trim(line.substr(11))]=++atom_site_pos; + } + + if (_atom_site.count("group_PDB")* + _atom_site.count("label_atom_id")* + _atom_site.count("label_comp_id")* + (_atom_site.count("auth_asym_id")+ + _atom_site.count("label_asym_id"))* + (_atom_site.count("auth_seq_id")+ + _atom_site.count("label_seq_id"))* + _atom_site.count("Cartn_x")* + _atom_site.count("Cartn_y")* + _atom_site.count("Cartn_z")==0) + { + loop_ = false; + cerr<<"Warning! 
Missing one of the following _atom_site data items: group_PDB, label_atom_id, label_comp_id, auth_asym_id/label_asym_id, auth_seq_id/label_seq_id, Cartn_x, Cartn_y, Cartn_z"<<endl; + continue; + } + } + + line_vec.clear(); + split(line,line_vec); + atom =line_vec[_atom_site["label_atom_id"]]; + resn =line_vec[_atom_site["label_comp_id"]]; + group_PDB=line_vec[_atom_site["group_PDB"]]; + if (group_PDB=="ATOM") group_PDB="ATOM "; + if (mse_opt && resn=="MSE") + { + group_PDB="ATOM "; + if (atom=="SE") atom="SD"; + } + if (group_PDB=="HETATM") + { + if (!lig_opt) continue; + if (!hoh_opt && resn=="HOH") continue; + if (asym_id!=prev_asym_id && prev_asym_id.size()) + asym_id=prev_asym_id; // no separate chain for ligand + } + else if (group_PDB!="ATOM ") continue; + + alt_id="."; + if (_atom_site.count("label_alt_id")) // in 39.4 % of entries + alt_id=line_vec[_atom_site["label_alt_id"]]; + if (alt_id!="." && alt_id!="A") continue; + + if (resn.size()==1) + { + if (!rna_opt && group_PDB=="ATOM ") continue; + resn=" "+resn; + } + else if (resn.size()==2) + { + if (!dna_opt && resn[0]=='D' && group_PDB=="ATOM ") continue; + resn=" " +resn; + } + else if (resn.size()==3 && !protein_opt && group_PDB=="ATOM ") continue; + else if (resn.size()>=4) resn=resn.substr(0,3); + + if (atom[0]=='"') atom=atom.substr(1); + if (atom.size() && atom[atom.size()-1]=='"') + atom=atom.substr(0,atom.size()-1); + if (atom.size()==0) continue; + else if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; // wrong for sidechain H + else if (atom.size()==3) atom=" "+atom; + else if (atom.size()>=5) atom=atom.substr(0,4); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + resi+=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + else resi+=" "; + if (resi.size()>5) + { + cerr<<"WARNING! 
Cannot parse line due to long residue index\n"<<line<<endl; + continue; + } + + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + if (chain_opt.size() && asym_id!=chain_opt && + !(asym_id==" " && (chain_opt=="_" || chain_opt=="."))) continue; + + if (_atom_site.count("pdbx_PDB_model_num") && + model_index!=line_vec[_atom_site["pdbx_PDB_model_num"]]) + { + if (PDB_lines.size()) break; + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + } + + if (prev_asym_id!=asym_id) + { + PDB_lines.push_back(tmp_str_vec); + chainID_list.push_back(asym_id); + prev_asym_id=asym_id; + } + + a++; + a%=100000; + i8_stream<<group_PDB + <<setw(5)<<a<<" "<<atom<<" "<<resn<<" "<<asym_id[asym_id.size()-1] + <<setw(5)<<resi<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]].substr(0,8) + <<setw(8)<<line_vec[_atom_site["Cartn_y"]].substr(0,8) + <<setw(8)<<line_vec[_atom_site["Cartn_z"]].substr(0,8); + if (_atom_site.count("B_iso_or_equiv")) + { + i8_stream<<" 1.00"<<setw(6)<<line_vec[_atom_site["B_iso_or_equiv"]].substr(0,6); + if (_atom_site.count("type_symbol")) + i8_stream<<setw(12)<<line_vec[_atom_site["type_symbol"]].substr(0,12); + } + i8_stream<<endl; + PDB_lines.back().push_back(i8_stream.str()); + i8_stream.str(string()); + } + group_PDB.clear(); + _atom_site.clear(); + line_vec.clear(); + alt_id.clear(); + asym_id.clear(); + resn.clear(); + + if (compress_type>=0) + { + if (compress_type) fin_gz.close(); + else fin.close(); + } + line.clear(); + chainID_list.push_back(""); + return PDB_lines.size(); +} + + +int main(int argc, char *argv[]) +{ + if (argc < 2) print_help(); + + /**********************/ + /* get argument */ + /**********************/ + string xname = ""; + string yname = ""; + + int split_opt =0; // do not split chain + int het_opt =0; // do not read HETATM residues + int mol_opt =7; // auto-detect the molecule type as protein/RNA + string 
chain_opt =""; // read all chains + + for(int i = 1; i < argc; i++) + { + if ( !strcmp(argv[i],"-split") && i < (argc-1) ) + { + split_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-mol") && i < (argc-1) ) + { + mol_opt=atoi(argv[i + 1]); i++; + } + else if ( !strcmp(argv[i],"-chain") && i < (argc-1) ) + { + chain_opt=argv[i + 1]; i++; + } + else if ( !strcmp(argv[i],"-het") && i < (argc-1) ) + { + het_opt=atoi(argv[i + 1]); i++; + } + else if (xname.size() == 0) xname=argv[i]; + else if (yname.size() == 0) yname=argv[i]; + else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]); + } + + if(yname.size()==0) + { + if (xname.size()==0) + PrintErrorAndQuit("Please provide input structures"); + else if (yname.size()==0) yname="-"; + } + + bool dna_opt=(mol_opt>=4); + mol_opt %= 4; + bool rna_opt=(mol_opt>=2); + mol_opt %= 2; + bool protein_opt=(mol_opt>=1); + + if (split_opt<0 || split_opt>1) + PrintErrorAndQuit("-split can only be 0 or 1"); + if (het_opt<0 || het_opt>3) + PrintErrorAndQuit("-het can only be 0, 1, 2, or 3"); + + bool hoh_opt=(het_opt==3); + bool lig_opt=(het_opt>=2); + bool mse_opt=(het_opt>=1); + + /* parse structure */ + vector<vector<string> >PDB_lines; + vector<string> chainID_list; + get_all_mmcif_lines(xname, chain_opt, PDB_lines, chainID_list, + dna_opt, rna_opt, protein_opt, hoh_opt, lig_opt, mse_opt); + if (!split_opt) resolve_chainID_for_mmcif(PDB_lines,chainID_list); + write_mmcif_to_pdb(yname, PDB_lines, chainID_list, split_opt); + + /* clean up */ + vector<vector<string> >().swap(PDB_lines); + vector<string>().swap(chainID_list); + chain_opt.clear(); + return 0; +} diff --git a/modules/bindings/src/USalign/flexalign.h b/modules/bindings/src/USalign/flexalign.h new file mode 100644 index 000000000..f982fa921 --- /dev/null +++ b/modules/bindings/src/USalign/flexalign.h @@ -0,0 +1,1826 @@ +/* Functions for the core TMalign algorithm, including the entry function + * flexalign_main */ +#ifndef flexalign_h +#define 
flexalign_h 1 + +#include "TMalign.h" + +void t_u2tu(double t0[3],double u0[3][3], vector<double> &tu_tmp) +{ + int i,j,k; + for (i=0;i<3;i++) tu_tmp[i]=t0[i]; + k=3; + for (i=0;i<3;i++) for (j=0;j<3;j++) + { + tu_tmp[k]=u0[i][j]; + k++; + } +} + +void tu2t_u(vector<double> tu_tmp, double t0[3],double u0[3][3]) +{ + int i,j,k; + for (i=0;i<3;i++) t0[i]=tu_tmp[i]; + k=3; + for (i=0;i<3;i++) for (j=0;j<3;j++) + { + u0[i][j]=tu_tmp[k]; + k++; + } +} + +void aln2invmap(const string &seqxA, const string &seqyA, int *invmap) +{ + int i,j,r; + int ylen=0; + for (r=0;r<seqyA.size();r++) ylen+=seqyA[r]!='-'; + for(j=0; j<ylen; j++) invmap[j]=-1; + + i=j=-1; + for (r=0;r<seqxA.size();r++) + { + i+=seqxA[r]!='-'; + j+=seqyA[r]!='-'; + if (seqxA[r]!='-' && seqyA[r]!='-') invmap[j]=i; + } +} + +int flexalign_main(double **xa, double **ya, + const char *seqx, const char *seqy, const char *secx, const char *secy, + double t0[3], double u0[3][3], vector<vector<double> >&tu_vec, + double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, + double &d0_0, double &TM_0, + double &d0A, double &d0B, double &d0u, double &d0a, double &d0_out, + string &seqM, string &seqxA, string &seqyA, + double &rmsd0, int &L_ali, double &Liden, + double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, + const int xlen, const int ylen, + const vector<string> sequence, const double Lnorm_ass, + const double d0_scale, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const bool fast_opt, + const int mol_type, const int hinge_opt) +{ + vector<double> tu_tmp(12,0); + int round2=tu_vec.size(); + if (round2==0) + { + TMalign_main(xa, ya, seqx, seqy, secx, secy, t0, u0, + TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, + d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, + d0_scale, i_opt, a_opt, u_opt, d_opt, fast_opt, mol_type); + + t_u2tu(t0,u0,tu_tmp); + tu_vec.push_back(tu_tmp); + } + + int i,j,r; 
+ int* invmap=new int[ylen+1]; + for (j=0;j<ylen+1;j++) invmap[j]=-1; + double **xt; + NewArray(&xt, xlen, 3); + do_rotation(xa, xt, xlen, t0, u0); + + TM1= TM2= TM3= TM4= TM5=rmsd0=0; + seqM=""; + seqxA=""; + seqyA=""; + n_ali=n_ali8=0; + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, + d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, + a_opt, u_opt, d_opt, mol_type, 0, invmap, 1); + if (round2) + { + /* aligned structure A vs unaligned structure B */ + int xlen_h=n_ali8; + int ylen_h=ylen - n_ali8; + char *seqx_h = new char[xlen + 1]; + char *seqy_h = new char[ylen + 1]; + char *secx_h = new char[xlen + 1]; + char *secy_h = new char[ylen + 1]; + seqx_h[xlen]=seqy_h[ylen]=0; + secx_h[xlen]=secy_h[ylen]=0; + double **xa_h, **ya_h; + NewArray(&xa_h, xlen, 3); + NewArray(&ya_h, ylen, 3); + + int r1,r2; + i=j=-1; + r1=r2=0; + for (r=0;r<seqxA.size();r++) + { + i+=(seqxA[r]!='-'); + j+=(seqyA[r]!='-'); + if (seqxA[r]!='-' && seqyA[r]!='-') + { + seqx_h[r1]=seqx[i]; + secx_h[r1]=secx[i]; + xa_h[r1][0]=xa[i][0]; + xa_h[r1][1]=xa[i][1]; + xa_h[r1][2]=xa[i][2]; + r1++; + } + if (seqxA[r]=='-') + { + seqy_h[r2]=seqx[j]; + secy_h[r2]=secx[j]; + ya_h[r2][0]=ya[j][0]; + ya_h[r2][1]=ya[j][1]; + ya_h[r2][2]=ya[j][2]; + r2++; + } + } + + double TM1_h, TM2_h; + double TM3_h, TM4_h, TM5_h; // for a_opt, u_opt, d_opt + double d0_0_h, TM_0_h; + double d0A_h, d0B_h, d0u_h, d0a_h; + double d0_out_h=5.0; + string seqM_h, seqxA_h, seqyA_h;// for output alignment + double rmsd0_h = 0.0; + int L_ali_h=0; // Aligned length in standard_TMscore + double Liden_h=0; + double TM_ali_h, rmsd_ali_h; // TMscore and rmsd in standard_TMscore + int n_ali_h=0; + int n_ali8_h=0; + + TMalign_main(xa_h, ya_h, seqx_h, seqy_h, secx_h, secy_h, t0, u0, + TM1_h, TM2_h, TM3_h, TM4_h, TM5_h, d0_0_h, TM_0_h, + d0A_h, d0B_h, d0u_h, d0a_h, d0_out_h, seqM_h, seqxA_h, seqyA_h, + rmsd0_h, 
L_ali_h, Liden_h, TM_ali_h, rmsd_ali_h, n_ali_h, n_ali8_h, + xlen_h, ylen_h, sequence, Lnorm_ass, + d0_scale, i_opt, a_opt, u_opt, d_opt, fast_opt, mol_type); + + do_rotation(xa, xt, xlen, t0, u0); + t_u2tu(t0,u0,tu_vec[0]); + + int* invmap_h=new int[ylen+1]; + for (j=0;j<ylen+1;j++) invmap_h[j]=-1; + TM1_h= TM2_h= TM3_h= TM4_h= TM5_h=rmsd0_h=0; + seqM_h=""; + seqxA_h=""; + seqyA_h=""; + n_ali_h=n_ali8_h=0; + se_main(xt, ya, seqx, seqy, TM1_h, TM2_h, TM3_h, TM4_h, TM5_h, d0_0, + TM_0, d0A, d0B, d0u, d0a, d0_out, seqM_h, seqxA_h, seqyA_h, + rmsd0_h, L_ali, Liden, TM_ali, rmsd_ali, n_ali_h, n_ali8_h, + xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, + a_opt, u_opt, d_opt, mol_type, 0, invmap_h, 1); + + /* unaligned structure A vs aligned structure B */ + xlen_h=xlen - n_ali8; + ylen_h=n_ali8; + + i=j=-1; + r1=r2=0; + for (r=0;r<seqxA.size();r++) + { + i+=(seqxA[r]!='-'); + j+=(seqyA[r]!='-'); + if (seqyA[r]=='-') + { + seqx_h[r1]=seqx[i]; + secx_h[r1]=secx[i]; + xa_h[r1][0]=xa[i][0]; + xa_h[r1][1]=xa[i][1]; + xa_h[r1][2]=xa[i][2]; + r1++; + } + if (seqxA[r]!='-' && seqyA[r]!='-') + { + seqy_h[r2]=seqx[j]; + secy_h[r2]=secx[j]; + ya_h[r2][0]=ya[j][0]; + ya_h[r2][1]=ya[j][1]; + ya_h[r2][2]=ya[j][2]; + r2++; + } + } + + d0_out_h=5.0; + L_ali_h=Liden_h=0; + TM1= TM2= TM3= TM4= TM5=rmsd0=0; + seqM=""; + seqxA=""; + seqyA=""; + n_ali=n_ali8=0; + + TMalign_main(xa_h, ya_h, seqx_h, seqy_h, secx_h, secy_h, t0, u0, + TM1, TM2, TM3, TM4, TM5, d0_0_h, TM_0_h, + d0A_h, d0B_h, d0u_h, d0a_h, d0_out_h, seqM, seqxA, seqyA, + rmsd0, L_ali_h, Liden_h, TM_ali_h, rmsd_ali_h, n_ali, n_ali8, + xlen_h, ylen_h, sequence, Lnorm_ass, + d0_scale, i_opt, a_opt, u_opt, d_opt, fast_opt, mol_type); + + do_rotation(xa, xt, xlen, t0, u0); + + for (j=0;j<ylen+1;j++) invmap[j]=-1; + TM1= TM2= TM3= TM4= TM5=rmsd0=0; + seqM=""; + seqxA=""; + seqyA=""; + n_ali=n_ali8=0; + se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, d0_0, + TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, + rmsd0, L_ali, 
Liden, TM_ali, rmsd_ali, n_ali, n_ali8, + xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, + a_opt, u_opt, d_opt, mol_type, 0, invmap, 1); + + double TM_h=(TM1_h>TM2_h)?TM1_h:TM2_h; + double TM =(TM1 >TM2 )?TM1 :TM2 ; + if (TM_h>TM) + { + TM1=TM1_h; + TM2=TM2_h; + TM3=TM3_h; + TM4=TM4_h; + TM5=TM5_h; + seqM=seqM_h; + seqxA=seqxA_h; + seqyA=seqyA_h; + rmsd0=rmsd0_h; + n_ali=n_ali_h; + n_ali8=n_ali8_h; + for (j=0;j<ylen+1;j++) invmap[j]=invmap_h[j]; + } + else t_u2tu(t0,u0,tu_vec[0]); + + /* clean up */ + delete [] invmap_h; + DeleteArray(&xa_h, xlen); + DeleteArray(&ya_h, ylen); + seqM_h.clear(); + seqxA_h.clear(); + seqyA_h.clear(); + delete [] seqx_h; + delete [] secx_h; + delete [] seqy_h; + delete [] secy_h; + } + for (r=0;r<seqM.size();r++) if (seqM[r]=='1') seqM[r]='0'; + + int minlen = min(xlen, ylen); + int hinge; + for (hinge=0;hinge<hinge_opt;hinge++) + { + if (minlen-n_ali8<5) break; + int xlen_h=xlen - n_ali8; + int ylen_h=ylen - n_ali8; + char *seqx_h = new char[xlen_h + 1]; + char *seqy_h = new char[ylen_h + 1]; + char *secx_h = new char[xlen_h + 1]; + char *secy_h = new char[ylen_h + 1]; + seqx_h[xlen_h]=seqy_h[ylen_h]=0; + secx_h[xlen_h]=secy_h[ylen_h]=0; + double **xa_h, **ya_h; + NewArray(&xa_h, xlen_h, 3); + NewArray(&ya_h, ylen_h, 3); + vector<int> r1toi(xlen_h,0); + vector<int> r2toj(ylen_h,0); + + int r1,r2; + i=j=-1; + r1=r2=0; + for (r=0;r<seqxA.size();r++) + { + i+=(seqxA[r]!='-'); + j+=(seqyA[r]!='-'); + if (seqyA[r]=='-') + { + seqx_h[r1]=seqx[i]; + secx_h[r1]=secx[i]; + xa_h[r1][0]=xa[i][0]; + xa_h[r1][1]=xa[i][1]; + xa_h[r1][2]=xa[i][2]; + r1toi[r1]=i; + r1++; + } + if (seqxA[r]=='-') + { + seqy_h[r2]=seqx[j]; + secy_h[r2]=secx[j]; + ya_h[r2][0]=ya[j][0]; + ya_h[r2][1]=ya[j][1]; + ya_h[r2][2]=ya[j][2]; + r2toj[r2]=j; + r2++; + } + } + + double TM1_h, TM2_h; + double TM3_h, TM4_h, TM5_h; // for a_opt, u_opt, d_opt + double d0_0_h, TM_0_h; + double d0A_h, d0B_h, d0u_h, d0a_h; + double d0_out_h=5.0; + string seqM_h, seqxA_h, seqyA_h;// for 
output alignment + double rmsd0_h = 0.0; + int L_ali_h=0; // Aligned length in standard_TMscore + double Liden_h=0; + double TM_ali_h, rmsd_ali_h; // TMscore and rmsd in standard_TMscore + int n_ali_h=0; + int n_ali8_h=0; + + TMalign_main(xa_h, ya_h, seqx_h, seqy_h, secx_h, secy_h, t0, u0, + TM1_h, TM2_h, TM3_h, TM4_h, TM5_h, d0_0_h, TM_0_h, + d0A_h, d0B_h, d0u_h, d0a_h, d0_out_h, seqM_h, seqxA_h, seqyA_h, + rmsd0_h, L_ali_h, Liden_h, TM_ali_h, rmsd_ali_h, n_ali_h, n_ali8_h, + xlen_h, ylen_h, sequence, Lnorm_ass, + d0_scale, i_opt, a_opt, u_opt, d_opt, fast_opt, mol_type); + + do_rotation(xa, xt, xlen, t0, u0); + + TM1_h=TM1; + TM2_h=TM2; + TM3_h=TM3; + TM4_h=TM4; + TM5_h=TM5; + seqM_h=seqM; + seqxA_h=seqxA; + seqyA_h=seqyA; + rmsd0_h=rmsd0; + n_ali_h=n_ali; + n_ali8_h=n_ali8; + int* invmap_h=new int[ylen+1]; + for (j=0;j<ylen+1;j++) invmap_h[j]=invmap[j]; + se_main(xt, ya, seqx, seqy, TM1_h, TM2_h, TM3_h, TM4_h, TM5_h, d0_0, TM_0, + d0A, d0B, d0u, d0a, d0_out, seqM_h, seqxA_h, seqyA_h, + rmsd0_h, L_ali, Liden, TM_ali, rmsd_ali, n_ali_h, n_ali8_h, + xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, + a_opt, u_opt, d_opt, mol_type, 0, invmap_h, hinge+1); + int new_ali=0; + for (r=0;r<seqM_h.size();r++) new_ali+=(seqM_h[r]==hinge+'1'); + if (n_ali8_h - n_ali8<5) new_ali=0; + if (new_ali>=5) + { + TM1=TM1_h; + TM2=TM2_h; + TM3=TM3_h; + TM4=TM4_h; + TM5=TM5_h; + seqM=seqM_h; + seqxA=seqxA_h; + seqyA=seqyA_h; + rmsd0=rmsd0_h; + n_ali=n_ali_h; + n_ali8=n_ali8_h; + t_u2tu(t0,u0,tu_tmp); + tu_vec.push_back(tu_tmp); + for (j=0;j<ylen+1;j++) invmap[j]=invmap_h[j]; + //cout<<">hinge="<<hinge<<'\n' + //<<seqxA<<'\n'<<seqM<<'\n'<<seqyA<<endl; + //for (j=0;j<ylen;j++) if ((i=invmap[j])>=0) cout<<"("<<i<<","<<j<<")"; + //cout<<endl; + } + + /* clean up */ + delete [] invmap_h; + DeleteArray(&xa_h, xlen_h); + DeleteArray(&ya_h, ylen_h); + r1toi.clear(); + r2toj.clear(); + seqM_h.clear(); + seqxA_h.clear(); + seqyA_h.clear(); + delete [] seqx_h; + delete [] secx_h; + delete [] 
seqy_h; + delete [] secy_h; + if (new_ali<5) break; + } + + if (tu_vec.size()<=1) + { + DeleteArray(&xt, xlen); + delete[] invmap; + return tu_vec.size(); + } + + /* re-derive alignment based on tu_vec */ + vector<char> seqM_char(ylen,' '); + vector<double> di_vec(ylen,-1); + double d; + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + tu2t_u(tu_vec[hinge],t0,u0); + do_rotation(xa, xt, xlen, t0, u0); + for (j=0;j<ylen;j++) + { + i=invmap[j]; + if (i<0) continue; + d=sqrt(dist(xt[i], ya[j])); + if (di_vec[j]<0 || d<=di_vec[j]) + { + di_vec[j]=d; + seqM_char[j]=hinge+'0'; + } + } + } + j=-1; + for (r=0;r<seqM.size();r++) + { + if (seqyA[r]=='-') continue; + j++; + seqM[r]=seqM_char[j]; + } + + /* smooth out AFP assignment: remove singleton insert */ + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + j=-1; + for (r=0;r<seqM.size();r++) + { + if (seqyA[r]=='-') continue; + j++; + if (seqM_char[j]!=hinge+'0') continue; + if (r<seqM.size()-1 && (seqM[r+1]==hinge+'0' || seqM[r+1]==' ')) + continue; + if (r>0 && (seqM[r-1]==hinge+'0' || seqM[r-1]==' ')) continue; + if (r<seqM.size()-1 && r>0 && seqM[r-1]!=seqM[r+1]) continue; + if (r>0) seqM[r]=seqM_char[j]=seqM[r-1]; + else seqM[r]=seqM_char[j]=seqM[r+1]; + } + } + /* smooth out AFP assignment: remove singleton at the end of fragment */ + char left_hinge=' '; + char right_hinge=' '; + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + j=-1; + for (r=0;r<seqM.size();r++) + { + if (seqyA[r]=='-') continue; + j++; + if (seqM[r]!=hinge+'0') continue; + if (r>0 && seqM[r-1]==' ' && r<seqM.size()-1 && seqM[r+1]==' ') + continue; + + left_hinge=' '; + for (i=r-1;i>=0;i--) + { + if (seqM[i]==' ') continue; + left_hinge=seqM[i]; + break; + } + if (left_hinge==hinge+'0') continue; + + right_hinge=' '; + for (i=r+1;i<seqM.size();i++) + { + if (seqM[i]==' ') continue; + right_hinge=seqM[i]; + break; + } + if (right_hinge==hinge+'0') continue; + if (left_hinge!=right_hinge && left_hinge!=' ' && right_hinge!=' ') + continue; + + if 
(right_hinge!=' ') seqM[r]=seqM_char[j]=right_hinge; + else if (left_hinge!=' ') seqM[r]=seqM_char[j]=left_hinge; + } + } + /* smooth out AFP assignment: remove dimer insert */ + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + j=-1; + for (r=0;r<seqM.size()-1;r++) + { + if (seqyA[r]=='-') continue; + j++; + if (seqM[r] !=hinge+'0'|| seqM[r+1]!=hinge+'0') continue; + + if (r<seqM.size()-2 && (seqM[r+2]==' ' || seqM[r+2]==hinge+'0')) + continue; + if (r>0 && (seqM[r-1]==' ' || seqM[r-1]==hinge+'0')) continue; + if (r<seqM.size()-2 && r>0 && seqM[r-1]!=seqM[r+2]) continue; + + if (r>0) seqM[r]=seqM_char[j]=seqM[r+1]=seqM_char[j+1]=seqM[r-1]; + else seqM[r]=seqM_char[j]=seqM[r+1]=seqM_char[j+1]=seqM[r+2]; + } + } + /* smooth out AFP assignment: remove disconnected singleton */ + int i1,i2; + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + j=-1; + for (r=0;r<seqM.size();r++) + { + if (seqyA[r]=='-') continue; + j++; + if (seqM[r]!=hinge+'0') continue; + + left_hinge=' '; + for (i=r-1;i>=0;i--) + { + if (seqM[i]==' ') continue; + left_hinge=seqM[i]; + i1=(r-i); + break; + } + if (left_hinge==hinge+'0') continue; + + right_hinge=' '; + for (i=r+1;i<seqM.size();i++) + { + if (seqM[i]==' ') continue; + right_hinge=seqM[i]; + i2=(i-r); + break; + } + if (right_hinge==hinge+'0') continue; + + if (right_hinge==' ') seqM[r]=seqM_char[j]=left_hinge; + else if (left_hinge==' ') seqM[r]=seqM_char[j]=right_hinge; + else + { + if (i1<i2) seqM[r]=seqM_char[j]=left_hinge; + else seqM[r]=seqM_char[j]=right_hinge; + } + } + } + + /* recalculate all scores */ + for (hinge=tu_vec.size()-1;hinge>=0;hinge--) + { + tu2t_u(tu_vec[hinge],t0,u0); + do_rotation(xa, xt, xlen, t0, u0); + for (j=0;j<ylen;j++) + { + i=invmap[j]; + if (i<0) continue; + if (seqM_char[j]!=hinge+'0') continue; + d=sqrt(dist(xt[i], ya[j])); + if (di_vec[j]<0 || d<=di_vec[j]) + { + di_vec[j]=d; + seqM_char[j]=hinge+'0'; + } + } + } + rmsd0=TM1=TM2=TM3=TM4=TM5=0; + Liden=0; + for (r=0;r<seqM.size();r++) if 
(seqM[r]!=' ') Liden+=seqxA[r]==seqyA[r]; + for(j=0; j<ylen; j++) + { + i=invmap[j]; + if(i<0) continue; + { + d=di_vec[j]; + TM2+=1/(1+(d/d0B)*(d/d0B)); // chain_1 + TM1+=1/(1+(d/d0A)*(d/d0A)); // chain_2 + if (a_opt) TM3+=1/(1+(d/d0a)*(d/d0a)); // -a + if (u_opt) TM4+=1/(1+(d/d0u)*(d/d0u)); // -u + if (d_opt) TM5+=1/(1+(d/d0_scale)*(d/d0_scale)); // -d + rmsd0+=d*d; + } + } + TM2/=xlen; + TM1/=ylen; + TM3/=(xlen+ylen)*0.5; + TM4/=Lnorm_ass; + TM5/=ylen; + if (n_ali8) rmsd0=sqrt(rmsd0/n_ali8); + for (hinge=tu_vec.size()-1;hinge>0;hinge--) + { + int afp_len=0; + for (r=0;r<seqM.size();r++) afp_len+=seqM[r]==hinge+'0'; + if (afp_len) break; + tu_vec.pop_back(); // remove unnecessary afp + } + + /* clean up */ + seqM_char.clear(); + di_vec.clear(); + DeleteArray(&xt, xlen); + delete[] invmap; + return tu_vec.size(); +} + +/* extract rotation matrix based on TMscore8 */ +void output_flexalign_rotation_matrix(const char* fname_matrix, + const vector<vector<double> >&tu_vec, double t[3], double u[3][3]) +{ + stringstream ss; + char dest[1000]; + for (int hinge=0;hinge<tu_vec.size();hinge++) + { + tu2t_u(tu_vec[hinge],t,u); + ss << "------ The rotation matrix to rotate Structure_1 to Structure_2 ------\n"; + sprintf(dest, "m %18s %14s %14s %14s\n", "t[m]", "u[m][0]", "u[m][1]", "u[m][2]"); + ss << string(dest); + for (int k = 0; k < 3; k++) + { + sprintf(dest, "%d %18.10f %14.10f %14.10f %14.10f\n", k, t[k], u[k][0], u[k][1], u[k][2]); + ss << string(dest); + } + } + ss << "\nCode for rotating Structure 1 from (x,y,z) to (X,Y,Z):\n" + "for(i=0; i<L; i++)\n" + "{\n" + " X[i] = t[0] + u[0][0]*x[i] + u[0][1]*y[i] + u[0][2]*z[i];\n" + " Y[i] = t[1] + u[1][0]*x[i] + u[1][1]*y[i] + u[1][2]*z[i];\n" + " Z[i] = t[2] + u[2][0]*x[i] + u[2][1]*y[i] + u[2][2]*z[i];\n" + "}\n"; + if (strcmp(fname_matrix,(char *)("-"))==0) + cout<<ss.str(); + else + { + fstream fout; + fout.open(fname_matrix, ios::out | ios::trunc); + if (fout) + { + fout<<ss.str(); + fout.close(); + } + else cout << 
"Open file to output rotation matrix fail.\n"; + } + ss.str(string()); +} + +void output_flexalign_rasmol(const string xname, const string yname, + const string fname_super,const vector<vector<double> >&tu_vec, + double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector<string>&resi_vec1, const vector<string>&resi_vec2, + const string chainID1, const string chainID2, + const int xlen, const int ylen, const double d0A, const int n_ali8, + const double rmsd, const double TM1, const double Liden) +{ + stringstream buf; + stringstream buf_all; + stringstream buf_atm; + stringstream buf_all_atm; + stringstream buf_all_atm_lig; + //stringstream buf_pdb; + stringstream buf_tm; + string line; + double x[3]; // before transform + double x1[3]; // after transform + bool after_ter; // true if passed the "TER" line in PDB + string asym_id; // chain ID + + map<string,int> resi2hinge_dict; + int r,i,j; + j=-1; + char hinge_char=0; + int ali_len=strlen(seqM); + for (r=0;r<strlen(seqxA);r++) + { + if (seqxA[r]=='-') continue; + j++; + hinge_char=seqM[r]; + if (hinge_char==' ') + { + for (i=1;i<ali_len;i++) + { + if (r-i>=0 && seqM[r-i]!=' ') + hinge_char=seqM[r-i]; + else if (r+i<xlen && seqM[r+i]!=' ') + hinge_char=seqM[r+i]; + if (hinge_char!=' ') break; + } + } + resi2hinge_dict[resi_vec1[j]]=hinge_char-'0'; + } + string resi=resi_vec1[0]; + int read_resi=resi.size()-4; + + buf_tm<<"REMARK US-align" + <<"\nREMARK Structure 1:"<<setw(11)<<left<<xname+chainID1<<" Size= "<<xlen + <<"\nREMARK Structure 2:"<<setw(11)<<yname+chainID2<<right<<" Size= "<<ylen + <<" (TM-score is normalized by "<<setw(4)<<ylen<<", d0=" + <<setiosflags(ios::fixed)<<setprecision(2)<<setw(6)<<d0A<<")" + <<"\nREMARK Aligned length="<<setw(4)<<n_ali8<<", RMSD=" + <<setw(6)<<setiosflags(ios::fixed)<<setprecision(2)<<rmsd + <<", 
TM-score="<<setw(7)<<setiosflags(ios::fixed)<<setprecision(5)<<TM1 + <<", ID="<<setw(5)<<setiosflags(ios::fixed)<<setprecision(3) + <<((n_ali8>0)?Liden/n_ali8:0)<<endl; + string rasmol_CA_header="load inline\nselect *A\nwireframe .45\nselect *B\nwireframe .20\nselect all\ncolor white\n"; + string rasmol_cartoon_header="load inline\nselect all\ncartoon\nselect *A\ncolor blue\nselect *B\ncolor red\nselect ligand\nwireframe 0.25\nselect solvent\nspacefill 0.25\nselect all\nexit\n"+buf_tm.str(); + if (!mm_opt) buf<<rasmol_CA_header; + buf_all<<rasmol_CA_header; + if (!mm_opt) buf_atm<<rasmol_cartoon_header; + buf_all_atm<<rasmol_cartoon_header; + buf_all_atm_lig<<rasmol_cartoon_header; + + /* selecting chains for -mol */ + string chain1_sele; + string chain2_sele; + if (!mm_opt) + { + if (split_opt==2 && ter_opt>=1) // align one chain from model 1 + { + chain1_sele=chainID1.substr(1); + chain2_sele=chainID2.substr(1); + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i<chainID1.size();i++) if (chainID1[i]==',') break; + chain1_sele=chainID1.substr(i+1); + for (i=1;i<chainID2.size();i++) if (chainID2[i]==',') break; + chain2_sele=chainID2.substr(i+1); + } + } + + + /* for PDBx/mmCIF only */ + map<string,int> _atom_site; + int atom_site_pos; + vector<string> line_vec; + string atom; // 4-character atom name + string AA; // 3-character residue name + string inscode; // 1-character insertion code + string model_index; // model index + bool is_mmcif=false; + + /* used for CONECT record of chain1 */ + int ca_idx1=0; // all CA atoms + int lig_idx1=0; // all atoms + vector <int> idx_vec; + + /* used for CONECT record of chain2 */ + int ca_idx2=0; // all CA atoms + int lig_idx2=0; // all atoms + + /* extract aligned region */ + vector<string> resi_aln1; + vector<string> resi_aln2; + int i1=-1; + int i2=-1; + if (!mm_opt) + { + for (i=0;i<strlen(seqM);i++) + { + i1+=(seqxA[i]!='-'); + i2+=(seqyA[i]!='-'); + if (seqM[i]==' ') continue; + 
resi_aln1.push_back(resi_vec1[i1].substr(0,4)); + resi_aln2.push_back(resi_vec2[i2].substr(0,4)); + if (seqM[i]!=':') continue; + buf <<"select "<<resi_aln1.back()<<":A," + <<resi_aln2.back()<<":B\ncolor red\n"; + buf_all<<"select "<<resi_aln1.back()<<":A," + <<resi_aln2.back()<<":B\ncolor red\n"; + } + buf<<"select all\nexit\n"<<buf_tm.str(); + } + buf_all<<"select all\nexit\n"<<buf_tm.str(); + + ifstream fin; + /* read first file */ + after_ter=false; + asym_id=""; + fin.open(xname.c_str()); + int hinge=0; + while (fin.good()) + { + getline(fin, line); + if (ter_opt>=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (is_mmcif==false && line.size()>=54 && + (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' && line[16]!=' ') continue; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + if (read_resi==1) resi=line.substr(22,5); + else resi=line.substr(22,5)+line[21]; + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + //buf_pdb<<line.substr(0,30)<<setiosflags(ios::fixed) + //<<setprecision(3) + //<<setw(8)<<x1[0] <<setw(8)<<x1[1] <<setw(8)<<x1[2] + //<<line.substr(54)<<'\n'; + + if (after_ter && line.compare(0,6,"ATOM ")==0) continue; + lig_idx1++; + buf_all_atm_lig<<line.substr(0,6)<<setw(5)<<lig_idx1 + <<line.substr(11,9)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1] <<setw(8)<<x1[2]<<'\n'; + if (chain1_sele.size() && line[21]!=chain1_sele[0]) continue; + if (after_ter || line.compare(0,6,"ATOM ")) continue; + if (ter_opt>=2) + { + if (ca_idx1 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<<setw(5)<<lig_idx1 + <<line.substr(11,9)<<" A"<<line.substr(22,8) + 
<<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1] <<setw(8)<<x1[2]<<'\n'; + if (!mm_opt && find(resi_aln1.begin(),resi_aln1.end(), + line.substr(22,4))!=resi_aln1.end()) + { + buf_atm<<"ATOM "<<setw(5)<<lig_idx1 + <<line.substr(11,9)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1] <<setw(8)<<x1[2]<<'\n'; + } + if (line.substr(12,4)!=" CA " && line.substr(12,4)!=" C3'") continue; + ca_idx1++; + buf_all<<"ATOM "<<setw(5)<<ca_idx1<<' ' + <<line.substr(12,4)<<' '<<line.substr(17,3)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1]<<setw(8)<<x1[2]<<'\n'; + if (find(resi_aln1.begin(),resi_aln1.end(), + line.substr(22,4))==resi_aln1.end()) continue; + if (!mm_opt) buf<<"ATOM "<<setw(5)<<ca_idx1<<' ' + <<line.substr(12,4)<<' '<<line.substr(17,3)<<" A"<<line.substr(22,8) + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<setw(8)<<x1[1]<<setw(8)<<x1[2]<<'\n'; + idx_vec.push_back(ca_idx1); + } + else if (line.compare(0,5,"loop_")==0) // PDBx/mmCIF + { + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[line.substr(11,line.size()-12)]=atom_site_pos; + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! 
Unexpected end of "+xname); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; + } + + if (is_mmcif==false) + { + //buf_pdb.str(string()); + is_mmcif=true; + } + + while(1) + { + line_vec.clear(); + split(line,line_vec); + if (line_vec[_atom_site["group_PDB"]]!="ATOM" && + line_vec[_atom_site["group_PDB"]]!="HETATM") break; + if (_atom_site.count("pdbx_PDB_model_num")) + { + if (model_index.size() && model_index!= + line_vec[_atom_site["pdbx_PDB_model_num"]]) + break; + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + } + + x[0]=atof(line_vec[_atom_site["Cartn_x"]].c_str()); + x[1]=atof(line_vec[_atom_site["Cartn_y"]].c_str()); + x[2]=atof(line_vec[_atom_site["Cartn_z"]].c_str()); + if (mirror_opt) x[2]=-x[2]; + + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + resi+=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + else resi+=" "; + if (read_resi>=2) + { + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + resi+=asym_id[0]; + } + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + + if (_atom_site.count("label_alt_id")==0 || + line_vec[_atom_site["label_alt_id"]]=="." 
|| + line_vec[_atom_site["label_alt_id"]]=="A") + { + atom=line_vec[_atom_site["label_atom_id"]]; + if (atom[0]=='"') atom=atom.substr(1); + if (atom.size() && atom[atom.size()-1]=='"') + atom=atom.substr(0,atom.size()-1); + if (atom.size()==0) atom=" "; + else if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + else if (atom.size()>=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if (_atom_site.count("auth_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["auth_asym_id"]]!=chain1_sele; + else if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain1_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain1_sele; + if (ter_opt>=2 && ca_idx1 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + //buf_pdb<<left<<setw(6) + //<<line_vec[_atom_site["group_PDB"]]<<right + //<<setw(5)<<lig_idx1%100000<<' '<<atom<<' ' + //<<AA<<" "<<asym_id[asym_id.size()-1] + //<<resi<<inscode<<" " + //<<setiosflags(ios::fixed)<<setprecision(3) + //<<setw(8)<<x1[0] + //<<setw(8)<<x1[1] + //<<setw(8)<<x1[2]<<'\n'; + + if (after_ter==false || + line_vec[_atom_site["group_pdb"]]=="HETATM") 
+ { + lig_idx1++; + buf_all_atm_lig<<left<<setw(6) + <<line_vec[_atom_site["group_PDB"]]<<right + <<setw(5)<<lig_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + if (after_ter==false && + line_vec[_atom_site["group_PDB"]]=="ATOM") + { + buf_all_atm<<"ATOM "<<setw(6) + <<setw(5)<<lig_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + if (!mm_opt && find(resi_aln1.begin(), + resi_aln1.end(),resi)!=resi_aln1.end()) + { + buf_atm<<"ATOM "<<setw(6) + <<setw(5)<<lig_idx1%100000<<' ' + <<atom<<' '<<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + } + if (atom==" CA " || atom==" C3'") + { + ca_idx1++; + //mm_opt, split_opt, mirror_opt, chainID1,chainID2); + buf_all<<"ATOM "<<setw(6) + <<setw(5)<<ca_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + if (!mm_opt && find(resi_aln1.begin(), + resi_aln1.end(),resi)!=resi_aln1.end()) + { + buf<<"ATOM "<<setw(6) + <<setw(5)<<ca_idx1%100000<<' '<<atom<<' ' + <<AA<<" A"<<resi<<inscode<<" " + <<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0] + <<setw(8)<<x1[1] + <<setw(8)<<x1[2]<<'\n'; + idx_vec.push_back(ca_idx1); + } + } + } + } + } + + while(1) + { + if (fin.good()) getline(fin, line); + else break; + if (line.size()) break; + } + } + } + else if (line.size() && is_mmcif==false) + { + //buf_pdb<<line<<'\n'; + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=1;i<ca_idx1;i++) buf_all<<"CONECT" + 
<<setw(5)<<i%100000<<setw(5)<<(i+1)%100000<<'\n'; + if (!mm_opt) for (i=1;i<idx_vec.size();i++) buf<<"CONECT" + <<setw(5)<<idx_vec[i-1]%100000<<setw(5)<<idx_vec[i]%100000<<'\n'; + idx_vec.clear(); + + /* read second file */ + after_ter=false; + asym_id=""; + fin.open(yname.c_str()); + while (fin.good()) + { + getline(fin, line); + if (ter_opt>=3 && line.compare(0,3,"TER")==0) after_ter=true; + if (line.size()>=54 && (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0)) // PDB format + { + if (line[16]!='A' && line[16]!=' ') continue; + if (after_ter && line.compare(0,6,"ATOM ")==0) continue; + lig_idx2++; + buf_all_atm_lig<<line.substr(0,6)<<setw(5)<<lig_idx1+lig_idx2 + <<line.substr(11,9)<<" B"<<line.substr(22,32)<<'\n'; + if (chain1_sele.size() && line[21]!=chain1_sele[0]) continue; + if (after_ter || line.compare(0,6,"ATOM ")) continue; + if (ter_opt>=2) + { + if (ca_idx2 && asym_id.size() && asym_id!=line.substr(21,1)) + { + after_ter=true; + continue; + } + asym_id=line[21]; + } + buf_all_atm<<"ATOM "<<setw(5)<<lig_idx1+lig_idx2 + <<line.substr(11,9)<<" B"<<line.substr(22,32)<<'\n'; + if (!mm_opt && find(resi_aln2.begin(),resi_aln2.end(), + line.substr(22,4))!=resi_aln2.end()) + { + buf_atm<<"ATOM "<<setw(5)<<lig_idx1+lig_idx2 + <<line.substr(11,9)<<" B"<<line.substr(22,32)<<'\n'; + } + if (line.substr(12,4)!=" CA " && line.substr(12,4)!=" C3'") continue; + ca_idx2++; + buf_all<<"ATOM "<<setw(5)<<ca_idx1+ca_idx2<<' '<<line.substr(12,4) + <<' '<<line.substr(17,3)<<" B"<<line.substr(22,32)<<'\n'; + if (find(resi_aln2.begin(),resi_aln2.end(),line.substr(22,4) + )==resi_aln2.end()) continue; + if (!mm_opt) buf<<"ATOM "<<setw(5)<<ca_idx1+ca_idx2<<' ' + <<line.substr(12,4)<<' '<<line.substr(17,3)<<" B" + <<line.substr(22,32)<<'\n'; + idx_vec.push_back(ca_idx1+ca_idx2); + } + else if (line.compare(0,5,"loop_")==0) // PDBx/mmCIF + { + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! 
Unexpected end of "+yname); + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) continue; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[line.substr(11,line.size()-12)]=atom_site_pos; + while(1) + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+yname); + if (line.size()==0) continue; + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[line.substr(11,line.size()-12)]=++atom_site_pos; + } + + while(1) + { + line_vec.clear(); + split(line,line_vec); + if (line_vec[_atom_site["group_PDB"]]!="ATOM" && + line_vec[_atom_site["group_PDB"]]!="HETATM") break; + if (_atom_site.count("pdbx_PDB_model_num")) + { + if (model_index.size() && model_index!= + line_vec[_atom_site["pdbx_PDB_model_num"]]) + break; + model_index=line_vec[_atom_site["pdbx_PDB_model_num"]]; + } + + if (_atom_site.count("label_alt_id")==0 || + line_vec[_atom_site["label_alt_id"]]=="." || + line_vec[_atom_site["label_alt_id"]]=="A") + { + atom=line_vec[_atom_site["label_atom_id"]]; + if (atom[0]=='"') atom=atom.substr(1); + if (atom.size() && atom[atom.size()-1]=='"') + atom=atom.substr(0,atom.size()-1); + if (atom.size()==0) atom=" "; + else if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + else if (atom.size()>=5) atom=atom.substr(0,4); + + AA=line_vec[_atom_site["label_comp_id"]]; // residue name + if (AA.size()==1) AA=" "+AA; + else if (AA.size()==2) AA=" " +AA; + else if (AA.size()>=4) AA=AA.substr(0,3); + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + while (resi.size()<4) resi=' '+resi; + if (resi.size()>4) resi=resi.substr(0,4); + + inscode=' '; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + inscode=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + + if (_atom_site.count("auth_asym_id")) + { + if 
(chain2_sele.size()) after_ter + =line_vec[_atom_site["auth_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["auth_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["auth_asym_id"]]; + } + else if (_atom_site.count("label_asym_id")) + { + if (chain2_sele.size()) after_ter + =line_vec[_atom_site["label_asym_id"]]!=chain2_sele; + if (ter_opt>=2 && ca_idx2 && asym_id.size() && + asym_id!=line_vec[_atom_site["label_asym_id"]]) + after_ter=true; + asym_id=line_vec[_atom_site["label_asym_id"]]; + } + if (after_ter==false || + line_vec[_atom_site["group_PDB"]]=="HETATM") + { + lig_idx2++; + buf_all_atm_lig<<left<<setw(6) + <<line_vec[_atom_site["group_PDB"]]<<right + <<setw(5)<<(lig_idx1+lig_idx2)%100000<<' ' + <<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + if (after_ter==false && + line_vec[_atom_site["group_PDB"]]=="ATOM") + { + buf_all_atm<<"ATOM "<<setw(6) + <<setw(5)<<(lig_idx1+lig_idx2)%100000<<' ' + <<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + if (!mm_opt && find(resi_aln2.begin(), + resi_aln2.end(),resi)!=resi_aln2.end()) + { + buf_atm<<"ATOM "<<setw(6) + <<setw(5)<<(lig_idx1+lig_idx2)%100000<<' ' + <<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + } + if (atom==" CA " || atom==" C3'") + { + ca_idx2++; + buf_all<<"ATOM "<<setw(6) + <<setw(5)<<(ca_idx1+ca_idx2)%100000 + <<' '<<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + if (!mm_opt && 
find(resi_aln2.begin(), + resi_aln2.end(),resi)!=resi_aln2.end()) + { + buf<<"ATOM "<<setw(6) + <<setw(5)<<(ca_idx1+ca_idx2)%100000 + <<' '<<atom<<' '<<AA<<" B"<<resi<<inscode<<" " + <<setw(8)<<line_vec[_atom_site["Cartn_x"]] + <<setw(8)<<line_vec[_atom_site["Cartn_y"]] + <<setw(8)<<line_vec[_atom_site["Cartn_z"]] + <<'\n'; + idx_vec.push_back(ca_idx1+ca_idx2); + } + } + } + } + } + + if (fin.good()) getline(fin, line); + else break; + } + } + else if (line.size()) + { + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + } + } + fin.close(); + if (!mm_opt) buf<<"TER\n"; + buf_all<<"TER\n"; + if (!mm_opt) buf_atm<<"TER\n"; + buf_all_atm<<"TER\n"; + buf_all_atm_lig<<"TER\n"; + for (i=ca_idx1+1;i<ca_idx1+ca_idx2;i++) buf_all<<"CONECT" + <<setw(5)<<i%100000<<setw(5)<<(i+1)%100000<<'\n'; + for (i=1;i<idx_vec.size();i++) buf<<"CONECT" + <<setw(5)<<idx_vec[i-1]%100000<<setw(5)<<idx_vec[i]%100000<<'\n'; + idx_vec.clear(); + + /* write pymol script */ + ofstream fp; + /* + stringstream buf_pymol; + vector<string> pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + for (i=0;i<pml_list.size();i++) + { + buf_pymol<<"#!/usr/bin/env pymol\n" + <<"load "<<pml_list[i]<<"\n" + <<"hide all\n" + <<((i==0 || i==2)?("show stick\n"):("show cartoon\n")) + <<"color blue, chain A\n" + <<"color red, chain B\n" + <<"set ray_shadow, 0\n" + <<"set stick_radius, 0.3\n" + <<"set sphere_scale, 0.25\n" + <<"show stick, not polymer\n" + <<"show sphere, not polymer\n" + <<"bg_color white\n" + <<"set transparency=0.2\n" + <<"zoom polymer\n" + <<endl; + fp.open((pml_list[i]+".pml").c_str()); + fp<<buf_pymol.str(); + fp.close(); + buf_pymol.str(string()); + pml_list[i].clear(); + } + pml_list.clear(); + */ + + /* write rasmol script */ + if (!mm_opt) + { + fp.open((fname_super).c_str()); + fp<<buf.str(); + fp.close(); + } + 
fp.open((fname_super+"_all").c_str()); + fp<<buf_all.str(); + fp.close(); + if (!mm_opt) + { + fp.open((fname_super+"_atm").c_str()); + fp<<buf_atm.str(); + fp.close(); + } + fp.open((fname_super+"_all_atm").c_str()); + fp<<buf_all_atm.str(); + fp.close(); + fp.open((fname_super+"_all_atm_lig").c_str()); + fp<<buf_all_atm_lig.str(); + fp.close(); + //fp.open((fname_super+".pdb").c_str()); + //fp<<buf_pdb.str(); + //fp.close(); + + /* clear stream */ + buf.str(string()); + buf_all.str(string()); + buf_atm.str(string()); + buf_all_atm.str(string()); + buf_all_atm_lig.str(string()); + //buf_pdb.str(string()); + buf_tm.str(string()); + resi_aln1.clear(); + resi_aln2.clear(); + asym_id.clear(); + line_vec.clear(); + atom.clear(); + AA.clear(); + resi.clear(); + inscode.clear(); + model_index.clear(); +} + +void output_flexalign_pymol(const string xname, const string yname, + const string fname_super, const vector<vector<double> >&tu_vec, + double t[3], double u[3][3], const int ter_opt, + const int mm_opt, const int split_opt, const int mirror_opt, + const char *seqM, const char *seqxA, const char *seqyA, + const vector<string>&resi_vec1, const vector<string>&resi_vec2, + const string chainID1, const string chainID2) +{ + int compress_type=0; // uncompressed file + ifstream fin; +#ifndef REDI_PSTREAM_H_SEEN + ifstream fin_gz; +#else + redi::ipstream fin_gz; // if file is compressed + if (xname.size()>=3 && + xname.substr(xname.size()-3,3)==".gz") + { + fin_gz.open("gunzip -c "+xname); + compress_type=1; + } + else if (xname.size()>=4 && + xname.substr(xname.size()-4,4)==".bz2") + { + fin_gz.open("bzcat "+xname); + compress_type=2; + } + else +#endif + fin.open(xname.c_str()); + + map<string,int> resi2hinge_dict; + int r,i,j; + j=-1; + char hinge_char=0; + int xlen=resi_vec1.size(); + int ali_len=strlen(seqM); + for (r=0;r<strlen(seqxA);r++) + { + if (seqxA[r]=='-') continue; + j++; + hinge_char=seqM[r]; + if (hinge_char==' ') + { + for (i=1;i<ali_len;i++) + { + if 
(r-i>=0 && seqM[r-i]!=' ') + hinge_char=seqM[r-i]; + else if (r+i<xlen && seqM[r+i]!=' ') + hinge_char=seqM[r+i]; + if (hinge_char!=' ') break; + } + } + resi2hinge_dict[resi_vec1[j]]=hinge_char-'0'; + } + string resi=resi_vec1[0]; + int read_resi=resi.size()-4; + + stringstream buf; + stringstream buf_pymol; + string line; + double x[3]; // before transform + double x1[3]; // after transform + + /* for PDBx/mmCIF only */ + map<string,int> _atom_site; + size_t atom_site_pos; + vector<string> line_vec; + int infmt=-1; // 0 - PDB, 3 - PDBx/mmCIF + int hinge=0; + string asym_id="."; // this is similar to chainID, except that + // chainID is char while asym_id is a string + // with possibly multiple char + while (compress_type?fin_gz.good():fin.good()) + { + if (compress_type) getline(fin_gz, line); + else getline(fin, line); + if (line.compare(0, 6, "ATOM ")==0 || + line.compare(0, 6, "HETATM")==0) // PDB format + { + infmt=0; + x[0]=atof(line.substr(30,8).c_str()); + x[1]=atof(line.substr(38,8).c_str()); + x[2]=atof(line.substr(46,8).c_str()); + if (mirror_opt) x[2]=-x[2]; + if (read_resi==1) resi=line.substr(22,5); + else resi=line.substr(22,5)+line[21]; + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + buf<<line.substr(0,30)<<setiosflags(ios::fixed) + <<setprecision(3) + <<setw(8)<<x1[0] <<setw(8)<<x1[1] <<setw(8)<<x1[2] + <<line.substr(54)<<'\n'; + } + else if (line.compare(0,5,"loop_")==0) // PDBx/mmCIF + { + infmt=3; + buf<<line<<'\n'; + while(1) + { + if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! 
Unexpected end of "+xname); + } + if (line.size()) break; + } + buf<<line<<'\n'; + if (line.compare(0,11,"_atom_site.")) continue; + _atom_site.clear(); + atom_site_pos=0; + _atom_site[Trim(line.substr(11))]=atom_site_pos; + while(1) + { + while(1) + { + if (compress_type) + { + if (fin_gz.good()) getline(fin_gz, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + } + else + { + if (fin.good()) getline(fin, line); + else PrintErrorAndQuit("ERROR! Unexpected end of "+xname); + } + if (line.size()) break; + } + if (line.compare(0,11,"_atom_site.")) break; + _atom_site[Trim(line.substr(11))]=++atom_site_pos; + buf<<line<<'\n'; + } + + if (_atom_site.count("group_PDB")* + _atom_site.count("Cartn_x")* + _atom_site.count("Cartn_y")* + _atom_site.count("Cartn_z")==0) + { + buf<<line<<'\n'; + cerr<<"Warning! Missing one of the following _atom_site data items: group_PDB, Cartn_x, Cartn_y, Cartn_z"<<endl; + continue; + } + + while(1) + { + line_vec.clear(); + split(line,line_vec); + if (line_vec[_atom_site["group_PDB"]]!="ATOM" && + line_vec[_atom_site["group_PDB"]]!="HETATM") break; + + x[0]=atof(line_vec[_atom_site["Cartn_x"]].c_str()); + x[1]=atof(line_vec[_atom_site["Cartn_y"]].c_str()); + x[2]=atof(line_vec[_atom_site["Cartn_z"]].c_str()); + if (mirror_opt) x[2]=-x[2]; + + + + if (_atom_site.count("auth_seq_id")) + resi=line_vec[_atom_site["auth_seq_id"]]; + else resi=line_vec[_atom_site["label_seq_id"]]; + if (_atom_site.count("pdbx_PDB_ins_code") && + line_vec[_atom_site["pdbx_PDB_ins_code"]]!="?") + resi+=line_vec[_atom_site["pdbx_PDB_ins_code"]][0]; + else resi+=" "; + if (read_resi>=2) + { + if (_atom_site.count("auth_asym_id")) + asym_id=line_vec[_atom_site["auth_asym_id"]]; + else asym_id=line_vec[_atom_site["label_asym_id"]]; + if (asym_id==".") asym_id=" "; + resi+=asym_id[0]; + } + hinge=0; + if (resi2hinge_dict.count(resi)) hinge=resi2hinge_dict[resi]; + tu2t_u(tu_vec[hinge],t,u); + transform(t, u, x, x1); + + for (atom_site_pos=0; 
atom_site_pos<_atom_site.size(); atom_site_pos++) + { + if (atom_site_pos==_atom_site["Cartn_x"]) + buf<<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[0]<<' '; + else if (atom_site_pos==_atom_site["Cartn_y"]) + buf<<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[1]<<' '; + else if (atom_site_pos==_atom_site["Cartn_z"]) + buf<<setiosflags(ios::fixed)<<setprecision(3) + <<setw(8)<<x1[2]<<' '; + else buf<<line_vec[atom_site_pos]<<' '; + } + buf<<'\n'; + + if (compress_type && fin_gz.good()) getline(fin_gz, line); + else if (!compress_type && fin.good()) getline(fin, line); + else break; + } + if (compress_type?fin_gz.good():fin.good()) buf<<line<<'\n'; + } + else if (line.size()) + { + buf<<line<<'\n'; + if (ter_opt>=1 && line.compare(0,3,"END")==0) break; + } + } + if (compress_type) fin_gz.close(); + else fin.close(); + + string fname_super_full=fname_super; + if (infmt==0) fname_super_full+=".pdb"; + else if (infmt==3) fname_super_full+=".cif"; + ofstream fp; + fp.open(fname_super_full.c_str()); + fp<<buf.str(); + fp.close(); + buf.str(string()); // clear stream + + string chain1_sele; + string chain2_sele; + if (!mm_opt) + { + if (split_opt==2 && ter_opt>=1) // align one chain from model 1 + { + chain1_sele=" and c. "+chainID1.substr(1); + chain2_sele=" and c. "+chainID2.substr(1); + } + else if (split_opt==2 && ter_opt==0) // align one chain from each model + { + for (i=1;i<chainID1.size();i++) if (chainID1[i]==',') break; + chain1_sele=" and c. "+chainID1.substr(i+1); + for (i=1;i<chainID2.size();i++) if (chainID2[i]==',') break; + chain2_sele=" and c. 
"+chainID2.substr(i+1); + } + } + + /* extract aligned region */ + int i1=-1; + int i2=-1; + string resi1_sele; + string resi2_sele; + string resi1_bond; + string resi2_bond; + string prev_resi1; + string prev_resi2; + string curr_resi1; + string curr_resi2; + if (mm_opt) + { + ; + } + else + { + for (i=0;i<strlen(seqM);i++) + { + i1+=(seqxA[i]!='-' && seqxA[i]!='*'); + i2+=(seqyA[i]!='-'); + if (seqM[i]==' ' || seqxA[i]=='*') continue; + curr_resi1=resi_vec1[i1].substr(0,4); + curr_resi2=resi_vec2[i2].substr(0,4); + if (resi1_sele.size()==0) + resi1_sele = "i. "+curr_resi1; + else + { + resi1_sele+=" or i. "+curr_resi1; + resi1_bond+="bond structure1 and i. "+prev_resi1+ + ", i. "+curr_resi1+"\n"; + } + if (resi2_sele.size()==0) + resi2_sele = "i. "+curr_resi2; + else + { + resi2_sele+=" or i. "+curr_resi2; + resi2_bond+="bond structure2 and i. "+prev_resi2+ + ", i. "+curr_resi2+"\n"; + } + prev_resi1=curr_resi1; + prev_resi2=curr_resi2; + //if (seqM[i]!=':') continue; + } + if (resi1_sele.size()) resi1_sele=" and ( "+resi1_sele+")"; + if (resi2_sele.size()) resi2_sele=" and ( "+resi2_sele+")"; + } + + /* write pymol script */ + vector<string> pml_list; + pml_list.push_back(fname_super+""); + pml_list.push_back(fname_super+"_atm"); + pml_list.push_back(fname_super+"_all"); + pml_list.push_back(fname_super+"_all_atm"); + pml_list.push_back(fname_super+"_all_atm_lig"); + + for (int p=0;p<pml_list.size();p++) + { + if (mm_opt && p<=1) continue; + buf_pymol + <<"#!/usr/bin/env pymol\n" + <<"cmd.load(\""<<fname_super_full<<"\", \"structure1\")\n" + <<"cmd.load(\""<<yname<<"\", \"structure2\")\n" + <<"hide all\n" + <<"set all_states, "<<((ter_opt==0)?"on":"off")<<'\n'; + if (p==0) // .pml + { + if (chain1_sele.size()) buf_pymol + <<"remove structure1 and not "<<chain1_sele.substr(4)<<"\n"; + if (chain2_sele.size()) buf_pymol + <<"remove structure2 and not "<<chain2_sele.substr(4)<<"\n"; + buf_pymol + <<"remove not n. CA and not n. 
C3'\n" + <<resi1_bond + <<resi2_bond + <<"show stick, structure1"<<chain1_sele<<resi1_sele<<"\n" + <<"show stick, structure2"<<chain2_sele<<resi2_sele<<"\n"; + } + else if (p==1) // _atm.pml + { + buf_pymol + <<"show cartoon, structure1"<<chain1_sele<<resi1_sele<<"\n" + <<"show cartoon, structure2"<<chain2_sele<<resi2_sele<<"\n"; + } + else if (p==2) // _all.pml + { + buf_pymol + <<"show ribbon, structure1"<<chain1_sele<<"\n" + <<"show ribbon, structure2"<<chain2_sele<<"\n"; + } + else if (p==3) // _all_atm.pml + { + buf_pymol + <<"show cartoon, structure1"<<chain1_sele<<"\n" + <<"show cartoon, structure2"<<chain2_sele<<"\n"; + } + else if (p==4) // _all_atm_lig.pml + { + buf_pymol + <<"show cartoon, structure1\n" + <<"show cartoon, structure2\n" + <<"show stick, not polymer\n" + <<"show sphere, not polymer\n"; + } + buf_pymol + <<"color blue, structure1\n" + <<"color red, structure2\n" + <<"set ribbon_width, 6\n" + <<"set stick_radius, 0.3\n" + <<"set sphere_scale, 0.25\n" + <<"set ray_shadow, 0\n" + <<"bg_color white\n" + <<"set transparency=0.2\n" + <<"zoom polymer and ((structure1"<<chain1_sele + <<") or (structure2"<<chain2_sele<<"))\n" + <<endl; + + fp.open((pml_list[p]+".pml").c_str()); + fp<<buf_pymol.str(); + fp.close(); + buf_pymol.str(string()); + } + + /* clean up */ + pml_list.clear(); + + resi1_sele.clear(); + resi2_sele.clear(); + + resi1_bond.clear(); + resi2_bond.clear(); + + prev_resi1.clear(); + prev_resi2.clear(); + + curr_resi1.clear(); + curr_resi2.clear(); + + chain1_sele.clear(); + chain2_sele.clear(); + resi2hinge_dict.clear(); +} + +//output the final results +void output_flexalign_results(const string xname, const string yname, + const string chainID1, const string chainID2, + const int xlen, const int ylen, double t[3], double u[3][3], + const vector<vector<double> >&tu_vec, const double TM1, const double TM2, + const double TM3, const double TM4, const double TM5, + const double rmsd, const double d0_out, const char *seqM, + const char 
*seqxA, const char *seqyA, const double Liden, + const int n_ali8, const int L_ali, const double TM_ali, + const double rmsd_ali, const double TM_0, const double d0_0, + const double d0A, const double d0B, const double Lnorm_ass, + const double d0_scale, const double d0a, const double d0u, + const char* fname_matrix, const int outfmt_opt, const int ter_opt, + const int mm_opt, const int split_opt, const int o_opt, + const string fname_super, const int i_opt, const int a_opt, + const bool u_opt, const bool d_opt, const int mirror_opt, + const vector<string>&resi_vec1, const vector<string>&resi_vec2) +{ + if (outfmt_opt<=0) + { + printf("\nName of Structure_1: %s%s (to be superimposed onto Structure_2)\n", + xname.c_str(), chainID1.c_str()); + printf("Name of Structure_2: %s%s\n", yname.c_str(), chainID2.c_str()); + printf("Length of Structure_1: %d residues\n", xlen); + printf("Length of Structure_2: %d residues\n\n", ylen); + + if (i_opt) + printf("User-specified initial alignment: TM/Lali/rmsd = %7.5lf, %4d, %6.3lf\n", TM_ali, L_ali, rmsd_ali); + + printf("Aligned length= %d, RMSD= %6.2f, Seq_ID=n_identical/n_aligned= %4.3f\n", n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + printf("TM-score= %6.5f (normalized by length of Structure_1: L=%d, d0=%.2f)\n", TM2, xlen, d0B); + printf("TM-score= %6.5f (normalized by length of Structure_2: L=%d, d0=%.2f)\n", TM1, ylen, d0A); + + if (a_opt==1) + printf("TM-score= %6.5f (if normalized by average length of two structures: L=%.1f, d0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + if (u_opt) + printf("TM-score= %6.5f (normalized by user-specified L=%.2f and d0=%.2f)\n", TM4, Lnorm_ass, d0u); + if (d_opt) + printf("TM-score= %6.5f (scaled by user-specified d0=%.2f, and L=%d)\n", TM5, d0_scale, ylen); + printf("(You should use TM-score normalized by length of the reference structure)\n"); + + //output alignment + printf("\n([0-9] denote different aligned fragment pairs separated by different hinges)\n"); + printf("%s\n", seqxA); + 
printf("%s\n", seqM); + printf("%s\n", seqyA); + } + else if (outfmt_opt==1) + { + printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + xname.c_str(), chainID1.c_str(), xlen, d0B, Liden/xlen, TM2); + printf("%s\n", seqxA); + printf(">%s%s\tL=%d\td0=%.2f\tseqID=%.3f\tTM-score=%.5f\n", + yname.c_str(), chainID2.c_str(), ylen, d0A, Liden/ylen, TM1); + printf("%s\n", seqyA); + + printf("# Lali=%d\tRMSD=%.2f\tseqID_ali=%.3f\n", + n_ali8, rmsd, (n_ali8>0)?Liden/n_ali8:0); + + if (i_opt) + printf("# User-specified initial alignment: TM=%.5lf\tLali=%4d\trmsd=%.3lf\n", TM_ali, L_ali, rmsd_ali); + + if(a_opt) + printf("# TM-score=%.5f (normalized by average length of two structures: L=%.1f\td0=%.2f)\n", TM3, (xlen+ylen)*0.5, d0a); + + if(u_opt) + printf("# TM-score=%.5f (normalized by user-specified L=%.2f\td0=%.2f)\n", TM4, Lnorm_ass, d0u); + + if(d_opt) + printf("# TM-score=%.5f (scaled by user-specified d0=%.2f\tL=%d)\n", TM5, d0_scale, ylen); + + printf("$$$$\n"); + } + else if (outfmt_opt==2) + { + printf("%s%s\t%s%s\t%.4f\t%.4f\t%.2f\t%4.3f\t%4.3f\t%4.3f\t%d\t%d\t%d", + xname.c_str(), chainID1.c_str(), yname.c_str(), chainID2.c_str(), + TM2, TM1, rmsd, Liden/xlen, Liden/ylen, (n_ali8>0)?Liden/n_ali8:0, + xlen, ylen, n_ali8); + } + cout << endl; + + if (strlen(fname_matrix)) output_flexalign_rotation_matrix( + fname_matrix, tu_vec, t, u); + + if (o_opt==1) output_flexalign_pymol(xname, yname, fname_super, tu_vec, + t, u, ter_opt, mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2); + else if (o_opt==2) + output_flexalign_rasmol(xname, yname, fname_super, tu_vec, + t, u, ter_opt, mm_opt, split_opt, mirror_opt, seqM, seqxA, seqyA, + resi_vec1, resi_vec2, chainID1, chainID2, + xlen, ylen, d0A, n_ali8, rmsd, TM1, Liden); +} + +#endif diff --git a/modules/bindings/src/tmalign/param_set.h b/modules/bindings/src/USalign/param_set.h similarity index 100% rename from modules/bindings/src/tmalign/param_set.h rename to 
modules/bindings/src/USalign/param_set.h diff --git a/modules/bindings/src/tmalign/pdb2fasta.cpp b/modules/bindings/src/USalign/pdb2fasta.cpp similarity index 79% rename from modules/bindings/src/tmalign/pdb2fasta.cpp rename to modules/bindings/src/USalign/pdb2fasta.cpp index 7c94206ff..e0fc71206 100644 --- a/modules/bindings/src/tmalign/pdb2fasta.cpp +++ b/modules/bindings/src/USalign/pdb2fasta.cpp @@ -20,16 +20,21 @@ void print_help() " Default is \" C3'\" for RNA/DNA and \" CA \" for proteins\n" " (note the spaces before and after CA).\n" "\n" +" -mol Type of molecule(s) to align.\n" +" auto: (default) align both protein and nucleic acids.\n" +" prot: only align proteins in a structure.\n" +" RNA : only align RNA and DNA in a structure.\n" +"\n" " -ter Strings to mark the end of a chain\n" -" 3: (default) TER, ENDMDL, END or different chain ID\n" +" 3: TER, ENDMDL, END or different chain ID\n" " 2: ENDMDL, END, or different chain ID\n" -" 1: ENDMDL or END\n" +" 1: (default) ENDMDL or END\n" " 0: end of file\n" "\n" " -split Whether to split PDB file into multiple chains\n" -" 0: (default) treat the whole structure as one single chain\n" +" 0: treat the whole structure as one single chain\n" " 1: treat each MODEL as a separate chain (-ter should be 0)\n" -" 2: treat each chain as a seperate chain (-ter should be <=1)\n" +" 2: (default) treat each chain as a seperate chain (-ter should be <=1)\n" "\n" " -het Whether to read residues marked as 'HETATM' in addition to 'ATOM '\n" " 0: (default) only align 'ATOM ' residues\n" @@ -53,11 +58,12 @@ int main(int argc, char *argv[]) /* get argument */ /**********************/ string xname = ""; - int ter_opt =3; // TER, END, or different chainID + int ter_opt =1; // TER, END, or different chainID int infmt_opt =-1; // PDB or PDBx/mmCIF format - int split_opt =0; // do not split chain + int split_opt =2; // do not split chain int het_opt=0; // do not read HETATM residues string atom_opt ="auto";// use C alpha atom for 
protein and C3' for RNA + string mol_opt ="auto";// auto-detect the molecule type as protein/RNA string suffix_opt=""; // set -suffix to empty string dir_opt =""; // set -dir to empty vector<string> chain_list; // only when -dir1 is set @@ -77,6 +83,12 @@ int main(int argc, char *argv[]) { atom_opt=argv[i + 1]; i++; } + else if ( !strcmp(argv[i],"-mol") ) + { + if (i>=(argc-1)) + PrintErrorAndQuit("ERROR! Missing value for -mol"); + mol_opt=argv[i + 1]; i++; + } else if ( !strcmp(argv[i],"-dir") && i < (argc-1) ) { dir_opt=argv[i + 1]; i++; @@ -108,6 +120,16 @@ int main(int argc, char *argv[]) PrintErrorAndQuit("-split 2 should be used with -ter 0 or 1"); if (split_opt<0 || split_opt>2) PrintErrorAndQuit("-split can only be 0, 1 or 2"); + if (mol_opt=="prot") mol_opt="protein"; + else if (mol_opt=="DNA") mol_opt="RNA"; + if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA") + PrintErrorAndQuit("ERROR! Molecule type must be one of the" + "following:\nauto, prot (the same as 'protein'), and " + "RNA (the same as 'DNA')."); + if (mol_opt=="protein" && atom_opt=="auto") + atom_opt=" CA "; + else if (mol_opt=="RNA" && atom_opt=="auto") + atom_opt=" C3'"; /* parse file list */ if (dir_opt.size()==0) diff --git a/modules/bindings/src/tmalign/pdb2ss.cpp b/modules/bindings/src/USalign/pdb2ss.cpp similarity index 100% rename from modules/bindings/src/tmalign/pdb2ss.cpp rename to modules/bindings/src/USalign/pdb2ss.cpp diff --git a/modules/bindings/src/tmalign/pdb2xyz.cpp b/modules/bindings/src/USalign/pdb2xyz.cpp similarity index 100% rename from modules/bindings/src/tmalign/pdb2xyz.cpp rename to modules/bindings/src/USalign/pdb2xyz.cpp diff --git a/modules/bindings/src/USalign/pdbAtomName.cpp b/modules/bindings/src/USalign/pdbAtomName.cpp new file mode 100644 index 000000000..d65c576d2 --- /dev/null +++ b/modules/bindings/src/USalign/pdbAtomName.cpp @@ -0,0 +1,232 @@ +#include <fstream> +#include <map> +#include <sstream> +#include <iostream> +#include <string> 
/* Show the command line usage on stdout, then end the program with a
 * successful exit status. */
void print_help()
{
    const char *usage=
"Fix atom name justification in PDB format file.\n"
"\n"
"Usage: pdbAtomName input.pdb output.pdb\n";
    std::cout<<usage<<std::endl;
    exit(EXIT_SUCCESS);
}

/* Append to `lines` every maximal run of characters in `line` that does not
 * contain `delimiter`.  Consecutive delimiters therefore produce no empty
 * entries, and entries already present in `lines` are kept. */
void splitlines(const std::string &line, std::vector<std::string> &lines,
    const char delimiter='\n')
{
    std::string::size_type first=line.find_first_not_of(delimiter);
    while (first!=std::string::npos)
    {
        std::string::size_type last=line.find(delimiter,first);
        if (last==std::string::npos) last=line.size();
        lines.push_back(line.substr(first,last-first));
        first=line.find_first_not_of(delimiter,last);
    }
}
i; + string msg; + map<string,int> msg_dict; + size_t changeNum=0; + for (l=0;l<lines.size();l++) + { + if (lines[l].substr(0,6)=="ATOM " || + lines[l].substr(0,6)=="HETATM") + { + if (lines[l].size()<54) + { + cerr<<"incomplete:"<<lines[l]<<endl; + continue; + } + resn=lines[l].substr(17,3); + if (resn[2]==' ') + { + if (resn[1]==' ') resn=" "+resn.substr(0,1); + else resn=" "+resn.substr(0,2); + msg=lines[l].substr(17,3)+"=>"+resn; + if (msg_dict.count(msg)==0) + { + cerr<<msg<<'.'<<endl; + msg_dict[msg]=0; + } + msg_dict[msg]++; + changeNum++; + } + if (lines[l].size()<78 && aa3to1.count(resn)==0) + { + cerr<<"heteroatom:"<<lines[l]<<endl; + buf<<lines[l].substr(0,17)<<resn<<lines[l].substr(20)<<endl; + continue; + } + + atom=lines[l].substr(12,4); + idxBegin = idxEnd = -1; + for (i=0;i<4;i++) + { + if (atom[i]==' ') continue; + if (idxBegin==-1) idxBegin=i; + idxEnd=i; + } + if (idxBegin>=0 && (idxBegin>0 || idxEnd<3)) + atom = atom.substr(idxBegin, idxEnd + 1 - idxBegin); + if (atom[atom.size()-1]=='*') // C3* (old) => C3' (new) + atom=atom.substr(0,atom.size()-1)+"'"; + if (atom.size()==4) + { + buf<<lines[l].substr(0,17)<<resn<<lines[l].substr(20)<<endl; + continue; + } + if ((lines[l].size()>=78 && lines[l][76]!=' ' && lines[l][77]!=' ')|| + ('0'<=atom[0] && atom[0]<='9')) + { + if (atom.size()==1) atom+=" "; + else if (atom.size()==2) atom+=" "; + else if (atom.size()==3) atom+=" "; + } + else if (resn=="MSE" && atom=="SE") atom="SE "; + else + { + if (atom.size()==1) atom=" "+atom+" "; + else if (atom.size()==2) atom=" "+atom+" "; + else if (atom.size()==3) atom=" "+atom; + } + if (atom!=lines[l].substr(12,4)) + { + msg=resn+":"+lines[l].substr(12,4)+"=>"+atom; + if (msg_dict.count(msg)==0) + { + cerr<<msg<<'.'<<endl; + msg_dict[msg]=0; + } + msg_dict[msg]++; + changeNum++; + } + buf<<lines[l].substr(0,12)<<atom<<lines[l].substr(16,1) + <<resn<<lines[l].substr(20)<<endl; + } + else if (lines[l].size()) + { + buf<<lines[l]<<endl; + } + lines[l].clear(); + 
} + + if (outfile=="-") + cout<<buf.str(); + else + { + ofstream fout; + fout.open(outfile.c_str(),ios::out); + fout<<buf.str(); + fout.close(); + } + buf.str(string()); + vector<string>().swap(lines); + map<string,int>().swap(msg_dict); + map<string,string>().swap(aa3to1); + if (changeNum) + cerr<<"Update "<<changeNum<<" atom name in "<<infile<<endl; + return changeNum; +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) print_help(); + + string infile =""; + string outfile=""; + + for (int i=1; i<argc; i++) + { + if (infile.size()==0) infile=argv[i]; + else if (outfile.size()==0) outfile=argv[i]; + else + { + cerr<<"ERROR! no such option "<<argv[i]<<endl; + exit(1); + } + } + + if (outfile.size()==0) outfile="-"; + + pdbAtomName(infile,outfile); + + infile.clear(); + outfile.clear(); + return 0; +} diff --git a/modules/bindings/src/tmalign/pstream.h b/modules/bindings/src/USalign/pstream.h similarity index 99% rename from modules/bindings/src/tmalign/pstream.h rename to modules/bindings/src/USalign/pstream.h index 28cbeadb1..12c759874 100644 --- a/modules/bindings/src/tmalign/pstream.h +++ b/modules/bindings/src/USalign/pstream.h @@ -15,6 +15,11 @@ * and redi::rpstream. */ +/* do not compile on windows, which does not have cygwin */ +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) && !defined(__CYGWIN__) +#define NO_PSTREAM +#else + #ifndef REDI_PSTREAM_H_SEEN #define REDI_PSTREAM_H_SEEN @@ -2250,6 +2255,6 @@ namespace redi */ #endif // REDI_PSTREAM_H_SEEN - +#endif // WIN32 // vim: ts=2 sw=2 expandtab diff --git a/modules/bindings/src/USalign/qTMclust.cpp b/modules/bindings/src/USalign/qTMclust.cpp new file mode 100644 index 000000000..08fc64b68 --- /dev/null +++ b/modules/bindings/src/USalign/qTMclust.cpp @@ -0,0 +1,723 @@ +/* Different filters are used when different header files are included. + * At least one of HwRMSD.h and TMalign.h should be included. + * HwRMSD.h implement HwRMSD filter. 
+ * No filter will be used if only TMalign.h is included. */
+
+#include "HwRMSD.h"
+#include "TMalign.h"
+
+using namespace std;
+
+/* Print the additional options shown by -h. Does not terminate the program. */
+void print_extra_help()
+{
+    cout <<
+"Additional options:\n"
+" -fast Fast but slightly inaccurate final alignment\n"
+"\n"
+" -atom 4-character atom name used to represent a residue.\n"
+" Default is \" C3'\" for RNA/DNA and \" CA \" for proteins\n"
+" (note the spaces before and after CA).\n"
+"\n"
+" -mol Molecule type: RNA or protein\n"
+" Default is detect molecule type automatically\n"
+"\n"
+" -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n"
+" 0: (default) only align 'ATOM ' residues\n"
+" 1: align both 'ATOM ' and 'HETATM' residues\n"
+"\n"
+" -infmt Input format\n"
+" -1: (default) automatically detect PDB or PDBx/mmCIF format\n"
+" 0: PDB format\n"
+" 1: SPICKER format\n"
+" 2: xyz format\n"
+" 3: PDBx/mmCIF format\n"
+    <<endl;
+}
+
+/* Print the usage message (plus the additional options when h_opt is true)
+ * and terminate the program with EXIT_SUCCESS. */
+void print_help(bool h_opt=false)
+{
+    cout << "\n"
+"qTMclust: Structure Clustering by Sequence-Indepedent Structure Alignment\n"
+"\n"
+"Usage 1: (alignment within a folder of PDB files)\n"
+" qTMclust -dir chain_folder/ chain_list -TMcut 0.5 -o cluster.txt\n"
+"\n"
+"Usage 2: (alignment within chains or within models of a single PDB file)\n"
+" qTMclust -split 2 -ter 1 multichain.pdb -TMcut 0.5 -o cluster.txt\n"
+" qTMclust -split 1 -ter 0 multimodel.pdb -TMcut 0.5 -o cluster.txt\n"
+"\n"
+"Options:\n"
+" -TMcut TM-score cutoff in the range of [0.45,1) for considering two\n"
+" structures being similar. Default is 0.5.\n"
+"\n"
+" -s Which TM-score to use when aligning structures with different lengths?\n"
+" 1: the larger TM-score, i.e. normalized by shorter length\n"
+" 2: (default) the smaller TM-score, i.e. normalized by longer length\n"
+" 3: average of the two TM-scores\n"
+" 4: harmonic average of the two TM-scores\n"
+" 5: geometric average of the two TM-scores\n"
+" 6: root mean square of the two TM-scores\n"
+"\n"
+" -o Output the cluster result to file.\n"
+" Default is print result to screen.\n"
+"\n"
+" -dir Perform all-against-all alignment among the list of PDB\n"
+" chains listed by 'chain_list' under 'chain_folder'. Note\n"
+" that the slash is necessary.\n"
+" $ qTMclust -dir chain_folder/ chain_list\n"
+"\n"
+" -suffix (Only when -dir is set, default is empty)\n"
+" add file name suffix to files listed by chain_list\n"
+"\n"
+" -ter Strings to mark the end of a chain\n"
+" 3: (default) TER, ENDMDL, END or different chain ID\n"
+" 2: ENDMDL, END, or different chain ID\n"
+" 1: ENDMDL or END\n"
+" 0: end of file\n"
+"\n"
+" -split Whether to split PDB file into multiple chains\n"
+" 0: (default) treat the whole structure as one single chain\n"
+" 1: treat each MODEL as a separate chain (-ter should be 0)\n"
+" 2: treat each chain as a seperate chain (-ter should be <=1)\n"
+"\n"
+" -h Print the full help message, including additional options.\n"
+"\n"
+    <<endl;
+
+    if (h_opt) print_extra_help();
+
+    exit(EXIT_SUCCESS);
+}
+
+/* Set the lower score bounds used to discard a representative after the
+ * HwRMSD filter (lb_HwRMSD) and after TMalign (lb_TMfast).
+ * The defaults are 0.5*TMcut and 0.9*TMcut. For s_opt<=1 looser bounds are
+ * used, with separate values for RNA (mol_type>0) and protein. */
+void filter_lower_bound(double &lb_HwRMSD, double &lb_TMfast,
+    const double TMcut, const int s_opt,const int mol_type)
+{
+    lb_HwRMSD=0.5*TMcut;
+    lb_TMfast=0.9*TMcut;
+    if (s_opt<=1)
+    {
+        if (mol_type>0) // RNA
+        {
+            lb_HwRMSD=0.02*TMcut;
+            lb_TMfast=0.60*TMcut;
+        }
+        else // protein
+        {
+            lb_HwRMSD=0.25*TMcut;
+            lb_TMfast=0.80*TMcut;
+        }
+    }
+    return;
+}
+
+/* qTMclust entry point.
+ * 1. parse the command line
+ * 2. read every chain, keeping its sequence, secondary structure and xyz
+ * 3. sort the chains by length, longest first
+ * 4. greedy clustering: each chain is compared with the representatives of
+ *    the existing clusters (HwRMSD filter when HwRMSD.h is included, then
+ *    TMalign); it joins the first cluster whose representative passes the
+ *    TM-score test, otherwise it becomes the representative of a new cluster
+ * 5. print one line per cluster: the representative followed by its members */
+int main(int argc, char *argv[])
+{
+    if (argc < 2) print_help();
+
+
+    clock_t t1, t2;
+    t1 = clock();
+
+    /**********************/
+    /* get argument */
+    /**********************/
+    string xname = "";
+    double TMcut = 0.5;
+    string fname_clust = ""; // file name for output cluster result
+    string fname_lign = ""; // file name for user alignment
+    vector<string> sequence; // get value from alignment file
+    // -u and -d are rejected below, so these are never set by the user; they
+    // are still passed by value to the alignment functions and therefore
+    // must not be left uninitialized
+    double Lnorm_ass=0, d0_scale=0;
+
+    bool h_opt = false; // print full help message
+    int i_opt = 0; // 3 for -I, stick to user given alignment
+    int a_opt = 0; // flag for -a, do not normalized by average length
+    int s_opt = 2; // flag for -s, normalized by longer length
+    bool u_opt = false; // flag for -u, normalized by user specified length
+    bool d_opt = false; // flag for -d, user specified d0
+
+    int infmt_opt =-1; // PDB or PDBx/mmCIF format
+    int ter_opt =3; // TER, END, or different chainID
+    int split_opt =0; // do not split chain
+    bool fast_opt =false; // flags for -fast, fTM-align algorithm
+    int het_opt =0; // do not read HETATM residues
+    string atom_opt ="auto";// use C alpha atom for protein and C3' for RNA
+    string mol_opt ="auto";// auto-detect the molecule type as protein/RNA
+    string suffix_opt=""; // set -suffix to empty
+    string dir_opt =""; // set -dir to empty
+    int byresi_opt=0; // set -byresi to 0
+    vector<string> chain_list;
+
+    for(int i = 1; i < argc; i++)
+    {
+        if ( (!strcmp(argv[i],"-u")||!strcmp(argv[i],"-L")) && i < (argc-1) )
+        {
+            PrintErrorAndQuit("Sorry! -u has not been implemented yet");
+            Lnorm_ass = atof(argv[i + 1]); u_opt = true; i++;
+        }
+        else if ( !strcmp(argv[i],"-d") && i < (argc-1) )
+        {
+            PrintErrorAndQuit("Sorry! -d has not been implemented yet");
+            d0_scale = atof(argv[i + 1]); d_opt = true; i++;
+        }
+        else if (!strcmp(argv[i], "-I") && i < (argc-1) )
+        {
+            fname_lign = argv[i + 1]; i_opt = 3; i++;
+        }
+        else if ( !strcmp(argv[i],"-o") && i < (argc-1) )
+        {
+            fname_clust = argv[i + 1]; i++;
+        }
+        else if ( !strcmp(argv[i],"-a") && i < (argc-1))
+        {
+            PrintErrorAndQuit("Sorry! -a is not used for clustering");
+        }
+        else if ( !strcmp(argv[i],"-s") && i < (argc-1) )
+        {
+            s_opt=atoi(argv[i + 1]); i++;
+            if (s_opt<1 || s_opt>6)
+                PrintErrorAndQuit("-s must be within 1 to 6");
+        }
+        else if ( !strcmp(argv[i],"-h") )
+        {
+            h_opt = true;
+        }
+        else if (!strcmp(argv[i], "-fast"))
+        {
+            fast_opt = true;
+        }
+        else if ( !strcmp(argv[i],"-infmt") && i < (argc-1) )
+        {
+            infmt_opt=atoi(argv[i + 1]); i++;
+        }
+        else if ( !strcmp(argv[i],"-ter") && i < (argc-1) )
+        {
+            ter_opt=atoi(argv[i + 1]); i++;
+        }
+        else if ( !strcmp(argv[i],"-split") && i < (argc-1) )
+        {
+            split_opt=atoi(argv[i + 1]); i++;
+        }
+        else if ( !strcmp(argv[i],"-atom") && i < (argc-1) )
+        {
+            atom_opt=argv[i + 1]; i++;
+        }
+        else if ( !strcmp(argv[i],"-mol") && i < (argc-1) )
+        {
+            mol_opt=argv[i + 1]; i++;
+        }
+        else if ( !strcmp(argv[i],"-dir") && i < (argc-1) )
+        {
+            dir_opt=argv[i + 1]; i++;
+        }
+        else if ( !strcmp(argv[i],"-suffix") && i < (argc-1) )
+        {
+            suffix_opt=argv[i + 1]; i++;
+        }
+        else if ( !strcmp(argv[i],"-TMcut") && i < (argc-1) )
+        {
+            TMcut=atof(argv[i + 1]); i++;
+            // '||' rather than the alternative token 'or', which some
+            // compilers only accept in strict conformance mode
+            if (TMcut>1 || TMcut<0.45)
+                PrintErrorAndQuit("TMcut must be in the range of [0.45,1)");
+        }
+        else if ( !strcmp(argv[i],"-byresi") && i < (argc-1) )
+        {
+            PrintErrorAndQuit("Sorry! -byresi has not been implemented yet");
+            byresi_opt=atoi(argv[i + 1]); i++;
+        }
+        else if ( !strcmp(argv[i],"-het") && i < (argc-1) )
+        {
+            het_opt=atoi(argv[i + 1]); i++;
+        }
+        else if (xname.size() == 0) xname=argv[i];
+        else PrintErrorAndQuit(string("ERROR! Undefined option ")+argv[i]);
+    }
+
+    if(xname.size()==0) print_help(h_opt);
+
+    if (suffix_opt.size() && dir_opt.size()==0)
+        PrintErrorAndQuit("-suffix is only valid if -dir, -dir1 or -dir2 is set");
+    if (atom_opt.size()!=4)
+        PrintErrorAndQuit("ERROR! Atom name must have 4 characters, including space.");
+    if (mol_opt!="auto" && mol_opt!="protein" && mol_opt!="RNA")
+        PrintErrorAndQuit("ERROR! Molecule type must be either RNA or protein.");
+    else if (mol_opt=="protein" && atom_opt=="auto")
+        atom_opt=" CA ";
+    else if (mol_opt=="RNA" && atom_opt=="auto")
+        atom_opt=" C3'";
+
+    if (u_opt && Lnorm_ass<=0)
+        PrintErrorAndQuit("Wrong value for option -u! It should be >0");
+    if (d_opt && d0_scale<=0)
+        PrintErrorAndQuit("Wrong value for option -d! It should be >0");
+    if (split_opt==1 && ter_opt!=0)
+        PrintErrorAndQuit("-split 1 should be used with -ter 0");
+    else if (split_opt==2 && ter_opt!=0 && ter_opt!=1)
+        PrintErrorAndQuit("-split 2 should be used with -ter 0 or 1");
+    if (split_opt<0 || split_opt>2)
+        PrintErrorAndQuit("-split can only be 0, 1 or 2");
+
+    /* read initial alignment file from 'align.txt' */
+    if (i_opt) read_user_alignment(sequence, fname_lign, i_opt);
+
+    if (byresi_opt) i_opt=3;
+
+    /* parse file list */
+    if (dir_opt.size()==0) chain_list.push_back(xname);
+    else file2chainlist(chain_list, xname, dir_opt, suffix_opt);
+
+    /* declare previously global variables */
+    vector<vector<string> >PDB_lines; // text of chain
+    vector<int> mol_vec; // molecule type of chain1, RNA if >0
+    vector<string> chainID_list; // list of chainID
+    size_t xchainnum=0; // number of chains in a PDB file
+    size_t i,j; // number of residues/chains in a PDB is
+                // usually quite limited. Yet, the number of
+                // files can be very large. size_t is safer
+                // than int for very long list of files
+    int xlen,ylen; // chain length
+    double **xa,**ya; // xyz coordinate
+    vector<string> resi_vec; // residue index for chain, dummy variable
+    vector<pair<int,size_t> >chainLen_list; // vector of (length,index) pair
+    vector<vector<char> > seq_vec;
+    vector<vector<char> > sec_vec;
+    vector<vector<vector<float> > >xyz_vec;
+
+    /* parse files */
+    string chain_name;
+    vector<char> seq_tmp;
+    vector<char> sec_tmp;
+    vector<float> flt_tmp(3,0);
+    vector<vector<float> >xyz_tmp;
+    int r; // residue index
+    size_t newchainnum;
+    double ub_HwRMSD=0.90*TMcut+0.10;
+    double lb_HwRMSD=0.5*TMcut;
+    double ub_TMfast=0.90*TMcut+0.10;
+    double lb_TMfast=0.9*TMcut;
+    // NOTE(review): s_opt==5 is already caught by the first test, so the
+    // 's_opt==5' in the second test can never be reached
+    if (s_opt==2 || s_opt==4 || s_opt==5) a_opt=-2; // normalized by longer length, i.e. smaller TM
+    else if (s_opt==1 || s_opt==5) a_opt=-1; // normalized by shorter length, i.e. larger TM
+    else if (s_opt==3) a_opt= 1; // normalized by average length
+
+#ifdef TMalign_HwRMSD_h
+    /* These parameters controls HwRMSD filter. iter_opt typically should be
+     * >=3. Many alignments converge within iter_opt=5. Occassionally
+     * some alignments require iter_opt=10. Higher iter_opt takes more time,
+     * even though HwRMSD iter_opt 10 still takes far less time than TMalign
+     * -fast -TMcut 0.5.
+     * After HwRMSD filter, at least min_repr_num and at most max_repr_num
+     * are used for subsequent TMalign. The actual number of representatives
+     * are decided by xlen */
+    const int glocal =0; // global alignment
+    const int iter_opt =10;
+    const int min_repr_num=10;
+    const int max_repr_num=50;
+#endif
+
+    for (i=0;i<chain_list.size();i++)
+    {
+        xname=chain_list[i];
+        newchainnum=get_PDB_lines(xname, PDB_lines, chainID_list,
+            mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt);
+        if (!newchainnum)
+        {
+            cerr<<"Warning! Cannot parse file: "<<xname
+                <<". Chain number 0."<<endl;
+            continue;
+        }
+        chain_name=xname.substr(dir_opt.size(),
+            xname.size()-dir_opt.size()-suffix_opt.size());
+        for (j=0;j<newchainnum;j++)
+        {
+            // PDB_lines is cleared after every file and is indexed by j;
+            // chainID_list and mol_vec grow over all files and are indexed
+            // by j+xchainnum
+            chainID_list[j+xchainnum]=chain_name+chainID_list[j+xchainnum];
+            xlen=PDB_lines[j].size();
+            cout<<"Parsing "<<xname<<'\t'<<chainID_list[j+xchainnum]
+                <<" ("<<xlen<<" residues)."<<endl;
+            if (mol_opt=="RNA") mol_vec[j+xchainnum]=1;
+            else if (mol_opt=="protein") mol_vec[j+xchainnum]=-1;
+
+            NewArray(&xa, xlen, 3);
+            seq_tmp.assign(xlen+1,'A');
+            sec_tmp.assign(xlen+1,0);
+
+            read_PDB(PDB_lines[j], xa, &seq_tmp[0], resi_vec, byresi_opt);
+
+            // was mol_vec[j], which reads the molecule type of a chain from
+            // an earlier file as soon as more than one file is parsed
+            if (mol_vec[j+xchainnum]<=0) make_sec(xa, xlen, &sec_tmp[0]);
+            else make_sec(&seq_tmp[0],xa,xlen,&sec_tmp[0],atom_opt);
+
+            xyz_tmp.assign(xlen,flt_tmp);
+            for (r=0;r<xlen;r++)
+            {
+                xyz_tmp[r][0]=xa[r][0];
+                xyz_tmp[r][1]=xa[r][1];
+                xyz_tmp[r][2]=xa[r][2];
+            }
+
+            seq_vec.push_back(seq_tmp);
+            sec_vec.push_back(sec_tmp);
+            xyz_vec.push_back(xyz_tmp);
+
+            chainLen_list.push_back(
+                make_pair(PDB_lines[j].size(),j+xchainnum));
+
+            seq_tmp.clear();
+            sec_tmp.clear();
+            xyz_tmp.clear();
+            DeleteArray(&xa, xlen);
+            PDB_lines[j].clear();
+        }
+        PDB_lines.clear();
+        xchainnum+=newchainnum;
+    }
+    flt_tmp.clear();
+    chain_list.clear();
+
+    // swap completely destroy the vector and free up the memory capacity
+    vector<vector<string> >().swap(PDB_lines);
+    size_t Nstruct=chainLen_list.size();
+
+    // chainLen_list[0] and chainLen_list.back() are read below
+    if (Nstruct==0)
+        PrintErrorAndQuit("ERROR! No chain could be parsed from the input.");
+
+    /* sort by chain length */
+    stable_sort(chainLen_list.begin(),chainLen_list.end(),
+        greater<pair<int,int> >());
+    cout<<"Clustering "<<chainLen_list.size()
+        <<" chains with TM-score cutoff >="<<TMcut<<'\n'
+        <<"Longest chain "<<chainID_list[chainLen_list[0].second]<<'\t'
+        <<chainLen_list[0].first<<" residues.\n"
+        <<"Shortest chain "<<chainID_list[chainLen_list.back().second]<<'\t'
+        <<chainLen_list.back().first<<" residues."<<endl;
+
+    /* set the first cluster */
+    vector<size_t> clust_mem_vec(Nstruct,-1); // cluster membership
+    vector<size_t> clust_repr_vec; // the same as number of clusters
+    size_t chain_i=chainLen_list[0].second;
+    clust_repr_vec.push_back(chain_i);
+    clust_mem_vec[chain_i]=0;
+    map<size_t,size_t> clust_repr_map; // representative chain => cluster index
+    clust_repr_map[chain_i]=0; // previously relied on operator[] defaulting to 0
+
+    /* perform alignment */
+    size_t chain_j;
+    const double fast_lb=50.; // proteins shorter than fast_lb never use -fast
+    const double fast_ub=1000.;// proteins longer than fast_ub always use -fast
+    double Lave; // average protein length for chain_i and chain_j
+    size_t sizePROT; // number of representatives for current chain
+    vector<size_t> index_vec; // index of cluster representatives for the chain
+    bool found_clust; // whether current chain hit previous cluster
+
+    for (i=1;i<Nstruct;i++)
+    {
+        chain_i=chainLen_list[i].second;
+        xlen=xyz_vec[chain_i].size();
+        if (xlen<=5) // TMalign cannot handle L<=5
+        {
+            // the chain becomes its own cluster. clust_repr_vec stores chain
+            // indices, so chain_i has to be pushed; the cluster index was
+            // pushed before, which made the output name a wrong chain as
+            // representative
+            clust_mem_vec[chain_i]=clust_repr_vec.size();
+            clust_repr_map[chain_i]=clust_repr_vec.size();
+            clust_repr_vec.push_back(chain_i);
+            continue;
+        }
+
+        NewArray(&xa, xlen, 3);
+        for (r=0;r<xlen;r++)
+        {
+            xa[r][0]=xyz_vec[chain_i][r][0];
+            xa[r][1]=xyz_vec[chain_i][r][1];
+            xa[r][2]=xyz_vec[chain_i][r][2];
+        }
+
+        // j-1 is index of old cluster. here, we starts from the latest
+        // cluster because proteins with similar length are more likely
+        // to be similar. we cannot use j as index because size_t j cannot
+        // be negative at the end of this loop
+        for (j=clust_repr_vec.size();j>0;j--)
+        {
+            chain_j=clust_repr_vec[j-1];
+            ylen=xyz_vec[chain_j].size();
+            // skip representatives of a different molecule type, or whose
+            // length ratio to this chain already fails the test for s_opt
+            if (mol_vec[chain_i]*mol_vec[chain_j]<0) continue;
+            else if (s_opt==2 && xlen<TMcut*ylen) continue;
+            else if (s_opt==3 && xlen<(2*TMcut-1)*ylen) continue;
+            else if (s_opt==4 && xlen*(2/TMcut-1)<ylen) continue;
+            else if (s_opt==5 && xlen<TMcut*TMcut*ylen) continue;
+            else if (s_opt==6 && xlen*xlen<(2*TMcut*TMcut-1)*ylen*ylen) continue;
+            index_vec.push_back(chain_j);
+        }
+        sizePROT=index_vec.size();
+
+        cout<<'>'<<chainID_list[chain_i]<<'\t'<<xlen<<'\t'
+            <<setiosflags(ios::fixed)<<setprecision(2)
+            <<100.*i/Nstruct<<"%(#"<<i<<")\t"
+            <<"#repr="<<sizePROT<<"/"<<clust_repr_vec.size()<<endl;
+
+#ifdef TMalign_HwRMSD_h
+        /* HwRMSD filter: score every candidate representative, then keep
+         * the best scoring ones for TMalign */
+        vector<pair<double,size_t> > HwRMSDscore_list;
+        double TM;
+        for (j=0;j<sizePROT;j++)
+        {
+            chain_j=index_vec[j];
+            ylen=xyz_vec[chain_j].size();
+            if (mol_vec[chain_i]*mol_vec[chain_j]<0) continue;
+            else if (s_opt==2 && xlen<TMcut*ylen) continue;
+            else if (s_opt==3 && xlen<(2*TMcut-1)*ylen) continue;
+            else if (s_opt==4 && xlen*(2/TMcut-1)<ylen) continue;
+            else if (s_opt==5 && xlen<TMcut*TMcut*ylen) continue;
+            else if (s_opt==6 && xlen*xlen<(2*TMcut*TMcut-1)*ylen*ylen) continue;
+
+            if (s_opt<=1) filter_lower_bound(lb_HwRMSD, lb_TMfast,
+                TMcut, s_opt, mol_vec[chain_i]+mol_vec[chain_j]);
+
+            NewArray(&ya, ylen, 3);
+            for (r=0;r<ylen;r++)
+            {
+                ya[r][0]=xyz_vec[chain_j][r][0];
+                ya[r][1]=xyz_vec[chain_j][r][1];
+                ya[r][2]=xyz_vec[chain_j][r][2];
+            }
+
+            /* declare variable specific to this pair of HwRMSD */
+            double t0[3], u0[3][3];
+            double TM1, TM2;
+            double TM3, TM4, TM5; // for s_opt, u_opt, d_opt
+            double d0_0, TM_0;
+            double d0A, d0B, d0u, d0a;
+            double d0_out=5.0;
+            string seqM, seqxA, seqyA;// for output alignment
+            double rmsd0 = 0.0;
+            int L_ali; // Aligned length in standard_TMscore
+            double Liden=0;
+            double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore
+            int n_ali=0;
+            int n_ali8=0;
+            int *invmap = new int[ylen+1];
+
+            /* entry function for structure alignment */
+            HwRMSD_main(
+                xa, ya, &seq_vec[chain_i][0], &seq_vec[chain_j][0],
+                &sec_vec[chain_i][0], &sec_vec[chain_j][0], t0, u0,
+                TM1, TM2, TM3, TM4, TM5,
+                d0_0, TM_0, d0A, d0B, d0u,
+                d0a, d0_out, seqM, seqxA, seqyA,
+                rmsd0, L_ali, Liden, TM_ali,
+                rmsd_ali, n_ali, n_ali8, xlen, ylen,
+                sequence, Lnorm_ass,
+                d0_scale, i_opt,
+                a_opt, u_opt, d_opt, mol_vec[chain_i]+mol_vec[chain_j],
+                invmap, glocal, iter_opt);
+
+            TM=TM3; // average length
+            if (s_opt==1) TM=TM2; // shorter length
+            else if (s_opt==2) TM=TM1; // longer length
+            else if (s_opt==3) TM=(TM1+TM2)/2; // average TM
+            else if (s_opt==4) TM=2/(1/TM1+1/TM2); // harmonic average
+            else if (s_opt==5) TM=sqrt(TM1*TM2); // geometric average
+            else if (s_opt==6) TM=sqrt((TM1*TM1+TM2*TM2)/2); // root mean square
+
+            Lave=sqrt(xlen*ylen); // geometry average because O(L1*L2)
+            if (TM>=lb_HwRMSD || Lave<=fast_lb)
+                HwRMSDscore_list.push_back(make_pair(TM,index_vec[j]));
+
+            /* clean up after each HwRMSD */
+            seqM.clear();
+            seqxA.clear();
+            seqyA.clear();
+            DeleteArray(&ya, ylen);
+            delete [] invmap;
+
+            /* if a good hit is guaranteed to be found, stop the loop */
+            if (TM>=ub_HwRMSD) break;
+        }
+
+        stable_sort(HwRMSDscore_list.begin(),HwRMSDscore_list.end(),
+            greater<pair<double,size_t> >());
+
+        // number of representatives kept: max_repr_num for short chains,
+        // interpolated down to min_repr_num as xlen approaches fast_ub
+        int cur_repr_num_cutoff=min_repr_num;
+        if (xlen<=fast_lb) cur_repr_num_cutoff=max_repr_num;
+        else if (xlen>fast_lb && xlen<fast_ub) cur_repr_num_cutoff+=
+            (fast_ub-xlen)/(fast_ub-fast_lb)*(max_repr_num-min_repr_num);
+
+        index_vec.clear();
+        for (j=0;j<HwRMSDscore_list.size();j++)
+        {
+            TM=HwRMSDscore_list[j].first;
+            chain_j=HwRMSDscore_list[j].second;
+            ylen=xyz_vec[chain_j].size();
+            Lave=sqrt(xlen*ylen); // geometry average because O(L1*L2)
+            if (Lave>fast_lb && TM<TMcut*0.5 &&
+                index_vec.size()>=cur_repr_num_cutoff) break;
+            index_vec.push_back(chain_j);
+            cout<<"#"<<chain_j<<"\t"<<chainID_list[chain_j]<<"\t"
+                <<setiosflags(ios::fixed)<<setprecision(4)<<TM<<endl;
+        }
+        cout<<index_vec.size()<<" out of "
+            <<HwRMSDscore_list.size()<<" entries"<<endl;
+        HwRMSDscore_list.clear();
+#endif
+
+        found_clust=false;
+        for (j=0;j<index_vec.size();j++)
+        {
+            chain_j=index_vec[j];
+            ylen=xyz_vec[chain_j].size();
+            if (mol_vec[chain_i]*mol_vec[chain_j]<0) continue;
+            else if (s_opt==2 && xlen<TMcut*ylen) continue;
+            else if (s_opt==3 && xlen<(2*TMcut-1)*ylen) continue;
+            else if (s_opt==4 && xlen*(2/TMcut-1)<ylen) continue;
+            else if (s_opt==5 && xlen<TMcut*TMcut*ylen) continue;
+            else if (s_opt==6 && xlen*xlen<(2*TMcut*TMcut-1)*ylen*ylen) continue;
+            if (s_opt<=1) filter_lower_bound(lb_HwRMSD, lb_TMfast,
+                TMcut, s_opt, mol_vec[chain_i]+mol_vec[chain_j]);
+
+            NewArray(&ya, ylen, 3);
+            for (r=0;r<ylen;r++)
+            {
+                ya[r][0]=xyz_vec[chain_j][r][0];
+                ya[r][1]=xyz_vec[chain_j][r][1];
+                ya[r][2]=xyz_vec[chain_j][r][2];
+            }
+
+            Lave=sqrt(xlen*ylen); // geometry average because O(L1*L2)
+            bool overwrite_fast_opt=(fast_opt==true || Lave>=fast_ub);
+
+            /* declare variable specific to this pair of TMalign */
+            double t0[3], u0[3][3];
+            double TM1, TM2;
+            double TM3, TM4, TM5; // for s_opt, u_opt, d_opt
+            double d0_0, TM_0;
+            double d0A, d0B, d0u, d0a;
+            double d0_out=5.0;
+            string seqM, seqxA, seqyA;// for output alignment
+            double rmsd0 = 0.0;
+            int L_ali; // Aligned length in standard_TMscore
+            double Liden=0;
+            double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore
+            int n_ali=0;
+            int n_ali8=0;
+
+            /* entry function for structure alignment */
+            int status=TMalign_main(
+                xa, ya, &seq_vec[chain_i][0], &seq_vec[chain_j][0],
+                &sec_vec[chain_i][0], &sec_vec[chain_j][0],
+                t0, u0, TM1, TM2, TM3, TM4, TM5,
+                d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out,
+                seqM, seqxA, seqyA,
+                rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8,
+                xlen, ylen, sequence, Lnorm_ass, d0_scale,
+                i_opt, a_opt, u_opt, d_opt, overwrite_fast_opt,
+                mol_vec[chain_i]+mol_vec[chain_j],TMcut);
+
+            cout<<status<<'\t'<<chainID_list[chain_j]<<'\t'
+                <<setiosflags(ios::fixed)<<setprecision(4)
+                <<TM2<<'\t'<<TM1<<'\t'<<overwrite_fast_opt<<endl;
+
+            seqM.clear();
+            seqxA.clear();
+            seqyA.clear();
+
+            double TM=TM3; // average length
+            if (s_opt==1) TM=TM2; // shorter length
+            else if (s_opt==2) TM=TM1; // longer length
+            else if (s_opt==3) TM=(TM1+TM2)/2; // average TM
+            else if (s_opt==4) TM=2/(1/TM1+1/TM2); // harmonic average
+            else if (s_opt==5) TM=sqrt(TM1*TM2); // geometric average
+            else if (s_opt==6) TM=sqrt((TM1*TM1+TM2*TM2)/2); // root mean square
+
+            // clear miss: try the next representative
+            if (TM<lb_TMfast ||
+                (TM<TMcut && (fast_opt || overwrite_fast_opt==false)))
+            {
+                DeleteArray(&ya, ylen);
+                continue;
+            }
+
+            // clear hit: join the cluster of this representative
+            if (TM>=ub_TMfast ||
+                (TM>=TMcut && (fast_opt || overwrite_fast_opt==false)))
+            {
+                clust_mem_vec[chain_i]=clust_repr_map[chain_j];
+                DeleteArray(&ya, ylen);
+                found_clust=true;
+                break;
+            }
+
+            // NOTE(review): TM<lb_TMfast already took the 'continue' above,
+            // so this condition is always false here and the re-alignment
+            // without -fast never runs - confirm the intended condition
+            if (TM<lb_TMfast && overwrite_fast_opt==false)
+            {
+                TMalign_main(
+                    xa, ya, &seq_vec[chain_i][0], &seq_vec[chain_j][0],
+                    &sec_vec[chain_i][0], &sec_vec[chain_j][0],
+                    t0, u0, TM1, TM2, TM3, TM4, TM5,
+                    d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out,
+                    seqM, seqxA, seqyA,
+                    rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8,
+                    xlen, ylen, sequence, Lnorm_ass, d0_scale,
+                    i_opt, a_opt, u_opt, d_opt, false,
+                    mol_vec[chain_i]+mol_vec[chain_j],TMcut);
+                seqM.clear();
+                seqxA.clear();
+                seqyA.clear();
+                DeleteArray(&ya, ylen);
+
+                TM=TM3; // average length
+                if (s_opt==1) TM=TM2; // shorter length
+                else if (s_opt==2) TM=TM1; // longer length
+                else if (s_opt==3) TM=(TM1+TM2)/2; // average TM
+                else if (s_opt==4) TM=2/(1/TM1+1/TM2); // harmonic average
+                else if (s_opt==5) TM=sqrt(TM1*TM2); // geometric average
+                else if (s_opt==6) TM=sqrt((TM1*TM1+TM2*TM2)/2); // root mean square
+                cout<<"*\t"<<chainID_list[chain_j]<<'\t'<<TM2<<'\t'<<TM1<<endl;
+                if (TM>=TMcut)
+                {
+                    clust_mem_vec[chain_i]=clust_repr_map[chain_j];
+                    found_clust=true;
+                    break;
+                }
+            }
+            else DeleteArray(&ya, ylen); // ya was leaked on this path before
+        }
+        DeleteArray(&xa, xlen);
+        index_vec.clear();
+
+        if (!found_clust) // new cluster
+        {
+            clust_mem_vec[chain_i]=clust_repr_vec.size();
+            clust_repr_map[chain_i]=clust_repr_vec.size();
+            clust_repr_vec.push_back(chain_i);
+        }
+        else // member structures are not used further
+        {
+            vector<char> ().swap(seq_vec[chain_i]);
+            vector<char> ().swap(sec_vec[chain_i]);
+            vector<vector<float> > ().swap(xyz_vec[chain_i]);
+        }
+    }
+
+    /* clean up */
+    mol_vec.clear();
+    xyz_vec.clear();
+    seq_vec.clear();
+    sec_vec.clear();
+
+    /* print out cluster */
+    stringstream txt;
+    for (j=0;j<clust_repr_vec.size();j++)
+    {
+        chain_j=clust_repr_vec[j]; // cluster representative
+        txt<<chainID_list[chain_j];
+        for (chain_i=0;chain_i<clust_mem_vec.size();chain_i++)
+        {
+            if (chain_i!=chain_j && clust_mem_vec[chain_i]==j)
+                txt<<'\t'<<chainID_list[chain_i];
+        }
+        txt<<'\n';
+    }
+    if (fname_clust.size() && fname_clust!="-")
+    {
+        ofstream fp(fname_clust.c_str());
+        fp<<txt.str();
+        fp.close();
+    }
+    else cout<<txt.str()<<endl;
+
+    /* clean up */
+    txt.str(string());
+    clust_repr_vec.clear();
+    clust_mem_vec.clear();
+    chainID_list.clear();
+    clust_repr_map.clear();
+
+    t2 = clock();
+    float diff = ((float)t2 - (float)t1)/CLOCKS_PER_SEC;
+    printf("#Total CPU time is %5.2f seconds\n", diff);
+    return 0;
+}
diff --git a/modules/bindings/src/tmalign/readme.txt b/modules/bindings/src/USalign/readme.txt
similarity index 66%
rename from modules/bindings/src/tmalign/readme.txt
rename to modules/bindings/src/USalign/readme.txt
index 3249215e8..2a0330252 100644
--- a/modules/bindings/src/tmalign/readme.txt
+++ b/modules/bindings/src/USalign/readme.txt
@@ -1,15 +1,11 @@
 ==============================================================================
- TM-align: protein and RNA structure alignment by TM-score superposition.
- - This program was written by (in reverse chronological order) - Chengxin Zhang, Sha Gong, Jianjie Wu, and Jianyi Yang - at Yang Zhang lab, Department of Computational Medicine and Bioinformatics, - University of Michigan, 100 Washtenaw Ave, Ann Arbor, MI 48109-2218. - Please report issues to yangzhanglab@umich.edu + US-align: universal structure alignment of monomeric and complex proteins + and nucleic acids References to cite: - S Gong, C Zhang, Y Zhang. Bioinformatics, btz282 (2019) - Y Zhang, J Skolnick. Nucl Acids Res 33, 2302-9 (2005) + (1) Chengxin Zhang, Morgan Shine, Anna Marie Pyle, Yang Zhang + (2022) Nat Methods + (2) Chengxin Zhang, Anna Marie Pyle (2022) iScience DISCLAIMER: Permission to use, copy, modify, and distribute this program for @@ -61,38 +57,53 @@ 2021/01/07: Fixed bug in TMscore -c 2021/05/29: Remove unnecessary depedency on malloc.h, which prevent compilation on Mac OS + 2021/08/17: Complete implementation of MMalign + 2021/10/03: Support Windows + 2022/02/27: Add -seq (-byresi 4 & 5) for TM-score superimposition guided by + sequence alignment. + 2022/04/12: Support AlphaFold CIF + 2022/05/11: Update -mm 4 output format + 2022/05/24: Limited support for sequence order independent alignment + 2022/05/30: Correct atom pair output for -mm 5 + 2022/06/07: Sequence order semi-independent alignment + 2022/06/20: Sequentiality within SSE in sequence order semi-independent + alignment + 2022/06/22: Fix infinite loop for mal-formatted PDB + 2022/06/23: Fix -m for Windows. Add pymol plugin. 
+ 2022/06/26: Add -full option for -mm 2 and 4 + 2022/09/24: Support -TMscore for complex when the chain order is different =============================================================================== ========================= - How to install TM-align + How to install US-align ========================= To compile the program in your Linux computer, simply enter - make + make or - g++ -static -O3 -ffast-math -lm -o TMalign TMalign.cpp + g++ -static -O3 -ffast-math -lm -o USalign USalign.cpp The '-static' flag should be removed on Mac OS, which does not support building static executables. +USalign compiled on Linux, Mac OS and Linux Subsystem for Windows (WSL2) on +Windows 10 onwards can read both uncompressed files and gz compressed +files, provided that the "gunzip" command is available. On the other hand, due +to the lack of POSIX support on Windows, US-align natively compiled on Windows +without WSL2 cannot parse gz compressed files. + +US-align is known to be compilable by g++ version 4.8.5 or later, clang++ +version 12.0.5 or later and mingw-w64 version 9.3 or later. + ===================== - How to use TM-align + How to use US-align ===================== You can run the program without arguments to obtain a brief instruction - ./TMalign structure1.pdb structure2.pdb - -=================== - Fortran version -=================== -You can download the fortran version of TM-align from -https://zhanglab.ccmb.med.umich.edu/TM-align/ + ./USalign structure1.pdb structure2.pdb -This C++ version of TM-align implemented several features not available in the -fortran version, including RNA alignment and batch alignment of multiple -structures. 
A full list of available options can be explored by: - ./TMalign -h +A full list of available options can be explored by: -2021/05/20 + ./USalign -h diff --git a/modules/bindings/src/tmalign/se.cpp b/modules/bindings/src/USalign/se.cpp similarity index 94% rename from modules/bindings/src/tmalign/se.cpp rename to modules/bindings/src/USalign/se.cpp index c4d760681..af24ae78c 100644 --- a/modules/bindings/src/tmalign/se.cpp +++ b/modules/bindings/src/USalign/se.cpp @@ -48,12 +48,17 @@ void print_extra_help() " 2: tabular format very compact output\n" "\n" " -byresi Whether to align two structures by residue index.\n" +" The same as -TMscore.\n" " 0: (default) do not align by residue index\n" " 1: (same as TMscore program) align by residue index\n" " 2: (same as TMscore -c, should be used with -ter <=1)\n" " align by residue index and chain ID\n" " 3: (similar to TMscore -c, should be used with -ter <=1)\n" " align by residue index and order of chain\n" +" 4: sequence dependent alignment: perform Needleman-Wunsch\n" +" global sequence alignment\n" +" 5: sequence dependent alignment: perform glocal sequence\n" +" alignment\n" "\n" " -het Whether to align residues marked as 'HETATM' in addition to 'ATOM '\n" " 0: (default) only align 'ATOM ' residues\n" @@ -208,7 +213,8 @@ int main(int argc, char *argv[]) { outfmt_opt=atoi(argv[i + 1]); i++; } - else if ( !strcmp(argv[i],"-byresi") && i < (argc-1) ) + else if ( (!strcmp(argv[i],"-byresi") || !strcmp(argv[i],"-TMscore") || + !strcmp(argv[i],"-tmscore") ) && i < (argc-1) ) { byresi_opt=atoi(argv[i + 1]); i++; } @@ -255,10 +261,10 @@ int main(int argc, char *argv[]) { if (i_opt) PrintErrorAndQuit("-byresi >=1 cannot be used with -i or -I"); - if (byresi_opt<0 || byresi_opt>3) - PrintErrorAndQuit("-byresi can only be 0, 1, 2 or 3"); - if (byresi_opt>=2 && ter_opt>=2) - PrintErrorAndQuit("-byresi >=2 should be used with -ter <=1"); + if (byresi_opt<0 || byresi_opt>5) + PrintErrorAndQuit("-byresi can only be 0, 1, 2, 3, 4, 
or 5"); + if (byresi_opt>=2 && byresi_opt<=3 && ter_opt>=2) + PrintErrorAndQuit("-byresi 2 and -byresi 3 should be used with -ter <=1"); } if (split_opt==1 && ter_opt!=0) PrintErrorAndQuit("-split 1 should be used with -ter 0"); @@ -398,7 +404,7 @@ int main(int argc, char *argv[]) outfmt_opt, invmap); if (outfmt_opt>=2) - get_seqID(invmap, seqx, seqy, ylen, Liden, n_ali8); + get_seqID(invmap, seqx, seqy, ylen, Liden, n_ali); /* print result */ output_results( diff --git a/modules/bindings/src/tmalign/se.h b/modules/bindings/src/USalign/se.h similarity index 73% rename from modules/bindings/src/tmalign/se.h rename to modules/bindings/src/USalign/se.h index 6ccc84132..27eb3b48c 100644 --- a/modules/bindings/src/tmalign/se.h +++ b/modules/bindings/src/USalign/se.h @@ -1,7 +1,10 @@ #include "TMalign.h" /* entry function for se - * outfmt_opt>=2 should not parse sequence alignment */ + * outfmt_opt>=2 should not parse sequence alignment + * u_opt corresponds to option -L + * if u_opt==2, use d0 from Lnorm_ass for alignment + * if hinge>0, append to original invmap */ int se_main( double **xa, double **ya, const char *seqx, const char *seqy, double &TM1, double &TM2, double &TM3, double &TM4, double &TM5, @@ -12,8 +15,8 @@ int se_main( double &TM_ali, double &rmsd_ali, int &n_ali, int &n_ali8, const int xlen, const int ylen, const vector<string> &sequence, const double Lnorm_ass, const double d0_scale, const bool i_opt, - const bool a_opt, const bool u_opt, const bool d_opt, const int mol_type, - const int outfmt_opt, int *invmap) + const bool a_opt, const int u_opt, const bool d_opt, const int mol_type, + const int outfmt_opt, int *invmap, const int hinge=0) { double D0_MIN; //for d0 double Lnorm; //normalization length @@ -37,7 +40,21 @@ int se_main( NewArray(&score, xlen+1, ylen+1); NewArray(&path, xlen+1, ylen+1); NewArray(&val, xlen+1, ylen+1); - //int *invmap = new int[ylen+1]; + int *invmap0 = new int[ylen+1]; + int i,j; + if (hinge==0) for (j=0;j<=ylen;j++) 
invmap0[j]=-1; + else for (j=0;j<ylen;j++) invmap0[j]=invmap[j]; + vector<char> seqM_char; + if (hinge) + { + seqM_char.assign(ylen,hinge+'0'); + j=-1; + for (int r=0;r<seqM.size();r++) + { + j+=seqyA[r]!='-'; + if (seqM[r]!=' ') seqM_char[j]=seqM[r]; + } + } /* set d0 */ parameter_set4search(xlen, ylen, D0_MIN, Lnorm, @@ -50,12 +67,19 @@ int se_main( parameter_set4final((xlen+ylen)*0.5, D0_MIN, Lnorm, d0a, d0_search, mol_type); // set d0a if (u_opt) + { parameter_set4final(Lnorm_ass, D0_MIN, Lnorm, d0u, d0_search, mol_type); // set d0u + if (u_opt==2) + { + parameter_set4search(Lnorm_ass, Lnorm_ass, D0_MIN, Lnorm, + score_d8, d0, d0_search, dcu0); // set score_d8 + } + } /* perform alignment */ - for(int j=0; j<ylen; j++) invmap[j]=-1; - if (!i_opt) NWDP_SE(path, val, xa, ya, xlen, ylen, d0*d0, 0, invmap); + if (hinge==0) for(j=0; j<ylen; j++) invmap[j]=-1; + if (!i_opt) NWDP_SE(path, val, xa, ya, xlen, ylen, d0*d0, 0, invmap, hinge); else { int i1 = -1;// in C version, index starts from zero, not from one @@ -74,8 +98,17 @@ int se_main( } } } - - rmsd0=TM1=TM2=TM3=TM4=TM5=0; + + if (hinge==0) rmsd0=TM1=TM2=TM3=TM4=TM5=0; + else + { + TM2*=xlen; + TM1*=ylen; + TM3*=(xlen+ylen)*0.5; + TM4*=Lnorm_ass; + TM5*=ylen; + rmsd0=rmsd0*rmsd0*n_ali8; + } int k=0; n_ali=0; n_ali8=0; @@ -86,7 +119,7 @@ int se_main( { n_ali++; d=sqrt(dist(&xa[i][0], &ya[j][0])); - if (d <= score_d8 || i_opt) + if (d <= score_d8 || i_opt || invmap0[j]==i) { if (outfmt_opt<2) { @@ -94,6 +127,7 @@ int se_main( m2[k]=j; } k++; + if (invmap0[j]==i) continue; TM2+=1/(1+(d/d0B)*(d/d0B)); // chain_1 TM1+=1/(1+(d/d0A)*(d/d0A)); // chain_2 if (a_opt) TM3+=1/(1+(d/d0a)*(d/d0a)); // -a @@ -101,6 +135,7 @@ int se_main( if (d_opt) TM5+=1/(1+(d/d0_scale)*(d/d0_scale)); // -d rmsd0+=d*d; } + else if (hinge) invmap[j]=-1; } } n_ali8=k; @@ -113,6 +148,8 @@ int se_main( if (outfmt_opt>=2) { + if (hinge) seqM_char.clear(); + delete []invmap0; DeleteArray(&score, xlen+1); DeleteArray(&path, xlen+1); 
DeleteArray(&val, xlen+1); @@ -179,9 +216,18 @@ int se_main( seqxA=seqxA.substr(0,kk); seqyA=seqyA.substr(0,kk); seqM =seqM.substr(0,kk); + if (hinge) + { + j=-1; + for (int r=0;r<seqM.size();r++) + { + j+=seqyA[r]!='-'; + if (seqM[r]!=' ') seqM[r]=seqM_char[j]; + } + } /* free memory */ - //delete [] invmap; + delete [] invmap0; delete [] m1; delete [] m2; DeleteArray(&score, xlen+1); diff --git a/modules/bindings/src/USalign/usalign.py b/modules/bindings/src/USalign/usalign.py new file mode 100644 index 000000000..fc9ddd3df --- /dev/null +++ b/modules/bindings/src/USalign/usalign.py @@ -0,0 +1,132 @@ +#!/usr/bin/env pymol +''' +PyMOL plugin for US-align + +USAGE: + + usalign mobile, fix [,args [,exe]] + +INSTALLATION + + Install this script as a PyMOL plugin by + "Plugin" - "Plugin Manager" - "Install New Plugin" + + This plugin depends on the binary executable of US-align, which must be + available within a directory specified by PATH. You can get the PATH + value within PyMOL by the following command: + + print(os.getenv('PATH')) +''' +#This script is partly based on tmalign plugin by Thomas Holder available at +#https://github.com/Pymol-Scripts/Pymol-script-repo/blob/master/tmalign.py + +from __future__ import print_function + +__author__ = 'Chengxin Zhang' +__version__ = '20220924' +__license__ = 'BSD-2-Clause' + +from pymol import cmd, CmdException +import subprocess +import tempfile +import os +import platform + +def get_usalign_path(exe="USalign"): + if platform.system().lower().startswith("win"): + exe+=".exe" + filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),exe) + if os.path.isfile(filename): + return filename + else: + for p in os.getenv("PATH").split(os.pathsep): + filename=os.path.join(p,exe) + if os.path.isfile(filename): + return filename + print("ERROR! 
Cannot locate %s at %s or at %s"%(exe, + os.path.dirname(os.path.abspath(__file__)),os.getenv("PATH"))) + print("Please put the USalign executable at one of the aforementioned paths") + return exe + +def usalign(mobile, target, args='', exe='', transform=1): + ''' +USAGE + + usalign mobile, target [, args [, exe ]] + +ARGUMENTS + + mobile, target = string: atom selections + + args = string: Extra arguments such as -mm and -byresi + + exe = string: Path to USalign executable {default: USalign} + +CITATION + + Zhang C, Shine M, Pyle AM, Zhang Y. bioRxiv 2022.04.18.488565. + https://github.com/pylelab/USalign + ''' + + mobile_filename = tempfile.mktemp('.pdb', 'mobile') + target_filename = tempfile.mktemp('.pdb', 'target') + mobile_ca_sele = '(%s) and (not hetatm) and alt +A' % (mobile) + target_ca_sele = '(%s) and (not hetatm) and alt +A' % (target) + if not "-atom" in args: + mobile_ca_sele+=" and (name CA or name C3')" + target_ca_sele+=" and (name CA or name C3')" + + cmd.save(mobile_filename, mobile_ca_sele) + cmd.save(target_filename, target_ca_sele) + + if len(exe)==0: + exe=get_usalign_path("USalign") + if args=='""': + args='' + if len(args)>2 and args[0]=='"' and args[-1]=='"': + args=args[1:-1] + if not "-outfmt" in args: + args+=" -outfmt -1" + args = ' '.join([exe, mobile_filename, target_filename, args, '-m -']) + print(args) + + try: + process = subprocess.Popen(args, stdout=subprocess.PIPE, shell=True, + universal_newlines=True) + lines = process.stdout.readlines() + except OSError: + print('Cannot execute "%s", please provide full path to USalign executable' % (args)) + raise CmdException + finally: + os.remove(mobile_filename) + os.remove(target_filename) + + rowcount = 0 + matrix = [] + for line in iter(lines): + print(line.rstrip()) + if line.strip().startswith('------ The rotation matrix to rotate '): + rowcount = 1 + elif 4 >= rowcount and rowcount> 0: + if rowcount >= 2: + a = list(map(float, line.split())) + matrix.extend(a[2:5]) + 
matrix.append(a[1]) + rowcount += 1 + + assert len(matrix) == 3 * 4 + matrix.extend([0, 0, 0, 1]) + + if int(transform): + cmd.transform_selection('byobject (%s)' % (mobile), matrix, homogenous=1) + return + +# pymol commands +cmd.extend('usalign', usalign) +cmd.extend('USalign', usalign) + +# autocompletion +cmd.auto_arg[0].update({ 'usalign': cmd.auto_arg[0]['align'], }) +cmd.auto_arg[1].update({ 'usalign': cmd.auto_arg[1]['align'], }) +cmd.auto_arg[0].update({ 'USalign': cmd.auto_arg[0]['align'], }) +cmd.auto_arg[1].update({ 'USalign': cmd.auto_arg[1]['align'], }) diff --git a/modules/bindings/src/tmalign/xyz_sfetch.cpp b/modules/bindings/src/USalign/xyz_sfetch.cpp similarity index 83% rename from modules/bindings/src/tmalign/xyz_sfetch.cpp rename to modules/bindings/src/USalign/xyz_sfetch.cpp index 5d413d5c5..4cf057605 100644 --- a/modules/bindings/src/tmalign/xyz_sfetch.cpp +++ b/modules/bindings/src/USalign/xyz_sfetch.cpp @@ -84,15 +84,27 @@ int main(int argc, char *argv[]) /* read entry list */ vector<string> chain_list; - ifstream fp(list_opt.c_str()); - while (fp.good()) + ifstream fp; + if (list_opt=="-") { - getline(fp, line); - for (i=0;i<line.size();i++) - if (line[i]==' '||line[i]=='\t') break; - if (line.size() && i) chain_list.push_back(line.substr(0,i)); + while (cin.good()) + { + getline(cin, line); + for (i=0;i<line.size();i++) if (line[i]==' '||line[i]=='\t') break; + if (line.size() && i) chain_list.push_back(line.substr(0,i)); + } + } + else + { + fp.open(list_opt.c_str(),ios::in); + while (fp.good()) + { + getline(fp, line); + for (i=0;i<line.size();i++) if (line[i]==' '||line[i]=='\t') break; + if (line.size() && i) chain_list.push_back(line.substr(0,i)); + } + fp.close(); } - fp.close(); /* read xyz index */ /* In xyz file, each line has 28 chacters plus an additional '\n'. 
In PDB @@ -128,6 +140,6 @@ int main(int argc, char *argv[]) delete[]buf; filename.clear(); list_opt.clear(); - chain_list.clear(); + vector<string>().swap(chain_list); return 0; } diff --git a/modules/bindings/src/tmalign/.gitignore b/modules/bindings/src/tmalign/.gitignore deleted file mode 100644 index 4dbbc7f99..000000000 --- a/modules/bindings/src/tmalign/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -# compiled python code -*.pyc - -# vim temporary backup -.*.sw* - -# binary executables -TMalign -TMalignc -pdb2xyz -pdb2fasta -pdb2ss -xyz_sfetch -se -qTMclust -NWalign -HwRMSD diff --git a/modules/bindings/src/tmalign/MMalign.h b/modules/bindings/src/tmalign/MMalign.h deleted file mode 100644 index af9920a8c..000000000 --- a/modules/bindings/src/tmalign/MMalign.h +++ /dev/null @@ -1,1194 +0,0 @@ -#include "se.h" - -/* count the number of nucleic acid chains (na_chain_num) and - * protein chains (aa_chain_num) in a complex */ -int count_na_aa_chain_num(int &na_chain_num,int &aa_chain_num, - const vector<int>&mol_vec) -{ - na_chain_num=0; - aa_chain_num=0; - for (size_t i=0;i<mol_vec.size();i++) - { - if (mol_vec[i]>0) na_chain_num++; - else aa_chain_num++; - } - return na_chain_num+aa_chain_num; -} - -/* adjust chain assignment for dimer-dimer alignment - * return true if assignment is adjusted */ -bool adjust_dimer_assignment( - const vector<vector<vector<double> > >&xa_vec, - const vector<vector<vector<double> > >&ya_vec, - const vector<int>&xlen_vec, const vector<int>&ylen_vec, - const vector<int>&mol_vec1, const vector<int>&mol_vec2, - int *assign1_list, int *assign2_list, - const vector<vector<string> >&seqxA_mat, - const vector<vector<string> >&seqyA_mat) -{ - /* check currently assigned chains */ - int i1,i2,j1,j2; - i1=i2=j1=j2=-1; - int chain1_num=xa_vec.size(); - int i,j; - for (i=0;i<chain1_num;i++) - { - if (assign1_list[i]>=0) - { - if (i1<0) - { - i1=i; - j1=assign1_list[i1]; - } - else - { - i2=i; - j2=assign1_list[i2]; - } - } - } - - /* normalize d0 
by L */ - int xlen=xlen_vec[i1]+xlen_vec[i2]; - int ylen=ylen_vec[j1]+ylen_vec[j2]; - int mol_type=mol_vec1[i1]+mol_vec1[i2]+ - mol_vec2[j1]+mol_vec2[j2]; - double D0_MIN, d0, d0_search; - double Lnorm=getmin(xlen,ylen); - parameter_set4final(getmin(xlen,ylen), D0_MIN, Lnorm, d0, - d0_search, mol_type); - - double **xa,**ya, **xt; - NewArray(&xa, xlen, 3); - NewArray(&ya, ylen, 3); - NewArray(&xt, xlen, 3); - - double RMSD = 0; - double dd = 0; - double t[3]; - double u[3][3]; - size_t L_ali=0; // index of residue in aligned region - size_t r=0; // index of residue in full alignment - - /* total score using current assignment */ - L_ali=0; - i=j=-1; - for (r=0;r<seqxA_mat[i1][j1].size();r++) - { - i+=(seqxA_mat[i1][j1][r]!='-'); - j+=(seqyA_mat[i1][j1][r]!='-'); - if (seqxA_mat[i1][j1][r]=='-' || seqyA_mat[i1][j1][r]=='-') continue; - xa[L_ali][0]=xa_vec[i1][i][0]; - xa[L_ali][1]=xa_vec[i1][i][1]; - xa[L_ali][2]=xa_vec[i1][i][2]; - ya[L_ali][0]=ya_vec[j1][j][0]; - ya[L_ali][1]=ya_vec[j1][j][1]; - ya[L_ali][2]=ya_vec[j1][j][2]; - L_ali++; - } - i=j=-1; - for (r=0;r<seqxA_mat[i2][j2].size();r++) - { - i+=(seqxA_mat[i2][j2][r]!='-'); - j+=(seqyA_mat[i2][j2][r]!='-'); - if (seqxA_mat[i2][j2][r]=='-' || seqyA_mat[i2][j2][r]=='-') continue; - xa[L_ali][0]=xa_vec[i2][i][0]; - xa[L_ali][1]=xa_vec[i2][i][1]; - xa[L_ali][2]=xa_vec[i2][i][2]; - ya[L_ali][0]=ya_vec[j2][j][0]; - ya[L_ali][1]=ya_vec[j2][j][1]; - ya[L_ali][2]=ya_vec[j2][j][2]; - L_ali++; - } - - Kabsch(xa, ya, L_ali, 1, &RMSD, t, u); - do_rotation(xa, xt, L_ali, t, u); - - double total_score1=0; - for (r=0;r<L_ali;r++) - { - dd=dist(xt[r],ya[r]); - total_score1+=1/(1+dd/d0*d0); - } - total_score1/=Lnorm; - - /* total score using reversed assignment */ - L_ali=0; - i=j=-1; - for (r=0;r<seqxA_mat[i1][j2].size();r++) - { - i+=(seqxA_mat[i1][j2][r]!='-'); - j+=(seqyA_mat[i1][j2][r]!='-'); - if (seqxA_mat[i1][j2][r]=='-' || seqyA_mat[i1][j2][r]=='-') continue; - xa[L_ali][0]=xa_vec[i1][i][0]; - 
xa[L_ali][1]=xa_vec[i1][i][1]; - xa[L_ali][2]=xa_vec[i1][i][2]; - ya[L_ali][0]=ya_vec[j2][j][0]; - ya[L_ali][1]=ya_vec[j2][j][1]; - ya[L_ali][2]=ya_vec[j2][j][2]; - L_ali++; - } - i=j=-1; - for (r=0;r<seqxA_mat[i2][j1].size();r++) - { - i+=(seqxA_mat[i2][j1][r]!='-'); - j+=(seqyA_mat[i2][j1][r]!='-'); - if (seqxA_mat[i2][j1][r]=='-' || seqyA_mat[i2][j1][r]=='-') continue; - xa[L_ali][0]=xa_vec[i2][i][0]; - xa[L_ali][1]=xa_vec[i2][i][1]; - xa[L_ali][2]=xa_vec[i2][i][2]; - ya[L_ali][0]=ya_vec[j1][j][0]; - ya[L_ali][1]=ya_vec[j1][j][1]; - ya[L_ali][2]=ya_vec[j1][j][2]; - L_ali++; - } - - Kabsch(xa, ya, L_ali, 1, &RMSD, t, u); - do_rotation(xa, xt, L_ali, t, u); - - double total_score2=0; - for (r=0;r<L_ali;r++) - { - dd=dist(xt[r],ya[r]); - total_score2+=1/(1+dd/d0*d0); - } - total_score2/=Lnorm; - - /* swap chain assignment */ - if (total_score1<total_score2) - { - assign1_list[i1]=j2; - assign1_list[i2]=j1; - assign2_list[j1]=i2; - assign2_list[j2]=i1; - } - - /* clean up */ - DeleteArray(&xa, xlen); - DeleteArray(&ya, ylen); - DeleteArray(&xt, xlen); - return total_score1<total_score2; -} - -/* assign chain-chain correspondence */ -double enhanced_greedy_search(double **TMave_mat,int *assign1_list, - int *assign2_list, const int chain1_num, const int chain2_num) -{ - double total_score=0; - double tmp_score=0; - int i,j; - int maxi=0; - int maxj=0; - - /* initialize parameters */ - for (i=0;i<chain1_num;i++) assign1_list[i]=-1; - for (j=0;j<chain2_num;j++) assign2_list[j]=-1; - - /* greedy assignment: in each iteration, the highest chain pair is - * assigned, until no assignable chain is left */ - while(1) - { - tmp_score=-1; - for (i=0;i<chain1_num;i++) - { - if (assign1_list[i]>=0) continue; - for (j=0;j<chain2_num;j++) - { - if (assign2_list[j]>=0 || TMave_mat[i][j]<=0) continue; - if (TMave_mat[i][j]>tmp_score) - { - maxi=i; - maxj=j; - tmp_score=TMave_mat[i][j]; - } - } - } - if (tmp_score<=0) break; // error: no assignable chain - assign1_list[maxi]=maxj; - 
assign2_list[maxj]=maxi; - total_score+=tmp_score; - } - if (total_score<=0) return total_score; // error: no assignable chain - //cout<<"assign1_list={"; - //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; - //cout<<"assign2_list={"; - //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; - - /* iterative refinemnt */ - double delta_score; - int *assign1_tmp=new int [chain1_num]; - int *assign2_tmp=new int [chain2_num]; - for (i=0;i<chain1_num;i++) assign1_tmp[i]=assign1_list[i]; - for (j=0;j<chain2_num;j++) assign2_tmp[j]=assign2_list[j]; - int old_i=-1; - int old_j=-1; - - for (int iter=0;iter<getmin(chain1_num,chain2_num)*5;iter++) - { - delta_score=-1; - for (i=0;i<chain1_num;i++) - { - old_j=assign1_list[i]; - for (j=0;j<chain2_num;j++) - { - // attempt to swap (i,old_j=assign1_list[i]) with (i,j) - if (j==assign1_list[i] || TMave_mat[i][j]<=0) continue; - old_i=assign2_list[j]; - - assign1_tmp[i]=j; - if (old_i>=0) assign1_tmp[old_i]=old_j; - assign2_tmp[j]=i; - if (old_j>=0) assign2_tmp[old_j]=old_i; - - delta_score=TMave_mat[i][j]; - if (old_j>=0) delta_score-=TMave_mat[i][old_j]; - if (old_i>=0) delta_score-=TMave_mat[old_i][j]; - if (old_i>=0 && old_j>=0) delta_score+=TMave_mat[old_i][old_j]; - - if (delta_score>0) // successful swap - { - assign1_list[i]=j; - if (old_i>=0) assign1_list[old_i]=old_j; - assign2_list[j]=i; - if (old_j>=0) assign2_list[old_j]=old_i; - total_score+=delta_score; - break; - } - else - { - assign1_tmp[i]=assign1_list[i]; - if (old_i>=0) assign1_tmp[old_i]=assign1_list[old_i]; - assign2_tmp[j]=assign2_list[j]; - if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; - } - } - if (delta_score>0) break; - } - if (delta_score<=0) break; // cannot swap any chain pair - } - - /* clean up */ - delete[]assign1_tmp; - delete[]assign2_tmp; - return total_score; -} - -double calculate_centroids(const vector<vector<vector<double> > >&a_vec, - const int chain_num, double ** centroids) -{ - int L=0; 
- int c,r; // index of chain and residue - for (c=0; c<chain_num; c++) - { - centroids[c][0]=0; - centroids[c][1]=0; - centroids[c][2]=0; - L=a_vec[c].size(); - for (r=0; r<L; r++) - { - centroids[c][0]+=a_vec[c][r][0]; - centroids[c][1]+=a_vec[c][r][1]; - centroids[c][2]+=a_vec[c][r][2]; - } - centroids[c][0]/=L; - centroids[c][1]/=L; - centroids[c][2]/=L; - //cout<<centroids[c][0]<<'\t' - //<<centroids[c][1]<<'\t' - //<<centroids[c][2]<<endl; - } - - vector<double> d0_vec(chain_num,-1); - int c2=0; - double d0MM=0; - for (c=0; c<chain_num; c++) - { - for (c2=0; c2<chain_num; c2++) - { - if (c2==c) continue; - d0MM=sqrt(dist(centroids[c],centroids[c2])); - if (d0_vec[c]<=0) d0_vec[c]=d0MM; - else d0_vec[c]=getmin(d0_vec[c], d0MM); - } - } - d0MM=0; - for (c=0; c<chain_num; c++) d0MM+=d0_vec[c]; - d0MM/=chain_num; - d0_vec.clear(); - //cout<<d0MM<<endl; - return d0MM; -} - -/* calculate MMscore of aligned chains - * MMscore = sum(TMave_mat[i][j]) * sum(1/(1+dij^2/d0MM^2)) - * / (L* getmin(chain1_num,chain2_num)) - * dij is the centroid distance between chain pair i and j - * d0MM is scaling factor. 
TMave_mat[i][j] is the TM-score between - * chain pair i and j multiple by getmin(Li*Lj) */ -double calMMscore(double **TMave_mat,int *assign1_list, - const int chain1_num, const int chain2_num, double **xcentroids, - double **ycentroids, const double d0MM, double **r1, double **r2, - double **xt, double t[3], double u[3][3], const int L) -{ - int Nali=0; // number of aligned chain - int i,j; - double MMscore=0; - for (i=0;i<chain1_num;i++) - { - j=assign1_list[i]; - if (j<0) continue; - - r1[Nali][0]=xcentroids[i][0]; - r1[Nali][1]=xcentroids[i][1]; - r1[Nali][2]=xcentroids[i][2]; - - r2[Nali][0]=ycentroids[j][0]; - r2[Nali][1]=ycentroids[j][1]; - r2[Nali][2]=ycentroids[j][2]; - - Nali++; - MMscore+=TMave_mat[i][j]; - } - MMscore/=L; - - double RMSD = 0; - double TMscore=0; - if (Nali>=3) - { - /* Kabsch superposition */ - Kabsch(r1, r2, Nali, 1, &RMSD, t, u); - do_rotation(r1, xt, Nali, t, u); - - /* calculate pseudo-TMscore */ - double dd=0; - for (i=0;i<Nali;i++) - { - dd=dist(xt[i], r2[i]); - TMscore+=1/(1+dd/(d0MM*d0MM)); - } - } - else if (Nali==2) - { - double dd=dist(r1[0],r2[0]); - TMscore=1/(1+dd/(d0MM*d0MM)); - } - else TMscore=1; // only one aligned chain. - TMscore/=getmin(chain1_num,chain2_num); - MMscore*=TMscore; - return MMscore; -} - -/* check if this is alignment of heterooligomer or homooligomer - * return het_deg, which ranges from 0 to 1. 
- * The larger the value, the more "hetero"; - * Tthe smaller the value, the more "homo" */ -double check_heterooligomer(double **TMave_mat, const int chain1_num, - const int chain2_num) -{ - double het_deg=0; - double min_TM=-1; - double max_TM=-1; - int i,j; - for (i=0;i<chain1_num;i++) - { - for (j=0;j<chain2_num;j++) - { - if (min_TM<0 || TMave_mat[i][j] <min_TM) min_TM=TMave_mat[i][j]; - if (max_TM<0 || TMave_mat[i][j]>=max_TM) max_TM=TMave_mat[i][j]; - } - } - het_deg=(max_TM-min_TM)/max_TM; - //cout<<"min_TM="<<min_TM<<endl; - //cout<<"max_TM="<<max_TM<<endl; - return het_deg; -} - -/* reassign chain-chain correspondence, specific for homooligomer */ -double homo_refined_greedy_search(double **TMave_mat,int *assign1_list, - int *assign2_list, const int chain1_num, const int chain2_num, - double **xcentroids, double **ycentroids, const double d0MM, - const int L, double **ut_mat) -{ - double MMscore_max=0; - double MMscore=0; - int i,j; - int c1,c2; - int max_i=-1; // the chain pair whose monomer u t yields highest MMscore - int max_j=-1; - - int chain_num=getmin(chain1_num,chain2_num); - int *assign1_tmp=new int [chain1_num]; - int *assign2_tmp=new int [chain2_num]; - double **xt; - NewArray(&xt, chain1_num, 3); - double t[3]; - double u[3][3]; - int ui,uj,ut_idx; - double TMscore=0; // pseudo TM-score - double TMsum =0; - double TMnow =0; - double TMmax =0; - double dd=0; - - size_t total_pair=chain1_num*chain2_num; // total pair - double *ut_tmc_mat=new double [total_pair]; // chain level TM-score - vector<pair<double,int> > ut_tm_vec(total_pair,make_pair(0.0,0)); // product of both - - for (c1=0;c1<chain1_num;c1++) - { - for (c2=0;c2<chain2_num;c2++) - { - if (TMave_mat[c1][c2]<=0) continue; - ut_idx=c1*chain2_num+c2; - for (ui=0;ui<3;ui++) - for (uj=0;uj<3;uj++) u[ui][uj]=ut_mat[ut_idx][ui*3+uj]; - for (uj=0;uj<3;uj++) t[uj]=ut_mat[ut_idx][9+uj]; - - do_rotation(xcentroids, xt, chain1_num, t, u); - - for (i=0;i<chain1_num;i++) assign1_tmp[i]=-1; - for 
(j=0;j<chain2_num;j++) assign2_tmp[j]=-1; - - - for (i=0;i<chain1_num;i++) - { - for (j=0;j<chain2_num;j++) - { - ut_idx=i*chain2_num+j; - ut_tmc_mat[ut_idx]=0; - ut_tm_vec[ut_idx].first=-1; - ut_tm_vec[ut_idx].second=ut_idx; - if (TMave_mat[i][j]<=0) continue; - dd=dist(xt[i],ycentroids[j]); - ut_tmc_mat[ut_idx]=1/(1+dd/(d0MM*d0MM)); - ut_tm_vec[ut_idx].first= - ut_tmc_mat[ut_idx]*TMave_mat[i][j]; - //cout<<"TM["<<ut_idx<<"]="<<ut_tm_vec[ut_idx].first<<endl; - } - } - //cout<<"sorting "<<total_pair<<" chain pairs"<<endl; - - /* initial assignment */ - assign1_tmp[c1]=c2; - assign2_tmp[c2]=c1; - TMsum=TMave_mat[c1][c2]; - TMscore=ut_tmc_mat[c1*chain2_num+c2]; - - /* further assignment */ - sort(ut_tm_vec.begin(), ut_tm_vec.end()); // sort in ascending order - for (ut_idx=total_pair-1;ut_idx>=0;ut_idx--) - { - j=ut_tm_vec[ut_idx].second % chain2_num; - i=int(ut_tm_vec[ut_idx].second / chain2_num); - if (TMave_mat[i][j]<=0) break; - if (assign1_tmp[i]>=0 || assign2_tmp[j]>=0) continue; - assign1_tmp[i]=j; - assign2_tmp[j]=i; - TMsum+=TMave_mat[i][j]; - TMscore+=ut_tmc_mat[i*chain2_num+j]; - //cout<<"ut_idx="<<ut_tm_vec[ut_idx].second - //<<"\ti="<<i<<"\tj="<<j<<"\ttm="<<ut_tm_vec[ut_idx].first<<endl; - } - - /* final MMscore */ - MMscore=(TMsum/L)*(TMscore/chain_num); - if (max_i<0 || max_j<0 || MMscore>MMscore_max) - { - max_i=c1; - max_j=c2; - MMscore_max=MMscore; - for (i=0;i<chain1_num;i++) assign1_list[i]=assign1_tmp[i]; - for (j=0;j<chain2_num;j++) assign2_list[j]=assign2_tmp[j]; - //cout<<"TMsum/L="<<TMsum/L<<endl; - //cout<<"TMscore/chain_num="<<TMscore/chain_num<<endl; - //cout<<"MMscore="<<MMscore<<endl; - //cout<<"assign1_list={"; - //for (i=0;i<chain1_num;i++) - //cout<<assign1_list[i]<<","; cout<<"}"<<endl; - //cout<<"assign2_list={"; - //for (j=0;j<chain2_num;j++) - //cout<<assign2_list[j]<<","; cout<<"}"<<endl; - } - } - } - - /* clean up */ - delete[]assign1_tmp; - delete[]assign2_tmp; - delete[]ut_tmc_mat; - ut_tm_vec.clear(); - DeleteArray(&xt, 
chain1_num); - return MMscore; -} - -/* reassign chain-chain correspondence, specific for heterooligomer */ -double hetero_refined_greedy_search(double **TMave_mat,int *assign1_list, - int *assign2_list, const int chain1_num, const int chain2_num, - double **xcentroids, double **ycentroids, const double d0MM, const int L) -{ - double MMscore_old=0; - double MMscore=0; - int i,j; - - double **r1; - double **r2; - double **xt; - int chain_num=getmin(chain1_num,chain2_num); - NewArray(&r1, chain_num, 3); - NewArray(&r2, chain_num, 3); - NewArray(&xt, chain_num, 3); - double t[3]; - double u[3][3]; - - /* calculate MMscore */ - MMscore=MMscore_old=calMMscore(TMave_mat, assign1_list, chain1_num, - chain2_num, xcentroids, ycentroids, d0MM, r1, r2, xt, t, u, L); - //cout<<"MMscore="<<MMscore<<endl; - //cout<<"TMave_mat="<<endl; - //for (i=0;i<chain1_num;i++) - //{ - //for (j=0; j<chain2_num; j++) - //{ - //if (j<chain2_num-1) cout<<TMave_mat[i][j]<<'\t'; - //else cout<<TMave_mat[i][j]<<endl; - //} - //} - - /* iteratively refine chain assignment. 
in each iteration, attempt - * to swap (i,old_j=assign1_list[i]) with (i,j) */ - double delta_score=-1; - int *assign1_tmp=new int [chain1_num]; - int *assign2_tmp=new int [chain2_num]; - for (i=0;i<chain1_num;i++) assign1_tmp[i]=assign1_list[i]; - for (j=0;j<chain2_num;j++) assign2_tmp[j]=assign2_list[j]; - int old_i=-1; - int old_j=-1; - - //cout<<"assign1_list={"; - //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; - //cout<<"assign2_list={"; - //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; - - for (int iter=0;iter<chain1_num*chain2_num;iter++) - { - delta_score=-1; - for (i=0;i<chain1_num;i++) - { - old_j=assign1_list[i]; - for (j=0;j<chain2_num;j++) - { - if (j==assign1_list[i] || TMave_mat[i][j]<=0) continue; - old_i=assign2_list[j]; - - assign1_tmp[i]=j; - if (old_i>=0) assign1_tmp[old_i]=old_j; - assign2_tmp[j]=i; - if (old_j>=0) assign2_tmp[old_j]=old_i; - - MMscore=calMMscore(TMave_mat, assign1_tmp, chain1_num, - chain2_num, xcentroids, ycentroids, d0MM, - r1, r2, xt, t, u, L); - - //cout<<"(i,j,old_i,old_j,MMscore)=("<<i<<","<<j<<"," - //<<old_i<<","<<old_j<<","<<MMscore<<")"<<endl; - - if (MMscore>MMscore_old) // successful swap - { - assign1_list[i]=j; - if (old_i>=0) assign1_list[old_i]=old_j; - assign2_list[j]=i; - if (old_j>=0) assign2_list[old_j]=old_i; - delta_score=(MMscore-MMscore_old); - MMscore_old=MMscore; - //cout<<"MMscore="<<MMscore<<endl; - break; - } - else - { - assign1_tmp[i]=assign1_list[i]; - if (old_i>=0) assign1_tmp[old_i]=assign1_list[old_i]; - assign2_tmp[j]=assign2_list[j]; - if (old_j>=0) assign2_tmp[old_j]=assign2_list[old_j]; - } - } - } - //cout<<"iter="<<iter<<endl; - //cout<<"assign1_list={"; - //for (i=0;i<chain1_num;i++) cout<<assign1_list[i]<<","; cout<<"}"<<endl; - //cout<<"assign2_list={"; - //for (j=0;j<chain2_num;j++) cout<<assign2_list[j]<<","; cout<<"}"<<endl; - if (delta_score<=0) break; // cannot swap any chain pair - } - MMscore=MMscore_old; - 
//cout<<"MMscore="<<MMscore<<endl; - - /* clean up */ - delete[]assign1_tmp; - delete[]assign2_tmp; - DeleteArray(&r1, chain_num); - DeleteArray(&r2, chain_num); - DeleteArray(&xt, chain_num); - return MMscore; -} - -void copy_chain_data(const vector<vector<double> >&a_vec_i, - const vector<char>&seq_vec_i,const vector<char>&sec_vec_i, - const int len,double **a,char *seq,char *sec) -{ - int r; - for (r=0;r<len;r++) - { - a[r][0]=a_vec_i[r][0]; - a[r][1]=a_vec_i[r][1]; - a[r][2]=a_vec_i[r][2]; - seq[r]=seq_vec_i[r]; - sec[r]=sec_vec_i[r]; - } - seq[len]=0; - sec[len]=0; -} - -void parse_chain_list(const vector<string>&chain_list, - vector<vector<vector<double> > >&a_vec, vector<vector<char> >&seq_vec, - vector<vector<char> >&sec_vec, vector<int>&mol_vec, vector<int>&len_vec, - vector<string>&chainID_list, const int ter_opt, const int split_opt, - const string mol_opt, const int infmt_opt, const string atom_opt, - const int mirror_opt, const int het_opt, int &len_aa, int &len_na, - const int o_opt, vector<string>&resi_vec) -{ - size_t i; - int chain_i,r; - string name; - int chainnum; - double **xa; - int len; - char *seq,*sec; - - vector<vector<string> >PDB_lines; - vector<double> tmp_atom_array(3,0); - vector<vector<double> > tmp_chain_array; - vector<char>tmp_seq_array; - vector<char>tmp_sec_array; - //vector<string> resi_vec; - int read_resi=0; - if (o_opt) read_resi=2; - - for (i=0;i<chain_list.size();i++) - { - name=chain_list[i]; - chainnum=get_PDB_lines(name, PDB_lines, chainID_list, - mol_vec, ter_opt, infmt_opt, atom_opt, split_opt, het_opt); - if (!chainnum) - { - cerr<<"Warning! Cannot parse file: "<<name - <<". Chain number 0."<<endl; - continue; - } - for (chain_i=0;chain_i<chainnum;chain_i++) - { - len=PDB_lines[chain_i].size(); - if (!len) - { - cerr<<"Warning! Cannot parse file: "<<name - <<". 
Chain length 0."<<endl; - continue; - } - else if (len<3) - { - cerr<<"Sequence is too short <3!: "<<name<<endl; - continue; - } - NewArray(&xa, len, 3); - seq = new char[len + 1]; - sec = new char[len + 1]; - len = read_PDB(PDB_lines[chain_i], xa, seq, resi_vec, read_resi); - if (mirror_opt) for (r=0;r<len;r++) xa[r][2]=-xa[r][2]; - if (mol_vec[chain_i]>0 || mol_opt=="RNA") - make_sec(seq, xa, len, sec,atom_opt); - else make_sec(xa, len, sec); // secondary structure assignment - - /* store in vector */ - tmp_chain_array.assign(len,tmp_atom_array); - vector<char>tmp_seq_array(len+1,0); - vector<char>tmp_sec_array(len+1,0); - for (r=0;r<len;r++) - { - tmp_chain_array[r][0]=xa[r][0]; - tmp_chain_array[r][1]=xa[r][1]; - tmp_chain_array[r][2]=xa[r][2]; - tmp_seq_array[r]=seq[r]; - tmp_sec_array[r]=sec[r]; - } - a_vec.push_back(tmp_chain_array); - seq_vec.push_back(tmp_seq_array); - sec_vec.push_back(tmp_sec_array); - len_vec.push_back(len); - - /* clean up */ - tmp_chain_array.clear(); - tmp_seq_array.clear(); - tmp_sec_array.clear(); - PDB_lines[chain_i].clear(); - DeleteArray(&xa, len); - delete [] seq; - delete [] sec; - } // chain_i - name.clear(); - PDB_lines.clear(); - mol_vec.clear(); - } // i - tmp_atom_array.clear(); - - if (mol_opt=="RNA") mol_vec.assign(a_vec.size(),1); - else if (mol_opt=="protein") mol_vec.assign(a_vec.size(),-1); - else - { - mol_vec.assign(a_vec.size(),0); - for (i=0;i<a_vec.size();i++) - { - for (r=0;r<len_vec[i];r++) - { - if (seq_vec[i][r]>='a' && seq_vec[i][r]<='z') mol_vec[i]++; - else mol_vec[i]--; - } - } - } - - len_aa=0; - len_na=0; - for (i=0;i<a_vec.size();i++) - { - if (mol_vec[i]>0) len_na+=len_vec[i]; - else len_aa+=len_vec[i]; - } -} - -int copy_chain_pair_data( - const vector<vector<vector<double> > >&xa_vec, - const vector<vector<vector<double> > >&ya_vec, - const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, - const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, - const 
vector<int> &mol_vec1, const vector<int> &mol_vec2, - const vector<int> &xlen_vec, const vector<int> &ylen_vec, - double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, - int chain1_num, int chain2_num, - vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat, - int *assign1_list, int *assign2_list, vector<string>&sequence) -{ - int i,j,r; - sequence.clear(); - sequence.push_back(""); - sequence.push_back(""); - int mol_type=0; - int xlen=0; - int ylen=0; - for (i=0;i<chain1_num;i++) - { - j=assign1_list[i]; - if (j<0) continue; - for (r=0;r<xlen_vec[i];r++) - { - seqx[xlen]=seqx_vec[i][r]; - secx[xlen]=secx_vec[i][r]; - xa[xlen][0]= xa_vec[i][r][0]; - xa[xlen][1]= xa_vec[i][r][1]; - xa[xlen][2]= xa_vec[i][r][2]; - xlen++; - } - sequence[0]+=seqxA_mat[i][j]; - for (r=0;r<ylen_vec[j];r++) - { - seqy[ylen]=seqy_vec[j][r]; - secy[ylen]=secy_vec[j][r]; - ya[ylen][0]= ya_vec[j][r][0]; - ya[ylen][1]= ya_vec[j][r][1]; - ya[ylen][2]= ya_vec[j][r][2]; - ylen++; - } - sequence[1]+=seqyA_mat[i][j]; - mol_type+=mol_vec1[i]+mol_vec2[j]; - } - seqx[xlen]=0; - secx[xlen]=0; - seqy[ylen]=0; - secy[ylen]=0; - return mol_type; -} - -double MMalign_search( - const vector<vector<vector<double> > >&xa_vec, - const vector<vector<vector<double> > >&ya_vec, - const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, - const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, - const vector<int> &mol_vec1, const vector<int> &mol_vec2, - const vector<int> &xlen_vec, const vector<int> &ylen_vec, - double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, - int len_aa, int len_na, int chain1_num, int chain2_num, - double **TM1_mat, double **TM2_mat, double **TMave_mat, - vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqyA_mat, - int *assign1_list, int *assign2_list, vector<string>&sequence, - double d0_scale, bool fast_opt) -{ - double total_score=0; - int i,j; - int xlen=0; - int ylen=0; - for 
(i=0;i<chain1_num;i++) - { - if (assign1_list[i]<0) continue; - xlen+=xlen_vec[i]; - ylen+=ylen_vec[assign1_list[i]]; - } - if (xlen<=3 || ylen<=3) return total_score; - - seqx = new char[xlen+1]; - secx = new char[xlen+1]; - NewArray(&xa, xlen, 3); - seqy = new char[ylen+1]; - secy = new char[ylen+1]; - NewArray(&ya, ylen, 3); - - int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, - secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, - xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, - seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); - - /* declare variable specific to this pair of TMalign */ - double t0[3], u0[3][3]; - double TM1, TM2; - double TM3, TM4, TM5; // for a_opt, u_opt, d_opt - double d0_0, TM_0; - double d0A, d0B, d0u, d0a; - double d0_out=5.0; - string seqM, seqxA, seqyA;// for output alignment - double rmsd0 = 0.0; - int L_ali; // Aligned length in standard_TMscore - double Liden=0; - double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore - int n_ali=0; - int n_ali8=0; - - double Lnorm_ass=len_aa+len_na; - - /* entry function for structure alignment */ - TMalign_main(xa, ya, seqx, seqy, secx, secy, - t0, u0, TM1, TM2, TM3, TM4, TM5, - d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, - rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, - xlen, ylen, sequence, Lnorm_ass, d0_scale, - 3, false, true, false, fast_opt, mol_type, -1); - - /* clean up */ - delete [] seqx; - delete [] seqy; - delete [] secx; - delete [] secy; - DeleteArray(&xa,xlen); - DeleteArray(&ya,ylen); - - /* re-compute chain level alignment */ - for (i=0;i<chain1_num;i++) - { - xlen=xlen_vec[i]; - if (xlen<3) - { - for (j=0;j<chain2_num;j++) - TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; - continue; - } - seqx = new char[xlen+1]; - secx = new char[xlen+1]; - NewArray(&xa, xlen, 3); - copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], - xlen,xa,seqx,secx); - - double **xt; - NewArray(&xt, xlen, 3); - 
do_rotation(xa, xt, xlen, t0, u0); - - for (j=0;j<chain2_num;j++) - { - if (mol_vec1[i]*mol_vec2[j]<0) //no protein-RNA alignment - { - TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; - continue; - } - - ylen=ylen_vec[j]; - if (ylen<3) - { - TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; - continue; - } - seqy = new char[ylen+1]; - secy = new char[ylen+1]; - NewArray(&ya, ylen, 3); - copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], - ylen,ya,seqy,secy); - - /* declare variable specific to this pair of TMalign */ - d0_out=5.0; - seqM.clear(); - seqxA.clear(); - seqyA.clear(); - rmsd0 = 0.0; - Liden=0; - int *invmap = new int[ylen+1]; - - double Lnorm_ass=len_aa; - if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; - - /* entry function for structure alignment */ - se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, - d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, - rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, - xlen, ylen, sequence, Lnorm_ass, d0_scale, - 0, false, true, false, - mol_vec1[i]+mol_vec2[j], 1, invmap); - - /* print result */ - TM1_mat[i][j]=TM2; // normalized by chain1 - TM2_mat[i][j]=TM1; // normalized by chain2 - seqxA_mat[i][j]=seqxA; - seqyA_mat[i][j]=seqyA; - - TMave_mat[i][j]=TM4*Lnorm_ass; - - /* clean up */ - seqM.clear(); - seqxA.clear(); - seqyA.clear(); - - delete[]seqy; - delete[]secy; - DeleteArray(&ya,ylen); - } - delete[]seqx; - delete[]secx; - DeleteArray(&xa,xlen); - DeleteArray(&xt,xlen); - } - return total_score; -} - -void MMalign_final( - const string xname, const string yname, - const vector<string> chainID_list1, const vector<string> chainID_list2, - string fname_super, string fname_lign, string fname_matrix, - const vector<vector<vector<double> > >&xa_vec, - const vector<vector<vector<double> > >&ya_vec, - const vector<vector<char> >&seqx_vec, const vector<vector<char> >&seqy_vec, - const vector<vector<char> >&secx_vec, const vector<vector<char> >&secy_vec, - const vector<int> &mol_vec1, const vector<int> 
&mol_vec2, - const vector<int> &xlen_vec, const vector<int> &ylen_vec, - double **xa, double **ya, char *seqx, char *seqy, char *secx, char *secy, - int len_aa, int len_na, int chain1_num, int chain2_num, - double **TM1_mat, double **TM2_mat, double **TMave_mat, - vector<vector<string> >&seqxA_mat, vector<vector<string> >&seqM_mat, - vector<vector<string> >&seqyA_mat, int *assign1_list, int *assign2_list, - vector<string>&sequence, const double d0_scale, const bool m_opt, - const int o_opt, const int outfmt_opt, const int ter_opt, - const int split_opt, const bool a_opt, const bool d_opt, - const bool fast_opt, const bool full_opt, const int mirror_opt, - const vector<string>&resi_vec1, const vector<string>&resi_vec2) -{ - int i,j; - int xlen=0; - int ylen=0; - for (i=0;i<chain1_num;i++) xlen+=xlen_vec[i]; - for (j=0;j<chain2_num;j++) ylen+=ylen_vec[j]; - if (xlen<=3 || ylen<=3) return; - - seqx = new char[xlen+1]; - secx = new char[xlen+1]; - NewArray(&xa, xlen, 3); - seqy = new char[ylen+1]; - secy = new char[ylen+1]; - NewArray(&ya, ylen, 3); - - int mol_type=copy_chain_pair_data(xa_vec, ya_vec, seqx_vec, seqy_vec, - secx_vec, secy_vec, mol_vec1, mol_vec2, xlen_vec, ylen_vec, - xa, ya, seqx, seqy, secx, secy, chain1_num, chain2_num, - seqxA_mat, seqyA_mat, assign1_list, assign2_list, sequence); - - /* declare variable specific to this pair of TMalign */ - double t0[3], u0[3][3]; - double TM1, TM2; - double TM3, TM4, TM5; // for a_opt, u_opt, d_opt - double d0_0, TM_0; - double d0A, d0B, d0u, d0a; - double d0_out=5.0; - string seqM, seqxA, seqyA;// for output alignment - double rmsd0 = 0.0; - int L_ali; // Aligned length in standard_TMscore - double Liden=0; - double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore - int n_ali=0; - int n_ali8=0; - - double Lnorm_ass=len_aa+len_na; - - /* entry function for structure alignment */ - TMalign_main(xa, ya, seqx, seqy, secx, secy, - t0, u0, TM1, TM2, TM3, TM4, TM5, - d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, 
seqM, seqxA, seqyA, - rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, - xlen, ylen, sequence, Lnorm_ass, d0_scale, - 3, a_opt, false, d_opt, fast_opt, mol_type, -1); - - /* prepare full complex alignment */ - string chainID1=""; - string chainID2=""; - sequence.clear(); - sequence.push_back(""); // seqxA - sequence.push_back(""); // seqyA - sequence.push_back(""); // seqM - int aln_start=0; - int aln_end=0; - for (i=0;i<chain1_num;i++) - { - j=assign1_list[i]; - if (j<0) continue; - chainID1+=chainID_list1[i]; - chainID2+=chainID_list2[j]; - sequence[0]+=seqxA_mat[i][j]+'*'; - sequence[1]+=seqyA_mat[i][j]+'*'; - - aln_end+=seqxA_mat[i][j].size(); - seqM_mat[i][j]=seqM.substr(aln_start,aln_end-aln_start); - sequence[2]+=seqM_mat[i][j]+'*'; - aln_start=aln_end; - } - - /* prepare unaligned region */ - for (i=0;i<chain1_num;i++) - { - if (assign1_list[i]>=0) continue; - chainID1+=chainID_list1[i]; - chainID2+=':'; - string s(seqx_vec[i].begin(),seqx_vec[i].end()); - sequence[0]+=s.substr(0,xlen_vec[i])+'*'; - sequence[1]+=string(xlen_vec[i],'-')+'*'; - s.clear(); - sequence[2]+=string(xlen_vec[i],' ')+'*'; - } - for (j=0;j<chain2_num;j++) - { - if (assign2_list[j]>=0) continue; - chainID1+=':'; - chainID2+=chainID_list2[j]; - string s(seqy_vec[j].begin(),seqy_vec[j].end()); - sequence[0]+=string(ylen_vec[j],'-')+'*'; - sequence[1]+=s.substr(0,ylen_vec[j])+'*'; - s.clear(); - sequence[2]+=string(ylen_vec[j],' ')+'*'; - } - - /* print alignment */ - output_results(xname, yname, chainID1.c_str(), chainID2.c_str(), - xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, - sequence[2].c_str(), sequence[0].c_str(), sequence[1].c_str(), - Liden, n_ali8, L_ali, TM_ali, rmsd_ali, - TM_0, d0_0, d0A, d0B, 0, d0_scale, d0a, d0u, - (m_opt?fname_matrix:"").c_str(), outfmt_opt, ter_opt, true, - split_opt, o_opt, fname_super, - false, a_opt, false, d_opt, mirror_opt, resi_vec1, resi_vec2); - - /* clean up */ - seqM.clear(); - seqxA.clear(); - seqyA.clear(); - delete [] 
seqx; - delete [] seqy; - delete [] secx; - delete [] secy; - DeleteArray(&xa,xlen); - DeleteArray(&ya,ylen); - sequence[0].clear(); - sequence[1].clear(); - sequence[2].clear(); - - if (!full_opt) return; - - cout<<"# End of alignment for full complex. The following blocks list alignments for individual chains."<<endl; - - /* re-compute chain level alignment */ - for (i=0;i<chain1_num;i++) - { - j=assign1_list[i]; - if (j<0) continue; - xlen=xlen_vec[i]; - seqx = new char[xlen+1]; - secx = new char[xlen+1]; - NewArray(&xa, xlen, 3); - copy_chain_data(xa_vec[i],seqx_vec[i],secx_vec[i], - xlen,xa,seqx,secx); - - double **xt; - NewArray(&xt, xlen, 3); - do_rotation(xa, xt, xlen, t0, u0); - - ylen=ylen_vec[j]; - if (ylen<3) - { - TM1_mat[i][j]=TM2_mat[i][j]=TMave_mat[i][j]=-1; - continue; - } - seqy = new char[ylen+1]; - secy = new char[ylen+1]; - NewArray(&ya, ylen, 3); - copy_chain_data(ya_vec[j],seqy_vec[j],secy_vec[j], - ylen,ya,seqy,secy); - - /* declare variable specific to this pair of TMalign */ - d0_out=5.0; - rmsd0 = 0.0; - Liden=0; - int *invmap = new int[ylen+1]; - seqM=""; - seqxA=""; - seqyA=""; - double Lnorm_ass=len_aa; - if (mol_vec1[i]+mol_vec2[j]>0) Lnorm_ass=len_na; - sequence[0]=seqxA_mat[i][j]; - sequence[1]=seqyA_mat[i][j]; - - /* entry function for structure alignment */ - se_main(xt, ya, seqx, seqy, TM1, TM2, TM3, TM4, TM5, - d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, seqxA, seqyA, - rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, - xlen, ylen, sequence, Lnorm_ass, d0_scale, - 1, a_opt, true, d_opt, mol_vec1[i]+mol_vec2[j], 1, invmap); - - //TM2=TM4*Lnorm_ass/xlen; - //TM1=TM4*Lnorm_ass/ylen; - //d0A=d0u; - //d0B=d0u; - - /* print result */ - output_results(xname, yname, - chainID_list1[i].c_str(), chainID_list2[j].c_str(), - xlen, ylen, t0, u0, TM1, TM2, TM3, TM4, TM5, rmsd0, d0_out, - seqM_mat[i][j].c_str(), seqxA_mat[i][j].c_str(), - seqyA_mat[i][j].c_str(), Liden, n_ali8, L_ali, TM_ali, rmsd_ali, - TM_0, d0_0, d0A, d0B, Lnorm_ass, 
d0_scale, d0a, d0u, - "", outfmt_opt, ter_opt, false, split_opt, 0, - "", false, a_opt, false, d_opt, 0, resi_vec1, resi_vec2); - - /* clean up */ - seqxA.clear(); - seqM.clear(); - seqyA.clear(); - sequence[0].clear(); - sequence[1].clear(); - delete[]seqy; - delete[]secy; - DeleteArray(&ya,ylen); - delete[]seqx; - delete[]secx; - DeleteArray(&xa,xlen); - DeleteArray(&xt,xlen); - } - sequence.clear(); - return; -} diff --git a/modules/bindings/src/tmalign/OST_INFO b/modules/bindings/src/tmalign/OST_INFO deleted file mode 100644 index 16ce11569..000000000 --- a/modules/bindings/src/tmalign/OST_INFO +++ /dev/null @@ -1,7 +0,0 @@ -Source code has been cloned August 2 2022 from: - -https://github.com/kad-ecoli/TMalign - -last commit: -f0824499d8ab4fa84b2e75d253de80ab2c894c56 - diff --git a/modules/bindings/src/wrap_tmalign.cc b/modules/bindings/src/wrap_tmalign.cc index cefbe1a44..8c05e0228 100644 --- a/modules/bindings/src/wrap_tmalign.cc +++ b/modules/bindings/src/wrap_tmalign.cc @@ -17,7 +17,7 @@ // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA //------------------------------------------------------------------------------ -#include "tmalign/TMalign.h" // include for the external TMalign +#include "USalign/TMalign.h" // include for the external TMalign #include <ost/mol/atom_view.hh> #include <ost/message.hh> @@ -29,7 +29,8 @@ TMAlignResult WrappedTMAlign(const geom::Vec3List& pos_one, const geom::Vec3List& pos_two, const ost::seq::SequenceHandle& seq1, const ost::seq::SequenceHandle& seq2, - bool fast) { + bool fast, + bool rna) { int xlen = pos_one.size(); int ylen = pos_two.size(); @@ -99,12 +100,13 @@ TMAlignResult WrappedTMAlign(const geom::Vec3List& pos_one, double TM_ali, rmsd_ali; // TMscore and rmsd in standard_TMscore int n_ali=0; int n_ali8=0; + int mol_type=static_cast<int>(rna); // Treated as RNA if mol_type > 0 TMalign_main(xa, ya, seqx, seqy, secx, secy, t0, u0, TM1, TM2, TM3, TM4, TM5, d0_0, TM_0, d0A, d0B, d0u, d0a, d0_out, seqM, 
seqxA, seqyA, rmsd0, L_ali, Liden, TM_ali, rmsd_ali, n_ali, n_ali8, xlen, ylen, sequence, Lnorm_ass, d0_scale, i_opt, a_opt, u_opt, d_opt, - fast, 0, TMcut); + fast, mol_type, TMcut); // cleanup DeleteArray(&xa, xlen); @@ -131,60 +133,74 @@ TMAlignResult WrappedTMAlign(const geom::Vec3List& pos_one, return res; } +void ExtractChainInfo(const ost::mol::ChainView& chain, geom::Vec3List& pos, + ost::seq::SequenceHandle& s, bool& rna_mode) { -TMAlignResult WrappedTMAlign(const ost::mol::ChainView& chain1, - const ost::mol::ChainView& chain2, - bool fast) { - - geom::Vec3List pos1; - geom::Vec3List pos2; - std::vector<char> s1; - std::vector<char> s2; - - ost::mol::ResidueViewList res_list_1 = chain1.GetResidueList(); - ost::mol::ResidueViewList res_list_2 = chain2.GetResidueList(); + pos.clear(); + std::vector<char> olcs; + rna_mode = false; + ost::mol::ResidueViewList res_list = chain.GetResidueList(); - for(ost::mol::ResidueViewList::iterator it = res_list_1.begin(); - it != res_list_1.end(); ++it) { - if(!it->IsPeptideLinking()) { - continue; - } - ost::mol::AtomView ca = it->FindAtom("CA"); - if(!ca.IsValid()) { - continue; - } + for(auto it = res_list.begin(); it != res_list.end(); ++it) { char olc = it->GetOneLetterCode(); if(olc == '?') { continue; } - pos1.push_back(ca.GetPos()); - s1.push_back(olc); - } - - for(ost::mol::ResidueViewList::iterator it = res_list_2.begin(); - it != res_list_2.end(); ++it) { - if(!it->IsPeptideLinking()) { - continue; - } - ost::mol::AtomView ca = it->FindAtom("CA"); - if(!ca.IsValid()) { - continue; + if(it->IsPeptideLinking()) { + ost::mol::AtomView ca = it->FindAtom("CA"); + if(!ca.IsValid()) { + continue; + } + if(rna_mode) { + std::stringstream ss; + ss << "Error in WrappedTMAlign: Chains cannot have peptide and RNA "; + ss << "residues. 
Problematic chain: "<<chain.GetName(); + throw ost::Error(ss.str()); + } + olcs.push_back(olc); + pos.push_back(ca.GetPos()); } - char olc = it->GetOneLetterCode(); - if(olc == '?') { - continue; + else if(it->IsNucleotideLinking()) { + ost::mol::AtomView c3 = it->FindAtom("C3'"); + if(!c3.IsValid()) { + continue; + } + if(rna_mode==false && !pos.empty()) { + std::stringstream ss; + ss << "Error in WrappedTMAlign: Chains cannot have peptide and RNA "; + ss << "residues. Problematic chain: "<<chain.GetName(); + throw ost::Error(ss.str()); + } + rna_mode = true; + olcs.push_back(olc); + pos.push_back(c3.GetPos()); } - pos2.push_back(ca.GetPos()); - s2.push_back(olc); } + String str_s = String(olcs.begin(), olcs.end()); + s = ost::seq::CreateSequence(chain.GetName(), str_s); +} - String str_s1(s1.begin(), s1.end()); - String str_s2(s2.begin(), s2.end()); - ost::seq::SequenceHandle seq_s1 = ost::seq::CreateSequence("one", str_s1); - ost::seq::SequenceHandle seq_s2 = ost::seq::CreateSequence("two", str_s2); +TMAlignResult WrappedTMAlign(const ost::mol::ChainView& chain1, + const ost::mol::ChainView& chain2, + bool fast) { + + geom::Vec3List pos1; + ost::seq::SequenceHandle s1; + bool rna_mode1; + ExtractChainInfo(chain1, pos1, s1, rna_mode1); + + geom::Vec3List pos2; + ost::seq::SequenceHandle s2; + bool rna_mode2; + ExtractChainInfo(chain2, pos2, s2, rna_mode2); + + if(rna_mode1 != rna_mode2) { + throw ost::Error("Error in WrappedTMAlign: Cannot compare peptide with " + "RNA chains"); + } - return WrappedTMAlign(pos1, pos2, seq_s1, seq_s2, fast); + return WrappedTMAlign(pos1, pos2, s1, s2, fast, rna_mode1); } }} //ns diff --git a/modules/bindings/src/wrap_tmalign.hh b/modules/bindings/src/wrap_tmalign.hh index 4163d4644..fcd126e9e 100644 --- a/modules/bindings/src/wrap_tmalign.hh +++ b/modules/bindings/src/wrap_tmalign.hh @@ -56,7 +56,8 @@ TMAlignResult WrappedTMAlign(const geom::Vec3List& pos_one, const geom::Vec3List& pos_two, const ost::seq::SequenceHandle& seq1, 
const ost::seq::SequenceHandle& seq2, - bool fast = false); + bool fast = false, + bool rna = false); TMAlignResult WrappedTMAlign(const ost::mol::ChainView& ent1, const ost::mol::ChainView& ent2, -- GitLab