diff --git a/modules/mol/alg/doc/molalg.rst b/modules/mol/alg/doc/molalg.rst index a460828b3b82b976eb60afee71740bb7f4f6d2db..4f0ab63a4ab2e38d58ef8a453741d492d4f46a6d 100644 --- a/modules/mol/alg/doc/molalg.rst +++ b/modules/mol/alg/doc/molalg.rst @@ -920,6 +920,15 @@ Algorithms on Structures :return: The summed solvent accessibilty of each atom in *ent*. +.. method:: AssignSecStruct(ent) + + Assigns secondary structures to all residues based on hydrogen bond patterns + as described by DSSP. + + :param ent: Entity on which to assign secondary structures + :type ent: :class:`~ost.mol.EntityView`/ + :class:`~ost.mol.EntityHandle` + .. _traj-analysis: diff --git a/modules/mol/alg/pymod/CMakeLists.txt b/modules/mol/alg/pymod/CMakeLists.txt index 4244ac562e50bad6d9e362ca5d5109e4f134144a..fa2942575835fd70e480563148784918bad58122 100644 --- a/modules/mol/alg/pymod/CMakeLists.txt +++ b/modules/mol/alg/pymod/CMakeLists.txt @@ -6,6 +6,7 @@ set(OST_MOL_ALG_PYMOD_SOURCES export_structure_analysis.cc export_contact_overlap.cc export_accessibility.cc + export_sec_structure.cc ) set(OST_MOL_ALG_PYMOD_MODULES diff --git a/modules/mol/alg/pymod/export_sec_structure.cc b/modules/mol/alg/pymod/export_sec_structure.cc new file mode 100644 index 0000000000000000000000000000000000000000..6191b6921b026c3d6d7d4096fa4bb60b9bc5822a --- /dev/null +++ b/modules/mol/alg/pymod/export_sec_structure.cc @@ -0,0 +1,43 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2017 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#include <boost/python.hpp> + +#include <ost/mol/alg/sec_struct.hh> + +using namespace boost::python; + +namespace{ + +void AssignSecStructView(ost::mol::EntityView& view) { + ost::mol::alg::AssignSecStruct(view); +} + +void AssignSecStructHandle(ost::mol::EntityHandle& handle) { + ost::mol::alg::AssignSecStruct(handle); +} + +} // ns + +void export_sec_struct() { + def("AssignSecStruct", &AssignSecStructView, (arg("ent"))); + def("AssignSecStruct", &AssignSecStructHandle, (arg("ent"))); +} + diff --git a/modules/mol/alg/pymod/wrap_mol_alg.cc b/modules/mol/alg/pymod/wrap_mol_alg.cc index 00eb51cb35657fba1826eea45525f42fbfe7800a..9f6c7f850fcaa869be30b337b1c94c654e5d2927 100644 --- a/modules/mol/alg/pymod/wrap_mol_alg.cc +++ b/modules/mol/alg/pymod/wrap_mol_alg.cc @@ -41,6 +41,7 @@ void export_StructureAnalysis(); void export_Clash(); void export_contact_overlap(); void export_accessibility(); +void export_sec_struct(); #if OST_IMG_ENABLED void export_entity_to_density(); #endif @@ -111,6 +112,7 @@ BOOST_PYTHON_MODULE(_ost_mol_alg) export_Clash(); export_contact_overlap(); export_accessibility(); + export_sec_struct(); #if OST_IMG_ENABLED export_entity_to_density(); #endif diff --git a/modules/mol/alg/src/CMakeLists.txt b/modules/mol/alg/src/CMakeLists.txt index bd74affe8c31c2e1f246a3a1a88f34002f1208da..d473247e656e17639bc903ce900cfafb42cea83c 100644 --- a/modules/mol/alg/src/CMakeLists.txt +++ b/modules/mol/alg/src/CMakeLists.txt @@ -19,6 +19,7 @@ set(OST_MOL_ALG_HEADERS domain_find.hh similarity_matrix.hh accessibility.hh + sec_struct.hh ) set(OST_MOL_ALG_SOURCES @@ -41,6 +42,7 @@ set(OST_MOL_ALG_SOURCES domain_find.cc similarity_matrix.cc accessibility.cc + sec_struct.cc ) set(MOL_ALG_DEPS ost_mol ost_seq) diff --git a/modules/mol/alg/src/sec_struct.cc b/modules/mol/alg/src/sec_struct.cc new file mode 100644 index 0000000000000000000000000000000000000000..1687726d0f8503e5a49526dc5cd660e192b60b56 --- /dev/null +++ b/modules/mol/alg/src/sec_struct.cc @@ -0,0 +1,665 @@ +#include <ost/mol/alg/sec_struct.hh> + +#include <ost/mol/sec_structure.hh> + +#include <limits> +#include <iostream> + +namespace { + +struct ExtendedBridge { + + ExtendedBridge() { + // should be enough for most strands + strand_one.reserve(16); + strand_two.reserve(16); + } + + bool parallel; // antiparallel otherwise + std::vector<int> strand_one; + std::vector<int> strand_two; +}; + +bool Connected(const std::vector<int>& connected_to_next, + int start_idx, int length) { + if(start_idx < 0) return false; + if(start_idx + length > static_cast<int>(connected_to_next.size())) { + return false; + } + for(int i = 0; i < (length - 1); ++i) { + if(connected_to_next[start_idx + i] == 0) return false; + } + return true; +} + +bool AddIfValid(int idx_one, int idx_two, int start_idx, int end_idx, + const std::vector<int>& connected_to_next, + bool parallel, + std::vector<ExtendedBridge>& extended_bridges) { + + // this function assumes idx_one to be sorted in an increasing manner + // when called multiple times! + + // idx_one is expected to be in range [start_idx, end_idx] + // if idx_two is in the same range we can skip cases where + // idx_one > idx_two, since the interactions are symmetric and we would + // see every bridge twice. If idx_two is outside this range + // (e.g. when we only calculate the secondary structure for a stretch + // of a larger structure) we have to add it anyway. + + if(idx_two >= start_idx && idx_two <= end_idx && idx_two < idx_one) { + return false; + } + + // the abs in the following check is still needed for the case when + // idx_two is outside [start_idx, end_idx] + if(std::abs(idx_two - idx_one) >= 3 && + Connected(connected_to_next, idx_one - 1, 3) && + Connected(connected_to_next, idx_two - 1, 3)) { + + // it is a bridge! Let's see whether we can extend + // an existing bridge... + bool added = false; + + for(uint i = 0; i < extended_bridges.size(); ++i) { + + if(extended_bridges[i].parallel == parallel && + idx_one == extended_bridges[i].strand_one.back() + 1) { + + if(parallel && idx_two == extended_bridges[i].strand_two.back() + 1) { + extended_bridges[i].strand_one.push_back(idx_one); + extended_bridges[i].strand_two.push_back(idx_two); + added = true; + break; + } + + if(!parallel && idx_two == extended_bridges[i].strand_two.back() - 1) { + extended_bridges[i].strand_one.push_back(idx_one); + extended_bridges[i].strand_two.push_back(idx_two); + added = true; + break; + } + } + } + + if(!added) { + // haven't seen it... Lets add a new one! + extended_bridges.push_back(ExtendedBridge()); + extended_bridges.back().parallel = parallel; + extended_bridges.back().strand_one.push_back(idx_one); + extended_bridges.back().strand_two.push_back(idx_two); + } + return true; + } + return false; +} + +void ResolveBulges(const std::vector<int>& connected_to_next, + std::vector<ExtendedBridge>& extended_bridges) { + + for(uint i = 0; i < extended_bridges.size(); ++i) { + for(uint j = i + 1; j < extended_bridges.size(); ++j) { + + if(extended_bridges[i].parallel == extended_bridges[j].parallel) { + + int i_strand_one_begin = extended_bridges[i].strand_one[0]; + int i_strand_one_end = extended_bridges[i].strand_one.back(); + int i_strand_two_begin = extended_bridges[i].strand_two[0]; + int i_strand_two_end = extended_bridges[i].strand_two.back(); + int j_strand_one_begin = extended_bridges[j].strand_one[0]; + int j_strand_one_end = extended_bridges[j].strand_one.back(); + int j_strand_two_begin = extended_bridges[j].strand_two[0]; + int j_strand_two_end = extended_bridges[j].strand_two.back(); + + // first strand must not overlap + if(i_strand_one_end >= j_strand_one_begin && + i_strand_one_begin <= j_strand_one_end) { + continue; + } + + // both strands must be connected, the first strand is the + // same for parallel and anti-parallel bridges + int begin = std::min(i_strand_one_begin, j_strand_one_begin); + int end = std::max(i_strand_one_end, j_strand_one_end); + if(!Connected(connected_to_next, begin, end - begin)) continue; + + // the distances are dependent on parallel / anti-parallel + int d_one = -1; + int d_two = -1; + + if(extended_bridges[i].parallel) { + // check connectivity for second strand + begin = std::min(i_strand_two_begin, j_strand_two_begin); + end = std::max(i_strand_two_end, j_strand_two_end); + if(!Connected(connected_to_next, begin, end - begin)) continue; + + d_one = j_strand_one_begin - i_strand_one_end; + d_two = j_strand_two_begin - i_strand_two_end; + } + else { + // check connectivity for second strand + begin = std::min(i_strand_two_end, j_strand_two_end); + end = std::max(i_strand_two_begin, j_strand_two_begin); + if(!Connected(connected_to_next, begin, end - begin)) continue; + + d_one = j_strand_one_begin - i_strand_one_end; + d_two = i_strand_two_end - j_strand_two_begin; + } + + if(d_one < 0 || d_two < 0) continue; + + if((d_one < 6 && d_two < 3) || (d_one < 3 && d_two < 6)) { + + extended_bridges[i].strand_one.insert( + extended_bridges[i].strand_one.end(), + extended_bridges[j].strand_one.begin(), + extended_bridges[j].strand_one.end()); + extended_bridges[i].strand_two.insert( + extended_bridges[i].strand_two.end(), + extended_bridges[j].strand_two.begin(), + extended_bridges[j].strand_two.end()); + extended_bridges.erase(extended_bridges.begin() + j); + --j; + } + } + } + } +} + +void AssignBridges(int start_idx, int size, + const std::vector<int>& donor_for_one, + const std::vector<int>& donor_for_two, + const std::vector<int>& connected_to_next, + String& ss_assignment) { + + int full_size = donor_for_one.size(); + int end_idx = start_idx + size - 1; + + std::vector<ExtendedBridge> extended_bridges; + + // let's search for bridges + + std::vector<int> parallel_partners; + std::vector<int> antiparallel_partners; + + for(int i = std::max(0, start_idx - 5); + i < std::min(start_idx + size + 5, full_size); ++i) { + + for(int j = 0; j < full_size; ++j) { + + // check for parallel bridge + if(i > 0 && i < full_size - 1) { + if((donor_for_one[i+1] == j && donor_for_one[j] == i - 1) || + (donor_for_one[i+1] == j && donor_for_two[j] == i - 1) || + (donor_for_two[i+1] == j && donor_for_one[j] == i - 1) || + (donor_for_two[i+1] == j && donor_for_two[j] == i - 1)) { + if(AddIfValid(i,j,start_idx, end_idx, connected_to_next, + true, extended_bridges)) continue; + } + } + if(j > 0 && j < full_size - 1) { + if((donor_for_one[j+1] == i && donor_for_one[i] == j - 1) || + (donor_for_one[j+1] == i && donor_for_two[i] == j - 1) || + (donor_for_two[j+1] == i && donor_for_one[i] == j - 1) || + (donor_for_two[j+1] == i && donor_for_two[i] == j - 1)) { + if(AddIfValid(i,j,start_idx, end_idx, connected_to_next, + true, extended_bridges)) continue; + } + } + + // check for antiparallel bridge + if(i > 0 && i < full_size - 1 && j > 0 && j < full_size - 1) { + if((donor_for_one[j+1] == i - 1 && donor_for_one[i+1] == j - 1) || + (donor_for_one[j+1] == i - 1 && donor_for_two[i+1] == j - 1) || + (donor_for_two[j+1] == i - 1 && donor_for_one[i+1] == j - 1) || + (donor_for_two[j+1] == i - 1 && donor_for_two[i+1] == j - 1)) { + if(AddIfValid(i,j,start_idx, end_idx, connected_to_next, + false, extended_bridges)) continue; + } + } + if((donor_for_one[i] == j && donor_for_one[j] == i) || + (donor_for_one[i] == j && donor_for_two[j] == i) || + (donor_for_two[i] == j && donor_for_one[j] == i) || + (donor_for_two[i] == j && donor_for_two[j] == i)) { + AddIfValid(i,j,start_idx, end_idx, connected_to_next, + false, extended_bridges); + } + } + } + + ResolveBulges(connected_to_next, extended_bridges); + + for(uint i = 0; i < extended_bridges.size(); ++i) { + if(extended_bridges[i].strand_one.size() > 1) { + for(int j = extended_bridges[i].strand_one.front(); + j <= extended_bridges[i].strand_one.back(); ++j) { + int idx = j - start_idx; + if(idx >= 0 && idx < size) ss_assignment[idx] = 'E'; + } + if(extended_bridges[i].parallel) { + for(int j = extended_bridges[i].strand_two.front(); + j <= extended_bridges[i].strand_two.back(); ++j) { + int idx = j - start_idx; + if(idx >= 0 && idx < size) ss_assignment[idx] = 'E'; + } + } + else { + for(int j = extended_bridges[i].strand_two.back(); + j <= extended_bridges[i].strand_two.front(); ++j) { + int idx = j - start_idx; + if(idx >= 0 && idx < size) ss_assignment[idx] = 'E'; + } + } + } + else { + int idx = extended_bridges[i].strand_one[0] - start_idx; + if(idx >= 0 && idx < size && ss_assignment[idx] != 'E') { + ss_assignment[idx] = 'B'; + } + idx = extended_bridges[i].strand_two[0] - start_idx; + if(idx >= 0 && idx < size && ss_assignment[idx] != 'E') { + ss_assignment[idx] = 'B'; + } + } + } +} + +void AssignTurns(int start_idx, int size, + const std::vector<geom::Vec3>& ca_positions, + const std::vector<int>& donor_for_one, + const std::vector<int>& donor_for_two, + const std::vector<int>& connected_to_next, + String& ss_assignment) { + + int full_size = donor_for_one.size(); + + // in turns we have following numbering scheme + // 1: its the beginning of a turn + // 2: its in the middle of a turn + // 3: its the end of a turn + // 4: its both, start and end + + std::vector<std::vector<int> > turn_arrays(3); + turn_arrays[0].assign(full_size, 0); + turn_arrays[1].assign(full_size, 0); + turn_arrays[2].assign(full_size, 0); + std::vector<int>& turn_3 = turn_arrays[0]; + std::vector<int>& turn_4 = turn_arrays[1]; + std::vector<int>& turn_5 = turn_arrays[2]; + + // search for turns + for(int turn_type = 3; turn_type <= 5; ++turn_type) { + std::vector<int>& turn_array = turn_arrays[turn_type - 3]; + for(int i = std::max(0, start_idx - turn_type); + i < std::min(start_idx + size, full_size - turn_type); ++i) { + + if((donor_for_one[i + turn_type] == i || + donor_for_two[i + turn_type] == i) && + Connected(connected_to_next, i, turn_type)) { + + // last element is end of turn + turn_array[i + turn_type] = 3; + + // set all the stuff in between to 2 (except it already is a start...) + for(int j = i + 1; j < i + turn_type; ++j) { + if(turn_array[j] == 0) turn_array[j] = 2; + } + + // set first element to start (might be start AND end) + if(turn_array[i] == 3) turn_array[i] = 4; + else turn_array[i] = 1; + } + } + } + + // to fill in all the turns in the relevant range + // its easier to work in a separate array and then fill everything + // into ss_assignment in the end to avoid indexing nightmare... + // + // coding: + // 0: C + // 1: T + // 2: S + // 3: G + // 4: H + // 5: I + // 6: other stuff (all kind of bridges) + std::vector<int> turn_array(full_size, 0); + + // first map the "other stuff" from the ss_assignment we already have + for(int i = 0; i < size; ++i) { + if(ss_assignment[i] != 'C') { + turn_array[start_idx + i] = 6; + } + } + + // do H + for(int i = std::max(start_idx - 3, 1); i < std::min(start_idx + size, full_size - 3); ++i) { + if((turn_4[i] == 1 || turn_4[i] == 4) && + (turn_4[i - 1] == 1 || turn_4[i - 1] == 4)) { + turn_array[i] = 4; + turn_array[i+1] = 4; + turn_array[i+2] = 4; + turn_array[i+3] = 4; + } + } + + // do G + for(int i = std::max(start_idx - 2, 1); i < std::min(start_idx + size, full_size - 2); ++i) { + if((turn_3[i] == 1 || turn_3[i] == 4) && + (turn_3[i - 1] == 1 || turn_3[i - 1] == 4)) { + // check whether we can still add the turn + if((turn_array[i] == 0 || turn_array[i] == 3) && + (turn_array[i+1] == 0 || turn_array[i+1] == 3) && + (turn_array[i+2] == 0 || turn_array[i+2] == 3)) { + turn_array[i] = 3; + turn_array[i+1] = 3; + turn_array[i+2] = 3; + } + } + } + + // do I + for(int i = std::max(start_idx - 4, 1); i < std::min(start_idx + size, full_size - 4); ++i) { + if((turn_5[i] == 1 || turn_5[i] == 4) && + (turn_5[i - 1] == 1 || turn_5[i - 1] == 4)) { + + // check whether we can still add the turn + bool add_it = true; + for(int j = 0; j < 5; ++j) { + if(!(turn_array[i+j] == 0 || + turn_array[i+j] == 5 || + turn_array[i+j] == 4)) { // we prefer pi helices!! + // (default in dssp) + add_it = false; + break; + } + } + + if(add_it) { + turn_array[i] = 5; + turn_array[i+1] = 5; + turn_array[i+2] = 5; + turn_array[i+3] = 5; + turn_array[i+4] = 5; + } + } + } + + // do turns(T) + + // do turn_3 + for(int i = start_idx; i < start_idx + size; ++i) { + if(turn_array[i] == 0) { + if((i > 0 && (turn_3[i - 1] == 1 || turn_3[i - 1] == 4)) || + (i > 1 && (turn_3[i - 2] == 1 || turn_3[i - 2] == 4))) { + turn_array[i] = 1; + } + } + } + + // do turn_4 + for(int i = start_idx; i < start_idx + size; ++i) { + if(turn_array[i] == 0) { + if((i > 0 && (turn_4[i - 1] == 1 || turn_4[i - 1] == 4)) || + (i > 1 && (turn_4[i - 2] == 1 || turn_4[i - 2] == 4)) || + (i > 2 && (turn_4[i - 3] == 1 || turn_4[i - 3] == 4))) { + turn_array[i] = 1; + } + } + } + + // do turn_5 + for(int i = start_idx; i < start_idx + size; ++i) { + if(turn_array[i] == 0) { + if((i > 0 && (turn_5[i - 1] == 1 || turn_5[i - 1] == 4)) || + (i > 1 && (turn_5[i - 2] == 1 || turn_5[i - 2] == 4)) || + (i > 2 && (turn_5[i - 3] == 1 || turn_5[i - 3] == 4)) || + (i > 3 && (turn_5[i - 4] == 1 || turn_5[i - 4] == 4))) { + turn_array[i] = 1; + } + } + } + + // do bends (S) + Real angle; + for(int i = std::max(start_idx, 2); + i < std::min(start_idx + size, full_size - 2); ++i){ + if(turn_array[i] == 0) { + angle = geom::Angle(ca_positions[i]-ca_positions[i-2], + ca_positions[i+2]-ca_positions[i]); + if(angle > 1.2217 && + Connected(connected_to_next, i - 2, 5)) { + turn_array[i] = 2; + } + } + } + + for(int i = start_idx; i < start_idx + size; ++i) { + int idx = i - start_idx; + if(turn_array[i] == 1) ss_assignment[idx] = 'T'; + else if(turn_array[i] == 2) ss_assignment[idx] = 'S'; + else if(turn_array[i] == 3) ss_assignment[idx] = 'G'; + else if(turn_array[i] == 4) ss_assignment[idx] = 'H'; + else if(turn_array[i] == 5) ss_assignment[idx] = 'I'; + } +} + +} // anon ns + +namespace ost{ namespace mol{ namespace alg{ + +String RawEstimateSS(const std::vector<geom::Vec3>& ca_positions, + int start_idx, int size, + const std::vector<int>& donor_for_one, + const std::vector<int>& donor_for_two, + const std::vector<int>& connected_to_next) { + + if(size < 3){ + throw ost::Error("Size of input is too small!in dssp calculation"); + } + + int full_size = ca_positions.size(); + + if(start_idx + size > full_size) { + throw ost::Error("Selected stretch is not fully covered by provided data in dssp calculation!"); + } + + if(full_size != static_cast<int>(donor_for_one.size())){ + throw ost::Error("Position and donor data are inconsistent for dssp calculation!"); + } + + if(full_size != static_cast<int>(donor_for_two.size())){ + throw ost::Error("Position and donor data are inconsistent for dssp calculation!"); + } + + String ss_assignment; + ss_assignment.assign(size,'C'); + + AssignBridges(start_idx, size, donor_for_one, donor_for_two, + connected_to_next, ss_assignment); + + AssignTurns(start_idx, size, ca_positions, + donor_for_one, donor_for_two, + connected_to_next, ss_assignment); + + return ss_assignment; +} + +void PrepareSSData(const ost::mol::ResidueViewList& res_list, + std::vector<int>& res_indices, + std::vector<geom::Vec3>& ca_positions, + std::vector<int>& donor_for_one, + std::vector<int>& donor_for_two, + std::vector<int>& connected_to_next) { + + res_indices.clear(); + ca_positions.clear(); + donor_for_one.clear(); + donor_for_two.clear(); + connected_to_next.clear(); + + std::vector<geom::Vec3> n_positions; + std::vector<geom::Vec3> c_positions; + std::vector<geom::Vec3> o_positions; + + int total_num_residues = res_list.size(); + n_positions.reserve(total_num_residues); + ca_positions.reserve(total_num_residues); + c_positions.reserve(total_num_residues); + o_positions.reserve(total_num_residues); + res_indices.reserve(total_num_residues); + connected_to_next.reserve(total_num_residues); + + for(int res_idx = 0; res_idx < total_num_residues; ++res_idx) { + + const ost::mol::ResidueView& res = res_list[res_idx]; + + // add information if all atoms are present + ost::mol::AtomView n = res.FindAtom("N"); + ost::mol::AtomView ca = res.FindAtom("CA"); + ost::mol::AtomView c = res.FindAtom("C"); + ost::mol::AtomView o = res.FindAtom("O"); + + if(n.IsValid() && ca.IsValid() && c.IsValid() && o.IsValid()) { + n_positions.push_back(n.GetPos()); + ca_positions.push_back(ca.GetPos()); + c_positions.push_back(c.GetPos()); + o_positions.push_back(o.GetPos()); + res_indices.push_back(res_idx); + } + } + + int num_residues = res_indices.size(); + + for(int i = 0; i < num_residues - 1; ++i) { + + if(res_list[res_indices[i]].GetChain() == + res_list[res_indices[i+1]].GetChain() && + geom::Distance(c_positions[i], n_positions[i+1]) <= 2.5) { + connected_to_next.push_back(1); + } + else { + connected_to_next.push_back(0); + } + } + + // the last residue needs to be handled as well... + connected_to_next.push_back(0); + + // estimate the hydrogen positions and also keep track which ones are valid + // (e.g. in case of chain breaks / prolines...) + // please note, that DSSP simply uses the directional vector given by + // carbon and oxygen of the previous residue to determine the hydrogen + // position + std::vector<geom::Vec3> h_positions(num_residues); + std::vector<int> valid_hydrogens(num_residues, 0); + + for(int i = 0; i < num_residues - 1; ++i) { + if(connected_to_next[i] && res_list[res_indices[i+1]].GetName() != "PRO") { + geom::Vec3 dir_vec = geom::Normalize(c_positions[i] - o_positions[i]); + h_positions[i+1] = n_positions[i+1] + dir_vec; + valid_hydrogens[i+1] = 1; + } + } + + std::pair<Real, Real> max_e_pair = + std::make_pair(std::numeric_limits<Real>::max(), + std::numeric_limits<Real>::max()); + + std::vector<std::pair<Real,Real> > donor_energies(num_residues, max_e_pair); + std::vector<std::pair<int,int> > acceptors(num_residues, std::make_pair(-1, -1)); + Real e; + + for(int i = 0; i < num_residues - 1; ++i){ + for(int j = i + 1; j < num_residues; ++j){ + if(geom::Length2(ca_positions[i] - ca_positions[j]) < 81.0){ + + //i as acceptor, j as donor + if(j > i + 1 && valid_hydrogens[j]){ + e = DSSPHBondEnergy(h_positions[j], n_positions[j], + c_positions[i], o_positions[i]); + // stupid rounding to be consistent with dssp + e = std::floor(e * 1000 + 0.5) * 0.001; + + if(e < donor_energies[j].first) { + donor_energies[j].second = donor_energies[j].first; + acceptors[j].second = acceptors[j].first; + donor_energies[j].first = e; + acceptors[j].first = i; + } + else if(e < donor_energies[j].second) { + donor_energies[j].second = e; + acceptors[j].second = i; + } + } + + //i as donor, j as acceptor + if(valid_hydrogens[i]){ + + e = DSSPHBondEnergy(h_positions[i], n_positions[i], + c_positions[j], o_positions[j]); + // stupid rounding to be consistent with dssp + e = std::floor(e * 1000 + 0.5) * 0.001; + + if(e < donor_energies[i].first) { + donor_energies[i].second = donor_energies[i].first; + acceptors[i].second = acceptors[i].first; + donor_energies[i].first = e; + acceptors[i].first = j; + } + else if(e < donor_energies[i].second) { + donor_energies[i].second = e; + acceptors[i].second = j; + } + } + } + } + } + + donor_for_one.assign(num_residues, -1); + donor_for_two.assign(num_residues, -1); + + for(int i = 0; i < num_residues; ++i) { + if(donor_energies[i].first < -0.5) { + donor_for_one[i] = acceptors[i].first; + } + if(donor_energies[i].second < -0.5) { + donor_for_two[i] = acceptors[i].second; + } + } +} + +void AssignSecStruct(ost::mol::EntityView& ent) { + + ost::mol::ResidueViewList res_list = ent.GetResidueList(); + + for(uint res_idx = 0; res_idx < res_list.size(); ++res_idx) { + // set all residues to coil to start with + res_list[res_idx].SetSecStructure(ost::mol::SecStructure('C')); + } + + std::vector<int> res_indices; + std::vector<geom::Vec3> ca_positions; + std::vector<int> donor_for_one; + std::vector<int> donor_for_two; + std::vector<int> connected_to_next; + + PrepareSSData(res_list, res_indices, ca_positions, + donor_for_one, donor_for_two, connected_to_next); + + String ss = RawEstimateSS(ca_positions, 0, ca_positions.size(), + donor_for_one, donor_for_two, + connected_to_next); + + for(uint i = 0; i < ca_positions.size(); ++i) { + res_list[res_indices[i]].SetSecStructure(ost::mol::SecStructure(ss[i])); + } +} + +void AssignSecStruct(ost::mol::EntityHandle& ent) { + ost::mol::EntityView view = ent.CreateFullView(); + AssignSecStruct(view); +} + +}}} //ns diff --git a/modules/mol/alg/src/sec_struct.hh b/modules/mol/alg/src/sec_struct.hh new file mode 100644 index 0000000000000000000000000000000000000000..0fca5e85f9185dd222b32c86a69fa071467e3e14 --- /dev/null +++ b/modules/mol/alg/src/sec_struct.hh @@ -0,0 +1,81 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2017 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ +#ifndef OST_SEC_STRUCT_HH +#define OST_SEC_STRUCT_HH + +#include <vector> + +#include <ost/base.hh> +#include <ost/geom/geom.hh> +#include <ost/message.hh> +#include <ost/mol/mol.hh> + +namespace ost { namespace mol{ namespace alg{ + +inline Real DSSPHBondEnergy(const geom::Vec3& h_pos, const geom::Vec3& n_pos, + const geom::Vec3& c_pos, const geom::Vec3& o_pos) { + Real on = 1.0/geom::Distance(o_pos,n_pos); + Real ch = 1.0/geom::Distance(c_pos,h_pos); + Real oh = 1.0/geom::Distance(o_pos,h_pos); + Real cn = 1.0/geom::Distance(c_pos,n_pos); + return 27.888 * (on+ch-oh-cn); +} + +// Raw estimation of secondary structure +// +// This function is not intended for Python export, since the input has to be +// prepared carefully. It basically estimates the secondary structure of a +// stretch of amino acids based on the hydrogen bond pattern as described for +// the dssp tool. +// +// To define the hydrogen bonds you need to provide the vectors donor_for_one +// and donor_for two. The index of an acceptor residue appears in one of the +// two vectors if the corresponding hbond energy is < -0.5 +// (be aware of prolines that can't be donors!). +// There are two of those vectors, because for every residue we store the two +// lowest energy acceptors. If there is no acceptor available, the value +// at this position must be -1. +// The connected_to_next contains zeros and ones that defines, whether there +// is a peptide bond towards the next residue and the ca_positions vector +// is self explaining. +// As an additional feature you can also provide the according data for the +// full structure but only estimate the secondary structure for a small +// stretch, that gets defined by start_idx and size. +// For an example usage have a look at the AssignSecStruct functions. + +String RawEstimateSS(const std::vector<geom::Vec3>& ca_positions, + int start_idx, int size, + const std::vector<int>& donor_for_one, + const std::vector<int>& donor_for_two, + const std::vector<int>& connected_to_next); + +void PrepareSSData(const ost::mol::ResidueViewList& res_list, + std::vector<int>& res_indices, + std::vector<geom::Vec3>& ca_positions, + std::vector<int>& donor_for_one, + std::vector<int>& donor_for_two, + std::vector<int>& connected_to_next); + +void AssignSecStruct(ost::mol::EntityView& ent); + +void AssignSecStruct(ost::mol::EntityHandle& ent); + +}}} //ns + +#endif diff --git a/modules/mol/alg/tests/CMakeLists.txt b/modules/mol/alg/tests/CMakeLists.txt index 10c5fbc2319956ce21cc8d319b393b89a17fb86f..0002d71492e639d21d456a93f94950db59aa1afe 100644 --- a/modules/mol/alg/tests/CMakeLists.txt +++ b/modules/mol/alg/tests/CMakeLists.txt @@ -2,15 +2,16 @@ set(OST_MOL_ALG_UNIT_TESTS test_superposition.cc tests.cc test_consistency_checks.cc + test_partial_sec_struct_assignment.cc test_pdbize.py test_convenient_superpose.py test_hbond.py test_accessibility.py + test_sec_struct.py ) if (COMPOUND_LIB) list(APPEND OST_MOL_ALG_UNIT_TESTS test_qsscoring.py) endif() -ost_unittest(MODULE mol_alg SOURCES "${OST_MOL_ALG_UNIT_TESTS}") - +ost_unittest(MODULE mol_alg SOURCES "${OST_MOL_ALG_UNIT_TESTS}" LINK ost_io) diff --git a/modules/mol/alg/tests/test_partial_sec_struct_assignment.cc b/modules/mol/alg/tests/test_partial_sec_struct_assignment.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6a9915608edd7858af045686a93e8df8b740369 --- /dev/null +++ b/modules/mol/alg/tests/test_partial_sec_struct_assignment.cc @@ -0,0 +1,88 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2017 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + + +#include <ost/mol/alg/sec_struct.hh> +#define BOOST_TEST_DYN_LINK +#include <boost/test/unit_test.hpp> +#include <boost/test/auto_unit_test.hpp> +#include <ost/mol/mol.hh> +#include <ost/io/mol/pdb_reader.hh> +#include <ost/mol/builder.hh> +#include <ost/message.hh> + + + +BOOST_AUTO_TEST_SUITE( mol_alg ); + +BOOST_AUTO_TEST_CASE(partial_sec_struct_assignment) { + + // load a test structure + String filepath = "testfiles/1a0s.pdb"; + ost::io::PDBReader reader(filepath, ost::io::IOProfile()); + ost::mol::EntityHandle test_ent = ost::mol::CreateEntity(); + reader.Import(test_ent); + + ost::mol::EntityView view = test_ent.CreateFullView(); + ost::mol::ResidueViewList res_list = view.GetResidueList(); + + std::vector<int> res_indices; + std::vector<geom::Vec3> ca_positions; + std::vector<int> donor_for_one; + std::vector<int> donor_for_two; + std::vector<int> connected_to_next; + + ost::mol::alg::PrepareSSData(res_list, res_indices, ca_positions, + donor_for_one, donor_for_two, + connected_to_next); + + String full_pred = ost::mol::alg::RawEstimateSS(ca_positions, + 0, ca_positions.size(), + donor_for_one, donor_for_two, + connected_to_next); + + // select some start indices to test + int max_num = ca_positions.size(); + std::vector<int> test_locations; + test_locations.push_back(max_num/10); + test_locations.push_back((max_num/10)*2); + test_locations.push_back((max_num/10)*3); + test_locations.push_back((max_num/10)*4); + test_locations.push_back((max_num/10)*5); + test_locations.push_back((max_num/10)*6); + test_locations.push_back((max_num/10)*7); + test_locations.push_back((max_num/10)*8); + test_locations.push_back((max_num/10)*9); + int stretch_length = std::min(12, max_num - test_locations.back()); + + for(uint i = 0; i < test_locations.size(); ++i) { + String expected_ss = full_pred.substr(test_locations[i], stretch_length); + String calculated_ss = ost::mol::alg::RawEstimateSS(ca_positions, + test_locations[i], + stretch_length, + donor_for_one, + donor_for_two, + connected_to_next); + + BOOST_CHECK(expected_ss == calculated_ss); + } + +} + +BOOST_AUTO_TEST_SUITE_END(); diff --git a/modules/mol/alg/tests/test_sec_struct.py b/modules/mol/alg/tests/test_sec_struct.py new file mode 100644 index 0000000000000000000000000000000000000000..69d1a5fc2b439fce1c9211e1f975e27154361ba8 --- /dev/null +++ b/modules/mol/alg/tests/test_sec_struct.py @@ -0,0 +1,34 @@ +from ost import io, mol, settings +import unittest +import os +from ost.bindings import dssp + + +class TestSecStruct(unittest.TestCase): + + def testSecStruct(self): + + # unit test only makes sense, when a dssp binary is around + dssp_path = None + try: + dssp_path = settings.locate("dssp") + except: + try: + dssp_path = settings.locate("mkdssp") + except: + print "Could not find dssp, could not compare sec struct assignment..." + + dssp_ent = io.LoadPDB(os.path.join("testfiles", "1a0s.pdb")) + ost_ent = io.LoadPDB(os.path.join("testfiles", "1a0s.pdb")) + + dssp.AssignDSSP(dssp_ent, dssp_bin = dssp_path) + mol.alg.AssignSecStruct(ost_ent) + + for a, b in zip(dssp_ent.residues, ost_ent.residues): + self.assertTrue(str(a.GetSecStructure()) == str(b.GetSecStructure())) + + +if __name__ == "__main__": + + from ost import testutils + testutils.RunTests()