diff --git a/modelling/doc/algorithms.rst b/modelling/doc/algorithms.rst index e38de593b9bbb75c228287fd22bdf395285538ba..439fd2d9009650a24933a6ccaebb0a6d567c8528 100644 --- a/modelling/doc/algorithms.rst +++ b/modelling/doc/algorithms.rst @@ -5,81 +5,6 @@ Modelling Algorithms A collection of algorithms that can be useful in modelling -El Enumerador --------------------------------------------------------------------------------- - -In the modelling process you typically have to generate loops at several -locations in the model. They're potentially close enough to interact. If you -simply iterate over to locations and set the loop at one location after the -other, the scorer cannot "see" loops at locations that will be processed later -on. A naive approach to tackle this problem would be to generate an enumeration -with all possible loop combinations: - -For every possible combination of loops: - -* Set the according loop from every location in the environment -* Calculate the scores from all loops given that environment -* Keep the best overall score - -Assuming a small modelling problem with a total of 5 locations with 10 -candidates each, this already gives 100000 set loop scoring environment calls -and 100000 loop scoring calls. ElEnumerador tries to solve the problem in -a more clever approach. It first generates an interaction graph of which -locations actually have candidates that interact (If any of the candidates -have a CA distances closer than a specified threshold). -This graph gets then decomposed into a minimal width tree and solved in a -similar manner as in the sidechain modelling problem. -This typically massively reduces the problem size by solving -small subproblems and allows to tackle larger problems in a feasible amount -of computing time. Despite massively reducing the amount of loop scoring -and set environment calls, we still need to define an upper bound of computation -time. After building the minimal width tree, one can already determine the -exact number of loop scoring calls required to solve that tree. This measure -turnes out to be an accurate measure of overall runtime, since the scoring -clearly is the dominant operation. ElEnumerador checks, whether this number -is above a user defined threshold. If yes, the 20% worst scoring loop candidates -from the location with the largest amount of loop candidates gets removed and -a new tree gets generated. Please note, that this pruning step only -considers the already set environment and no pairwise interactions to other -loops. A global optimum is therefore not guaranteed, but still very likely. - -.. method:: ElEnumerador(mhandle, gaps, loop_candidates, all_atom_scores,\ - [max_complexity=1000000, distance_thresh=10.0]) - - Searches the loops in **loop_candidates** with their exact locations defined - in **gaps** that optimize the score given a scoring environment built from the - current model attached to **mhandle** and an internally loaded default scorer - with default weights. - The scores can either be backbone only or also consider all atom scores if - the according flag is set to true. Please note, that this has a massive impact - on runtime! The algorithm performs pruning iterations - until every connected component in the interaction graph can be solved with - the number of required loop scoring calls being lower than **max_complexity**. - - :param mhandle: Handle with valid model attached. This handle is treated - as const - :param gaps: A list of gaps specifying the loop locations - :param loop_candidates: A list of :class:`LoopCandidates` objects defining - loops at different locations. - :param all_atom_scores: Whether to use all atom scores or go for backbone only - :param max_complexity: Pruning gets performed until every connected component - in the interaction graph can be solved with scoring - steps below this threshold. - :param distance_thresh: If any loop of two LoopCandidates objects has a - pairwise CA distance below this thresh, they're - considered to interact. Use with care, this has - an impact on the problem complexity if increased. - - :type mhandle: :class:`ModellingHandle` - :type gaps: :class:`StructuralGapList` - :type loop_candidates: :class:`list` - :type all_atom_scores: :class:`bool` - :type max_complexity: :class:`int` - :type distance_thresh: :class:`float` - - :returns: A :class:`list` of :class:`int` specifying the optimal loop in every - item of **loop_candidates** - Rigid Blocks -------------------------------------------------------------------------------- diff --git a/modelling/pymod/CMakeLists.txt b/modelling/pymod/CMakeLists.txt index 16bed15943d08107aee1bbf4161823170bf48707..3f6e6f13be2079f47e8a11c1ffb979971cfb5534 100644 --- a/modelling/pymod/CMakeLists.txt +++ b/modelling/pymod/CMakeLists.txt @@ -9,7 +9,6 @@ set(MODELLING_CPP export_rigid_blocks.cc export_score_container.cc export_scoring_weights.cc - export_el_enumerador.cc export_sidechain_reconstructor.cc wrap_modelling.cc ) diff --git a/modelling/pymod/export_el_enumerador.cc b/modelling/pymod/export_el_enumerador.cc deleted file mode 100644 index 5ed8799af9cbd28c6d2c15f0154f8b074de99bf1..0000000000000000000000000000000000000000 --- a/modelling/pymod/export_el_enumerador.cc +++ /dev/null @@ -1,44 +0,0 @@ -#include <boost/python.hpp> -#include <promod3/core/export_helper.hh> -#include <promod3/modelling/el_enumerador.hh> - -using namespace ost; -using namespace boost::python; -using namespace promod3::modelling; - -namespace{ - -boost::python::list WrapEnumerador(ModellingHandle& mhandle, - const StructuralGapList& gaps, - const boost::python::list& loop_candidates, - bool all_atom, - uint max_complexity, - Real distance_thresh){ - - std::vector<promod3::modelling::LoopCandidatesPtr> v_loop_candidates; - promod3::core::ConvertListToVector(loop_candidates, v_loop_candidates); - - std::vector<uint> v_result = - promod3::modelling::ElEnumerador(mhandle, gaps, v_loop_candidates, - all_atom, max_complexity, - distance_thresh); - - boost::python::list result; - promod3::core::AppendVectorToList(v_result, result); - - return result; -} - -} - - -void export_el_enumerador(){ - - def("ElEnumerador", &WrapEnumerador, (arg("mhandle"), - arg("gaps"), - arg("loop_candidates"), - arg("all_atom_scores") = false, - arg("max_complexity")=1000000, - arg("distance_thresh")=10.0)); - -} \ No newline at end of file diff --git a/modelling/pymod/wrap_modelling.cc b/modelling/pymod/wrap_modelling.cc index 69f69a6b0bb621f9ec87412755c0742eb14c5efc..72125d290d12735a145389f9d47b30a9eca7366e 100644 --- a/modelling/pymod/wrap_modelling.cc +++ b/modelling/pymod/wrap_modelling.cc @@ -11,7 +11,6 @@ void export_rigid_blocks(); void export_score_container(); void export_scoring_weights(); void export_SidechainReconstructor(); -void export_el_enumerador(); BOOST_PYTHON_MODULE(_modelling) { @@ -26,5 +25,4 @@ BOOST_PYTHON_MODULE(_modelling) export_score_container(); export_scoring_weights(); export_SidechainReconstructor(); - export_el_enumerador(); } diff --git a/modelling/src/CMakeLists.txt b/modelling/src/CMakeLists.txt index 6dd633e24927c4b93dd6232df2ce07705a0d6203..d1a8740f55374585dcd21efdb9d36f8e764c7348 100644 --- a/modelling/src/CMakeLists.txt +++ b/modelling/src/CMakeLists.txt @@ -18,7 +18,6 @@ set(MODELLING_SOURCES scoring_weights.cc sidechain_reconstructor.cc sidechain_env_listener.cc - el_enumerador.cc ) set(MODELLING_HEADERS @@ -41,7 +40,6 @@ set(MODELLING_HEADERS scoring_weights.hh sidechain_reconstructor.hh sidechain_env_listener.hh - el_enumerador.hh ) module(NAME modelling diff --git a/modelling/src/el_enumerador.cc b/modelling/src/el_enumerador.cc deleted file mode 100644 index b6a624846770fa64238c126dec5e26739c76b448..0000000000000000000000000000000000000000 --- a/modelling/src/el_enumerador.cc +++ /dev/null @@ -1,1169 +0,0 @@ -#include <promod3/modelling/el_enumerador.hh> -#include <promod3/scoring/scoring_object_loader.hh> -#include <promod3/modelling/sidechain_reconstructor.hh> -#include <promod3/core/enumerator.hh> -#include <promod3/core/graph.hh> -#include <promod3/core/tree.hh> -#include <promod3/modelling/scoring_weights.hh> -#include <limits> -#include <numeric> -#include <algorithm> - -namespace{ - -typedef promod3::core::TreeBag<int> TreeBag; - -void CheckInput(const promod3::modelling::ModellingHandle& mhandle, - const promod3::modelling::StructuralGapList& gaps, - const std::vector<promod3::modelling::LoopCandidatesPtr>& - loop_candidates){ - - //following things have to be checked: - // - gaps and loop_candidates must be consistent - // - all gaps must be associated with the model in mhandle - // - the candidates must not be overlapping - // - the sequences of the gaps must be consistent with the seqres - - if(gaps.size() != loop_candidates.size()){ - throw promod3::Error("Require same number of gaps and loop candidates!"); - } - - for(uint i = 0; i < gaps.size(); ++i){ - if(gaps[i].GetFullSeq() != loop_candidates[i]->GetSequence()){ - String err = "Gaps and loop candidates must be consistent in sequence!"; - throw promod3::Error(err); - } - } - - for(uint i = 0; i < gaps.size(); ++i){ - if(gaps[i].GetChain().GetEntity() != mhandle.model){ - String err = "All gaps must be associated with the model in mhandle!"; - throw promod3::Error(err); - } - } - - std::vector<uint> start_resnums; - std::vector<uint> end_resnums; - std::vector<uint> chain_indices; - - for(uint i = 0; i < gaps.size(); ++i){ - uint start_num = 1; - uint chain_idx = gaps[i].GetChainIndex(); - uint end_num = mhandle.seqres[chain_idx].GetLength(); - - if(!gaps[i].IsNTerminal()){ - start_num = gaps[i].before.GetNumber().GetNum(); - } - - if(!gaps[i].IsCTerminal()){ - end_num = gaps[i].after.GetNumber().GetNum(); - } - start_resnums.push_back(start_num); - end_resnums.push_back(end_num); - chain_indices.push_back(chain_idx); - } - - uint outer_start_resnum, outer_end_resnum; - uint inner_start_resnum, inner_end_resnum; - for(uint i = 0; i < loop_candidates.size(); ++i){ - outer_start_resnum = start_resnums[i]; - outer_end_resnum = end_resnums[i]; - for(uint j = 0; j < loop_candidates.size(); ++j){ - - if(chain_indices[i] != chain_indices[j]) continue; - if(i == j) continue; - - - inner_start_resnum = start_resnums[j]; - inner_end_resnum = end_resnums[j]; - - //check if one of the outer resnums lies within the inner resnums - if((outer_start_resnum > inner_start_resnum && - outer_start_resnum < inner_end_resnum) || - (outer_end_resnum > inner_start_resnum && - outer_end_resnum < inner_end_resnum)){ - throw promod3::Error("LoopCandidates must not overlap!"); - } - //check whether the outer resnums enclose the inner resnums - if(outer_start_resnum <= inner_start_resnum && - outer_end_resnum >= inner_end_resnum){ - throw promod3::Error("LoopCandidates must not overlap!"); - } - } - } - - - for(uint i = 0; i < gaps.size(); ++i){ - String gap_string = gaps[i].GetFullSeq(); - String seqres_string = mhandle.seqres[chain_indices[i]].GetString(); - uint start = start_resnums[i] - 1; - size_t length = gap_string.size(); - String seqres_substring = seqres_string.substr(start, length); - if(gap_string != seqres_substring) { - throw promod3::Error("Sequence of gaps must be consistent with sequence " - "in ModellingHandle!"); - } - } - - - - - - // check whether we have an overflow with the number of candidates - // (super unlikely) - for(uint i = 0; i < loop_candidates.size(); ++i){ - if(loop_candidates[i]->size() > - std::numeric_limits<unsigned short>::max()){ - std::stringstream ss; - ss << "Due to technical reasons the maximum number of loops per "; - ss << "position is limited to "; - ss << std::numeric_limits<unsigned short>::max(); - ss << ". Observed a location with "; - ss << loop_candidates[i]->size(); - ss << " loops. You have to reduce the input size!"; - throw promod3::Error(ss.str()); - } - } -} - -class ScorerContainer{ - -public: - -ScorerContainer() { } - -ScorerContainer(const promod3::modelling::ModellingHandle& mhandle, bool all_atom){ - - all_atom_ = all_atom; - - bb_env_ = promod3::scoring::BackboneScoreEnvPtr( - new promod3::scoring::BackboneScoreEnv(mhandle.seqres)); - bb_env_->SetInitialEnvironment(mhandle.model); - bb_scorer_ = promod3::scoring::LoadDefaultBackboneOverallScorer(); - bb_scorer_->AttachEnvironment(*bb_env_); - - if(all_atom_){ - - bb_weights_ = - promod3::modelling::ScoringWeights::GetInstance().GetBackboneWeights(false, - true); - aa_weights_ = - promod3::modelling::ScoringWeights::GetInstance().GetAllAtomWeights(false); - - aa_sc_env_ = promod3::loop::AllAtomEnvPtr( - new promod3::loop::AllAtomEnv(mhandle.seqres)); - aa_sc_env_->SetInitialEnvironment(mhandle.model); - aa_sc_rec_ = promod3::modelling::SidechainReconstructorPtr( - new promod3::modelling::SidechainReconstructor(true, true, - true, 18.0)); - aa_sc_rec_->AttachEnvironment(*aa_sc_env_, false); - aa_scorer_env_ = promod3::loop::AllAtomEnvPtr( - new promod3::loop::AllAtomEnv(mhandle.seqres)); - aa_scorer_env_->SetInitialEnvironment(mhandle.model); - aa_scorer_ = promod3::scoring::LoadDefaultAllAtomOverallScorer(); - aa_scorer_->AttachEnvironment(*aa_scorer_env_); - } - - else{ - bb_weights_ = - promod3::modelling::ScoringWeights::GetInstance().GetBackboneWeights(false, - false); - } -} - -void SetEnvironment(const promod3::loop::BackboneList& bb_list, - uint start_resnum, uint chain_idx){ - - bb_env_->SetEnvironment(bb_list, start_resnum, chain_idx); - - if(all_atom_){ - // we only set the sidechain environment at this point... - // at every scoring step, all sidechains get reconstructured and the - // reconstructed sidechains get set in the scoring environment anyway - aa_sc_env_->SetEnvironment(bb_list, start_resnum, chain_idx); - } -} - -Real GetScore(const promod3::loop::BackboneList& bb_list, - uint start_resnum, uint chain_idx) const{ - - Real score = 0.0; - score += bb_scorer_->CalculateLinearCombination(bb_weights_, - bb_list, - start_resnum, - chain_idx); - if(all_atom_){ - promod3::modelling::SidechainReconstructionDataPtr sc_rec_data = - aa_sc_rec_->Reconstruct(start_resnum, bb_list.size(), chain_idx); - aa_scorer_env_->SetEnvironment(*(sc_rec_data->env_pos)); - score += aa_scorer_->CalculateLinearCombination(aa_weights_, - start_resnum, - bb_list.size(), - chain_idx); - } - return score; -} - -Real GetScore(const std::vector<promod3::loop::BackboneList*>& bb_list_ptrs, - const std::vector<Real>& weight_vector, - const std::vector<uint>& start_resnums, - const std::vector<uint>& chain_indices) { - - uint num_bb_list = bb_list_ptrs.size(); - - std::vector<Real> scores(num_bb_list, 0.0); - for(uint i = 0; i < num_bb_list; ++i){ - scores[i] += bb_scorer_->CalculateLinearCombination(bb_weights_, - *(bb_list_ptrs[i]), - start_resnums[i], - chain_indices[i]); - } - - if(all_atom_){ - // reconstruct the sidechains only once - // it gets assumed, that the appropirate environment has already been set! - std::vector<uint> bb_list_sizes(bb_list_ptrs.size()); - for(uint i = 0; i < num_bb_list; ++i){ - bb_list_sizes[i] = bb_list_ptrs[i]->size(); - } - - promod3::modelling::SidechainReconstructionDataPtr sc_rec_data = - aa_sc_rec_->Reconstruct(start_resnums, bb_list_sizes, chain_indices); - aa_scorer_env_->SetEnvironment(*(sc_rec_data->env_pos)); - - for(uint i = 0; i < num_bb_list; ++i){ - scores[i] += aa_scorer_->CalculateLinearCombination(aa_weights_, - start_resnums[i], - bb_list_sizes[i], - chain_indices[i]); - } - } - - Real score = 0.0; - for(uint i = 0; i < num_bb_list; ++i){ - score += scores[i] * weight_vector[i]; - } - - return score; -} - -std::vector<Real> -GetScores(const promod3::modelling::LoopCandidatesPtr candidates, - uint start_resnum, uint chain_idx) { - std::vector<Real> scores; - for(uint i = 0; i < candidates->size(); ++i){ - // setting the environment in the loop itself is only required for - // all atom scoring - if(all_atom_){ - this->SetEnvironment((*candidates)[i], start_resnum, chain_idx); - } - scores.push_back(this->GetScore((*candidates)[i], start_resnum, chain_idx)); - } - return scores; -} - -private: - -std::map<String, Real> bb_weights_; -std::map<String, Real> aa_weights_; - -promod3::scoring::BackboneScoreEnvPtr bb_env_; -promod3::scoring::BackboneOverallScorerPtr bb_scorer_; - -promod3::loop::AllAtomEnvPtr aa_sc_env_; -promod3::modelling::SidechainReconstructorPtr aa_sc_rec_; - -promod3::loop::AllAtomEnvPtr aa_scorer_env_; -promod3::scoring::AllAtomOverallScorerPtr aa_scorer_; - -bool all_atom_; -}; - - -class EnumeradorDataHandler{ - -public: - - EnumeradorDataHandler(const promod3::modelling::ModellingHandle& mhandle, - const promod3::modelling::StructuralGapList& gaps, - const std::vector<promod3::modelling::LoopCandidatesPtr>& loop_candidates, - bool all_atom_scores, - uint max_complexity, Real distance_thresh){ - - if(!mhandle.model.IsValid()){ - throw promod3::Error("mhandle must have a valid model attached"); - } - - if(max_complexity < loop_candidates.size()){ - String err = "max_complexity parameter must be at least as big as the "; - err += "number of loop positions. If the two numbers are the same, you "; - err += "basically prune until you have exactly one loop candidate per "; - err += "position. You therefore never really consider pairwise "; - err += "interactions between loops and you might have clashes in your "; - err += "solution!"; - throw promod3::Error(err); - } - - if(distance_thresh < 0.0){ - String err = "Cannot have negative distance_thresh in ElEnumerador!"; - throw promod3::Error(err); - } - - CheckInput(mhandle, gaps, loop_candidates); - - scorer_ = ScorerContainer(mhandle, all_atom_scores); - max_complexity_ = max_complexity; - distance_thresh_ = distance_thresh; - all_atom_scores_ = all_atom_scores; - - uint num_loop_candidates = loop_candidates.size(); - active_candidates_.resize(num_loop_candidates, 1); - - for(uint i = 0; i < num_loop_candidates; ++i){ - uint start_num = 1; - uint chain_idx = gaps[i].GetChainIndex(); - if(!gaps[i].IsNTerminal()){ - start_num = gaps[i].before.GetNumber().GetNum(); - } - start_resnums_.push_back(start_num); - chain_indices_.push_back(chain_idx); - } - - - for(uint i = 0; i < num_loop_candidates; ++i){ - promod3::modelling::LoopCandidatesPtr - p(new promod3::modelling::LoopCandidates(*loop_candidates[i])); - candidates_.push_back(p); - } - - idx_mapper_.resize(num_loop_candidates); - for(uint i = 0; i < num_loop_candidates; ++i){ - uint current_size = loop_candidates[i]->size(); - idx_mapper_[i].resize(current_size); - for(uint j = 0; j < current_size; ++j){ - idx_mapper_[i][j] = j; - } - } - } - - void DeactivateCandidates(uint idx){ - if(idx >= active_candidates_.size()){ - String err = "Tried to access invalid LoopCandidate in ElEnumerador!"; - throw promod3::Error(err); - } - active_candidates_[idx] = 0; - } - - bool HasActiveCandidates() const { - return std::accumulate(active_candidates_.begin(), - active_candidates_.end(), 0) != 0; - } - - promod3::core::Graph BuildInteractionGraph(){ - - int num_candidates = candidates_.size(); - promod3::core::Graph graph(num_candidates); - - for(int i = 0; i < num_candidates; ++i){ - if(active_candidates_[i]){ - for(int j = i + 1; j < num_candidates; ++j){ - if(active_candidates_[j]){ - - promod3::modelling::LoopCandidatesPtr one = candidates_[i]; - promod3::modelling::LoopCandidatesPtr two = candidates_[j]; - bool interact = false; - - for(uint k = 0; k < one->size(); ++k){ - for(uint l = 0; l < two->size(); ++l){ - if((*one)[k].MinCADistance((*two)[l]) < distance_thresh_){ - interact = true; - break; - } - } - } - - if(interact){ - graph.Connect(i,j); - } - } - } - } - } - return graph; - } - - - bool Prune(){ - - uint max_num = 0; - uint max_idx = 0; - - for(uint i = 0; i < candidates_.size(); ++i){ - if(active_candidates_[i]){ - uint actual_size = candidates_[i]->size(); - if(actual_size > max_num){ - max_num = actual_size; - max_idx = i; - } - } - } - - std::vector<Real> scores = scorer_.GetScores(candidates_[max_idx], - start_resnums_[max_idx], - chain_indices_[max_idx]); - - std::vector<std::pair<Real,uint> > scores_with_indices; - scores_with_indices.reserve(scores.size()); - for(uint i = 0; i < scores.size(); ++i){ - scores_with_indices.push_back(std::make_pair(scores[i],i)); - } - - std::sort(scores_with_indices.begin(), scores_with_indices.end()); - - uint a = 1; - uint b = static_cast<uint>(std::floor(0.8 * max_num)); - uint num_candidates_to_keep = std::max(a, b); - - if(num_candidates_to_keep == candidates_[max_idx]->size()){ - return false; - } - - std::vector<uint> indices_to_keep; - indices_to_keep.reserve(num_candidates_to_keep); - for(uint i = 0; i < num_candidates_to_keep; ++i){ - indices_to_keep.push_back(scores_with_indices[i].second); - } - - candidates_[max_idx] = candidates_[max_idx]->Extract(indices_to_keep); - - // update the idx_mapper - std::vector<uint> max_idx_mapper(num_candidates_to_keep); - for(uint i = 0; i < num_candidates_to_keep; ++i){ - max_idx_mapper[i] = idx_mapper_[max_idx][indices_to_keep[i]]; - } - idx_mapper_[max_idx] = max_idx_mapper; - - return true; - } - - uint GetOriginalIdx(uint location_idx, uint candidate_idx){ - - if(location_idx >= idx_mapper_.size()){ - String err = "Invalid access of LoopCandidates object in ElEnumerador!"; - throw promod3::Error(err); - } - - if(candidate_idx >= idx_mapper_[location_idx].size()){ - String err = "Invalid access of LoopCandidate in ElEnumerador!"; - throw promod3::Error(err); - } - - return idx_mapper_[location_idx][candidate_idx]; - } - - const std::vector<promod3::modelling::LoopCandidatesPtr>& GetCandidates() const { - return candidates_; - } - - const std::vector<uint>& GetStartResnums() const { - return start_resnums_; - } - - const std::vector<uint>& GetChainIndices() const { - return chain_indices_; - } - - const std::vector<uint>& GetActiveCandidates() const { - return active_candidates_; - } - - promod3::modelling::LoopCandidatesPtr GetCandidates(uint idx) const { - return candidates_[idx]; - } - - uint GetStartResnum(uint idx) const { - return start_resnums_[idx]; - } - - uint GetChainIdx(uint idx) const { - return chain_indices_[idx]; - } - - ScorerContainer& GetScorer() { return scorer_; } - - uint GetMaxComplexity() const { - return max_complexity_; - } - - bool IsAllAtomScoring() const { - return all_atom_scores_; - } - -private: - - std::vector<promod3::modelling::LoopCandidatesPtr> candidates_; - std::vector<uint> active_candidates_; - std::vector<uint> start_resnums_; - std::vector<uint> chain_indices_; - std::vector<std::vector<uint> > idx_mapper_; - ScorerContainer scorer_; - bool all_atom_scores_; - uint max_complexity_; - Real distance_thresh_; -}; - -void GenerateLoopWeights(const std::vector<promod3::modelling::LoopCandidatesPtr>& candidates, - std::vector<Real>& weights){ - weights.clear(); - - Real sum = 0.0; - uint num_candidates = candidates.size(); - for(uint i = 0; i < num_candidates; ++i){ - Real s = static_cast<Real>(candidates[i]->GetSequence().size()); - weights.push_back(s); - sum += s; - } - for(uint i = 0; i < num_candidates; ++i){ - weights[i] /= sum; - } -} - -struct LSetIdxCalculator{ - - LSetIdxCalculator() { } - - LSetIdxCalculator(const std::vector<unsigned short>& num_candidates){ - lset_size = num_candidates.size(); - for(int i = 0; i < lset_size; ++i){ - int temp = 1; - for(int j = i + 1; j < lset_size; ++j){ - temp *= num_candidates[j]; - } - idx_helper.push_back(temp); - } - idx_helper.push_back(1); - } - - int GetLIdx(const std::vector<unsigned short>& indices) const{ - int idx = 0; - for(int i = 0; i < lset_size; ++i){ - idx += idx_helper[i] * indices[i]; - } - return idx; - } - - std::vector<int> idx_helper; - int lset_size; -}; - - -struct BagData{ - - //members of the bag that are also present in parent bag - std::vector<int> lset; - //members of the bag that are not present in parent bag - std::vector<int> rset; - - //number of candidates for every entry in lset/rset - std::vector<unsigned short> num_l_candidates; - std::vector<unsigned short> num_r_candidates; - - //optimal solution in r set given a certain combination in l - std::vector<std::vector<unsigned short> > optimal_r_combination; - //the according score - std::vector<Real> optimal_r_score; - - //calculates an index in optimal_r_combination/optimal_r_score - //given a certain combination in l - LSetIdxCalculator lset_idx_calculator; - - //the bag data indices for the children attached to this bag - std::vector<int> children_indices; - int num_children; - - //super complicated mapping structure to figure out which elements of - //the lset/rset belong to the lset of every child - - //elements in lset - std::vector<std::vector<std::pair<int,int> > > lset_l_mappers; - //elements in rset - std::vector<std::vector<std::pair<int,int> > > rset_l_mappers; - - //this data structure will be altered during the calculation and is a - //placeholder to query optimal solutions for certain lsets of the children - std::vector<std::vector<unsigned short> > children_l_combinations; - - //It's not guaranteed that every element in the lset has a graph connection - //to one of the elements in the rset => we might therefore avoid some - //SetEnvironment calls by tracking this information - std::vector<int> l_connection_to_r; - - //contains data describing the dependency of members of the rset having a - //dependency towards other nodes in the initial graph not part of the - //tree bag - std::vector<std::vector<int> > external_r_dependencies; -}; - - -void FillBagData(const std::vector<TreeBag*>& traversal, - const std::vector<promod3::modelling::LoopCandidatesPtr>& - loop_candidates, - const promod3::core::Graph& graph, - std::vector<BagData>& bag_data){ - - bag_data.clear(); - bag_data.resize(traversal.size()); - - for(uint bag_idx = 0; bag_idx < traversal.size(); ++bag_idx){ - - TreeBag* bag = traversal[bag_idx]; - TreeBag* parent = traversal[bag_idx]->GetParent(); - - //assign rset and lset - if(parent == NULL){ - //it's the root, lets shuffle all to rset - bag_data[bag_idx].rset.assign(bag->members_begin(), - bag->members_end()); - } - else{ - for(TreeBag::const_member_iterator it = bag->members_begin(); - it != bag->members_end(); ++it){ - if(parent->HasMember(*it)) bag_data[bag_idx].lset.push_back(*it); - else bag_data[bag_idx].rset.push_back(*it); - } - } - - //figure out how many candidates there are - for(std::vector<int>::iterator i = bag_data[bag_idx].lset.begin(); - i != bag_data[bag_idx].lset.end(); ++i){ - bag_data[bag_idx].num_l_candidates.push_back(loop_candidates[*i]->size()); - } - for(std::vector<int>::iterator i = bag_data[bag_idx].rset.begin(); - i != bag_data[bag_idx].rset.end(); ++i){ - bag_data[bag_idx].num_r_candidates.push_back(loop_candidates[*i]->size()); - } - - //to resize the optimal_r_combination and optimal_r_score we need to know - //how many combinations we have for the elements in l. If the lset is empty, - //the number of combinations remains one. This is necessary for the root - //node, where only the rset in absence of the lset gets optimized. - int combinations = 1; - for(std::vector<unsigned short>::iterator i = - bag_data[bag_idx].num_l_candidates.begin(); - i != bag_data[bag_idx].num_l_candidates.end(); ++i){ - combinations *= (*i); - } - bag_data[bag_idx].optimal_r_combination.resize(combinations); - bag_data[bag_idx].optimal_r_score.resize(combinations); - - //generate a new lset idxgenerator... - bag_data[bag_idx].lset_idx_calculator = - LSetIdxCalculator(bag_data[bag_idx].num_l_candidates); - - //get all the children stuff right - for(TreeBag::const_child_iterator it = bag->children_begin(); - it != bag->children_end(); ++it){ - bag_data[bag_idx].children_indices.push_back((*it)->GetIdx()); - } - bag_data[bag_idx].num_children = bag_data[bag_idx].children_indices.size(); - - //handle current lset - bag_data[bag_idx].lset_l_mappers.resize(bag_data[bag_idx].num_children); - for(uint i = 0; i < bag_data[bag_idx].lset.size(); ++i){ - int val = bag_data[bag_idx].lset[i]; - //go through all children - for(int j = 0; j < bag_data[bag_idx].num_children; ++j){ - int ch_idx = bag_data[bag_idx].children_indices[j]; - //we can be sure of the children lsets being set due to the postorder - //traversal of the tree - const std::vector<int>& child_lset = bag_data[ch_idx].lset; - std::vector<int>::const_iterator search_it = std::find(child_lset.begin(), - child_lset.end(), - val); - if(search_it != child_lset.end()){ - //its in the childs lset! - int idx_in_child_l = search_it - child_lset.begin(); - bag_data[bag_idx].lset_l_mappers[j].push_back( - std::make_pair(i,idx_in_child_l)); - } - } - } - - //handle current rset - bag_data[bag_idx].rset_l_mappers.resize(bag_data[bag_idx].num_children); - for(uint i = 0; i < bag_data[bag_idx].rset.size(); ++i){ - int val = bag_data[bag_idx].rset[i]; - //go through all children - for(int j = 0; j < bag_data[bag_idx].num_children; ++j){ - int ch_idx = bag_data[bag_idx].children_indices[j]; - //we can be sure of the children lsets being set due to the postorder - //traversal of the tree - const std::vector<int>& child_lset = bag_data[ch_idx].lset; - std::vector<int>::const_iterator search_it = std::find(child_lset.begin(), - child_lset.end(), - val); - if(search_it != child_lset.end()){ - //its in the childs lset! - int idx_in_child_l = search_it - child_lset.begin(); - bag_data[bag_idx].rset_l_mappers[j].push_back( - std::make_pair(i,idx_in_child_l)); - } - } - } - - //prepare the lset combinations - for(std::vector<int>::iterator i = bag_data[bag_idx].children_indices.begin(); - i != bag_data[bag_idx].children_indices.end(); ++i){ - int child_lset_size = bag_data[*i].lset.size(); - bag_data[bag_idx].children_l_combinations.push_back( - std::vector<unsigned short>(child_lset_size,0)); - } - - //check which members of the lset actually have a connection to one of the - //elements in r - for(uint i = 0; i < bag_data[bag_idx].lset.size(); ++i){ - int is_connected = 0; - for(uint j = 0; j < bag_data[bag_idx].rset.size(); ++j){ - if(graph.IsConnected(bag_data[bag_idx].lset[i],bag_data[bag_idx].rset[j])){ - is_connected = 1; - break; - } - } - bag_data[bag_idx].l_connection_to_r.push_back(is_connected); - } - - //check for external dependencies of rset members - for(std::vector<int>::iterator i = bag_data[bag_idx].rset.begin(); - i != bag_data[bag_idx].rset.end(); ++i){ - std::vector<int> neighbours; - graph.GetNeighbours(*i, neighbours); - std::vector<int> external_dependencies; - for(std::vector<int>::iterator j = neighbours.begin(); - j != neighbours.end(); ++j){ - std::vector<int>::iterator l_it; - std::vector<int>::iterator r_it; - l_it = std::find(bag_data[bag_idx].lset.begin(), - bag_data[bag_idx].lset.end(),*j); - r_it = std::find(bag_data[bag_idx].rset.begin(), - bag_data[bag_idx].rset.end(),*j); - if(r_it == bag_data[bag_idx].rset.end() && - l_it == bag_data[bag_idx].lset.end()){ - //its not in the current bag => it's an external dependency! - external_dependencies.push_back(*j); - } - } - bag_data[bag_idx].external_r_dependencies.push_back(external_dependencies); - } - } -} - - -void CollectSolution(const std::vector<BagData>& bag_data, - int bag_idx, - std::vector<unsigned short>& solution){ - - const BagData& bag = bag_data[bag_idx]; - - //collect the lset values from the solution - std::vector<unsigned short> l_solution(bag.lset.size(), 0); - for(uint i = 0; i < l_solution.size(); ++i){ - l_solution[i] = solution[bag.lset[i]]; - } - - //get the index of the ideal r_combination given that l solution - int r_idx = bag.lset_idx_calculator.GetLIdx(l_solution); - //and fill the according values into the solution - for(uint i = 0; i < bag.rset.size(); ++i){ - solution[bag.rset[i]] = bag.optimal_r_combination[r_idx][i]; - } - - //recursively call this function for all the children - for(std::vector<int>::const_iterator i = bag.children_indices.begin(); - i != bag.children_indices.end(); ++i){ - CollectSolution(bag_data, *i, solution); - } -} - - -//collects the solution for the subtree of bag_data enforcing a certain -//combination in rset and lset -void CollectSolution(const std::vector<BagData>& bag_data, - int bag_idx, - const std::vector<unsigned short>& l_combination, - const std::vector<unsigned short>& r_combination, - std::vector<unsigned short>& solution){ - - const BagData& bag = bag_data[bag_idx]; - - //write the defined r and l combinations into the solution - for(uint i = 0; i < l_combination.size(); ++i){ - solution[bag.lset[i]] = l_combination[i]; - } - for(uint i = 0; i < r_combination.size(); ++i){ - solution[bag.rset[i]] = r_combination[i]; - } - - //call the magic recursive function for every child... - for(std::vector<int>::const_iterator i = bag.children_indices.begin(); - i != bag.children_indices.end(); ++i){ - CollectSolution(bag_data, *i, solution); - } -} - - -unsigned long GetComplexity(const std::vector<BagData>& bag_data){ - - unsigned long complexity = 0; // this number can get huuuuuuuuge - - for(std::vector<BagData>::const_iterator i = bag_data.begin(); - i != bag_data.end(); ++i){ - - unsigned long int lset_combinations = 1; - unsigned long int rset_combinations = 1; - - for(std::vector<unsigned short>::const_iterator j = - i->num_l_candidates.begin(); j != i->num_l_candidates.end(); ++j){ - lset_combinations *= (*j); - } - - for(std::vector<unsigned short>::const_iterator j = - i->num_r_candidates.begin(); j != i->num_r_candidates.end(); ++j){ - rset_combinations *= (*j); - } - - complexity += (lset_combinations * rset_combinations * i->rset.size()); - } - - return complexity; -} - -void GenerateTree(const std::vector<promod3::modelling::LoopCandidatesPtr>& - loop_candidates, - const std::vector<uint>& start_resnums, - const std::vector<uint>& chain_indices, - const promod3::core::Graph& graph, - std::vector<BagData>& tree_traversal){ - - TreeBag* tree = promod3::core::GenerateMinimalWidthTree(graph); - - //generate post order traversal of previously constructed tree - std::vector<TreeBag*> traversal = promod3::core::PostOrderTraversal(tree); - - std::vector<BagData> bag_data; - FillBagData(traversal, loop_candidates, graph, tree_traversal); - - // let's delete the tree again by calling the destructor of the root - delete traversal.back(); -} - -std::vector<unsigned short> TreeSolve(ScorerContainer& scorer, - const std::vector<promod3::modelling::LoopCandidatesPtr>& - loop_candidates, - const std::vector<uint>& start_resnums, - const std::vector<uint>& chain_indices, - const std::vector<Real>& loop_candidate_weights, - std::vector<BagData>& tree_traversal){ - - //keep track of the currently set environment - std::vector<int> environment_status(loop_candidates.size(), -1); - - // THE SOLUTION MUAHA - std::vector<unsigned short> solution(loop_candidates.size(), -1); - - //start to iterate over the bags - for(uint bag_idx = 0; bag_idx < tree_traversal.size(); ++bag_idx){ - - //data of the current bag - BagData& data = tree_traversal[bag_idx]; - int lset_size = data.lset.size(); - int rset_size = data.rset.size(); - - //the lset of this bag we have to enumerate - promod3::core::Enumerator<unsigned short> - l_set_enumerator(data.num_l_candidates); - std::vector<unsigned short>& current_l_enum = - l_set_enumerator.CurrentEnum(); - - // vectors, that will serve as input for the scorer - std::vector<promod3::loop::BackboneList*> rset_bb_list_ptrs(rset_size); - std::vector<Real> rset_weight_vector(rset_size); - std::vector<uint> rset_start_resnums(rset_size); - std::vector<uint> rset_chain_indices(rset_size); - - // fill what we already can - for(int i = 0; i < rset_size; ++i){ - rset_weight_vector[i] = loop_candidate_weights[data.rset[i]]; - rset_start_resnums[i] = start_resnums[data.rset[i]]; - rset_chain_indices[i] = chain_indices[data.rset[i]]; - } - - do { - //set the environment of the lset where required - - for(int i = 0; i < lset_size; ++i){ - if(environment_status[data.lset[i]] != current_l_enum[i] && - data.l_connection_to_r[i]){ - const promod3::loop::BackboneList& bb_list = - (*loop_candidates[data.lset[i]])[current_l_enum[i]]; - scorer.SetEnvironment(bb_list, start_resnums[data.lset[i]], - chain_indices[data.lset[i]]); - environment_status[data.lset[i]] = current_l_enum[i]; - } - } - - //set the children lset values from this lset enumeration - for(uint i = 0; i < data.children_indices.size(); ++i){ - for(std::vector<std::pair<int,int> >::const_iterator j = - data.lset_l_mappers[i].begin(); - j != data.lset_l_mappers[i].end(); ++j){ - data.children_l_combinations[i][j->second] = current_l_enum[j->first]; - } - } - - promod3::core::Enumerator<unsigned short> - r_set_enumerator(data.num_r_candidates); - std::vector<unsigned short>& current_r_enum = - r_set_enumerator.CurrentEnum(); - std::vector<unsigned short> old_r_enum(rset_size,-1); - - Real best_score = std::numeric_limits<Real>::max(); - std::vector<unsigned short> best_rset = current_r_enum; - - //let's find the optimal r combination given the current l combination... - do { - - //set the environment of the rset where required - for(int i = 0; i < rset_size; ++i){ - if(environment_status[data.rset[i]] != current_r_enum[i]){ - const promod3::loop::BackboneList& bb_list = - (*loop_candidates[data.rset[i]])[current_r_enum[i]]; - scorer.SetEnvironment(bb_list, start_resnums[data.rset[i]], - chain_indices[data.rset[i]]); - environment_status[data.rset[i]] = current_r_enum[i]; - } - } - - //set the environments of all externally connected loops... - for(int i = 0; i < rset_size; ++i){ - if(old_r_enum[i] != current_r_enum[i]){ - if(!data.external_r_dependencies[i].empty()){ - //there are external dependencies... we have to collect the - //solution for the subtree given the current lset and rset - //enumerations - CollectSolution(tree_traversal, bag_idx, current_l_enum, - current_r_enum, solution); - //and set the current solutions in the scoring environment - for(std::vector<int>::const_iterator j = - data.external_r_dependencies[i].begin(); - j != data.external_r_dependencies[i].end(); ++j){ - - if(environment_status[*j] != solution[*j]){ - - const promod3::loop::BackboneList& bb_list = - (*loop_candidates[*j])[solution[*j]]; - scorer.SetEnvironment(bb_list, start_resnums[*j], - chain_indices[*j]); - environment_status[*j] = solution[*j]; - } - } - } - } - } - - // get the appropriate BackboneLists - for(int i = 0; i < rset_size; ++i){ - rset_bb_list_ptrs[i] = - &(*loop_candidates[data.rset[i]])[current_r_enum[i]]; - } - - Real score = scorer.GetScore(rset_bb_list_ptrs, rset_weight_vector, - rset_start_resnums, rset_chain_indices); - - //set the children lset values from this rset enumeration - for(uint i = 0; i < data.children_indices.size(); ++i){ - for(std::vector<std::pair<int,int> >::const_iterator l = - data.rset_l_mappers[i].begin(); - l != data.rset_l_mappers[i].end(); ++l){ - data.children_l_combinations[i][l->second] = current_r_enum[l->first]; - } - } - - //add up the scores from the children - for(uint i = 0; i < data.children_indices.size(); ++i){ - BagData& child_data = tree_traversal[data.children_indices[i]]; - int l_idx = - child_data.lset_idx_calculator.GetLIdx(data.children_l_combinations[i]); - score += child_data.optimal_r_score[l_idx]; - } - - if(score < best_score){ - best_score = score; - best_rset = current_r_enum; - } - - old_r_enum = current_r_enum; - - } while(r_set_enumerator.Next()); - - int lset_idx = data.lset_idx_calculator.GetLIdx(current_l_enum); - data.optimal_r_combination[lset_idx] = best_rset; - data.optimal_r_score[lset_idx] = best_score; - - } while(l_set_enumerator.Next()); - - } - - //recursively fill the final solution - CollectSolution(tree_traversal, tree_traversal.size() - 1, solution); - - return solution; -} - - -uint SingleResolve(ScorerContainer& scorer, - const promod3::modelling::LoopCandidatesPtr& loop_candidates, - uint start_resnum, - uint chain_idx, - bool all_atom){ - Real best_score = std::numeric_limits<Real>::max(); - uint best_idx = 0; - for(uint i = 0; i < loop_candidates->size(); ++i){ - // if all atom is true, we need to set the environment - if(all_atom){ - scorer.SetEnvironment((*loop_candidates)[i], start_resnum, chain_idx); - } - Real score = scorer.GetScore((*loop_candidates)[i], start_resnum, chain_idx); - if(score < best_score){ - best_idx = i; - best_score = score; - } - } - return best_idx; -} - -} //ns - - -namespace promod3{ namespace modelling{ - -std::vector<uint> ElEnumerador(const ModellingHandle& mhandle, - const StructuralGapList& gaps, - const std::vector<LoopCandidatesPtr>& loop_candidates, - bool all_atom_scores, - uint max_complexity, - Real distance_thresh){ - - // directly return if no loop candidates are present... - if(loop_candidates.empty()){ - std::vector<uint> return_vec; - return return_vec; - } - - // all data gets managed in EnumeradorDataHandler object. - // also input check happens there. - EnumeradorDataHandler data(mhandle, gaps, loop_candidates, - all_atom_scores, max_complexity, - distance_thresh); - - // if there is only one loop_candidate object, the solution is - // also rather trivial - if(loop_candidates.size() == 1){ - uint best_idx = SingleResolve(data.GetScorer(), - data.GetCandidates(0), - data.GetStartResnum(0), - data.GetChainIdx(0), - data.IsAllAtomScoring()); - std::vector<uint> return_vec(1, best_idx); - return return_vec; - } - - // seems that we have to use the full power of el enumerador! - std::vector<uint> solution(loop_candidates.size(), 0); - - while(data.HasActiveCandidates()){ - - promod3::core::Graph graph = data.BuildInteractionGraph(); - - std::vector<std::vector<int> > connected_components; - graph.ConnectedComponents(connected_components); - - for (uint component_idx = 0; component_idx < connected_components.size(); - ++component_idx){ - - // prepare everything for this connected component - - const std::vector<int>& component_indices = - connected_components[component_idx]; - uint num_candidates = component_indices.size(); - - std::vector<LoopCandidatesPtr> component_loop_candidates; - std::vector<uint> component_chain_indices; - std::vector<uint> component_start_resnums; - - for(uint i = 0; i < num_candidates; ++i){ - uint idx = component_indices[i]; - component_loop_candidates.push_back(data.GetCandidates(idx)); - component_chain_indices.push_back(data.GetChainIdx(idx)); - component_start_resnums.push_back(data.GetStartResnum(idx)); - } - - if(num_candidates == 1){ - //there is not much graph stuff to do... - uint best_idx = SingleResolve(data.GetScorer(), - component_loop_candidates[0], - component_start_resnums[0], - component_chain_indices[0], - data.IsAllAtomScoring()); - solution[component_indices[0]] = - data.GetOriginalIdx(component_indices[0], best_idx); - data.DeactivateCandidates(component_indices[0]); - continue; - } - - promod3::core::Graph component_graph = graph.SubGraph(component_indices); - - std::vector<BagData> tree_traversal; - GenerateTree(component_loop_candidates, component_start_resnums, - component_chain_indices, component_graph, tree_traversal); - - // check whether complexity is below thresh. Leave it for later otherwise - unsigned long complexity = GetComplexity(tree_traversal); - - if(complexity <= data.GetMaxComplexity()){ - - // the scores get weighted by the loop size... - std::vector<Real> weights; - GenerateLoopWeights(component_loop_candidates, weights); - - // solve it! - std::vector<unsigned short> component_solution = - TreeSolve(data.GetScorer(), component_loop_candidates, - component_start_resnums, component_chain_indices, - weights, tree_traversal); - - //fill in the component solution - for(uint i = 0; i < num_candidates; ++i){ - solution[component_indices[i]] = - data.GetOriginalIdx(component_indices[i], component_solution[i]); - data.DeactivateCandidates(component_indices[i]); - } - } - } - - // check whether there still are active candidates.... - if(!data.HasActiveCandidates()){ - // we're done - break; - } - - if(!data.Prune()){ - String err = "Failed to prune any further in ElEnumerador! "; - err += "Try with larger max_complexity..."; - throw promod3::Error(err); - } - } - - return solution; -} - -}} //ns diff --git a/modelling/src/el_enumerador.hh b/modelling/src/el_enumerador.hh deleted file mode 100644 index 0c1b1ebe52517c60eeb153bc0c3a9af5270fb3ef..0000000000000000000000000000000000000000 --- a/modelling/src/el_enumerador.hh +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef PM3_MODELLING_ELENUMERADOR_HH -#define PM3_MODELLING_ELENUMERADOR_HH - -#include <promod3/modelling/loop_candidate.hh> -#include <promod3/modelling/model.hh> -#include <promod3/loop/structure_db.hh> - -#include <vector> - -namespace promod3 { namespace modelling { - -std::vector<uint> ElEnumerador(const ModellingHandle& mhandle, - const StructuralGapList& gaps, - const std::vector<LoopCandidatesPtr>& loop_candidates, - bool all_atom_scores = false, - uint max_complexity = 10000000, - Real distance_thresh = 10.0); -}} //ns - -#endif