diff --git a/modules/db/doc/db.rst b/modules/db/doc/db.rst index 87dc0919e840133815ea56af101ca8d9096b2d11..73f885dae77fbed5a89709ba219514349c10aaa9 100644 --- a/modules/db/doc/db.rst +++ b/modules/db/doc/db.rst @@ -351,45 +351,164 @@ marks non-resolved residues with '-'. :raises: :exc:`ost.Error` if requested data is not present -.. method:: ExtractTemplateData(entry_name, chain_name, aln, indexer, \ - seqres_container, atomseq_container, \ - position_container) +.. class:: DisCoDataContainers(indexer_path, \ + seqres_container_path, \ + atomseq_container_path, \ + position_container_path) + Thin wrapper to organize the data containers required in + :meth:`ExtractTemplateDataDisCo`. All containers are loaded + from disk and are expected to be consistent. + + :param indexer_path: Path to indexer file to access the data containers + (:class:`LinearIndexer`) + :param seqres_container_path: Path to container with SEQRES data + (:class:`LinearCharacterContainer`) + :param atomseq_container_path: Path to container with ATOMSEQ data + (:class:`LinearCharacterContainer`) + :param position_container_path: Path to container with CA position data + (:class:`LinearPositionContainer`) + + :type indexer_path: :class:`str` + :type seqres_container_path: :class:`str` + :type atomseq_container_path: :class:`str` + :type position_container_path: :class:`str` + + .. attribute:: indexer + + .. attribute:: seqres_container + + .. attribute:: atomseq_container + + .. attribute:: position_container + +.. method:: ExtractTemplateDataDisCo(entry_name, chain_name, aln, \ + data_containers, residue_numbers, \ + positions) + + Data extraction helper for the specific use of extracting data required + to generate DisCoContainers in the QMEAN scoring function. Let's say we have a target-template alignment in *aln* (first seq: target, second seq: template). This function extracts all valid template positions - given the entry specified by *entry_name* and *chain_name*. The template - sequence in *aln* must match the sequence in *seqres_container*. Again, - the *atomseq_container* is used to identify valid positions. The according - residue numbers relative to the target sequence in *aln* are also returned. + and their residue numbers given the entry specified by *entry_name* and + *chain_name*. The template sequence in *aln* must match the sequence in + *seqres_container*. Again, the *atomseq_container* is used to identify valid + positions. :param entry_name: Name of assembly you want the data from :param chain_name: Name of chain you want the data from :param aln: Target-template sequence alignment - :param indexer: Used to access *atomseq_container*, *seqres_container* - and *position_container* - :param seqres_container: Container containing the full sequence data - :param atomseq_container: Container that marks locations with invalid position - data with '-' - :param position_container: Container containing position data + :param data_containers: Contains all required data containers + :param residue_numbers: Residue numbers of extracted positions will be stored + in here + :param positions: Extracted positions will be stored in here + :type entry_name: :class:`str` + :type chain_name: :class:`str` + :type aln: :ost.seq.SequenceHandle: + :type data_containers: :class:`DisCoDataContainers` + :type residue_numbers: :class:`list` of :class:`int` + :type positions: :class:`geom::Vec3List` + + :raises: :exc:`ost.Error` if requested data is not present in + the containers or if the template sequence in *aln* + doesn't match with the sequence in *seqres_container* + + + + +.. class:: GMQEDataContainers(indexer_path, \ + seqres_container_path, \ + atomseq_container_path, \ + dssp_container_path, \ + n_position_container_path, \ + ca_position_container_path, \ + c_position_container_path, \ + cb_position_container_path, \) + + Thin wrapper to organize the data containers required in + :meth:`ExtractTemplateDataGMQE`. All containers are loaded + from disk and are expected to be consistent. + + :param indexer_path: Path to indexer file to access the data containers + (:class:`LinearIndexer`) + :param seqres_container_path: Path to container with SEQRES data + (:class:`LinearCharacterContainer`) + :param atomseq_container_path: Path to container with ATOMSEQ data + (:class:`LinearCharacterContainer`) + :param dssp_container_path: Path to container containing dssp states + (:class:`LinearCharacterContainer`) + :param n_position_container_path: Path to container with N position data + (:class:`LinearPositionContainer`) + :param ca_position_container_path: Path to container with CA position data + (:class:`LinearPositionContainer`) + :param c_position_container_path: Path to container with C position data + (:class:`LinearPositionContainer`) + :param cb_position_container_path: Path to container with CB position data + (:class:`LinearPositionContainer`) + + :type indexer_path: :class:`str` + :type seqres_container_path: :class:`str` + :type atomseq_container_path: :class:`str` + :type dssp_container_path: :class:`str` + :type n_position_container_path: :class:`str` + :type ca_position_container_path: :class:`str` + :type c_position_container_path: :class:`str` + :type cb_position_container_path: :class:`str` + + .. attribute:: indexer + + .. attribute:: seqres_container + + .. attribute:: atomseq_container + + .. attribute:: dssp_container + + .. attribute:: n_position_container + + .. attribute:: ca_position_container + + .. attribute:: c_position_container + + .. attribute:: cb_position_container + + +.. method:: ExtractTemplateDataGMQE(entry_name, chain_name, aln, \ + data_containers, residue_numbers, \ + dssp, n_positions, ca_positions, \ + c_positions, cb_positions) + + Data extraction helper for the specific use of extracting data required + to estimate GMQE as available in the QMEAN software package. + Let's say we have a target-template alignment in *aln* (first seq: target, + second seq: template). This function extracts all valid template positions, + their dssp states and their residue numbers given the entry specified by + *entry_name* and *chain_name*. The template sequence in *aln* must match the + sequence in *seqres_container*. Again, the *atomseq_container* is used to + identify valid positions. + + :param entry_name: Name of assembly you want the data from + :param chain_name: Name of chain you want the data from + :param aln: Target-template sequence alignment + :param data_containers: Contains all required data containers + :param residue_numbers: Residue numbers of extracted positions will be stored + in here + :param dssp: Extracted DSSP states will be stored in here + :param n_positions: Extracted n_positions will be stored in here + :param ca_positions: Extracted ca_positions will be stored in here + :param c_positions: Extracted c_positions will be stored in here + :param cb_positions: Extracted cb_positions will be stored in here :type entry_name: :class:`str` :type chain_name: :class:`str` :type aln: :ost.seq.SequenceHandle: - :type indexer: :class:`LinearIndexer` - :type seqres_container: :class:`LinearCharacterContainer` - :type atomseq_container: :class:`LinearCharacterContainer` - :type position_container: :class:`LinearPositionContainer` or - :class:`list` of :class:`LinearPositionContainer` - - :returns: First element: :class:`list` of residue numbers that - relate each entry in the second element to the target - sequence specified in *aln*. The numbering scheme - starts from one. Second Element: the according positions. - :class:`ost.geom.Vec3List` if *position_container* was a - :class:`LinearPositionContainer`, a list of - :class:`ost.geom.Vec3List` if it was a :class:`list`. - :rtype: :class:`tuple` + :type data_containers: :class:`GMQEDataContainers` + :type residue_numbers: :class:`list` of :class:`int` + :type dssp: :class:`str` + :type n_positions: :class:`geom::Vec3List` + :type ca_positions: :class:`geom::Vec3List` + :type c_positions: :class:`geom::Vec3List` + :type cb_positions: :class:`geom::Vec3List` :raises: :exc:`ost.Error` if requested data is not present in - the container or if the template sequence in *aln* + the containers or if the template sequence in *aln* doesn't match with the sequence in *seqres_container* diff --git a/modules/db/pymod/export_linear_db.cc b/modules/db/pymod/export_linear_db.cc index 1cb623cb5b8644d6af0dbd939ca84fc8d44a31c0..6ee2d2f0191be3b4a78a3652d0edf3da2ca33483 100644 --- a/modules/db/pymod/export_linear_db.cc +++ b/modules/db/pymod/export_linear_db.cc @@ -147,52 +147,102 @@ void WrapGetPositions(LinearPositionContainerPtr container, container->GetPositions(p_range, positions); } +// helper struct to reduce number of input parameters +struct DisCoDataContainers { + + DisCoDataContainers(const String& indexer_path, + const String& seqres_container_path, + const String& atomseq_container_path, + const String& position_container_path) { + indexer = LinearIndexer::Load(indexer_path); + seqres_container = LinearCharacterContainer::Load(seqres_container_path); + atomseq_container = LinearCharacterContainer::Load(atomseq_container_path); + position_container = LinearPositionContainer::Load(position_container_path); + } -tuple WrapExtractTemplateDataSingle(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - LinearPositionContainerPtr position_container) { + LinearIndexerPtr indexer; + LinearCharacterContainerPtr seqres_container; + LinearCharacterContainerPtr atomseq_container; + LinearPositionContainerPtr position_container; +}; - std::vector<LinearPositionContainerPtr> position_containers; - position_containers.push_back(position_container); - std::vector<int> v_residue_numbers; - std::vector<geom::Vec3List> position_vec; - ost::db::ExtractTemplateData(entry_name, chain_name, aln, indexer, - seqres_container, atomseq_container, - position_containers, v_residue_numbers, - position_vec); +void WrapExtractTemplateDataDisCo(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + DisCoDataContainers& data_containers, + boost::python::list& residue_numbers, + geom::Vec3List& positions) { - list residue_numbers; + std::vector<int> v_residue_numbers; + ost::db::ExtractTemplateDataDisCo(entry_name, chain_name, aln, + data_containers.indexer, + data_containers.seqres_container, + data_containers.atomseq_container, + data_containers.position_container, + v_residue_numbers, + positions); + residue_numbers = boost::python::list(); VecToList(v_residue_numbers, residue_numbers); - return boost::python::make_tuple(residue_numbers, position_vec[0]); } -tuple WrapExtractTemplateDataList(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - boost::python::list& position_containers) { - std::vector<LinearPositionContainerPtr> v_position_containers; - ListToVec(position_containers, v_position_containers); - std::vector<int> v_residue_numbers; - std::vector<geom::Vec3List> position_vec; +// helper struct to reduce number of input parameters +struct GMQEDataContainers { + + GMQEDataContainers(const String& indexer_path, + const String& seqres_container_path, + const String& atomseq_container_path, + const String& dssp_container_path, + const String& n_position_container_path, + const String& ca_position_container_path, + const String& c_position_container_path, + const String& cb_position_container_path) { + indexer = LinearIndexer::Load(indexer_path); + seqres_container = LinearCharacterContainer::Load(seqres_container_path); + atomseq_container = LinearCharacterContainer::Load(atomseq_container_path); + dssp_container = LinearCharacterContainer::Load(dssp_container_path); + n_position_container = LinearPositionContainer::Load(n_position_container_path); + ca_position_container = LinearPositionContainer::Load(ca_position_container_path); + c_position_container = LinearPositionContainer::Load(c_position_container_path); + cb_position_container = LinearPositionContainer::Load(cb_position_container_path); + } - ost::db::ExtractTemplateData(entry_name, chain_name, aln, indexer, - seqres_container, atomseq_container, - v_position_containers, v_residue_numbers, - position_vec); + LinearIndexerPtr indexer; + LinearCharacterContainerPtr seqres_container; + LinearCharacterContainerPtr atomseq_container; + LinearCharacterContainerPtr dssp_container; + LinearPositionContainerPtr n_position_container; + LinearPositionContainerPtr ca_position_container; + LinearPositionContainerPtr c_position_container; + LinearPositionContainerPtr cb_position_container; +}; + + +void WrapExtractTemplateDataGMQE(const String& entry_name, + const String& chain_name, + const ost::seq::AlignmentHandle& aln, + GMQEDataContainers& data_containers, + boost::python::list& residue_numbers, + String& dssp, + geom::Vec3List& n_positions, + geom::Vec3List& ca_positions, + geom::Vec3List& c_positions, + geom::Vec3List& cb_positions) { - list residue_numbers; - list position_list; + std::vector<int> v_residue_numbers; + ost::db::ExtractTemplateDataGMQE(entry_name, chain_name, aln, + data_containers.indexer, + data_containers.seqres_container, + data_containers.atomseq_container, + data_containers.dssp_container, + data_containers.n_position_container, + data_containers.ca_position_container, + data_containers.c_position_container, + data_containers.cb_position_container, + v_residue_numbers, dssp, n_positions, + ca_positions, c_positions, cb_positions); VecToList(v_residue_numbers, residue_numbers); - VecToList(position_vec, position_list); - return boost::python::make_tuple(residue_numbers, position_list); -} +} } @@ -252,20 +302,51 @@ void export_linear_db() { arg("position_container"), arg("seq"), arg("positions"))); - def("ExtractTemplateData", &WrapExtractTemplateDataSingle, (arg("entry_name"), - arg("chain_name"), - arg("aln"), - arg("linear_indexer"), - arg("seqres_container"), - arg("atomseq_container"), - arg("position_container"))); - - def("ExtractTemplateData", &WrapExtractTemplateDataList, (arg("entry_name"), - arg("chain_name"), - arg("aln"), - arg("linear_indexer"), - arg("seqres_container"), - arg("atomseq_container"), - arg("position_containers"))); + class_<DisCoDataContainers>("DisCoDataContainers", init<const String&, + const String&, + const String&, + const String&>()) + .def_readonly("indexer", &DisCoDataContainers::indexer) + .def_readonly("seqres_container", &DisCoDataContainers::seqres_container) + .def_readonly("atomseq_container", &DisCoDataContainers::atomseq_container) + .def_readonly("position_container", &DisCoDataContainers::position_container) + ; + + def("ExtractTemplateDataDisCo", &WrapExtractTemplateDataDisCo, (arg("entry_name"), + arg("chain_name"), + arg("aln"), + arg("data_containers"), + arg("residue_numbers"), + arg("positions"))); + + class_<GMQEDataContainers>("GMQEDataContainers", init<const String&, + const String&, + const String&, + const String&, + const String&, + const String&, + const String&, + const String&>()) + .def_readonly("indexer", &GMQEDataContainers::indexer) + .def_readonly("seqres_container", &GMQEDataContainers::seqres_container) + .def_readonly("atomseq_container", &GMQEDataContainers::atomseq_container) + .def_readonly("dssp_container", &GMQEDataContainers::dssp_container) + .def_readonly("n_position_container", &GMQEDataContainers::n_position_container) + .def_readonly("ca_position_container", &GMQEDataContainers::ca_position_container) + .def_readonly("c_position_container", &GMQEDataContainers::c_position_container) + .def_readonly("cb_position_container", &GMQEDataContainers::cb_position_container) + ; + + def("ExtractTemplateDataGMQE", &WrapExtractTemplateDataGMQE, (arg("entry_name"), + arg("chain_name"), + arg("aln"), + arg("data_containers"), + arg("residue_numbers"), + arg("dssp"), + arg("n_positions"), + arg("ca_positions"), + arg("c_positions"), + arg("cb_positions"))); + } diff --git a/modules/db/src/extract_data_helper.cc b/modules/db/src/extract_data_helper.cc index d8e18bc6978714693dd0ee9bd6a94601f5570215..519c6945feae7c6122c59ddf38f779b0a6c2aaaa 100644 --- a/modules/db/src/extract_data_helper.cc +++ b/modules/db/src/extract_data_helper.cc @@ -48,14 +48,14 @@ void ExtractValidPositions(const String& entry_name, const String& chain_name, valid_seq.end())); } -void ExtractTemplateData(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - std::vector<LinearPositionContainerPtr>& position_containers, - std::vector<int>& residue_numbers, - std::vector<geom::Vec3List>& positions) { +void ExtractTemplateDataDisCo(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + LinearPositionContainerPtr& position_container, + std::vector<int>& residue_numbers, + geom::Vec3List& positions) { std::pair<uint64_t, uint64_t> data_range = indexer->GetDataRange(entry_name, chain_name); @@ -75,34 +75,119 @@ void ExtractTemplateData(const String& entry_name, const String& chain_name, String template_atomseq; atomseq_container->GetCharacters(data_range, template_atomseq); - int n_pos_containers = position_containers.size(); - std::vector<geom::Vec3List> extracted_positions(n_pos_containers); - for(int i = 0; i < n_pos_containers; ++i) { - position_containers[i]->GetPositions(data_range, extracted_positions[i]); - } + geom::Vec3List extracted_positions; + position_container->GetPositions(data_range, extracted_positions); + // prepare output + uint template_atomseq_size = template_atomseq.size(); + residue_numbers.clear(); + residue_numbers.reserve(template_atomseq_size); + positions.clear(); + positions.reserve(template_atomseq_size); + + // extract uint current_rnum = aln.GetSequence(0).GetOffset() + 1; uint current_template_pos = 0; String seqres_seq = aln.GetSequence(0).GetString(); String template_seq = aln.GetSequence(1).GetString(); + for(int i = 0; i < aln.GetLength(); ++i) { + if(seqres_seq[i] != '-' && template_seq[i] != '-') { + if(template_atomseq[current_template_pos] != '-') { + // it is aligned and we have a valid position! + residue_numbers.push_back(current_rnum); + positions.push_back(extracted_positions[current_template_pos]); + } + } + + if(seqres_seq[i] != '-') { + ++current_rnum; + } + + if(template_seq[i] != '-') { + ++current_template_pos; + } + } +} + +void ExtractTemplateDataGMQE(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + LinearCharacterContainerPtr dssp_container, + LinearPositionContainerPtr& n_position_container, + LinearPositionContainerPtr& ca_position_container, + LinearPositionContainerPtr& c_position_container, + LinearPositionContainerPtr& cb_position_container, + std::vector<int>& residue_numbers, + String& dssp, + geom::Vec3List& n_positions, + geom::Vec3List& ca_positions, + geom::Vec3List& c_positions, + geom::Vec3List& cb_positions) { + + std::pair<uint64_t, uint64_t> data_range = indexer->GetDataRange(entry_name, + chain_name); + + String template_seqres = aln.GetSequence(1).GetGaplessString(); + data_range.first += aln.GetSequence(1).GetOffset(); + data_range.second = data_range.first + template_seqres.size(); + + // check, whether the the template seqres is consistent with what + // we find in seqres_container + String expected_template_seqres; + seqres_container->GetCharacters(data_range, expected_template_seqres); + if(expected_template_seqres != template_seqres) { + throw std::runtime_error("Template sequence in input alignment is " + "inconsistent with sequence in SEQRES container!"); + } + + String template_atomseq; + atomseq_container->GetCharacters(data_range, template_atomseq); + String extracted_dssp; + dssp_container->GetCharacters(data_range, extracted_dssp); + geom::Vec3List extracted_n_positions; + n_position_container->GetPositions(data_range, extracted_n_positions); + geom::Vec3List extracted_ca_positions; + ca_position_container->GetPositions(data_range, extracted_ca_positions); + geom::Vec3List extracted_c_positions; + c_position_container->GetPositions(data_range, extracted_c_positions); + geom::Vec3List extracted_cb_positions; + cb_position_container->GetPositions(data_range, extracted_cb_positions); + // prepare output uint template_atomseq_size = template_atomseq.size(); - positions.assign(n_pos_containers, geom::Vec3List()); - for(int i = 0; i < n_pos_containers; ++i) { - positions[i].reserve(template_atomseq_size); - } residue_numbers.clear(); residue_numbers.reserve(template_atomseq_size); + dssp.clear(); + dssp.reserve(template_atomseq_size); + n_positions.clear(); + n_positions.reserve(template_atomseq_size); + ca_positions.clear(); + ca_positions.reserve(template_atomseq_size); + c_positions.clear(); + c_positions.reserve(template_atomseq_size); + cb_positions.clear(); + cb_positions.reserve(template_atomseq_size); + + + // extract + uint current_rnum = aln.GetSequence(0).GetOffset() + 1; + uint current_template_pos = 0; + String seqres_seq = aln.GetSequence(0).GetString(); + String template_seq = aln.GetSequence(1).GetString(); for(int i = 0; i < aln.GetLength(); ++i) { if(seqres_seq[i] != '-' && template_seq[i] != '-') { if(template_atomseq[current_template_pos] != '-') { // it is aligned and we have a valid position! residue_numbers.push_back(current_rnum); - for(int j = 0; j < n_pos_containers; ++j) { - positions[j].push_back(extracted_positions[j][current_template_pos]); - } + dssp.push_back(extracted_dssp[current_template_pos]); + n_positions.push_back(extracted_n_positions[current_template_pos]); + ca_positions.push_back(extracted_ca_positions[current_template_pos]); + c_positions.push_back(extracted_c_positions[current_template_pos]); + cb_positions.push_back(extracted_cb_positions[current_template_pos]); } } diff --git a/modules/db/src/extract_data_helper.hh b/modules/db/src/extract_data_helper.hh index b83c9a3154c0fb0ce4f5865b21b9e0c755cee260..dbcf7a214d526ea5c8d2919eff0f7dc5a0edb241 100644 --- a/modules/db/src/extract_data_helper.hh +++ b/modules/db/src/extract_data_helper.hh @@ -35,15 +35,31 @@ void ExtractValidPositions(const String& entry_name, const String& chain_name, ost::seq::SequenceHandle& seq, geom::Vec3List& positions); -void ExtractTemplateData(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - std::vector<LinearPositionContainerPtr>& position_container, - std::vector<int>& residue_numbers, - std::vector<geom::Vec3List>& positions); +void ExtractTemplateDataDisCo(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + LinearPositionContainerPtr& position_container, + std::vector<int>& residue_numbers, + geom::Vec3List& positions); +void ExtractTemplateDataGMQE(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + LinearCharacterContainerPtr dssp_container, + LinearPositionContainerPtr& n_position_container, + LinearPositionContainerPtr& ca_position_container, + LinearPositionContainerPtr& c_position_container, + LinearPositionContainerPtr& cb_position_container, + std::vector<int>& residue_numbers, + String& dssp, + geom::Vec3List& n_positions, + geom::Vec3List& ca_positions, + geom::Vec3List& c_positions, + geom::Vec3List& cb_positions); }} //ns #endif