diff --git a/modules/db/doc/db.rst b/modules/db/doc/db.rst index 73f885dae77fbed5a89709ba219514349c10aaa9..87dc0919e840133815ea56af101ca8d9096b2d11 100644 --- a/modules/db/doc/db.rst +++ b/modules/db/doc/db.rst @@ -351,164 +351,45 @@ marks non-resolved residues with '-'. :raises: :exc:`ost.Error` if requested data is not present -.. class:: DisCoDataContainers(indexer_path, \ - seqres_container_path, \ - atomseq_container_path, \ - position_container_path) +.. method:: ExtractTemplateData(entry_name, chain_name, aln, indexer, \ + seqres_container, atomseq_container, \ + position_container) - Thin wrapper to organize the data containers required in - :meth:`ExtractTemplateDataDisCo`. All containers are loaded - from disk and are expected to be consistent. - - :param indexer_path: Path to indexer file to access the data containers - (:class:`LinearIndexer`) - :param seqres_container_path: Path to container with SEQRES data - (:class:`LinearCharacterContainer`) - :param atomseq_container_path: Path to container with ATOMSEQ data - (:class:`LinearCharacterContainer`) - :param position_container_path: Path to container with CA position data - (:class:`LinearPositionContainer`) - - :type indexer_path: :class:`str` - :type seqres_container_path: :class:`str` - :type atomseq_container_path: :class:`str` - :type position_container_path: :class:`str` - - .. attribute:: indexer - - .. attribute:: seqres_container - - .. attribute:: atomseq_container - - .. attribute:: position_container - -.. method:: ExtractTemplateDataDisCo(entry_name, chain_name, aln, \ - data_containers, residue_numbers, \ - positions) - - Data extraction helper for the specific use of extracting data required - to generate DisCoContainers in the QMEAN scoring function. Let's say we have a target-template alignment in *aln* (first seq: target, second seq: template). This function extracts all valid template positions - and their residue numbers given the entry specified by *entry_name* and - *chain_name*. The template sequence in *aln* must match the sequence in - *seqres_container*. Again, the *atomseq_container* is used to identify valid - positions. - - :param entry_name: Name of assembly you want the data from - :param chain_name: Name of chain you want the data from - :param aln: Target-template sequence alignment - :param data_containers: Contains all required data containers - :param residue_numbers: Residue numbers of extracted positions will be stored - in here - :param positions: Extracted positions will be stored in here - :type entry_name: :class:`str` - :type chain_name: :class:`str` - :type aln: :ost.seq.SequenceHandle: - :type data_containers: :class:`DisCoDataContainers` - :type residue_numbers: :class:`list` of :class:`int` - :type positions: :class:`geom::Vec3List` - - :raises: :exc:`ost.Error` if requested data is not present in - the containers or if the template sequence in *aln* - doesn't match with the sequence in *seqres_container* - - - - -.. class:: GMQEDataContainers(indexer_path, \ - seqres_container_path, \ - atomseq_container_path, \ - dssp_container_path, \ - n_position_container_path, \ - ca_position_container_path, \ - c_position_container_path, \ - cb_position_container_path, \) - - Thin wrapper to organize the data containers required in - :meth:`ExtractTemplateDataGMQE`. All containers are loaded - from disk and are expected to be consistent. - - :param indexer_path: Path to indexer file to access the data containers - (:class:`LinearIndexer`) - :param seqres_container_path: Path to container with SEQRES data - (:class:`LinearCharacterContainer`) - :param atomseq_container_path: Path to container with ATOMSEQ data - (:class:`LinearCharacterContainer`) - :param dssp_container_path: Path to container containing dssp states - (:class:`LinearCharacterContainer`) - :param n_position_container_path: Path to container with N position data - (:class:`LinearPositionContainer`) - :param ca_position_container_path: Path to container with CA position data - (:class:`LinearPositionContainer`) - :param c_position_container_path: Path to container with C position data - (:class:`LinearPositionContainer`) - :param cb_position_container_path: Path to container with CB position data - (:class:`LinearPositionContainer`) - - :type indexer_path: :class:`str` - :type seqres_container_path: :class:`str` - :type atomseq_container_path: :class:`str` - :type dssp_container_path: :class:`str` - :type n_position_container_path: :class:`str` - :type ca_position_container_path: :class:`str` - :type c_position_container_path: :class:`str` - :type cb_position_container_path: :class:`str` - - .. attribute:: indexer - - .. attribute:: seqres_container - - .. attribute:: atomseq_container - - .. attribute:: dssp_container - - .. attribute:: n_position_container - - .. attribute:: ca_position_container - - .. attribute:: c_position_container - - .. attribute:: cb_position_container - - -.. method:: ExtractTemplateDataGMQE(entry_name, chain_name, aln, \ - data_containers, residue_numbers, \ - dssp, n_positions, ca_positions, \ - c_positions, cb_positions) - - Data extraction helper for the specific use of extracting data required - to estimate GMQE as available in the QMEAN software package. - Let's say we have a target-template alignment in *aln* (first seq: target, - second seq: template). This function extracts all valid template positions, - their dssp states and their residue numbers given the entry specified by - *entry_name* and *chain_name*. The template sequence in *aln* must match the - sequence in *seqres_container*. Again, the *atomseq_container* is used to - identify valid positions. + given the entry specified by *entry_name* and *chain_name*. The template + sequence in *aln* must match the sequence in *seqres_container*. Again, + the *atomseq_container* is used to identify valid positions. The according + residue numbers relative to the target sequence in *aln* are also returned. :param entry_name: Name of assembly you want the data from :param chain_name: Name of chain you want the data from :param aln: Target-template sequence alignment - :param data_containers: Contains all required data containers - :param residue_numbers: Residue numbers of extracted positions will be stored - in here - :param dssp: Extracted DSSP states will be stored in here - :param n_positions: Extracted n_positions will be stored in here - :param ca_positions: Extracted ca_positions will be stored in here - :param c_positions: Extracted c_positions will be stored in here - :param cb_positions: Extracted cb_positions will be stored in here + :param indexer: Used to access *atomseq_container*, *seqres_container* + and *position_container* + :param seqres_container: Container containing the full sequence data + :param atomseq_container: Container that marks locations with invalid position + data with '-' + :param position_container: Container containing position data :type entry_name: :class:`str` :type chain_name: :class:`str` :type aln: :ost.seq.SequenceHandle: - :type data_containers: :class:`GMQEDataContainers` - :type residue_numbers: :class:`list` of :class:`int` - :type dssp: :class:`str` - :type n_positions: :class:`geom::Vec3List` - :type ca_positions: :class:`geom::Vec3List` - :type c_positions: :class:`geom::Vec3List` - :type cb_positions: :class:`geom::Vec3List` + :type indexer: :class:`LinearIndexer` + :type seqres_container: :class:`LinearCharacterContainer` + :type atomseq_container: :class:`LinearCharacterContainer` + :type position_container: :class:`LinearPositionContainer` or + :class:`list` of :class:`LinearPositionContainer` + + :returns: First element: :class:`list` of residue numbers that + relate each entry in the second element to the target + sequence specified in *aln*. The numbering scheme + starts from one. Second Element: the according positions. + :class:`ost.geom.Vec3List` if *position_container* was a + :class:`LinearPositionContainer`, a list of + :class:`ost.geom.Vec3List` if it was a :class:`list`. + :rtype: :class:`tuple` :raises: :exc:`ost.Error` if requested data is not present in - the containers or if the template sequence in *aln* + the container or if the template sequence in *aln* doesn't match with the sequence in *seqres_container* diff --git a/modules/db/pymod/export_linear_db.cc b/modules/db/pymod/export_linear_db.cc index 6ee2d2f0191be3b4a78a3652d0edf3da2ca33483..1cb623cb5b8644d6af0dbd939ca84fc8d44a31c0 100644 --- a/modules/db/pymod/export_linear_db.cc +++ b/modules/db/pymod/export_linear_db.cc @@ -147,102 +147,52 @@ void WrapGetPositions(LinearPositionContainerPtr container, container->GetPositions(p_range, positions); } -// helper struct to reduce number of input parameters -struct DisCoDataContainers { - - DisCoDataContainers(const String& indexer_path, - const String& seqres_container_path, - const String& atomseq_container_path, - const String& position_container_path) { - indexer = LinearIndexer::Load(indexer_path); - seqres_container = LinearCharacterContainer::Load(seqres_container_path); - atomseq_container = LinearCharacterContainer::Load(atomseq_container_path); - position_container = LinearPositionContainer::Load(position_container_path); - } - LinearIndexerPtr indexer; - LinearCharacterContainerPtr seqres_container; - LinearCharacterContainerPtr atomseq_container; - LinearPositionContainerPtr position_container; -}; +tuple WrapExtractTemplateDataSingle(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + LinearPositionContainerPtr position_container) { + std::vector<LinearPositionContainerPtr> position_containers; + position_containers.push_back(position_container); + std::vector<int> v_residue_numbers; + std::vector<geom::Vec3List> position_vec; -void WrapExtractTemplateDataDisCo(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - DisCoDataContainers& data_containers, - boost::python::list& residue_numbers, - geom::Vec3List& positions) { + ost::db::ExtractTemplateData(entry_name, chain_name, aln, indexer, + seqres_container, atomseq_container, + position_containers, v_residue_numbers, + position_vec); - std::vector<int> v_residue_numbers; - ost::db::ExtractTemplateDataDisCo(entry_name, chain_name, aln, - data_containers.indexer, - data_containers.seqres_container, - data_containers.atomseq_container, - data_containers.position_container, - v_residue_numbers, - positions); - residue_numbers = boost::python::list(); + list residue_numbers; VecToList(v_residue_numbers, residue_numbers); + return boost::python::make_tuple(residue_numbers, position_vec[0]); } +tuple WrapExtractTemplateDataList(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + boost::python::list& position_containers) { -// helper struct to reduce number of input parameters -struct GMQEDataContainers { - - GMQEDataContainers(const String& indexer_path, - const String& seqres_container_path, - const String& atomseq_container_path, - const String& dssp_container_path, - const String& n_position_container_path, - const String& ca_position_container_path, - const String& c_position_container_path, - const String& cb_position_container_path) { - indexer = LinearIndexer::Load(indexer_path); - seqres_container = LinearCharacterContainer::Load(seqres_container_path); - atomseq_container = LinearCharacterContainer::Load(atomseq_container_path); - dssp_container = LinearCharacterContainer::Load(dssp_container_path); - n_position_container = LinearPositionContainer::Load(n_position_container_path); - ca_position_container = LinearPositionContainer::Load(ca_position_container_path); - c_position_container = LinearPositionContainer::Load(c_position_container_path); - cb_position_container = LinearPositionContainer::Load(cb_position_container_path); - } + std::vector<LinearPositionContainerPtr> v_position_containers; + ListToVec(position_containers, v_position_containers); + std::vector<int> v_residue_numbers; + std::vector<geom::Vec3List> position_vec; - LinearIndexerPtr indexer; - LinearCharacterContainerPtr seqres_container; - LinearCharacterContainerPtr atomseq_container; - LinearCharacterContainerPtr dssp_container; - LinearPositionContainerPtr n_position_container; - LinearPositionContainerPtr ca_position_container; - LinearPositionContainerPtr c_position_container; - LinearPositionContainerPtr cb_position_container; -}; - - -void WrapExtractTemplateDataGMQE(const String& entry_name, - const String& chain_name, - const ost::seq::AlignmentHandle& aln, - GMQEDataContainers& data_containers, - boost::python::list& residue_numbers, - String& dssp, - geom::Vec3List& n_positions, - geom::Vec3List& ca_positions, - geom::Vec3List& c_positions, - geom::Vec3List& cb_positions) { + ost::db::ExtractTemplateData(entry_name, chain_name, aln, indexer, + seqres_container, atomseq_container, + v_position_containers, v_residue_numbers, + position_vec); - std::vector<int> v_residue_numbers; - ost::db::ExtractTemplateDataGMQE(entry_name, chain_name, aln, - data_containers.indexer, - data_containers.seqres_container, - data_containers.atomseq_container, - data_containers.dssp_container, - data_containers.n_position_container, - data_containers.ca_position_container, - data_containers.c_position_container, - data_containers.cb_position_container, - v_residue_numbers, dssp, n_positions, - ca_positions, c_positions, cb_positions); + list residue_numbers; + list position_list; VecToList(v_residue_numbers, residue_numbers); -} + VecToList(position_vec, position_list); + return boost::python::make_tuple(residue_numbers, position_list); +} } @@ -302,51 +252,20 @@ void export_linear_db() { arg("position_container"), arg("seq"), arg("positions"))); - class_<DisCoDataContainers>("DisCoDataContainers", init<const String&, - const String&, - const String&, - const String&>()) - .def_readonly("indexer", &DisCoDataContainers::indexer) - .def_readonly("seqres_container", &DisCoDataContainers::seqres_container) - .def_readonly("atomseq_container", &DisCoDataContainers::atomseq_container) - .def_readonly("position_container", &DisCoDataContainers::position_container) - ; - - def("ExtractTemplateDataDisCo", &WrapExtractTemplateDataDisCo, (arg("entry_name"), - arg("chain_name"), - arg("aln"), - arg("data_containers"), - arg("residue_numbers"), - arg("positions"))); - - class_<GMQEDataContainers>("GMQEDataContainers", init<const String&, - const String&, - const String&, - const String&, - const String&, - const String&, - const String&, - const String&>()) - .def_readonly("indexer", &GMQEDataContainers::indexer) - .def_readonly("seqres_container", &GMQEDataContainers::seqres_container) - .def_readonly("atomseq_container", &GMQEDataContainers::atomseq_container) - .def_readonly("dssp_container", &GMQEDataContainers::dssp_container) - .def_readonly("n_position_container", &GMQEDataContainers::n_position_container) - .def_readonly("ca_position_container", &GMQEDataContainers::ca_position_container) - .def_readonly("c_position_container", &GMQEDataContainers::c_position_container) - .def_readonly("cb_position_container", &GMQEDataContainers::cb_position_container) - ; - - def("ExtractTemplateDataGMQE", &WrapExtractTemplateDataGMQE, (arg("entry_name"), - arg("chain_name"), - arg("aln"), - arg("data_containers"), - arg("residue_numbers"), - arg("dssp"), - arg("n_positions"), - arg("ca_positions"), - arg("c_positions"), - arg("cb_positions"))); - + def("ExtractTemplateData", &WrapExtractTemplateDataSingle, (arg("entry_name"), + arg("chain_name"), + arg("aln"), + arg("linear_indexer"), + arg("seqres_container"), + arg("atomseq_container"), + arg("position_container"))); + + def("ExtractTemplateData", &WrapExtractTemplateDataList, (arg("entry_name"), + arg("chain_name"), + arg("aln"), + arg("linear_indexer"), + arg("seqres_container"), + arg("atomseq_container"), + arg("position_containers"))); } diff --git a/modules/db/src/extract_data_helper.cc b/modules/db/src/extract_data_helper.cc index 519c6945feae7c6122c59ddf38f779b0a6c2aaaa..d8e18bc6978714693dd0ee9bd6a94601f5570215 100644 --- a/modules/db/src/extract_data_helper.cc +++ b/modules/db/src/extract_data_helper.cc @@ -48,14 +48,14 @@ void ExtractValidPositions(const String& entry_name, const String& chain_name, valid_seq.end())); } -void ExtractTemplateDataDisCo(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - LinearPositionContainerPtr& position_container, - std::vector<int>& residue_numbers, - geom::Vec3List& positions) { +void ExtractTemplateData(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + std::vector<LinearPositionContainerPtr>& position_containers, + std::vector<int>& residue_numbers, + std::vector<geom::Vec3List>& positions) { std::pair<uint64_t, uint64_t> data_range = indexer->GetDataRange(entry_name, chain_name); @@ -75,119 +75,34 @@ void ExtractTemplateDataDisCo(const String& entry_name, const String& chain_name String template_atomseq; atomseq_container->GetCharacters(data_range, template_atomseq); - geom::Vec3List extracted_positions; - position_container->GetPositions(data_range, extracted_positions); - - // prepare output - uint template_atomseq_size = template_atomseq.size(); - residue_numbers.clear(); - residue_numbers.reserve(template_atomseq_size); - positions.clear(); - positions.reserve(template_atomseq_size); + int n_pos_containers = position_containers.size(); + std::vector<geom::Vec3List> extracted_positions(n_pos_containers); + for(int i = 0; i < n_pos_containers; ++i) { + position_containers[i]->GetPositions(data_range, extracted_positions[i]); + } - // extract uint current_rnum = aln.GetSequence(0).GetOffset() + 1; uint current_template_pos = 0; String seqres_seq = aln.GetSequence(0).GetString(); String template_seq = aln.GetSequence(1).GetString(); - for(int i = 0; i < aln.GetLength(); ++i) { - if(seqres_seq[i] != '-' && template_seq[i] != '-') { - if(template_atomseq[current_template_pos] != '-') { - // it is aligned and we have a valid position! - residue_numbers.push_back(current_rnum); - positions.push_back(extracted_positions[current_template_pos]); - } - } - - if(seqres_seq[i] != '-') { - ++current_rnum; - } - - if(template_seq[i] != '-') { - ++current_template_pos; - } - } -} - -void ExtractTemplateDataGMQE(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - LinearCharacterContainerPtr dssp_container, - LinearPositionContainerPtr& n_position_container, - LinearPositionContainerPtr& ca_position_container, - LinearPositionContainerPtr& c_position_container, - LinearPositionContainerPtr& cb_position_container, - std::vector<int>& residue_numbers, - String& dssp, - geom::Vec3List& n_positions, - geom::Vec3List& ca_positions, - geom::Vec3List& c_positions, - geom::Vec3List& cb_positions) { - - std::pair<uint64_t, uint64_t> data_range = indexer->GetDataRange(entry_name, - chain_name); - - String template_seqres = aln.GetSequence(1).GetGaplessString(); - data_range.first += aln.GetSequence(1).GetOffset(); - data_range.second = data_range.first + template_seqres.size(); - - // check, whether the the template seqres is consistent with what - // we find in seqres_container - String expected_template_seqres; - seqres_container->GetCharacters(data_range, expected_template_seqres); - if(expected_template_seqres != template_seqres) { - throw std::runtime_error("Template sequence in input alignment is " - "inconsistent with sequence in SEQRES container!"); - } - - String template_atomseq; - atomseq_container->GetCharacters(data_range, template_atomseq); - String extracted_dssp; - dssp_container->GetCharacters(data_range, extracted_dssp); - geom::Vec3List extracted_n_positions; - n_position_container->GetPositions(data_range, extracted_n_positions); - geom::Vec3List extracted_ca_positions; - ca_position_container->GetPositions(data_range, extracted_ca_positions); - geom::Vec3List extracted_c_positions; - c_position_container->GetPositions(data_range, extracted_c_positions); - geom::Vec3List extracted_cb_positions; - cb_position_container->GetPositions(data_range, extracted_cb_positions); - // prepare output uint template_atomseq_size = template_atomseq.size(); + positions.assign(n_pos_containers, geom::Vec3List()); + for(int i = 0; i < n_pos_containers; ++i) { + positions[i].reserve(template_atomseq_size); + } residue_numbers.clear(); residue_numbers.reserve(template_atomseq_size); - dssp.clear(); - dssp.reserve(template_atomseq_size); - n_positions.clear(); - n_positions.reserve(template_atomseq_size); - ca_positions.clear(); - ca_positions.reserve(template_atomseq_size); - c_positions.clear(); - c_positions.reserve(template_atomseq_size); - cb_positions.clear(); - cb_positions.reserve(template_atomseq_size); - - - // extract - uint current_rnum = aln.GetSequence(0).GetOffset() + 1; - uint current_template_pos = 0; - String seqres_seq = aln.GetSequence(0).GetString(); - String template_seq = aln.GetSequence(1).GetString(); for(int i = 0; i < aln.GetLength(); ++i) { if(seqres_seq[i] != '-' && template_seq[i] != '-') { if(template_atomseq[current_template_pos] != '-') { // it is aligned and we have a valid position! residue_numbers.push_back(current_rnum); - dssp.push_back(extracted_dssp[current_template_pos]); - n_positions.push_back(extracted_n_positions[current_template_pos]); - ca_positions.push_back(extracted_ca_positions[current_template_pos]); - c_positions.push_back(extracted_c_positions[current_template_pos]); - cb_positions.push_back(extracted_cb_positions[current_template_pos]); + for(int j = 0; j < n_pos_containers; ++j) { + positions[j].push_back(extracted_positions[j][current_template_pos]); + } } } diff --git a/modules/db/src/extract_data_helper.hh b/modules/db/src/extract_data_helper.hh index dbcf7a214d526ea5c8d2919eff0f7dc5a0edb241..b83c9a3154c0fb0ce4f5865b21b9e0c755cee260 100644 --- a/modules/db/src/extract_data_helper.hh +++ b/modules/db/src/extract_data_helper.hh @@ -35,31 +35,15 @@ void ExtractValidPositions(const String& entry_name, const String& chain_name, ost::seq::SequenceHandle& seq, geom::Vec3List& positions); -void ExtractTemplateDataDisCo(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - LinearPositionContainerPtr& position_container, - std::vector<int>& residue_numbers, - geom::Vec3List& positions); +void ExtractTemplateData(const String& entry_name, const String& chain_name, + const ost::seq::AlignmentHandle& aln, + LinearIndexerPtr indexer, + LinearCharacterContainerPtr seqres_container, + LinearCharacterContainerPtr atomseq_container, + std::vector<LinearPositionContainerPtr>& position_container, + std::vector<int>& residue_numbers, + std::vector<geom::Vec3List>& positions); -void ExtractTemplateDataGMQE(const String& entry_name, const String& chain_name, - const ost::seq::AlignmentHandle& aln, - LinearIndexerPtr indexer, - LinearCharacterContainerPtr seqres_container, - LinearCharacterContainerPtr atomseq_container, - LinearCharacterContainerPtr dssp_container, - LinearPositionContainerPtr& n_position_container, - LinearPositionContainerPtr& ca_position_container, - LinearPositionContainerPtr& c_position_container, - LinearPositionContainerPtr& cb_position_container, - std::vector<int>& residue_numbers, - String& dssp, - geom::Vec3List& n_positions, - geom::Vec3List& ca_positions, - geom::Vec3List& c_positions, - geom::Vec3List& cb_positions); }} //ns #endif