From e61319899eea975db2adf8ec07cc32e72b1ae6f7 Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Fri, 29 Jul 2011 18:00:30 +0200 Subject: [PATCH] First step towards filling ChainType with life from the MMCifParser. It's fully functional but only stores the type of chain, at the moment. --- modules/io/src/mol/mmcif_reader.cc | 84 ++++++++- modules/io/src/mol/mmcif_reader.hh | 37 +++- modules/io/tests/test_mmcif_reader.cc | 162 ++++++++++++++++-- .../io/tests/testfiles/mmcif/atom_site.mmcif | 14 ++ .../tests/testfiles/mmcif/model_truepos.mmcif | 3 + 5 files changed, 269 insertions(+), 31 deletions(-) diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index 587cc670c..3bac82e55 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -58,11 +58,12 @@ void MMCifParser::Init() residue_count_ = 0; auth_chain_id_ = false; has_model_ = false; - //memset(indices_, -1, MAX_ITEMS_IN_ROW * sizeof(int)); restrict_chains_ = ""; subst_res_id_ = ""; - //curr_chain_ = mol::ChainHandle(); - //curr_residue_ = mol::ResidueHandle(); + curr_chain_ = mol::ChainHandle(); + curr_residue_ = mol::ResidueHandle(); + //chain_id_pairs_ = + //entity_desc_map_ } void MMCifParser::ClearState() @@ -117,7 +118,7 @@ bool MMCifParser::OnBeginLoop(const StarLoopDesc& header) category_counts_[category_]++; // mandatory items this->TryStoreIdx(AUTH_ASYM_ID, "auth_asym_id", header); - this->TryStoreIdx(ID, "id", header); + this->TryStoreIdx(AS_ID, "id", header); this->TryStoreIdx(LABEL_ALT_ID, "label_alt_id", header); this->TryStoreIdx(LABEL_ASYM_ID, "label_asym_id", header); this->TryStoreIdx(LABEL_ATOM_ID, "label_atom_id", header); @@ -147,6 +148,14 @@ bool MMCifParser::OnBeginLoop(const StarLoopDesc& header) } } return true; + } else if (header.GetCategory()=="entity") { + category_ = ENTITY; + category_counts_[category_]++; + // mandatory items + this->TryStoreIdx(E_ID, "id", header); + // optional + 
indices_[E_TYPE] = header.GetIndex("type"); + return true; } /*else if (header.GetCategory()=="entity_poly") { } else if (header.GetCategory()=="pdbx_poly_seq_scheme") { @@ -187,7 +196,7 @@ bool MMCifParser::ParseAtomIdent(const std::vector<StringRef>& columns, return false; } - std::pair<bool, int> a_num = this->TryGetInt(columns[indices_[ID]], + std::pair<bool, int> a_num = this->TryGetInt(columns[indices_[AS_ID]], "atom_site.id", profile_.fault_tolerant); // unit test @@ -275,8 +284,8 @@ void MMCifParser::ParseAndAddAtom(const std::vector<StringRef>& columns) LOG_TRACE( "s_chain: [" << chain_name << "]" ); // determine chain and residue update - bool update_chain=false; - bool update_residue=false; + bool update_chain = false; + bool update_residue = false; if(!curr_chain_) { // unit test update_chain=true; update_residue=true; @@ -315,8 +324,19 @@ void MMCifParser::ParseAndAddAtom(const std::vector<StringRef>& columns) LOG_DEBUG("new chain " << chain_name); curr_chain_=editor.InsertChain(chain_name); ++chain_count_; + // store entity id + chain_id_pairs_.push_back(std::pair<mol::ChainHandle,String>(curr_chain_, + columns[indices_[LABEL_ENTITY_ID]].str())); } assert(curr_chain_.IsValid()); + } else if (chain_id_pairs_.back().second != // unit test + columns[indices_[LABEL_ENTITY_ID]].str()) { + // check that label_entity_id stays the same + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "Change of 'atom_site.label_entity_id' item for chain " + + curr_chain_.GetName() + "! Expected: " + chain_id_pairs_.back().second + + ", found: " + columns[indices_[LABEL_ENTITY_ID]].str() + ".", + this->GetCurrentLinenum())); } if(update_residue) { // unit test @@ -408,6 +428,37 @@ void MMCifParser::ParseAndAddAtom(const std::vector<StringRef>& columns) } +void MMCifParser::ParseEntity(const std::vector<StringRef>& columns) +{ + bool store = false; // is it worth storing this record? 
+ MMCifEntityDesc desc; + + // type + if (indices_[E_TYPE] != -1) { + if(StringRef("polymer", 7) == columns[indices_[E_TYPE]]) { + desc.type = CHAINTYPE_POLY; + } else if(StringRef("non-polymer", 11) == columns[indices_[E_TYPE]]) { + desc.type = CHAINTYPE_NON_POLY; + } else if(StringRef("water", 5) == columns[indices_[E_TYPE]]) { + desc.type = CHAINTYPE_WATER; + } else { + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "Unrecognised chain type '" + + columns[indices_[E_TYPE]].str() + + "' found.", + this->GetCurrentLinenum())); + } + store = true; + } + + if (store) { + entity_desc_map_.insert( + MMCifEntityDescMap::value_type(columns[indices_[E_ID]].str(), + desc) + ); + } +} + void MMCifParser::OnDataRow(const StarLoopDesc& header, const std::vector<StringRef>& columns) { @@ -416,6 +467,9 @@ void MMCifParser::OnDataRow(const StarLoopDesc& header, LOG_TRACE("processing atom_site entry"); this->ParseAndAddAtom(columns); break; + case ENTITY: + LOG_TRACE("processing entity entry"); + this->ParseEntity(columns); default: return; } @@ -563,6 +617,22 @@ void PDBReader::Import(mol::EntityHandle& ent, void MMCifParser::OnEndData() { + mol::XCSEditor editor=ent_handle_.EditXCS(mol::BUFFERED_EDIT); + + // process chain types + std::vector<std::pair<mol::ChainHandle, String> >::const_iterator css; + MMCifEntityDescMap::const_iterator edm_it; + for (css = chain_id_pairs_.begin(); css != chain_id_pairs_.end(); ++css) { + edm_it = entity_desc_map_.find(css->second); + + if (edm_it != entity_desc_map_.end()) { + editor.SetChainType(css->first, edm_it->second.type); + } else { + LOG_WARNING("No entity description found for atom_site.label_entity_id '" + << css->second << "'"); + } + } + LOG_INFO("imported " << chain_count_ << " chains, " << residue_count_ << " residues, " diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh index 41e586a14..529702a74 100644 --- a/modules/io/src/mol/mmcif_reader.hh +++ 
b/modules/io/src/mol/mmcif_reader.hh @@ -19,10 +19,13 @@ #ifndef OST_MMCIF_PARSER_HH #define OST_MMCIF_PARSER_HH +#include <map> + //#include <boost/iostreams/filtering_stream.hpp> //#include <boost/filesystem/fstream.hpp> #include <ost/mol/residue_handle.hh> +#include <ost/mol/chain_type.hh> #include <ost/io/mol/io_profile.hh> #include <ost/io/io_exception.hh> #include <ost/io/mol/star_parser.hh> @@ -127,11 +130,12 @@ public: /// \brief fetch values identifying atoms /// - /// \param[in] columns data row + /// \param[in] columns data row /// \param[out] chain_name takes atom_site.label_asym_id or, if /// auth_chain_id_ is set, atom_site.auth_asym_id as a chain name + /// \param[out] res_name fetches atom_site.label_comp_id - /// \param atom_name corresponds to label_atom_id + /// \param[out] atom_name corresponds to label_atom_id bool ParseAtomIdent(const std::vector<StringRef>& columns, String& chain_name, StringRef& res_name, @@ -145,6 +149,11 @@ public: /// \param columns data row void ParseAndAddAtom(const std::vector<StringRef>& columns); + /// \brief Fetch MMCif entity information + /// + /// \param columns data row + void ParseEntity(const std::vector<StringRef>& columns); + private: /// \enum magic numbers of this class typedef enum { @@ -155,12 +164,12 @@ private: /// \enum items of the atom_site category typedef enum { AUTH_ASYM_ID, ///< chain name by author as in PDB - ID, ///< atom serial id + AS_ID, ///< atom serial id LABEL_ALT_ID, ///< AltLoc LABEL_ASYM_ID, ///< chain name by PDB LABEL_ATOM_ID, LABEL_COMP_ID, - LABEL_ENTITY_ID, + LABEL_ENTITY_ID, ///< link to category entity LABEL_SEQ_ID, ///< residue no. AUTH_SEQ_ID, ///< residue no. by author TYPE_SYMBOL, ///< chemical element @@ -174,12 +183,26 @@ private: PDBX_PDB_MODEL_NUM ///< model no. 
(especially NMR structures) } AtomSiteItems; + /// \enum items of the entity category + typedef enum { + E_ID, ///< unique identifier + E_TYPE ///< polymer, non-polymer or water + } EntityItems; + /// \enum categories of the mmcif format typedef enum { ATOM_SITE, + ENTITY, DONT_KNOW } MMCifCategory; + /// \struct keeping track of entity information + typedef struct { + ChainType type; ///< characterise entity + } MMCifEntityDesc; + + typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap; + // members MMCifCategory category_; int category_counts_[DONT_KNOW]; ///< overall no. of atom_site loops @@ -197,9 +220,9 @@ private: String subst_res_id_; ///< work around for missing label_seq_id's bool has_model_; ///< keep track of models through different atom_sites int curr_model_; ///< if we have pdbx_PDB_model_num, store no. - //from pdbdreader - //entity als member, fill in ondatarow - //import function + std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_; + ///< chain and label_entity_id + MMCifEntityDescMap entity_desc_map_; ///< stores entity items }; }} diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc index 524ad6bc6..ad669bd7a 100644 --- a/modules/io/tests/test_mmcif_reader.cc +++ b/modules/io/tests/test_mmcif_reader.cc @@ -39,9 +39,16 @@ public: MMCifParser(stream, ent_handle, profile) { } + TestMMCifParserProtected(const String& filename, + mol::EntityHandle& ent_handle): + MMCifParser(filename, ent_handle, IOProfile()) + { } + + using MMCifParser::OnBeginLoop; using MMCifParser::IsValidPDBIdent; using MMCifParser::ParseAtomIdent; using MMCifParser::ParseAndAddAtom; + using MMCifParser::ParseEntity; using MMCifParser::TryStoreIdx; }; @@ -94,43 +101,54 @@ BOOST_AUTO_TEST_CASE(mmcif_onbeginloop) std::ifstream s("testfiles/mmcif/atom_site.mmcif"); MMCifParser mmcif_p(s, eh, IOProfile()); StarLoopDesc mmcif_h; + BOOST_MESSAGE(" testing atom_site items..."); mmcif_h.SetCategory(StringRef("atom_site", 9)); 
- BOOST_MESSAGE(" auth_asym_id"); + BOOST_MESSAGE(" auth_asym_id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("auth_asym_id", 12)); - BOOST_MESSAGE(" id"); + BOOST_MESSAGE(" id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("id", 2)); - BOOST_MESSAGE(" label_alt_id"); + BOOST_MESSAGE(" label_alt_id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("label_alt_id", 12)); - BOOST_MESSAGE(" label_asym_id"); + BOOST_MESSAGE(" label_asym_id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("label_asym_id", 13)); - BOOST_MESSAGE(" label_atom_id"); + BOOST_MESSAGE(" label_atom_id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("label_atom_id", 13)); - BOOST_MESSAGE(" label_comp_id"); + BOOST_MESSAGE(" label_comp_id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("label_comp_id", 13)); - BOOST_MESSAGE(" label_entity_id"); + BOOST_MESSAGE(" label_entity_id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("label_entity_id", 15)); - BOOST_MESSAGE(" label_seq_id"); + BOOST_MESSAGE(" label_seq_id"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("label_seq_id", 12)); - BOOST_MESSAGE(" type_symbol"); + BOOST_MESSAGE(" type_symbol"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("type_symbol", 11)); - BOOST_MESSAGE(" Cartn_x"); + BOOST_MESSAGE(" Cartn_x"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("Cartn_x", 7)); - BOOST_MESSAGE(" Cartn_y"); + BOOST_MESSAGE(" Cartn_y"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); mmcif_h.Add(StringRef("Cartn_y", 7)); - BOOST_MESSAGE(" Cartn_z"); + BOOST_MESSAGE(" Cartn_z"); BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); 
mmcif_h.Add(StringRef("Cartn_z", 7)); + BOOST_CHECK_NO_THROW(mmcif_p.OnBeginLoop(mmcif_h)); + BOOST_MESSAGE(" done."); + mmcif_h.Clear(); + BOOST_MESSAGE(" testing entity items..."); + mmcif_h.SetCategory(StringRef("entity", 6)); + BOOST_MESSAGE(" id"); + BOOST_CHECK_THROW(mmcif_p.OnBeginLoop(mmcif_h), IOException); + mmcif_h.Add(StringRef("id", 2)); + BOOST_CHECK_NO_THROW(mmcif_p.OnBeginLoop(mmcif_h)); + BOOST_MESSAGE(" done."); BOOST_MESSAGE(" done."); } @@ -178,23 +196,133 @@ BOOST_AUTO_TEST_CASE(mmcif_parse_models) BOOST_MESSAGE(" done."); } -BOOST_AUTO_TEST_CASE(mmcif_parseatomident) +BOOST_AUTO_TEST_CASE(mmcif_changing_label_entity_id) +{ + BOOST_MESSAGE(" Running mmcif_changing_label_entity_id tests..."); + IOProfile profile; + + // positive + BOOST_MESSAGE(" true positive test..."); + { + mol::EntityHandle eh = mol::CreateEntity(); + MMCifParser mmcif_p("testfiles/mmcif/atom_site.mmcif", eh, profile); + BOOST_CHECK_NO_THROW(mmcif_p.Parse()); + } + BOOST_MESSAGE(" done."); + + // negative + BOOST_MESSAGE(" true negative test..."); + { + mol::EntityHandle eh = mol::CreateEntity(); + MMCifParser mmcif_p("testfiles/mmcif/changing_label_entity_id.mmcif", eh, + profile); + BOOST_CHECK_THROW(mmcif_p.Parse(), IOException); + } + BOOST_MESSAGE(" done."); + + BOOST_MESSAGE(" done."); +} + +BOOST_AUTO_TEST_CASE(mmcif_unknown_entity_type) { + BOOST_MESSAGE(" Running mmcif_unknown_entity_type tests..."); + mol::EntityHandle eh = mol::CreateEntity(); + std::vector<StringRef> columns; + TestMMCifParserProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh); + StarLoopDesc tmmcif_h; + // build dummy header + tmmcif_h.SetCategory(StringRef("entity", 6)); + tmmcif_h.Add(StringRef("id", 2)); + tmmcif_h.Add(StringRef("type", 4)); + tmmcif_p.OnBeginLoop(tmmcif_h); + + // positive + BOOST_MESSAGE(" known type..."); + // build datarow + columns.push_back(StringRef("1", 1)); + columns.push_back(StringRef("polymer", 7)); + BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns)); + 
columns.pop_back(); + columns.push_back(StringRef("non-polymer", 11)); + BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns)); + columns.pop_back(); + columns.push_back(StringRef("water", 5)); + BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntity(columns)); + BOOST_MESSAGE(" done."); + + // negative + BOOST_MESSAGE(" unknown type..."); + columns.pop_back(); + columns.push_back(StringRef("foo", 3)); + BOOST_CHECK_THROW(tmmcif_p.ParseEntity(columns), IOException); + BOOST_MESSAGE(" done."); + + BOOST_MESSAGE(" done."); +} + +BOOST_AUTO_TEST_CASE(mmcif_chaintype_setting) +{ + BOOST_MESSAGE(" Running mmcif_chaintype_setting tests..."); + mol::ChainHandle ch; + IOProfile profile; + + // positive + BOOST_MESSAGE(" check correct settings..."); + { + mol::EntityHandle eh = mol::CreateEntity(); + MMCifParser mmcif_p("testfiles/mmcif/atom_site.mmcif", eh, profile); + mmcif_p.Parse(); + ch = eh.FindChain("A"); + BOOST_CHECK(ch.IsValid()); + BOOST_CHECK(ch.GetChainType() == CHAINTYPE_POLY); + ch = eh.FindChain("C"); + BOOST_CHECK(ch.IsValid()); + BOOST_CHECK(ch.GetChainType() == CHAINTYPE_POLY); + ch = eh.FindChain("O"); + BOOST_CHECK(ch.IsValid()); + BOOST_CHECK(ch.GetChainType() == CHAINTYPE_WATER); + } + BOOST_MESSAGE(" done."); + // negative: no entity description + BOOST_MESSAGE(" check missing entity description..."); + { + mol::EntityHandle eh = mol::CreateEntity(); + MMCifParser mmcif_p("testfiles/mmcif/model_truepos.mmcif", + eh, + profile); + mmcif_p.Parse(); + ch = eh.FindChain("A"); + BOOST_CHECK(ch.IsValid()); + BOOST_CHECK(ch.GetChainType() == CHAINTYPE_UNKNOWN); + ch = eh.FindChain("B"); + BOOST_CHECK(ch.IsValid()); + BOOST_CHECK(ch.GetChainType() == CHAINTYPE_UNKNOWN); + } + BOOST_MESSAGE(" done."); + + BOOST_MESSAGE(" done."); +} + +BOOST_AUTO_TEST_CASE(mmcif_parseatomident) +{ BOOST_MESSAGE(" Running mmcif_parseatomident tests..."); + + mol::EntityHandle eh = mol::CreateEntity(); + std::ifstream s("testfiles/mmcif/atom_site.mmcif"); IOProfile profile; 
TestMMCifParserProtected tmmcif_p(s, eh, profile); - //std::vector<StringRef> cols; - //String chain_name; - //StringRef res_name; + std::vector<StringRef> columns; + String chain_name; + StringRef res_name; //mol::ResNum resnum(0); //StringRef atom_name; //char alt_loc; BOOST_MESSAGE(" testing valid line"); - //tmmcif_p.ParseAtomIdent(); + //tmmcif_p.ParseAtomIdent(columns, chain_name, res_name); BOOST_MESSAGE(" done."); // negative //cols.push_back(StringRef("ATOM", 4)); diff --git a/modules/io/tests/testfiles/mmcif/atom_site.mmcif b/modules/io/tests/testfiles/mmcif/atom_site.mmcif index f2568fe3b..c1a43746b 100644 --- a/modules/io/tests/testfiles/mmcif/atom_site.mmcif +++ b/modules/io/tests/testfiles/mmcif/atom_site.mmcif @@ -3,6 +3,20 @@ data_1BAR # this file is also used in the mmcif_parse_models tests for a true negative # test, hence it is not allowed to carry atom_site.pdbx_PDB_model_num entries +# this file is also used in the mmcif_chaintype_setting test for a true +# positive test, hence the entity category is not to be changed + +loop_ +_entity.id +_entity.type +_entity.details +1 polymer +; The enzymatically competent form of HIV + protease is a dimer. This entity + corresponds to one monomer of an active dimer. +; +5 water . + loop_ _atom_site.group_PDB _atom_site.type_symbol diff --git a/modules/io/tests/testfiles/mmcif/model_truepos.mmcif b/modules/io/tests/testfiles/mmcif/model_truepos.mmcif index 613854466..9f1e39f16 100644 --- a/modules/io/tests/testfiles/mmcif/model_truepos.mmcif +++ b/modules/io/tests/testfiles/mmcif/model_truepos.mmcif @@ -1,6 +1,9 @@ data_1TPM # derived from 2JSP +# this file is also used in the mmcif_chaintype_setting test for a true +# negative test, hence no entity category may be added + loop_ _atom_site.group_PDB _atom_site.id -- GitLab