From 84b8cdfa2fbe79837a581e84c1bc1ca78ac57394 Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Wed, 3 Aug 2011 10:59:49 +0200 Subject: [PATCH] MMCifParser now reading seqres --- modules/io/pymod/export_mmcif_io.cc | 3 + modules/io/src/mol/mmcif_reader.cc | 98 ++++++++++++------- modules/io/src/mol/mmcif_reader.hh | 26 ++++- modules/io/tests/test_mmcif_reader.cc | 36 ++++++- .../io/tests/testfiles/mmcif/atom_site.mmcif | 1 + 5 files changed, 124 insertions(+), 40 deletions(-) diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc index 4e0a47240..e50e128c4 100644 --- a/modules/io/pymod/export_mmcif_io.cc +++ b/modules/io/pymod/export_mmcif_io.cc @@ -30,9 +30,12 @@ void export_mmcif_io() class_<MMCifParser, boost::noncopyable>("MMCifParser", init<const String&, EntityHandle&, const IOProfile&>()) .def("Parse", &MMCifParser::Parse) .def("SetRestrictChains", &MMCifParser::SetRestrictChains) + .def("SetReadCanonicalSeqRes", &MMCifParser::SetReadCanonicalSeqRes) + .def("GetSeqRes", &MMCifParser::GetSeqRes) .add_property("restrict_chains", make_function(&MMCifParser::GetRestrictChains, return_value_policy<copy_const_reference>()), &MMCifParser::SetRestrictChains) + .add_property("seqres", &MMCifParser::GetSeqRes) ; } diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index dd43aa093..59e40c3ae 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -57,13 +57,13 @@ void MMCifParser::Init() atom_count_ = 0; residue_count_ = 0; auth_chain_id_ = false; + seqres_can_ = false; has_model_ = false; restrict_chains_ = ""; subst_res_id_ = ""; curr_chain_ = mol::ChainHandle(); curr_residue_ = mol::ResidueHandle(); - //chain_id_pairs_ = - //entity_desc_map_ + seqres_ = seq::CreateSequenceList(); } void MMCifParser::ClearState() @@ -75,6 +75,7 @@ void MMCifParser::ClearState() atom_count_ = 0; category_ = DONT_KNOW; warned_name_mismatch_ = false; + seqres_ = seq::CreateSequenceList(); } void MMCifParser::SetRestrictChains(const String& restrict_chains) @@ -164,6 +165,10 @@ bool MMCifParser::OnBeginLoop(const StarLoopDesc& header) this->TryStoreIdx(ENTITY_ID, "entity_id", header); // optional indices_[EP_TYPE] = header.GetIndex("type"); + indices_[PDBX_SEQ_ONE_LETTER_CODE] = + header.GetIndex("pdbx_seq_one_letter_code"); + indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] = + header.GetIndex("pdbx_seq_one_letter_code_can"); return true; } /*else if (header.GetCategory()=="pdbx_poly_seq_scheme") { } else if (header.GetCategory()=="pdbx_struct_assembly") { @@ -466,6 +471,7 @@ void MMCifParser::ParseEntity(const std::vector<StringRef>& columns) } if (store) { + desc.seqres = ""; entity_desc_map_.insert( MMCifEntityDescMap::value_type(columns[indices_[E_ID]].str(), desc) @@ -480,46 +486,62 @@ void MMCifParser::ParseEntityPoly(const std::vector<StringRef>& columns) MMCifEntityDescMap::iterator edm_it = entity_desc_map_.find(columns[indices_[ENTITY_ID]].str()); - // store values in description map - if (edm_it != entity_desc_map_.end()) { - if (indices_[EP_TYPE] != -1) { - if(StringRef("polypeptide(D)", 14) == columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_D; - } else if(StringRef("polypeptide(L)", 14) == columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_L; - } else if(StringRef("polydeoxyribonucleotide", 23) == - columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_POLY_DN; - } else if(StringRef("polyribonucleotide", 18) == - columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_POLY_RN; - } else if(StringRef("polysaccharide(D)", 17) == - columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_POLY_SAC_D; - } else if(StringRef("polysaccharide(L)", 17) == - columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_POLY_SAC_L; - } else if(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid", - 49) == columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_POLY_DN_RN; - } else if(StringRef("other", - 5) == columns[indices_[EP_TYPE]]) { - edm_it->second.type = CHAINTYPE_UNKNOWN; - } else { - throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, - "Unrecognised polymner type '" + - columns[indices_[EP_TYPE]].str() + - "' found.", - this->GetCurrentLinenum())); - } - } - } else { + if (edm_it == entity_desc_map_.end()) { throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, "'entity_poly' category defined before 'entity' for id '" + columns[indices_[ENTITY_ID]].str() + "' or missing.", this->GetCurrentLinenum())); } + + // store type + if (indices_[EP_TYPE] != -1) { + if(StringRef("polypeptide(D)", 14) == columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_D; + } else if(StringRef("polypeptide(L)", 14) == columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_POLY_PEPTIDE_L; + } else if(StringRef("polydeoxyribonucleotide", 23) == + columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_POLY_DN; + } else if(StringRef("polyribonucleotide", 18) == + columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_POLY_RN; + } else if(StringRef("polysaccharide(D)", 17) == + columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_POLY_SAC_D; + } else if(StringRef("polysaccharide(L)", 17) == + columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_POLY_SAC_L; + } else if(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid", + 49) == columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_POLY_DN_RN; + } else if(StringRef("other", + 5) == columns[indices_[EP_TYPE]]) { + edm_it->second.type = CHAINTYPE_UNKNOWN; + } else { + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "Unrecognised polymner type '" + + columns[indices_[EP_TYPE]].str() + + "' found.", + this->GetCurrentLinenum())); + } + } + + // store seqres + if (edm_it->second.seqres.length() > 0) { + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "entity_poly.pdbx_seq_one_letter_code[_can] clash: sequence for entry '" + + columns[indices_[ENTITY_ID]].str() + + "' is already set to '" + + edm_it->second.seqres + "'.", + this->GetCurrentLinenum())); + } + if ((seqres_can_) && (indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] != -1)) { + edm_it->second.seqres = + columns[indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN]].str(); + } else if (indices_[PDBX_SEQ_ONE_LETTER_CODE] != -1) { + edm_it->second.seqres = columns[indices_[PDBX_SEQ_ONE_LETTER_CODE]].str(); + } } void MMCifParser::OnDataRow(const StarLoopDesc& header, @@ -696,6 +718,10 @@ void MMCifParser::OnEndData() if (edm_it != entity_desc_map_.end()) { editor.SetChainType(css->first, edm_it->second.type); editor.SetChainDescription(css->first, edm_it->second.details); + if (edm_it->second.seqres.length() > 0) { + seqres_.AddSequence(seq::CreateSequence(css->first.GetName(), + edm_it->second.seqres)); + } } else { LOG_WARNING("No entity description found for atom_site.label_entity_id '" << css->second << "'"); diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh index b63e5c51d..b12d8fae6 100644 --- a/modules/io/src/mol/mmcif_reader.hh +++ b/modules/io/src/mol/mmcif_reader.hh @@ -24,6 +24,7 @@ //#include <boost/iostreams/filtering_stream.hpp> //#include <boost/filesystem/fstream.hpp> +#include <ost/seq/sequence_list.hh> #include <ost/mol/residue_handle.hh> #include <ost/mol/chain_type.hh> #include <ost/io/mol/io_profile.hh> @@ -68,6 +69,15 @@ public: /// \param restrict_chains chain name void SetRestrictChains(const String& restrict_chains); + /// \brief Enable reading of canonical sequence residues + /// (entity_poly.pdbx_seq_one_letter_code_can instead of + /// entity_poly.pdbx_seq_one_letter_code). This flag is exclusive. + /// + void SetReadCanonicalSeqRes() + { + seqres_can_ = true; + } + const String& GetRestrictChains() const { return restrict_chains_; @@ -98,6 +108,13 @@ public: /// \brief Finalise parsing. virtual void OnEndData(); + /// \brief Return sequences + /// + /// \return List of sequences + seq::SequenceList GetSeqRes() const { + return seqres_; + } + protected: /// \brief Store an item index from loop header in preparation for reading a /// row. Throws an exception if the item does not exist. @@ -198,8 +215,10 @@ private: /// \enum items of the entity_poly category typedef enum { - ENTITY_ID, ///< pointer to entity.id - EP_TYPE ///< type of polymer + ENTITY_ID, ///< pointer to entity.id + EP_TYPE, ///< type of polymer + PDBX_SEQ_ONE_LETTER_CODE, ///< sequence, 1-letter code + PDBX_SEQ_ONE_LETTER_CODE_CAN ///< canonical sequence, 1-letter code } EntityPolyItems; /// \enum categories of the mmcif format @@ -214,6 +233,7 @@ private: typedef struct { ChainType type; ///< characterise entity String details; ///< description of this entity + String seqres; ///< chain of monomers } MMCifEntityDesc; typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap; @@ -226,6 +246,7 @@ private: mol::EntityHandle& ent_handle_; String restrict_chains_; bool auth_chain_id_; ///< use chain IDs given by authors rather than pdb + bool seqres_can_; ///< read canonical 1-letter residues? mol::ChainHandle curr_chain_; mol::ResidueHandle curr_residue_; int chain_count_; @@ -238,6 +259,7 @@ private: std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_; ///< chain and label_entity_id MMCifEntityDescMap entity_desc_map_; ///< stores entity items + seq::SequenceList seqres_; }; }} diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc index 7134e2f9e..f4f50b33f 100644 --- a/modules/io/tests/test_mmcif_reader.cc +++ b/modules/io/tests/test_mmcif_reader.cc @@ -322,13 +322,15 @@ BOOST_AUTO_TEST_CASE(mmcif_entity_poly_tests) mol::ChainHandle ch; IOProfile profile; StarLoopDesc tmmcif_h; - // positive - // negative: unknown polymer type + mol::EntityHandle eh = mol::CreateEntity(); MMCifParser mmcif_p("testfiles/mmcif/atom_site.mmcif", eh, profile); mmcif_p.Parse(); + seq::SequenceList seqres = mmcif_p.GetSeqRes(); + seq::SequenceHandle curr = seqres.FindSequence("A"); + BOOST_CHECK(curr.GetString() == "VTI"); BOOST_MESSAGE(" testing missing corresponding entity entry..."); { @@ -400,6 +402,36 @@ columns.push_back(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid", columns.pop_back(); } BOOST_MESSAGE(" done."); + BOOST_MESSAGE(" testing pdbx_seq_one_letter_code reading..."); + { + TestMMCifParserProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh); + std::vector<StringRef> columns; + + tmmcif_h.Clear(); + tmmcif_h.SetCategory(StringRef("entity", 6)); + tmmcif_h.Add(StringRef("id", 2)); + tmmcif_h.Add(StringRef("type", 4)); + tmmcif_p.OnBeginLoop(tmmcif_h); + columns.push_back(StringRef("1", 1)); + columns.push_back(StringRef("polymer", 7)); + tmmcif_p.ParseEntity(columns); + columns.pop_back(); + columns.pop_back(); + + tmmcif_h.Clear(); + tmmcif_h.SetCategory(StringRef("entity_poly", 11)); + tmmcif_h.Add(StringRef("entity_id", 9)); + tmmcif_h.Add(StringRef("type", 4)); + tmmcif_h.Add(StringRef("pdbx_seq_one_letter_code", 24)); + tmmcif_p.OnBeginLoop(tmmcif_h); + + columns.push_back(StringRef("1", 1)); + columns.push_back(StringRef("other", 5)); + columns.push_back(StringRef("ABRND", 5)); + BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntityPoly(columns)); + BOOST_CHECK_THROW(tmmcif_p.ParseEntityPoly(columns), IOException); + } + BOOST_MESSAGE(" done."); BOOST_MESSAGE(" done."); } diff --git a/modules/io/tests/testfiles/mmcif/atom_site.mmcif b/modules/io/tests/testfiles/mmcif/atom_site.mmcif index 5987ec461..2dd1c2c52 100644 --- a/modules/io/tests/testfiles/mmcif/atom_site.mmcif +++ b/modules/io/tests/testfiles/mmcif/atom_site.mmcif @@ -19,6 +19,7 @@ _entity_poly.entity_id 1 _entity_poly.type 'polypeptide(L)' _entity_poly.nstd_linkage no _entity_poly.nstd_monomer no +_entity_poly.pdbx_seq_one_letter_code 'VTI' loop_ _atom_site.group_PDB -- GitLab