diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst index 340d16c985e46752351ed7b877e10d5d041796f2..c744d54eed3679a115853d5343f291835fe95398 100644 --- a/modules/io/doc/mmcif.rst +++ b/modules/io/doc/mmcif.rst @@ -397,6 +397,14 @@ of the annotation available. Establish all bonds stored for branched entities. + .. method:: GetEntityDesc(entity_id) + + Get info of type :class:`MMCifEntityDesc` for specified *entity_id*. + The entity id for a chain can be fetched with :func:`GetMMCifEntityIdTr`. + + :param entity_id: ID of entity + :type entity_id: :class:`str` + .. class:: MMCifInfoCitation This stores citation information from an input file. @@ -1349,6 +1357,62 @@ of the annotation available. See :attr:`bond_order` +.. class:: MMCifEntityDesc + + Data collected for certain mmCIF entity + + .. attribute:: type + + The ost chain type which can be assigned to :class:`ost.mol.ChainHandle` + + :type: :class:`ost.mol.ChainType` + + .. attribute:: entity_type + + value of _entity.type token + + :class:`str` + + .. attribute:: entity_poly_type + + value of _entity_poly.type token - empty string if entity is not of type + "polymer" + + :class:`str` + + .. attribute:: branched_type + + value of _pdbx_entity_branch.type token - empty string if entity is not of + type "branched" + + :type: :class:`str` + + .. attribute:: details + + value of _entity.pdbx_description token + + :class:`str` + + .. attribute:: seqres + + SEQRES with gentle preprocessing - empty string if entity is not of type + "polymer". By default, the :class:`ost.io.MMCifReader` reads the value of the + _entity_poly.pdbx_seq_one_letter_code token. Copies all letters but + searches a :class:`ost.conop.CompoundLib` for compound names in brackets. + *seqres* gets an 'X' if no compound is found or the respective compound has + one letter code '?'. Uses the one letter code of the found compound + otherwise. So it's basically a canonical SEQRES with exactly one character + per residue. + + :type: :class:`str` + + .. attribute:: mon_ids + + Monomer ids of all residues in a polymer - empty if entity is not of + type "polymer". Read from _entity_poly_seq category. + + :type: :class:`ost.base.StringList` + Writing mmCIF files -------------------------------------------------------------------------------- diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc index 411750b9194a884052a542ba891550ff042c152a..200e4255696c3bb5c3084e94a3460d3651deaa15 100644 --- a/modules/io/pymod/export_mmcif_io.cc +++ b/modules/io/pymod/export_mmcif_io.cc @@ -485,6 +485,16 @@ void export_mmcif_io() .def(self_ns::str(self)) ; + class_<MMCifEntityDesc>("MMCifEntityDesc", init<>()) + .add_property("type", &MMCifEntityDesc::type) + .add_property("entity_type", &MMCifEntityDesc::entity_type) + .add_property("entity_poly_type", &MMCifEntityDesc::entity_poly_type) + .add_property("branched_type", &MMCifEntityDesc::branched_type) + .add_property("details", &MMCifEntityDesc::details) + .add_property("seqres", &MMCifEntityDesc::seqres) + .add_property("mon_ids", &MMCifEntityDesc::mon_ids) + ; + class_<MMCifInfo>("MMCifInfo", init<>()) .def("AddCitation", &MMCifInfo::AddCitation) .def("GetCitations", make_function(&MMCifInfo::GetCitations, @@ -525,6 +535,8 @@ void export_mmcif_io() .def("ConnectBranchLinks", &MMCifInfo::ConnectBranchLinks) .def("GetEntityBranchChainNames", &WrapGetNames) .def("GetEntityBranchChains", &MMCifInfo::GetEntityBranchChains) + .def("SetEntityDesc", &MMCifInfo::SetEntityDesc) + .def("GetEntityDesc", &MMCifInfo::GetEntityDesc, return_value_policy<copy_const_reference>()) .add_property("citations", make_function(&MMCifInfo::GetCitations, return_value_policy<copy_const_reference>())) .add_property("biounits", make_function(&MMCifInfo::GetBioUnits, diff --git a/modules/io/src/mol/mmcif_info.cc b/modules/io/src/mol/mmcif_info.cc index ef44d4813fef36f2c709888b2acaef4d007342ee..96f256ca91fc80811eb8380d67a611d43ce3e116 100644 --- a/modules/io/src/mol/mmcif_info.cc +++ b/modules/io/src/mol/mmcif_info.cc @@ -293,6 +293,23 @@ void MMCifInfo::ConnectBranchLinks() } } +const MMCifEntityDesc& MMCifInfo::GetEntityDesc(const String& entity_id) const { + MMCifEntityDescMap::const_iterator it = entity_desc_.find(entity_id); + if(it == entity_desc_.end()) { + throw IOException("No EntityDesc for entity id \""+entity_id+"\""); + } + return it->second; +} + +void MMCifInfo::SetEntityDesc(const String& entity_id, + const MMCifEntityDesc& entity_desc) +{ + if(entity_desc_.find(entity_id) != entity_desc_.end()) { + throw IOException("EntityDesc for entity_id \""+entity_id+"\" already set"); + } + entity_desc_[entity_id] = entity_desc; +} + std::ostream& operator<<(std::ostream& os, const MMCifInfoEntityBranchLink& eb) { os << "<MMCifInfoEntityBranchLink atom1:" << eb.GetAtom1() << " atom2:" diff --git a/modules/io/src/mol/mmcif_info.hh b/modules/io/src/mol/mmcif_info.hh index 389d7d1cd433eaf646634426c33f8bb10b5729a4..5114b4d9e1526d382e765d39b05d69171c7d64cc 100644 --- a/modules/io/src/mol/mmcif_info.hh +++ b/modules/io/src/mol/mmcif_info.hh @@ -959,6 +959,20 @@ private: }; typedef std::map<String, std::vector<MMCifInfoEntityBranchLink> > MMCifInfoEntityBranchLinkMap; + +/// \struct keeping track of entity information +typedef struct { + mol::ChainType type; ///< characterise entity + String entity_type; ///< value of _entity.type + String entity_poly_type; ///< value of _entity_poly.type + String branched_type; ///< value of _pdbx_entity_branch.type + String details; ///< description of this entity + String seqres; ///< chain of monomers + std::vector<String> mon_ids; ///< list of full monomer names +} MMCifEntityDesc; +typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap; + + /// \brief container class for additional information from MMCif files /// /// \section mmcif annotation information @@ -1201,6 +1215,11 @@ public: /// void ConnectBranchLinks(); + const MMCifEntityDesc& GetEntityDesc(const String& entity_id) const; + + void SetEntityDesc(const String& entity_id, + const MMCifEntityDesc& entity_desc); + //protected: private: @@ -1216,6 +1235,7 @@ private: std::vector<MMCifInfoBioUnit> biounits_; ///< list of biounits std::vector<MMCifInfoTransOpPtr> transops_; MMCifInfoStructRefs struct_refs_; + MMCifEntityDescMap entity_desc_; std::map<String, String> cif_2_pdb_chain_id_; std::map<String, String> pdb_2_cif_chain_id_; std::map<String, String> cif_2_entity_id_; diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index b6c2167cb5bc78b902be3bcce04a7601775cb5a3..9ad17290f263753625b45c7f26e485a03fe05b0d 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -92,6 +92,7 @@ void MMCifReader::ClearState() revision_types_.clear(); database_PDB_rev_added_ = false; entity_branch_link_map_.clear(); + entity_poly_seq_map_.clear(); } void MMCifReader::SetRestrictChains(const String& restrict_chains) @@ -389,6 +390,13 @@ bool MMCifReader::OnBeginLoop(const StarLoopDesc& header) indices_[BL_ATOM_STEREO_CONFIG_2] = header.GetIndex("atom_stereo_config_2"); indices_[BL_VALUE_ORDER] = header.GetIndex("value_order"); cat_available = true; + } else if(header.GetCategory() == "entity_poly_seq") { + category_ = ENTITY_POLY_SEQ; + // mandatory + this->TryStoreIdx(EPS_ENTITY_ID, "entity_id", header); + this->TryStoreIdx(EPS_MON_ID, "mon_id", header); + this->TryStoreIdx(EPS_NUM, "num", header); + cat_available = true; } category_counts_[category_]++; return cat_available; @@ -710,15 +718,19 @@ void MMCifReader::ParseAndAddAtom(const std::vector<StringRef>& columns) columns[indices_[GROUP_PDB]][0]=='H'); } -MMCifReader::MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator( +MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator( const String& entity_id) { MMCifEntityDescMap::iterator edm_it = entity_desc_map_.find(entity_id); // if the entity ID is not already stored, insert it with empty values if (edm_it == entity_desc_map_.end()) { MMCifEntityDesc desc = {.type=mol::CHAINTYPE_N_CHAINTYPES, + .entity_type = "", + .entity_poly_type = "", + .branched_type = "", .details="", - .seqres=""}; + .seqres="", + .mon_ids=std::vector<String>()}; edm_it = entity_desc_map_.insert(entity_desc_map_.begin(), MMCifEntityDescMap::value_type(entity_id, desc)); @@ -738,6 +750,8 @@ void MMCifReader::ParseEntity(const std::vector<StringRef>& columns) if (edm_it->second.type == mol::CHAINTYPE_N_CHAINTYPES) { edm_it->second.type = mol::ChainTypeFromString(columns[indices_[E_TYPE]]); } + // but set entity_type anyways + edm_it->second.entity_type = columns[indices_[E_TYPE]].str(); } else { // don't deal with entities without type entity_desc_map_.erase(edm_it); @@ -758,6 +772,7 @@ void MMCifReader::ParseEntityPoly(const std::vector<StringRef>& columns) // store type if (indices_[EP_TYPE] != -1) { edm_it->second.type = mol::ChainTypeFromString(columns[indices_[EP_TYPE]]); + edm_it->second.entity_poly_type = columns[indices_[EP_TYPE]].str(); } // store seqres @@ -1633,6 +1648,10 @@ void MMCifReader::OnDataRow(const StarLoopDesc& header, LOG_TRACE("processing pdbx_entity_branch_link entry"); this->ParsePdbxEntityBranchLink(columns); break; + case ENTITY_POLY_SEQ: + LOG_TRACE("processing entity_poly_seq entry"); + this->ParseEntityPolySeq(columns); + break; default: throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, "Uncatched category '"+ header.GetCategory() +"' found.", @@ -1780,6 +1799,7 @@ void MMCifReader::ParsePdbxEntityBranch(const std::vector<StringRef>& columns) // store type if (indices_[BR_ENTITY_TYPE] != -1) { edm_it->second.type = mol::ChainTypeFromString(columns[indices_[EP_TYPE]]); + edm_it->second.branched_type = columns[indices_[EP_TYPE]].str(); } } @@ -1833,6 +1853,45 @@ void MMCifReader::ParsePdbxEntityBranchLink(const std::vector<StringRef>& column blm_it->second.push_back(link_pair); } +void MMCifReader::ParseEntityPolySeq(const std::vector<StringRef>& columns) +{ + std::pair<bool, int> tmp = columns[indices_[EPS_NUM]].to_int(); + if(!tmp.first) { + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "Could not cast " + "_entity_poly_seq.num to integer: " + + columns[indices_[EPS_NUM]].str(), + this->GetCurrentLinenum())); + } + int num = tmp.second; + + if(num < 1) { + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "_entity_poly_seq.num are " + "expected to be ints >= 1. Got:" + + columns[indices_[EPS_NUM]].str(), + this->GetCurrentLinenum())); + } + + // [] inserts new value if not present in container + std::map<int, String>& entity_map = + entity_poly_seq_map_[columns[indices_[EPS_ENTITY_ID]].str()]; + + if(entity_map.find(num) != entity_map.end()) { + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "_entity_poly_seq.num must be " + "unique in same " + "_entity_poly_seq.entity_id. " + "entity_id: " + + columns[indices_[EPS_ENTITY_ID]].str() + + ", num: " + + columns[indices_[EPS_ENTITY_ID]].str(), + this->GetCurrentLinenum())); + } + + entity_map[num] = columns[indices_[EPS_MON_ID]].str(); +} + void MMCifReader::OnEndData() { mol::XCSEditor editor=ent_handle_.EditXCS(mol::BUFFERED_EDIT); @@ -1972,6 +2031,21 @@ void MMCifReader::OnEndData() } } + // conclude EntityDesc (add entity_poly_seq if present) and add to MMCifInfo + for(auto entity_it: entity_desc_map_) { + if(entity_poly_seq_map_.find(entity_it.first) != entity_poly_seq_map_.end()) { + int max_num = 1; + for(auto seqres_it: entity_poly_seq_map_[entity_it.first]) { + max_num = std::max(max_num, seqres_it.first); + } + entity_it.second.mon_ids.assign(max_num, "?"); + for(auto seqres_it: entity_poly_seq_map_[entity_it.first]) { + entity_it.second.mon_ids[seqres_it.first-1] = seqres_it.second; + } + } + info_.SetEntityDesc(entity_it.first, entity_it.second); + } + LOG_INFO("imported " << chain_count_ << " chains, " << residue_count_ << " residues, " diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh index f2f2d2dbd78069a778c6cd3ccab9d49abb7b0cb1..43f6cc220a8db607af23f8884edf20a35e407b71 100644 --- a/modules/io/src/mol/mmcif_reader.hh +++ b/modules/io/src/mol/mmcif_reader.hh @@ -338,6 +338,11 @@ protected: /// \param columns data row void ParsePdbxEntityBranchLink(const std::vector<StringRef>& columns); + /// \brief Fetch mmCIF entity_poly_seq information + /// + /// \param columns data row + void ParseEntityPolySeq(const std::vector<StringRef>& columns); + /// \struct types of secondary structure typedef enum { MMCIF_HELIX, @@ -595,6 +600,13 @@ private: BL_VALUE_ORDER /// < bond order } EntityBranchLinkItems; + /// \enum items of the entity_poly_seq category + typedef enum { + EPS_ENTITY_ID, + EPS_MON_ID, + EPS_NUM + } EntityPolySeqItems; + /// \enum categories of the mmcif format typedef enum { ATOM_SITE, @@ -620,17 +632,10 @@ private: PDBX_DATABASE_STATUS, PDBX_ENTITY_BRANCH, PDBX_ENTITY_BRANCH_LINK, + ENTITY_POLY_SEQ, DONT_KNOW } MMCifCategory; - /// \struct keeping track of entity information - typedef struct { - mol::ChainType type; ///< characterise entity - String details; ///< description of this entity - String seqres; ///< chain of monomers - } MMCifEntityDesc; - typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap; - /// \brief Get an iterator for MMCifEntityDescMap by finding an element or /// inserting a new one into the map. /// \param entity_id ID of the entity to talk to @@ -729,6 +734,8 @@ private: bool database_PDB_rev_added_; // for entity_branch connections MMCifPdbxEntityBranchLinkMap entity_branch_link_map_; + // for storing entity_poly_seq + std::map<String, std::map<int, String> > entity_poly_seq_map_; }; /// \brief Translate mmCIF info on bond type (e.g.