From 391def0820474a881746772673057b1c39864a86 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Thu, 25 Jan 2024 15:29:05 +0100 Subject: [PATCH] mmcif reader: enable heterogeneities in entity_poly_seq parsing --- modules/io/doc/mmcif.rst | 32 ++++++++++++++++++--- modules/io/pymod/export_mmcif_io.cc | 2 ++ modules/io/src/mol/mmcif_info.hh | 16 ++++++----- modules/io/src/mol/mmcif_reader.cc | 44 ++++++++++++++++++++++------- modules/io/src/mol/mmcif_reader.hh | 4 ++- 5 files changed, 76 insertions(+), 22 deletions(-) diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst index 285e51cc8..8bb3b6fc6 100644 --- a/modules/io/doc/mmcif.rst +++ b/modules/io/doc/mmcif.rst @@ -1421,10 +1421,32 @@ of the annotation available. .. attribute:: mon_ids Monomer ids of all residues in a polymer - empty if entity is not of - type "polymer". Read from ``_entity_poly_seq`` category. + type "polymer". Read from ``_entity_poly_seq`` category. If a residue is + heterogeneous, this list contains the monomer id that comes first in + the CIF file. The other variants end up in + :attr:`hetero_num` / :attr:`hetero_ids`. :type: :class:`ost.base.StringList` + .. attribute:: hetero_num + + Residue numbers of heterogeneous compounds - empty if entity is not + of type "polymer". Read from _entity_poly_seq category. If a residue is + heterogeneous, the monomer id that comes first in the CIF file ends up + in :attr:`mon_ids`. The remnant is listed here. + This list specifies the residue numbers for the respective monomer ids + in :attr:`hetero_ids`. + + .. attribute:: hetero_ids + + Monomer ids of heterogeneous compounds - empty if entity is not + of type "polymer". Read from _entity_poly_seq category. If a residue is + heterogeneous, the monomer id that comes first in the CIF file ends up + in :attr:`mon_ids`. The remnant is listed here. + This list specifies the monomer ids for the respective locations in + :attr:`hetero_num`. + + Writing mmCIF files -------------------------------------------------------------------------------- @@ -1805,15 +1827,17 @@ SEQRES is set to what we observe in the chain residues given their residue numbers (i.e. the ATOMSEQ). If the first residue has residue number 10, the SEQRES gets prefixed by 9 elements using a default value (e.g. UNK for a chain of type CHAINTYPE_POLY_PEPTIDE_D). The same is done for gaps. -Matching requires an exact match for ALL residues given their residue number -with that SEQRES. However, there might be the case that one chain resolves +A chain is considered matching an mmCIF entity, if all of its residue names +are an exact match at the respective location in the SEQRES. Location is +determined with residue numbers which follow a 1-based indexing scheme. +However, there might be the case that one chain resolves more residues than another. So you may have residues at locations that are undefined in the current SEQRES. If the fraction of matches with undefined locations does not exceed 5%, we still assume an overall match and fill in the previsouly undefined locations in the SEQRES with the newly gained information. This is a heuristic that works in most cases but potentially introduces errors in entity assignment. If you want to avoid that, you -must set your entities manually and pass a list of :class:`MMCifWriterEntity` +must set your entities manually and pass a :class:`MMCifWriterEntityList` when calling :func:`MMCifWriter.SetStructure`. if *mmcif_conform* is enabled, there is pretty much everything in place diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc index bd892405f..a6ff22c0d 100644 --- a/modules/io/pymod/export_mmcif_io.cc +++ b/modules/io/pymod/export_mmcif_io.cc @@ -494,6 +494,8 @@ void export_mmcif_io() .add_property("details", &MMCifEntityDesc::details) .add_property("seqres", &MMCifEntityDesc::seqres) .add_property("mon_ids", &MMCifEntityDesc::mon_ids) + .add_property("hetero_num", &MMCifEntityDesc::hetero_num) + .add_property("hetero_ids", &MMCifEntityDesc::hetero_ids) ; class_<MMCifInfo>("MMCifInfo", init<>()) diff --git a/modules/io/src/mol/mmcif_info.hh b/modules/io/src/mol/mmcif_info.hh index 054ecb3a6..d4ca56194 100644 --- a/modules/io/src/mol/mmcif_info.hh +++ b/modules/io/src/mol/mmcif_info.hh @@ -962,13 +962,15 @@ typedef std::map<String, std::vector<MMCifInfoEntityBranchLink> > MMCifInfoEntit /// \struct keeping track of entity information typedef struct { - mol::ChainType type; ///< characterise entity - String entity_type; ///< value of _entity.type - String entity_poly_type; ///< value of _entity_poly.type - String branched_type; ///< value of _pdbx_entity_branch.type - String details; ///< description of this entity - String seqres; ///< chain of monomers - std::vector<String> mon_ids; ///< list of full monomer names + mol::ChainType type; ///< characterise entity + String entity_type; ///< value of _entity.type + String entity_poly_type; ///< value of _entity_poly.type + String branched_type; ///< value of _pdbx_entity_branch.type + String details; ///< description of this entity + String seqres; ///< chain of monomers + std::vector<String> mon_ids; ///< list of monomer names from _entity_poly_seq + std::vector<int> hetero_num; ///< res num of heterogeneous compounds + std::vector<String> hetero_ids;///< names of heterogeneous compounds } MMCifEntityDesc; typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap; diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index 90970daa3..6f5553150 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -396,6 +396,9 @@ bool MMCifReader::OnBeginLoop(const StarLoopDesc& header) this->TryStoreIdx(EPS_ENTITY_ID, "entity_id", header); this->TryStoreIdx(EPS_MON_ID, "mon_id", header); this->TryStoreIdx(EPS_NUM, "num", header); + + // optional items + indices_[EPS_HETERO] = header.GetIndex("hetero"); cat_available = true; } else if (header.GetCategory() == "em_3d_reconstruction") { category_ = EM_3D_RECONSTRUCTION; @@ -735,7 +738,9 @@ MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator( .branched_type = "", .details="", .seqres="", - .mon_ids=std::vector<String>()}; + .mon_ids=std::vector<String>(), + .hetero_num=std::vector<int>(), + .hetero_ids=std::vector<String>()}; edm_it = entity_desc_map_.insert(entity_desc_map_.begin(), MMCifEntityDescMap::value_type(entity_id, desc)); @@ -1895,15 +1900,28 @@ void MMCifReader::ParseEntityPolySeq(const std::vector<StringRef>& columns) entity_poly_seq_map_[columns[indices_[EPS_ENTITY_ID]].str()]; if(entity_map.find(num) != entity_map.end()) { - throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, - "_entity_poly_seq.num must be " - "unique in same " - "_entity_poly_seq.entity_id. " - "entity_id: " + - columns[indices_[EPS_ENTITY_ID]].str() + - ", num: " + - columns[indices_[EPS_ENTITY_ID]].str(), - this->GetCurrentLinenum())); + + if(indices_[EPS_HETERO] != -1 && columns[indices_[EPS_HETERO]][0] == 'y') { + // controlled vocabulary: "y", "yes", "n", "no" + // the first two mark hetereogeneous compounds + + // [] inserts new value if not present + std::vector<std::pair<int, String> >& hetero_list = + entity_poly_seq_h_map_[columns[indices_[EPS_ENTITY_ID]].str()]; + + hetero_list.push_back(std::make_pair(num, columns[indices_[EPS_MON_ID]].str())); + } else { + + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "_entity_poly_seq.num must be " + "unique in same " + "_entity_poly_seq.entity_id. " + "entity_id: " + + columns[indices_[EPS_ENTITY_ID]].str() + + ", num: " + + columns[indices_[EPS_ENTITY_ID]].str(), + this->GetCurrentLinenum())); + } } entity_map[num] = columns[indices_[EPS_MON_ID]].str(); @@ -2060,6 +2078,12 @@ void MMCifReader::OnEndData() entity_it.second.mon_ids[seqres_it.first-1] = seqres_it.second; } } + if(entity_poly_seq_h_map_.find(entity_it.first) != entity_poly_seq_h_map_.end()) { + for(auto hetero_it: entity_poly_seq_h_map_[entity_it.first]) { + entity_it.second.hetero_num.push_back(hetero_it.first); + entity_it.second.hetero_ids.push_back(hetero_it.second); + } + } info_.SetEntityDesc(entity_it.first, entity_it.second); } diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh index e0b4c4c0b..28798a49f 100644 --- a/modules/io/src/mol/mmcif_reader.hh +++ b/modules/io/src/mol/mmcif_reader.hh @@ -609,7 +609,8 @@ private: typedef enum { EPS_ENTITY_ID, EPS_MON_ID, - EPS_NUM + EPS_NUM, + EPS_HETERO } EntityPolySeqItems; /// \enum items of the entity_poly_seq category @@ -747,6 +748,7 @@ private: MMCifPdbxEntityBranchLinkMap entity_branch_link_map_; // for storing entity_poly_seq std::map<String, std::map<int, String> > entity_poly_seq_map_; + std::map<String, std::vector<std::pair<int, String> > > entity_poly_seq_h_map_; }; /// \brief Translate mmCIF info on bond type (e.g. -- GitLab