From b7f87fa4c44db5a8e19a435492313a912365f584 Mon Sep 17 00:00:00 2001 From: Marco Biasini <marco.biasini@unibas.ch> Date: Tue, 8 May 2012 14:32:56 +0200 Subject: [PATCH] added support for struct_ref, and friends The mmCIF reader now also parses struct_ref_seq and struct_ref_seq_dif. Includes unit tests and docs --- modules/io/doc/mmcif.rst | 99 ++++++++++++- modules/io/pymod/export_mmcif_io.cc | 43 +++++- modules/io/src/mol/mmcif_info.cc | 36 +++++ modules/io/src/mol/mmcif_info.hh | 92 +++++++++++- modules/io/src/mol/mmcif_reader.cc | 136 +++++++++++++++++- modules/io/src/mol/mmcif_reader.hh | 41 +++++- modules/io/tests/test_mmcif_reader.cc | 34 +++++ .../io/tests/testfiles/mmcif/struct_ref.cif | 47 ++++++ 8 files changed, 522 insertions(+), 6 deletions(-) create mode 100644 modules/io/tests/testfiles/mmcif/struct_ref.cif diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst index 078a66667..7e1c330a4 100644 --- a/modules/io/doc/mmcif.rst +++ b/modules/io/doc/mmcif.rst @@ -38,6 +38,9 @@ The following categories of a mmCIF file are considered by the reader: the :class:`entity <ost.mol.EntityHandle>` * ``pdbx_database_PDB_obs_spr``: Verbose information on obsoleted/ superseded entries, stored in :class:`MMCifInfoObsolete`. +* ``struct_ref`` stored in :class:`MMCifInfoStructRef` +* ``struct_ref_seq`` stored in :class:`MMCifInfoStructRefSeq` +* ``struct_ref_seq_dif`` stored in :class:`MMCifInfoStructRefSeqDif` Info Classes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -93,6 +96,9 @@ of the annotation available. Also available as :meth:`GetStructDetails`. May also be modified by :meth:`SetStructDetails`. + .. attribute:: struct_refs + + Lists all links to external databases in the mmCIF file. .. method:: AddCitation(citation) Add a citation to the citation list of an info object. @@ -452,7 +458,7 @@ of the annotation available. See :attr:`operations` - .. method:: PDBize(asu, seqres=None, min_polymer_size=10) +..
function:: PDBize(asu, seqres=None, min_polymer_size=10) Returns the biological assembly (bio unit) for an entity. The new entity created is well suited to be saved as a PDB file. Therefore the function @@ -676,6 +682,95 @@ of the annotation available. See :attr:`replace_pdb_id` -.. LocalWords: autofunction ChainTypes exptl attr pdbx oper conf spr biounits +.. class:: MMCifInfoStructRef + + Holds the information of the struct_ref category. The category describes the + link of polymers in the mmCIF file to sequences stored in external databases + such as uniprot. The related categories ``struct_ref_seq`` and + ``struct_ref_seq_dif`` also list differences between the sequences of the + deposited structure and the sequences in the database. Prominent examples of + such differences include point mutations and/or expression tags. + + .. attribute:: db_name + + + Name of the external database, for example UNP for uniprot. + + :type: :class:`str` + + + .. attribute:: db_id + + Name of the reference sequence in the database pointed to by :attr:`db_name`. + + :type: :class:`str` + + .. attribute:: db_access + + Alternative accession code for the sequence in the database pointed to by + :attr:`db_name`. + + :type: :class:`str` + + .. method:: GetAlignedSeq(align_id) + + Returns the aligned sequence for the given align_id, None if the sequence does + not exist. + + .. attribute:: aligned_seqs + + List of aligned sequences (all entries of the struct_ref_seq category + mapping to this struct_ref). + +.. class:: MMCifInfoStructRefSeq + + An aligned range of residues between a sequence in a reference database and the + deposited sequence. + + .. attribute:: align_id + + Uniquely identifies every struct_ref_seq item in the mmCIF file. + :type: :class:`str` + + .. attribute:: seq_begin + seq_end + The starting point (1-based) and end point of the aligned range in the + deposited sequence, respectively. + + :type: :class:`int` + + ..
attribute:: db_begin + db_end + The starting point (1-based) and end point of the aligned range in the + database sequence, respectively. + + :type: :class:`int` + + .. attribute:: difs + + List of differences between the deposited sequence and the sequence in the + database. + + .. attribute:: chain_name + + Chain name of the polymer in the mmCIF file. + +.. class:: MMCifInfoStructRefSeqDif + + A particular difference between the deposited sequence and the sequence in + the database. + + .. attribute:: rnum + + The residue number (1-based) of the residue in the deposited sequence. + + :type: :class:`int` + + .. attribute:: details + + A textual description of the difference, e.g. point mutation, + expression tag, purification artifact. + + :type: :class:`str` .. LocalWords: cas isbn pubmed asu seqres conop ConnectAll casp COMPND OBSLTE .. LocalWords: SPRSDE pdb func diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc index b502aa515..908886257 100644 --- a/modules/io/pymod/export_mmcif_io.cc +++ b/modules/io/pymod/export_mmcif_io.cc @@ -138,7 +138,37 @@ void export_mmcif_io() init<>()) .def(vector_indexing_suite<std::vector<MMCifInfoTransOpPtrList >, true >()) ; - + class_<MMCifInfoStructRef, MMCifInfoStructRefPtr>("MMCifInfoStructRef", no_init) + .add_property("db_name", make_function(&MMCifInfoStructRef::GetDBName, + return_value_policy<copy_const_reference>())) + .add_property("db_id", make_function(&MMCifInfoStructRef::GetDBID, + return_value_policy<copy_const_reference>())) + .add_property("entity_id", make_function(&MMCifInfoStructRef::GetEntityID, + return_value_policy<copy_const_reference>())) + .add_property("db_access", make_function(&MMCifInfoStructRef::GetDBAccess, + return_value_policy<copy_const_reference>())) + .def("GetAlignedSeq", &MMCifInfoStructRef::GetAlignedSeq, arg("align_id")) + .def("GetAlignedSeqs", &MMCifInfoStructRef::GetAlignedSeqs) + .add_property("aligned_seqs", &MMCifInfoStructRef::GetAlignedSeqs) + ;
+ class_<MMCifInfoStructRefSeq, MMCifInfoStructRefSeqPtr>("MMCifInfoStructRefSeq", no_init) + .add_property("align_id", make_function(&MMCifInfoStructRefSeq::GetID, + return_value_policy<copy_const_reference>())) + .add_property("chain_name", make_function(&MMCifInfoStructRefSeq::GetChainName, + return_value_policy<copy_const_reference>())) + .add_property("seq_begin", &MMCifInfoStructRefSeq::GetSeqBegin) + .add_property("seq_end", &MMCifInfoStructRefSeq::GetSeqEnd) + .add_property("db_begin", &MMCifInfoStructRefSeq::GetDBBegin) + .add_property("db_end", &MMCifInfoStructRefSeq::GetDBEnd) + .add_property("difs", make_function(&MMCifInfoStructRefSeq::GetDifs, + return_value_policy<copy_const_reference>())) + ; + class_<MMCifInfoStructRefSeqDif, + MMCifInfoStructRefSeqDifPtr>("MMCifInfoStructRefSeqDif", no_init) + .add_property("details", make_function(&MMCifInfoStructRefSeqDif::GetDetails, + return_value_policy<copy_const_reference>())) + .add_property("rnum", &MMCifInfoStructRefSeqDif::GetRNum) + ; class_<MMCifInfoBioUnit>("MMCifInfoBioUnit", init<>()) .def("SetDetails", &MMCifInfoBioUnit::SetDetails) .def("GetDetails", &MMCifInfoBioUnit::GetDetails) @@ -156,6 +186,15 @@ void export_mmcif_io() return_value_policy<copy_const_reference>())) ; + class_<MMCifInfoStructRefs>("MMCifInfoStructRefs", init<>()) + .def(vector_indexing_suite<MMCifInfoStructRefs, true>()) + ; + class_<MMCifInfoStructRefSeqs>("MMCifInfoStructRefSeqs", init<>()) + .def(vector_indexing_suite<MMCifInfoStructRefSeqs, true>()) + ; + class_<MMCifInfoStructRefSeqDifs>("MMCifInfoStructRefSeqDifs", init<>()) + .def(vector_indexing_suite<MMCifInfoStructRefSeqDifs, true>()) + ; typedef std::vector<MMCifInfoBioUnit> MMCifInfoBioUnitList; class_<std::vector<MMCifInfoBioUnit> >("MMCifInfoBioUnitList", init<>()) .def(vector_indexing_suite<std::vector<MMCifInfoBioUnit> >()) @@ -246,6 +285,8 @@ void export_mmcif_io() return_value_policy<copy_const_reference>())) .add_property("struct_details", 
&MMCifInfo::GetStructDetails, &MMCifInfo::SetStructDetails) + .add_property("struct_refs", make_function(&MMCifInfo::GetStructRefs, + return_value_policy<copy_const_reference>())) .add_property("obsolete", &MMCifInfo::GetObsoleteInfo, &MMCifInfo::SetObsoleteInfo) ; diff --git a/modules/io/src/mol/mmcif_info.cc b/modules/io/src/mol/mmcif_info.cc index 1e4c89281..d99b0bd22 100644 --- a/modules/io/src/mol/mmcif_info.cc +++ b/modules/io/src/mol/mmcif_info.cc @@ -36,4 +36,40 @@ void MMCifInfo::AddAuthorsToCitation(StringRef id, std::vector<String> list) throw IOException("No citation for identifier '" + id.str() + "' found."); } + + +MMCifInfoStructRefSeqPtr +MMCifInfoStructRef::AddAlignedSeq(const String& aid, const String& chain_name, + int seq_begin, int seq_end, int db_begin, + int db_end) +{ + std::map<String, MMCifInfoStructRefSeqPtr>::const_iterator i=seqs_.find(aid); + if (i!=seqs_.end()) { + throw IOException("duplicate align_id for struct_ref '"+id_+"'"); + } + MMCifInfoStructRefSeqPtr p(new MMCifInfoStructRefSeq(aid, chain_name, + seq_begin, seq_end, + db_begin, db_end)); + seqs_[aid]=p; + return p; +} + + +MMCifInfoStructRefSeqPtr +MMCifInfoStructRef::GetAlignedSeq(const String& aid) const +{ + + std::map<String, MMCifInfoStructRefSeqPtr>::const_iterator i=seqs_.find(aid); + return i==seqs_.end() ? 
MMCifInfoStructRefSeqPtr() : i->second; +} + +MMCifInfoStructRefSeqDifPtr +MMCifInfoStructRefSeq::AddDif(int rnum, const String& details) +{ + MMCifInfoStructRefSeqDifPtr d(new MMCifInfoStructRefSeqDif(rnum, details)); + difs_.push_back(d); + return d; +} + + }} //ns diff --git a/modules/io/src/mol/mmcif_info.hh b/modules/io/src/mol/mmcif_info.hh index 469e52188..81d5b546a 100644 --- a/modules/io/src/mol/mmcif_info.hh +++ b/modules/io/src/mol/mmcif_info.hh @@ -20,6 +20,7 @@ #define OST_MMCIF_INFO_HH #include <vector> +#include <map> #include <boost/shared_ptr.hpp> #include <ost/geom/geom.hh> #include <ost/string_ref.hh> @@ -605,6 +606,92 @@ private: String replaced_pdb_id_; ///< replaced entry }; +class MMCifInfoStructRef; +class MMCifInfoStructRefSeq; +class MMCifInfoStructRefSeqDif; + + +typedef boost::shared_ptr<MMCifInfoStructRef> MMCifInfoStructRefPtr; +typedef boost::shared_ptr<MMCifInfoStructRefSeq> MMCifInfoStructRefSeqPtr; +typedef boost::shared_ptr<MMCifInfoStructRefSeqDif> MMCifInfoStructRefSeqDifPtr; + +typedef std::vector<MMCifInfoStructRefPtr> MMCifInfoStructRefs; +typedef std::vector<MMCifInfoStructRefSeqPtr> MMCifInfoStructRefSeqs; +typedef std::vector<MMCifInfoStructRefSeqDifPtr> MMCifInfoStructRefSeqDifs; +class DLLEXPORT_OST_IO MMCifInfoStructRef { +public: + MMCifInfoStructRef(const String& id, const String& ent_id, + const String& db_name, + const String& db_ident, const String& db_access): + id_(id), ent_id_(ent_id), db_name_(db_name), db_ident_(db_ident), + db_access_(db_access) + { } + const String& GetID() const { return id_; } + const String& GetDBName() const { return db_name_; } + const String& GetDBID() const { return db_ident_; } + const String& GetEntityID() const { return ent_id_; } + const String& GetDBAccess() const { return db_access_; } + MMCifInfoStructRefSeqPtr AddAlignedSeq(const String& align_id, + const String& chain_name, int seq_begin, + int seq_end, int db_begin, int db_end); + MMCifInfoStructRefSeqPtr GetAlignedSeq(const 
String& align_id) const; + MMCifInfoStructRefSeqs GetAlignedSeqs() const + { + MMCifInfoStructRefSeqs seqs; + seqs.reserve(seqs_.size()); + for (std::map<String, MMCifInfoStructRefSeqPtr>::const_iterator + i=seqs_.begin(), e=seqs_.end(); i!=e; ++i) { + seqs.push_back(i->second); + } + return seqs; + } +private: + String id_; + String ent_id_; + String db_name_; + String db_ident_; + String db_access_; + std::map<String, MMCifInfoStructRefSeqPtr> seqs_; +}; + +class DLLEXPORT_OST_IO MMCifInfoStructRefSeq { +public: + MMCifInfoStructRefSeq(const String& align_id, const String& chain_name, + int seq_begin, int seq_end, + int db_begin, int db_end): + id_(align_id), chain_name_(chain_name), + seq_begin_(seq_begin), seq_end_(seq_end), db_begin_(db_begin), db_end_(db_end) + { } + + const String& GetID() const { return id_; } + const String& GetChainName() const { return chain_name_; } + int GetSeqBegin() const { return seq_begin_; } + int GetSeqEnd() const { return seq_end_; } + int GetDBBegin() const { return db_begin_; } + int GetDBEnd() const { return db_end_; } + MMCifInfoStructRefSeqDifPtr AddDif(int seq_num, const String& details); + const std::vector<MMCifInfoStructRefSeqDifPtr>& GetDifs() const { return difs_; } +private: + String id_; + String chain_name_; + int seq_begin_; + int seq_end_; + int db_begin_; + int db_end_; + std::vector<MMCifInfoStructRefSeqDifPtr> difs_; +}; + +class DLLEXPORT_OST_IO MMCifInfoStructRefSeqDif { +public: + MMCifInfoStructRefSeqDif(int rnum, const String& details): + rnum_(rnum), details_(details) {} + int GetRNum() const { return rnum_;} + const String& GetDetails() const { return details_; } +private: + int rnum_; + String details_; +}; + /// \brief container class for additional information from MMCif files /// /// \section mmcif annotation information @@ -731,7 +818,8 @@ public: { return obsolete_; } - + const MMCifInfoStructRefs& GetStructRefs() const { return struct_refs_; } + void SetStructRefs(const MMCifInfoStructRefs& sr) { 
struct_refs_=sr; } //protected: private: @@ -743,8 +831,10 @@ private: std::vector<MMCifInfoCitation> citations_; ///< list of citations std::vector<MMCifInfoBioUnit> biounits_; ///< list of biounits std::vector<MMCifInfoTransOpPtr> transops_; + MMCifInfoStructRefs struct_refs_; }; + }} // ns #endif diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index 09fa8b6ef..f2b66ee35 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -300,7 +300,31 @@ bool MMCifReader::OnBeginLoop(const StarLoopDesc& header) this->TryStoreIdx(PDB_ID, "pdb_id", header); this->TryStoreIdx(REPLACE_PDB_ID, "replace_pdb_id", header); cat_available = true; - } + } else if (header.GetCategory() == "struct_ref") { + category_ = STRUCT_REF; + this->TryStoreIdx(SR_ENTITY_ID, "entity_id", header); + this->TryStoreIdx(SR_ID, "id", header); + this->TryStoreIdx(SR_DB_NAME, "db_name", header); + this->TryStoreIdx(SR_DB_CODE, "db_code", header); + indices_[SR_DB_ACCESS]=header.GetIndex("pdbx_db_accession"); + cat_available = true; + } else if (header.GetCategory() == "struct_ref_seq") { + category_ = STRUCT_REF_SEQ; + this->TryStoreIdx(SRS_ALIGN_ID, "align_id", header); + this->TryStoreIdx(SRS_STRUCT_REF_ID, "ref_id", header); + this->TryStoreIdx(SRS_ENT_ALIGN_BEG, "seq_align_beg", header); + this->TryStoreIdx(SRS_ENT_ALIGN_END, "seq_align_end", header); + this->TryStoreIdx(SRS_DB_ALIGN_BEG, "db_align_beg", header); + this->TryStoreIdx(SRS_DB_ALIGN_END, "db_align_end", header); + indices_[SRS_PDBX_STRAND_ID]=header.GetIndex("pdbx_strand_id"); + cat_available = true; + } else if (header.GetCategory()=="struct_ref_seq_dif") { + category_ = STRUCT_REF_SEQ_DIF; + this->TryStoreIdx(SRSD_ALIGN_ID, "align_id", header); + this->TryStoreIdx(SRSD_RNUM, "seq_num", header); + indices_[SRSD_DETAILS]=header.GetIndex("details"); + cat_available = true; + } category_counts_[category_]++; return cat_available; } @@ -1328,6 +1352,18 @@ void 
MMCifReader::OnDataRow(const StarLoopDesc& header, LOG_TRACE("processing pdbx_database_PDB_obs_spr entry") this->ParsePdbxDatabasePdbObsSpr(columns); break; + case STRUCT_REF: + LOG_TRACE("processing struct_ref entry"); + this->ParseStructRef(columns); + break; + case STRUCT_REF_SEQ: + LOG_TRACE("processing struct_ref_seq entry"); + this->ParseStructRefSeq(columns); + break; + case STRUCT_REF_SEQ_DIF: + LOG_TRACE("processing struct_ref_seq_dif entry"); + this->ParseStructRefSeqDif(columns); + break; default: throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, "Uncatched category '"+ header.GetCategory() +"' found.", @@ -1379,6 +1415,103 @@ void MMCifReader::AssignSecStructure(mol::EntityHandle ent) } } + +void MMCifReader::ParseStructRef(const std::vector<StringRef>& columns) +{ + String ent_id=columns[indices_[SR_ENTITY_ID]].str(); + String db_name=columns[indices_[SR_DB_NAME]].str(); + String db_code=columns[indices_[SR_DB_CODE]].str(); + String id=columns[indices_[SR_ID]].str(); + String db_access; + if (indices_[SR_DB_ACCESS]!=-1) { + db_access=columns[indices_[SR_DB_ACCESS]].str(); + } + MMCifInfoStructRefPtr sr(new MMCifInfoStructRef(id, ent_id, db_name, + db_code, db_access)); + struct_refs_.push_back(sr); +} + +void MMCifReader::ParseStructRefSeq(const std::vector<StringRef>& columns) +{ + String aln_id=columns[indices_[SRS_ALIGN_ID]].str(); + String sr_id=columns[indices_[SRS_STRUCT_REF_ID]].str(); + String chain_name; + if (indices_[SRS_PDBX_STRAND_ID]!=-1) { + chain_name=columns[indices_[SRS_PDBX_STRAND_ID]].str(); + } + std::pair<bool,int> dbbeg=this->TryGetInt(columns[indices_[SRS_DB_ALIGN_BEG]], + "_struct_ref_seq.db_align_beg", + profile_.fault_tolerant); + std::pair<bool,int> dbend=this->TryGetInt(columns[indices_[SRS_DB_ALIGN_END]], + "_struct_ref_seq.db_align_end", + profile_.fault_tolerant); + std::pair<bool,int> entbeg=this->TryGetInt(columns[indices_[SRS_ENT_ALIGN_BEG]], + "_struct_ref_seq.seq_align_beg", + profile_.fault_tolerant); +
std::pair<bool,int> entend=this->TryGetInt(columns[indices_[SRS_ENT_ALIGN_END]], + "_struct_ref_seq.seq_align_end", + profile_.fault_tolerant); + if (!(dbbeg.first && dbend.first && entbeg.first && entend.first)) { + return; + } + bool found=false; + for (MMCifInfoStructRefs::iterator i=struct_refs_.begin(), + e=struct_refs_.end(); i!=e; ++i) { + if ((*i)->GetID()==sr_id) { + (*i)->AddAlignedSeq(aln_id, chain_name, entbeg.second, entend.second, + dbbeg.second, dbend.second); + found=true; + break; + } + } + if (!found) { + if (profile_.fault_tolerant) { + LOG_ERROR("struct_ref_seq.ref_id points to inexistent struct_ref '" + << sr_id << "'"); + return; + } + std::stringstream ss; + ss << "struct_ref_seq.ref_id points to inexistent struct_ref '"; + ss << sr_id << "'"; + throw IOException(ss.str()); + } +} + +void MMCifReader::ParseStructRefSeqDif(const std::vector<StringRef>& columns) +{ + String aln_id=columns[indices_[SRSD_ALIGN_ID]].str(); + std::pair<bool,int> rnum=this->TryGetInt(columns[indices_[SRSD_RNUM]], + "_struct_ref_seq_dif.seq_num", + profile_.fault_tolerant); + if (!rnum.first) { + return; + } + String details; + if (indices_[SRSD_DETAILS]!=-1) { + details=columns[indices_[SRSD_DETAILS]].str(); + } + bool found=false; + for (MMCifInfoStructRefs::iterator i=struct_refs_.begin(), + e=struct_refs_.end(); i!=e; ++i) { + if (MMCifInfoStructRefSeqPtr s=(*i)->GetAlignedSeq(aln_id)) { + s->AddDif(rnum.second, details); + found=true; + break; + } + } + if (!found) { + if (profile_.fault_tolerant) { + LOG_ERROR("struct_ref_seq_dif.align_id points to inexistent " + "struct_ref_seq '" << aln_id << "'"); + return; + } + std::stringstream ss; + ss << "struct_ref_seq_dif.align_id points to inexistent struct_ref_seq '"; + ss << aln_id << "'"; + throw IOException(ss.str()); + } +} + void MMCifReader::OnEndData() { mol::XCSEditor editor=ent_handle_.EditXCS(mol::BUFFERED_EDIT); @@ -1427,6 +1560,7 @@ void MMCifReader::OnEndData() std::vector<MMCifInfoTransOpPtr> operation_list;
std::map<String, String>::const_iterator buom_it; std::vector<MMCifInfoTransOpPtr> operations = info_.GetOperations(); + info_.SetStructRefs(struct_refs_); std::vector<MMCifInfoTransOpPtr>::const_iterator buop_it; for (bua_it = bu_assemblies_.begin(); bua_it != bu_assemblies_.end(); diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh index af12c813a..2a8f549ec 100644 --- a/modules/io/src/mol/mmcif_reader.hh +++ b/modules/io/src/mol/mmcif_reader.hh @@ -229,6 +229,7 @@ protected: /// \param columns data row void ParseCitation(const std::vector<StringRef>& columns); + const MMCifInfoStructRefs& GetStructRefs() const { return struct_refs_; } /// \brief convert the seqres data item to canonical form. /// /// The seqres sequence lists non-standard residues in paranthesis. For @@ -242,7 +243,15 @@ protected: /// /// \param columns data row void ParseCitationAuthor(const std::vector<StringRef>& columns); - + + /// \brief parse a row in the struct_ref category + void ParseStructRef(const std::vector<StringRef>& columns); + + /// \brief parse row in the struct_ref_seq category + void ParseStructRefSeq(const std::vector<StringRef>& columns); + + /// \brief parse row in the struct_ref_seq_dif category + void ParseStructRefSeqDif(const std::vector<StringRef>& columns); /// \brief Fetch mmCIF exptl information /// /// \param columns data row @@ -401,6 +410,32 @@ private: METHOD_DETAILS ///< details about assembly computation } PdbxStructAssemblyItems; + /// \enum items of the struct_ref category + typedef enum { + SR_ENTITY_ID, + SR_ID, + SR_DB_CODE, + SR_DB_NAME, + SR_DB_ACCESS + } StructRefItems; + + /// \enum items of the struct_ref_seq category + typedef enum { + SRS_ALIGN_ID, + SRS_STRUCT_REF_ID, + SRS_PDBX_STRAND_ID, + SRS_DB_ALIGN_BEG, + SRS_DB_ALIGN_END, + SRS_ENT_ALIGN_BEG, + SRS_ENT_ALIGN_END + }
StructRefSeqDifItems; /// \enum items of the pdbx_struct_assembly_gen category typedef enum { ASSEMBLY_ID, ///< link to pdbx_struct_assembly.id @@ -490,6 +525,9 @@ private: STRUCT_CONF, STRUCT_SHEET_RANGE, PDBX_DATABASE_PDB_OBS_SPR, + STRUCT_REF, + STRUCT_REF_SEQ, + STRUCT_REF_SEQ_DIF, DONT_KNOW } MMCifCategory; @@ -550,6 +588,7 @@ private: std::map<String, String> bu_origin_map_; ///< pdbx_struct_assembly.details MMCifHSVector helix_list_; ///< for storing struct_conf sec.struct. data MMCifHSVector strand_list_; ///< for storing struct_conf sec.struct. data + MMCifInfoStructRefs struct_refs_; }; }} diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc index e33f6ad6f..b2294ba90 100644 --- a/modules/io/tests/test_mmcif_reader.cc +++ b/modules/io/tests/test_mmcif_reader.cc @@ -614,6 +614,40 @@ BOOST_AUTO_TEST_CASE(mmcif_citation_author_tests) BOOST_MESSAGE(" done."); } +BOOST_AUTO_TEST_CASE(mmcif_struct_ref) +{ + mol::EntityHandle eh = mol::CreateEntity(); + std::ifstream s("testfiles/mmcif/struct_ref.cif"); + IOProfile profile; + MMCifReader mmcif_p(s, eh, profile); + mmcif_p.Parse(); + MMCifInfoStructRefs refs=mmcif_p.GetInfo().GetStructRefs(); + BOOST_CHECK_EQUAL(refs.size(), 1); + MMCifInfoStructRefPtr sr1=refs[0]; + BOOST_CHECK_EQUAL(sr1->GetDBName(), "UNP"); + BOOST_CHECK_EQUAL(sr1->GetDBID(), "BLA2_BACCE"); + BOOST_CHECK_EQUAL(sr1->GetDBAccess(), "P04190"); + BOOST_CHECK_EQUAL(sr1->GetID(), "1"); + MMCifInfoStructRefSeqs seqs=sr1->GetAlignedSeqs(); + BOOST_CHECK_EQUAL(seqs.size(), 2); + BOOST_CHECK_EQUAL(seqs[0]->GetID(), "1"); + BOOST_CHECK_EQUAL(seqs[0]->GetChainName(), "A"); + BOOST_CHECK_EQUAL(seqs[0]->GetSeqBegin(), 1); + BOOST_CHECK_EQUAL(seqs[0]->GetSeqEnd(), 19); + BOOST_CHECK_EQUAL(seqs[0]->GetDBBegin(), 31); + BOOST_CHECK_EQUAL(seqs[0]->GetDBEnd(), 49); + BOOST_CHECK_EQUAL(seqs[1]->GetID(), "13"); + BOOST_CHECK_EQUAL(seqs[1]->GetChainName(), "B"); + BOOST_CHECK_EQUAL(seqs[1]->GetSeqBegin(), 1); + 
BOOST_CHECK_EQUAL(seqs[1]->GetSeqEnd(), 19); + BOOST_CHECK_EQUAL(seqs[1]->GetDBBegin(), 31); + BOOST_CHECK_EQUAL(seqs[1]->GetDBEnd(), 49); + MMCifInfoStructRefSeqDifs diffs=seqs[0]->GetDifs(); + BOOST_CHECK_EQUAL(diffs.size(), 1); + BOOST_CHECK_EQUAL(diffs[0]->GetRNum(), 91); + BOOST_CHECK_EQUAL(diffs[0]->GetDetails(), "ENGINEERED MUTATION"); +} + BOOST_AUTO_TEST_CASE(mmcif_refine_tests) { BOOST_MESSAGE(" Running mmcif_refine_tests..."); diff --git a/modules/io/tests/testfiles/mmcif/struct_ref.cif b/modules/io/tests/testfiles/mmcif/struct_ref.cif new file mode 100644 index 000000000..1d87ec60b --- /dev/null +++ b/modules/io/tests/testfiles/mmcif/struct_ref.cif @@ -0,0 +1,47 @@ +data_2bfl +# taken from 2bfl.cif +_struct_ref.id 1 +_struct_ref.db_name UNP +_struct_ref.db_code BLA2_BACCE +_struct_ref.entity_id 1 +_struct_ref.pdbx_seq_one_letter_code ? +_struct_ref.pdbx_align_begin ? +_struct_ref.biol_id . +_struct_ref.pdbx_db_accession P04190 +# +loop_ +_struct_ref_seq.align_id +_struct_ref_seq.ref_id +_struct_ref_seq.pdbx_PDB_id_code +_struct_ref_seq.pdbx_strand_id +_struct_ref_seq.seq_align_beg +_struct_ref_seq.pdbx_seq_align_beg_ins_code +_struct_ref_seq.seq_align_end +_struct_ref_seq.pdbx_seq_align_end_ins_code +_struct_ref_seq.pdbx_db_accession +_struct_ref_seq.db_align_beg +_struct_ref_seq.pdbx_db_align_beg_ins_code +_struct_ref_seq.db_align_end +_struct_ref_seq.pdbx_db_align_end_ins_code +_struct_ref_seq.pdbx_auth_seq_align_beg +_struct_ref_seq.pdbx_auth_seq_align_end +1 1 2BFL A 1 ? 19 ? P04190 31 ? 49 ? 27 45 +13 1 2BFL B 1 ? 19 ? P04190 31 ? 49 ? 
27 45 +# +loop_ +_struct_ref_seq_dif.align_id +_struct_ref_seq_dif.pdbx_pdb_id_code +_struct_ref_seq_dif.mon_id +_struct_ref_seq_dif.pdbx_pdb_strand_id +_struct_ref_seq_dif.seq_num +_struct_ref_seq_dif.pdbx_pdb_ins_code +_struct_ref_seq_dif.pdbx_seq_db_name +_struct_ref_seq_dif.pdbx_seq_db_accession_code +_struct_ref_seq_dif.db_mon_id +_struct_ref_seq_dif.pdbx_seq_db_seq_num +_struct_ref_seq_dif.details +_struct_ref_seq_dif.pdbx_auth_seq_num +_struct_ref_seq_dif.pdbx_ordinal +1 2BFL CYS A 91 ? UNP P04190 ARG 121 'ENGINEERED MUTATION' 121 1 +13 2BFL CYS B 91 ? UNP P04190 ARG 121 'ENGINEERED MUTATION' 121 2 + -- GitLab