diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst index 3a725c823ceabe4036465439f8e2473e90108a92..03305a8813a58accf29e8a19a63272562fe1aeb4 100644 --- a/modules/io/doc/mmcif.rst +++ b/modules/io/doc/mmcif.rst @@ -32,6 +32,8 @@ The following categories of a mmCIF file are considered by the parser: * ``pdbx_struct_oper_list``: Used for :class:`MMCifInfoBioUnit`. * ``struct``: Details about a structure, stored in :class:`MMCifInfoStructDetails`. +* ``struct_conf``: Stores secondary structure information (practically helices) + in the :class:`entity <ost.mol.EntityHandle>` Info Classes diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index 91c35928c21c39ee486e91e3b4d7818cafa2eb00..4050f96448496820b1f62b4df4e9e9896adc2ced 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -80,7 +80,8 @@ void MMCifParser::ClearState() authors_map_.clear(); bu_origin_map_.clear(); bu_assemblies_.clear(); - secstruct_list_.clear(); + helix_list_.clear(); + strand_list_.clear(); } void MMCifParser::SetRestrictChains(const String& restrict_chains) @@ -274,6 +275,7 @@ bool MMCifParser::OnBeginLoop(const StarLoopDesc& header) this->TryStoreIdx(STRUCT_CONF_ID, "id", header); // optional items indices_[BEG_AUTH_ASYM_ID] = header.GetIndex("beg_auth_asym_id"); + indices_[END_AUTH_ASYM_ID] = header.GetIndex("end_auth_asym_id"); cat_available = true; } category_counts_[category_]++; @@ -306,8 +308,8 @@ bool MMCifParser::ParseAtomIdent(const std::vector<StringRef>& columns, } else { chain_name = columns[indices_[LABEL_ASYM_ID]].str(); } - if (restrict_chains_.size() > 0 && // unit test - restrict_chains_.find(chain_name) == String::npos) { // unit test + if (restrict_chains_.size() > 0 && + restrict_chains_.find(chain_name) == String::npos) { return false; } @@ -1162,31 +1164,34 @@ void MMCifParser::ParseStructConf(const std::vector<StringRef>& columns) int e_res_num; // fetch start and end s_res_num = this->TryGetInt(columns[indices_[BEG_LABEL_SEQ_ID]], - "struct_conf.beg_label_seq_id"); // unit test + "struct_conf.beg_label_seq_id"); e_res_num = this->TryGetInt(columns[indices_[END_LABEL_SEQ_ID]], - "struct_conf.end_label_seq_id"); // unit test - if(auth_chain_id_) { // unit test both ways - if (indices_[BEG_AUTH_ASYM_ID] != -1) { // unit test + "struct_conf.end_label_seq_id"); + if(auth_chain_id_) { + if (indices_[BEG_AUTH_ASYM_ID] != -1) { chain_name = columns[indices_[BEG_AUTH_ASYM_ID]]; - } else { // unit test + } else { throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, "Chain name by author requested but 'struct_conf.beg_auth_asym_id' is not set.", this->GetCurrentLinenum())); } - } else { // unit test + } else { chain_name = columns[indices_[BEG_LABEL_ASYM_ID]]; } - MMCifHSEntry hse = {to_res_num(s_res_num, ' '), - to_res_num(e_res_num, ' '), - chain_name.str(), - columns[indices_[CONF_TYPE_ID]].str()}; - - MMCifSecStructElement type = - DetermineSecStructType(columns[indices_[CONF_TYPE_ID]]); - if (type == MMCIF_HELIX) { // unit test helix and strand reading - secstruct_list_.push_back(hse); - } else if (type == MMCIF_STRAND) { - secstruct_list_.push_back(hse); + + if (restrict_chains_.size() == 0 || + restrict_chains_.find(chain_name.str()) != String::npos) { + MMCifHSEntry hse = {to_res_num(s_res_num, ' '), + to_res_num(e_res_num, ' '), + chain_name.str()}; + + MMCifSecStructElement type = + DetermineSecStructType(columns[indices_[CONF_TYPE_ID]]); + if (type == MMCIF_HELIX) { + helix_list_.push_back(hse); + } else if (type == MMCIF_STRAND) { + strand_list_.push_back(hse); + } } } @@ -1252,14 +1257,45 @@ void MMCifParser::OnDataRow(const StarLoopDesc& header, void MMCifParser::AssignSecStructure(mol::EntityHandle ent) { - // for each helix, take chain - // check for overlaps (visual artifacts) - // assign helix to chain - - // for all strands - // take chain - // check for overlaps - // assign strand to chain + for (MMCifHSVector::const_iterator i=helix_list_.begin(), e=helix_list_.end(); + i!=e; ++i) { + mol::ChainHandle chain = ent.FindChain(i->chain_name); + if (!chain.IsValid()) { + LOG_INFO("ignoring helix record for unknown chain " + i->chain_name); + continue; + } + mol::SecStructure alpha(mol::SecStructure::ALPHA_HELIX); + // some PDB files contain helix/strand entries that are adjacent to each + // other. To avoid visual artifacts, we effectively shorten the first of + // the two secondary structure segments to insert one residue of coil + // conformation. + mol::ResNum start = i->start, end = i->end; + if (helix_list_.end() != i+1 && // unit test + (*(i+1)).start.GetNum() <= end.GetNum()+1 && + (*(i+1)).end.GetNum() > end.GetNum()) { + end = mol::ResNum((*(i+1)).start.GetNum()-2); + } + chain.AssignSecondaryStructure(alpha, start, end); + } + + for (MMCifHSVector::const_iterator i=strand_list_.begin(), + e=strand_list_.end(); + i!=e; ++i) { + mol::ChainHandle chain=ent.FindChain(i->chain_name); + if (!chain.IsValid()) { + LOG_INFO("ignoring strand record for unknown chain " + i->chain_name); + continue; + } + mol::SecStructure extended(mol::SecStructure::EXTENDED); + mol::ResNum start = i->start, end = i->end; + // see comment for helix assignment + if (strand_list_.end() != i+1 && // unit test + (*(i+1)).start.GetNum() <= end.GetNum()+1 && + (*(i+1)).end.GetNum() > end.GetNum()) { + end=mol::ResNum((*(i+1)).start.GetNum()-2); + } + chain.AssignSecondaryStructure(extended, start, end); + } } void MMCifParser::OnEndData() @@ -1354,11 +1390,14 @@ void MMCifParser::OnEndData() bu_assemblies_.clear(); // create secondary structure from struct_conf info + this->AssignSecStructure(ent_handle_); LOG_INFO("imported " << chain_count_ << " chains, " << residue_count_ << " residues, " - << atom_count_ << " atoms;"); + << atom_count_ << " atoms;" + << helix_list_.size() << " helices and " + << strand_list_.size() << " strands"); } }} diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh index 2863d4537feb79f080bc9f52f95963d4f5e22e7c..c8a8b5181cd1bbd0b519ff8ccc5a66518d3384cf 100644 --- a/modules/io/src/mol/mmcif_reader.hh +++ b/modules/io/src/mol/mmcif_reader.hh @@ -52,6 +52,7 @@ namespace ost { namespace io { /// \li pdbx_struct_assembly_gen /// \li pdbx_struct_oper_list /// \li struct +/// \li struct_conf class DLLEXPORT_OST_IO MMCifParser : public StarParser { public: /// \brief create a MMCifParser @@ -94,6 +95,15 @@ public: return restrict_chains_; } + /// \brief Enable or disable reading of auth_chain_id instead aof label_chain + /// id (default) + /// + /// \param id enable (true) or disable (false) reading of auth_chain_id. + void SetAuthChainID(bool id) + { + auth_chain_id_ = id; + } + /// \brief check mmcif input to be read. Substitutional function for /// \link StarParser StarParser\endlink. /// @@ -175,7 +185,7 @@ protected: /// \param pdbid putative PDB id /// /// \return true for a valid id, false otherwise - bool IsValidPDBIdent(const StringRef& pdbid); // tested + bool IsValidPDBIdent(const StringRef& pdbid); /// \brief fetch values identifying atoms /// @@ -423,6 +433,7 @@ private: BEG_LABEL_COMP_ID, ///< Starting residue, points to atom_site.label_comp_id BEG_LABEL_SEQ_ID, ///< Starting residue, points to atom_site.label_seq_id CONF_TYPE_ID, ///< Pointer to struct_conf_type.id + END_AUTH_ASYM_ID, ///< Ending residue, points to atom_site.auth_asym_id END_LABEL_ASYM_ID, ///< Ending residue, points to atom_site.label_asym_id END_LABEL_COMP_ID, ///< Ending residue, points to atom_site.label_comp_id END_LABEL_SEQ_ID, ///< Ending residue, points to atom_site.label_seq_id @@ -470,7 +481,6 @@ private: mol::ResNum start; mol::ResNum end; String chain_name; - String type; } MMCifHSEntry; typedef std::vector<MMCifHSEntry> MMCifHSVector; @@ -502,7 +512,8 @@ private: MMCifCitationAuthorMap authors_map_; MMCifBioUAssemblyVector bu_assemblies_; std::map<String, String> bu_origin_map_; ///< pdbx_struct_assembly.details - MMCifHSVector secstruct_list_; ///< for storing struct_conf sec.struct. data + MMCifHSVector helix_list_; ///< for storing struct_conf sec.struct. data + MMCifHSVector strand_list_; ///< for storing struct_conf sec.struct. data }; }} diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc index 997dd89ccbf2656abd210fedb56686906ded3327..667684c6e3ce223b41287ac9807b8ef0d1177aec 100644 --- a/modules/io/tests/test_mmcif_reader.cc +++ b/modules/io/tests/test_mmcif_reader.cc @@ -60,7 +60,9 @@ public: using MMCifParser::ParsePdbxStructAssembly; using MMCifParser::ParsePdbxStructOperList; using MMCifParser::ParseStruct; + using MMCifParser::ParseStructConf; using MMCifParser::TryStoreIdx; + using MMCifParser::SetRestrictChains; using MMCifParser::SetReadSeqRes; using MMCifParser::SetReadCanonicalSeqRes; using MMCifParser::ClearState; @@ -71,6 +73,7 @@ public: using MMCifParser::MMCIF_HELIX; using MMCifParser::MMCIF_TURN; using MMCifParser::MMCIF_STRAND; + using MMCifParser::SetAuthChainID; }; void SetAtomSiteHeader(StarLoopDesc* mmcif_h) @@ -936,6 +939,51 @@ BOOST_AUTO_TEST_CASE(mmcif_struct_conf_tests) type = StringRef("Foo", 3); BOOST_CHECK_THROW(tmmcif_p.DetermineSecStructType(type), IOException); + BOOST_MESSAGE(" done."); + BOOST_MESSAGE(" testing auth_chain_id switch..."); + + StarLoopDesc tmmcif_h; + std::vector<StringRef> columns; + tmmcif_h.SetCategory(StringRef("struct_conf", 11)); + tmmcif_h.Add(StringRef("beg_label_asym_id", 17)); + tmmcif_h.Add(StringRef("beg_label_comp_id", 17)); + tmmcif_h.Add(StringRef("beg_label_seq_id", 16)); + tmmcif_h.Add(StringRef("conf_type_id", 12)); + tmmcif_h.Add(StringRef("end_label_asym_id", 17)); + tmmcif_h.Add(StringRef("end_label_comp_id", 17)); + tmmcif_h.Add(StringRef("end_label_seq_id", 16)); + tmmcif_h.Add(StringRef("id", 2)); + tmmcif_h.Add(StringRef("beg_auth_asym_id", 16)); + tmmcif_p.OnBeginLoop(tmmcif_h); + columns.push_back(StringRef("A", 1)); + columns.push_back(StringRef("ARG", 3)); + columns.push_back(StringRef("1", 1)); + columns.push_back(StringRef("HELX_RH_AL_P", 12)); + columns.push_back(StringRef("A", 1)); + columns.push_back(StringRef("ARG", 3)); + columns.push_back(StringRef("2", 1)); + columns.push_back(StringRef("DHLX1", 5)); + columns.push_back(StringRef("A", 1)); + tmmcif_p.SetAuthChainID(true); + BOOST_CHECK_NO_THROW(tmmcif_p.ParseStructConf(columns)); + tmmcif_p.SetAuthChainID(false); + BOOST_CHECK_NO_THROW(tmmcif_p.ParseStructConf(columns)); + tmmcif_h.Clear(); + tmmcif_h.SetCategory(StringRef("struct_conf", 11)); + tmmcif_h.Add(StringRef("beg_label_asym_id", 17)); + tmmcif_h.Add(StringRef("beg_label_comp_id", 17)); + tmmcif_h.Add(StringRef("beg_label_seq_id", 16)); + tmmcif_h.Add(StringRef("conf_type_id", 12)); + tmmcif_h.Add(StringRef("end_label_asym_id", 17)); + tmmcif_h.Add(StringRef("end_label_comp_id", 17)); + tmmcif_h.Add(StringRef("end_label_seq_id", 16)); + tmmcif_h.Add(StringRef("id", 2)); + tmmcif_p.OnBeginLoop(tmmcif_h); + columns.pop_back(); + tmmcif_p.SetAuthChainID(true); + BOOST_CHECK_THROW(tmmcif_p.ParseStructConf(columns), IOException); + tmmcif_p.SetAuthChainID(false); + BOOST_MESSAGE(" done."); BOOST_MESSAGE(" done."); @@ -1060,6 +1108,8 @@ BOOST_AUTO_TEST_CASE(mmcif_testreader) IOProfile profile; MMCifParser mmcif_p(s, eh, profile); + mmcif_p.SetRestrictChains("A O C"); + BOOST_MESSAGE(" testing Parse()..."); BOOST_CHECK_NO_THROW(mmcif_p.Parse()); @@ -1080,9 +1130,19 @@ BOOST_AUTO_TEST_CASE(mmcif_testreader) for (rs = rl.begin(); rs != rl.end(); ++rs, ++i) { BOOST_CHECK_EQUAL(rs->GetNumber().GetNum(), i); } + BOOST_MESSAGE(" done."); - // add checking of struct_conf info, here - + BOOST_MESSAGE(" testing secondary structure..."); + // pick chains, iterate residues, check for correct sec.struct. + ch = eh.FindChain("A"); + rl = ch.GetResidueList(); + for (rs = rl.begin(); rs != rl.end(); ++rs) { + std::cout << "Foo " << (char)rs->GetSecStructure() << std::endl; + if (rs->GetSecStructure().IsExtended()) { + std::cout << "Bar" << std::endl; + } + //BOOST_CHECK_EQUAL(rs->GetSecStructure().IsHelical()); + } BOOST_MESSAGE(" done."); BOOST_MESSAGE(" reading data fields which should not fail..."); diff --git a/modules/io/tests/testfiles/mmcif/atom_site.mmcif b/modules/io/tests/testfiles/mmcif/atom_site.mmcif index b8e2473936cc72486c5e06856bf2093456e6caba..3268d114eea08cf4bb0072cf9bcecea410391222 100644 --- a/modules/io/tests/testfiles/mmcif/atom_site.mmcif +++ b/modules/io/tests/testfiles/mmcif/atom_site.mmcif @@ -94,6 +94,7 @@ _struct.pdbx_formula_weight_method 'Good Guess' _struct.pdbx_model_details 'Even better guessing' _struct.pdbx_model_type_details 'Guess' +# do not change anything, here loop_ _struct_conf.id _struct_conf.conf_type_id @@ -104,16 +105,9 @@ _struct_conf.end_label_comp_id _struct_conf.end_label_asym_id _struct_conf.end_label_seq_id _struct_conf.details -HELX1 HELX_RH_AL_P ARG A 87 GLN A 92 . -HELX2 HELX_RH_AL_P ARG B 287 GLN B 292 . -STRN1 STRN PRO A 1 LEU A 5 . -STRN2 STRN CYS B 295 PHE B 299 . -STRN3 STRN CYS A 95 PHE A 299 . -STRN4 STRN PRO B 201 LEU B 205 . -TURN1 TURN_TY1P_P ILE A 15 GLN A 18 . -TURN2 TURN_TY2_P GLY A 49 GLY A 52 . -TURN3 TURN_TY1P_P ILE A 55 HIS A 69 . -TURN4 TURN_TY1_P THR A 91 GLY A 94 . +HELX1 HELX_RH_AL_P VAL A 11 THR A 12 . +STRN1 STRN ILE A 13 ILE A 13 . +HELX1 HELX_RH_AL_P ILE Z 1 ILE Z 1 . loop_ _atom_site.group_PDB @@ -163,6 +157,15 @@ HETATM C C2 APS C 14 1 1 4.949 27.758 6.793 0.58 16.95 1 300 102 ? A HETATM O O3 APS C 14 1 1 4.800 26.678 7.393 0.58 16.85 1 300 103 ? A HETATM N N4 APS C 14 1 1 5.930 27.841 5.869 0.58 16.43 1 300 104 ? A # - - - - data truncated for brevity - - - - +# chain to be ignored by 'restrict_chains' feature +ATOM N N ILE Z 1 . 1 23.664 33.855 16.884 1.00 22.08 . 1 17 ? Z +ATOM C CA ILE Z 1 . 1 22.623 34.850 17.093 1.00 23.44 . 1 18 ? Z +ATOM C C ILE Z 1 . 1 22.657 35.113 18.610 1.00 25.77 . 1 19 ? Z +ATOM O O ILE Z 1 . 1 23.123 34.250 19.406 1.00 26.28 . 1 20 ? Z +ATOM C CB ILE Z 1 . 1 21.236 34.463 16.492 1.00 22.67 . 1 21 ? Z +ATOM C CG1 ILE Z 1 . 1 20.478 33.469 17.371 1.00 22.14 . 1 22 ? Z +ATOM C CG2 ILE Z 1 . 1 21.357 33.986 15.016 1.00 21.75 . 1 23 ? Z +# # H2O HETATM O O HOH O . . 5 -5.322 10.972 9.720 0.95 7.25 . 2001 2731 ? B HETATM O O HOH O . . 5 4.148 4.008 12.383 0.64 14.61 . 2002 2732 ? B