diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index bb5241e7ea511a646b1ecc4e1d097691a7e1b4c2..dc384b57f8de98163474437056824d442f6ba5cf 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -19,17 +19,13 @@ #include <ctype.h> -//#include <boost/iostreams/filter/gzip.hpp> -//#include <boost/filesystem/convenience.hpp> -//#include <boost/algorithm/string.hpp> -//#include <boost/algorithm/string/trim.hpp> - -//#include <boost/format.hpp> -//#include <boost/lexical_cast.hpp> - #include <ost/profile.hh> #include <ost/log.hh> +#include <ost/dyn_cast.hh> #include <ost/mol/xcs_editor.hh> +#include <ost/conop/conop.hh> + +#include <ost/conop/rule_based_builder.hh> #include <ost/io/mol/mmcif_reader.hh> namespace ost { namespace io { @@ -65,6 +61,7 @@ void MMCifParser::Init() curr_residue_ = mol::ResidueHandle(); seqres_ = seq::CreateSequenceList(); read_seqres_ = false; + warned_rule_based_ = false; } void MMCifParser::ClearState() @@ -522,6 +519,7 @@ void MMCifParser::ParseEntityPoly(const std::vector<StringRef>& columns) if (seqres_can_) { if (indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] != -1) { seqres=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN]]; + edm_it->second.seqres = seqres.str_no_whitespace(); } else { throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, "'entity_poly.pdbx_seq_one_letter_code_can' not available.'", @@ -529,15 +527,67 @@ void MMCifParser::ParseEntityPoly(const std::vector<StringRef>& columns) } } else if (indices_[PDBX_SEQ_ONE_LETTER_CODE] != -1) { seqres=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE]]; + conop::BuilderP builder=conop::Conopology::Instance().GetBuilder("DEFAULT"); + conop::RuleBasedBuilderPtr rbb=dyn_cast<conop::RuleBasedBuilder>(builder); + if (!rbb) { + if (!warned_rule_based_) { + LOG_WARNING("SEQRES import requires the rule-based builder. Ignoring " + "SEQRES records"); + } + warned_rule_based_=true; + return; + } + conop::CompoundLibPtr comp_lib=rbb->GetCompoundLib(); + edm_it->second.seqres = this->ConvertSEQRES(seqres.str_no_whitespace(), + comp_lib); } else { throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, "'entity_poly.pdbx_seq_one_letter_code' not available.'", this->GetCurrentLinenum())); } - edm_it->second.seqres = seqres.str_no_whitespace(); } } +String MMCifParser::ConvertSEQRES(const String& seqres, + conop::CompoundLibPtr comp_lib) +{ + String can_seqres; + for (String::const_iterator i=seqres.begin(), e=seqres.end(); i!=e; ++i) { + if (*i=='(') { + bool found_end_paren=false; + String tlc; + tlc.reserve(3); + while ((++i)!=seqres.end()) { + if (*i==')') { + found_end_paren=true; + break; + } + tlc.push_back(*i); + } + if (!found_end_paren) { + throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, + "'entity_poly.pdbx_seq_one_letter_code' contains " + "unmatched '('", this->GetCurrentLinenum())); + } + conop::CompoundPtr compound=comp_lib->FindCompound(tlc, + conop::Compound::PDB); + if (!compound) { + if (tlc!="UNK") { + + LOG_WARNING("unknown residue '" << tlc << "' in SEQRES record. " + "Setting one-letter-code to '?'"); + } + can_seqres.push_back('?'); + continue; + } + can_seqres.push_back(compound->GetOneLetterCode()); + } else { + can_seqres.push_back(*i); + } + } + return can_seqres; +} + void MMCifParser::ParseCitation(const std::vector<StringRef>& columns) { // fetch dependencies from dscription, like article requires year diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh index 2d1990b809aafb0570e72ea8b60051830de7eaf0..d82b9fb103d8dcf269af4463f9d6b51ab2106e57 100644 --- a/modules/io/src/mol/mmcif_reader.hh +++ b/modules/io/src/mol/mmcif_reader.hh @@ -27,6 +27,7 @@ #include <ost/seq/sequence_list.hh> #include <ost/mol/residue_handle.hh> #include <ost/mol/chain_type.hh> +#include <ost/conop/compound_lib.hh> #include <ost/io/mol/io_profile.hh> #include <ost/io/io_exception.hh> #include <ost/io/mol/star_parser.hh> @@ -200,6 +201,15 @@ public: /// \param columns data row void ParseCitation(const std::vector<StringRef>& columns); + /// \brief convert the seqres data item to canonical form. + /// + /// The seqres sequence lists non-standard residues in paranthesis. For + /// proper handling of our sequence classes, these need to be converted to + /// one-letter-codes. Ideally, we would use the canonical SEQRES. This is + /// not possible, however, since the PDB assigns multiple one letter codes + /// to some of the residues. To be consistent, we have to do the conversion on + /// our own. + String ConvertSEQRES(const String& seqres, conop::CompoundLibPtr compound_lib); private: /// \enum magic numbers of this class typedef enum { @@ -293,6 +303,7 @@ private: int residue_count_; int atom_count_; bool warned_name_mismatch_; + bool warned_rule_based_; String subst_res_id_; ///< work around for missing label_seq_id's bool has_model_; ///< keep track of models through different atom_sites int curr_model_; ///< if we have pdbx_PDB_model_num, store no. diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc index 34836932889c477d70c2174441606d9139bd36a8..08b3a8bc972e1e53b0f2d06eae5e81795c237559 100644 --- a/modules/io/tests/test_mmcif_reader.cc +++ b/modules/io/tests/test_mmcif_reader.cc @@ -18,8 +18,11 @@ //------------------------------------------------------------------------------ #include <fstream> +#include <ost/platform.hh> #include <ost/io/io_exception.hh> #include <ost/io/mol/mmcif_reader.hh> +#include <ost/conop/conop.hh> +#include <ost/conop/rule_based_builder.hh> #define BOOST_AUTO_TEST_DYN_LINK #include <boost/test/unit_test.hpp> @@ -54,6 +57,7 @@ public: using MMCifParser::SetReadSeqRes; using MMCifParser::SetReadCanonicalSeqRes; using MMCifParser::ClearState; + using MMCifParser::ConvertSEQRES; }; BOOST_AUTO_TEST_SUITE( io ); @@ -96,6 +100,30 @@ BOOST_AUTO_TEST_CASE(mmcif_trystoreidx) BOOST_CHECK_NO_THROW(tmmcif_p.TryStoreIdx(0, "bar", mmcif_h)); } +BOOST_AUTO_TEST_CASE(mmcif_convert_seqres) +{ + SetPrefixPath(getenv("OST_ROOT")); + String lib_path=GetSharedDataPath()+"/compounds.chemlib"; + conop::CompoundLibPtr compound_lib=conop::CompoundLib::Load(lib_path); + if (!compound_lib) { + std::cout << "WARNING: skipping SEQRES import unit test. " + << "Rule-based builder is required" << std::endl; + return; + } + conop::RuleBasedBuilderPtr rbb(new conop::RuleBasedBuilder(compound_lib)); + conop::Conopology::Instance().RegisterBuilder(rbb, "RBB"); + conop::Conopology::Instance().SetDefaultBuilder("RBB"); + mol::EntityHandle eh=mol::CreateEntity(); + + TestMMCifParserProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh); + std::vector<StringRef> columns; + StarLoopDesc tmmcif_h; + BOOST_CHECK_EQUAL(tmmcif_p.ConvertSEQRES("A(MSE)Y", compound_lib), "AMY"); + BOOST_CHECK_THROW(tmmcif_p.ConvertSEQRES("A(MSEY", compound_lib), + IOException); + conop::Conopology::Instance().SetDefaultBuilder("HEURISTIC"); +} + BOOST_AUTO_TEST_CASE(mmcif_onbeginloop) { mol::EntityHandle eh=mol::CreateEntity(); @@ -321,6 +349,7 @@ BOOST_AUTO_TEST_CASE(mmcif_entity_tests) BOOST_AUTO_TEST_CASE(mmcif_entity_poly_tests) { + conop::Conopology::Instance().SetDefaultBuilder("RBB"); BOOST_MESSAGE(" Running mmcif_entity_poly_tests..."); mol::ChainHandle ch; IOProfile profile; @@ -462,7 +491,7 @@ columns.push_back(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid", tmmcif_h.Add(StringRef("type", 4)); tmmcif_h.Add(StringRef("pdbx_seq_one_letter_code_can", 28)); tmmcif_p.OnBeginLoop(tmmcif_h); - + tmmcif_p.SetReadCanonicalSeqRes(false); columns.push_back(StringRef("1", 1)); columns.push_back(StringRef("other", 5)); columns.push_back(StringRef("ABRND", 5)); @@ -475,6 +504,7 @@ columns.push_back(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid", BOOST_MESSAGE(" done."); BOOST_MESSAGE(" done."); + conop::Conopology::Instance().SetDefaultBuilder("HEURISTIC"); } BOOST_AUTO_TEST_CASE(mmcif_parseatomident) @@ -490,7 +520,6 @@ BOOST_AUTO_TEST_CASE(mmcif_parseatomident) String chain_name; StringRef res_name; mol::ResNum resnum(0); - char alt_loc; //StringRef atom_name; BOOST_MESSAGE(" testing valid line"); diff --git a/modules/io/tests/testfiles/mmcif/atom_site.mmcif b/modules/io/tests/testfiles/mmcif/atom_site.mmcif index 2a01dc7331acf7b877f1035561c86d3ea3bba853..eaa46793866696ac28513af3c1c467f7dfe005fa 100644 --- a/modules/io/tests/testfiles/mmcif/atom_site.mmcif +++ b/modules/io/tests/testfiles/mmcif/atom_site.mmcif @@ -19,7 +19,7 @@ _entity_poly.entity_id 1 _entity_poly.type 'polypeptide(L)' _entity_poly.nstd_linkage no _entity_poly.nstd_monomer no -_entity_poly.pdbx_seq_one_letter_code 'VTI' +_entity_poly.pdbx_seq_one_letter_code 'VTI' loop_ _atom_site.group_PDB