From 9146eff8041ade5a78703d93b51ea67e778f0a29 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Mon, 29 Jan 2024 14:35:19 +0100 Subject: [PATCH] mmcif writer: Derive info on chemical components from compound library previously this was done with residue ChemClasses. This is set by the compound library anyways during processing when loading a structure. The problem: when we suddenly need info on chemical components for stuff that is in the SEQRES but nowhere resolved in the structure. I really tried hard to avoid the requirement for the compound library in mmcif writing but there seems no way around. --- modules/io/doc/mmcif.rst | 30 ++- modules/io/pymod/__init__.py | 10 +- modules/io/pymod/export_mmcif_io.cc | 30 ++- modules/io/src/mol/mmcif_str.cc | 12 +- modules/io/src/mol/mmcif_str.hh | 4 +- modules/io/src/mol/mmcif_writer.cc | 387 ++++++++-------------------- modules/io/src/mol/mmcif_writer.hh | 9 +- 7 files changed, 174 insertions(+), 308 deletions(-) diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst index 87256aa47..feb05fca8 100644 --- a/modules/io/doc/mmcif.rst +++ b/modules/io/doc/mmcif.rst @@ -1816,11 +1816,12 @@ Expected properties when *mmcif_conform* is enabled: * The residue numbers in "polymer" chains must match the SEQRES of the underlying entity with 1-based indexing. Insertion codes are not allowed and raise an error. -* Each residue must have a valid chem class assigned (available as - :func:`ost.mol.ResidueHandle.GetChemClass`). Even though this information - would be available when reading valid mmCIF files, OpenStructure delegates - this to the :class:`ost.conop.Processor` and thus requires a valid - :class:`ost.conop.CompoundLib` when reading in order to correctly set them. +* Each residue must be named according to the entries in the + :class:`ost.conop.CompoundLib` which is provided when calling + :func:`MMCifWriter.SetStructure`. This is relevant for the _chem_comp + category. If the respective compound cannot be found, the type for that + compound is set to "OTHER" + There is one quirk remaining: The assignment of underlying mmCIF entities. This is a challenge primarily for polymers. The @@ -1867,12 +1868,14 @@ a few special cases: Behaviour when *mmcif_conform* is False """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -If *mmcif_conform* is not enabled, the only expectation is that chem classes -(available as :func:`ost.mol.ResidueHandle.GetChemClass`) are set. OpenStructure -delegates this to the :class:`ost.conop.Processor` and thus requires a valid -:class:`ost.conop.CompoundLib` when reading a structure. There will be -significant preprocessing involving the split of chains which is purely based -on the set chem classes. Each chain gets split with the following rules: +If *mmcif_conform* is not enabled, the only expectation is that residues are +named according to the :class:`ost.conop.CompoundLib` which is provided when +calling :func:`MMCifWriter.SetStructure`. The :class:`ost.conop.CompoundLib` is +used to extract the respective chem classes (see :class:`ost.mol.ChemClass`). +Residues with no entry in the :class:`ost.conop.CompoundLib` are set to +:class:`UNKNOWN`. There will be significant preprocessing involving the split of +chains which is purely based on these chem classes. Each chain gets split with +the following rules: * separate chain of |entity.type|_ "non-polymer" for each residue with chem class :class:`NON_POLYMER`/ :class:`UNKNOWN` @@ -2013,7 +2016,8 @@ To see it all in action: to extract relevant mmCIF information from :class:`ost.mol.EntityHandle`/ :class:`ost.mol.EntityView` - .. method:: SetStructure(ent, mmcif_conform=True, entity_info=list()) + .. method:: SetStructure(ent, compound_lib, mmcif_conform=True, + entity_info=list()) Extracts mmCIF categories/attributes based on the description above. An object of type :class:`MMCifWriter` can only be associated with one @@ -2021,6 +2025,8 @@ To see it all in action: :param ent: The stucture to write :type ent: :class:`ost.mol.EntityHandle`/ :class:`ost.mol.EntityView` + :param compound_lib: The compound library + :type compound_lib: :class:`ost.conop.CompoundLib` :param mmcif_conform: Determines data extraction strategy as described above :type mmcif_conform: :class:`bool` :param entity_info: Predefine mmCIF entities - useful to define complete diff --git a/modules/io/pymod/__init__.py b/modules/io/pymod/__init__.py index b67bb9241..e15bc81eb 100644 --- a/modules/io/pymod/__init__.py +++ b/modules/io/pymod/__init__.py @@ -459,13 +459,16 @@ def LoadMMCIF(filename, fault_tolerant=None, calpha_only=None, raise -def SaveMMCIF(ent, filename, data_name="OST_structure", mmcif_conform = True, +def SaveMMCIF(ent, filename, compound_lib = conop.GetDefaultLib(), + data_name="OST_structure", mmcif_conform = True, entity_info = MMCifWriterEntityList()): """ Save OpenStructure entity in mmCIF format :param ent: OpenStructure Entity to be saved :param filename: Filename - .gz suffix triggers gzip compression + :param compound_lib: Compound library required when writing, uses + :func:`ost.conop.GetDefaultLib` if not given :param data_name: Name of data block that will be written to mmCIF file. Typically, thats the PDB ID or some identifier. @@ -483,12 +486,15 @@ def SaveMMCIF(ent, filename, data_name="OST_structure", mmcif_conform = True, :param entity_info: Advanced usage - passed as *entity_info* parameter to :func:`MMCifWriter.SetStructure` :type filename: :class:`str` + :type compound_lib: :class:`ost.conop.CompoundLib` :type data_name: :class:`str` :type mmcif_conform: :class:`bool` :type entity_info: :class:`MMCifWriterEntityList` """ + if compound_lib is None: + raise RuntimeError("Require valid compound library to write mmCIF format") writer = MMCifWriter() - writer.SetStructure(ent, mmcif_conform = mmcif_conform, + writer.SetStructure(ent, compound_lib, mmcif_conform = mmcif_conform, entity_info = entity_info) writer.Write(data_name, filename) diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc index a6ff22c0d..325fb2439 100644 --- a/modules/io/pymod/export_mmcif_io.cc +++ b/modules/io/pymod/export_mmcif_io.cc @@ -59,14 +59,18 @@ boost::python::tuple WrapMMCifStringToEntity(const String& mmcif, String WrapEntityToMMCifStringEnt(const ost::mol::EntityHandle& ent, const String& data_name, + ost::conop::CompoundLibPtr compound_lib, bool mmcif_conform) { - return EntityToMMCifString(ent, data_name, mmcif_conform); + return EntityToMMCifString(ent, data_name, compound_lib, + mmcif_conform); } String WrapEntityToMMCifStringView(const ost::mol::EntityView& ent, const String& data_name, + ost::conop::CompoundLibPtr compound_lib, bool mmcif_conform) { - return EntityToMMCifString(ent, data_name, mmcif_conform); + return EntityToMMCifString(ent, data_name, compound_lib, + mmcif_conform); } void WrapStarLoopAddData(StarWriterLoop& sl, const boost::python::list& l) { @@ -84,16 +88,18 @@ void WrapStarWriterWrite(StarWriter& writer, const String& data_name, void WrapSetStructureHandle(MMCifWriter& writer, const ost::mol::EntityHandle& ent, + ost::conop::CompoundLibPtr compound_lib, bool mmcif_conform, const std::vector<MMCifWriterEntity>& entity_info) { - writer.SetStructure(ent, mmcif_conform, entity_info); + writer.SetStructure(ent, compound_lib, mmcif_conform, entity_info); } void WrapSetStructureView(MMCifWriter& writer, const ost::mol::EntityView& ent, + ost::conop::CompoundLibPtr compound_lib, bool mmcif_conform, const std::vector<MMCifWriterEntity>& entity_info) { - writer.SetStructure(ent, mmcif_conform, entity_info); + writer.SetStructure(ent, compound_lib, mmcif_conform, entity_info); } void export_mmcif_io() @@ -166,9 +172,11 @@ void export_mmcif_io() ; class_<MMCifWriter, bases<StarWriter> >("MMCifWriter", init<>()) - .def("SetStructure", &WrapSetStructureHandle, (arg("ent"), arg("mmcif_conform")=true, + .def("SetStructure", &WrapSetStructureHandle, (arg("ent"), arg("compound_lib"), + arg("mmcif_conform")=true, arg("entity_info")=std::vector<MMCifWriterEntity>())) - .def("SetStructure", &WrapSetStructureView, (arg("ent"), arg("mmcif_conform")=true, + .def("SetStructure", &WrapSetStructureView, (arg("ent"), arg("compound_lib"), + arg("mmcif_conform")=true, arg("entity_info")=std::vector<MMCifWriterEntity>())) .def("GetEntities", &MMCifWriter::GetEntities, return_value_policy<copy_const_reference>()) ; @@ -569,10 +577,12 @@ void export_mmcif_io() arg("process")=false)); def("EntityToMMCifString", &WrapEntityToMMCifStringEnt, (arg("ent"), - arg("data_name")="OST_structure", - arg("mmcif_conform")=true)); + arg("data_name"), + arg("compound_lib"), + arg("mmcif_conform"))); def("EntityToMMCifString", &WrapEntityToMMCifStringView, (arg("ent"), - arg("data_name")="OST_structure", - arg("mmcif_conform")=true)); + arg("data_name"), + arg("compound_lib"), + arg("mmcif_conform"))); } diff --git a/modules/io/src/mol/mmcif_str.cc b/modules/io/src/mol/mmcif_str.cc index a127ab7fa..6fde265e4 100644 --- a/modules/io/src/mol/mmcif_str.cc +++ b/modules/io/src/mol/mmcif_str.cc @@ -24,19 +24,23 @@ namespace ost { namespace io { String EntityToMMCifString(const ost::mol::EntityHandle& ent, - const String& data_name, bool mmcif_conform) { + const String& data_name, + ost::conop::CompoundLibPtr compound_lib, + bool mmcif_conform) { std::stringstream ss; MMCifWriter writer; - writer.SetStructure(ent, mmcif_conform); + writer.SetStructure(ent, compound_lib, mmcif_conform); writer.Write(data_name, ss); return ss.str(); } String EntityToMMCifString(const ost::mol::EntityView& ent, - const String& data_name, bool mmcif_conform) { + const String& data_name, + ost::conop::CompoundLibPtr compound_lib, + bool mmcif_conform) { std::stringstream ss; MMCifWriter writer; - writer.SetStructure(ent, mmcif_conform); + writer.SetStructure(ent, compound_lib, mmcif_conform); writer.Write(data_name, ss); return ss.str(); } diff --git a/modules/io/src/mol/mmcif_str.hh b/modules/io/src/mol/mmcif_str.hh index ec0c92df4..97978b341 100644 --- a/modules/io/src/mol/mmcif_str.hh +++ b/modules/io/src/mol/mmcif_str.hh @@ -29,11 +29,11 @@ namespace ost { namespace io { String DLLEXPORT_OST_IO EntityToMMCifString(const ost::mol::EntityHandle& ent, const String& data_name, - bool mmcif_conform); + ost::conop::CompoundLibPtr compound_lib, bool mmcif_conform); String DLLEXPORT_OST_IO EntityToMMCifString(const ost::mol::EntityView& ent, const String& data_name, - bool mmcif_conform); + ost::conop::CompoundLibPtr compound_lib, bool mmcif_conform); std::tuple<mol::EntityHandle, MMCifInfo, ost::seq::SequenceList> DLLEXPORT_OST_IO MMCifStringToEntity(const String& mmcif, const IOProfile& profile, bool process); diff --git a/modules/io/src/mol/mmcif_writer.cc b/modules/io/src/mol/mmcif_writer.cc index eb9614718..8c5ba161d 100644 --- a/modules/io/src/mol/mmcif_writer.cc +++ b/modules/io/src/mol/mmcif_writer.cc @@ -91,152 +91,6 @@ namespace { } } - // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList - template<class T> - String GuessEntityPolyType(const T& res_list) { - - // guesses _entity_poly.type based on residue chem classes - - // allowed values according to mmcif_pdbx_v50.dic: - // - cyclic-pseudo-peptide - // - other - // - peptide nucleic acid - // - polydeoxyribonucleotide - // - polydeoxyribonucleotide/polyribonucleotide hybrid - // - polypeptide(D) - // - polypeptide(L) - // - polyribonucleotide - - // this function won't identify cyclic-pseudo-peptides - - std::set<char> chem_classes; - for(auto res: res_list) { - chem_classes.insert(res.GetChemClass()); - } - - // check for polypeptide(L) - if(chem_classes.size() == 1 && - chem_classes.find(ost::mol::ChemClass::L_PEPTIDE_LINKING) != chem_classes.end()) { - return "polypeptide(L)"; - } - - if(chem_classes.size() == 2 && - chem_classes.find(ost::mol::ChemClass::L_PEPTIDE_LINKING) != chem_classes.end() && - chem_classes.find(ost::mol::ChemClass::PEPTIDE_LINKING) != chem_classes.end()) { - return "polypeptide(L)"; - } - - // check for polypeptide(D) - if(chem_classes.size() == 1 && - chem_classes.find(ost::mol::ChemClass::D_PEPTIDE_LINKING) != chem_classes.end()) { - return "polypeptide(D)"; - } - - if(chem_classes.size() == 2 && - chem_classes.find(ost::mol::ChemClass::D_PEPTIDE_LINKING) != chem_classes.end() && - chem_classes.find(ost::mol::ChemClass::PEPTIDE_LINKING) != chem_classes.end()) { - return "polypeptide(D)"; - } - - // check for polydeoxyribonucleotide - if(chem_classes.size() == 1 && - chem_classes.find(ost::mol::ChemClass::DNA_LINKING) != chem_classes.end()) { - return "polydeoxyribonucleotide"; - } - - // check for polyribonucleotide - if(chem_classes.size() == 1 && - chem_classes.find(ost::mol::ChemClass::RNA_LINKING) != chem_classes.end()) { - return "polyribonucleotide"; - } - - // check for polydeoxyribonucleotide/polyribonucleotide hybrid - if(chem_classes.size() == 2 && - chem_classes.find(ost::mol::ChemClass::RNA_LINKING) != chem_classes.end() && - chem_classes.find(ost::mol::ChemClass::DNA_LINKING) != chem_classes.end()) { - return "polydeoxyribonucleotide/polyribonucleotide hybrid"; - } - - // check for peptide nucleic acid - bool peptide_linking = chem_classes.find(ost::mol::ChemClass::L_PEPTIDE_LINKING) != chem_classes.end() || - chem_classes.find(ost::mol::ChemClass::D_PEPTIDE_LINKING) != chem_classes.end() || - chem_classes.find(ost::mol::ChemClass::PEPTIDE_LINKING) != chem_classes.end(); - bool nucleotide_linking = chem_classes.find(ost::mol::ChemClass::DNA_LINKING) != chem_classes.end() || - chem_classes.find(ost::mol::ChemClass::RNA_LINKING) != chem_classes.end(); - std::set<char> pepnuc_set; - pepnuc_set.insert(ost::mol::ChemClass::L_PEPTIDE_LINKING); - pepnuc_set.insert(ost::mol::ChemClass::D_PEPTIDE_LINKING); - pepnuc_set.insert(ost::mol::ChemClass::PEPTIDE_LINKING); - pepnuc_set.insert(ost::mol::ChemClass::DNA_LINKING); - pepnuc_set.insert(ost::mol::ChemClass::RNA_LINKING); - pepnuc_set.insert(chem_classes.begin(), chem_classes.end()); - if(peptide_linking && nucleotide_linking && pepnuc_set.size() == 5) { - return "peptide nucleic acid"; - } - - return "other"; - } - - // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList - template<class T> - String GuessEntityBranchType(const T& res_list) { - // guesses _pdbx_entity_branch.type based on residue chem classes - // - // Thats a hard one... only allowed value according to mmcif_pdbx_v50.dic: - // oligosaccharide - return "oligosaccharide"; - } - - // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList - template<class T> - String GuessEntityType(const T& res_list) { - - // guesses _entity.type based on residue chem classes - - // allowed values according to mmcif_pdbx_v50.dic: - // - branched - // - macrolide - // - non-polymer - // - polymer - // - water - - // this function is overly simplistic and won't identify macrolide - - std::set<char> chem_classes; - for(auto res: res_list) { - chem_classes.insert(res.GetChemClass()); - } - - // check for water - if(chem_classes.size() == 1 && - chem_classes.find(ost::mol::ChemClass::WATER) != chem_classes.end()) { - return "water"; - } - - std::set<char> branched_set; - branched_set.insert(ost::mol::ChemClass::L_SACCHARIDE); - branched_set.insert(ost::mol::ChemClass::D_SACCHARIDE); - branched_set.insert(ost::mol::ChemClass::SACCHARIDE); - branched_set.insert(chem_classes.begin(), chem_classes.end()); - // Single saccharides are non-poly (handled in next conditional), from 2 - // accharides covalently bound onwards, its called a 'branched' molecular - // entity. Check 3AXH & 1A4G from RCSB for examples. - if(branched_set.size() == 3 && res_list.size() > 1) { - return "branched"; - } - - // Entities must have at least 3 residues to be considered polymers - // For now, we just set entities with 1 or 2 residues as non-polymer - // For examples from RCSB, check 1E8K (dipeptide), 4K9A (dinucleotide), BUT - // watch out for oligosaccharides like 3AXH, 2 sugars are already of type - // `branched` and single sugars are non-polymer (1A4G) - if(res_list.size() == 1 || res_list.size() == 2) { - return "non-polymer"; - } - - return "polymer"; - } - inline String ChemClassToChemCompType(char chem_class) { String type = ""; switch(chem_class) { @@ -461,43 +315,6 @@ namespace { return "(" + mon_id + ")"; } - // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList - template<class T> - void SetupChemComp(const T& res_list, - std::map<String, ost::io::MMCifWriterComp>& comp_infos) { - for(auto res: res_list) { - String res_name = res.GetName(); - String type = ChemClassToChemCompType(res.GetChemClass()); - auto it = comp_infos.find(res_name); - if(it != comp_infos.end()) { - if(it->second.type != type) { - // If the already stored type or the incoming type are OTHER, - // we keep the one that is NOT OTHER => everything has preference over - // OTHER. However, if both types are NOT OTHER and they do not match, - // we throw an error. - if(type == "OTHER") { - continue; - } else if (it->second.type == "OTHER") { - ost::io::MMCifWriterComp info; - info.type = type; - comp_infos[res_name] = info; - } else { - std::stringstream ss; - ss << "Residue " << res << "has _chem_comp.type \"" << type; - ss << "\" which is derived from its chem class: " << res.GetChemClass(); - ss << ". Observed already another _chem_comp.type for a residue of "; - ss << "the same name: " << it->second.type; - throw ost::io::IOException(ss.str()); - } - } - } else { - ost::io::MMCifWriterComp info; - info.type = type; - comp_infos[res_name] = info; - } - } - } - // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList template<class T> bool MatchEntity(const T& res_list, @@ -760,29 +577,6 @@ namespace { return entity_idx; } - // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList - template<class T> - int SetupEntity(const String& asym_chain_name, - const T& res_list, - bool resnum_alignment, - std::vector<ost::io::MMCifWriterEntity>& entity_infos) { - // use chem types in res_list to determine _entity.type and - // _entity_poly.type - String type = GuessEntityType(res_list); - bool is_poly = type == "polymer"; - String poly_type = ""; - if(is_poly) { - poly_type = GuessEntityPolyType(res_list); - } - bool is_branched = type == "branched"; - String branch_type = ""; - if(is_branched) { - branch_type = GuessEntityBranchType(res_list); - } - return SetupEntity(asym_chain_name, type, poly_type, branch_type, res_list, - resnum_alignment, entity_infos); - } - // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList template<class T> int SetupEntity(const String& asym_chain_name, @@ -1135,11 +929,24 @@ namespace { } void Feed_chem_comp(ost::io::StarWriterLoopPtr chem_comp_ptr, - const std::map<String, ost::io::MMCifWriterComp>& comp_infos) { + const std::vector<ost::io::MMCifWriterEntity>& entity_infos, + ost::conop::CompoundLibPtr compound_lib) { + std::set<String> unique_compounds; + for(auto ent: entity_infos) { + unique_compounds.insert(ent.mon_ids.begin(), ent.mon_ids.end()); + } std::vector<ost::io::StarWriterValue> comp_data(2); - for(auto it = comp_infos.begin(); it != comp_infos.end(); ++it) { - comp_data[0] = ost::io::StarWriterValue::FromString(it->first); - comp_data[1] = ost::io::StarWriterValue::FromString(it->second.type); + for(auto mon_id: unique_compounds) { + comp_data[0] = ost::io::StarWriterValue::FromString(mon_id); + ost::conop::CompoundPtr comp = compound_lib->FindCompound(mon_id, + ost::conop::Compound::PDB); + if(comp) { + String type = ChemClassToChemCompType(comp->GetChemClass()); + comp_data[1] = ost::io::StarWriterValue::FromString(type); + } else { + String type = ChemClassToChemCompType(ost::mol::ChemClass::UNKNOWN); + comp_data[1] = ost::io::StarWriterValue::FromString(type); + } chem_comp_ptr->AddData(comp_data); } } @@ -1159,7 +966,7 @@ namespace { // template to allow ost::mol::ResidueHandleList and ost::mol::ResidueViewList template<class T> void ProcessEntmmCIFify(const std::vector<T>& res_lists, - std::map<String, ost::io::MMCifWriterComp>& comp_infos, + ost::conop::CompoundLibPtr compound_lib, std::vector<ost::io::MMCifWriterEntity>& entity_info, ost::io::StarWriterLoopPtr atom_site, ost::io::StarWriterLoopPtr pdbx_poly_seq_scheme) { @@ -1168,8 +975,6 @@ namespace { for(auto res_list: res_lists) { - SetupChemComp(res_list, comp_infos); - T L_chain; T D_chain; T P_chain; @@ -1178,21 +983,33 @@ namespace { T Z_chain; T W_chain; + std::vector<ost::mol::ChemClass> chem_classes; + chem_classes.reserve(res_list.size()); + for(auto res: res_list) { + ost::conop::CompoundPtr comp = compound_lib->FindCompound(res.GetName(), + ost::conop::Compound::PDB); + if(comp) { + chem_classes.push_back(comp->GetChemClass()); + } else { + chem_classes.push_back(ost::mol::ChemClass(ost::mol::ChemClass::UNKNOWN)); + } + } + // first scan only concerning peptides... // Avoid mix of both in same chain: L-peptide linking, D-peptide linking // But we still want to know that in advance as we assign non chiral // peptides to either of those bool has_l_peptide_linking = false; bool has_d_peptide_linking = false; - for(auto res: res_list) { - if(res.GetChemClass() == ost::mol::ChemClass::D_PEPTIDE_LINKING) { + for(auto chem_class: chem_classes) { + if(chem_class == ost::mol::ChemClass::D_PEPTIDE_LINKING) { if(has_l_peptide_linking) { throw ost::io::IOException("Cannot write mmCIF when same chain " "contains D- and L-peptides"); } has_d_peptide_linking = true; } - if(res.GetChemClass() == ost::mol::ChemClass::L_PEPTIDE_LINKING) { + if(chem_class == ost::mol::ChemClass::L_PEPTIDE_LINKING) { if(has_d_peptide_linking) { throw ost::io::IOException("Cannot write mmCIF when same chain " "contains D- and L-peptides"); @@ -1201,34 +1018,40 @@ namespace { } } - for(auto res: res_list) { - if(res.GetChemClass().IsPeptideLinking()) { + for(size_t i = 0; i < res_list.size(); ++i) { + if(chem_classes[i].IsPeptideLinking()) { if(has_l_peptide_linking) { - L_chain.push_back(res); + L_chain.push_back(res_list[i]); } else if(has_d_peptide_linking) { - D_chain.push_back(res); + D_chain.push_back(res_list[i]); } else { - P_chain.push_back(res); + P_chain.push_back(res_list[i]); } - } else if(res.GetChemClass() == ost::mol::ChemClass::RNA_LINKING) { - R_chain.push_back(res); - } else if(res.GetChemClass() == ost::mol::ChemClass::DNA_LINKING) { - S_chain.push_back(res); - } else if(res.GetChemClass() == ost::mol::ChemClass::L_SACCHARIDE) { - Z_chain.push_back(res); - } else if(res.GetChemClass() == ost::mol::ChemClass::D_SACCHARIDE) { - Z_chain.push_back(res); - } else if(res.GetChemClass() == ost::mol::ChemClass::SACCHARIDE) { - Z_chain.push_back(res); - } else if(res.GetChemClass() == ost::mol::ChemClass::WATER) { - W_chain.push_back(res); - } else if(res.GetChemClass() == ost::mol::ChemClass::NON_POLYMER || - res.GetChemClass() == ost::mol::ChemClass::UNKNOWN) { + } else if(chem_classes[i] == ost::mol::ChemClass::RNA_LINKING) { + R_chain.push_back(res_list[i]); + } else if(chem_classes[i] == ost::mol::ChemClass::DNA_LINKING) { + S_chain.push_back(res_list[i]); + } else if(chem_classes[i] == ost::mol::ChemClass::L_SACCHARIDE) { + Z_chain.push_back(res_list[i]); + } else if(chem_classes[i] == ost::mol::ChemClass::D_SACCHARIDE) { + Z_chain.push_back(res_list[i]); + } else if(chem_classes[i] == ost::mol::ChemClass::SACCHARIDE) { + Z_chain.push_back(res_list[i]); + } else if(chem_classes[i] == ost::mol::ChemClass::WATER) { + W_chain.push_back(res_list[i]); + } else if(chem_classes[i] == ost::mol::ChemClass::NON_POLYMER || + chem_classes[i] == ost::mol::ChemClass::UNKNOWN) { // already process non-poly and unknown + String type = "non-polymer"; + String poly_type = ""; + String branch_type = ""; T tmp; - tmp.push_back(res); + tmp.push_back(res_list[i]); String chain_name = chain_name_gen.Get(); int entity_id = SetupEntity(chain_name, + type, + poly_type, + branch_type, tmp, false, entity_info); @@ -1237,18 +1060,32 @@ namespace { } else { // this should not happen... std::stringstream ss; - ss << "Unsupported chem class (" << res.GetChemClass(); - ss << ") for residue "<< res; + ss << "Unsupported chem class (" << res_list[i].GetChemClass(); + ss << ") for residue "<< res_list[i]; throw ost::io::IOException(ss.str()); } } - // process poly chains T* poly_chains[5] = {&L_chain, &D_chain, &P_chain, &R_chain, &S_chain}; + String poly_types[5] = {"polypeptide(L)", "polypeptide(D)", + "polypeptide(L)", "polyribonucleotide", + "polydeoxyribonucleotide"}; + for(int i = 0; i < 5; ++i) { if(!poly_chains[i]->empty()) { + String type = "polymer"; + String poly_type = poly_types[i]; + if(poly_chains[i]->size() <= 2) { + // must have length of at least 3 to be polymer + type = "non-polymer"; + poly_type = ""; + } + String branch_type = ""; String chain_name = chain_name_gen.Get(); int entity_id = SetupEntity(chain_name, + type, + poly_type, + branch_type, *poly_chains[i], false, entity_info); @@ -1266,8 +1103,14 @@ namespace { // process water chain if(!W_chain.empty()) { + String type = "water"; + String poly_type = ""; + String branch_type = ""; String chain_name = chain_name_gen.Get(); int entity_id = SetupEntity(chain_name, + type, + poly_type, + branch_type, W_chain, false, entity_info); @@ -1276,8 +1119,20 @@ namespace { } // process sugar chain if(!Z_chain.empty()) { + String type = "branched"; + String poly_type = ""; + String branch_type = "oligosaccharide"; + if(Z_chain.size() == 1) { + // not really branched if the poor little Zueckerli is alone... + type = "non-polymer"; + branch_type = ""; + } + String chain_name = chain_name_gen.Get(); int entity_id = SetupEntity(chain_name, + type, + poly_type, + branch_type, Z_chain, false, entity_info); @@ -1288,7 +1143,7 @@ namespace { } void ProcessEntmmCIFify(const ost::mol::EntityHandle& ent, - std::map<String, ost::io::MMCifWriterComp>& comp_infos, + ost::conop::CompoundLibPtr compound_lib, std::vector<ost::io::MMCifWriterEntity>& entity_info, ost::io::StarWriterLoopPtr atom_site, ost::io::StarWriterLoopPtr pdbx_poly_seq_scheme) { @@ -1297,12 +1152,12 @@ namespace { for(auto ch: chain_list) { res_lists.push_back(ch.GetResidueList()); } - ProcessEntmmCIFify(res_lists, comp_infos, entity_info, + ProcessEntmmCIFify(res_lists, compound_lib, entity_info, atom_site, pdbx_poly_seq_scheme); } void ProcessEntmmCIFify(const ost::mol::EntityView& ent, - std::map<String, ost::io::MMCifWriterComp>& comp_infos, + ost::conop::CompoundLibPtr compound_lib, std::vector<ost::io::MMCifWriterEntity>& entity_info, ost::io::StarWriterLoopPtr atom_site, ost::io::StarWriterLoopPtr pdbx_poly_seq_scheme) { @@ -1311,15 +1166,15 @@ namespace { for(auto ch: chain_list) { res_lists.push_back(ch.GetResidueList()); } - ProcessEntmmCIFify(res_lists, comp_infos, entity_info, + ProcessEntmmCIFify(res_lists, compound_lib, entity_info, atom_site, pdbx_poly_seq_scheme); } // template to allow ost::mol::EntityHandle and ost::mol::EntityView template<class T> void ProcessEnt(const T& ent, + ost::conop::CompoundLibPtr compound_lib, bool mmcif_conform, - std::map<String, ost::io::MMCifWriterComp>& comp_infos, std::vector<ost::io::MMCifWriterEntity>& entity_info, ost::io::StarWriterLoopPtr atom_site, ost::io::StarWriterLoopPtr pdbx_poly_seq_scheme) { @@ -1328,7 +1183,6 @@ namespace { auto chain_list = ent.GetChainList(); for(auto ch: chain_list) { auto res_list = ch.GetResidueList(); - SetupChemComp(res_list, comp_infos); String chain_name = ch.GetName(); int entity_id = SetupEntity(chain_name, ch.GetType(), @@ -1344,13 +1198,12 @@ namespace { } } else { // delegate to more complex ProcessEntmmCIFify - ProcessEntmmCIFify(ent, comp_infos, entity_info, atom_site, + ProcessEntmmCIFify(ent, compound_lib, entity_info, atom_site, pdbx_poly_seq_scheme); } } - void ProcessUnknowns(std::vector<ost::io::MMCifWriterEntity>& entity_infos, - std::map<String, ost::io::MMCifWriterComp>& comp_infos) { + void ProcessUnknowns(std::vector<ost::io::MMCifWriterEntity>& entity_infos) { for(size_t entity_idx = 0; entity_idx < entity_infos.size(); ++entity_idx) { if(entity_infos[entity_idx].is_poly) { @@ -1367,22 +1220,12 @@ namespace { entity_infos[entity_idx].mon_ids[mon_id_idx] = "UNK"; entity_infos[entity_idx].seq_olcs[mon_id_idx] = "(UNK)"; entity_infos[entity_idx].seq_can_olcs[mon_id_idx] = "X"; - if(comp_infos.find("UNK") == comp_infos.end()) { - ost::io::MMCifWriterComp info; - info.type = "L-PEPTIDE LINKING"; - comp_infos["UNK"] = info; - } } else if(entity_infos[entity_idx].poly_type == "polydeoxyribonucleotide") { entity_infos[entity_idx].mon_ids[mon_id_idx] = "DN"; entity_infos[entity_idx].seq_olcs[mon_id_idx] = "(DN)"; entity_infos[entity_idx].seq_can_olcs[mon_id_idx] = "N"; - if(comp_infos.find("DN") == comp_infos.end()) { - ost::io::MMCifWriterComp info; - info.type = "DNA LINKING"; - comp_infos["DN"] = info; - } } else if(entity_infos[entity_idx].poly_type == "polyribonucleotide" || @@ -1390,11 +1233,6 @@ namespace { entity_infos[entity_idx].mon_ids[mon_id_idx] = "N"; entity_infos[entity_idx].seq_olcs[mon_id_idx] = "N"; entity_infos[entity_idx].seq_can_olcs[mon_id_idx] = "N"; - if(comp_infos.find("N") == comp_infos.end()) { - ost::io::MMCifWriterComp info; - info.type = "RNA LINKING"; - comp_infos["N"] = info; - } } else { @@ -1459,24 +1297,26 @@ int MMCifWriterEntity::GetAsymIdx(const String& asym_id) const { } void MMCifWriter::SetStructure(const ost::mol::EntityHandle& ent, + conop::CompoundLibPtr compound_lib, bool mmcif_conform, const std::vector<MMCifWriterEntity>& entity_info) { this->Setup(); entity_info_ = entity_info; - ProcessEnt(ent, mmcif_conform, comp_info_, entity_info_, atom_site_, - pdbx_poly_seq_scheme_); - this->Finalize(); + ProcessEnt(ent, compound_lib, mmcif_conform, entity_info_, + atom_site_, pdbx_poly_seq_scheme_); + this->Finalize(compound_lib); } void MMCifWriter::SetStructure(const ost::mol::EntityView& ent, + conop::CompoundLibPtr compound_lib, bool mmcif_conform, const std::vector<MMCifWriterEntity>& entity_info) { this->Setup(); entity_info_ = entity_info; - ProcessEnt(ent, mmcif_conform, comp_info_, entity_info_, atom_site_, - pdbx_poly_seq_scheme_); - this->Finalize(); + ProcessEnt(ent, compound_lib, mmcif_conform, entity_info_, + atom_site_, pdbx_poly_seq_scheme_); + this->Finalize(compound_lib); } void MMCifWriter::Setup() { @@ -1496,19 +1336,18 @@ void MMCifWriter::Setup() { pdbx_entity_branch_ = Setup_pdbx_entity_branch_ptr(); } -void MMCifWriter::Finalize() { +void MMCifWriter::Finalize(ost::conop::CompoundLibPtr compound_lib) { // depending on the strategy (mmcif_conform), there might be gaps in the // entities mon_ids/ seq_olcs/ seq_can_olcs // The following function adds valid stuff depending on chain type - // (e.g. UNK if we're having a peptide linking chain and then adds - // that UNK directly to comp_info) - ProcessUnknowns(entity_info_, comp_info_); + // (e.g. UNK if we're having a peptide linking chain) + ProcessUnknowns(entity_info_); Feed_entity(entity_, entity_info_); Feed_struct_asym(struct_asym_, entity_info_); Feed_entity_poly(entity_poly_, entity_info_); Feed_entity_poly_seq(entity_poly_seq_, entity_info_); - Feed_chem_comp(chem_comp_, comp_info_); + Feed_chem_comp(chem_comp_, entity_info_, compound_lib); Feed_atom_type(atom_type_, atom_site_); Feed_pdbx_entity_branch(pdbx_entity_branch_, entity_info_); diff --git a/modules/io/src/mol/mmcif_writer.hh b/modules/io/src/mol/mmcif_writer.hh index 3560834bf..25f20f5db 100644 --- a/modules/io/src/mol/mmcif_writer.hh +++ b/modules/io/src/mol/mmcif_writer.hh @@ -104,10 +104,12 @@ public: virtual ~MMCifWriter() { } - void SetStructure(const ost::mol::EntityHandle& ent, bool mmcif_conform=true, + void SetStructure(const ost::mol::EntityHandle& ent, conop::CompoundLibPtr compound_lib, + bool mmcif_conform=true, const std::vector<MMCifWriterEntity>& entity_info=std::vector<MMCifWriterEntity>()); - void SetStructure(const ost::mol::EntityView& ent, bool mmcif_conform=true, + void SetStructure(const ost::mol::EntityView& ent, conop::CompoundLibPtr compound_lib, + bool mmcif_conform=true, const std::vector<MMCifWriterEntity>& entity_info=std::vector<MMCifWriterEntity>()); const std::vector<MMCifWriterEntity>& GetEntities() const { return entity_info_; } @@ -116,10 +118,9 @@ private: void Setup(); - void Finalize(); + void Finalize(ost::conop::CompoundLibPtr compound_lib); std::vector<MMCifWriterEntity> entity_info_; - std::map<String, MMCifWriterComp> comp_info_; StarWriterLoopPtr atom_type_; StarWriterLoopPtr atom_site_; StarWriterLoopPtr pdbx_poly_seq_scheme_; -- GitLab