From 352335f134c90006fe9787acdfb603fda4d7e9f2 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Wed, 22 Nov 2023 09:32:32 +0100 Subject: [PATCH] mmcif writer: write _entity, _struct_asym, _entity_poly_seq categories --- modules/io/src/mol/mmcif_writer.cc | 380 ++++++++++++++++++++++------- modules/io/src/mol/mmcif_writer.hh | 13 +- modules/mol/base/src/chem_class.hh | 7 + 3 files changed, 307 insertions(+), 93 deletions(-) diff --git a/modules/io/src/mol/mmcif_writer.cc b/modules/io/src/mol/mmcif_writer.cc index c266d01c2..6abcd266d 100644 --- a/modules/io/src/mol/mmcif_writer.cc +++ b/modules/io/src/mol/mmcif_writer.cc @@ -70,6 +70,208 @@ namespace { int n_chain_names; std::vector<int> indices; }; + + // internal object with all info to fill entity_, struct_asym_, + // entity_poly_seq_ categories + struct EntityInfo { + char chem_class; // all residues of this entity have this ChemClass + std::vector<String> asym_ids; // relevant for _struct_asym.id + std::vector<String> mon_ids; // relevant for _entity_poly_seq.mon_id + }; + + int Setup_entity_(const String& asym_chain_name, + char chem_class, + const ost::mol::ResidueHandleList& res_list, + std::vector<EntityInfo>& entity_infos) { + std::vector<String> mon_ids; + for(auto res : res_list) { + mon_ids.push_back(res.GetName()); + } + + // check whether we already have that entity + // right now we're just looking for exact matches in chem_class and + // mon_ids (i.e. sequence). + int entity_idx = -1; + for(size_t i = 0; i < entity_infos.size(); ++i) { + if(entity_infos[i].chem_class == chem_class && + entity_infos[i].mon_ids == mon_ids) { + entity_idx = i; + break; + } + } + + if(entity_idx != -1) { + entity_infos[entity_idx].asym_ids.push_back(asym_chain_name); + } else { + entity_idx = entity_infos.size(); + entity_infos.push_back(EntityInfo()); + entity_infos.back().chem_class = chem_class; + entity_infos.back().asym_ids.push_back(asym_chain_name); + entity_infos.back().mon_ids = mon_ids; + } + return entity_idx; + } + + ost::io::StarLoop* Setup_atom_site_ptr() { + ost::io::StarLoopDesc desc; + desc.SetCategory("_atom_site"); + desc.Add("group_PDB"); + desc.Add("type_symbol"); + desc.Add("label_atom_id"); + desc.Add("label_comp_id"); + desc.Add("label_asym_id"); + desc.Add("label_entity_id"); + desc.Add("label_seq_id"); + desc.Add("label_alt_id"); + desc.Add("Cartn_x"); + desc.Add("Cartn_y"); + desc.Add("Cartn_z"); + desc.Add("occupancy"); + desc.Add("B_iso_or_equiv"); + desc.Add("auth_seq_id"); + desc.Add("auth_asym_id"); + desc.Add("id"); + ost::io::StarLoop* sl = new ost::io::StarLoop(desc); + return sl; + } + + ost::io::StarLoop* Setup_entity_ptr() { + ost::io::StarLoopDesc desc; + desc.SetCategory("_entity"); + desc.Add("id"); + desc.Add("type"); + ost::io::StarLoop* sl = new ost::io::StarLoop(desc); + return sl; + } + + ost::io::StarLoop* Setup_struct_asym_ptr() { + ost::io::StarLoopDesc desc; + desc.SetCategory("_struct_asym"); + desc.Add("id"); + desc.Add("entity_id"); + ost::io::StarLoop* sl = new ost::io::StarLoop(desc); + return sl; + } + + ost::io::StarLoop* Setup_entity_poly_seq_ptr() { + ost::io::StarLoopDesc desc; + desc.SetCategory("_entity_poly_seq"); + desc.Add("entity_id"); + desc.Add("mon_id"); + desc.Add("num"); + ost::io::StarLoop* sl = new ost::io::StarLoop(desc); + return sl; + } + + void Feed_atom_site_(ost::io::StarLoop* atom_site_ptr, + const String& label_asym_id, + int label_entity_id, + const ost::mol::ResidueHandleList& res_list) { + int label_seq_id = 1; + for(auto res: res_list) { + String comp_id = res.GetName(); + ost::mol::AtomHandleList at_list = res.GetAtomList(); + String auth_asym_id = res.GetChain().GetName(); + String auth_seq_id = res.GetNumber().AsString(); + for(auto at: at_list) { + std::vector<ost::io::StarLoopDataItemDO> at_data; + // group_PDB + if(at.IsHetAtom()) { + at_data.push_back(ost::io::StarLoopDataItemDO("HETATM")); + } else { + at_data.push_back(ost::io::StarLoopDataItemDO("ATOM")); + } + // type_symbol + at_data.push_back(ost::io::StarLoopDataItemDO(at.GetElement())); + // label_atom_id + at_data.push_back(ost::io::StarLoopDataItemDO(at.GetName())); + // label_comp_id + at_data.push_back(ost::io::StarLoopDataItemDO(comp_id)); + // label_asym_id + at_data.push_back(ost::io::StarLoopDataItemDO(label_asym_id)); + // label_entity_id + at_data.push_back(ost::io::StarLoopDataItemDO(label_entity_id)); + // label_seq_id + at_data.push_back(ost::io::StarLoopDataItemDO(label_seq_id)); + // label_alt_id + at_data.push_back(ost::io::StarLoopDataItemDO(".")); + // Cartn_x + at_data.push_back(ost::io::StarLoopDataItemDO(at.GetPos().GetX(), 3)); + // Cartn_y + at_data.push_back(ost::io::StarLoopDataItemDO(at.GetPos().GetY(), 3)); + // Cartn_z + at_data.push_back(ost::io::StarLoopDataItemDO(at.GetPos().GetZ(), 3)); + // occupancy + at_data.push_back(ost::io::StarLoopDataItemDO(at.GetOccupancy(), 2)); + // B_iso_or_equiv + at_data.push_back(ost::io::StarLoopDataItemDO(at.GetBFactor(), 2)); + // auth_seq_id + at_data.push_back(ost::io::StarLoopDataItemDO(auth_seq_id)); + // auth_asym_id + at_data.push_back(ost::io::StarLoopDataItemDO(auth_asym_id)); + // id + at_data.push_back(ost::io::StarLoopDataItemDO(atom_site_ptr->GetN())); + atom_site_ptr->AddData(at_data); + } + ++label_seq_id; + } + } + + void Feed_entity_(ost::io::StarLoop* entity_ptr, + const std::vector<EntityInfo>& entity_info) { + for(size_t entity_idx = 0; entity_idx < entity_info.size(); ++entity_idx) { + std::vector<ost::io::StarLoopDataItemDO> ent_data; + // id + ent_data.push_back(ost::io::StarLoopDataItemDO(entity_idx)); + // type + ost::mol::ChemClass chem_class(entity_info[entity_idx].chem_class); + if(chem_class.IsPeptideLinking() || chem_class.IsNucleotideLinking()) { + ent_data.push_back(ost::io::StarLoopDataItemDO("polymer")); + } else if(chem_class.IsWater()) { + ent_data.push_back(ost::io::StarLoopDataItemDO("water")); + } else if(chem_class == ost::mol::ChemClass::NON_POLYMER) { + ent_data.push_back(ost::io::StarLoopDataItemDO("non-polymer")); + } else if(chem_class.IsSaccharide()) { + // I doubt that this is correct + ent_data.push_back(ost::io::StarLoopDataItemDO("branched")); + } else { + throw ost::io::IOException("Entity type issue"); + } + entity_ptr->AddData(ent_data); + } + } + + void Feed_struct_asym_(ost::io::StarLoop* struct_asym_ptr, + const std::vector<EntityInfo>& entity_info) { + for(size_t entity_idx = 0; entity_idx < entity_info.size(); ++entity_idx) { + for(auto asym_id : entity_info[entity_idx].asym_ids) { + std::vector<ost::io::StarLoopDataItemDO> asym_data; + asym_data.push_back(ost::io::StarLoopDataItemDO(asym_id)); + asym_data.push_back(ost::io::StarLoopDataItemDO(entity_idx)); + struct_asym_ptr->AddData(asym_data); + } + } + } + + void Feed_entity_poly_seq_(ost::io::StarLoop* entity_poly_seq_ptr, + const std::vector<EntityInfo>& entity_info) { + // reuse data vector for efficiency + std::vector<ost::io::StarLoopDataItemDO> mon_data; + mon_data.push_back(ost::io::StarLoopDataItemDO(0)); + mon_data.push_back(ost::io::StarLoopDataItemDO("ALA")); + mon_data.push_back(ost::io::StarLoopDataItemDO(1)); + + for(size_t entity_idx = 0; entity_idx < entity_info.size(); ++entity_idx) { + const std::vector<String>& mon_ids = entity_info[entity_idx].mon_ids; + for(size_t mon_idx = 0; mon_idx < mon_ids.size(); ++mon_idx) { + mon_data[0] = ost::io::StarLoopDataItemDO(entity_idx); + mon_data[1] = ost::io::StarLoopDataItemDO(mon_ids[mon_idx]); + mon_data[2] = ost::io::StarLoopDataItemDO(mon_idx+1); + entity_poly_seq_ptr->AddData(mon_data); + } + } + } + } namespace ost { namespace io { @@ -77,17 +279,47 @@ namespace ost { namespace io { MMCifWriter::MMCifWriter(const String& filename, const IOProfile& profile): StarWriter(filename), profile_(profile), - atom_site_(NULL) { } + atom_site_(NULL), + entity_(NULL), + struct_asym_(NULL), + entity_poly_seq_(NULL) { } MMCifWriter::~MMCifWriter() { if(atom_site_ != NULL) { delete atom_site_; } + if(entity_ != NULL) { + delete entity_; + } + if(struct_asym_ != NULL) { + delete struct_asym_; + } + if(entity_poly_seq_ != NULL) { + delete entity_poly_seq_; + } } void MMCifWriter::Process_atom_site(const ost::mol::EntityHandle& ent) { - this->Setup_atom_site_(); + + // tabula rasa + if(atom_site_ != NULL) { + delete atom_site_; + } + if(entity_ != NULL) { + delete entity_; + } + if(struct_asym_ != NULL) { + delete struct_asym_; + } + if(entity_poly_seq_ != NULL) { + delete entity_poly_seq_; + } + + atom_site_ = Setup_atom_site_ptr(); + entity_ = Setup_entity_ptr(); + struct_asym_ = Setup_struct_asym_ptr(); + entity_poly_seq_ = Setup_entity_poly_seq_ptr(); std::vector<std::vector<ost::mol::ResidueHandle> > L_chains; // L_PEPTIDE_LINKING std::vector<std::vector<ost::mol::ResidueHandle> > D_chains; // D_PEPTIDE_LINKING @@ -229,53 +461,86 @@ void MMCifWriter::Process_atom_site(const ost::mol::EntityHandle& ent) { } ChainNameGenerator chain_name_gen; + std::vector<EntityInfo> entity_info; // process L_PEPTIDE_LINKING for(auto res_list: L_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::L_PEPTIDE_LINKING, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process D_PEPTIDE_LINKING for(auto res_list: D_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::D_PEPTIDE_LINKING, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process PEPTIDE_LINKING for(auto res_list: P_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::PEPTIDE_LINKING, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process RNA_LINKING for(auto res_list: R_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::RNA_LINKING, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process DNA_LINKING for(auto res_list: S_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::DNA_LINKING, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process L_SACHARIDE for(auto res_list: X_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::L_SACCHARIDE, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process D_SACHARIDE for(auto res_list: Y_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::D_SACCHARIDE, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process WATER for(auto res_list: W_chains) { String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::WATER, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process NON_POLYMER @@ -283,7 +548,11 @@ void MMCifWriter::Process_atom_site(const ost::mol::EntityHandle& ent) { ost::mol::ResidueHandleList res_list; res_list.push_back(res); String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::NON_POLYMER, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } // process UNKNOWN @@ -291,87 +560,22 @@ void MMCifWriter::Process_atom_site(const ost::mol::EntityHandle& ent) { ost::mol::ResidueHandleList res_list; res_list.push_back(res); String chain_name = chain_name_gen.Get(); - Feed_atom_site_(chain_name, 0, res_list); + int entity_id = Setup_entity_(chain_name, + ost::mol::ChemClass::UNKNOWN, + res_list, + entity_info); + Feed_atom_site_(atom_site_, chain_name, entity_id, res_list); } + Feed_entity_(entity_, entity_info); + Feed_struct_asym_(struct_asym_, entity_info); + Feed_entity_poly_seq_(entity_poly_seq_, entity_info); + // finalize + this->Push(entity_); + this->Push(struct_asym_); + this->Push(entity_poly_seq_); this->Push(atom_site_); } -void MMCifWriter::Setup_atom_site_() { - StarLoopDesc desc; - desc.SetCategory("_atom_site"); - desc.Add("group_PDB"); - desc.Add("type_symbol"); - desc.Add("label_atom_id"); - desc.Add("label_comp_id"); - desc.Add("label_asym_id"); - desc.Add("label_entity_id"); - desc.Add("label_seq_id"); - desc.Add("label_alt_id"); - desc.Add("Cartn_x"); - desc.Add("Cartn_y"); - desc.Add("Cartn_z"); - desc.Add("occupancy"); - desc.Add("B_iso_or_equiv"); - desc.Add("auth_seq_id"); - desc.Add("auth_asym_id"); - desc.Add("id"); - atom_site_ = new StarLoop(desc); -} - -void MMCifWriter::Feed_atom_site_(const String& label_asym_id, - int label_entity_id, - const ost::mol::ResidueHandleList& res_list) { - - int label_seq_id = 1; - for(auto res: res_list) { - String comp_id = res.GetName(); - ost::mol::AtomHandleList at_list = res.GetAtomList(); - String auth_asym_id = res.GetChain().GetName(); - String auth_seq_id = res.GetNumber().AsString(); - for(auto at: at_list) { - std::vector<StarLoopDataItemDO> at_data; - // group_PDB - if(at.IsHetAtom()) { - at_data.push_back(StarLoopDataItemDO("HETATM")); - } else { - at_data.push_back(StarLoopDataItemDO("ATOM")); - } - // type_symbol - at_data.push_back(StarLoopDataItemDO(at.GetElement())); - // label_atom_id - at_data.push_back(StarLoopDataItemDO(at.GetName())); - // label_comp_id - at_data.push_back(StarLoopDataItemDO(comp_id)); - // label_asym_id - at_data.push_back(StarLoopDataItemDO(label_asym_id)); - // label_entity_id - at_data.push_back(StarLoopDataItemDO(label_entity_id)); - // label_seq_id - at_data.push_back(StarLoopDataItemDO(label_seq_id)); - // label_alt_id - at_data.push_back(StarLoopDataItemDO(".")); - // Cartn_x - at_data.push_back(StarLoopDataItemDO(at.GetPos().GetX(), 3)); - // Cartn_y - at_data.push_back(StarLoopDataItemDO(at.GetPos().GetY(), 3)); - // Cartn_z - at_data.push_back(StarLoopDataItemDO(at.GetPos().GetZ(), 3)); - // occupancy - at_data.push_back(StarLoopDataItemDO(at.GetOccupancy(), 2)); - // B_iso_or_equiv - at_data.push_back(StarLoopDataItemDO(at.GetBFactor(), 2)); - // auth_seq_id - at_data.push_back(StarLoopDataItemDO(auth_seq_id)); - // auth_asym_id - at_data.push_back(StarLoopDataItemDO(auth_asym_id)); - // id - at_data.push_back(StarLoopDataItemDO(atom_site_->GetN())); - atom_site_->AddData(at_data); - } - ++label_seq_id; - } -} - }} // ns diff --git a/modules/io/src/mol/mmcif_writer.hh b/modules/io/src/mol/mmcif_writer.hh index f5b5ece37..d5f1e01bd 100644 --- a/modules/io/src/mol/mmcif_writer.hh +++ b/modules/io/src/mol/mmcif_writer.hh @@ -36,17 +36,20 @@ public: virtual ~MMCifWriter(); + // initializes + // atom_site_ + // entity_ + // struct_asym_ + // entity_poly_seq_ void Process_atom_site(const ost::mol::EntityHandle& ent); private: - void Setup_atom_site_(); - void Feed_atom_site_(const String& label_asym_id, - int label_entity_id, - const ost::mol::ResidueHandleList& res_list); - IOProfile profile_; StarLoop* atom_site_; + StarLoop* entity_; + StarLoop* struct_asym_; + StarLoop* entity_poly_seq_; }; diff --git a/modules/mol/base/src/chem_class.hh b/modules/mol/base/src/chem_class.hh index efd838679..aa4033e7f 100644 --- a/modules/mol/base/src/chem_class.hh +++ b/modules/mol/base/src/chem_class.hh @@ -79,6 +79,13 @@ struct DLLEXPORT ChemClass { operator char() const { return chem_class_; } + + bool IsSaccharide() const { + return (chem_class_==ChemClass::SACCHARIDE || + chem_class_==ChemClass::L_SACCHARIDE || + chem_class_==ChemClass::D_SACCHARIDE); + } + private: char chem_class_; }; -- GitLab