From 9eb4cf9c157ca2435646825075a7d43c563897fe Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Mon, 18 Dec 2023 15:39:09 +0100 Subject: [PATCH] mmcif writer: resnum based entity matching, even if atomseq differs By default, the mmcif_conform flag is on. That means, that we assume residue numbers to be correctly set according to the underlying SEQRES. This allows for residue number based alignment between chains. BUT: For the scenario of no user defined mmcif entities (which is not possible as of now anyways), there is no other way than deriving entities and their SEQRES from the structure itself. Let's assume an Entity with SEQRES HELLYEAH and two chains with ATOMSEQ HELLYEA and HELLYEAH. The new implementation iterates over the first chain and defines an entity with SEQRES HELLYEA. It then process the second chain and sees that we have a perfect match but an added residue. If the fraction of added residues with respect to matches is below a certain threshold, the entity SEQRES gets updated and the second chain gets also assigned to that entity. --- modules/io/src/mol/mmcif_writer.cc | 433 +++++++++++++++++++++++------ 1 file changed, 347 insertions(+), 86 deletions(-) diff --git a/modules/io/src/mol/mmcif_writer.cc b/modules/io/src/mol/mmcif_writer.cc index a246fc297..9c0b8ce4d 100644 --- a/modules/io/src/mol/mmcif_writer.cc +++ b/modules/io/src/mol/mmcif_writer.cc @@ -522,9 +522,7 @@ namespace { break; } } - } else { - throw ost::io::IOException("Can only get OLCs for peptides/nucleotides"); - } + } return "(" + mon_id + ")"; } @@ -551,69 +549,244 @@ namespace { // internal object with all info to fill entity_, struct_asym_, // entity_poly_seq_ categories struct EntityInfo { + + int GetAsymIdx(const String& asym_id) const { + for(size_t i = 0; i < asym_ids.size(); ++i) { + if(asym_ids[i] == asym_id) { + return i; + } + } + throw "asdf"; + } + String type; // relevant for _entity String poly_type; // relevant for _entity_poly + bool is_poly; // in principle type == "polymer" std::vector<String> asym_ids; // relevant for _struct_asym.id std::vector<String> mon_ids; // relevant for _entity_poly_seq.mon_id - bool is_poly; // in principle type == "polymer" - String seq; // _entity_poly.pdbx_seq_one_letter_code - String seq_can; // _entity_poly.pdbx_seq_one_letter_code_can + std::vector<String> seq_olcs; // relevant for pdbx_seq_one_letter_code + std::vector<String> seq_can_olcs; // relevant for pdbx_seq_one_letter_code_can + std::vector<std::vector<String> > asym_alns; // one alignment to mon_ids for + // each element in asym_ids. + // relevant for label_seq_id + // contains "-" for residues + // that are missing in ATOMSEQ. + // in theory that would only be + // necessary for polymers but + // here we add everything... }; + bool Match_entity_resnum(const ost::mol::ResidueHandleList& res_list, + const EntityInfo& info, + Real beyond_frac = 0.05) { + // Checks if res_list matches SEQRES given in info.mon_ids + // It may be that res_list defines residues beyond this SEQRES or + // has residues that are not yet defined, i.e. the respective mon_id is "-". + // This function returns True if the number of such occurences is below + // the specified fraction of actual matches (default: 5%). + int n_beyond = 0; + int n_matches = 0; + int num_mon_ids = info.mon_ids.size(); + for(auto res: res_list) { + int num = res.GetNumber().GetNum(); + if(num < 1) { + std::stringstream ss; + ss << "Cannot perform resnum based alignment with invalid resnum in "; + ss << "residue " << res; + throw ost::io::IOException(ss.str()); + } else if (num > num_mon_ids) { + ++n_beyond; + } else { + if(info.mon_ids[num-1] == "-") { + ++n_beyond; // we're basically filling an unknown gap... + } else if(info.mon_ids[num-1] == res.GetName()) { + ++n_matches; + } else { + return false; + } + } + } + if(n_matches == 0) { + return false; + } else { + return n_beyond / n_matches <= beyond_frac; + } + } + + bool Match_entity(const ost::mol::ResidueHandleList& res_list, + const EntityInfo& info) { + // checks if the residue names in res_list are an exact match + // with mon_ids in info + std::vector<String> mon_ids; + for(auto res : res_list) { + mon_ids.push_back(res.GetName()); + } + return mon_ids == info.mon_ids; + } + + void Add_asym(const String& asym_chain_name, + EntityInfo& info) { + // adds asym_chain_name to info under the assumption that mon_ids + // exactly match => just add a copy of mon_ids to asym_alns + info.asym_ids.push_back(asym_chain_name); + info.asym_alns.push_back(info.mon_ids); + } + + void Add_asym_resnum(const String& asym_chain_name, + const ost::mol::ResidueHandleList& res_list, + EntityInfo& info) { + + if(!info.is_poly) { + // no need for SEQRES alignment vodoo + Add_asym(asym_chain_name, info); + return; + } + + int max_resnum = info.mon_ids.size(); + std::vector<String> mon_ids; + std::vector<int> resnums; + + for(auto res: res_list) { + int num = res.GetNumber().GetNum(); + // assumes that Match_entity_resnum has already been run to check for + // resnum < 1 + mon_ids.push_back(res.GetName()); + resnums.push_back(num); + max_resnum = std::max(max_resnum, num); + } + + std::vector<String> aln_mon_ids(max_resnum, "-"); + for(size_t i = 0; i < mon_ids.size(); ++i) { + aln_mon_ids[resnums[i]-1] = mon_ids[i]; + } + + if(max_resnum > static_cast<int>(info.mon_ids.size())) { + // This chain covers more residues towards C-terminus than any chain that + // is associated with this entity - expand to enforce equal size + int N = max_resnum - info.mon_ids.size(); + info.mon_ids.insert(info.mon_ids.end(), N, "-"); + info.seq_olcs.insert(info.seq_olcs.end(), N, "-"); + info.seq_can_olcs.insert(info.seq_can_olcs.end(), N, "-"); + for(size_t asym_idx = 0; asym_idx < info.asym_alns.size(); ++asym_idx) { + info.asym_alns[asym_idx].insert(info.asym_alns[asym_idx].end(), N, "-"); + } + } + + // Fill SEQRES infos newly covered by this asym chain + for(size_t i = 0; i < resnums.size(); ++i) { + if(info.mon_ids[resnums[i]-1] == "-") { + info.mon_ids[resnums[i]-1] = mon_ids[i]; + const ost::mol::ResidueHandle& res = res_list[i]; + info.seq_olcs[resnums[i]-1] = mon_id_to_olc(res.GetChemClass(), + mon_ids[i]); + char olc = res.GetOneLetterCode(); + if(olc < 'A' || olc > 'Z') { + info.seq_can_olcs[resnums[i]-1] = "X"; + } else { + info.seq_can_olcs[resnums[i]-1] = String(1, olc); + } + } + } + + // finalize + info.asym_ids.push_back(asym_chain_name); + info.asym_alns.push_back(aln_mon_ids); + } + int Setup_entity(const String& asym_chain_name, const String& type, const String& poly_type, const ost::mol::ResidueHandleList& res_list, + bool resnum_alignment, std::vector<EntityInfo>& entity_infos) { bool is_poly = type == "polymer"; - std::vector<String> mon_ids; - if(type == "water") { - mon_ids.push_back("HOH"); - } else { - for(auto res : res_list) { - mon_ids.push_back(res.GetName()); - } + if(!is_poly && res_list.size() != 1 && type != "water") { + std::stringstream ss; + ss << "Cannot setup entity with " << res_list.size() << " residues "; + ss << "but is of type: " << type; + throw ost::io::IOException(ss.str()); } // check if entity is already there for(size_t i = 0; i < entity_infos.size(); ++i) { if(entity_infos[i].type == type && - entity_infos[i].poly_type == poly_type && - entity_infos[i].mon_ids == mon_ids) { - entity_infos[i].asym_ids.push_back(asym_chain_name); - return i; + entity_infos[i].poly_type == poly_type) { + if(is_poly && resnum_alignment) { + if(Match_entity_resnum(res_list, entity_infos[i])) { + Add_asym_resnum(asym_chain_name, res_list, entity_infos[i]); + return i; + } + } else { + if(Match_entity(res_list, entity_infos[i])) { + Add_asym(asym_chain_name, entity_infos[i]); + return i; + } + } } } // need to create new entity - int entity_idx = entity_infos.size(); - entity_infos.push_back(EntityInfo()); - entity_infos.back().type = type; - entity_infos.back().poly_type = poly_type; - entity_infos.back().asym_ids.push_back(asym_chain_name); - entity_infos.back().mon_ids = mon_ids; - entity_infos.back().is_poly = is_poly; - if(is_poly) { - std::stringstream seq; - std::stringstream seq_can; + std::vector<String> mon_ids; + std::vector<String> seq; + std::vector<String> seq_can; + + if(is_poly && resnum_alignment) { + int max_resnum = res_list.size(); + std::vector<String> res_mon_ids; + std::vector<int> resnums; + for(auto res: res_list) { + int num = res.GetNumber().GetNum(); + if(num < 1) { + throw "asdf"; + } + res_mon_ids.push_back(res.GetName()); + resnums.push_back(num); + max_resnum = std::max(max_resnum, num); + } + mon_ids.assign(max_resnum, "-"); + seq.assign(max_resnum, "-"); + seq_can.assign(max_resnum, "-"); + for(size_t i = 0; i < res_mon_ids.size(); ++i) { + mon_ids[resnums[i]-1] = res_mon_ids[i]; + const ost::mol::ResidueHandle& res = res_list[i]; + seq[resnums[i]-1] = mon_id_to_olc(res.GetChemClass(), + mon_ids[resnums[i]-1]); + char olc = res.GetOneLetterCode(); + if(olc < 'A' || olc > 'Z') { + seq_can[resnums[i]-1] = "X"; + } else { + seq_can[resnums[i]-1] = String(1, olc); + } + } + } else { for(auto res: res_list) { - // seqres basically follows a hardcoded table from the mmcif reference - seq << mon_id_to_olc(res.GetChemClass(), res.GetName()); - // canonical seqres maps one letter codes of parent residues - // OpenStructure does the same when setting up the entity in the - // processor. We just trust OpenStructure to do the right thing but set - // olc to 'X' in case of invalid olc outside [A-Z] (example is '?') + mon_ids.push_back(res.GetName()); + seq.push_back(mon_id_to_olc(res.GetChemClass(), res.GetName())); char olc = res.GetOneLetterCode(); if(olc < 'A' || olc > 'Z') { - seq_can << 'X'; + seq_can.push_back("X"); } else { - seq_can << res.GetOneLetterCode(); + seq_can.push_back(String(1, olc)); } } - entity_infos.back().seq = seq.str(); - entity_infos.back().seq_can = seq_can.str(); + } + + + int entity_idx = entity_infos.size(); + entity_infos.push_back(EntityInfo()); + entity_infos.back().type = type; + entity_infos.back().poly_type = poly_type; + entity_infos.back().mon_ids = mon_ids; + entity_infos.back().seq_olcs = seq; + entity_infos.back().seq_can_olcs = seq_can; + entity_infos.back().is_poly = is_poly; + + if(is_poly && resnum_alignment) { + Add_asym_resnum(asym_chain_name, res_list, entity_infos.back()); + } else { + Add_asym(asym_chain_name, entity_infos.back()); } return entity_idx; @@ -621,6 +794,7 @@ namespace { int Setup_entity_(const String& asym_chain_name, const ost::mol::ResidueHandleList& res_list, + bool resnum_alignment, std::vector<EntityInfo>& entity_infos) { String type = GuessEntityType(res_list); @@ -630,12 +804,13 @@ namespace { poly_type = GuessEntityPolyType(res_list); } return Setup_entity(asym_chain_name, type, poly_type, res_list, - entity_infos); + resnum_alignment, entity_infos); } int Setup_entity_(const String& asym_chain_name, ost::mol::ChainType chain_type, const ost::mol::ResidueHandleList& res_list, + bool resnum_alignment, std::vector<EntityInfo>& entity_infos) { // use chain_type info attached to chain to determine // _entity.type and _entity_poly.type @@ -657,7 +832,7 @@ namespace { } } return Setup_entity(asym_chain_name, type, poly_type, res_list, - entity_infos); + resnum_alignment, entity_infos); } ost::io::StarLoop* Setup_atom_type_ptr() { @@ -777,7 +952,9 @@ namespace { void Feed_pdbx_poly_seq_scheme(ost::io::StarLoop* pdbx_poly_seq_scheme_ptr, const String& label_asym_id, int label_entity_id, + EntityInfo& entity_info, const ost::mol::ResidueHandleList& res_list) { + std::vector<ost::io::StarLoopDataItemDO> data; data.push_back(ost::io::StarLoopDataItemDO(label_asym_id)); data.push_back(ost::io::StarLoopDataItemDO(label_entity_id)); @@ -786,20 +963,52 @@ namespace { data.push_back(ost::io::StarLoopDataItemDO("")); data.push_back(ost::io::StarLoopDataItemDO(0)); data.push_back(ost::io::StarLoopDataItemDO("")); - int label_seq_id = 1; + + int asym_idx = entity_info.GetAsymIdx(label_asym_id); + const std::vector<String>& aln = entity_info.asym_alns[asym_idx]; + int label_seq_id = 0; // 0-based index + for(auto res: res_list) { - data[2] = ost::io::StarLoopDataItemDO(res.GetName()); - data[3] = ost::io::StarLoopDataItemDO(label_seq_id); - data[4] = ost::io::StarLoopDataItemDO(res.GetChain().GetName()); - data[5] = ost::io::StarLoopDataItemDO(res.GetNumber().GetNum()); - char ins_code = res.GetNumber().GetInsCode(); - if(ins_code == '\0') { - data[6] = ost::io::StarLoopDataItemDO(""); + String res_name = res.GetName(); + while(aln[label_seq_id] == "-") { + ++label_seq_id; + } + + if(res_name != aln[label_seq_id]) { + throw "ksajdhfgjkaljshdfsfgd"; + } + + data[2] = ost::io::StarLoopDataItemDO(res_name); + data[3] = ost::io::StarLoopDataItemDO(label_seq_id + 1); + + // the remaining data items honor String properties if set: + // pdb_auth_chain_name, pdb_auth_resnum and pdb_auth_ins_code + + if(res.GetChain().HasProp("pdb_auth_chain_name")) { + data[4] = + ost::io::StarLoopDataItemDO(res.GetChain().GetStringProp("pdb_auth_chain_name")); } else { - String tmp = " "; - tmp[0] = ins_code; - data[6] = ost::io::StarLoopDataItemDO(tmp); - } + data[4] = ost::io::StarLoopDataItemDO(res.GetChain().GetName()); + } + + if(res.HasProp("pdb_auth_resnum")) { + data[5] = ost::io::StarLoopDataItemDO(res.GetStringProp("pdb_auth_resnum")); + } else { + data[5] = ost::io::StarLoopDataItemDO(res.GetNumber().GetNum()); + } + + if(res.HasProp("pdb_auth_ins_code")) { + data[6] = ost::io::StarLoopDataItemDO(res.GetStringProp("pdb_auth_ins_code")); + } else { + char ins_code = res.GetNumber().GetInsCode(); + if(ins_code == '\0') { + data[6] = ost::io::StarLoopDataItemDO(""); + } else { + String tmp = " "; + tmp[0] = ins_code; + data[6] = ost::io::StarLoopDataItemDO(tmp); + } + } pdbx_poly_seq_scheme_ptr->AddData(data); label_seq_id += 1; } @@ -808,14 +1017,41 @@ namespace { void Feed_atom_site_(ost::io::StarLoop* atom_site_ptr, const String& label_asym_id, int label_entity_id, - const ost::mol::ResidueHandleList& res_list, - bool is_poly) { - int label_seq_id = 1; + const EntityInfo& entity_info, + const ost::mol::ResidueHandleList& res_list) { + + int asym_idx = entity_info.GetAsymIdx(label_asym_id); + const std::vector<String>& aln = entity_info.asym_alns[asym_idx]; + int label_seq_id = 0; // 0-based index + for(auto res: res_list) { String comp_id = res.GetName(); + + while(aln[label_seq_id] == "-") { + ++label_seq_id; + } + + if(comp_id != aln[label_seq_id]) { + throw "ksajdhfgjkaljshdfsfgd"; + } + ost::mol::AtomHandleList at_list = res.GetAtomList(); String auth_asym_id = res.GetChain().GetName(); + if(res.HasProp("pdb_auth_chain_name")) { + auth_asym_id = res.GetStringProp("pdb_auth_chain_name"); + } String auth_seq_id = res.GetNumber().AsString(); + if(res.HasProp("pdb_auth_resnum")) { + std::stringstream ss; + ss << res.GetStringProp("pdb_auth_resnum"); + if(res.HasProp("pdb_auth_ins_code")) { + String ins_code = res.GetStringProp("pdb_auth_ins_code"); + if(ins_code != "?") { + ss << ins_code; + } + } + auth_seq_id = ss.str(); + } for(auto at: at_list) { std::vector<ost::io::StarLoopDataItemDO> at_data; // group_PDB @@ -835,8 +1071,8 @@ namespace { // label_entity_id at_data.push_back(ost::io::StarLoopDataItemDO(label_entity_id)); // label_seq_id - if(is_poly) { - at_data.push_back(ost::io::StarLoopDataItemDO(label_seq_id)); + if(entity_info.is_poly) { + at_data.push_back(ost::io::StarLoopDataItemDO(label_seq_id+1)); } else { at_data.push_back(ost::io::StarLoopDataItemDO(".")); } @@ -859,7 +1095,7 @@ namespace { // id at_data.push_back(ost::io::StarLoopDataItemDO(atom_site_ptr->GetN())); // pdbx_PDB_ins_code - at_data.push_back(ost::io::StarLoopDataItemDO("")); + at_data.push_back(ost::io::StarLoopDataItemDO("")); // CHECK THIS, ADD STUFF FROM AUTH_SEQ_ID? atom_site_ptr->AddData(at_data); } ++label_seq_id; @@ -921,8 +1157,24 @@ namespace { if(entity_info[entity_idx].is_poly) { entity_poly_data[0] = ost::io::StarLoopDataItemDO(entity_idx); entity_poly_data[1] = ost::io::StarLoopDataItemDO(entity_info[entity_idx].poly_type); - entity_poly_data[2] = ost::io::StarLoopDataItemDO(entity_info[entity_idx].seq); - entity_poly_data[3] = ost::io::StarLoopDataItemDO(entity_info[entity_idx].seq_can); + std::stringstream seq; + std::stringstream seq_can; + for(size_t idx = 0; idx < entity_info[entity_idx].mon_ids.size(); ++idx) { + if(entity_info[entity_idx].seq_olcs[idx] == "-") { + // X IS PROBABLY NOT THE RIGHT THING FOR + // pdbx_seq_one_letter_code + seq << "X"; + } else { + seq << entity_info[entity_idx].seq_olcs[idx]; + } + if(entity_info[entity_idx].seq_can_olcs[idx] == "-") { + seq_can << "X"; + } else { + seq_can << entity_info[entity_idx].seq_can_olcs[idx]; + } + } + entity_poly_data[2] = seq.str(); + entity_poly_data[3] = seq_can.str(); entity_poly_ptr->AddData(entity_poly_data); } } @@ -955,12 +1207,13 @@ namespace { int entity_id = Setup_entity_(chain_name, ch.GetType(), res_list, + true, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } } @@ -1119,12 +1372,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } @@ -1133,12 +1387,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } @@ -1147,12 +1402,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } @@ -1161,12 +1417,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } @@ -1175,12 +1432,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } @@ -1189,12 +1447,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } @@ -1203,12 +1462,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } @@ -1219,12 +1479,13 @@ namespace { String chain_name = chain_name_gen.Get(); int entity_id = Setup_entity_(chain_name, res_list, + false, entity_info); - Feed_atom_site_(atom_site, chain_name, entity_id, res_list, - entity_info[entity_id].is_poly); + Feed_atom_site_(atom_site, chain_name, entity_id, entity_info[entity_id], + res_list); if(entity_info[entity_id].is_poly) { Feed_pdbx_poly_seq_scheme(pdbx_poly_seq_scheme, chain_name, - entity_id, res_list); + entity_id, entity_info[entity_id], res_list); } } } -- GitLab