diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py index 8b3e18b645fc0d78475034dfe0bd44be753ee927..c9e9aec85a40f46929024d8510058a194b1197fd 100644 --- a/modules/mol/alg/pymod/chain_mapping.py +++ b/modules/mol/alg/pymod/chain_mapping.py @@ -1211,6 +1211,11 @@ class ChainMapper: substructure_chem_groups.append(substructure_chem_group) substructure_chem_mapping.append(mapping) + # early stopping if no mdl chain can be mapped to substructure + n_mapped_mdl_chains = sum([len(m) for m in substructure_chem_mapping]) + if n_mapped_mdl_chains == 0: + return list() + # strip the reference sequence in alignments to only contain # sequence from substructure substructure_ref_mdl_alns = dict() @@ -1251,6 +1256,7 @@ class ChainMapper: # chain_mapping and alns as input for lDDT computation lddt_chain_mapping = dict() lddt_alns = dict() + n_res_aln = 0 for ref_chem_group, mdl_chem_group in zip(substructure_chem_groups, mapping): for ref_ch, mdl_ch in zip(ref_chem_group, mdl_chem_group): @@ -1259,6 +1265,12 @@ class ChainMapper: lddt_chain_mapping[mdl_ch] = ref_ch aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)] lddt_alns[mdl_ch] = aln + tmp = [int(c[0] != '-' and c[1] != '-') for c in aln] + n_res_aln += sum(tmp) + # don't compute lDDT if no single residue in mdl and ref is aligned + if n_res_aln == 0: + continue + lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds, chain_mapping=lddt_chain_mapping, residue_mapping = lddt_alns, @@ -3300,6 +3312,8 @@ def _ChainMappings(ref_chains, mdl_chains, n_max=None): # reference iterators = list() for ref, mdl in zip(ref_chains, mdl_chains): + if len(ref) == 0: + raise RuntimeError("Expext at least one chain in ref chem group") if len(ref) == len(mdl): iterators.append(_RefEqualGenerator(ref, mdl)) elif len(ref) < len(mdl): diff --git a/modules/mol/alg/pymod/dockq.py b/modules/mol/alg/pymod/dockq.py index 126c8f57519472b8cb74ccaa45e5b0719c9043f8..d3695c1de54ea43bda7cb9a61af332b4dca9023f 100644 --- a/modules/mol/alg/pymod/dockq.py +++ b/modules/mol/alg/pymod/dockq.py @@ -6,22 +6,41 @@ def _PreprocessStructures(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, ch1_aln = None, ch2_aln = None): """ Preprocesses *mdl* and *ref* - Returns two entity views with the exact same number of residues. I.e. the - residues correspond to a one-to-one mapping. Additionally, each residue gets - the int property "dockq_map" assigned, which corresponds to the residue - index in the respective chain of the processed structures. + Sets int properties to each residue in mdl_ch1, mdl_ch2 as well as + the respective reference chains. + dockq_mapped: 1 if a residues in mdl_ch1 is mapped to a residue in ref_ch1 + and vice versa, 0 otherwise. Same is done for mdl_ch2 and + ref_ch2. + dockq_idx: If a pair of residue is mapped, the same index will be set + to both residues. The index is unique otherwise. + + By default, mapping happens with residue numbers but you can enforce a + mapping with an alignment. In the example of ch1_aln, the first sequence + corresponds to the ATOMSEQ of ref_ch1 in ref and the second sequence to + the ATOMSEQ of mdl_ch1 in mdl. """ - mdl_residues_1 = list() - mdl_residues_2 = list() - ref_residues_1 = list() - ref_residues_2 = list() + # set default values for dockq_mapped and dockq_idx properties + # => makes sure that we have a clean slate if stuff has been set in + # previous runs + for cname in [ref_ch1, ref_ch2]: + ch = ref.FindChain(cname) + for r in ch.residues: + r.SetIntProp("dockq_mapped", 0) + r.SetIntProp("dockq_idx", -1) + for cname in [mdl_ch1, mdl_ch2]: + ch = mdl.FindChain(cname) + for r in ch.residues: + r.SetIntProp("dockq_mapped", 0) + r.SetIntProp("dockq_idx", -1) + + dockq_idx = 0 if ch1_aln is not None and ch2_aln is not None: # there are potentially already views attached to the alns but # we go for *mdl* and *ref* here if ch1_aln.GetCount() != 2 or ch2_aln.GetCount() != 2: - raise RuntimeError("Expect exactly two sequences in provided alns!") - + raise RuntimeError("Expect exactly two sequences in alns provided " + "to DockQ!") tmp = ch1_aln.GetSequence(0) ref_s1 = seq.CreateSequence(tmp.GetName(), str(tmp)) ref_s1.SetOffset(tmp.GetOffset()) @@ -41,8 +60,11 @@ def _PreprocessStructures(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, if not (mdl_r.IsValid() and mdl_r.one_letter_code == col[1]): raise RuntimeError("DockQ: mismatch between provided " "alignments and ATOMSEQ in structures") - ref_residues_1.append(ref_r) - mdl_residues_1.append(mdl_r) + ref_r.SetIntProp("dockq_idx", dockq_idx) + mdl_r.SetIntProp("dockq_idx", dockq_idx) + ref_r.SetIntProp("dockq_mapped", 1) + mdl_r.SetIntProp("dockq_mapped", 1) + dockq_idx += 1 tmp = ch2_aln.GetSequence(0) ref_s2 = seq.CreateSequence(tmp.GetName(), str(tmp)) @@ -63,47 +85,43 @@ def _PreprocessStructures(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, if not (mdl_r.IsValid() and mdl_r.one_letter_code == col[1]): raise RuntimeError("DockQ: mismatch between provided " "alignments and ATOMSEQ in structures") - ref_residues_2.append(ref_r) - mdl_residues_2.append(mdl_r) + ref_r.SetIntProp("dockq_idx", dockq_idx) + mdl_r.SetIntProp("dockq_idx", dockq_idx) + ref_r.SetIntProp("dockq_mapped", 1) + mdl_r.SetIntProp("dockq_mapped", 1) + dockq_idx += 1 else: # go by residue numbers for mdl_r in mdl.Select(f"cname={mdl_ch1}").residues: ref_r = ref.FindResidue(ref_ch1, mdl_r.GetNumber()) if ref_r.IsValid(): - mdl_residues_1.append(mdl_r) - ref_residues_1.append(ref_r) + ref_r.SetIntProp("dockq_idx", dockq_idx) + mdl_r.SetIntProp("dockq_idx", dockq_idx) + ref_r.SetIntProp("dockq_mapped", 1) + mdl_r.SetIntProp("dockq_mapped", 1) + dockq_idx += 1 for mdl_r in mdl.Select(f"cname={mdl_ch2}").residues: ref_r = ref.FindResidue(ref_ch2, mdl_r.GetNumber()) if ref_r.IsValid(): - mdl_residues_2.append(mdl_r) - ref_residues_2.append(ref_r) - - new_mdl = mdl.handle.CreateEmptyView() - new_ref = ref.handle.CreateEmptyView() - for r in mdl_residues_1: - new_mdl.AddResidue(r.handle, mol.INCLUDE_ALL) - for r in mdl_residues_2: - new_mdl.AddResidue(r.handle, mol.INCLUDE_ALL) - for r in ref_residues_1: - new_ref.AddResidue(r.handle, mol.INCLUDE_ALL) - for r in ref_residues_2: - new_ref.AddResidue(r.handle, mol.INCLUDE_ALL) + ref_r.SetIntProp("dockq_idx", dockq_idx) + mdl_r.SetIntProp("dockq_idx", dockq_idx) + ref_r.SetIntProp("dockq_mapped", 1) + mdl_r.SetIntProp("dockq_mapped", 1) + dockq_idx += 1 - # set dockq_map property - ch = new_mdl.FindChain(mdl_ch1) - for r_idx, r in enumerate(ch.residues): - r.SetIntProp("dockq_map", r_idx) - ch = new_mdl.FindChain(mdl_ch2) - for r_idx, r in enumerate(ch.residues): - r.SetIntProp("dockq_map", r_idx) - ch = new_ref.FindChain(ref_ch1) - for r_idx, r in enumerate(ch.residues): - r.SetIntProp("dockq_map", r_idx) - ch = new_ref.FindChain(ref_ch2) - for r_idx, r in enumerate(ch.residues): - r.SetIntProp("dockq_map", r_idx) - - return (new_mdl, new_ref) + # set unique dockq_idx property for all residues that are still -1 + for cname in [ref_ch1, ref_ch2]: + ch = ref.FindChain(cname) + for r in ch.residues: + if r.GetIntProp("dockq_idx") == -1: + r.SetIntProp("dockq_idx", dockq_idx) + dockq_idx += 1 + for cname in [mdl_ch1, mdl_ch2]: + ch = mdl.FindChain(cname) + for r in ch.residues: + if r.GetIntProp("dockq_idx") == -1: + r.SetIntProp("dockq_idx", dockq_idx) + dockq_idx += 1 def _GetContacts(ent, ch1, ch2, dist_thresh): int1 = ent.Select(f"cname={ch1} and {dist_thresh} <> [cname={ch2}]") @@ -114,10 +132,12 @@ def _GetContacts(ent, ch1, ch2, dist_thresh): for r1, p1 in zip(int1.residues, int1_p): for r2, p2 in zip(int2.residues, int2_p): if p1.IsWithin(p2, dist_thresh): - contacts.add((r1.GetIntProp("dockq_map"), r2.GetIntProp("dockq_map"))) + contacts.add((r1.GetIntProp("dockq_idx"), + r2.GetIntProp("dockq_idx"))) return contacts -def _ContactScores(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, dist_thresh=5.0): +def _ContactScores(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, + dist_thresh=5.0): ref_contacts = _GetContacts(ref, ref_ch1, ref_ch2, dist_thresh) mdl_contacts = _GetContacts(mdl, mdl_ch1, mdl_ch2, dist_thresh) @@ -136,39 +156,49 @@ def _ContactScores(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, dist_thresh=5.0 def _RMSDScores(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, dist_thresh=10.0): - mdl_ch1_residues = mdl.FindChain(mdl_ch1).residues - mdl_ch2_residues = mdl.FindChain(mdl_ch2).residues - ref_ch1_residues = ref.FindChain(ref_ch1).residues - ref_ch2_residues = ref.FindChain(ref_ch2).residues + # backbone atoms used for superposition + sup_atoms = ['CA','C','N','O'] + + # make mapped residues accessible by the dockq_idx property + mapped_mdl = mdl.Select(f"cname={mdl_ch1},{mdl_ch2} and grdockq_mapped=1") + mapped_ref = ref.Select(f"cname={ref_ch1},{ref_ch2} and grdockq_mapped=1") + ch = mapped_mdl.FindChain(mdl_ch1) + mdl_ch1_residues = {r.GetIntProp("dockq_idx"): r for r in ch.residues} + ch = mapped_mdl.FindChain(mdl_ch2) + mdl_ch2_residues = {r.GetIntProp("dockq_idx"): r for r in ch.residues} + ch = mapped_ref.FindChain(ref_ch1) + ref_ch1_residues = {r.GetIntProp("dockq_idx"): r for r in ch.residues} + ch = mapped_ref.FindChain(ref_ch2) + ref_ch2_residues = {r.GetIntProp("dockq_idx"): r for r in ch.residues} # iRMSD ####### int1 = ref.Select(f"cname={ref_ch1} and {dist_thresh} <> [cname={ref_ch2}]") int2 = ref.Select(f"cname={ref_ch2} and {dist_thresh} <> [cname={ref_ch1}]") - int1_indices = [r.GetIntProp("dockq_map") for r in int1.residues] - int2_indices = [r.GetIntProp("dockq_map") for r in int2.residues] ref_pos = geom.Vec3List() mdl_pos = geom.Vec3List() - atom_names = ['CA','C','N','O'] - for idx in int1_indices: - ref_r = ref_ch1_residues[idx] - mdl_r = mdl_ch1_residues[idx] - for aname in atom_names: - ref_a = ref_r.FindAtom(aname) - mdl_a = mdl_r.FindAtom(aname) - if ref_a.IsValid() and mdl_a.IsValid(): - ref_pos.append(ref_a.pos) - mdl_pos.append(mdl_a.pos) - - for idx in int2_indices: - ref_r = ref_ch2_residues[idx] - mdl_r = mdl_ch2_residues[idx] - for aname in atom_names: - ref_a = ref_r.FindAtom(aname) - mdl_a = mdl_r.FindAtom(aname) - if ref_a.IsValid() and mdl_a.IsValid(): - ref_pos.append(ref_a.pos) - mdl_pos.append(mdl_a.pos) + for r in int1.residues: + idx = r.GetIntProp("dockq_idx") + if idx in ref_ch1_residues and idx in mdl_ch1_residues: + ref_r = ref_ch1_residues[idx] + mdl_r = mdl_ch1_residues[idx] + for aname in sup_atoms: + ref_a = ref_r.FindAtom(aname) + mdl_a = mdl_r.FindAtom(aname) + if ref_a.IsValid() and mdl_a.IsValid(): + ref_pos.append(ref_a.pos) + mdl_pos.append(mdl_a.pos) + for r in int2.residues: + idx = r.GetIntProp("dockq_idx") + if idx in ref_ch2_residues and idx in mdl_ch2_residues: + ref_r = ref_ch2_residues[idx] + mdl_r = mdl_ch2_residues[idx] + for aname in sup_atoms: + ref_a = ref_r.FindAtom(aname) + mdl_a = mdl_r.FindAtom(aname) + if ref_a.IsValid() and mdl_a.IsValid(): + ref_pos.append(ref_a.pos) + mdl_pos.append(mdl_a.pos) if len(mdl_pos) >= 3: sup_result = mol.alg.SuperposeSVD(mdl_pos, ref_pos) @@ -178,33 +208,36 @@ def _RMSDScores(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, dist_thresh=10.0): # lRMSD ####### - # receptor is by definition the larger chain - if len(ref_ch1_residues) > len(ref_ch2_residues): - ref_receptor_residues = ref_ch1_residues - ref_ligand_residues = ref_ch2_residues - mdl_receptor_residues = mdl_ch1_residues - mdl_ligand_residues = mdl_ch2_residues + # receptor is by definition the larger chain in ref + n_ch1 = len(ref.FindChain(ref_ch1).residues) + n_ch2 = len(ref.FindChain(ref_ch2).residues) + if n_ch1 > n_ch2: + ref_receptor_residues = ref_ch1_residues.values() + ref_ligand_residues = ref_ch2_residues.values() + mdl_receptor_residues = \ + [mdl_ch1_residues[idx] for idx in ref_ch1_residues.keys()] + mdl_ligand_residues = \ + [mdl_ch2_residues[idx] for idx in ref_ch2_residues.keys()] else: - ref_receptor_residues = ref_ch2_residues - ref_ligand_residues = ref_ch1_residues - mdl_receptor_residues = mdl_ch2_residues - mdl_ligand_residues = mdl_ch1_residues - + ref_receptor_residues = ref_ch2_residues.values() + ref_ligand_residues = ref_ch1_residues.values() + mdl_receptor_residues = \ + [mdl_ch2_residues[idx] for idx in ref_ch2_residues.keys()] + mdl_ligand_residues = \ + [mdl_ch1_residues[idx] for idx in ref_ch1_residues.keys()] ref_receptor_positions = geom.Vec3List() mdl_receptor_positions = geom.Vec3List() ref_ligand_positions = geom.Vec3List() mdl_ligand_positions = geom.Vec3List() - for ref_r, mdl_r in zip(ref_receptor_residues, mdl_receptor_residues): - for aname in atom_names: + for aname in sup_atoms: ref_a = ref_r.FindAtom(aname) mdl_a = mdl_r.FindAtom(aname) if ref_a.IsValid() and mdl_a.IsValid(): ref_receptor_positions.append(ref_a.pos) mdl_receptor_positions.append(mdl_a.pos) - for ref_r, mdl_r in zip(ref_ligand_residues, mdl_ligand_residues): - for aname in atom_names: + for aname in sup_atoms: ref_a = ref_r.FindAtom(aname) mdl_a = mdl_r.FindAtom(aname) if ref_a.IsValid() and mdl_a.IsValid(): @@ -265,14 +298,11 @@ def DockQ(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, DockQ which corresponds to the equivalent values in the original DockQ implementation. """ - mapped_model, mapped_ref = _PreprocessStructures(mdl, ref, mdl_ch1, mdl_ch2, - ref_ch1, ref_ch2, - ch1_aln = ch1_aln, - ch2_aln = ch2_aln) - nnat, nmdl, fnat, fnonnat = _ContactScores(mapped_model, mapped_ref, - mdl_ch1, mdl_ch2, ref_ch1, ref_ch2) - irmsd, lrmsd = _RMSDScores(mapped_model, mapped_ref, - mdl_ch1, mdl_ch2, ref_ch1, ref_ch2) + _PreprocessStructures(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2, + ch1_aln = ch1_aln, ch2_aln = ch2_aln) + nnat, nmdl, fnat, fnonnat = _ContactScores(mdl, ref, mdl_ch1, mdl_ch2, + ref_ch1, ref_ch2) + irmsd, lrmsd = _RMSDScores(mdl, ref, mdl_ch1, mdl_ch2, ref_ch1, ref_ch2) return {"nnat": nnat, "nmdl": nmdl, "fnat": fnat, diff --git a/modules/mol/alg/pymod/scoring.py b/modules/mol/alg/pymod/scoring.py index a2fd44555d11563032e9152e0ad61b42245f445a..6d5b345ec9bb012c6b262c43bb0b587ff096641d 100644 --- a/modules/mol/alg/pymod/scoring.py +++ b/modules/mol/alg/pymod/scoring.py @@ -879,6 +879,36 @@ class Scorer: flat_mapping = self.mapping.GetFlatMapping() pep_seqs = set([s.GetName() for s in self.chain_mapper.polypep_seqs]) + + # perform required alignments - cannot take the alignments from the + # mapping results as we potentially remove stuff there as compared + # to self.model and self.target + trg_seqs = dict() + for ch in self.target.chains: + cname = ch.GetName() + s = ''.join([r.one_letter_code for r in ch.residues]) + s = seq.CreateSequence(ch.GetName(), s) + s.AttachView(self.target.Select(f"cname={cname}")) + trg_seqs[ch.GetName()] = s + mdl_seqs = dict() + for ch in self.model.chains: + cname = ch.GetName() + s = ''.join([r.one_letter_code for r in ch.residues]) + s = seq.CreateSequence(cname, s) + s.AttachView(self.model.Select(f"cname={cname}")) + mdl_seqs[ch.GetName()] = s + + trg_pep_chains = [s.GetName() for s in self.chain_mapper.polypep_seqs] + trg_nuc_chains = [s.GetName() for s in self.chain_mapper.polynuc_seqs] + trg_pep_chains = set(trg_pep_chains) + trg_nuc_chains = set(trg_nuc_chains) + dockq_alns = dict() + for trg_ch, mdl_ch in flat_mapping.items(): + if trg_ch in pep_seqs: + dockq_alns[(trg_ch, mdl_ch)] = \ + self.chain_mapper.Align(trg_seqs[trg_ch], mdl_seqs[mdl_ch], + mol.ChemType.AMINOACIDS) + for trg_int in self.qs_scorer.qsent1.interacting_chains: trg_ch1 = trg_int[0] trg_ch2 = trg_int[1] @@ -886,16 +916,9 @@ class Scorer: if trg_ch1 in flat_mapping and trg_ch2 in flat_mapping: mdl_ch1 = flat_mapping[trg_ch1] mdl_ch2 = flat_mapping[trg_ch2] - aln1 = self.mapping.alns[(trg_ch1, mdl_ch1)] - aln2 = self.mapping.alns[(trg_ch2, mdl_ch2)] - # we're operating on the model/target from the MappingResult - # as their ATOMSEQ corresponds to the alignments above. - # Residues that do not contain all atoms required for - # ChainMapper (e.g. N, CA, C, CB) are removed which triggers - # a mismatch error in DockQ - model = self.mapping.model - target = self.mapping.target - res = dockq.DockQ(model, target, mdl_ch1, mdl_ch2, + aln1 = dockq_alns[(trg_ch1, mdl_ch1)] + aln2 = dockq_alns[(trg_ch2, mdl_ch2)] + res = dockq.DockQ(self.model, self.target, mdl_ch1, mdl_ch2, trg_ch1, trg_ch2, ch1_aln=aln1, ch2_aln=aln2) if res["nnat"] > 0: