From 935bc608440c2dfdfbc0ac54dfcbce44fb3bb16f Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Thu, 9 Mar 2023 16:07:49 +0100 Subject: [PATCH] improve doc and output of compare-structures - Tell the user that biounit indices are 0-based - More info in output on what chains are actually processed --- actions/ost-compare-structures | 27 ++++++++++++++------ modules/doc/actions.rst | 28 +++++++++++++++------ modules/mol/alg/pymod/chain_mapping.py | 35 ++++++++++++++++---------- 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index dd513e0ea..e2cb51438 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -12,18 +12,28 @@ Loads the structures and performs basic cleanup: * Remove OXT atoms * Remove unknown atoms, i.e. atoms that are not expected according to the PDB compound dictionary + * Select for peptide/nucleotide residues The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional -options, this is a dictionary with five keys: - +options, this is a dictionary with 8 keys: + + * "reference_chains": Chain names of reference + * "model_chains": Chain names of model + * "chem_groups": Groups of polypeptides/polynucleotides from reference that + are considered chemically equivalent. You can derive stoichiometry from this. + Contains only chains that are considered in chain mapping, i.e. pass a + size threshold (defaults: 10 for peptides, 4 for nucleotides). + * "chem_mapping": List of same length as "chem_groups". Assigns model chains to + the respective chem group. Again, only contains chains that are considered + in chain mapping. * "chain_mapping": A dictionary with reference chain names as keys and the - mapped model chain names as values. + mapped model chain names as values. Missing chains are either not mapped + (but present in "chem_groups", "chem_mapping") or were not considered in + chain mapping (short peptides etc.) * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta format. - * "chem_groups": Groups of polypeptides/polynucleotides that are considered - chemically equivalent. You can derive stoichiometry from this. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. @@ -380,7 +390,7 @@ def _LoadStructure(structure_path, sformat=None, fault_tolerant=False, fault_tolerant=fault_tolerant) if bu_idx >= len(cif_info.biounits): raise RuntimeError(f"Invalid biounit index - requested {bu_idx} " - f"cif file has {len(cif_info.biounits)}") + f"must be < {len(cif_info.biounits)}.") biounit = cif_info.biounits[bu_idx] entity = biounit.PDBize(cif_entity, min_polymer_size=0) if not entity.IsValid(): @@ -458,10 +468,13 @@ def _Process(model, reference, args): raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}") out = dict() + out["reference_chains"] = [ch.GetName() for ch in scorer.target.chains] + out["model_chains"] = [ch.GetName() for ch in scorer.model.chains] + out["chem_groups"] = scorer.chain_mapper.chem_groups + out["chem_mapping"] = scorer.mapping.chem_mapping out["chain_mapping"] = scorer.mapping.GetFlatMapping() out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln] out["inconsistent_residues"] = ir - out["chem_groups"] = scorer.chain_mapper.chem_groups if args.lddt: out["lddt"] = scorer.lddt diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index c10f942e8..df447090b 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -55,18 +55,28 @@ Details on the usage (output of ``ost compare-structures --help``): * Remove OXT atoms * Remove unknown atoms, i.e. atoms that are not expected according to the PDB compound dictionary + * Select for peptide/nucleotide residues The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional - options, this is a dictionary with five keys: + options, this is a dictionary with 8 keys: + * "reference_chains": Chain names of reference + * "model_chains": Chain names of model + * "chem_groups": Groups of polypeptides/polynucleotides from reference that + are considered chemically equivalent. You can derive stoichiometry from this. + Contains only chains that are considered in chain mapping, i.e. pass a + size threshold (defaults: 10 for peptides, 4 for nucleotides). + * "chem_mapping": List of same length as "chem_groups". Assigns model chains to + the respective chem group. Again, only contains chains that are considered + in chain mapping. * "chain_mapping": A dictionary with reference chain names as keys and the - mapped model chain names as values. + mapped model chain names as values. Missing chains are either not mapped + (but present in "chem_groups", "chem_mapping") or were not considered in + chain mapping (short peptides etc.) * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta format. - * "chem_groups": Groups of polypeptides/polynucleotides that are considered - chemically equivalent. You can derive stoichiometry from this. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. @@ -76,7 +86,7 @@ Details on the usage (output of ``ost compare-structures --help``): * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be "status" set to FAILURE and an additional key: "traceback". - + The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the structures to ensure matching residue numbers (CASP/CAMEO). In these cases, @@ -91,7 +101,7 @@ Details on the usage (output of ``ost compare-structures --help``): Example to inject custom chain mapping ost compare-structures -m model.pdb -r reference.cif -c A:B B:A - + optional arguments: -h, --help show this help message and exit -m MODEL, --model MODEL @@ -113,12 +123,14 @@ Details on the usage (output of ``ost compare-structures --help``): Only has an effect if model is in mmcif format. By default, the assymetric unit (AU) is used for scoring. If there are biounits defined in the mmcif file, you - can specify the index of the one which should be used. + can specify the (0-based) index of the one which + should be used. -rb REFERENCE_BIOUNIT, --reference-biounit REFERENCE_BIOUNIT Only has an effect if reference is in mmcif format. By default, the assymetric unit (AU) is used for scoring. If there are biounits defined in the mmcif file, you - can specify the index of the one which should be used. + can specify the (0-based) index of the one which + should be used. -rna, --residue-number-alignment Make alignment based on residue number instead of using a global BLOSUM62-based alignment (NUC44 for diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py index 932e00332..37c36a132 100644 --- a/modules/mol/alg/pymod/chain_mapping.py +++ b/modules/mol/alg/pymod/chain_mapping.py @@ -34,11 +34,12 @@ class MappingResult: Constructor is directly called within the functions, no need to construct such objects yourself. """ - def __init__(self, target, model, chem_groups, mapping, alns, + def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns, opt_score=None): self._target = target self._model = model self._chem_groups = chem_groups + self._chem_mapping = chem_mapping self._mapping = mapping self._alns = alns self._opt_score = opt_score @@ -72,6 +73,14 @@ class MappingResult: """ return self._chem_groups + @property + def chem_mapping(self): + """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`. + + :class:`list` of :class:`list` of :class:`str` (chain names) + """ + return self._chem_mapping + @property def mapping(self): """ Mapping of :attr:`~model` chains onto :attr:`~target` @@ -853,8 +862,8 @@ class ChainMapper: aln.AttachView(0, _CSel(self.target, [ref_ch])) aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, one_to_one, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + one_to_one, alns) mapping = None opt_lddt = None @@ -890,8 +899,8 @@ class ChainMapper: aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, mapping, - alns, opt_score = opt_lddt) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + mapping, alns, opt_score = opt_lddt) def GetQSScoreMapping(self, model, contact_d = 12.0, strategy = "naive", @@ -970,8 +979,8 @@ class ChainMapper: aln.AttachView(0, _CSel(self.target, [ref_ch])) aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, one_to_one, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + one_to_one, alns) mapping = None opt_qsscore = None @@ -1007,8 +1016,8 @@ class ChainMapper: aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, mapping, - alns, opt_score = opt_qsscore) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + mapping, alns, opt_score = opt_qsscore) def GetRigidMapping(self, model, strategy = "greedy_single_gdtts", single_chain_gdtts_thresh=0.4, subsampling=None, @@ -1108,8 +1117,8 @@ class ChainMapper: aln.AttachView(0, _CSel(self.target, [ref_ch])) aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, one_to_one, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + one_to_one, alns) trg_group_pos, mdl_group_pos = _GetRefPos(self.target, mdl, self.chem_group_alignments, @@ -1188,8 +1197,8 @@ class ChainMapper: aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, final_mapping, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + final_mapping, alns) def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, -- GitLab