diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index dd513e0eaade675fccf7b68c777822c956a27ffd..e2cb51438e14a15d67ec9558b43368c6211eaeaf 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -12,18 +12,28 @@ Loads the structures and performs basic cleanup: * Remove OXT atoms * Remove unknown atoms, i.e. atoms that are not expected according to the PDB compound dictionary + * Select for peptide/nucleotide residues The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional -options, this is a dictionary with five keys: - +options, this is a dictionary with 8 keys: + + * "reference_chains": Chain names of reference + * "model_chains": Chain names of model + * "chem_groups": Groups of polypeptides/polynucleotides from reference that + are considered chemically equivalent. You can derive stoichiometry from this. + Contains only chains that are considered in chain mapping, i.e. pass a + size threshold (defaults: 10 for peptides, 4 for nucleotides). + * "chem_mapping": List of same length as "chem_groups". Assigns model chains to + the respective chem group. Again, only contains chains that are considered + in chain mapping. * "chain_mapping": A dictionary with reference chain names as keys and the - mapped model chain names as values. + mapped model chain names as values. Missing chains are either not mapped + (but present in "chem_groups", "chem_mapping") or were not considered in + chain mapping (short peptides etc.) * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta format. - * "chem_groups": Groups of polypeptides/polynucleotides that are considered - chemically equivalent. You can derive stoichiometry from this. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. @@ -380,7 +390,7 @@ def _LoadStructure(structure_path, sformat=None, fault_tolerant=False, fault_tolerant=fault_tolerant) if bu_idx >= len(cif_info.biounits): raise RuntimeError(f"Invalid biounit index - requested {bu_idx} " - f"cif file has {len(cif_info.biounits)}") + f"must be < {len(cif_info.biounits)}.") biounit = cif_info.biounits[bu_idx] entity = biounit.PDBize(cif_entity, min_polymer_size=0) if not entity.IsValid(): @@ -458,10 +468,13 @@ def _Process(model, reference, args): raise RuntimeError(f"Inconsistent residues observed: {' '.join(ir)}") out = dict() + out["reference_chains"] = [ch.GetName() for ch in scorer.target.chains] + out["model_chains"] = [ch.GetName() for ch in scorer.model.chains] + out["chem_groups"] = scorer.chain_mapper.chem_groups + out["chem_mapping"] = scorer.mapping.chem_mapping out["chain_mapping"] = scorer.mapping.GetFlatMapping() out["aln"] = [_AlnToFastaStr(aln) for aln in scorer.aln] out["inconsistent_residues"] = ir - out["chem_groups"] = scorer.chain_mapper.chem_groups if args.lddt: out["lddt"] = scorer.lddt diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index c10f942e809bcbdfa3aec0da85ccb1e6f401ac08..df447090b880c865fa8ef2b99d78f018bf3c22cb 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -55,18 +55,28 @@ Details on the usage (output of ``ost compare-structures --help``): * Remove OXT atoms * Remove unknown atoms, i.e. atoms that are not expected according to the PDB compound dictionary + * Select for peptide/nucleotide residues The cleaned structures are optionally dumped using -d/--dump-structures Output is written in JSON format (default: out.json). In case of no additional - options, this is a dictionary with five keys: + options, this is a dictionary with 8 keys: + * "reference_chains": Chain names of reference + * "model_chains": Chain names of model + * "chem_groups": Groups of polypeptides/polynucleotides from reference that + are considered chemically equivalent. You can derive stoichiometry from this. + Contains only chains that are considered in chain mapping, i.e. pass a + size threshold (defaults: 10 for peptides, 4 for nucleotides). + * "chem_mapping": List of same length as "chem_groups". Assigns model chains to + the respective chem group. Again, only contains chains that are considered + in chain mapping. * "chain_mapping": A dictionary with reference chain names as keys and the - mapped model chain names as values. + mapped model chain names as values. Missing chains are either not mapped + (but present in "chem_groups", "chem_mapping") or were not considered in + chain mapping (short peptides etc.) * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta format. - * "chem_groups": Groups of polypeptides/polynucleotides that are considered - chemically equivalent. You can derive stoichiometry from this. * "inconsistent_residues": List of strings that represent name mismatches of aligned residues in form <trg_cname>.<trg_rname><trg_rnum>-<mdl_cname>.<mdl_rname><mdl_rnum>. @@ -76,7 +86,7 @@ Details on the usage (output of ``ost compare-structures --help``): * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be "status" set to FAILURE and an additional key: "traceback". - + The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the structures to ensure matching residue numbers (CASP/CAMEO). In these cases, @@ -91,7 +101,7 @@ Details on the usage (output of ``ost compare-structures --help``): Example to inject custom chain mapping ost compare-structures -m model.pdb -r reference.cif -c A:B B:A - + optional arguments: -h, --help show this help message and exit -m MODEL, --model MODEL @@ -113,12 +123,14 @@ Details on the usage (output of ``ost compare-structures --help``): Only has an effect if model is in mmcif format. By default, the assymetric unit (AU) is used for scoring. If there are biounits defined in the mmcif file, you - can specify the index of the one which should be used. + can specify the (0-based) index of the one which + should be used. -rb REFERENCE_BIOUNIT, --reference-biounit REFERENCE_BIOUNIT Only has an effect if reference is in mmcif format. By default, the assymetric unit (AU) is used for scoring. If there are biounits defined in the mmcif file, you - can specify the index of the one which should be used. + can specify the (0-based) index of the one which + should be used. -rna, --residue-number-alignment Make alignment based on residue number instead of using a global BLOSUM62-based alignment (NUC44 for diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py index 932e00332bfd3956f806acd0a7be026def4e2b37..37c36a132bedbff0fff6dcbf2bf7f82134d04621 100644 --- a/modules/mol/alg/pymod/chain_mapping.py +++ b/modules/mol/alg/pymod/chain_mapping.py @@ -34,11 +34,12 @@ class MappingResult: Constructor is directly called within the functions, no need to construct such objects yourself. """ - def __init__(self, target, model, chem_groups, mapping, alns, + def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns, opt_score=None): self._target = target self._model = model self._chem_groups = chem_groups + self._chem_mapping = chem_mapping self._mapping = mapping self._alns = alns self._opt_score = opt_score @@ -72,6 +73,14 @@ class MappingResult: """ return self._chem_groups + @property + def chem_mapping(self): + """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`. + + :class:`list` of :class:`list` of :class:`str` (chain names) + """ + return self._chem_mapping + @property def mapping(self): """ Mapping of :attr:`~model` chains onto :attr:`~target` @@ -853,8 +862,8 @@ class ChainMapper: aln.AttachView(0, _CSel(self.target, [ref_ch])) aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, one_to_one, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + one_to_one, alns) mapping = None opt_lddt = None @@ -890,8 +899,8 @@ class ChainMapper: aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, mapping, - alns, opt_score = opt_lddt) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + mapping, alns, opt_score = opt_lddt) def GetQSScoreMapping(self, model, contact_d = 12.0, strategy = "naive", @@ -970,8 +979,8 @@ class ChainMapper: aln.AttachView(0, _CSel(self.target, [ref_ch])) aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, one_to_one, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + one_to_one, alns) mapping = None opt_qsscore = None @@ -1007,8 +1016,8 @@ class ChainMapper: aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, mapping, - alns, opt_score = opt_qsscore) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + mapping, alns, opt_score = opt_qsscore) def GetRigidMapping(self, model, strategy = "greedy_single_gdtts", single_chain_gdtts_thresh=0.4, subsampling=None, @@ -1108,8 +1117,8 @@ class ChainMapper: aln.AttachView(0, _CSel(self.target, [ref_ch])) aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, one_to_one, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + one_to_one, alns) trg_group_pos, mdl_group_pos = _GetRefPos(self.target, mdl, self.chem_group_alignments, @@ -1188,8 +1197,8 @@ class ChainMapper: aln.AttachView(1, _CSel(mdl, [mdl_ch])) alns[(ref_ch, mdl_ch)] = aln - return MappingResult(self.target, mdl, self.chem_groups, final_mapping, - alns) + return MappingResult(self.target, mdl, self.chem_groups, chem_mapping, + final_mapping, alns) def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,