From 341783cce4abe39ef1a33db8f844571efe7b30a6 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Thu, 11 Jan 2024 18:30:27 +0100 Subject: [PATCH] scoring: min_pep_length/min_nuc_length in compare-structures action Minimum length for a chain in the target structure to be considered in chain mapping. The chain mapping algorithm first performs an all vs. all pairwise sequence alignment to identify "equal" chains within the target structure. We go for simple sequence identity there. Short sequences can be problematic as they may produce high sequence identity alignments by pure chance. BUT: if you're scoring peptides or short nucleotides, you really want to be able to reduce the default thresholds (pep: 10, nuc: 4) --- actions/ost-compare-structures | 38 +++++++++++++++++++++++++++++++- modules/doc/actions.rst | 31 ++++++++++++++++++++++++-- modules/mol/alg/pymod/scoring.py | 29 ++++++++++++++++++++++-- 3 files changed, 93 insertions(+), 5 deletions(-) diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index cfaabbd37..9606e32f4 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -57,6 +57,8 @@ results: * "cad_exec" * "usalign_exec" * "lddt_no_stereochecks" + * "min_pep_length" + * "min_nuc_length" The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the @@ -492,6 +494,36 @@ def _ParseArgs(): action="store_true", help=("Dump additional info on model and reference residues that occur " "in pepnuc alignments.")) + + parser.add_argument( + "--min-pep-length", + dest="min_pep_length", + default = 10, + type=int, + help=("Relevant parameter if short peptides are involved in scoring." + "Minimum peptide length for a chain in the target structure to " + "be considered in chain mapping. The chain mapping algorithm " + "first performs an all vs. all pairwise sequence alignment to " + "identify \"equal\" chains within the target structure. We go " + "for simple sequence identity there. Short sequences can be " + "problematic as they may produce high sequence identity " + "alignments by pure chance.") + ) + + parser.add_argument( + "--min-nuc-length", + dest="min_nuc_length", + default = 4, + type=int, + help=("Relevant parameter if short nucleotides are involved in scoring." + "Minimum nucleotide length for a chain in the target structure to " + "be considered in chain mapping. The chain mapping algorithm " + "first performs an all vs. all pairwise sequence alignment to " + "identify \"equal\" chains within the target structure. We go " + "for simple sequence identity there. Short sequences can be " + "problematic as they may produce high sequence identity " + "alignments by pure chance.") + ) return parser.parse_args() @@ -695,7 +727,9 @@ def _Process(model, reference, args): usalign_exec = args.usalign_exec, lddt_no_stereochecks = args.lddt_no_stereochecks, n_max_naive = args.n_max_naive, - oum = args.oum) + oum = args.oum, + min_pep_length = args.min_pep_length, + min_nuc_length = args.min_nuc_length) ir = _GetInconsistentResidues(scorer.aln) if len(ir) > 0 and args.enforce_consistency: @@ -876,6 +910,8 @@ def _Main(): out["cad_exec"] = args.cad_exec out["usalign_exec"] = args.usalign_exec out["lddt_no_stereochecks"] = args.lddt_no_stereochecks + out["min_pep_length"] = args.min_pep_length + out["min_nuc_length"] = args.min_nuc_length out["status"] = "SUCCESS" with open(args.output, 'w') as fh: json.dump(out, fh, indent=4, sort_keys=False) diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index eee1181a1..523926fc3 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -40,11 +40,16 @@ Details on the usage (output of ``ost compare-structures --help``): [--local-lddt] [--bb-lddt] [--bb-local-lddt] [--cad-score] [--local-cad-score] [--cad-exec CAD_EXEC] - [--usalign-exec USALIGN_EXEC] [--qs-score] - [--dockq] [--contact-scores] [--rigid-scores] + [--usalign-exec USALIGN_EXEC] + [--override-usalign-mapping] [--qs-score] + [--dockq] [--ics] [--ips] [--rigid-scores] [--patch-scores] [--tm-score] [--lddt-no-stereochecks] [--n-max-naive N_MAX_NAIVE] + [--dump-aligned-residues] [--dump-pepnuc-alns] + [--dump-pepnuc-aligned-residues] + [--min-pep-length MIN_PEP_LENGTH] + [--min-nuc-length MIN_NUC_LENGTH] Evaluate model against reference @@ -104,6 +109,8 @@ Details on the usage (output of ``ost compare-structures --help``): * "cad_exec" * "usalign_exec" * "lddt_no_stereochecks" + * "min_pep_length" + * "min_nuc_length" The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the @@ -351,6 +358,26 @@ Details on the usage (output of ``ost compare-structures --help``): --dump-pepnuc-aligned-residues Dump additional info on model and reference residues that occur in pepnuc alignments. + --min-pep-length MIN_PEP_LENGTH + Relevant parameter if short peptides are involved in + scoring.Minimum peptide length for a chain in the + target structure to be considered in chain mapping. + The chain mapping algorithm first performs an all vs. + all pairwise sequence alignment to identify "equal" + chains within the target structure. We go for simple + sequence identity there. Short sequences can be + problematic as they may produce high sequence identity + alignments by pure chance. + --min-nuc-length MIN_NUC_LENGTH + Relevant parameter if short nucleotides are involved + in scoring.Minimum nucleotide length for a chain in + the target structure to be considered in chain + mapping. The chain mapping algorithm first performs an + all vs. all pairwise sequence alignment to identify + "equal" chains within the target structure. We go for + simple sequence identity there. Short sequences can be + problematic as they may produce high sequence identity + alignments by pure chance. .. _ost compare ligand structures: diff --git a/modules/mol/alg/pymod/scoring.py b/modules/mol/alg/pymod/scoring.py index 6aaa2276e..e568fdf3a 100644 --- a/modules/mol/alg/pymod/scoring.py +++ b/modules/mol/alg/pymod/scoring.py @@ -141,12 +141,33 @@ class Scorer: object into USalign to compute TM-score. Experimental feature with limitations. :type oum: :class:`bool` + :param min_pep_length: Relevant parameter if short peptides are involved in + scoring. Minimum peptide length for a chain in the + target structure to be considered in chain mapping. + The chain mapping algorithm first performs an all vs. + all pairwise sequence alignment to identify \"equal\" + chains within the target structure. We go for simple + sequence identity there. Short sequences can be + problematic as they may produce high sequence identity + alignments by pure chance. + :type min_pep_length: :class:`int` + :param min_nuc_length: Relevant parameter if short nucleotides are involved + in scoring. Minimum nucleotide length for a chain in + the target structure to be considered in chain + mapping. The chain mapping algorithm first performs + an all vs. all pairwise sequence alignment to + identify \"equal\" chains within the target + structure. We go for simple sequence identity there. + Short sequences can be problematic as they may + produce high sequence identity alignments by pure + chance. + :type min_nuc_length: :class:`int` """ def __init__(self, model, target, resnum_alignments=False, molck_settings = None, cad_score_exec = None, custom_mapping=None, usalign_exec = None, lddt_no_stereochecks=False, n_max_naive=40320, - oum=False): + oum=False, min_pep_length = 10, min_nuc_length = 4): self._target_orig = target self._model_orig = model @@ -231,6 +252,8 @@ class Scorer: self.lddt_no_stereochecks = lddt_no_stereochecks self.n_max_naive = n_max_naive self.oum = oum + self.min_pep_length = min_pep_length + self.min_nuc_length = min_nuc_length # lazily evaluated attributes self._stereochecked_model = None @@ -491,7 +514,9 @@ class Scorer: if self._chain_mapper is None: self._chain_mapper = chain_mapping.ChainMapper(self.target, n_max_naive=1e9, - resnum_alignments=self.resnum_alignments) + resnum_alignments=self.resnum_alignments, + min_pep_length=self.min_pep_length, + min_nuc_length=self.min_nuc_length) return self._chain_mapper @property -- GitLab