scoring: make it possible to provide SEQRES info to scoring actions

0ab3fb42 · Studer Gabriel · ef030ccb · 0ab3fb42 · 0ab3fb42 · 0ab3fb42
Commit 0ab3fb42 authored 3 months ago by Studer Gabriel
--- a/actions/ost-compare-ligand-structures
+++ b/actions/ost-compare-ligand-structures
@@ -454,6 +454,38 @@ def _ParseArgs():
              "threshold is applied to peptide and nucleotide chains.")
    )

+    parser.add_argument(
+        "--seqres",
+        dest="seqres",
+        type = str,
+        default = None,
+        help=("Default: None - manually define chem groups by specifying path "
+              "to a fasta file. Each sequence in that file is considered a "
+              "reference sequence of a chem group. All polymer chains "
+              "in reference will be aligned to these sequences. This only works "
+              "if -rna/--residue-number-alignment is enabled and an error is raised "
+              "otherwise. "
+              "Additionally, you need to manually specify a mapping "
+              "of the polymer chains using trg-seqres-mapping and an error "
+              "is raised otherwise. The one letter codes in the structure must "
+              "exactly match the respective characters in seqres and an error "
+              "is raised if not.")
+    )
+
+    parser.add_argument(
+        "--trg-seqres-mapping",
+        nargs="+",
+        dest="trg_seqres_mapping",
+        default=None,
+        help=("Default: None - Maps each polymer chain in reference to a "
+              "sequence in *seqres*. Each mapping is a key:value pair "
+              "where key is the chain name in reference and value is the "
+              "sequence name in seqres. So let's say you have a homo-dimer "
+              "reference with chains \"A\" and \"B\"for which you provide a "
+              "seqres file containing one sequence with name \"1\". You can "
+              "specify this mapping with: --trg-seqres-mapping A:1 B:1")
+    )
+
    args = parser.parse_args()
    if args.output is None:
        args.output = "out.%s" % args.output_format
@@ -570,8 +602,30 @@ def _LoadStructureData(receptor_path,

    return (receptor, ligands)

+def _SEQRESFeaturesFromArgs(args):
+    seqres = None
+    trg_seqres_mapping = None
+
+    if args.seqres is not None:
+        if args.trg_seqres_mapping is None:
+            raise RuntimeError("Must provide trg-seqres-mapping if seqres is "
+                               "provided")
+        if not args.residue_number_alignment:
+            raise RuntimeError("Must enable residue-number-alignment if seqres "
+                               "is provided.")
+        seqres = io.LoadSequenceList(args.seqres)
+
+    if args.trg_seqres_mapping is not None:
+        if args.seqres is None:
+            raise RuntimeError("Must provide seqres if trg-seqres-mapping is "
+                               "provided.")
+        trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
+
+    return (seqres, trg_seqres_mapping)
+

 def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args):
+    seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args)
    return ligand_scoring_lddtpli.LDDTPLIScorer(model, reference,
                                                model_ligands = model_ligands,
                                                target_ligands = reference_ligands,
@@ -587,9 +641,12 @@ def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args
                                                pep_seqid_thr = args.chem_group_seqid_thresh,
                                                nuc_seqid_thr = args.chem_group_seqid_thresh,
                                                mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
-                                                mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
+                                                mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
+                                                seqres=seqres,
+                                                trg_seqres_mapping=trg_seqres_mapping)

 def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args):
+    seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args)
    return ligand_scoring_scrmsd.SCRMSDScorer(model, reference,
                                              model_ligands = model_ligands,
                                              target_ligands = reference_ligands,
@@ -606,7 +663,9 @@ def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args)
                                              pep_seqid_thr = args.chem_group_seqid_thresh,
                                              nuc_seqid_thr = args.chem_group_seqid_thresh,
                                              mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
-                                              mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
+                                              mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
+                                              seqres=seqres,
+                                              trg_seqres_mapping=trg_seqres_mapping)

 def _Process(model, model_ligands, reference, reference_ligands, args):

@@ -963,6 +1022,12 @@ def _Main():
        out["min-nuc-length"] = args.min_nuc_length
        out["chem-group-seqid-thresh"] = args.chem_group_seqid_thresh
        out["chem-map-seqid-thresh"] = args.chem_map_seqid_thresh
+        out["seqres"] = args.seqres
+        if args.trg_seqres_mapping:
+            tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
+            out["trg_seqres_mapping"] = tmp
+        else:
+            out["trg_seqres_mapping"] = None

        # only add lddtpli if actually computed
        if args.lddt_pli:

--- a/actions/ost-compare-structures
+++ b/actions/ost-compare-structures
@@ -53,26 +53,9 @@ comparison:
 * "status": SUCCESS if everything ran through. In case of failure, the only
   content of the JSON output will be \"status\" set to FAILURE and an
   additional key: "traceback".
+ * "ost_version": The OpenStructure version used for computation.

-The following additional keys store relevant input parameters to reproduce
-results:
-
- * "model"
- * "reference"
- * "fault_tolerant"
- * "model_biounit"
- * "reference_biounit"
- * "residue_number_alignment"
- * "enforce_consistency"
- * "cad_exec"
- * "usalign_exec"
- * "lddt_no_stereochecks"
- * "min_pep_length"
- * "min_nuc_length"
- * "lddt_add_mdl_contacts"
- * "lddt_inclusion_radius"
- * "dockq_capri_peptide"
- * "ost_version"
+Additional keys represent input options.

 The pairwise sequence alignments are computed with Needleman-Wunsch using
 BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
@@ -684,6 +667,38 @@ def _ParseArgs():
              "min-pep-length/min-nuc-length residues are aligned. The same "
              "threshold is applied to peptide and nucleotide chains.")
    )
+
+    parser.add_argument(
+        "--seqres",
+        dest="seqres",
+        type = str,
+        default = None,
+        help=("Default: None - manually define chem groups by specifying path "
+              "to a fasta file. Each sequence in that file is considered a "
+              "reference sequence of a chem group. All polymer chains "
+              "in reference will be aligned to these sequences. This only works "
+              "if -rna/--residue-number-alignment is enabled and an error is raised "
+              "otherwise. "
+              "Additionally, you need to manually specify a mapping "
+              "of the polymer chains using trg-seqres-mapping and an error "
+              "is raised otherwise. The one letter codes in the structure must "
+              "exactly match the respective characters in seqres and an error "
+              "is raised if not.")
+    )
+
+    parser.add_argument(
+        "--trg-seqres-mapping",
+        nargs="+",
+        dest="trg_seqres_mapping",
+        default=None,
+        help=("Default: None - Maps each polymer chain in reference to a "
+              "sequence in *seqres*. Each mapping is a key:value pair "
+              "where key is the chain name in reference and value is the "
+              "sequence name in seqres. So let's say you have a homo-dimer "
+              "reference with chains \"A\" and \"B\"for which you provide a "
+              "seqres file containing one sequence with name \"1\". You can "
+              "specify this mapping with: --trg-seqres-mapping A:1 B:1")
+    )
 
    return parser.parse_args()

@@ -932,6 +947,24 @@ def _Process(model, reference, args, model_format, reference_format):
    if args.chain_mapping is not None:
        mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping}

+    seqres = None
+    trg_seqres_mapping = None
+
+    if args.seqres is not None:
+        if args.trg_seqres_mapping is None:
+            raise RuntimeError("Must provide trg-seqres-mapping if seqres is "
+                               "provided")
+        if not args.residue_number_alignment:
+            raise RuntimeError("Must enable residue-number-alignment if seqres "
+                               "is provided.")
+        seqres = io.LoadSequenceList(args.seqres)
+
+    if args.trg_seqres_mapping is not None:
+        if args.seqres is None:
+            raise RuntimeError("Must provide seqres if trg-seqres-mapping is "
+                               "provided.")
+        trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
+
    scorer = scoring.Scorer(model, reference,
                            resnum_alignments = args.residue_number_alignment,
                            cad_score_exec = args.cad_exec,
@@ -948,7 +981,9 @@ def _Process(model, reference, args, model_format, reference_format):
                            pep_seqid_thr = args.chem_group_seqid_thresh,
                            nuc_seqid_thr = args.chem_group_seqid_thresh,
                            mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
-                            mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
+                            mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
+                            seqres = seqres,
+                            trg_seqres_mapping = trg_seqres_mapping)

    ir = _GetInconsistentResidues(scorer.aln, scorer.target, scorer.model)
    if len(ir) > 0 and args.enforce_consistency:
@@ -1167,6 +1202,12 @@ def _Main():
        out["lddt_add_mdl_contacts"] = args.lddt_add_mdl_contacts
        out["lddt_inclusion_radius"] = args.lddt_inclusion_radius
        out["dockq_capri_peptide"] = args.dockq_capri_peptide
+        out["seqres"] = args.seqres
+        if args.trg_seqres_mapping:
+            tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
+            out["trg_seqres_mapping"] = tmp
+        else:
+            out["trg_seqres_mapping"] = None
        out["ost_version"] = ost.__version__
        out["status"] = "SUCCESS"
        with open(args.output, 'w') as fh:

--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -55,6 +55,8 @@ Details on the usage (output of ``ost compare-structures --help``):
                                [--lddt-inclusion-radius LDDT_INCLUSION_RADIUS]
                                [--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH]
                                [--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH]
+                                [--seqres SEQRES]
+                                [--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]]

  Evaluate model against reference 

@@ -80,18 +82,24 @@ Details on the usage (output of ``ost compare-structures --help``):
   * "reference_chains": Chain names of reference
   * "model_chains": Chain names of model
   * "chem_groups": Groups of polypeptides/polynucleotides from reference that
-     are considered chemically equivalent. You can derive stoichiometry from this.
-     Contains only chains that are considered in chain mapping, i.e. pass a
-     size threshold (defaults: 6 for peptides, 4 for nucleotides).
+     are considered chemically equivalent, i.e. pass a pairwise sequence identity
+     threshold that can be controlled with --chem-group-seqid-thresh.
+     You can derive stoichiometry from this. Contains only chains that are
+     considered in chain mapping, i.e. pass a size threshold (defaults: 6 for
+     peptides, 4 for nucleotides).
   * "chem_mapping": List of same length as "chem_groups". Assigns model chains to
     the respective chem group. Again, only contains chains that are considered
-     in chain mapping.
-   * "unmapped_mdl_chains": Model chains that could be considered in chain mapping
-     but could not be mapped to any chem group. Depends on --chem-map-seqid-thresh
-     and a mapping for each model chain can be enforced by setting it to 0.
+     in chain mapping. That is 1) pass the same size threshold as fo chem_groups
+     2) can be aligned to any of the chem groups with a sequence identity
+     threshold that can be controlled by --chem-map-seqid-thresh.
+   * "unmapped_mdl_chains": Model chains that could be considered in chain mapping,
+     i.e. are long enough, but could not be mapped to any chem group.
+     Depends on --chem-map-seqid-thresh. A mapping for each model chain can be
+     enforced by setting it to 0.
   * "chain_mapping": A dictionary with reference chain names as keys and the
     mapped model chain names as values. Missing chains are either not mapped
-     (but present in "chem_groups", "chem_mapping") or were not considered in
+     (but present in "chem_groups", "chem_mapping"), were not mapped to any chem
+     group (present in "unmapped_mdl_chains") or were not considered in
     chain mapping (short peptides etc.)
   * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta
     format.
@@ -104,26 +112,9 @@ Details on the usage (output of ``ost compare-structures --help``):
   * "status": SUCCESS if everything ran through. In case of failure, the only
     content of the JSON output will be "status" set to FAILURE and an
     additional key: "traceback".
+   * "ost_version": The OpenStructure version used for computation.

-  The following additional keys store relevant input parameters to reproduce
-  results:
-
-   * "model"
-   * "reference"
-   * "fault_tolerant"
-   * "model_biounit"
-   * "reference_biounit"
-   * "residue_number_alignment"
-   * "enforce_consistency"
-   * "cad_exec"
-   * "usalign_exec"
-   * "lddt_no_stereochecks"
-   * "min_pep_length"
-   * "min_nuc_length"
-   * "lddt_add_mdl_contacts"
-   * "lddt_inclusion_radius"
-   * "dockq_capri_peptide"
-   * "ost_version"
+  Additional keys represent input options.

  The pairwise sequence alignments are computed with Needleman-Wunsch using
  BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
@@ -489,6 +480,28 @@ Details on the usage (output of ``ost compare-structures --help``):
                          min-pep-length/min-nuc-length residues are aligned.
                          The same threshold is applied to peptide and
                          nucleotide chains.
+    --seqres SEQRES       Default: None - manually define chem groups by
+                          specifying path to a fasta file. Each sequence in that
+                          file is considered a reference sequence of a chem
+                          group. All polymer chains in reference will be aligned
+                          to these sequences. This only works if -rna/--residue-
+                          number-alignment is enabled and an error is raised
+                          otherwise. Additionally, you need to manually specify
+                          a mapping of the polymer chains using trg-seqres-
+                          mapping and an error is raised otherwise. The one
+                          letter codes in the structure must exactly match the
+                          respective characters in seqres and an error is raised
+                          if not.
+    --trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]
+                          Default: None - Maps each polymer chain in reference
+                          to a sequence in *seqres*. Each mapping is a key:value
+                          pair where key is the chain name in reference and
+                          value is the sequence name in seqres. So let's say you
+                          have a homo-dimer reference with chains "A" and "B"for
+                          which you provide a seqres file containing one
+                          sequence with name "1". You can specify this mapping
+                          with: --trg-seqres-mapping A:1 B:1
+


 .. _ost compare ligand structures:
@@ -524,6 +537,12 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                                       [--radius RADIUS]
                                       [--lddt-lp-radius LDDT_LP_RADIUS] [-fbs]
                                       [-ms MAX_SYMMETRIES]
+                                       [--min-pep-length MIN_PEP_LENGTH]
+                                       [--min-nuc-length MIN_NUC_LENGTH]
+                                       [--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH]
+                                       [--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH]
+                                       [--seqres SEQRES]
+                                       [--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]]

  Evaluate model with non-polymer/small molecule ligands against reference.

@@ -546,6 +565,16 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
  that are not defined for respective residues in the component dictionary. Except
  step 1), every cleanup is logged and a report is available in the json outfile.

+  Only polymers (protein and nucleic acids) of model and reference are considered
+  for ligand binding sites. The mapping of possible reference/model chain
+  assignments requires a preprocessing. In short: identical chains in the
+  reference are grouped based on pairwise sequence identity
+  (see --chem-group-seqid-thresh). Each model chain is assigned to
+  one of these groups (see --chem-map-seqid-thresh param).
+  To avoid spurious matches, only polymers of a certain length are considered
+  in this matching procedure (see --min_pep_length/--min_nuc_length param).
+  Shorter polymers are never mapped and do not contribute to scoring.
+
  Ligands can be given as path to SDF files containing the ligand for both model
  (--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted,
  ligands are optionally detected from a structure file if it is given in mmCIF
@@ -573,6 +602,21 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
     the ligand SDF file(s). Otherwise, they will be the chain name, residue
     number and insertion code of the ligand, separated by a dot.
   * "reference_ligands": Same for reference ligands.
+   * "chem_groups": Groups of polypeptides/polynucleotides from reference that
+     are considered chemically equivalent, i.e. pass a pairwise sequence identity
+     threshold that can be controlled with --chem-group-seqid-thresh.
+     You can derive stoichiometry from this. Contains only chains that are
+     considered in chain mapping, i.e. pass a size threshold (defaults: 6 for
+     peptides, 4 for nucleotides).
+   * "chem_mapping": List of same length as "chem_groups". Assigns model chains to
+     the respective chem group. Again, only contains chains that are considered
+     in chain mapping. That is 1) pass the same size threshold as for chem_groups
+     2) can be aligned to any of the chem groups with a sequence identity
+     threshold that can be controlled by --chem-map-seqid-thresh.
+   * "unmapped_mdl_chains": Model chains that could be considered in chain mapping,
+     i.e. are long enough, but could not be mapped to any chem group.
+     Depends on --chem-map-seqid-thresh. A mapping for each model chain can be
+     enforced by setting it to 0.
   * "status": SUCCESS if everything ran through. In case of failure, the only
     content of the JSON output will be "status" set to FAILURE and an
     additional key: "traceback".
@@ -580,12 +624,8 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
   * "model_cleanup_log": Lists residues/atoms that have been removed in model
     cleanup process.
   * "reference_cleanup_log": Same for reference.
-   * "reference": Parameter provided for --reference/-r
-   * "model": Parameter provided for --model/-m
-   * "resnum_alignments": Parameter provided for --residue-number-alignment/-rna
-   * "substructure_match": Parameter provided for --substructure-match/-sm
-   * "coverage_delta": Parameter provided for --coverage-delta/-cd
-   * "max_symmetries": Parameter provided for --max-symmetries/-ms 
+
+  Additional keys represent input options.

  Each score is opt-in and the respective results are available in three keys:

@@ -651,7 +691,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                          Path to model ligand files.
    -r REFERENCE, --ref REFERENCE, --reference REFERENCE
                          Path to reference file.
-    -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [REFERENCE_LIGANDS ...]
+    -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [  REFERENCE_LIGANDS ...]
                          Path to reference ligand files.
    -o OUTPUT, --out OUTPUT, --output OUTPUT
                          Output file name. Default depends on format: out.json
@@ -726,3 +766,48 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                          If more than that many isomorphisms exist for a
                          target-ligand pair, it will be ignored and reported as
                          unassigned.
+    --min-pep-length MIN_PEP_LENGTH
+                          Default: 6 - Minimum length of a protein chain to be
+                          considered for being part of a binding site.
+    --min-nuc-length MIN_NUC_LENGTH
+                          Default: 4 - Minimum length of a NA chain to be
+                          considered for being part of a binding site.
+    --chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH
+                          Default: 95 - Sequence identity threshold used to
+                          group identical chains in reference structure in the
+                          chain mapping step. The same threshold is applied to
+                          peptide and nucleotide chains.
+    --chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH
+                          Default: 70 - Sequence identity threshold used to map
+                          model chains to groups derived in the chem grouping
+                          step in chain mapping. If set to 0., a mapping is
+                          enforced and each model chain is assigned to the chem
+                          group with maximum sequence identity. If larger than
+                          0., a mapping only happens if the respective model
+                          chain can be aligned to a chem group with the
+                          specified sequence identity threshold AND if at least
+                          min-pep-length/min-nuc-length residues are aligned.
+                          The same threshold is applied to peptide and
+                          nucleotide chains.
+    --seqres SEQRES       Default: None - manually define chem groups by
+                          specifying path to a fasta file. Each sequence in that
+                          file is considered a reference sequence of a chem
+                          group. All polymer chains in reference will be aligned
+                          to these sequences. This only works if -rna/--residue-
+                          number-alignment is enabled and an error is raised
+                          otherwise. Additionally, you need to manually specify
+                          a mapping of the polymer chains using trg-seqres-
+                          mapping and an error is raised otherwise. The one
+                          letter codes in the structure must exactly match the
+                          respective characters in seqres and an error is raised
+                          if not.
+    --trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]
+                          Default: None - Maps each polymer chain in reference
+                          to a sequence in *seqres*. Each mapping is a key:value
+                          pair where key is the chain name in reference and
+                          value is the sequence name in seqres. So let's say you
+                          have a homo-dimer reference with chains "A" and "B"for
+                          which you provide a seqres file containing one
+                          sequence with name "1". You can specify this mapping
+                          with: --trg-seqres-mapping A:1 B:1
+