Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
O
openstructure
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container registry
Model registry
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
schwede
openstructure
Commits
0ab3fb42
Commit
0ab3fb42
authored
3 months ago
by
Studer Gabriel
Browse files
Options
Downloads
Patches
Plain Diff
scoring: make it possible to provide SEQRES info to scoring actions
parent
ef030ccb
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
actions/ost-compare-ligand-structures
+67
-2
67 additions, 2 deletions
actions/ost-compare-ligand-structures
actions/ost-compare-structures
+61
-20
61 additions, 20 deletions
actions/ost-compare-structures
modules/doc/actions.rst
+119
-34
119 additions, 34 deletions
modules/doc/actions.rst
with
247 additions
and
56 deletions
actions/ost-compare-ligand-structures
+
67
−
2
View file @
0ab3fb42
...
...
@@ -454,6 +454,38 @@ def _ParseArgs():
"threshold is applied to peptide and nucleotide chains.")
)
parser.add_argument(
"--seqres",
dest="seqres",
type = str,
default = None,
help=("Default: None - manually define chem groups by specifying path "
"to a fasta file. Each sequence in that file is considered a "
"reference sequence of a chem group. All polymer chains "
"in reference will be aligned to these sequences. This only works "
"if -rna/--residue-number-alignment is enabled and an error is raised "
"otherwise. "
"Additionally, you need to manually specify a mapping "
"of the polymer chains using trg-seqres-mapping and an error "
"is raised otherwise. The one letter codes in the structure must "
"exactly match the respective characters in seqres and an error "
"is raised if not.")
)
parser.add_argument(
"--trg-seqres-mapping",
nargs="+",
dest="trg_seqres_mapping",
default=None,
help=("Default: None - Maps each polymer chain in reference to a "
"sequence in *seqres*. Each mapping is a key:value pair "
"where key is the chain name in reference and value is the "
"sequence name in seqres. So let's say you have a homo-dimer "
"reference with chains \"A\" and \"B\"for which you provide a "
"seqres file containing one sequence with name \"1\". You can "
"specify this mapping with: --trg-seqres-mapping A:1 B:1")
)
args = parser.parse_args()
if args.output is None:
args.output = "out.%s" % args.output_format
...
...
@@ -570,8 +602,30 @@ def _LoadStructureData(receptor_path,
return (receptor, ligands)
def _SEQRESFeaturesFromArgs(args):
seqres = None
trg_seqres_mapping = None
if args.seqres is not None:
if args.trg_seqres_mapping is None:
raise RuntimeError("Must provide trg-seqres-mapping if seqres is "
"provided")
if not args.residue_number_alignment:
raise RuntimeError("Must enable residue-number-alignment if seqres "
"is provided.")
seqres = io.LoadSequenceList(args.seqres)
if args.trg_seqres_mapping is not None:
if args.seqres is None:
raise RuntimeError("Must provide seqres if trg-seqres-mapping is "
"provided.")
trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
return (seqres, trg_seqres_mapping)
def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args):
seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args)
return ligand_scoring_lddtpli.LDDTPLIScorer(model, reference,
model_ligands = model_ligands,
target_ligands = reference_ligands,
...
...
@@ -587,9 +641,12 @@ def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args
pep_seqid_thr = args.chem_group_seqid_thresh,
nuc_seqid_thr = args.chem_group_seqid_thresh,
mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
seqres=seqres,
trg_seqres_mapping=trg_seqres_mapping)
def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args):
seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args)
return ligand_scoring_scrmsd.SCRMSDScorer(model, reference,
model_ligands = model_ligands,
target_ligands = reference_ligands,
...
...
@@ -606,7 +663,9 @@ def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args)
pep_seqid_thr = args.chem_group_seqid_thresh,
nuc_seqid_thr = args.chem_group_seqid_thresh,
mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
seqres=seqres,
trg_seqres_mapping=trg_seqres_mapping)
def _Process(model, model_ligands, reference, reference_ligands, args):
...
...
@@ -963,6 +1022,12 @@ def _Main():
out["min-nuc-length"] = args.min_nuc_length
out["chem-group-seqid-thresh"] = args.chem_group_seqid_thresh
out["chem-map-seqid-thresh"] = args.chem_map_seqid_thresh
out["seqres"] = args.seqres
if args.trg_seqres_mapping:
tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
out["trg_seqres_mapping"] = tmp
else:
out["trg_seqres_mapping"] = None
# only add lddtpli if actually computed
if args.lddt_pli:
...
...
This diff is collapsed.
Click to expand it.
actions/ost-compare-structures
+
61
−
20
View file @
0ab3fb42
...
...
@@ -53,26 +53,9 @@ comparison:
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be \"status\" set to FAILURE and an
additional key: "traceback".
* "ost_version": The OpenStructure version used for computation.
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
* "model_biounit"
* "reference_biounit"
* "residue_number_alignment"
* "enforce_consistency"
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
* "min_pep_length"
* "min_nuc_length"
* "lddt_add_mdl_contacts"
* "lddt_inclusion_radius"
* "dockq_capri_peptide"
* "ost_version"
Additional keys represent input options.
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
...
...
@@ -684,6 +667,38 @@ def _ParseArgs():
"min-pep-length/min-nuc-length residues are aligned. The same "
"threshold is applied to peptide and nucleotide chains.")
)
parser.add_argument(
"--seqres",
dest="seqres",
type = str,
default = None,
help=("Default: None - manually define chem groups by specifying path "
"to a fasta file. Each sequence in that file is considered a "
"reference sequence of a chem group. All polymer chains "
"in reference will be aligned to these sequences. This only works "
"if -rna/--residue-number-alignment is enabled and an error is raised "
"otherwise. "
"Additionally, you need to manually specify a mapping "
"of the polymer chains using trg-seqres-mapping and an error "
"is raised otherwise. The one letter codes in the structure must "
"exactly match the respective characters in seqres and an error "
"is raised if not.")
)
parser.add_argument(
"--trg-seqres-mapping",
nargs="+",
dest="trg_seqres_mapping",
default=None,
help=("Default: None - Maps each polymer chain in reference to a "
"sequence in *seqres*. Each mapping is a key:value pair "
"where key is the chain name in reference and value is the "
"sequence name in seqres. So let's say you have a homo-dimer "
"reference with chains \"A\" and \"B\"for which you provide a "
"seqres file containing one sequence with name \"1\". You can "
"specify this mapping with: --trg-seqres-mapping A:1 B:1")
)
return parser.parse_args()
...
...
@@ -932,6 +947,24 @@ def _Process(model, reference, args, model_format, reference_format):
if args.chain_mapping is not None:
mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping}
seqres = None
trg_seqres_mapping = None
if args.seqres is not None:
if args.trg_seqres_mapping is None:
raise RuntimeError("Must provide trg-seqres-mapping if seqres is "
"provided")
if not args.residue_number_alignment:
raise RuntimeError("Must enable residue-number-alignment if seqres "
"is provided.")
seqres = io.LoadSequenceList(args.seqres)
if args.trg_seqres_mapping is not None:
if args.seqres is None:
raise RuntimeError("Must provide seqres if trg-seqres-mapping is "
"provided.")
trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
scorer = scoring.Scorer(model, reference,
resnum_alignments = args.residue_number_alignment,
cad_score_exec = args.cad_exec,
...
...
@@ -948,7 +981,9 @@ def _Process(model, reference, args, model_format, reference_format):
pep_seqid_thr = args.chem_group_seqid_thresh,
nuc_seqid_thr = args.chem_group_seqid_thresh,
mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
seqres = seqres,
trg_seqres_mapping = trg_seqres_mapping)
ir = _GetInconsistentResidues(scorer.aln, scorer.target, scorer.model)
if len(ir) > 0 and args.enforce_consistency:
...
...
@@ -1167,6 +1202,12 @@ def _Main():
out["lddt_add_mdl_contacts"] = args.lddt_add_mdl_contacts
out["lddt_inclusion_radius"] = args.lddt_inclusion_radius
out["dockq_capri_peptide"] = args.dockq_capri_peptide
out["seqres"] = args.seqres
if args.trg_seqres_mapping:
tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
out["trg_seqres_mapping"] = tmp
else:
out["trg_seqres_mapping"] = None
out["ost_version"] = ost.__version__
out["status"] = "SUCCESS"
with open(args.output, 'w') as fh:
...
...
This diff is collapsed.
Click to expand it.
modules/doc/actions.rst
+
119
−
34
View file @
0ab3fb42
...
...
@@ -55,6 +55,8 @@ Details on the usage (output of ``ost compare-structures --help``):
[--lddt-inclusion-radius LDDT_INCLUSION_RADIUS]
[--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH]
[--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH]
[--seqres SEQRES]
[--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]]
Evaluate model against reference
...
...
@@ -80,18 +82,24 @@ Details on the usage (output of ``ost compare-structures --help``):
* "reference_chains": Chain names of reference
* "model_chains": Chain names of model
* "chem_groups": Groups of polypeptides/polynucleotides from reference that
are considered chemically equivalent. You can derive stoichiometry from this.
Contains only chains that are considered in chain mapping, i.e. pass a
size threshold (defaults: 6 for peptides, 4 for nucleotides).
are considered chemically equivalent, i.e. pass a pairwise sequence identity
threshold that can be controlled with --chem-group-seqid-thresh.
You can derive stoichiometry from this. Contains only chains that are
considered in chain mapping, i.e. pass a size threshold (defaults: 6 for
peptides, 4 for nucleotides).
* "chem_mapping": List of same length as "chem_groups". Assigns model chains to
the respective chem group. Again, only contains chains that are considered
in chain mapping.
* "unmapped_mdl_chains": Model chains that could be considered in chain mapping
but could not be mapped to any chem group. Depends on --chem-map-seqid-thresh
and a mapping for each model chain can be enforced by setting it to 0.
in chain mapping. That is 1) pass the same size threshold as fo chem_groups
2) can be aligned to any of the chem groups with a sequence identity
threshold that can be controlled by --chem-map-seqid-thresh.
* "unmapped_mdl_chains": Model chains that could be considered in chain mapping,
i.e. are long enough, but could not be mapped to any chem group.
Depends on --chem-map-seqid-thresh. A mapping for each model chain can be
enforced by setting it to 0.
* "chain_mapping": A dictionary with reference chain names as keys and the
mapped model chain names as values. Missing chains are either not mapped
(but present in "chem_groups", "chem_mapping") or were not considered in
(but present in "chem_groups", "chem_mapping"), were not mapped to any chem
group (present in "unmapped_mdl_chains") or were not considered in
chain mapping (short peptides etc.)
* "aln": Pairwise sequence alignment for each pair of mapped chains in fasta
format.
...
...
@@ -104,26 +112,9 @@ Details on the usage (output of ``ost compare-structures --help``):
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be "status" set to FAILURE and an
additional key: "traceback".
* "ost_version": The OpenStructure version used for computation.
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
* "model_biounit"
* "reference_biounit"
* "residue_number_alignment"
* "enforce_consistency"
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
* "min_pep_length"
* "min_nuc_length"
* "lddt_add_mdl_contacts"
* "lddt_inclusion_radius"
* "dockq_capri_peptide"
* "ost_version"
Additional keys represent input options.
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
...
...
@@ -489,6 +480,28 @@ Details on the usage (output of ``ost compare-structures --help``):
min-pep-length/min-nuc-length residues are aligned.
The same threshold is applied to peptide and
nucleotide chains.
--seqres SEQRES Default: None - manually define chem groups by
specifying path to a fasta file. Each sequence in that
file is considered a reference sequence of a chem
group. All polymer chains in reference will be aligned
to these sequences. This only works if -rna/--residue-
number-alignment is enabled and an error is raised
otherwise. Additionally, you need to manually specify
a mapping of the polymer chains using trg-seqres-
mapping and an error is raised otherwise. The one
letter codes in the structure must exactly match the
respective characters in seqres and an error is raised
if not.
--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]
Default: None - Maps each polymer chain in reference
to a sequence in *seqres*. Each mapping is a key:value
pair where key is the chain name in reference and
value is the sequence name in seqres. So let's say you
have a homo-dimer reference with chains "A" and "B"for
which you provide a seqres file containing one
sequence with name "1". You can specify this mapping
with: --trg-seqres-mapping A:1 B:1
.. _ost compare ligand structures:
...
...
@@ -524,6 +537,12 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
[--radius RADIUS]
[--lddt-lp-radius LDDT_LP_RADIUS] [-fbs]
[-ms MAX_SYMMETRIES]
[--min-pep-length MIN_PEP_LENGTH]
[--min-nuc-length MIN_NUC_LENGTH]
[--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH]
[--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH]
[--seqres SEQRES]
[--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]]
Evaluate model with non-polymer/small molecule ligands against reference.
...
...
@@ -546,6 +565,16 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
that are not defined for respective residues in the component dictionary. Except
step 1), every cleanup is logged and a report is available in the json outfile.
Only polymers (protein and nucleic acids) of model and reference are considered
for ligand binding sites. The mapping of possible reference/model chain
assignments requires a preprocessing. In short: identical chains in the
reference are grouped based on pairwise sequence identity
(see --chem-group-seqid-thresh). Each model chain is assigned to
one of these groups (see --chem-map-seqid-thresh param).
To avoid spurious matches, only polymers of a certain length are considered
in this matching procedure (see --min_pep_length/--min_nuc_length param).
Shorter polymers are never mapped and do not contribute to scoring.
Ligands can be given as path to SDF files containing the ligand for both model
(--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted,
ligands are optionally detected from a structure file if it is given in mmCIF
...
...
@@ -573,6 +602,21 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
the ligand SDF file(s). Otherwise, they will be the chain name, residue
number and insertion code of the ligand, separated by a dot.
* "reference_ligands": Same for reference ligands.
* "chem_groups": Groups of polypeptides/polynucleotides from reference that
are considered chemically equivalent, i.e. pass a pairwise sequence identity
threshold that can be controlled with --chem-group-seqid-thresh.
You can derive stoichiometry from this. Contains only chains that are
considered in chain mapping, i.e. pass a size threshold (defaults: 6 for
peptides, 4 for nucleotides).
* "chem_mapping": List of same length as "chem_groups". Assigns model chains to
the respective chem group. Again, only contains chains that are considered
in chain mapping. That is 1) pass the same size threshold as for chem_groups
2) can be aligned to any of the chem groups with a sequence identity
threshold that can be controlled by --chem-map-seqid-thresh.
* "unmapped_mdl_chains": Model chains that could be considered in chain mapping,
i.e. are long enough, but could not be mapped to any chem group.
Depends on --chem-map-seqid-thresh. A mapping for each model chain can be
enforced by setting it to 0.
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be "status" set to FAILURE and an
additional key: "traceback".
...
...
@@ -580,12 +624,8 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
* "model_cleanup_log": Lists residues/atoms that have been removed in model
cleanup process.
* "reference_cleanup_log": Same for reference.
* "reference": Parameter provided for --reference/-r
* "model": Parameter provided for --model/-m
* "resnum_alignments": Parameter provided for --residue-number-alignment/-rna
* "substructure_match": Parameter provided for --substructure-match/-sm
* "coverage_delta": Parameter provided for --coverage-delta/-cd
* "max_symmetries": Parameter provided for --max-symmetries/-ms
Additional keys represent input options.
Each score is opt-in and the respective results are available in three keys:
...
...
@@ -651,7 +691,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
Path to model ligand files.
-r REFERENCE, --ref REFERENCE, --reference REFERENCE
Path to reference file.
-rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [REFERENCE_LIGANDS ...]
-rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [
REFERENCE_LIGANDS ...]
Path to reference ligand files.
-o OUTPUT, --out OUTPUT, --output OUTPUT
Output file name. Default depends on format: out.json
...
...
@@ -726,3 +766,48 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
If more than that many isomorphisms exist for a
target-ligand pair, it will be ignored and reported as
unassigned.
--min-pep-length MIN_PEP_LENGTH
Default: 6 - Minimum length of a protein chain to be
considered for being part of a binding site.
--min-nuc-length MIN_NUC_LENGTH
Default: 4 - Minimum length of a NA chain to be
considered for being part of a binding site.
--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH
Default: 95 - Sequence identity threshold used to
group identical chains in reference structure in the
chain mapping step. The same threshold is applied to
peptide and nucleotide chains.
--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH
Default: 70 - Sequence identity threshold used to map
model chains to groups derived in the chem grouping
step in chain mapping. If set to 0., a mapping is
enforced and each model chain is assigned to the chem
group with maximum sequence identity. If larger than
0., a mapping only happens if the respective model
chain can be aligned to a chem group with the
specified sequence identity threshold AND if at least
min-pep-length/min-nuc-length residues are aligned.
The same threshold is applied to peptide and
nucleotide chains.
--seqres SEQRES Default: None - manually define chem groups by
specifying path to a fasta file. Each sequence in that
file is considered a reference sequence of a chem
group. All polymer chains in reference will be aligned
to these sequences. This only works if -rna/--residue-
number-alignment is enabled and an error is raised
otherwise. Additionally, you need to manually specify
a mapping of the polymer chains using trg-seqres-
mapping and an error is raised otherwise. The one
letter codes in the structure must exactly match the
respective characters in seqres and an error is raised
if not.
--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]
Default: None - Maps each polymer chain in reference
to a sequence in *seqres*. Each mapping is a key:value
pair where key is the chain name in reference and
value is the sequence name in seqres. So let's say you
have a homo-dimer reference with chains "A" and "B"for
which you provide a seqres file containing one
sequence with name "1". You can specify this mapping
with: --trg-seqres-mapping A:1 B:1
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment