Skip to content
Snippets Groups Projects
Commit 0ab3fb42 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

scoring: make it possible to provide SEQRES info to scoring actions

parent ef030ccb
No related branches found
No related tags found
No related merge requests found
......@@ -454,6 +454,38 @@ def _ParseArgs():
"threshold is applied to peptide and nucleotide chains.")
)
parser.add_argument(
"--seqres",
dest="seqres",
type = str,
default = None,
help=("Default: None - manually define chem groups by specifying path "
"to a fasta file. Each sequence in that file is considered a "
"reference sequence of a chem group. All polymer chains "
"in reference will be aligned to these sequences. This only works "
"if -rna/--residue-number-alignment is enabled and an error is raised "
"otherwise. "
"Additionally, you need to manually specify a mapping "
"of the polymer chains using trg-seqres-mapping and an error "
"is raised otherwise. The one letter codes in the structure must "
"exactly match the respective characters in seqres and an error "
"is raised if not.")
)
parser.add_argument(
"--trg-seqres-mapping",
nargs="+",
dest="trg_seqres_mapping",
default=None,
help=("Default: None - Maps each polymer chain in reference to a "
"sequence in *seqres*. Each mapping is a key:value pair "
"where key is the chain name in reference and value is the "
"sequence name in seqres. So let's say you have a homo-dimer "
"reference with chains \"A\" and \"B\"for which you provide a "
"seqres file containing one sequence with name \"1\". You can "
"specify this mapping with: --trg-seqres-mapping A:1 B:1")
)
args = parser.parse_args()
if args.output is None:
args.output = "out.%s" % args.output_format
......@@ -570,8 +602,30 @@ def _LoadStructureData(receptor_path,
return (receptor, ligands)
def _SEQRESFeaturesFromArgs(args):
seqres = None
trg_seqres_mapping = None
if args.seqres is not None:
if args.trg_seqres_mapping is None:
raise RuntimeError("Must provide trg-seqres-mapping if seqres is "
"provided")
if not args.residue_number_alignment:
raise RuntimeError("Must enable residue-number-alignment if seqres "
"is provided.")
seqres = io.LoadSequenceList(args.seqres)
if args.trg_seqres_mapping is not None:
if args.seqres is None:
raise RuntimeError("Must provide seqres if trg-seqres-mapping is "
"provided.")
trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
return (seqres, trg_seqres_mapping)
def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args):
seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args)
return ligand_scoring_lddtpli.LDDTPLIScorer(model, reference,
model_ligands = model_ligands,
target_ligands = reference_ligands,
......@@ -587,9 +641,12 @@ def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args
pep_seqid_thr = args.chem_group_seqid_thresh,
nuc_seqid_thr = args.chem_group_seqid_thresh,
mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
seqres=seqres,
trg_seqres_mapping=trg_seqres_mapping)
def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args):
seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args)
return ligand_scoring_scrmsd.SCRMSDScorer(model, reference,
model_ligands = model_ligands,
target_ligands = reference_ligands,
......@@ -606,7 +663,9 @@ def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args)
pep_seqid_thr = args.chem_group_seqid_thresh,
nuc_seqid_thr = args.chem_group_seqid_thresh,
mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
seqres=seqres,
trg_seqres_mapping=trg_seqres_mapping)
def _Process(model, model_ligands, reference, reference_ligands, args):
......@@ -963,6 +1022,12 @@ def _Main():
out["min-nuc-length"] = args.min_nuc_length
out["chem-group-seqid-thresh"] = args.chem_group_seqid_thresh
out["chem-map-seqid-thresh"] = args.chem_map_seqid_thresh
out["seqres"] = args.seqres
if args.trg_seqres_mapping:
tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
out["trg_seqres_mapping"] = tmp
else:
out["trg_seqres_mapping"] = None
# only add lddtpli if actually computed
if args.lddt_pli:
......
......@@ -53,26 +53,9 @@ comparison:
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be \"status\" set to FAILURE and an
additional key: "traceback".
* "ost_version": The OpenStructure version used for computation.
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
* "model_biounit"
* "reference_biounit"
* "residue_number_alignment"
* "enforce_consistency"
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
* "min_pep_length"
* "min_nuc_length"
* "lddt_add_mdl_contacts"
* "lddt_inclusion_radius"
* "dockq_capri_peptide"
* "ost_version"
Additional keys represent input options.
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
......@@ -684,6 +667,38 @@ def _ParseArgs():
"min-pep-length/min-nuc-length residues are aligned. The same "
"threshold is applied to peptide and nucleotide chains.")
)
parser.add_argument(
"--seqres",
dest="seqres",
type = str,
default = None,
help=("Default: None - manually define chem groups by specifying path "
"to a fasta file. Each sequence in that file is considered a "
"reference sequence of a chem group. All polymer chains "
"in reference will be aligned to these sequences. This only works "
"if -rna/--residue-number-alignment is enabled and an error is raised "
"otherwise. "
"Additionally, you need to manually specify a mapping "
"of the polymer chains using trg-seqres-mapping and an error "
"is raised otherwise. The one letter codes in the structure must "
"exactly match the respective characters in seqres and an error "
"is raised if not.")
)
parser.add_argument(
"--trg-seqres-mapping",
nargs="+",
dest="trg_seqres_mapping",
default=None,
help=("Default: None - Maps each polymer chain in reference to a "
"sequence in *seqres*. Each mapping is a key:value pair "
"where key is the chain name in reference and value is the "
"sequence name in seqres. So let's say you have a homo-dimer "
"reference with chains \"A\" and \"B\"for which you provide a "
"seqres file containing one sequence with name \"1\". You can "
"specify this mapping with: --trg-seqres-mapping A:1 B:1")
)
return parser.parse_args()
......@@ -932,6 +947,24 @@ def _Process(model, reference, args, model_format, reference_format):
if args.chain_mapping is not None:
mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping}
seqres = None
trg_seqres_mapping = None
if args.seqres is not None:
if args.trg_seqres_mapping is None:
raise RuntimeError("Must provide trg-seqres-mapping if seqres is "
"provided")
if not args.residue_number_alignment:
raise RuntimeError("Must enable residue-number-alignment if seqres "
"is provided.")
seqres = io.LoadSequenceList(args.seqres)
if args.trg_seqres_mapping is not None:
if args.seqres is None:
raise RuntimeError("Must provide seqres if trg-seqres-mapping is "
"provided.")
trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
scorer = scoring.Scorer(model, reference,
resnum_alignments = args.residue_number_alignment,
cad_score_exec = args.cad_exec,
......@@ -948,7 +981,9 @@ def _Process(model, reference, args, model_format, reference_format):
pep_seqid_thr = args.chem_group_seqid_thresh,
nuc_seqid_thr = args.chem_group_seqid_thresh,
mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh,
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh)
mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh,
seqres = seqres,
trg_seqres_mapping = trg_seqres_mapping)
ir = _GetInconsistentResidues(scorer.aln, scorer.target, scorer.model)
if len(ir) > 0 and args.enforce_consistency:
......@@ -1167,6 +1202,12 @@ def _Main():
out["lddt_add_mdl_contacts"] = args.lddt_add_mdl_contacts
out["lddt_inclusion_radius"] = args.lddt_inclusion_radius
out["dockq_capri_peptide"] = args.dockq_capri_peptide
out["seqres"] = args.seqres
if args.trg_seqres_mapping:
tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping}
out["trg_seqres_mapping"] = tmp
else:
out["trg_seqres_mapping"] = None
out["ost_version"] = ost.__version__
out["status"] = "SUCCESS"
with open(args.output, 'w') as fh:
......
......@@ -55,6 +55,8 @@ Details on the usage (output of ``ost compare-structures --help``):
[--lddt-inclusion-radius LDDT_INCLUSION_RADIUS]
[--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH]
[--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH]
[--seqres SEQRES]
[--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]]
Evaluate model against reference
......@@ -80,18 +82,24 @@ Details on the usage (output of ``ost compare-structures --help``):
* "reference_chains": Chain names of reference
* "model_chains": Chain names of model
* "chem_groups": Groups of polypeptides/polynucleotides from reference that
are considered chemically equivalent. You can derive stoichiometry from this.
Contains only chains that are considered in chain mapping, i.e. pass a
size threshold (defaults: 6 for peptides, 4 for nucleotides).
are considered chemically equivalent, i.e. pass a pairwise sequence identity
threshold that can be controlled with --chem-group-seqid-thresh.
You can derive stoichiometry from this. Contains only chains that are
considered in chain mapping, i.e. pass a size threshold (defaults: 6 for
peptides, 4 for nucleotides).
* "chem_mapping": List of same length as "chem_groups". Assigns model chains to
the respective chem group. Again, only contains chains that are considered
in chain mapping.
* "unmapped_mdl_chains": Model chains that could be considered in chain mapping
but could not be mapped to any chem group. Depends on --chem-map-seqid-thresh
and a mapping for each model chain can be enforced by setting it to 0.
in chain mapping. That is 1) pass the same size threshold as fo chem_groups
2) can be aligned to any of the chem groups with a sequence identity
threshold that can be controlled by --chem-map-seqid-thresh.
* "unmapped_mdl_chains": Model chains that could be considered in chain mapping,
i.e. are long enough, but could not be mapped to any chem group.
Depends on --chem-map-seqid-thresh. A mapping for each model chain can be
enforced by setting it to 0.
* "chain_mapping": A dictionary with reference chain names as keys and the
mapped model chain names as values. Missing chains are either not mapped
(but present in "chem_groups", "chem_mapping") or were not considered in
(but present in "chem_groups", "chem_mapping"), were not mapped to any chem
group (present in "unmapped_mdl_chains") or were not considered in
chain mapping (short peptides etc.)
* "aln": Pairwise sequence alignment for each pair of mapped chains in fasta
format.
......@@ -104,26 +112,9 @@ Details on the usage (output of ``ost compare-structures --help``):
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be "status" set to FAILURE and an
additional key: "traceback".
* "ost_version": The OpenStructure version used for computation.
The following additional keys store relevant input parameters to reproduce
results:
* "model"
* "reference"
* "fault_tolerant"
* "model_biounit"
* "reference_biounit"
* "residue_number_alignment"
* "enforce_consistency"
* "cad_exec"
* "usalign_exec"
* "lddt_no_stereochecks"
* "min_pep_length"
* "min_nuc_length"
* "lddt_add_mdl_contacts"
* "lddt_inclusion_radius"
* "dockq_capri_peptide"
* "ost_version"
Additional keys represent input options.
The pairwise sequence alignments are computed with Needleman-Wunsch using
BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the
......@@ -489,6 +480,28 @@ Details on the usage (output of ``ost compare-structures --help``):
min-pep-length/min-nuc-length residues are aligned.
The same threshold is applied to peptide and
nucleotide chains.
--seqres SEQRES Default: None - manually define chem groups by
specifying path to a fasta file. Each sequence in that
file is considered a reference sequence of a chem
group. All polymer chains in reference will be aligned
to these sequences. This only works if -rna/--residue-
number-alignment is enabled and an error is raised
otherwise. Additionally, you need to manually specify
a mapping of the polymer chains using trg-seqres-
mapping and an error is raised otherwise. The one
letter codes in the structure must exactly match the
respective characters in seqres and an error is raised
if not.
--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]
Default: None - Maps each polymer chain in reference
to a sequence in *seqres*. Each mapping is a key:value
pair where key is the chain name in reference and
value is the sequence name in seqres. So let's say you
have a homo-dimer reference with chains "A" and "B"for
which you provide a seqres file containing one
sequence with name "1". You can specify this mapping
with: --trg-seqres-mapping A:1 B:1
.. _ost compare ligand structures:
......@@ -524,6 +537,12 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
[--radius RADIUS]
[--lddt-lp-radius LDDT_LP_RADIUS] [-fbs]
[-ms MAX_SYMMETRIES]
[--min-pep-length MIN_PEP_LENGTH]
[--min-nuc-length MIN_NUC_LENGTH]
[--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH]
[--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH]
[--seqres SEQRES]
[--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]]
Evaluate model with non-polymer/small molecule ligands against reference.
......@@ -546,6 +565,16 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
that are not defined for respective residues in the component dictionary. Except
step 1), every cleanup is logged and a report is available in the json outfile.
Only polymers (protein and nucleic acids) of model and reference are considered
for ligand binding sites. The mapping of possible reference/model chain
assignments requires a preprocessing. In short: identical chains in the
reference are grouped based on pairwise sequence identity
(see --chem-group-seqid-thresh). Each model chain is assigned to
one of these groups (see --chem-map-seqid-thresh param).
To avoid spurious matches, only polymers of a certain length are considered
in this matching procedure (see --min_pep_length/--min_nuc_length param).
Shorter polymers are never mapped and do not contribute to scoring.
Ligands can be given as path to SDF files containing the ligand for both model
(--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted,
ligands are optionally detected from a structure file if it is given in mmCIF
......@@ -573,6 +602,21 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
the ligand SDF file(s). Otherwise, they will be the chain name, residue
number and insertion code of the ligand, separated by a dot.
* "reference_ligands": Same for reference ligands.
* "chem_groups": Groups of polypeptides/polynucleotides from reference that
are considered chemically equivalent, i.e. pass a pairwise sequence identity
threshold that can be controlled with --chem-group-seqid-thresh.
You can derive stoichiometry from this. Contains only chains that are
considered in chain mapping, i.e. pass a size threshold (defaults: 6 for
peptides, 4 for nucleotides).
* "chem_mapping": List of same length as "chem_groups". Assigns model chains to
the respective chem group. Again, only contains chains that are considered
in chain mapping. That is 1) pass the same size threshold as for chem_groups
2) can be aligned to any of the chem groups with a sequence identity
threshold that can be controlled by --chem-map-seqid-thresh.
* "unmapped_mdl_chains": Model chains that could be considered in chain mapping,
i.e. are long enough, but could not be mapped to any chem group.
Depends on --chem-map-seqid-thresh. A mapping for each model chain can be
enforced by setting it to 0.
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be "status" set to FAILURE and an
additional key: "traceback".
......@@ -580,12 +624,8 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
* "model_cleanup_log": Lists residues/atoms that have been removed in model
cleanup process.
* "reference_cleanup_log": Same for reference.
* "reference": Parameter provided for --reference/-r
* "model": Parameter provided for --model/-m
* "resnum_alignments": Parameter provided for --residue-number-alignment/-rna
* "substructure_match": Parameter provided for --substructure-match/-sm
* "coverage_delta": Parameter provided for --coverage-delta/-cd
* "max_symmetries": Parameter provided for --max-symmetries/-ms
Additional keys represent input options.
Each score is opt-in and the respective results are available in three keys:
......@@ -651,7 +691,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
Path to model ligand files.
-r REFERENCE, --ref REFERENCE, --reference REFERENCE
Path to reference file.
-rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [REFERENCE_LIGANDS ...]
-rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [ REFERENCE_LIGANDS ...]
Path to reference ligand files.
-o OUTPUT, --out OUTPUT, --output OUTPUT
Output file name. Default depends on format: out.json
......@@ -726,3 +766,48 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
If more than that many isomorphisms exist for a
target-ligand pair, it will be ignored and reported as
unassigned.
--min-pep-length MIN_PEP_LENGTH
Default: 6 - Minimum length of a protein chain to be
considered for being part of a binding site.
--min-nuc-length MIN_NUC_LENGTH
Default: 4 - Minimum length of a NA chain to be
considered for being part of a binding site.
--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH
Default: 95 - Sequence identity threshold used to
group identical chains in reference structure in the
chain mapping step. The same threshold is applied to
peptide and nucleotide chains.
--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH
Default: 70 - Sequence identity threshold used to map
model chains to groups derived in the chem grouping
step in chain mapping. If set to 0., a mapping is
enforced and each model chain is assigned to the chem
group with maximum sequence identity. If larger than
0., a mapping only happens if the respective model
chain can be aligned to a chem group with the
specified sequence identity threshold AND if at least
min-pep-length/min-nuc-length residues are aligned.
The same threshold is applied to peptide and
nucleotide chains.
--seqres SEQRES Default: None - manually define chem groups by
specifying path to a fasta file. Each sequence in that
file is considered a reference sequence of a chem
group. All polymer chains in reference will be aligned
to these sequences. This only works if -rna/--residue-
number-alignment is enabled and an error is raised
otherwise. Additionally, you need to manually specify
a mapping of the polymer chains using trg-seqres-
mapping and an error is raised otherwise. The one
letter codes in the structure must exactly match the
respective characters in seqres and an error is raised
if not.
--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]
Default: None - Maps each polymer chain in reference
to a sequence in *seqres*. Each mapping is a key:value
pair where key is the chain name in reference and
value is the sequence name in seqres. So let's say you
have a homo-dimer reference with chains "A" and "B"for
which you provide a seqres file containing one
sequence with name "1". You can specify this mapping
with: --trg-seqres-mapping A:1 B:1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment