diff --git a/actions/ost-compare-ligand-structures b/actions/ost-compare-ligand-structures index f324c47617eb1f7a2dc11bcd52424ec9cc4289bc..2801c7319a75ee86e612b683b4ef43bfd3b3a9f5 100644 --- a/actions/ost-compare-ligand-structures +++ b/actions/ost-compare-ligand-structures @@ -454,6 +454,38 @@ def _ParseArgs(): "threshold is applied to peptide and nucleotide chains.") ) + parser.add_argument( + "--seqres", + dest="seqres", + type = str, + default = None, + help=("Default: None - manually define chem groups by specifying path " + "to a fasta file. Each sequence in that file is considered a " + "reference sequence of a chem group. All polymer chains " + "in reference will be aligned to these sequences. This only works " + "if -rna/--residue-number-alignment is enabled and an error is raised " + "otherwise. " + "Additionally, you need to manually specify a mapping " + "of the polymer chains using trg-seqres-mapping and an error " + "is raised otherwise. The one letter codes in the structure must " + "exactly match the respective characters in seqres and an error " + "is raised if not.") + ) + + parser.add_argument( + "--trg-seqres-mapping", + nargs="+", + dest="trg_seqres_mapping", + default=None, + help=("Default: None - Maps each polymer chain in reference to a " + "sequence in *seqres*. Each mapping is a key:value pair " + "where key is the chain name in reference and value is the " + "sequence name in seqres. So let's say you have a homo-dimer " + "reference with chains \"A\" and \"B\"for which you provide a " + "seqres file containing one sequence with name \"1\". You can " + "specify this mapping with: --trg-seqres-mapping A:1 B:1") + ) + args = parser.parse_args() if args.output is None: args.output = "out.%s" % args.output_format @@ -570,8 +602,30 @@ def _LoadStructureData(receptor_path, return (receptor, ligands) +def _SEQRESFeaturesFromArgs(args): + seqres = None + trg_seqres_mapping = None + + if args.seqres is not None: + if args.trg_seqres_mapping is None: + raise RuntimeError("Must provide trg-seqres-mapping if seqres is " + "provided") + if not args.residue_number_alignment: + raise RuntimeError("Must enable residue-number-alignment if seqres " + "is provided.") + seqres = io.LoadSequenceList(args.seqres) + + if args.trg_seqres_mapping is not None: + if args.seqres is None: + raise RuntimeError("Must provide seqres if trg-seqres-mapping is " + "provided.") + trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping} + + return (seqres, trg_seqres_mapping) + def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args): + seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args) return ligand_scoring_lddtpli.LDDTPLIScorer(model, reference, model_ligands = model_ligands, target_ligands = reference_ligands, @@ -587,9 +641,12 @@ def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args pep_seqid_thr = args.chem_group_seqid_thresh, nuc_seqid_thr = args.chem_group_seqid_thresh, mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh, - mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh) + mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh, + seqres=seqres, + trg_seqres_mapping=trg_seqres_mapping) def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args): + seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args) return ligand_scoring_scrmsd.SCRMSDScorer(model, reference, model_ligands = model_ligands, target_ligands = reference_ligands, @@ -606,7 +663,9 @@ def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args) pep_seqid_thr = args.chem_group_seqid_thresh, nuc_seqid_thr = args.chem_group_seqid_thresh, mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh, - mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh) + mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh, + seqres=seqres, + trg_seqres_mapping=trg_seqres_mapping) def _Process(model, model_ligands, reference, reference_ligands, args): @@ -963,6 +1022,12 @@ def _Main(): out["min-nuc-length"] = args.min_nuc_length out["chem-group-seqid-thresh"] = args.chem_group_seqid_thresh out["chem-map-seqid-thresh"] = args.chem_map_seqid_thresh + out["seqres"] = args.seqres + if args.trg_seqres_mapping: + tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping} + out["trg_seqres_mapping"] = tmp + else: + out["trg_seqres_mapping"] = None # only add lddtpli if actually computed if args.lddt_pli: diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index 649522d99f41c1a22223de46ec12be2fcd4565a6..025a3737f27ca9030080c7492ebc8a8e56a62c24 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -53,26 +53,9 @@ comparison: * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be \"status\" set to FAILURE and an additional key: "traceback". + * "ost_version": The OpenStructure version used for computation. -The following additional keys store relevant input parameters to reproduce -results: - - * "model" - * "reference" - * "fault_tolerant" - * "model_biounit" - * "reference_biounit" - * "residue_number_alignment" - * "enforce_consistency" - * "cad_exec" - * "usalign_exec" - * "lddt_no_stereochecks" - * "min_pep_length" - * "min_nuc_length" - * "lddt_add_mdl_contacts" - * "lddt_inclusion_radius" - * "dockq_capri_peptide" - * "ost_version" +Additional keys represent input options. The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the @@ -684,6 +667,38 @@ def _ParseArgs(): "min-pep-length/min-nuc-length residues are aligned. The same " "threshold is applied to peptide and nucleotide chains.") ) + + parser.add_argument( + "--seqres", + dest="seqres", + type = str, + default = None, + help=("Default: None - manually define chem groups by specifying path " + "to a fasta file. Each sequence in that file is considered a " + "reference sequence of a chem group. All polymer chains " + "in reference will be aligned to these sequences. This only works " + "if -rna/--residue-number-alignment is enabled and an error is raised " + "otherwise. " + "Additionally, you need to manually specify a mapping " + "of the polymer chains using trg-seqres-mapping and an error " + "is raised otherwise. The one letter codes in the structure must " + "exactly match the respective characters in seqres and an error " + "is raised if not.") + ) + + parser.add_argument( + "--trg-seqres-mapping", + nargs="+", + dest="trg_seqres_mapping", + default=None, + help=("Default: None - Maps each polymer chain in reference to a " + "sequence in *seqres*. Each mapping is a key:value pair " + "where key is the chain name in reference and value is the " + "sequence name in seqres. So let's say you have a homo-dimer " + "reference with chains \"A\" and \"B\"for which you provide a " + "seqres file containing one sequence with name \"1\". You can " + "specify this mapping with: --trg-seqres-mapping A:1 B:1") + ) return parser.parse_args() @@ -932,6 +947,24 @@ def _Process(model, reference, args, model_format, reference_format): if args.chain_mapping is not None: mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} + seqres = None + trg_seqres_mapping = None + + if args.seqres is not None: + if args.trg_seqres_mapping is None: + raise RuntimeError("Must provide trg-seqres-mapping if seqres is " + "provided") + if not args.residue_number_alignment: + raise RuntimeError("Must enable residue-number-alignment if seqres " + "is provided.") + seqres = io.LoadSequenceList(args.seqres) + + if args.trg_seqres_mapping is not None: + if args.seqres is None: + raise RuntimeError("Must provide seqres if trg-seqres-mapping is " + "provided.") + trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping} + scorer = scoring.Scorer(model, reference, resnum_alignments = args.residue_number_alignment, cad_score_exec = args.cad_exec, @@ -948,7 +981,9 @@ def _Process(model, reference, args, model_format, reference_format): pep_seqid_thr = args.chem_group_seqid_thresh, nuc_seqid_thr = args.chem_group_seqid_thresh, mdl_map_pep_seqid_thr = args.chem_map_seqid_thresh, - mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh) + mdl_map_nuc_seqid_thr = args.chem_map_seqid_thresh, + seqres = seqres, + trg_seqres_mapping = trg_seqres_mapping) ir = _GetInconsistentResidues(scorer.aln, scorer.target, scorer.model) if len(ir) > 0 and args.enforce_consistency: @@ -1167,6 +1202,12 @@ def _Main(): out["lddt_add_mdl_contacts"] = args.lddt_add_mdl_contacts out["lddt_inclusion_radius"] = args.lddt_inclusion_radius out["dockq_capri_peptide"] = args.dockq_capri_peptide + out["seqres"] = args.seqres + if args.trg_seqres_mapping: + tmp = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping} + out["trg_seqres_mapping"] = tmp + else: + out["trg_seqres_mapping"] = None out["ost_version"] = ost.__version__ out["status"] = "SUCCESS" with open(args.output, 'w') as fh: diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst index c25eeebbcf8ec2dd0721eeacabed570e8b5f01e5..09913c44a5b41f5f8094363e72d47dd0c36ddcdd 100644 --- a/modules/doc/actions.rst +++ b/modules/doc/actions.rst @@ -55,6 +55,8 @@ Details on the usage (output of ``ost compare-structures --help``): [--lddt-inclusion-radius LDDT_INCLUSION_RADIUS] [--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH] [--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH] + [--seqres SEQRES] + [--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]] Evaluate model against reference @@ -80,18 +82,24 @@ Details on the usage (output of ``ost compare-structures --help``): * "reference_chains": Chain names of reference * "model_chains": Chain names of model * "chem_groups": Groups of polypeptides/polynucleotides from reference that - are considered chemically equivalent. You can derive stoichiometry from this. - Contains only chains that are considered in chain mapping, i.e. pass a - size threshold (defaults: 6 for peptides, 4 for nucleotides). + are considered chemically equivalent, i.e. pass a pairwise sequence identity + threshold that can be controlled with --chem-group-seqid-thresh. + You can derive stoichiometry from this. Contains only chains that are + considered in chain mapping, i.e. pass a size threshold (defaults: 6 for + peptides, 4 for nucleotides). * "chem_mapping": List of same length as "chem_groups". Assigns model chains to the respective chem group. Again, only contains chains that are considered - in chain mapping. - * "unmapped_mdl_chains": Model chains that could be considered in chain mapping - but could not be mapped to any chem group. Depends on --chem-map-seqid-thresh - and a mapping for each model chain can be enforced by setting it to 0. + in chain mapping. That is 1) pass the same size threshold as fo chem_groups + 2) can be aligned to any of the chem groups with a sequence identity + threshold that can be controlled by --chem-map-seqid-thresh. + * "unmapped_mdl_chains": Model chains that could be considered in chain mapping, + i.e. are long enough, but could not be mapped to any chem group. + Depends on --chem-map-seqid-thresh. A mapping for each model chain can be + enforced by setting it to 0. * "chain_mapping": A dictionary with reference chain names as keys and the mapped model chain names as values. Missing chains are either not mapped - (but present in "chem_groups", "chem_mapping") or were not considered in + (but present in "chem_groups", "chem_mapping"), were not mapped to any chem + group (present in "unmapped_mdl_chains") or were not considered in chain mapping (short peptides etc.) * "aln": Pairwise sequence alignment for each pair of mapped chains in fasta format. @@ -104,26 +112,9 @@ Details on the usage (output of ``ost compare-structures --help``): * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be "status" set to FAILURE and an additional key: "traceback". + * "ost_version": The OpenStructure version used for computation. - The following additional keys store relevant input parameters to reproduce - results: - - * "model" - * "reference" - * "fault_tolerant" - * "model_biounit" - * "reference_biounit" - * "residue_number_alignment" - * "enforce_consistency" - * "cad_exec" - * "usalign_exec" - * "lddt_no_stereochecks" - * "min_pep_length" - * "min_nuc_length" - * "lddt_add_mdl_contacts" - * "lddt_inclusion_radius" - * "dockq_capri_peptide" - * "ost_version" + Additional keys represent input options. The pairwise sequence alignments are computed with Needleman-Wunsch using BLOSUM62 (NUC44 for nucleotides). Many benchmarking scenarios preprocess the @@ -489,6 +480,28 @@ Details on the usage (output of ``ost compare-structures --help``): min-pep-length/min-nuc-length residues are aligned. The same threshold is applied to peptide and nucleotide chains. + --seqres SEQRES Default: None - manually define chem groups by + specifying path to a fasta file. Each sequence in that + file is considered a reference sequence of a chem + group. All polymer chains in reference will be aligned + to these sequences. This only works if -rna/--residue- + number-alignment is enabled and an error is raised + otherwise. Additionally, you need to manually specify + a mapping of the polymer chains using trg-seqres- + mapping and an error is raised otherwise. The one + letter codes in the structure must exactly match the + respective characters in seqres and an error is raised + if not. + --trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...] + Default: None - Maps each polymer chain in reference + to a sequence in *seqres*. Each mapping is a key:value + pair where key is the chain name in reference and + value is the sequence name in seqres. So let's say you + have a homo-dimer reference with chains "A" and "B"for + which you provide a seqres file containing one + sequence with name "1". You can specify this mapping + with: --trg-seqres-mapping A:1 B:1 + .. _ost compare ligand structures: @@ -524,6 +537,12 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): [--radius RADIUS] [--lddt-lp-radius LDDT_LP_RADIUS] [-fbs] [-ms MAX_SYMMETRIES] + [--min-pep-length MIN_PEP_LENGTH] + [--min-nuc-length MIN_NUC_LENGTH] + [--chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH] + [--chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH] + [--seqres SEQRES] + [--trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...]] Evaluate model with non-polymer/small molecule ligands against reference. @@ -546,6 +565,16 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): that are not defined for respective residues in the component dictionary. Except step 1), every cleanup is logged and a report is available in the json outfile. + Only polymers (protein and nucleic acids) of model and reference are considered + for ligand binding sites. The mapping of possible reference/model chain + assignments requires a preprocessing. In short: identical chains in the + reference are grouped based on pairwise sequence identity + (see --chem-group-seqid-thresh). Each model chain is assigned to + one of these groups (see --chem-map-seqid-thresh param). + To avoid spurious matches, only polymers of a certain length are considered + in this matching procedure (see --min_pep_length/--min_nuc_length param). + Shorter polymers are never mapped and do not contribute to scoring. + Ligands can be given as path to SDF files containing the ligand for both model (--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted, ligands are optionally detected from a structure file if it is given in mmCIF @@ -573,6 +602,21 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): the ligand SDF file(s). Otherwise, they will be the chain name, residue number and insertion code of the ligand, separated by a dot. * "reference_ligands": Same for reference ligands. + * "chem_groups": Groups of polypeptides/polynucleotides from reference that + are considered chemically equivalent, i.e. pass a pairwise sequence identity + threshold that can be controlled with --chem-group-seqid-thresh. + You can derive stoichiometry from this. Contains only chains that are + considered in chain mapping, i.e. pass a size threshold (defaults: 6 for + peptides, 4 for nucleotides). + * "chem_mapping": List of same length as "chem_groups". Assigns model chains to + the respective chem group. Again, only contains chains that are considered + in chain mapping. That is 1) pass the same size threshold as for chem_groups + 2) can be aligned to any of the chem groups with a sequence identity + threshold that can be controlled by --chem-map-seqid-thresh. + * "unmapped_mdl_chains": Model chains that could be considered in chain mapping, + i.e. are long enough, but could not be mapped to any chem group. + Depends on --chem-map-seqid-thresh. A mapping for each model chain can be + enforced by setting it to 0. * "status": SUCCESS if everything ran through. In case of failure, the only content of the JSON output will be "status" set to FAILURE and an additional key: "traceback". @@ -580,12 +624,8 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): * "model_cleanup_log": Lists residues/atoms that have been removed in model cleanup process. * "reference_cleanup_log": Same for reference. - * "reference": Parameter provided for --reference/-r - * "model": Parameter provided for --model/-m - * "resnum_alignments": Parameter provided for --residue-number-alignment/-rna - * "substructure_match": Parameter provided for --substructure-match/-sm - * "coverage_delta": Parameter provided for --coverage-delta/-cd - * "max_symmetries": Parameter provided for --max-symmetries/-ms + + Additional keys represent input options. Each score is opt-in and the respective results are available in three keys: @@ -651,7 +691,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): Path to model ligand files. -r REFERENCE, --ref REFERENCE, --reference REFERENCE Path to reference file. - -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [REFERENCE_LIGANDS ...] + -rl [REFERENCE_LIGANDS ...], --ref-ligands [REFERENCE_LIGANDS ...], --reference-ligands [ REFERENCE_LIGANDS ...] Path to reference ligand files. -o OUTPUT, --out OUTPUT, --output OUTPUT Output file name. Default depends on format: out.json @@ -726,3 +766,48 @@ Details on the usage (output of ``ost compare-ligand-structures --help``): If more than that many isomorphisms exist for a target-ligand pair, it will be ignored and reported as unassigned. + --min-pep-length MIN_PEP_LENGTH + Default: 6 - Minimum length of a protein chain to be + considered for being part of a binding site. + --min-nuc-length MIN_NUC_LENGTH + Default: 4 - Minimum length of a NA chain to be + considered for being part of a binding site. + --chem-group-seqid-thresh CHEM_GROUP_SEQID_THRESH + Default: 95 - Sequence identity threshold used to + group identical chains in reference structure in the + chain mapping step. The same threshold is applied to + peptide and nucleotide chains. + --chem-map-seqid-thresh CHEM_MAP_SEQID_THRESH + Default: 70 - Sequence identity threshold used to map + model chains to groups derived in the chem grouping + step in chain mapping. If set to 0., a mapping is + enforced and each model chain is assigned to the chem + group with maximum sequence identity. If larger than + 0., a mapping only happens if the respective model + chain can be aligned to a chem group with the + specified sequence identity threshold AND if at least + min-pep-length/min-nuc-length residues are aligned. + The same threshold is applied to peptide and + nucleotide chains. + --seqres SEQRES Default: None - manually define chem groups by + specifying path to a fasta file. Each sequence in that + file is considered a reference sequence of a chem + group. All polymer chains in reference will be aligned + to these sequences. This only works if -rna/--residue- + number-alignment is enabled and an error is raised + otherwise. Additionally, you need to manually specify + a mapping of the polymer chains using trg-seqres- + mapping and an error is raised otherwise. The one + letter codes in the structure must exactly match the + respective characters in seqres and an error is raised + if not. + --trg-seqres-mapping TRG_SEQRES_MAPPING [TRG_SEQRES_MAPPING ...] + Default: None - Maps each polymer chain in reference + to a sequence in *seqres*. Each mapping is a key:value + pair where key is the chain name in reference and + value is the sequence name in seqres. So let's say you + have a homo-dimer reference with chains "A" and "B"for + which you provide a seqres file containing one + sequence with name "1". You can specify this mapping + with: --trg-seqres-mapping A:1 B:1 +