diff --git a/actions/ost-compare-ligand-structures b/actions/ost-compare-ligand-structures index f09003588167a6af2e3cad4022c985ebd18a256e..9cd27342176077aa0cc3fa36f8744a8d5eda949d 100644 --- a/actions/ost-compare-ligand-structures +++ b/actions/ost-compare-ligand-structures @@ -61,8 +61,15 @@ keys: number and insertion code of the ligand, separated by a dot. * "reference_ligands": Same for reference ligands. * "chem_groups": Groups of polypeptides/polynucleotides from reference that - are considered chemically equivalent, i.e. pass a pairwise sequence identity - threshold that can be controlled with --chem-group-seqid-thresh. + are considered chemically equivalent. Predefined if the reference is an mmCIF + file or if "seqres"/"trg-seqres-mapping" are provided manually. Alignments + of structure to SEQRES are established using residue numbers in these cases + and matching structure one letter codes and SEQRES are enforced. + In case of a PDB reference without predefined SEQRES, groups are established + using clustering based on pairwise alignments. Chains within + "chem_group_seqid_thresh" are considered equivalent and alignments are + established using residue numbers or Needleman-Wunsch + (see "residue-number-alignments" flag) You can derive stoichiometry from this. Contains only chains that are considered in chain mapping, i.e. pass a size threshold (defaults: 6 for peptides, 4 for nucleotides). @@ -153,6 +160,7 @@ import traceback import ost from ost import io +from ost.mol.alg import scoring_base from ost.mol.alg import ligand_scoring_base from ost.mol.alg import ligand_scoring_lddtpli from ost.mol.alg import ligand_scoring_scrmsd @@ -583,8 +591,13 @@ def _LoadStructureData(receptor_path, fault_tolerant = False, allow_heuristic_conn = False): + # thats the return variables + # the last two are only set in case of an mmCIF file receptor = None ligands = None + seqres = None + trg_seqres_mapping = None + receptor_format = _GetStructureFormat(receptor_path, sformat = sformat) if receptor_format == "pdb": @@ -595,22 +608,26 @@ def _LoadStructureData(receptor_path, if bu_id is not None: raise RuntimeError(f"Cannot specify biounit ({bu_id}) for receptor " f"in PDB format ({receptor_path})") - receptor = ligand_scoring_base.PDBPrep(receptor_path, - fault_tolerant=fault_tolerant) + receptor = scoring_base.PDBPrep(receptor_path, + fault_tolerant=fault_tolerant) ligands = _LoadLigands(ligand_path) elif receptor_format == "mmcif": if ligand_path is None: - receptor, ligands = ligand_scoring_base.MMCIFPrep(receptor_path, - biounit = bu_id, - extract_nonpoly = True, - fault_tolerant = fault_tolerant, - allow_heuristic_conn = allow_heuristic_conn) + receptor, ligands, seqres, trg_seqres_mapping = \ + scoring_base.MMCIFPrep(receptor_path, + biounit = bu_id, + extract_nonpoly = True, + fault_tolerant = fault_tolerant, + allow_heuristic_conn = allow_heuristic_conn, + extract_seqres_mapping=True) else: - receptor = ligand_scoring_base.MMCIFPrep(receptor_path, - biounit = bu_id, - extract_nonpoly = False, - fault_tolerant = fault_tolerant) + receptor, seqres, trg_seqres_mapping = \ + scoring_base.MMCIFPrep(receptor_path, + biounit = bu_id, + extract_nonpoly = False, + fault_tolerant = fault_tolerant, + extract_seqres_mapping=True) ligands = _LoadLigands(ligand_path) else: raise RuntimeError("This should never happen") @@ -618,32 +635,10 @@ def _LoadStructureData(receptor_path, # assign filename as name to receptor receptor.SetName(receptor_path) - return (receptor, ligands) - -def _SEQRESFeaturesFromArgs(args): - seqres = None - trg_seqres_mapping = None - - if args.seqres is not None: - if args.trg_seqres_mapping is None: - raise RuntimeError("Must provide trg-seqres-mapping if seqres is " - "provided") - if not args.residue_number_alignment: - raise RuntimeError("Must enable residue-number-alignment if seqres " - "is provided.") - seqres = io.LoadSequenceList(args.seqres) - - if args.trg_seqres_mapping is not None: - if args.seqres is None: - raise RuntimeError("Must provide seqres if trg-seqres-mapping is " - "provided.") - trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping} + return (receptor, ligands, seqres, trg_seqres_mapping) - return (seqres, trg_seqres_mapping) - - -def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args): - seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args) +def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args, + seqres=None, trg_seqres_mapping=None): return ligand_scoring_lddtpli.LDDTPLIScorer(model, reference, model_ligands = model_ligands, target_ligands = reference_ligands, @@ -663,8 +658,8 @@ def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args seqres=seqres, trg_seqres_mapping=trg_seqres_mapping) -def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args): - seqres, trg_seqres_mapping = _SEQRESFeaturesFromArgs(args) +def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args, + seqres=None, trg_seqres_mapping=None): return ligand_scoring_scrmsd.SCRMSDScorer(model, reference, model_ligands = model_ligands, target_ligands = reference_ligands, @@ -685,7 +680,26 @@ def _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, args) seqres=seqres, trg_seqres_mapping=trg_seqres_mapping) -def _Process(model, model_ligands, reference, reference_ligands, args): +def _Process(model, model_ligands, reference, reference_ligands, args, + seqres = None, trg_seqres_mapping = None): + + # see if seqres/trg-seqres-mapping are set in args + # if yes, they're prioritized even if we have these values from + # the mmCIF file + if args.seqres is not None: + if args.trg_seqres_mapping is None: + raise RuntimeError("Must provide trg-seqres-mapping if seqres " + "is provided.") + + if args.trg_seqres_mapping is not None: + if args.seqres is None: + raise RuntimeError("Must provide seqres if trg-seqres-mapping " + "is provided.") + + if args.seqres is not None and args.trg_seqres_mapping is not None: + seqres = io.LoadSequenceList(args.seqres) + trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} + out = dict() @@ -699,12 +713,14 @@ def _Process(model, model_ligands, reference, reference_ligands, args): if args.lddt_pli: lddtpli_scorer = _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, - args) + args, seqres=seqres, + trg_seqres_mapping=trg_seqres_mapping) if args.rmsd: scrmsd_scorer = _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, - args) + args, seqres=seqres, + trg_seqres_mapping=trg_seqres_mapping) # basic info on ligands only requires baseclass functionality # doesn't matter which scorer we use @@ -718,7 +734,8 @@ def _Process(model, model_ligands, reference, reference_ligands, args): # just create SCRMSD scorer to fill basic ligand info scorer = _SetupSCRMSDScorer(model, model_ligands, reference, reference_ligands, - args) + args, seqres=seqres, + trg_seqres_mapping=trg_seqres_mapping) #################################### # Extract / Map ligand information # @@ -1011,22 +1028,24 @@ def _Main(): # Load structures LogScript("Loading data") LogInfo("Loading reference data") - reference, reference_ligands = _LoadStructureData(args.reference, - args.reference_ligands, - sformat = args.reference_format, - bu_id = args.reference_biounit, - fault_tolerant = args.fault_tolerant, - allow_heuristic_conn = args.allow_heuristic_conn) + reference, reference_ligands, seqres, trg_seqres_mapping = \ + _LoadStructureData(args.reference, + args.reference_ligands, + sformat = args.reference_format, + bu_id = args.reference_biounit, + fault_tolerant = args.fault_tolerant, + allow_heuristic_conn = args.allow_heuristic_conn) LogInfo("Loading model data") - model, model_ligands = _LoadStructureData(args.model, - args.model_ligands, - sformat = args.model_format, - bu_id = args.model_biounit, - fault_tolerant = args.fault_tolerant, - allow_heuristic_conn = args.allow_heuristic_conn) - - out = _Process(model, model_ligands, reference, reference_ligands, args) + model, model_ligands, _, _ = _LoadStructureData(args.model, + args.model_ligands, + sformat = args.model_format, + bu_id = args.model_biounit, + fault_tolerant = args.fault_tolerant, + allow_heuristic_conn = args.allow_heuristic_conn) + + out = _Process(model, model_ligands, reference, reference_ligands, args, + seqres = seqres, trg_seqres_mapping = trg_seqres_mapping) # append input arguments out["model"] = args.model diff --git a/actions/ost-compare-structures b/actions/ost-compare-structures index e89b4c7d972ac297f1a6ea4d43fc84585736a603..57b8e9fbd4fc8818775cbbd0a44227ff1c69f188 100644 --- a/actions/ost-compare-structures +++ b/actions/ost-compare-structures @@ -23,14 +23,21 @@ comparison: * "reference_chains": Chain names of reference * "model_chains": Chain names of model * "chem_groups": Groups of polypeptides/polynucleotides from reference that - are considered chemically equivalent, i.e. pass a pairwise sequence identity - threshold that can be controlled with --chem-group-seqid-thresh. + are considered chemically equivalent. Predefined if the reference is an mmCIF + file or if "seqres"/"trg-seqres-mapping" are provided manually. Alignments + of structure to SEQRES are established using residue numbers in these cases + and matching structure one letter codes and SEQRES are enforced. + In case of a PDB reference without predefined SEQRES, groups are established + using clustering based on pairwise alignments. Chains within + "chem_group_seqid_thresh" are considered equivalent and alignments are + established using residue numbers or Needleman-Wunsch + (see "residue-number-alignments" flag) You can derive stoichiometry from this. Contains only chains that are considered in chain mapping, i.e. pass a size threshold (defaults: 6 for peptides, 4 for nucleotides). * "chem_mapping": List of same length as "chem_groups". Assigns model chains to the respective chem group. Again, only contains chains that are considered - in chain mapping. That is 1) pass the same size threshold as fo chem_groups + in chain mapping. That is 1) pass the same size threshold as for chem_groups 2) can be aligned to any of the chem groups with a sequence identity threshold that can be controlled by --chem-map-seqid-thresh. * "mdl_chains_without_chem_mapping": Model chains that could be considered in @@ -755,13 +762,22 @@ def _LoadStructure(structure_path, sformat, fault_tolerant, bu_id): The returned structure has structure_path attached as structure name """ + # thats the return variables + # the last two are only set in case of an mmCIF file + entity = None + seqres = None + trg_seqres_mapping = None + # increase loglevel, as we would pollute the info log with weird stuff ost.PushVerbosityLevel(ost.LogLevel.Error) # Load the structure if sformat == "mmcif": - entity = scoring_base.MMCIFPrep(structure_path, - fault_tolerant=fault_tolerant, - biounit=bu_id) + # from mmCIF we can get seqres and trg_seqres_mapping + entity, seqres, trg_seqres_mapping = \ + scoring_base.MMCIFPrep(structure_path, + fault_tolerant=fault_tolerant, + biounit=bu_id, + extract_seqres_mapping=True) if len(entity.residues) == 0: raise Exception(f"No residues found in file: {structure_path}") else: @@ -772,7 +788,7 @@ def _LoadStructure(structure_path, sformat, fault_tolerant, bu_id): # restore old loglevel and return ost.PopVerbosityLevel() entity.SetName(structure_path) - return entity + return (entity, seqres, trg_seqres_mapping) def _DumpStructure(entity, structure_path, sformat): if sformat == "mmcif": @@ -932,29 +948,29 @@ def _GetAlignedResidues(aln, ref, mdl): "reference": ref_dct}) return aligned_residues -def _Process(model, reference, args, model_format, reference_format): +def _Process(model, reference, args, model_format, reference_format, + seqres=None, trg_seqres_mapping=None): mapping = None if args.chain_mapping is not None: mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} - seqres = None - trg_seqres_mapping = None - + # see if seqres/trg-seqres-mapping are set in args + # if yes, they're prioritized even if we have these values from + # the mmCIF file if args.seqres is not None: if args.trg_seqres_mapping is None: - raise RuntimeError("Must provide trg-seqres-mapping if seqres is " - "provided") - if not args.residue_number_alignment: - raise RuntimeError("Must enable residue-number-alignment if seqres " + raise RuntimeError("Must provide trg-seqres-mapping if seqres " "is provided.") - seqres = io.LoadSequenceList(args.seqres) if args.trg_seqres_mapping is not None: if args.seqres is None: - raise RuntimeError("Must provide seqres if trg-seqres-mapping is " - "provided.") - trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.trg_seqres_mapping} + raise RuntimeError("Must provide seqres if trg-seqres-mapping " + "is provided.") + + if args.seqres is not None and args.trg_seqres_mapping is not None: + seqres = io.LoadSequenceList(args.seqres) + trg_seqres_mapping = {x.split(':')[0]: x.split(':')[1] for x in args.chain_mapping} scorer = scoring.Scorer(model, reference, resnum_alignments = args.residue_number_alignment, @@ -1166,17 +1182,19 @@ def _Main(): "this is the case.") reference_format = _GetStructureFormat(args.reference, sformat=args.reference_format) - reference = _LoadStructure(args.reference, - sformat=reference_format, - bu_id=args.reference_biounit, - fault_tolerant = args.fault_tolerant) + reference, seqres, trg_seqres_mapping = _LoadStructure(args.reference, + sformat=reference_format, + bu_id=args.reference_biounit, + fault_tolerant = args.fault_tolerant) model_format = _GetStructureFormat(args.model, sformat=args.model_format) - model = _LoadStructure(args.model, - sformat=model_format, - bu_id=args.model_biounit, - fault_tolerant = args.fault_tolerant) - out = _Process(model, reference, args, model_format, reference_format) + model,_,_ = _LoadStructure(args.model, + sformat=model_format, + bu_id=args.model_biounit, + fault_tolerant = args.fault_tolerant) + + out = _Process(model, reference, args, model_format, reference_format, + seqres=seqres, trg_seqres_mapping=trg_seqres_mapping) # append input arguments out["model"] = args.model diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py index ab36ee49579cdf7934b241eb699da8f29b7f7b83..cdf02f5853c0a6e81367eb66dbf8742a4edb3c1b 100644 --- a/modules/mol/alg/pymod/chain_mapping.py +++ b/modules/mol/alg/pymod/chain_mapping.py @@ -499,7 +499,10 @@ class ChainMapper: for nucleotides with respective thresholds. 2) specify seqres sequences and provide the mapping manually. This can be useful if you're loading data from an mmCIF file where you have mmCIF - entity information. + entity information. Alignments are performed based on residue numbers + in this case and don't consider the *resnum_alignments* flag. Any + mismatch of one letter code in the structure and the respective SEQRES + raises an error. * Map chains in an input model to these groups. Parameters for generating alignments are the same as above. Model chains are mapped to the chem @@ -522,7 +525,7 @@ class ChainMapper: :param resnum_alignments: Use residue numbers instead of Needleman-Wunsch to compute pairwise alignments. Relevant for :attr:`~chem_groups` - and related attributes. + if *seqres* is not specified explicitely. :type resnum_alignments: :class:`bool` :param pep_seqid_thr: Sequence identity threshold (in percent of aligned columns) used to decide when two chains are @@ -576,11 +579,12 @@ class ChainMapper: *mdl_map_pep_seqid_thr* :type mdl_map_nuc_seqid_thr: :class:`float` :param seqres: SEQRES sequences. All polymer chains in *target* will be - aligned to these sequences. This only works if - *resnum_alignments* is enabled and an error is raised - otherwise. Additionally, you need to manually specify - a mapping of the polymer chains using *trg_seqres_mapping* - and an error is raised otherwise. The one letter codes in + aligned to these sequences using a residue number based + alignment, effectively ignoring *resnum_alignments* in + the chem grouping step. Additionally, you need to + manually specify a mapping of the polymer chains using + *trg_seqres_mapping* and an error is raised otherwise. + The one letter codes in the structure must exactly match the respective characters in seqres and an error is raised if not. These seqres sequences are set as :attr:`~chem_group_ref_seqs` and will @@ -609,10 +613,6 @@ class ChainMapper: if seqres_params_set not in [0, 2]: raise RuntimeError("Must either give both, seqres and " "trg_seqres_mapping, or none of them.") - if seqres_params_set == 2: - if not resnum_alignments: - raise RuntimeError("Must enable resnum_alignments if seqres " - "information is provided") # attributes self.resnum_alignments = resnum_alignments self.pep_seqid_thr = pep_seqid_thr