Skip to content
Snippets Groups Projects
Unverified Commit 53d59b37 authored by Xavier Robin's avatar Xavier Robin
Browse files

feat: SCHWED-5481 inital compare-ligand-structures action

parent 941006d4
No related branches found
No related tags found
No related merge requests found
add_custom_target(actions ALL)
ost_action_init()
ost_action(ost-compare-ligand-structures actions)
ost_action(ost-compare-structures actions)
ost_action(ost-compare-structures-new actions)
"""
Evaluate model with non-polymer/small molecule ligands against reference
Example: ost compare-structures-new \\
-m model.pdb \\
-ml ligand.sdf \\
-r reference.cif \\
--lddt-pli --rmsd
Only minimal cleanup steps are performed (remove hydrogens, and for structures
only, remove unknown atoms and cleanup element column).
Ligands can be given as path to SDF files containing the ligand for both model
(--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted,
ligands will be detected in the model and reference structures. For structures
given in mmCIF format, this is based on the annotation as "non polymer entity"
and works reliably. For structures given in PDB format, this is based on the
HET records and is normally not what you want.
Output is written in JSON format (default: out.json). In case of no additional
options, this is a dictionary with three keys:
* "model_ligands": A list of ligands in the model. If ligands were provided
explicitly with --model-ligands, elements of the list will be the paths to
the ligand SDF file(s). Otherwise, they will be the chain name and residue
number of the ligand, separated by a dot.
* "reference_ligands": A list of ligands in the reference. If ligands were
provided explicitly with --reference-ligands, elements of the list will be
the paths to the ligand SDF file(s). Otherwise, they will be the chain name
and residue number of the ligand, separated by a dot.
* "status": SUCCESS if everything ran through. In case of failure, the only
content of the JSON output will be \"status\" set to FAILURE and an
additional key: "traceback".
Each score is opt-in and can be enabled with optional arguments and is added
to the output. Keys correspond to the values in `"model_ligands"` above.
Only mapped ligands are reported.
"""
import argparse
import json
import os
import traceback
import warnings
import ost
from ost import conop
from ost import io
from ost.mol.alg import ligand_scoring
from ost.mol.alg import Molck, MolckSettings
def _ParseArgs():
parser = argparse.ArgumentParser(description = __doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
prog="ost compare-ligand-structures")
parser.add_argument(
"-m",
"--mdl",
"--model",
dest="model",
required=True,
help=("Path to model file."))
parser.add_argument(
"-ml",
"--mdl-ligands",
"--model-ligands",
dest="model_ligands",
nargs="*",
default=None,
help=("Path to model ligand files."))
parser.add_argument(
"-r",
"--ref",
"--reference",
dest="reference",
required=True,
help=("Path to reference file."))
parser.add_argument(
"-rl",
"--ref-ligands",
"--reference-ligands",
dest="reference_ligands",
nargs="*",
default=None,
help=("Path to reference ligand files."))
parser.add_argument(
"-o",
"--out",
"--output",
dest="output",
default="out.json",
help=("Output file name. The output will be saved as a JSON file. "
"default: out.json"))
parser.add_argument(
"-mf",
"--mdl-format",
"--model-format",
dest="model_format",
default="auto",
choices=["pdb", "mmcif"],
help=("Format of model file. Inferred from path if not given."))
parser.add_argument(
"-rf",
"--reference-format",
"--ref-format",
dest="reference_format",
default="auto",
choices=["cif", "mmcif"],
help=("Format of reference file. Inferred from path if not given."))
parser.add_argument(
"-ft",
"--fault-tolerant",
dest="fault_tolerant",
default=False,
action="store_true",
help=("Fault tolerant parsing."))
parser.add_argument(
"--residue-number-alignment",
"-rna",
dest="residue_number_alignment",
default=False,
action="store_true",
help=("Make alignment based on residue number instead of using "
"a global BLOSUM62-based alignment (NUC44 for nucleotides)."))
parser.add_argument(
"--lddt-pli",
dest="lddt_pli",
default=False,
action="store_true",
help=("Compute lDDT-PLI score and store as key \"lddt-pli\"."))
parser.add_argument(
"--rmsd",
dest="rmsd",
default=False,
action="store_true",
help=("Compute RMSD score and store as key \"lddt-pli\"."))
parser.add_argument(
'-v',
'--verbosity',
dest="verbosity",
type=int,
default=3,
help="Set verbosity level. Defaults to 3 (INFO).")
return parser.parse_args()
def _LoadStructure(structure_path, format="auto", fault_tolerant=False):
"""Read OST entity either from mmCIF or PDB.
The returned structure has structure_path attached as structure name
"""
if not os.path.exists(structure_path):
raise Exception(f"file not found: {structure_path}")
if format == "auto":
# Determine file format from suffix.
ext = structure_path.split(".")
if ext[-1] == "gz":
ext = ext[:-1]
if len(ext) <= 1:
raise Exception(f"Could not determine format of file "
f"{structure_path}.")
format = ext[-1].lower()
# increase loglevel, as we would pollute the info log with weird stuff
ost.PushVerbosityLevel(ost.LogLevel.Error)
# Load the structure
if format in ["mmcif", "cif"]:
entity, seqres = io.LoadMMCIF(structure_path, seqres=True,
fault_tolerant=fault_tolerant)
if len(entity.residues) == 0:
raise Exception(f"No residues found in file: {structure_path}")
elif format == "pdb":
entity = io.LoadPDB(structure_path, fault_tolerant=fault_tolerant)
if len(entity.residues) == 0:
raise Exception(f"No residues found in file: {structure_path}")
else:
raise Exception(f"Unknown/ unsupported file extension found for "
f"file {structure_path}.")
# Molck it
molck_settings = MolckSettings(rm_unk_atoms=True,
rm_non_std=False,
rm_hyd_atoms=True,
rm_oxt_atoms=False,
rm_zero_occ_atoms=False,
colored=False,
map_nonstd_res=False,
assign_elem=True)
# Cleanup a copy of the structures
Molck(entity, conop.GetDefaultLib(), molck_settings)
# restore old loglevel and return
ost.PopVerbosityLevel()
entity.SetName(structure_path)
return entity
def _LoadLigands(ligands):
"""
Load a list of ligands from file names. Return a list of entities oif the
same size.
"""
if ligands is None:
return None
else:
return [_LoadLigand(lig) for lig in ligands]
def _LoadLigand(file):
"""
Load a single ligand from file names. Return an entity.
"""
ligand_ent = ost.io.LoadEntity(file, format="sdf")
ligand_ent.Select("ele != H")
return ligand_ent.Copy()
def _Validate(structure, ligands, legend):
"""Validate the structure.
If fault_tolerant is True, only warns in case of problems. If False,
raise them as ValueErrors.
At the moment this chiefly checks for ligands in polymers
"""
if ligands is not None:
for residue in structure.residues:
if residue.is_ligand:
warnings.warn("Ligand residue %s found in %s polymer structure" %(
residue.qualified_name, legend
), UserWarning)
def _Process(model, model_ligands, reference, reference_ligands, args):
scorer = ligand_scoring.LigandScorer(
model=model,
target=reference,
model_ligands=model_ligands,
target_ligands=reference_ligands,
resnum_alignments=args.residue_number_alignment,
)
out = dict()
if model_ligands is not None:
# Replace model ligand by path
assert len(model_ligands) == len(scorer.model_ligands)
# Map ligand => path
model_ligands_map = {k: v for k, v in zip(scorer.model_ligands,
args.model_ligands)}
out["model_ligands"] = args.model_ligands
else:
model_ligands_map = {l: "%s.%s" % (l.chain.name, l.number)
for l in scorer.model_ligands}
out["model_ligands"] = list(model_ligands_map.values())
if reference_ligands is not None:
# Replace reference ligand by path
assert len(reference_ligands) == len(scorer.target_ligands)
# Map ligand => path
reference_ligands_map = {k: v for k, v in zip(scorer.target_ligands,
args.reference_ligands)}
out["reference_ligands"] = args.reference_ligands
else:
reference_ligands_map = {l: "%s.%s" % (l.chain.name, l.number)
for l in scorer.target_ligands}
out["reference_ligands"] = list(reference_ligands_map.values())
if args.lddt_pli:
out["lddt_pli"] = {}
for chain, lddt_pli_results in scorer.lddt_pli_details.items():
for _, lddt_pli in lddt_pli_results.items():
model_key = model_ligands_map[lddt_pli["model_ligand"]]
lddt_pli["reference_ligand"] = reference_ligands_map[
lddt_pli.pop("target_ligand")]
lddt_pli["model_ligand"] = model_key
lddt_pli["transform"] = str(lddt_pli["transform"])
out["lddt_pli"][model_key] = lddt_pli
if args.rmsd:
out["rmsd"] = {}
for chain, rmsd_results in scorer.rmsd_details.items():
for _, rmsd in rmsd_results.items():
model_key = model_ligands_map[rmsd["model_ligand"]]
rmsd["reference_ligand"] = reference_ligands_map[
rmsd.pop("target_ligand")]
rmsd["model_ligand"] = model_key
rmsd["transform"] = str(rmsd["transform"])
out["rmsd"][model_key] = rmsd
return out
def _Main():
args = _ParseArgs()
ost.PushVerbosityLevel(args.verbosity)
try:
# Load structures
reference = _LoadStructure(args.reference,
format=args.reference_format,
fault_tolerant = args.fault_tolerant)
model = _LoadStructure(args.model, format=args.model_format,
fault_tolerant = args.fault_tolerant)
# Load ligands
model_ligands = _LoadLigands(args.model_ligands)
reference_ligands = _LoadLigands(args.reference_ligands)
# Validate
_Validate(model, model_ligands, "model")
_Validate(reference, reference_ligands, "reference")
out = _Process(model, model_ligands, reference, reference_ligands, args)
out["status"] = "SUCCESS"
with open(args.output, 'w') as fh:
json.dump(out, fh, indent=4, sort_keys=False)
except Exception:
out = dict()
out["status"] = "FAILURE"
out["traceback"] = traceback.format_exc()
with open(args.output, 'w') as fh:
json.dump(out, fh, indent=4, sort_keys=False)
raise
if __name__ == '__main__':
_Main()
......@@ -68,7 +68,7 @@ class LigandScorer:
molck_settings = MolckSettings(rm_unk_atoms=True,
rm_non_std=False,
rm_hyd_atoms=True,
rm_oxt_atoms=True,
rm_oxt_atoms=False,
rm_zero_occ_atoms=False,
colored=False,
map_nonstd_res=False,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment