From 5cfda76cf845784afdab7e87ef79c6556f61f17a Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Tue, 10 Dec 2024 11:50:36 +0100
Subject: [PATCH] ligand scoring: refactoring of input processing

Key differences:
 - LigandScorer does not automatically extract non-polymer ligands
   anymore, ligand extraction must happen externally
 - LigandScorer performs cleanup of receptor structure 1) removal of
   hydrogens 2) removal of residues for which there is no entry in
   the PDB component dictionary 3) removal of residues that are not
   peptide linking or nucleotide linking according to PDB component
   dictionary 4) removal of atoms that are not defined for the
   respective entry in the PDB component dictionary.
 - Cleanup is Logged and available as output
 - compare-ligand-structures action does not support automatic
   extraction of ligands from PDB structures anymore. This feature
   still works if structures are provided as mmCIF.
 - compare-ligand-structures action adds receptor structure cleanup
   logs and input parameter in json output to improve
   reproducibility of results.
---
 actions/ost-compare-ligand-structures         | 212 ++---
 modules/doc/actions.rst                       | 108 +--
 modules/mol/alg/doc/ligand_scoring.rst        |   6 +
 modules/mol/alg/pymod/ligand_scoring_base.py  | 794 ++++++++++++------
 .../mol/alg/pymod/ligand_scoring_lddtpli.py   |   5 +-
 .../mol/alg/pymod/ligand_scoring_scrmsd.py    |   5 +-
 modules/mol/alg/tests/test_ligand_scoring.py  | 412 +++++----
 7 files changed, 960 insertions(+), 582 deletions(-)

diff --git a/actions/ost-compare-ligand-structures b/actions/ost-compare-ligand-structures
index 3c58acd59..3cff1249a 100644
--- a/actions/ost-compare-ligand-structures
+++ b/actions/ost-compare-ligand-structures
@@ -8,47 +8,58 @@ Example: ost compare-ligand-structures \\
     --lddt-pli --rmsd
 
 Structures of polymer entities (proteins and nucleotides) can be given in PDB
-or mmCIF format.
+or mmCIF format. In case of PDB format, the full loaded structure undergoes
+processing described below. In case of mmCIF format, chains representing
+"polymer" entities according to _entity.type are selected and further processed
+as described below.
+
+Structure cleanup is heavily based on the PDB component dictionary and performs
+1) removal of hydrogens, 2) removal of residues for which there is no entry in
+component dictionary, 3) removal of residues that are not peptide linking or
+nucleotide linking according to the component dictionary 4) removal of atoms
+that are not defined for respective residues in the component dictionary. Except
+step 1), every cleanup is logged and a report is available in the json outfile.
 
 Ligands can be given as path to SDF files containing the ligand for both model
 (--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted,
-ligands will be detected in the model and reference structures. For structures
-given in mmCIF format, this is based on the annotation as "non polymer entity"
-(i.e. ligands in the _pdbx_entity_nonpoly mmCIF category) and works reliably.
-For structures given in legacy PDB format, this is based on the HET records
-which is usually only set properly on files downloaded from the PDB (and even
-then, this is not always the case). This is normally not what you want. You
-should always give ligands as SDF for structures in legacy PDB format.
-
-Polymer/oligomeric ligands (saccharides, peptides, nucleotides) are not
-supported.
-
-Only minimal cleanup steps are performed (remove hydrogens and deuteriums,
-and for structures of polymers only, remove unknown atoms and cleanup element
-column).
-
-Ligands in mmCIF and PDB files must comply with the PDB component dictionary
-definition, and have properly named residues and atoms, in order for
-ligand connectivity to be loaded correctly. Ligands loaded from SDF files
-are exempt from this restriction, meaning any arbitrary ligand can be assessed.
+ligands are optionally detected from a structure file if it is given in mmCIF
+format. This is based on "non-polymer" _entity.type annotation and the
+respective entries must exist in the PDB component dictionary in order to get
+connectivity information. For example, receptor structure and ligand(s) are
+loaded from the same mmCIF file given as '-m'/'-r'. This does not work for
+structures provided in PDB format and an error is raised if ligands are not
+explitely given in SDF format.
+
+Ligands undergo gentle processing where hydrogens are removed. Connectivity
+is relevant for scoring. It is read directly from SDF input. If ligands are
+extracted from mmCIF, connectivity is derived from the PDB component
+dictionary. Polymer/oligomeric ligands (saccharides, peptides, nucleotides)
+are not supported.
 
 Output can be written in two format: JSON (default) or CSV, controlled by the
 --output-format/-of argument.
 
-Without additional options, the JSON ouput is a dictionary with four keys:
+Without additional options, the JSON ouput is a dictionary with the following
+keys:
 
  * "model_ligands": A list of ligands in the model. If ligands were provided
    explicitly with --model-ligands, elements of the list will be the paths to
    the ligand SDF file(s). Otherwise, they will be the chain name, residue
    number and insertion code of the ligand, separated by a dot.
- * "reference_ligands": A list of ligands in the reference. If ligands were
-   provided explicitly with --reference-ligands, elements of the list will be
-   the paths to the ligand SDF file(s). Otherwise, they will be the chain name,
-   residue number and insertion code of the ligand, separated by a dot.
+ * "reference_ligands": Same for reference ligands.
  * "status": SUCCESS if everything ran through. In case of failure, the only
    content of the JSON output will be \"status\" set to FAILURE and an
    additional key: "traceback".
  * "ost_version": The OpenStructure version used for computation.
+ * "model_cleanup_log": Lists residues/atoms that have been removed in model
+   cleanup process.
+ * "reference_cleanup_log": Same for reference. 
+ * "reference": Parameter provided for --reference/-r
+ * "model": Parameter provided for --model/-m
+ * "resnum_alignments": Parameter provided for --residue-number-alignment/-rna
+ * "substructure_match": Parameter provided for --substructure-match/-sm
+ * "coverage_delta": Parameter provided for --coverage-delta/-cd
+ * "max_symmetries": Parameter provided for --max-symmetries/-ms 
 
 Each score is opt-in and the respective results are available in three keys:
 
@@ -368,7 +379,7 @@ def _ParseArgs():
 
     parser.add_argument(
         "-ms",
-        "--max--symmetries",
+        "--max-symmetries",
         dest="max_symmetries",
         default=1e4,
         type=int,
@@ -461,51 +472,17 @@ def _LoadLigands(ligands):
 def _LoadLigand(file):
     """
     Load a single ligand from file names. Return an entity.
+    Removes hydrogens.
     """
     ent = ost.io.LoadEntity(file, format="sdf")
+    ent = ost.mol.CreateEntityFromView(ent.Select(
+        "ele != H and ele != D"), include_exlusive_atoms=False)
     ed = ent.EditXCS()
     ed.RenameChain(ent.chains[0], file)
     ed.UpdateICS()
     return ent
 
 
-def _CleanupStructure(entity):
-    """Cleans up the structure.
-    Currently only removes hydrogens (and deuterium atoms).
-    """
-    return ost.mol.CreateEntityFromView(entity.Select(
-        "ele != H and ele != D"), include_exlusive_atoms=False)
-
-
-def _CleanupLigands(ligands):
-    """Clean up a list of structures.
-    """
-    if ligands is None:
-        return None
-    else:
-        return [_CleanupStructure(lig) for lig in ligands]
-
-
-def _Validate(structure, ligands, legend, fault_tolerant=False):
-    """Validate the structure.
-
-    If fault_tolerant is True, only warns in case of problems. If False,
-    raise them as ValueErrors.
-
-    At the moment this chiefly checks if ligands are in the structure and are
-    given explicitly at the same time.
-    """
-    if ligands is not None:
-        for residue in structure.residues:
-            if residue.is_ligand:
-                msg = "Ligand residue %s found in %s polymer structure" %(
-                    residue.qualified_name, legend)
-                if fault_tolerant:
-                    ost.LogWarning(msg)
-                else:
-                    raise ValueError(msg)
-
-
 def _QualifiedResidueNotation(r):
     """Return a parsable string of the residue in the format:
     ChainName.ResidueNumber.InsertionCode."""
@@ -516,6 +493,46 @@ def _QualifiedResidueNotation(r):
         ins_code=resnum.ins_code.strip("\u0000"),
     )
 
+
+def _LoadStructureData(receptor_path,
+                       ligand_path,
+                       sformat = None,
+                       bu_id = None,
+                       fault_tolerant = False):
+
+    receptor = None
+    ligands = None
+    receptor_format = _GetStructureFormat(receptor_path, sformat = sformat)
+
+    if receptor_format == "pdb":
+        if ligand_path is None:
+            raise RuntimeError(f"Must provide ligand as SDF file(s) when "
+                               f"receptor ({receptor_path}) is given in PDB "
+                               f"format.")
+        if bu_id is not None:
+            raise RuntimeError(f"Cannot specify biounit ({bu_id}) for receptor "
+                               f"in PDB format ({receptor_path})")
+        receptor = ligand_scoring_base.PDBPrep(receptor_path)
+        ligands = _LoadLigands(ligand_path)
+
+    elif receptor_format == "mmcif":
+        if ligand_path is None:
+            receptor, ligands = ligand_scoring_base.MMCIFPrep(receptor_path,
+                                                              biounit = bu_id,
+                                                              extract_nonpoly = True,
+                                                              fault_tolerant = fault_tolerant)
+        else:
+            receptor = ligand_scoring_base.MMCIFPrep(receptor_path,
+                                                     biounit = bu_id,
+                                                     extract_nonpoly = False,
+                                                     fault_tolerant = fault_tolerant)
+            ligands = _LoadLigands(ligand_path)
+    else:
+        raise RuntimeError("This should never happen")
+
+    return (receptor, ligands)
+
+
 def _SetupLDDTPLIScorer(model, model_ligands, reference, reference_ligands, args):
     return ligand_scoring_lddtpli.LDDTPLIScorer(model, reference,
                                                 model_ligands = model_ligands,
@@ -580,7 +597,7 @@ def _Process(model, model_ligands, reference, reference_ligands, args):
     # Extract / Map ligand information #
     ####################################
 
-    if model_ligands is not None:
+    if args.model_ligands is not None:
         # Replace model ligand by path
         if len(model_ligands) == len(scorer.model_ligands):
             # Map ligand => path
@@ -602,7 +619,7 @@ def _Process(model, model_ligands, reference, reference_ligands, args):
         # Map ligand => qualified residue
         out["model_ligands"] = [_QualifiedResidueNotation(l) for l in scorer.model_ligands]
 
-    if reference_ligands is not None:
+    if args.reference_ligands is not None:
         # Replace reference ligand by path
         if len(reference_ligands) == len(scorer.target_ligands):
             # Map ligand => path
@@ -774,6 +791,16 @@ def _Process(model, model_ligands, reference, reference_ligands, args):
                                                             "reference_ligand": target_key,
                                                             "reason": reason})
 
+    # add cleanup logs and parameters relevant for reproducibility
+    out["model"] = args.model
+    out["reference"] = args.reference
+    out["model_cleanup_log"] = scorer.model_cleanup_log
+    out["reference_cleanup_log"] = scorer.target_cleanup_log
+    out["resnum_alignments"] = scorer.resnum_alignments
+    out["substructure_match"] = scorer.substructure_match
+    out["coverage_delta"] = scorer.coverage_delta
+    out["max_symmetries"] = scorer.max_symmetries
+
     return out
 
 def _WriteCSV(out, args):
@@ -859,46 +886,21 @@ def _Main():
     try:
         # Load structures
         LogScript("Loading data")
-        LogInfo("Loading reference structure")
-        reference_format = _GetStructureFormat(args.reference,
-                                               sformat=args.reference_format)
-        reference = _LoadStructure(args.reference,
-                                   sformat=reference_format,
-                                   bu_id=args.reference_biounit,
-                                   fault_tolerant = args.fault_tolerant)
-        LogInfo("Loading model structure")
-        model_format = _GetStructureFormat(args.model,
-                                           sformat=args.model_format)
-        model = _LoadStructure(args.model,
-                               sformat=model_format,
-                               bu_id=args.model_biounit,
-                               fault_tolerant = args.fault_tolerant)
-
-        # Load ligands
-        LogInfo("Loading reference ligands")
-        reference_ligands = _LoadLigands(args.reference_ligands)
-        LogInfo("Loading model ligands")
-        model_ligands = _LoadLigands(args.model_ligands)
-
-        # Cleanup
-        LogVerbose("Cleaning up reference structure")
-        cleaned_reference = _CleanupStructure(reference)
-        LogVerbose("Cleaning up model structure")
-        cleaned_model = _CleanupStructure(model)
-        LogVerbose("Cleaning up reference ligands")
-        cleaned_reference_ligands = _CleanupLigands(reference_ligands)
-        LogVerbose("Cleaning up model ligands")
-        cleaned_model_ligands = _CleanupLigands(model_ligands)
-
-        # Validate
-        _Validate(cleaned_model, cleaned_model_ligands, "model",
-                  fault_tolerant = args.fault_tolerant)
-        _Validate(cleaned_reference, cleaned_reference_ligands, "reference",
-                  fault_tolerant = args.fault_tolerant)
-
-        out = _Process(cleaned_model, cleaned_model_ligands,
-                       cleaned_reference, cleaned_reference_ligands,
-                       args)
+        LogInfo("Loading reference data")
+        reference, reference_ligands = _LoadStructureData(args.reference,
+                                                          args.reference_ligands,
+                                                          sformat = args.reference_format,
+                                                          bu_id = args.reference_biounit,
+                                                          fault_tolerant = args.fault_tolerant)
+
+        LogInfo("Loading model data")
+        model, model_ligands = _LoadStructureData(args.model,
+                                                  args.model_ligands,
+                                                  sformat = args.model_format,
+                                                  bu_id = args.model_biounit,
+                                                  fault_tolerant = args.fault_tolerant)
+
+        out = _Process(model, model_ligands, reference, reference_ligands, args)
 
         out["ost_version"] = ost.__version__
         out["status"] = "SUCCESS"
diff --git a/modules/doc/actions.rst b/modules/doc/actions.rst
index 56afd04dc..f9e330f96 100644
--- a/modules/doc/actions.rst
+++ b/modules/doc/actions.rst
@@ -475,7 +475,7 @@ Comparing two structures with ligands
 You can compare two structures with non-polymer/small molecule ligands and
 compute lDDT-PLI and ligand RMSD scores from the command line with the
 ``ost compare-ligand-structures`` action. This can be considered a command
-line interface to :class:`ost.mol.alg.ligand_scoring.LigandScorer` and more
+line interface to :class:`ost.mol.alg.ligand_scoring_base.LigandScorer` and more
 information about arguments and outputs can be found there.
 
 Details on the usage (output of ``ost compare-ligand-structures --help``):
@@ -500,106 +500,117 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                                        [--radius RADIUS]
                                        [--lddt-lp-radius LDDT_LP_RADIUS] [-fbs]
                                        [-ms MAX_SYMMETRIES]
-  
+
   Evaluate model with non-polymer/small molecule ligands against reference.
-  
+
   Example: ost compare-ligand-structures \
       -m model.pdb \
       -ml ligand.sdf \
       -r reference.cif \
       --lddt-pli --rmsd
-  
+
   Structures of polymer entities (proteins and nucleotides) can be given in PDB
-  or mmCIF format.
-  
+  or mmCIF format. In case of PDB format, the full loaded structure undergoes
+  processing described below. In case of mmCIF format, chains representing
+  "polymer" entities according to _entity.type are selected and further processed
+  as described below.
+
+  Structure cleanup is heavily based on the PDB component dictionary and performs
+  1) removal of hydrogens, 2) removal of residues for which there is no entry in
+  component dictionary, 3) removal of residues that are not peptide linking or
+  nucleotide linking according to the component dictionary 4) removal of atoms
+  that are not defined for respective residues in the component dictionary. Except
+  step 1), every cleanup is logged and a report is available in the json outfile.
+
   Ligands can be given as path to SDF files containing the ligand for both model
   (--model-ligands/-ml) and reference (--reference-ligands/-rl). If omitted,
-  ligands will be detected in the model and reference structures. For structures
-  given in mmCIF format, this is based on the annotation as "non polymer entity"
-  (i.e. ligands in the _pdbx_entity_nonpoly mmCIF category) and works reliably.
-  For structures given in legacy PDB format, this is based on the HET records
-  which is usually only set properly on files downloaded from the PDB (and even
-  then, this is not always the case). This is normally not what you want. You
-  should always give ligands as SDF for structures in legacy PDB format.
-  
-  Polymer/oligomeric ligands (saccharides, peptides, nucleotides) are not
-  supported.
-  
-  Only minimal cleanup steps are performed (remove hydrogens and deuteriums,
-  and for structures of polymers only, remove unknown atoms and cleanup element
-  column).
-  
-  Ligands in mmCIF and PDB files must comply with the PDB component dictionary
-  definition, and have properly named residues and atoms, in order for
-  ligand connectivity to be loaded correctly. Ligands loaded from SDF files
-  are exempt from this restriction, meaning any arbitrary ligand can be assessed.
-  
+  ligands are optionally detected from a structure file if it is given in mmCIF
+  format. This is based on "non-polymer" _entity.type annotation and the
+  respective entries must exist in the PDB component dictionary in order to get
+  connectivity information. For example, receptor structure and ligand(s) are
+  loaded from the same mmCIF file given as '-m'/'-r'. This does not work for
+  structures provided in PDB format and an error is raised if ligands are not
+  explitely given in SDF format.
+
+  Ligands undergo gentle processing where hydrogens are removed. Connectivity
+  is relevant for scoring. It is read directly from SDF input. If ligands are
+  extracted from mmCIF, connectivity is derived from the PDB component
+  dictionary. Polymer/oligomeric ligands (saccharides, peptides, nucleotides)
+  are not supported.
+
   Output can be written in two format: JSON (default) or CSV, controlled by the
   --output-format/-of argument.
-  
-  Without additional options, the JSON ouput is a dictionary with four keys:
-  
+
+  Without additional options, the JSON ouput is a dictionary with the following
+  keys:
+
    * "model_ligands": A list of ligands in the model. If ligands were provided
      explicitly with --model-ligands, elements of the list will be the paths to
      the ligand SDF file(s). Otherwise, they will be the chain name, residue
      number and insertion code of the ligand, separated by a dot.
-   * "reference_ligands": A list of ligands in the reference. If ligands were
-     provided explicitly with --reference-ligands, elements of the list will be
-     the paths to the ligand SDF file(s). Otherwise, they will be the chain name,
-     residue number and insertion code of the ligand, separated by a dot.
+   * "reference_ligands": Same for reference ligands.
    * "status": SUCCESS if everything ran through. In case of failure, the only
      content of the JSON output will be "status" set to FAILURE and an
      additional key: "traceback".
    * "ost_version": The OpenStructure version used for computation.
-  
+   * "model_cleanup_log": Lists residues/atoms that have been removed in model
+     cleanup process.
+   * "reference_cleanup_log": Same for reference.
+   * "reference": Parameter provided for --reference/-r
+   * "model": Parameter provided for --model/-m
+   * "resnum_alignments": Parameter provided for --residue-number-alignment/-rna
+   * "substructure_match": Parameter provided for --substructure-match/-sm
+   * "coverage_delta": Parameter provided for --coverage-delta/-cd
+   * "max_symmetries": Parameter provided for --max-symmetries/-ms 
+
   Each score is opt-in and the respective results are available in three keys:
-  
+
    * "assigned_scores": A list with data for each pair of assigned ligands.
      Data is yet another dict containing score specific information for that
      ligand pair. The following keys are there in any case:
-  
+
       * "model_ligand": The model ligand
       * "reference_ligand": The target ligand to which model ligand is assigned to
       * "score": The score
       * "coverage": Fraction of model ligand atoms which are covered by target
         ligand. Will only deviate from 1.0 if --substructure-match is enabled.
-  
+
    * "model_ligand_unassigned_reason": Dictionary with unassigned model ligands
      as key and an educated guess why this happened.
-  
+
    * "reference_ligand_unassigned_reason": Dictionary with unassigned target ligands
      as key and an educated guess why this happened.
-  
+
   If --full-results is enabled, another element with key "full_results" is added.
   This is a list of data items for each pair of model/reference ligands. The data
   items follow the same structure as in "assigned_scores". If no score for a
   specific pair of ligands could be computed, "score" and "coverage" are set to
   null and a key "reason" is added giving an educated guess why this happened.
-  
+
   CSV output is a table of comma-separated values, with one line for each
   reference ligand (or one model ligand if the --by-model-ligand-output flag was
   set).
-  
+
   The following column is always available:
-  
+
    * reference_ligand/model_ligand: If reference ligands were provided explicitly
      with --reference-ligands, elements of the list will be the paths to the
      ligand SDF file(s). Otherwise, they will be the chain name, residue number
      and insertion code of the ligand, separated by a dot. If the
      --by-model-ligand-output flag was set, this will be model ligand instead,
      following the same rules.
-  
+
   If lDDT-PLI was enabled with --lddt-pli, the following columns are added:
-  
+
    * "lddt_pli", "lddt_pli_coverage" and "lddt_pli_(model|reference)_ligand"
      are the lDDT-PLI score result, the corresponding coverage and assigned model
      ligand (or reference ligand if the --by-model-ligand-output flag was set)
      if an assignment was found, respectively, empty otherwise.
    * "lddt_pli_unassigned" is empty if an assignment was found, otherwise it
      lists the short reason this reference ligand was unassigned.
-  
+
   If BiSyRMSD was enabled with --rmsd, the following columns are added:
-  
+
    * "rmsd", "rmsd_coverage". "lddt_lp" "bb_rmsd" and
      "rmsd_(model|reference)_ligand" are the BiSyRMSD, the corresponding
      coverage, lDDT-LP, backbone RMSD and assigned model ligand (or reference
@@ -607,7 +618,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
      was found, respectively, empty otherwise.
    * "rmsd_unassigned" is empty if an assignment was found, otherwise it
      lists the short reason this reference ligand was unassigned.
-  
+
   options:
     -h, --help            show this help message and exit
     -m MODEL, --mdl MODEL, --model MODEL
@@ -687,8 +698,7 @@ Details on the usage (output of ``ost compare-ligand-structures --help``):
                           Enumerate all potential binding sites in the model
                           when searching rigid superposition for RMSD
                           computation
-    -ms MAX_SYMMETRIES, --max--symmetries MAX_SYMMETRIES
+    -ms MAX_SYMMETRIES, --max-symmetries MAX_SYMMETRIES
                           If more than that many isomorphisms exist for a
                           target-ligand pair, it will be ignored and reported as
                           unassigned.
-
diff --git a/modules/mol/alg/doc/ligand_scoring.rst b/modules/mol/alg/doc/ligand_scoring.rst
index 79fcb8dce..e6a98d8bd 100644
--- a/modules/mol/alg/doc/ligand_scoring.rst
+++ b/modules/mol/alg/doc/ligand_scoring.rst
@@ -1,6 +1,12 @@
 :mod:`~ost.mol.alg.ligand_scoring` -- Ligand scoring functions
 ------------------------------------------------------------------------------------------
 
+.. note ::
+  Extra requirements:
+
+  - Python modules `numpy` and `networkx` must be available
+    (e.g. use ``pip install numpy networkx``)
+
 .. module:: ost.mol.alg.ligand_scoring
    :synopsis: Ligand scoring functions
 
diff --git a/modules/mol/alg/pymod/ligand_scoring_base.py b/modules/mol/alg/pymod/ligand_scoring_base.py
index 45376fde5..f3d95972c 100644
--- a/modules/mol/alg/pymod/ligand_scoring_base.py
+++ b/modules/mol/alg/pymod/ligand_scoring_base.py
@@ -2,7 +2,10 @@ from contextlib import contextmanager
 import numpy as np
 import networkx
 
+import ost
+from ost import io
 from ost import mol
+from ost import conop
 from ost import LogWarning, LogScript, LogInfo, LogVerbose, LogDebug, GetVerbosityLevel, PushVerbosityLevel, PopVerbosityLevel
 from ost.mol.alg import chain_mapping
 
@@ -25,14 +28,193 @@ def _SinkVerbosityLevel(n=1):
         PopVerbosityLevel()
 
 
-class LigandScorer:
-    """ Scorer to compute various small molecule ligand (non polymer) scores.
+def _QualifiedAtomNotation(a):
+    """Return a parsable string of the atom in the format:
+    ChainName.ResidueNumber.InsertionCode.AtomName."""
+    resnum = a.residue.number
+    return "{cname}.{rnum}.{ins_code}.{aname}".format(
+        cname=a.chain.name,
+        rnum=resnum.num,
+        ins_code=resnum.ins_code.strip("\u0000"),
+        aname=a.name,
+    )
+
+
+def _QualifiedResidueNotation(r):
+    """Return a parsable string of the residue in the format:
+    ChainName.ResidueNumber.InsertionCode."""
+    resnum = r.number
+    return "{cname}.{rnum}.{ins_code}".format(
+        cname=r.chain.name,
+        rnum=resnum.num,
+        ins_code=resnum.ins_code.strip("\u0000"),
+    )
+
+
+def CleanHydrogens(ent, clib):
+    """ Ligand scoring helper - Returns copy of *ent* without hydrogens
+
+    Non-standard hydrogen naming can cause trouble in residue property
+    assignment which is done by the :class:`ost.conop.RuleBasedProcessor` when
+    loading. In fact, residue property assignment is not done for every residue
+    that has unknown atoms according to the chemical component dictionary. This
+    function therefore re-processes the entity after removing hydrogens.
+
+    :param ent: Entity to clean
+    :type ent: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
+    :param clib: Compound library to perform re-processing after hydrogen
+                 removal.
+    :type clib: :class:`ost.conop.CompoundLib`
+    :returns: Cleaned and re-processed ent
+    """
+    cleaned_ent = mol.CreateEntityFromView(ent.Select(
+        "ele != H and ele != D"), include_exlusive_atoms=False)
+    # process again to set missing residue properties due to non standard
+    # hydrogens
+    processor = conop.RuleBasedProcessor(clib)
+    processor.Process(cleaned_ent)
+    return cleaned_ent
+
+
+def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
+              fault_tolerant=False):
+    """ Ligand scoring helper - Prepares :class:`LigandScorer` input from mmCIF
+
+    Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated
+    to :class:`LigandScorer`.
+
+    :param mmcif_path: Path to mmCIF file that contains polymer and optionally
+                       non-polymer entities
+    :type mmcif_path: :class:`str`
+    :param biounit: If given, construct specified biounit from mmCIF AU
+    :type biounit: :class:`str`
+    :param extract_nonpoly: Additionally returns a list of
+                            :class:`ost.mol.EntityHandle`
+                            objects representing all non-polymer (ligand)
+                            entities.
+    :type extract_nonpoly: :class:`bool`
+    :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF`
+    :type fault_tolerant: :class:`bool`
+    :returns: :class:`ost.mol.EntityHandle` which only contains polymer
+              entities representing the receptor structure. If *extract_nonpoly*
+              is True, a tuple is returned which additionally contains a
+              :class:`list` of :class:`ost.mol.EntityHandle`, where each
+              :class:`ost.mol.EntityHandle` represents a non-polymer (ligand).
+    """
+    clib = conop.GetDefaultLib()
+    if not clib:
+        ost.LogError("A compound library is required. "
+                     "Please refer to the OpenStructure website: "
+                     "https://openstructure.org/docs/conop/compoundlib/.")
+        raise RuntimeError("No compound library found")
+
+    mmcif_entity, mmcif_info = io.LoadMMCIF(mmcif_path, info=True,
+                                            fault_tolerant=fault_tolerant)
+    mmcif_entity = CleanHydrogens(mmcif_entity, clib)
+
+    # get AU chain names representing polymer entities
+    polymer_entity_ids = mmcif_info.GetEntityIdsOfType("polymer")
+    polymer_chain_names = list()
+    for ch in mmcif_entity.chains:
+        if mmcif_info.GetMMCifEntityIdTr(ch.name) in polymer_entity_ids:
+            polymer_chain_names.append(ch.name)
+
+    # get AU chain names representing non-polymer entities
+    non_polymer_entity_ids = mmcif_info.GetEntityIdsOfType("non-polymer")
+    non_polymer_chain_names = list()
+    for ch in mmcif_entity.chains:
+        if mmcif_info.GetMMCifEntityIdTr(ch.name) in non_polymer_entity_ids:
+            non_polymer_chain_names.append(ch.name)
+
+    # construct biounit if necessary
+    if biounit is not None:
+        biounit_found = False
+        for bu in mmcif_info.biounits:
+            if bu.id == biounit:
+                mmcif_entity = mol.alg.CreateBU(mmcif_entity, bu)
+                biounit_found = True
+                break
+        if not biounit_found:
+            raise RuntimeError(f"Specified biounit '{biounit}' not in "
+                               f"{mmcif_path}")
+
+    # assign generic properties for selection later on
+    non_poly_id = 0
+    for ch in mmcif_entity.chains:
+        cname = None
+        if biounit is not None:
+            # if a biounit is constructed, you get chain names like: 1.YOLO
+            # we cannot simply split by '.' since '.' is an allowed character
+            # in chain names. => split by first occurence
+            dot_index = ch.name.find('.')
+            if dot_index == -1:
+                cname = ch.name
+            else:
+                cname = ch.name[dot_index+1:]
+        else:
+            cname = ch.name
+        
+        if cname in polymer_chain_names:
+            ch.SetIntProp("poly", 1)
+        if cname in non_polymer_chain_names:
+            ch.SetIntProp("nonpolyid", non_poly_id)
+            non_poly_id += 1
+
+    poly_sel = mmcif_entity.Select("gcpoly:0=1")
+    poly_ent = mol.CreateEntityFromView(poly_sel, True)
+
+    if extract_nonpoly == False:
+        return poly_ent
+
+    non_poly_sel = mmcif_entity.Select("gcnonpoly:0=1")
+    non_poly_entities = list()
+    for i in range(non_poly_id):
+        view = mmcif_entity.Select(f"gcnonpolyid:{non_poly_id}={i}")
+        if view.GetResidueCount() != 1:
+            raise RuntimeError(f"Expect non-polymer entities in "
+                               f"{mmcif_path} to contain exactly 1 "
+                               f"residue. Got {ch.GetResidueCount()} "
+                               f"in chain {ch.name}")
+        compound = clib.FindCompound(view.residues[0].name)
+        if compound is None:
+            raise RuntimeError(f"Can only extract non-polymer entities if "
+                               f"respective residues are available in PDB "
+                               f"component dictionary. Can't find "
+                               f"\"{view.residues[0].name}\"")
+
+        non_poly_entities.append(mol.CreateEntityFromView(view, True))
+
+    return (poly_ent, non_poly_entities)
+
+
+def PDBPrep(pdb_path, fault_tolerant=False):
+    """ Ligand scoring helper - Prepares :class:`LigandScorer` input from PDB
+
+    Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated
+    to :class:`LigandScorer`. There is no logic to extract ligands from PDB
+    files. Ligands must be provided separately as SDF files in these cases.
+
+    :param pdb_path: Path to PDB file that contains polymer entities
+    :type pdb_path: :class:`str`
+    :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadPDB`
+    :type fault_tolerant: :class:`bool`
+    :returns: :class:`EntityHandle` from loaded file.
+    """
+    clib = conop.GetDefaultLib()
+    if not clib:
+        ost.LogError("A compound library is required. "
+                     "Please refer to the OpenStructure website: "
+                     "https://openstructure.org/docs/conop/compoundlib/.")
+        raise RuntimeError("No compound library found")
+
+    pdb_entity = io.LoadPDB(pdb_path, fault_tolerant=fault_tolerant)
+    pdb_entity = CleanHydrogens(pdb_entity, clib)
+
+    return pdb_entity
 
-    .. note ::
-      Extra requirements:
 
-      - Python modules `numpy` and `networkx` must be available
-        (e.g. use ``pip install numpy networkx``)
+class LigandScorer:
+    """ Scorer to compute various small molecule ligand (non polymer) scores.
 
     :class:`LigandScorer` is an abstract base class dealing with all the setup,
     data storage, enumerating ligand symmetries and target/model ligand
@@ -84,26 +266,8 @@ class LigandScorer:
 
     Assumptions:
 
-    :class:`LigandScorer` generally assumes that the
-    :attr:`~ost.mol.ResidueHandle.is_ligand` property is properly set on all
-    the ligand residues, and only ligand atoms. This is typically the case for
-    entities loaded from mmCIF (tested with mmCIF files from the PDB and
-    SWISS-MODEL). Legacy PDB files must contain `HET` headers (which is usually
-    the case for files downloaded from the PDB but not elsewhere).
-
-    The class doesn't perform any cleanup of the provided structures.
-    It is up to the caller to ensure that the data is clean and suitable for
-    scoring. :ref:`Molck <molck>` should be used with extra
-    care, as many of the options (such as `rm_non_std` or `map_nonstd_res`) can
-    cause ligands to be removed from the structure. If cleanup with Molck is
-    needed, ligands should be kept aside and passed separately. Non-ligand
-    residues should be valid compounds with atom names following the naming
-    conventions of the component dictionary. Non-standard residues are
-    acceptable, and if the model contains a standard residue at that position,
-    only atoms with matching names will be considered.
-
     Unlike most of OpenStructure, this class does not assume that the ligands
-    (either in the model or the target) are part of the PDB component
+    (either for the model or the target) are part of the PDB component
     dictionary. They may have arbitrary residue names. Residue names do not
     have to match between the model and the target. Matching is based on
     the calculation of isomorphisms which depend on the atom element name and
@@ -112,49 +276,31 @@ class LigandScorer:
     set before passing any ligands to this class. Ligands with improper
     connectivity will lead to bogus results.
 
-    Note, however, that atom names should be unique within a residue (ie two
-    distinct atoms cannot have the same atom name).
-
     This only applies to the ligand. The rest of the model and target
     structures (protein, nucleic acids) must still follow the usual rules and
-    contain only residues from the compound library.
-
-    Although it isn't a requirement, hydrogen atoms should be removed from the
-    structures. Here is an example code snippet that will perform a reasonable
-    cleanup. Keep in mind that this is most likely not going to work as
-    expected with entities loaded from PDB files, as the `is_ligand` flag is
-    probably not set properly.
+    contain only residues from the compound library. Structures are cleaned up
+    according to constructor documentation. We advise to
+    use the :func:`MMCIFPrep` and :func:`PDBPrep` for loading which already
+    clean hydrogens and, in the case of MMCIF, optionally extract ligands ready
+    to be used by the :class:`LigandScorer` based on "non-polymer" entity types.
+    In case of PDB file format, ligands must be loaded separately as SDF files.
 
-    Here is an example of how to use setup a scorer code::
+    Here is an example of how to setup a scorer::
 
         from ost.mol.alg.ligand_scoring_scrmsd import SCRMSDScorer
-        from ost.mol.alg import Molck, MolckSettings
 
         # Load data
         # Structure model in PDB format, containing the receptor only
-        model = io.LoadPDB("path_to_model.pdb")
+        model = PDBPrep("path_to_model.pdb")
         # Ligand model as SDF file
         model_ligand = io.LoadEntity("path_to_ligand.sdf", format="sdf")
         # Target loaded from mmCIF, containing the ligand
-        target = io.LoadMMCIF("path_to_target.cif")
-
-        # Cleanup a copy of the structures
-        cleaned_model = model.Copy()
-        cleaned_target = target.Copy()
-        molck_settings = MolckSettings(rm_unk_atoms=True,
-                                       rm_non_std=False,
-                                       rm_hyd_atoms=True,
-                                       rm_oxt_atoms=False,
-                                       rm_zero_occ_atoms=False,
-                                       colored=False,
-                                       map_nonstd_res=False,
-                                       assign_elem=True)
-        Molck(cleaned_model, conop.GetDefaultLib(), molck_settings)
-        Molck(cleaned_target, conop.GetDefaultLib(), molck_settings)
-
-        # Setup scorer object and compute lDDT-PLI
+        target, target_ligands = MMCIFPrep("path_to_target.cif",
+                                           extract_nonpoly=True)
+
+        # Setup scorer object and compute SCRMSD
         model_ligands = [model_ligand.Select("ele != H")]
-        sc = SCRMSDScorer(cleaned_model, cleaned_target, model_ligands)
+        sc = SCRMSDScorer(model, target, model_ligands, target_ligands)
 
         # Perform assignment and read respective scores
         for lig_pair in sc.assignment:
@@ -163,54 +309,45 @@ class LigandScorer:
             score = sc.score_matrix[lig_pair[0], lig_pair[1]]
             print(f"Score for {trg_lig} and {mdl_lig}: {score}")
 
+        # check cleanup in model and target structure:
+        print("model cleanup:", sc.model_cleanup_log)
+        print("target cleanup:", sc.target_cleanup_log)
+
     :param model: Model structure - a deep copy is available as :attr:`model`.
-                  No additional processing (ie. Molck), checks,
-                  stereochemistry checks or sanitization is performed on the
-                  input. Hydrogen atoms are kept.
+                  The model undergoes the following cleanup steps which are
+                  dependent on :class:`ost.conop.CompoundLib` returned by
+                  :func:`ost.conop.GetDefaultLib`: 1) removal
+                  of hydrogens, 2) removal of residues for which there is no
+                  entry in :class:`ost.conop.CompoundLib`, 3) removal of
+                  residues that are not peptide linking or nucleotide linking
+                  according to :class:`ost.conop.CompoundLib` 4) removal of
+                  atoms that are not defined for respective residues in
+                  :class:`ost.conop.CompoundLib`. Except step 1), every cleanup
+                  is logged with :class:`ost.LogLevel` Warning and a report is
+                  available as :attr:`model_cleanup_log`.
     :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
-    :param target: Target structure - a deep copy is available as
-                   :attr:`target`. No additional processing (ie. Molck), checks
-                   or sanitization is performed on the input. Hydrogen atoms are
-                   kept.
+    :param target: Target structure - same processing as *model*. 
     :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
     :param model_ligands: Model ligands, as a list of
-                          :class:`~ost.mol.ResidueHandle` belonging to the model
-                          entity. Can be instantiated with either a :class:list
-                          of :class:`~ost.mol.ResidueHandle`/
-                          :class:`ost.mol.ResidueView` or of
+                          :class:`ost.mol.ResidueHandle`/
+                          :class:`ost.mol.ResidueView`/
                           :class:`ost.mol.EntityHandle`/
-                          :class:`ost.mol.EntityView`.
-                          If `None`, ligands will be extracted based on the
-                          :attr:`~ost.mol.ResidueHandle.is_ligand` flag (this is
-                          normally set properly in entities loaded from mmCIF).
+                          :class:`ost.mol.EntityView`. For
+                          :class:`ost.mol.EntityHandle`/
+                          :class:`ost.mol.EntityView`, each residue is
+                          considered to be an individual ligand.
+                          All ligands are copied into a separate
+                          :class:`ost.mol.EntityHandle` available as
+                          :attr:`model_ligand_ent` and the respective
+                          list of ligands is available as :attr:`model_ligands`.
     :type model_ligands: :class:`list`
-    :param target_ligands: Target ligands, as a list of
-                           :class:`~ost.mol.ResidueHandle` belonging to the
-                           target entity. Can be instantiated either a
-                           :class:list of :class:`~ost.mol.ResidueHandle`/
-                           :class:`ost.mol.ResidueView` or of
-                           :class:`ost.mol.EntityHandle`/
-                           :class:`ost.mol.EntityView` containing a single
-                           residue each. If `None`, ligands will be extracted
-                           based on the :attr:`~ost.mol.ResidueHandle.is_ligand`
-                           flag (this is normally set properly in entities
-                           loaded from mmCIF).
+    :param target_ligands: Target ligands, same processing as model ligands. 
     :type target_ligands: :class:`list`
     :param resnum_alignments: Whether alignments between chemically equivalent
                               chains in *model* and *target* can be computed
                               based on residue numbers. This can be assumed in
                               benchmarking setups such as CAMEO/CASP.
     :type resnum_alignments: :class:`bool`
-    :param rename_ligand_chain: If a residue with the same chain name and
-                                residue number than an explicitly passed model
-                                or target ligand exits in the structure,
-                                and `rename_ligand_chain` is False, a
-                                RuntimeError will be raised. If
-                                `rename_ligand_chain` is True, the ligand will
-                                be moved to a new chain instead, and the move
-                                will be logged to the console with SCRIPT
-                                level.
-    :type rename_ligand_chain: :class:`bool`
     :param substructure_match: Set this to True to allow incomplete (i.e.
                                partially resolved) target ligands.
     :type substructure_match: :class:`bool`
@@ -222,52 +359,82 @@ class LigandScorer:
     :type max_symmetries: :class:`int`
     """
 
-    def __init__(self, model, target, model_ligands=None, target_ligands=None,
-                 resnum_alignments=False, rename_ligand_chain=False,
-                 substructure_match=False, coverage_delta=0.2,
-                 max_symmetries=1e5):
+    def __init__(self, model, target, model_ligands, target_ligands,
+                 resnum_alignments=False, substructure_match=False,
+                 coverage_delta=0.2, max_symmetries=1e5,
+                 rename_ligand_chain=False):
 
         if isinstance(model, mol.EntityView):
-            self.model = mol.CreateEntityFromView(model, False)
+            self._model = mol.CreateEntityFromView(model, False)
         elif isinstance(model, mol.EntityHandle):
-            self.model = model.Copy()
+            self._model = model.Copy()
         else:
             raise RuntimeError("model must be of type EntityView/EntityHandle")
 
         if isinstance(target, mol.EntityView):
-            self.target = mol.CreateEntityFromView(target, False)
+            self._target = mol.CreateEntityFromView(target, False)
         elif isinstance(target, mol.EntityHandle):
-            self.target = target.Copy()
+            self._target = target.Copy()
         else:
             raise RuntimeError("target must be of type EntityView/EntityHandle")
 
-        # Extract ligands from target
-        if target_ligands is None:
-            self.target_ligands = self._extract_ligands(self.target)
-        else:
-            self.target_ligands = self._prepare_ligands(self.target, target,
-                                                        target_ligands,
-                                                        rename_ligand_chain)
-        if len(self.target_ligands) == 0:
-            LogWarning("No ligands in the target")
+        clib = conop.GetDefaultLib()
+        if not clib:
+            ost.LogError("A compound library is required. "
+                         "Please refer to the OpenStructure website: "
+                         "https://openstructure.org/docs/conop/compoundlib/.")
+            raise RuntimeError("No compound library found")
+        self._target, self._target_cleanup_log = \
+        self._cleanup_polymer_ent(self._target, clib)
+        self._model, self._model_cleanup_log = \
+        self._cleanup_polymer_ent(self._model, clib)
+
+        # keep ligands separate from polymer entities
+        self._target_ligand_ent = mol.CreateEntity()
+        self._model_ligand_ent = mol.CreateEntity()
+        target_ligand_ent_ed = self._target_ligand_ent.EditXCS(mol.BUFFERED_EDIT)
+        model_ligand_ent_ed = self._model_ligand_ent.EditXCS(mol.BUFFERED_EDIT)
+
+        self._target_ligands = list()
+        for l in target_ligands:
+            if isinstance(l, mol.EntityView) or isinstance(l, mol.EntityHandle):
+                for r in l.residues:
+                    self._target_ligands.append(self._copy_ligand(r, self._target_ligand_ent,
+                                                                  target_ligand_ent_ed,
+                                                                  rename_ligand_chain))
+            elif isinstance(l, mol.ResidueHandle) or isinstance(l, mol.ResidueView):
+                self._target_ligands.append(self._copy_ligand(l, self._target_ligand_ent,
+                                                              target_ligand_ent_ed,
+                                                              rename_ligand_chain))  
+            else:
+                raise RuntimeError("ligands must be of type EntityView/"
+                                   "EntityHandle/ResidueView/ResidueHandle")
+            
+        self._model_ligands = list()
+        for l in model_ligands:
+            if isinstance(l, mol.EntityView) or isinstance(l, mol.EntityHandle):
+                for r in l.residues:
+                    self._model_ligands.append(self._copy_ligand(r, self._model_ligand_ent,
+                                                                 model_ligand_ent_ed,
+                                                                 rename_ligand_chain))
+            elif isinstance(l, mol.ResidueHandle) or isinstance(l, mol.ResidueView):
+                self._model_ligands.append(self._copy_ligand(l, self._model_ligand_ent,
+                                                             model_ligand_ent_ed,
+                                                             rename_ligand_chain))  
+            else:
+                raise RuntimeError("ligands must be of type EntityView/"
+                                   "EntityHandle/ResidueView/ResidueHandle")
+
 
-        # Extract ligands from model
-        if model_ligands is None:
-            self.model_ligands = self._extract_ligands(self.model)
-        else:
-            self.model_ligands = self._prepare_ligands(self.model, model,
-                                                       model_ligands,
-                                                       rename_ligand_chain)
         if len(self.model_ligands) == 0:
             LogWarning("No ligands in the model")
             if len(self.target_ligands) == 0:
                 raise ValueError("No ligand in the model and in the target")
 
-        self.resnum_alignments = resnum_alignments
-        self.rename_ligand_chain = rename_ligand_chain
-        self.substructure_match = substructure_match
-        self.coverage_delta = coverage_delta
-        self.max_symmetries = max_symmetries
+        self._resnum_alignments = resnum_alignments
+        self._substructure_match = substructure_match
+        self._coverage_delta = coverage_delta
+        self._max_symmetries = max_symmetries
 
         # lazily computed attributes
         self.__chain_mapper = None
@@ -314,6 +481,95 @@ class LigandScorer:
              "either model or target ligand have non-zero state."),
          9: ("unknown", "An unknown error occured in LigandScorer")}
 
+
+    @property
+    def model(self):
+        """ Model receptor structure
+
+        Processed according to docs in :class:`LigandScorer` constructor
+        """
+        return self._model
+
+    @property
+    def target(self):
+        """ Target receptor structure
+
+        Processed according to docs in :class:`LigandScorer` constructor
+        """
+        return self._target
+
+    @property
+    def model_cleanup_log(self):
+        """ Reports residues/atoms that were removed in model during cleanup
+
+        Residues and atoms are described as :class:`str` in format
+        <chain_name>.<resnum>.<ins_code> (residue) and
+        <chain_name>.<resnum>.<ins_code>.<aname> (atom).
+
+        :class:`dict` with keys:
+        
+        * 'cleaned_residues': another :class:`dict` with keys:
+
+          * 'no_clib': residues that have been removed because no entry could be
+            found in :class:`ost.conop.CompoundLib`
+          * 'not_linking': residues that have been removed because they're not
+            peptide or nucleotide linking according to
+            :class:`ost.conop.CompoundLib`
+
+        * 'cleaned_atoms': another :class:`dict` with keys:
+
+          * 'unknown_atoms': atoms that have been removed as they're not part
+            of their respective residue according to
+            :class:`ost.conop.CompoundLib`
+        """
+        return self._model_cleanup_log
+
+    @property
+    def target_cleanup_log(self):
+        """ Same for target
+        """
+        return self._target_cleanup_log  
+
+    @property
+    def model_ligands(self):
+        """ Residues representing model ligands
+
+        :class:`list` of :class:`ost.mol.ResidueHandle`
+        """
+        return self._model_ligands    
+
+    @property
+    def target_ligands(self):
+        """ Residues representing target ligands
+
+        :class:`list` of :class:`ost.mol.ResidueHandle`
+        """
+        return self._target_ligands 
+
+    @property
+    def resnum_alignments(self):
+        """ Given at :class:`LigandScorer` construction
+        """
+        return self._resnum_alignments
+
+    @property
+    def substructure_match(self):
+        """ Given at :class:`LigandScorer` construction
+        """
+        return self._substructure_match
+
+    @property
+    def coverage_delta(self):
+        """ Given at :class:`LigandScorer` construction
+        """
+        return self._coverage_delta
+
+    @property
+    def max_symmetries(self):
+        """ Given at :class:`LigandScorer` construction
+        """
+        return self._max_symmetries
+
     @property
     def state_matrix(self):
         """ Encodes states of ligand pairs
@@ -508,6 +764,7 @@ class LigandScorer:
                 self._aux_dict[cname][rnum] = d
         return self._aux_dict
 
+
     @property
     def unassigned_target_ligands(self):
         """ Get indices of target ligands which are not assigned 
@@ -782,147 +1039,6 @@ class LigandScorer:
                                           resnum_alignments=self.resnum_alignments)
         return self.__chain_mapper
 
-    @staticmethod
-    def _extract_ligands(entity):
-        """Extract ligands from entity. Return a list of residues.
-
-        Assumes that ligands have the :attr:`~ost.mol.ResidueHandle.is_ligand`
-        flag set. This is typically the case for entities loaded from mmCIF
-        (tested with mmCIF files from the PDB and SWISS-MODEL).
-        Legacy PDB files must contain `HET` headers (which is usually the
-        case for files downloaded from the PDB but not elsewhere).
-
-        This function performs basic checks to ensure that the residues in this
-        chain are not forming polymer bonds (ie peptide/nucleotide ligands) and
-        will raise a RuntimeError if this assumption is broken.
-
-        :param entity: the entity to extract ligands from
-        :type entity: :class:`~ost.mol.EntityHandle`
-        :rtype: :class:`list` of :class:`~ost.mol.ResidueHandle`
-
-        """
-        extracted_ligands = []
-        for residue in entity.residues:
-            if residue.is_ligand:
-                if mol.InSequence(residue, residue.next):
-                    raise RuntimeError("Residue %s connected in polymer sequen"
-                                       "ce %s" % (residue.qualified_name))
-                extracted_ligands.append(residue)
-                LogVerbose("Detected residue %s as ligand" % residue)
-        return extracted_ligands
-
-    @staticmethod
-    def _prepare_ligands(new_entity, old_entity, ligands, rename_chain):
-        """Prepare the ligands given into a list of ResidueHandles which are
-        part of the copied entity, suitable for the model_ligands and
-        target_ligands properties.
-
-        This function takes a list of ligands as (Entity|Residue)(Handle|View).
-        Entities can contain multiple ligands, which will be considered as
-        separate ligands.
-
-        Ligands which are part of the entity are simply fetched in the new
-        copied entity. Otherwise, they are copied over to the copied entity.
-        """
-        extracted_ligands = []
-
-        next_chain_num = 1
-        new_editor = None
-
-        def _copy_residue(residue, rename_chain):
-            """ Copy the residue into the new chain.
-            Return the new residue handle."""
-            nonlocal next_chain_num, new_editor
-
-            # Instantiate the editor
-            if new_editor is None:
-                new_editor = new_entity.EditXCS(mol.BUFFERED_EDIT)
-
-            new_chain = new_entity.FindChain(residue.chain.name)
-            if not new_chain.IsValid():
-                new_chain = new_editor.InsertChain(residue.chain.name)
-                new_editor.SetChainType(new_chain,
-                                        mol.ChainType.CHAINTYPE_NON_POLY)
-            else:
-                # Does a residue with the same name already exist?
-                already_exists = new_chain.FindResidue(residue.number).IsValid()
-                if already_exists:
-                    if rename_chain:
-                        chain_ext = 2  # Extend the chain name by this
-                        while True:
-                            new_chain_name = \
-                            residue.chain.name + "_" + str(chain_ext)
-                            new_chain = new_entity.FindChain(new_chain_name)
-                            if new_chain.IsValid():
-                                chain_ext += 1
-                                continue
-                            else:
-                                new_chain = \
-                                new_editor.InsertChain(new_chain_name)
-                                break
-                        LogInfo("Moved ligand residue %s to new chain %s" % (
-                            residue.qualified_name, new_chain.name))
-                    else:
-                        msg = \
-                        "A residue number %s already exists in chain %s" % (
-                            residue.number, residue.chain.name)
-                        raise RuntimeError(msg)
-
-            # Add the residue with its original residue number
-            new_res = new_editor.AppendResidue(new_chain, residue.name,
-                                               residue.number)
-            # Add atoms
-            for old_atom in residue.atoms:
-                new_editor.InsertAtom(new_res, old_atom.name, old_atom.pos, 
-                    element=old_atom.element, occupancy=old_atom.occupancy,
-                    b_factor=old_atom.b_factor, is_hetatm=old_atom.is_hetatom)
-            # Add bonds
-            for old_atom in residue.atoms:
-                for old_bond in old_atom.bonds:
-                    new_first = new_res.FindAtom(old_bond.first.name)
-                    new_second = new_res.FindAtom(old_bond.second.name)
-                    new_editor.Connect(new_first, new_second)
-            return new_res
-
-        def _process_ligand_residue(res, rename_chain):
-            """Copy or fetch the residue. Return the residue handle."""
-            new_res = None
-            if res.entity.handle == old_entity.handle:
-                # Residue is part of the old_entity handle.
-                # However, it may not be in the copied one, for instance it may
-                # have been a view We try to grab it first, otherwise we copy it
-                new_res = new_entity.FindResidue(res.chain.name, res.number)
-            if new_res and new_res.valid:
-                qname = res.handle.qualified_name
-                LogVerbose("Ligand residue %s already in entity" % qname)
-            else:
-                # Residue is not part of the entity, need to copy it first
-                new_res = _copy_residue(res, rename_chain)
-                qname = res.handle.qualified_name
-                LogVerbose("Copied ligand residue %s" % qname)
-            new_res.SetIsLigand(True)
-            return new_res
-
-        for ligand in ligands:
-            is_eh = isinstance(ligand, mol.EntityHandle)
-            is_ev = isinstance(ligand, mol.EntityView)
-            is_rh = isinstance(ligand, mol.ResidueHandle)
-            is_rv = isinstance(ligand, mol.ResidueView)
-            if is_eh or is_ev:
-                for residue in ligand.residues:
-                    new_residue = _process_ligand_residue(residue, rename_chain)
-                    extracted_ligands.append(new_residue)
-            elif is_rh or is_rv:
-                new_residue = _process_ligand_residue(ligand, rename_chain)
-                extracted_ligands.append(new_residue)
-            else:
-                raise RuntimeError("Ligands should be given as Entity or "
-                                   "Residue")
-
-        if new_editor is not None:
-            new_editor.UpdateICS()
-        return extracted_ligands
-
     def _compute_scores(self):
         """
         Compute score for every possible target-model ligand pair and store the
@@ -1113,6 +1229,157 @@ class LigandScorer:
         raise NotImplementedError("_score_dir must be implemented by child "
                                   "class")
 
+    def _copy_ligand(self, l, ent, ed, rename_ligand_chain):
+        """ Copies ligand into entity and returns residue handle
+        """
+        new_chain = ent.FindChain(l.chain.name)
+        if not new_chain.IsValid():
+            new_chain = ed.InsertChain(l.chain.name)
+            ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+        else:
+            # Does a residue with the same name already exist?
+            already_exists = new_chain.FindResidue(l.number).IsValid()
+            if already_exists:
+                if rename_ligand_chain:
+                    chain_ext = 2  # Extend the chain name by this
+                    while True:
+                        new_chain_name = \
+                        l.chain.name + "_" + str(chain_ext)
+                        new_chain = ent.FindChain(new_chain_name)
+                        if new_chain.IsValid():
+                            chain_ext += 1
+                            continue
+                        else:
+                            new_chain = \
+                            ed.InsertChain(new_chain_name)
+                            break
+                    LogInfo("Moved ligand residue %s to new chain %s" % (
+                            l.qualified_name, new_chain.name))
+                else:
+                    msg = \
+                    "A residue number %s already exists in chain %s" % (
+                        l.number, l.chain.name)
+                    raise RuntimeError(msg)
+
+        # Add the residue with its original residue number
+        new_res = ed.AppendResidue(new_chain, l.name, l.number)
+        # Add atoms
+        for old_atom in l.atoms:
+            ed.InsertAtom(new_res, old_atom.name, old_atom.pos, 
+                element=old_atom.element, occupancy=old_atom.occupancy,
+                b_factor=old_atom.b_factor, is_hetatm=old_atom.is_hetatom)
+        # Add bonds
+        for old_atom in l.atoms:
+            for old_bond in old_atom.bonds:
+                new_first = new_res.FindAtom(old_bond.first.name)
+                new_second = new_res.FindAtom(old_bond.second.name)
+                ed.Connect(new_first, new_second, old_bond.bond_order)
+        new_res.SetIsLigand(True)
+        return new_res
+
+    def _cleanup_polymer_ent(self, ent, clib):
+        """ In principle molck light but logs LigandScorer specific warnings
+
+        Only to be applied to polymer entity
+
+        1) removes atoms with elements set to H or D (not logged as there is no
+           effect on scoring)
+        2) removes residues with no entry in component dictionary
+        3) removes all residues that are not peptide_liking or
+           nucleotide_linking according component dictionary
+        4) removes unknown atoms according to component dictionary
+        5) reruns processor
+        """
+
+        cleanup_log = {"cleaned_residues": {"no_clib": [],
+                                            "not_linking": []},
+                       "cleaned_atoms": {"unknown_atoms": []}}
+
+        # 1)
+        hydrogen_sel = ent.Select("ele == H or ele == D")
+        if hydrogen_sel.GetAtomCount() > 0:
+            # just do, no logging as it has no effect on scoring
+            ent = ost.mol.CreateEntityFromView(ent.Select(
+                      "ele != H and ele != D"), include_exlusive_atoms=False)
+
+        # extract residues/atoms for 2), 3) and 4) 
+        not_in_clib = list()
+        not_linking = list()
+        unknown_atom = list()
+
+        for r in ent.residues:
+            comp = clib.FindCompound(r.GetName())
+            if comp is None:
+                not_in_clib.append(r)
+                continue
+            if not (comp.IsPeptideLinking() or comp.IsNucleotideLinking()):
+                not_linking.append(r)
+                continue
+            comp_anames = set([a.name for a in comp.atom_specs])
+            for a in r.atoms:
+                if a.name not in comp_anames:
+                    unknown_atom.append(a)
+
+        # 2)
+        if len(not_in_clib) > 0:
+            cleanup_log["cleaned_residues"]["no_clib"] = \
+            [_QualifiedResidueNotation(r) for r in not_in_clib]
+            msg = ("Expect all residues in receptor structure to be defined in "
+                   "compound library but this is not the case. "
+                   "Please refer to the OpenStructure website if an updated "
+                   "library is sufficient to solve the problem: "
+                   "https://openstructure.org/docs/conop/compoundlib/ "
+                   "These residues will not be considered for scoring (which "
+                   "may also affect pre-processing steps such as alignments): ")
+            msg += ','.join(cleanup_log["cleaned_residues"]["no_clib"])
+            ost.LogWarning(msg)
+            for r in not_in_clib:
+                r.SetIntProp("clib", 1)
+            ent = ost.mol.CreateEntityFromView(ent.Select("grclib:0!=1"),
+                      include_exlusive_atoms=False)
+
+        # 3)
+        if len(not_linking) > 0:
+            cleanup_log["cleaned_residues"]["not_linking"] = \
+            [_QualifiedResidueNotation(r) for r in not_linking]
+            msg = ("Expect all residues in receptor structure to be peptide "
+                   "linking or nucleotide linking according to the compound "
+                   "library but this is not the case. "
+                   "Please refer to the OpenStructure website if an updated "
+                   "library is sufficient to solve the problem: "
+                   "https://openstructure.org/docs/conop/compoundlib/ "
+                   "These residues will not be considered for scoring (which "
+                   "may also affect pre-processing steps such as alignments): ")
+            msg += ','.join(cleanup_log["cleaned_residues"]["not_linking"])
+            ost.LogWarning(msg)
+            for r in not_linking:
+                r.SetIntProp("linking", 1)
+            ent = ost.mol.CreateEntityFromView(ent.Select("grlinking:0!=1"),
+                      include_exlusive_atoms=False)
+
+        # 4)
+        if len(unknown_atom) > 0:
+            cleanup_log["cleaned_atoms"]["unknown_atoms"] = \
+            [_QualifiedAtomNotation(a) for a in unknown_atom]
+            msg = ("Expect atom names according to the compound library but "
+                   "this is not the case."
+                   "Please refer to the OpenStructure website if an updated "
+                   "library is sufficient to solve the problem: "
+                   "https://openstructure.org/docs/conop/compoundlib/ "
+                   "These atoms will not be considered for scoring: ")
+            msg += ','.join(cleanup_log["cleaned_atoms"]["unknown_atoms"])
+            ost.LogWarning(msg)
+            for a in unknown_atom:
+                a.SetIntProp("unknown", 1)
+            ent = ost.mol.CreateEntityFromView(ent.Select("gaunknown:0!=1"),
+                      include_exlusive_atoms=False)
+
+        # 5)
+        processor = conop.RuleBasedProcessor(clib)
+        processor.Process(ent)
+
+        return (ent, cleanup_log)
+
 
 def _ResidueToGraph(residue, by_atom_index=False):
     """Return a NetworkX graph representation of the residue.
@@ -1279,6 +1546,7 @@ class DisconnectedGraphError(Exception):
     pass
 
 # specify public interface
-__all__ = ('LigandScorer', 'ComputeSymmetries', 'NoSymmetryError',
+__all__ = ('CleanHydrogens', 'MMCIFPrep', 'PDBPrep',
+           'LigandScorer', 'ComputeSymmetries', 'NoSymmetryError',
            'NoIsomorphicSymmetryError', 'TooManySymmetriesError',
            'DisconnectedGraphError')
diff --git a/modules/mol/alg/pymod/ligand_scoring_lddtpli.py b/modules/mol/alg/pymod/ligand_scoring_lddtpli.py
index 467066471..7d40728ff 100644
--- a/modules/mol/alg/pymod/ligand_scoring_lddtpli.py
+++ b/modules/mol/alg/pymod/ligand_scoring_lddtpli.py
@@ -95,7 +95,7 @@ class LDDTPLIScorer(ligand_scoring_base.LigandScorer):
     :type lddt_pli_binding_site_radius: :class:`float`
     """
 
-    def __init__(self, model, target, model_ligands=None, target_ligands=None,
+    def __init__(self, model, target, model_ligands, target_ligands,
                  resnum_alignments=False, rename_ligand_chain=False,
                  substructure_match=False, coverage_delta=0.2,
                  max_symmetries=1e4, lddt_pli_radius=6.0,
@@ -103,8 +103,7 @@ class LDDTPLIScorer(ligand_scoring_base.LigandScorer):
                  lddt_pli_thresholds = [0.5, 1.0, 2.0, 4.0],
                  lddt_pli_binding_site_radius=None):
 
-        super().__init__(model, target, model_ligands = model_ligands,
-                         target_ligands = target_ligands,
+        super().__init__(model, target, model_ligands, target_ligands,
                          resnum_alignments = resnum_alignments,
                          rename_ligand_chain = rename_ligand_chain,
                          substructure_match = substructure_match,
diff --git a/modules/mol/alg/pymod/ligand_scoring_scrmsd.py b/modules/mol/alg/pymod/ligand_scoring_scrmsd.py
index 15ef917b4..33e6aa094 100644
--- a/modules/mol/alg/pymod/ligand_scoring_scrmsd.py
+++ b/modules/mol/alg/pymod/ligand_scoring_scrmsd.py
@@ -108,15 +108,14 @@ class SCRMSDScorer(ligand_scoring_base.LigandScorer):
                            pose is too far from the actual binding site.
     :type full_bs_search: :class:`bool`
     """
-    def __init__(self, model, target, model_ligands=None, target_ligands=None,
+    def __init__(self, model, target, model_ligands, target_ligands,
                  resnum_alignments=False, rename_ligand_chain=False,
                  substructure_match=False, coverage_delta=0.2,
                  max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0,
                  model_bs_radius=25, binding_sites_topn=100000,
                  full_bs_search=False):
 
-        super().__init__(model, target, model_ligands = model_ligands,
-                         target_ligands = target_ligands,
+        super().__init__(model, target, model_ligands, target_ligands,
                          resnum_alignments = resnum_alignments,
                          rename_ligand_chain = rename_ligand_chain,
                          substructure_match = substructure_match,
diff --git a/modules/mol/alg/tests/test_ligand_scoring.py b/modules/mol/alg/tests/test_ligand_scoring.py
index 90cb8f815..435af94d5 100644
--- a/modules/mol/alg/tests/test_ligand_scoring.py
+++ b/modules/mol/alg/tests/test_ligand_scoring.py
@@ -4,7 +4,7 @@ from functools import lru_cache
 import numpy as np
 
 import ost
-from ost import io, mol, geom
+from ost import io, mol, geom, conop
 # check if we can import: fails if numpy or scipy not available
 try:
     from ost.mol.alg.ligand_scoring_base import *
@@ -55,64 +55,19 @@ class TestLigandScoringFancy(unittest.TestCase):
     def test_extract_ligands_mmCIF(self):
         """Test that we can extract ligands from mmCIF files.
         """
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        mdl = _LoadMMCIF("P84080_model_02.cif.gz")
-
-        sc = LigandScorer(mdl, trg, None, None)
-
-        self.assertEqual(len(sc.target_ligands),  7)
-        self.assertEqual(len(sc.model_ligands), 1)
-        self.assertEqual(len([r for r in sc.target.residues if r.is_ligand]), 7)
-        self.assertEqual(len([r for r in sc.model.residues if r.is_ligand]), 1)
-
-    def test_extract_ligands_PDB(self):
-        """Test that we can extract ligands from PDB files containing HET records.
-        """
-        trg = _LoadPDB("1R8Q.pdb")
-        mdl = _LoadMMCIF("P84080_model_02.cif.gz")
-
-        sc = LigandScorer(mdl, trg, None, None)
-
-        self.assertEqual(len(sc.target_ligands),  7)
-        self.assertEqual(len(sc.model_ligands), 1)
-        self.assertEqual(len([r for r in sc.target.residues if r.is_ligand]), 7)
-        self.assertEqual(len([r for r in sc.model.residues if r.is_ligand]), 1)
-
-    def test_init_given_ligands(self):
-        """Test that we can instantiate the scorer with ligands contained in
-        the target and model entity and given in a list.
-        """
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        mdl = _LoadMMCIF("P84080_model_02.cif.gz")
+        # they're extracted in the MMCIFPrep function actually
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly=True)
+        mdl, mdl_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("P84080_model_02.cif.gz"),
+                                                     extract_nonpoly=True)
 
-        # Pass entity views
-        trg_lig = [trg.Select("rname=MG"), trg.Select("rname=G3D")]
-        mdl_lig = [mdl.Select("rname=G3D")]
-        sc = LigandScorer(mdl, trg, mdl_lig, trg_lig)
 
-        self.assertEqual(len(sc.target_ligands), 4)
-        self.assertEqual(len(sc.model_ligands), 1)
-        # IsLigand flag should still be set even on not selected ligands
-        self.assertEqual(len([r for r in sc.target.residues if r.is_ligand]), 7)
-        self.assertEqual(len([r for r in sc.model.residues if r.is_ligand]), 1)
-
-        # Ensure the residues are not copied
-        self.assertEqual(len(sc.target.Select("rname=MG").residues), 2)
-        self.assertEqual(len(sc.target.Select("rname=G3D").residues), 2)
-        self.assertEqual(len(sc.model.Select("rname=G3D").residues), 1)
-
-        # Pass residue handles
-        trg_lig = [trg.FindResidue("F", 1), trg.FindResidue("H", 1)]
-        mdl_lig = [mdl.FindResidue("L_2", 1)]
         sc = LigandScorer(mdl, trg, mdl_lig, trg_lig)
 
-        self.assertEqual(len(sc.target_ligands), 2)
+        self.assertEqual(len(sc.target_ligands),  7)
         self.assertEqual(len(sc.model_ligands), 1)
-
-        # Ensure the residues are not copied
-        self.assertEqual(len(sc.target.Select("rname=ZN").residues), 1)
-        self.assertEqual(len(sc.target.Select("rname=G3D").residues), 2)
-        self.assertEqual(len(sc.model.Select("rname=G3D").residues), 1)
+        self.assertEqual(len([r for r in sc.target_ligands if r.is_ligand]), 7)
+        self.assertEqual(len([r for r in sc.model_ligands if r.is_ligand]), 1)
 
     def test_init_sdf_ligands(self):
         """Test that we can instantiate the scorer with ligands from separate SDF files.
@@ -130,9 +85,9 @@ class TestLigandScoringFancy(unittest.TestCase):
                     io.SaveEntity(lig_ent, "%s_ligand_%d.sdf" % (prefix, lig_num))
                     lig_num += 1
         """
-        mdl = _LoadPDB("P84080_model_02_nolig.pdb")
+        mdl = ligand_scoring_base.PDBPrep(_GetTestfilePath("P84080_model_02_nolig.pdb"))
         mdl_ligs = [_LoadEntity("P84080_model_02_ligand_0.sdf")]
-        trg = _LoadPDB("1r8q_protein.pdb.gz")
+        trg = ligand_scoring_base.PDBPrep(_GetTestfilePath("1r8q_protein.pdb.gz"))
         trg_ligs = [_LoadEntity("1r8q_ligand_%d.sdf" % i) for i in range(7)]
 
         # Pass entities
@@ -141,8 +96,8 @@ class TestLigandScoringFancy(unittest.TestCase):
         self.assertEqual(len(sc.target_ligands), 7)
         self.assertEqual(len(sc.model_ligands), 1)
         # Ensure we set the is_ligand flag
-        self.assertEqual(len([r for r in sc.target.residues if r.is_ligand]), 7)
-        self.assertEqual(len([r for r in sc.model.residues if r.is_ligand]), 1)
+        self.assertEqual(len([r for r in sc.target_ligands if r.is_ligand]), 7)
+        self.assertEqual(len([r for r in sc.model_ligands if r.is_ligand]), 1)
 
         # Pass residues
         mdl_ligs_res = [mdl_ligs[0].residues[0]]
@@ -157,9 +112,9 @@ class TestLigandScoringFancy(unittest.TestCase):
         """Test that we reject input if multiple ligands with the same chain
          name/residue number are given.
         """
-        mdl = _LoadPDB("P84080_model_02_nolig.pdb")
+        mdl = ligand_scoring_base.PDBPrep(_GetTestfilePath("P84080_model_02_nolig.pdb"))
         mdl_ligs = [_LoadEntity("P84080_model_02_ligand_0.sdf")]
-        trg = _LoadPDB("1r8q_protein.pdb.gz")
+        trg = ligand_scoring_base.PDBPrep(_GetTestfilePath("1r8q_protein.pdb.gz"))
         trg_ligs = [_LoadEntity("1r8q_ligand_%d.sdf" % i) for i in range(7)]
 
         # Reject identical model ligands
@@ -286,10 +241,12 @@ class TestLigandScoringFancy(unittest.TestCase):
     def test_compute_rmsd_scores(self):
         """Test that _compute_scores works.
         """
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        mdl = _LoadMMCIF("P84080_model_02.cif.gz")
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly=True)
+        mdl = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("P84080_model_02.cif.gz"))
         mdl_lig = io.LoadEntity(os.path.join('testfiles', "P84080_model_02_ligand_0.sdf"))
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig], None)
+
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig], trg_lig)
 
         # Note: expect warning about Binding site of H.ZN1 not mapped to the model
         self.assertEqual(sc.score_matrix.shape, (7, 1))
@@ -303,10 +260,14 @@ class TestLigandScoringFancy(unittest.TestCase):
             [np.nan]]), decimal=5)
 
     def test_compute_lddtpli_scores(self):
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        mdl = _LoadMMCIF("P84080_model_02.cif.gz")
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly=True)
+
+        mdl = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("P84080_model_02.cif.gz"))
         mdl_lig = io.LoadEntity(os.path.join('testfiles', "P84080_model_02_ligand_0.sdf"))
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig], None,
+
+
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig], trg_lig,
                                                   add_mdl_contacts = False,
                                                   lddt_pli_binding_site_radius = 4.0)
         self.assertEqual(sc.score_matrix.shape, (7, 1))
@@ -324,23 +285,27 @@ class TestLigandScoringFancy(unittest.TestCase):
         prot = _LoadMMCIF("1r8q.cif.gz").Copy()
 
         # model has the full binding site
-        mdl = mol.CreateEntityFromView(prot.Select("cname=A,B,G"), True)
+        mdl = mol.CreateEntityFromView(prot.Select("cname=A,B"), True)
+        mdl_lig = mol.CreateEntityFromView(prot.Select("cname=G"), True)
 
         # chain C has same sequence as chain A but is not in contact
         # with ligand in chain G
         # target has thus incomplete binding site only from chain B
-        trg = mol.CreateEntityFromView(prot.Select("cname=B,C,G"), True)
+        trg = mol.CreateEntityFromView(prot.Select("cname=B,C"), True)
+        trg_lig = mol.CreateEntityFromView(prot.Select("cname=G"), True)
 
         # if added model contacts are not considered, the incomplete binding
         # site only from chain B is perfectly reproduced by model which also has
         # chain B
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=False)
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig], [trg_lig],
+                                                  add_mdl_contacts=False)
         self.assertAlmostEqual(sc.score_matrix[0,0], 1.0, 5)
 
         # if added model contacts are considered, contributions from chain B are
         # perfectly reproduced but all contacts of ligand towards chain A are
         # added as penalty
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=True)
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig], [trg_lig],
+                                                  add_mdl_contacts=True)
 
         lig = prot.Select("cname=G")
         A_count = 0
@@ -367,7 +332,8 @@ class TestLigandScoringFancy(unittest.TestCase):
         # chain C
         query = "cname=B,G or (cname=C and rnum!=66)"
         trg = mol.CreateEntityFromView(prot.Select(query), True)
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=True)
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig], [trg_lig],
+                                                  add_mdl_contacts=True)
 
         TRP66_count = 0
         for a in lig.atoms:
@@ -391,7 +357,8 @@ class TestLigandScoringFancy(unittest.TestCase):
         mdl_ed = mdl.EditXCS()
         at = mdl.FindResidue("B", mol.ResNum(8)).FindAtom("NZ")
         mdl_ed.SetAtomPos(at, lig.geometric_center)
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, add_mdl_contacts=True)
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [mdl_lig], [trg_lig],
+                                                  add_mdl_contacts=True)
 
         # compared to the last assertAlmostEqual, we add the number of ligand
         # atoms as additional penalties
@@ -400,21 +367,26 @@ class TestLigandScoringFancy(unittest.TestCase):
                                lig.GetAtomCount()), 5)
 
     def test_assignment(self):
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        mdl = _LoadMMCIF("P84080_model_02.cif.gz")
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg)
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly = True)
+        mdl, mdl_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("P84080_model_02.cif.gz"),
+                                                     extract_nonpoly = True)
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, mdl_lig, trg_lig)
         self.assertEqual(sc.assignment, [(1, 0)])
 
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg)
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, mdl_lig, trg_lig)
         self.assertEqual(sc.assignment, [(5, 0)])
 
     def test_dict_results_rmsd(self):
         """Test that the scores are computed correctly
         """
         # 4C0A has more ligands
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        trg_4c0a = _LoadMMCIF("4c0a.cif.gz")
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(trg, trg_4c0a, None, None)
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly = True)
+
+        trg_4c0a, trg_4c0a_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("4c0a.cif.gz"),
+                                                               extract_nonpoly = True)
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(trg, trg_4c0a, trg_lig, trg_4c0a_lig)
         expected_keys = {"J", "F"}
         self.assertFalse(expected_keys.symmetric_difference(sc.score.keys()))
         self.assertFalse(expected_keys.symmetric_difference(sc.aux.keys()))
@@ -439,9 +411,13 @@ class TestLigandScoringFancy(unittest.TestCase):
         """Test that the scores are computed correctly
         """
         # 4C0A has more ligands
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        trg_4c0a = _LoadMMCIF("4c0a.cif.gz")
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(trg, trg_4c0a, None, None,
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly = True)
+
+        trg_4c0a, trg_4c0a_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("4c0a.cif.gz"),
+                                                               extract_nonpoly = True)
+
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(trg, trg_4c0a, trg_lig, trg_4c0a_lig,
                                                   add_mdl_contacts=False,
                                                   lddt_pli_binding_site_radius = 4.0)
         expected_keys = {"J", "F"}
@@ -462,7 +438,7 @@ class TestLigandScoringFancy(unittest.TestCase):
         self.assertEqual(sc.aux["F"][mol.ResNum(1)]["model_ligand"].qualified_name, 'F.G3D1')
 
         # lddt_pli with added mdl contacts
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(trg, trg_4c0a, None, None,
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(trg, trg_4c0a, trg_lig, trg_4c0a_lig,
                                                   add_mdl_contacts=True)
         self.assertAlmostEqual(sc.score["J"][mol.ResNum(1)], 0.8988340192043895, 5)
         self.assertAlmostEqual(sc.score["F"][mol.ResNum(1)], 0.9039735099337749, 5)
@@ -482,8 +458,10 @@ class TestLigandScoringFancy(unittest.TestCase):
          other ligands, peptides and short oligomers into account for superposition.
          When that's the case this test should be adapter
          """
-        trg = _LoadMMCIF("1SSP.cif.gz")
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(trg, trg, None, None)
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1SSP.cif.gz"),
+                                                     extract_nonpoly=True)
+
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(trg, trg, trg_lig, trg_lig)
         expected_bs_ref_res = ['C.GLY62', 'C.GLN63', 'C.ASP64', 'C.PRO65', 'C.TYR66', 'C.CYS76', 'C.PHE77', 'C.ASN123', 'C.HIS187']
         ost.PushVerbosityLevel(ost.LogLevel.Error)
         self.assertEqual([str(r) for r in sc.aux["D"][1]["bs_ref_res"]], expected_bs_ref_res)
@@ -524,13 +502,14 @@ class TestLigandScoringFancy(unittest.TestCase):
          the ligand RET was wrongly assigned to short copies of OLA that float
           around and yielded higher scores.
           Here we test that this is resolved correctly."""
-        mdl = _LoadPDB("6jyf_mdl.pdb")
-        trg = _LoadMMCIF("6jyf_trg.cif")
+        mdl = ligand_scoring_base.PDBPrep(_GetTestfilePath("6jyf_mdl.pdb"))
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("6jyf_trg.cif"),
+                                                     extract_nonpoly=True)
         mdl_lig = _LoadEntity("6jyf_RET_pred.sdf")
         mdl_lig_full = _LoadEntity("6jyf_RET_pred_complete.sdf")
 
         # Problem is easily fixed by just prioritizing full coverage
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, model_ligands=[mdl_lig],
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig], trg_lig,
                                                 substructure_match=True)
         self.assertEqual(len(sc.assignment), 1) # only one mdl ligand => 1 assignment
         trg_lig_idx, mdl_lig_idx = sc.assignment[0]
@@ -555,7 +534,7 @@ class TestLigandScoringFancy(unittest.TestCase):
         # We need to make sure that it also works if the match is partial.
         # For that we load the complete ligand incl. the O missing in target
         # with a coverage of around 95% only.
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, model_ligands=[mdl_lig_full],
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig_full], trg_lig,
                                                 substructure_match=True)
         self.assertEqual(len(sc.assignment), 1) # only one mdl ligand => 1 assignment
         trg_lig_idx, mdl_lig_idx = sc.assignment[0]
@@ -566,7 +545,7 @@ class TestLigandScoringFancy(unittest.TestCase):
         # Next, we check that coverage_delta has an effect. With a large
         # delta of 0.5 we will assign to OLA which has a higher RMSD
         # but a coverage of 0.52 only.
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, model_ligands=[mdl_lig_full],
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig_full], trg_lig,
                                                 substructure_match=True,
                                                 coverage_delta=0.5)
         self.assertEqual(len(sc.assignment), 1) # only one mdl ligand => 1 assignment
@@ -623,22 +602,32 @@ class TestLigandScoringFancy(unittest.TestCase):
         loop when the data matrices are not filled properly.
         """
         trg = _LoadMMCIF("1r8q.cif.gz").Copy()
+        
+
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly=True)
+
         mdl = trg.Copy()
+        mdl_lig = [l.Copy() for l in trg_lig]
 
-        trg_zn = trg.FindResidue("H", 1)
-        trg_g3d = trg.FindResidue("F", 1)
+        trg_g3d = trg_lig[1]
+        trg_zn = trg_lig[3]
+        self.assertEqual(trg_g3d.chains[0].name, "F")       
+        self.assertEqual(trg_zn.chains[0].name, "H")
+        self.assertEqual(trg_zn.GetAtomCount(), 1)    
 
         # Move the zinc out of the reference binding site...
-        ed = trg.EditXCS()
-        ed.SetAtomPos(trg_zn.FindAtom("ZN"),
-                      trg_zn.FindAtom("ZN").pos + geom.Vec3(6, 0, 0))
+        ed = trg_zn.EditXCS()
+        ed.SetAtomPos(trg_zn.atoms[0],
+                      trg_zn.atoms[0].pos + geom.Vec3(6, 0, 0))
+
         # Remove some atoms from G3D to decrease coverage. This messed up
         # the assignment in the past.
-        ed.DeleteAtom(trg_g3d.FindAtom("O6"))
+        ed = trg_g3d.EditXCS()
+        ed.DeleteAtom(trg_g3d.residues[0].FindAtom("O6"))
         ed.UpdateICS()
 
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg,
-                                                target_ligands=[trg_zn, trg_g3d],
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, mdl_lig, target_ligands=[trg_zn, trg_g3d],
                                                 coverage_delta=0, substructure_match=True)
 
         self.assertTrue(np.isnan(sc.score_matrix[0, 3]))
@@ -680,9 +669,9 @@ class TestLigandScoringFancy(unittest.TestCase):
           disambiguate the RMSDs).
         - We get lDDT-PLI = None with RMSD assignment
         """
-        trg = _LoadPDB("T1118v1.pdb")
+        trg = ligand_scoring_base.PDBPrep(_GetTestfilePath("T1118v1.pdb"))
         trg_lig = _LoadEntity("T1118v1_001.sdf")
-        mdl = _LoadPDB("T1118v1LG035_1.pdb")
+        mdl = ligand_scoring_base.PDBPrep(_GetTestfilePath("T1118v1LG035_1.pdb"))
         mdl_lig = _LoadEntity("T1118v1LG035_1_1_1.sdf")
 
         # Ensure it's unassigned in lDDT
@@ -712,7 +701,8 @@ class TestLigandScoringFancy(unittest.TestCase):
         # However RMSD should be assigned
         sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [mdl_lig, mdl_lig],
                                                 [trg_lig, trg_lig],
-                                                bs_radius=5, rename_ligand_chain=True)
+                                                bs_radius=5,
+                                                rename_ligand_chain=True)
 
         self.assertEqual(sc.assignment, [(0,0), (1,1)])
         self.assertEqual(sc.unassigned_target_ligands, [])
@@ -735,63 +725,60 @@ class TestLigandScoringFancy(unittest.TestCase):
 
     def test_unassigned_reasons(self):
         """Test reasons for being unassigned."""
-        trg = _LoadMMCIF("1r8q.cif.gz")
-        mdl = _LoadMMCIF("P84080_model_02.cif.gz")
-
-        def _AppendResidueWithBonds(ed, chain, old_res):
-            new_res = ed.AppendResidue(chain, old_res.name)
-            for old_atom in old_res.atoms:
-                ed.InsertAtom(new_res, old_atom.name, old_atom.pos, old_atom.element,
-                              old_atom.occupancy, old_atom.b_factor, old_atom.is_hetatom)
-            for old_bond in old_atom.bonds:
-                new_first = new_res.FindAtom(old_bond.first.name)
-                new_second = new_res.FindAtom(old_bond.second.name)
-                ed.Connect(new_first, new_second)
-            return new_res
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly=True)
+        mdl, mdl_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("P84080_model_02.cif.gz"),
+                                                     extract_nonpoly=True)
 
         # Add interesting ligands to model and target
-        mdl_ed = mdl.EditXCS()
-        trg_ed = trg.EditXCS()
+        mdl_lig_ent = mol.CreateEntity()
+        mdl_lig_ed = mdl_lig_ent.EditXCS()
+        trg_lig_ent = mol.CreateEntity()
+        trg_lig_ed = trg_lig_ent.EditXCS()
 
         # Add ZN: representation in the model (chain missing in model)
-        new_chain = mdl_ed.InsertChain("L_ZN")
-        mdl_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
-        new_res = _AppendResidueWithBonds(mdl_ed, new_chain, trg.Select("rname=ZN").residues[0].handle)
+        new_chain = mdl_lig_ed.InsertChain("L_ZN")
+        mdl_lig_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+        trg_zn_ent = trg_lig[3]
+        self.assertEqual(trg_zn_ent.residues[0].name, "ZN")
+        new_res = mdl_lig_ed.AppendResidue(new_chain, "ZN")
+        new_atom = mdl_lig_ed.InsertAtom(new_res, "ZN",
+                                         trg_zn_ent.atoms[0].GetPos(), "ZN")
         new_res.is_ligand = True
 
         # Add NA: not in contact with target
-        new_chain = trg_ed.InsertChain("L_NA")
-        trg_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
-        new_res = trg_ed.AppendResidue(new_chain, "NA")
-        new_atom = trg_ed.InsertAtom(new_res, "NA", geom.Vec3(100, 100, 100), "NA")
+        new_chain = trg_lig_ed.InsertChain("L_NA")
+        trg_lig_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+        new_res = trg_lig_ed.AppendResidue(new_chain, "NA")
+        new_atom = trg_lig_ed.InsertAtom(new_res, "NA", geom.Vec3(100, 100, 100), "NA")
         new_res.is_ligand = True
-        new_chain = mdl_ed.InsertChain("L_NA")
-        mdl_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
-        new_res = mdl_ed.AppendResidue(new_chain, "NA")
-        new_atom = mdl_ed.InsertAtom(new_res, "NA", geom.Vec3(100, 100, 100), "NA")
+        new_chain = mdl_lig_ed.InsertChain("L_NA")
+        mdl_lig_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+        new_res = mdl_lig_ed.AppendResidue(new_chain, "NA")
+        new_atom = mdl_lig_ed.InsertAtom(new_res, "NA", geom.Vec3(100, 100, 100), "NA")
         new_res.is_ligand = True
 
         # Add OXY: no symmetry/ not identical -
-        new_chain = mdl_ed.InsertChain("L_OXY")
-        mdl_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
-        new_res = mdl_ed.AppendResidue(new_chain, "OXY")
-        new_atom1 = mdl_ed.InsertAtom(new_res, "O1", geom.Vec3(0, 0, 0), "O")
-        new_atom2 = mdl_ed.InsertAtom(new_res, "O2", geom.Vec3(1, 1, 1), "O")
-        mdl_ed.Connect(new_atom1, new_atom2)
+        new_chain = mdl_lig_ed.InsertChain("L_OXY")
+        mdl_lig_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+        new_res = mdl_lig_ed.AppendResidue(new_chain, "OXY")
+        new_atom1 = mdl_lig_ed.InsertAtom(new_res, "O1", geom.Vec3(0, 0, 0), "O")
+        new_atom2 = mdl_lig_ed.InsertAtom(new_res, "O2", geom.Vec3(1, 1, 1), "O")
+        mdl_lig_ed.Connect(new_atom1, new_atom2)
         new_res.is_ligand = True
 
         # Add CMO: disconnected
-        new_chain = mdl_ed.InsertChain("L_CMO")
-        mdl_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
-        new_res = mdl_ed.AppendResidue(new_chain, "CMO")
-        new_atom1 = mdl_ed.InsertAtom(new_res, "O", geom.Vec3(0, 0, 0), "O")
-        new_atom2 = mdl_ed.InsertAtom(new_res, "C", geom.Vec3(1, 1, 1), "O")
+        new_chain = mdl_lig_ed.InsertChain("L_CMO")
+        mdl_lig_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+        new_res = mdl_lig_ed.AppendResidue(new_chain, "CMO")
+        new_atom1 = mdl_lig_ed.InsertAtom(new_res, "O", geom.Vec3(0, 0, 0), "O")
+        new_atom2 = mdl_lig_ed.InsertAtom(new_res, "C", geom.Vec3(1, 1, 1), "O")
         new_res.is_ligand = True
-        new_chain = trg_ed.InsertChain("L_CMO")
-        trg_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
-        new_res = trg_ed.AppendResidue(new_chain, "CMO")
-        new_atom1 = trg_ed.InsertAtom(new_res, "O", geom.Vec3(0, 0, 0), "O")
-        new_atom2 = trg_ed.InsertAtom(new_res, "C", geom.Vec3(1, 1, 1), "O")
+        new_chain = trg_lig_ed.InsertChain("L_CMO")
+        trg_lig_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+        new_res = trg_lig_ed.AppendResidue(new_chain, "CMO")
+        new_atom1 = trg_lig_ed.InsertAtom(new_res, "O", geom.Vec3(0, 0, 0), "O")
+        new_atom2 = trg_lig_ed.InsertAtom(new_res, "C", geom.Vec3(1, 1, 1), "O")
         new_res.is_ligand = True
 
         # Add 3 MG in model: assignment/stoichiometry
@@ -801,16 +788,19 @@ class TestLigandScoringFancy(unittest.TestCase):
             geom.Vec3(3.871, 12.343, 44.485) + 100
         ]
         for i in range(3):
-            new_chain = mdl_ed.InsertChain("L_MG_%d" % i)
-            mdl_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
-            new_res = mdl_ed.AppendResidue(new_chain, "MG")
-            new_atom = mdl_ed.InsertAtom(new_res, "MG", mg_pos[i], "MG")
+            new_chain = mdl_lig_ed.InsertChain("L_MG_%d" % i)
+            mdl_lig_ed.SetChainType(new_chain, mol.ChainType.CHAINTYPE_NON_POLY)
+            new_res = mdl_lig_ed.AppendResidue(new_chain, "MG")
+            new_atom = mdl_lig_ed.InsertAtom(new_res, "MG", mg_pos[i], "MG")
             new_res.is_ligand = True
 
-        mdl_ed.UpdateICS()
-        trg_ed.UpdateICS()
+        mdl_lig_ed.UpdateICS()
+        trg_lig_ed.UpdateICS()
 
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, None, None)
+        trg_lig.append(trg_lig_ent)
+        mdl_lig.append(mdl_lig_ent)
+
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, mdl_lig, trg_lig)
 
         # Check unassigned targets
         # NA: not in contact with target
@@ -850,11 +840,9 @@ class TestLigandScoringFancy(unittest.TestCase):
         self.assertEqual(sc.unassigned_model_ligands_reasons["L_CMO"][1], "disconnected")
 
         # Should work with rmsd_assignment too
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, None, None,
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, mdl_lig, trg_lig,
                                                 full_bs_search=True)
 
-
-
         self.assertDictEqual(sc.unassigned_model_ligands_reasons, {
             'L_ZN': {1: 'model_binding_site'},
             'L_NA': {1: 'target_binding_site'},
@@ -873,27 +861,27 @@ class TestLigandScoringFancy(unittest.TestCase):
         self.assertTrue("L_OXY" not in sc.score)
 
         # With missing ligands
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl.Select("cname=A"), trg, None, None)
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, [], trg_lig)
         self.assertEqual(sc.unassigned_target_ligands_reasons["E"][1], 'no_ligand')
 
-        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg.Select("cname=A"), None, None)
+        sc = ligand_scoring_lddtpli.LDDTPLIScorer(mdl, trg, mdl_lig, [])
         self.assertEqual(sc.unassigned_model_ligands_reasons["L_2"][1], 'no_ligand')
 
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl.Select("cname=A"), trg, None, None)
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, [], trg_lig)
         self.assertEqual(sc.unassigned_target_ligands_reasons["E"][1], 'no_ligand')
 
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg.Select("cname=A"), None, None)
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, mdl_lig, [])
         self.assertEqual(sc.unassigned_model_ligands_reasons["L_2"][1], 'no_ligand')
 
         # However not everything must be missing
         with self.assertRaises(ValueError):
-            sc = LigandScorer(mdl.Select("cname=A"), trg.Select("cname=A"), None, None)
+            sc = LigandScorer(mdl, trg, [], [])
 
         # Test with partial bs search (full_bs_search=True)
         # Here we expect L_MG_2 to be unassigned because of stoichiometry
         # rather than model_binding_site, as it no longer matters so far from
         # the binding site.
-        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, None, None,
+        sc = ligand_scoring_scrmsd.SCRMSDScorer(mdl, trg, mdl_lig, trg_lig,
                                                full_bs_search=True)
         self.assertEqual(sc.unassigned_model_ligands_reasons, {
             'L_ZN': {1: 'model_binding_site'},
@@ -911,6 +899,112 @@ class TestLigandScoringFancy(unittest.TestCase):
             "L_CMO": {1: 'disconnected'}
         })
 
+    def test_cleanup_polymer_ent(self):
+
+        trg, trg_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
+                                                     extract_nonpoly=True)
+        mdl, mdl_lig = ligand_scoring_base.MMCIFPrep(_GetTestfilePath("P84080_model_02.cif.gz"),
+                                                     extract_nonpoly=True)
+
+        # check hydrogen cleanup
+        trg_with_h = trg.Copy()
+        N = trg_with_h.GetAtomCount()
+        ed = trg_with_h.EditXCS()
+        ed.InsertAtom(trg_with_h.residues[0], "H", geom.Vec3(), "H")
+        self.assertEqual(trg_with_h.GetAtomCount(), N + 1)
+        sc = LigandScorer(mdl, trg_with_h, mdl_lig, trg_lig)
+        self.assertEqual(sc.target.GetAtomCount(),N)
+
+        trg_with_d = trg.Copy()
+        N = trg_with_d.GetAtomCount()
+        ed = trg_with_d.EditXCS()
+        ed.InsertAtom(trg_with_d.residues[0], "H", geom.Vec3(), "D")
+        self.assertEqual(trg_with_d.GetAtomCount(), N + 1)
+        sc = LigandScorer(mdl, trg_with_d, mdl_lig, trg_lig)
+        self.assertEqual(sc.target.GetAtomCount(),N)
+
+        mdl_with_h = mdl.Copy()
+        N = mdl_with_h.GetAtomCount()
+        ed = mdl_with_h.EditXCS()
+        ed.InsertAtom(mdl_with_h.residues[0], "H", geom.Vec3(), "H")
+        self.assertEqual(mdl_with_h.GetAtomCount(), N + 1)
+        sc = LigandScorer(mdl_with_h, trg, mdl_lig, trg_lig)
+        self.assertEqual(sc.model.GetAtomCount(),N)
+
+        mdl_with_d = mdl.Copy()
+        N = mdl_with_d.GetAtomCount()
+        ed = mdl_with_d.EditXCS()
+        ed.InsertAtom(mdl_with_d.residues[0], "H", geom.Vec3(), "D")
+        self.assertEqual(mdl_with_d.GetAtomCount(), N + 1)
+        sc = LigandScorer(mdl_with_d, trg, mdl_lig, trg_lig)
+        self.assertEqual(sc.model.GetAtomCount(),N)
+
+        # residue with no entry in the component dictionary
+        trg_with_funny_compound = trg.Copy()
+        N = trg_with_funny_compound.GetResidueCount()
+        ed = trg_with_funny_compound.EditXCS()
+        ed.AppendResidue(trg_with_funny_compound.chains[0], "funny_compound")
+        self.assertEqual(trg_with_funny_compound.GetResidueCount(), N+1)
+        sc = LigandScorer(mdl, trg_with_funny_compound, mdl_lig, trg_lig)
+        self.assertEqual(sc.target.GetResidueCount(),N)
+        self.assertEqual(sc.target_cleanup_log["cleaned_residues"]["no_clib"], ["A.181."])
+
+        mdl_with_funny_compound = trg.Copy()
+        N = mdl_with_funny_compound.GetResidueCount()
+        ed = mdl_with_funny_compound.EditXCS()
+        ed.AppendResidue(mdl_with_funny_compound.chains[0], "funny_compound")
+        self.assertEqual(mdl_with_funny_compound.GetResidueCount(), N+1)
+        sc = LigandScorer(mdl_with_funny_compound, trg, mdl_lig, trg_lig)
+        self.assertEqual(sc.model.GetResidueCount(),N)
+        self.assertEqual(sc.model_cleanup_log["cleaned_residues"]["no_clib"], ["A.181."])
+
+        # residue which is not peptide linking or nucleotide linking
+        trg_with_pot = trg.Copy()
+        N = trg_with_pot.GetResidueCount()
+        ed = trg_with_pot.EditXCS()
+        ed.AppendResidue(trg_with_pot.chains[0], "POT")
+        self.assertTrue(conop.GetDefaultLib().FindCompound("POT") is not None)
+        self.assertEqual(trg_with_pot.GetResidueCount(), N+1)
+        sc = LigandScorer(mdl, trg_with_pot, mdl_lig, trg_lig)
+        self.assertEqual(sc.target.GetResidueCount(),N)
+        self.assertEqual(sc.target_cleanup_log["cleaned_residues"]["not_linking"], ["A.181."])
+
+        mdl_with_pot = mdl.Copy()
+        N = mdl_with_pot.GetResidueCount()
+        ed = mdl_with_pot.EditXCS()
+        ed.AppendResidue(mdl_with_pot.chains[0], "POT")
+        self.assertTrue(conop.GetDefaultLib().FindCompound("POT") is not None)
+        self.assertEqual(mdl_with_pot.GetResidueCount(), N+1)
+        sc = LigandScorer(mdl_with_pot, trg, mdl_lig, trg_lig)
+        self.assertEqual(sc.model.GetResidueCount(),N)
+        self.assertEqual(sc.model_cleanup_log["cleaned_residues"]["not_linking"], ["A.181."])
+
+        # unknown atom
+        trg_with_unk = trg.Copy()
+        N = trg_with_unk.GetAtomCount()
+        N_res = trg_with_unk.GetResidueCount()
+        ed = trg_with_unk.EditXCS()
+        ed.InsertAtom(trg_with_unk.residues[0], "yolo", geom.Vec3(), "C")
+        self.assertEqual(trg_with_unk.GetAtomCount(), N + 1)
+        self.assertEqual(trg_with_unk.GetResidueCount(), N_res)
+        sc = LigandScorer(mdl, trg_with_unk, mdl_lig, trg_lig)
+        self.assertEqual(sc.target.GetAtomCount(), N)
+        self.assertEqual(sc.target.GetResidueCount(), N_res)
+        self.assertEqual(sc.target_cleanup_log["cleaned_atoms"]["unknown_atoms"], ["A.2..yolo"])        
+
+        mdl_with_unk = mdl.Copy()
+        N = mdl_with_unk.GetAtomCount()
+        N_res = mdl_with_unk.GetResidueCount()
+        ed = mdl_with_unk.EditXCS()
+        ed.InsertAtom(mdl_with_unk.residues[0], "yolo", geom.Vec3(), "C")
+        self.assertEqual(mdl_with_unk.GetAtomCount(), N + 1)
+        self.assertEqual(mdl_with_unk.GetResidueCount(), N_res)
+        sc = LigandScorer(mdl_with_unk, trg, mdl_lig, trg_lig)
+        self.assertEqual(sc.model.GetAtomCount(), N)
+        self.assertEqual(sc.model.GetResidueCount(), N_res)
+        self.assertEqual(sc.model_cleanup_log["cleaned_atoms"]["unknown_atoms"], ["A.2..yolo"]) 
+
+
 if __name__ == "__main__":
     from ost import testutils
     if testutils.DefaultCompoundLibIsSet():
-- 
GitLab