From 239b144a0bb6c545c0458a65ea7ac901bcd5fab5 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Fri, 3 Mar 2023 18:23:15 +0100
Subject: [PATCH] select biounits in compare-structures

---
 actions/ost-compare-structures-new | 80 +++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 2 deletions(-)

diff --git a/actions/ost-compare-structures-new b/actions/ost-compare-structures-new
index a36c3b7c1..f7c3d5b61 100644
--- a/actions/ost-compare-structures-new
+++ b/actions/ost-compare-structures-new
@@ -107,6 +107,30 @@ def _ParseArgs():
         help=("Format of reference file. pdb reads pdb but also pdb.gz, same "
               "applies to cif/mmcif. Inferred from filepath if not given."))
 
+    parser.add_argument(
+        "-mb",
+        "--model-biounit",
+        dest="model_biounit",
+        required=False,
+        default=None,
+        type=int,
+        help=("Only has an effect if model is in mmcif format. By default, "
+              "the assymetric unit (AU) is used for scoring. If there are "
+              "biounits defined in the mmcif file, you can specify the index "
+              "of the one which should be used."))
+
+    parser.add_argument(
+        "-rb",
+        "--reference-biounit",
+        dest="reference_biounit",
+        required=False,
+        default=None,
+        type=int,
+        help=("Only has an effect if reference is in mmcif format. By default, "
+              "the assymetric unit (AU) is used for scoring. If there are "
+              "biounits defined in the mmcif file, you can specify the index "
+              "of the one which should be used."))
+
     parser.add_argument(
         "-rna",
         "--residue-number-alignment",
@@ -290,7 +314,42 @@ def _ParseArgs():
 
     return parser.parse_args()
 
-def _LoadStructure(structure_path, sformat=None, fault_tolerant=False):
+def _Rename(ent):
+    """Revert chain names to original names.
+
+    PDBize assigns chain name in order A,B,C,D... which does not allow to infer
+    the original chain name. We do a renaming here:
+    if there are two chains mapping to chain A the resulting
+    chain names will be: A and A2.
+    """
+    new_chain_names = list()
+    chain_indices = list() # the chains where we actually change the name
+    suffix_indices = dict() # keep track of whats the current suffix index
+                            # for each original chain name
+
+    for ch_idx, ch in enumerate(ent.chains):
+        if not ch.HasProp("original_name"):
+            # pdbize doesnt set this property for chain names in ['_', '-']
+            continue
+        original_name = ch.GetStringProp("original_name")
+        if original_name in new_chain_names:
+            new_name = original_name + str(suffix_indices[original_name])
+            new_chain_names.append(new_name)
+            suffix_indices[original_name] = suffix_indices[original_name] + 1
+        else:
+            new_chain_names.append(original_name)
+            suffix_indices[original_name] = 2
+        chain_indices.append(ch_idx)
+    editor = ent.EditXCS()
+    # rename to nonsense to avoid clashing chain names
+    for ch_idx in chain_indices:
+        editor.RenameChain(ent.chains[ch_idx], ent.chains[ch_idx].name+"_yolo")
+    # and do final renaming
+    for new_name, ch_idx in zip(new_chain_names, chain_indices):
+        editor.RenameChain(ent.chains[ch_idx], new_name)
+
+def _LoadStructure(structure_path, sformat=None, fault_tolerant=False,
+                   bu_idx=None):
     """Read OST entity either from mmCIF or PDB.
 
     The returned structure has structure_path attached as structure name
@@ -313,7 +372,22 @@ def _LoadStructure(structure_path, sformat=None, fault_tolerant=False):
     ost.PushVerbosityLevel(ost.LogLevel.Error)
     # Load the structure
     if sformat in ["mmcif", "cif"]:
-        entity = io.LoadMMCIF(structure_path, fault_tolerant = fault_tolerant)
+        if bu_idx is not None:
+            cif_entity, cif_seqres, cif_info = \
+            io.LoadMMCIF(structure_path, info=True, seqres=True,
+                         fault_tolerant=fault_tolerant)
+            if bu_idx >= len(cif_info.biounits):
+                raise RuntimeError(f"Invalid biounit index - requested {bu_idx} "
+                                  f"cif file has {len(cif_info.biounits)}")
+            biounit = cif_info.biounits[bu_idx]
+            entity = biounit.PDBize(cif_entity, min_polymer_size=0)
+            if not entity.IsValid():
+                raise IOError(
+                    "Provided file does not contain valid entity.")
+            _Rename(entity)
+        else:
+            entity = io.LoadMMCIF(structure_path,
+                                  fault_tolerant = fault_tolerant)
         if len(entity.residues) == 0:
             raise Exception(f"No residues found in file: {structure_path}")
     elif sformat == "pdb":
@@ -453,9 +527,11 @@ def _Main():
     try:
         reference = _LoadStructure(args.reference,
                                    sformat=args.reference_format,
+                                   bu_idx=args.reference_biounit,
                                    fault_tolerant = args.fault_tolerant)
         model = _LoadStructure(args.model,
                                sformat=args.model_format,
+                               bu_idx=args.model_biounit,
                                fault_tolerant = args.fault_tolerant)
         out = _Process(model, reference, args)
         out["status"] = "SUCCESS"
-- 
GitLab