From b98d887878d3ae118cde2ca726585b60451ee0bd Mon Sep 17 00:00:00 2001
From: B13nch3n <b13nch3n_01@theb-si.de>
Date: Mon, 17 Oct 2022 16:05:55 +0200
Subject: [PATCH] Update converter script.

---
 .../translate2modelcif.py                     | 222 ++++++++++++------
 1 file changed, 152 insertions(+), 70 deletions(-)

diff --git a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py
index 4deb91e..900c848 100644
--- a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py
+++ b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py
@@ -46,10 +46,12 @@ def _parse_args():
         + "'<UniProtKB AC>-<UniProtKB AC>'",
     )
     parser.add_argument(
-        "--rank",
+        "--selected_rank",
         type=str,
         default=None,
-        help="Only process the model with this rank.",
+        help="If a certain model of a modelling project is selected by rank, "
+        + "the other models are still translated to ModelCIF but stored as "
+        + "accompanying files to the selected model.",
     )
     parser.add_argument(
         "--out_dir",
@@ -151,7 +153,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
                 occupancy=atm.occupancy,
             )
 
-    def add_scores(self, scores_json, entry_id, mdl_name):
+    def add_scores(self, scores_json, entry_id, mdl_name, add_files):
         """Add QA metrics from AF2 scores."""
         # global scores
         self.qa_metrics.extend(
@@ -196,21 +198,26 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
         self.qa_metrics.extend(lpae)
 
         ac_file = f"{mdl_name}_local_pairwise_qa.cif"
-        qa_file = modelcif.associated.LocalPairwiseQAScoresFile(
-            ac_file,
-            categories=["_ma_qa_metric_local_pairwise"],
-            copy_categories=["_ma_qa_metric"],
-            entry_id=entry_id,
-            entry_details="This file is an associated file consisting "
-            + "of local pairwise QA metrics. This is a partial mmCIF "
-            + "file and can be validated by merging with the main "
-            + "mmCIF file containing the model coordinates and other "
-            + "associated data.",
-            details="Predicted aligned error",
-        )
+        arc_files = [
+            modelcif.associated.LocalPairwiseQAScoresFile(
+                ac_file,
+                categories=["_ma_qa_metric_local_pairwise"],
+                copy_categories=["_ma_qa_metric"],
+                entry_id=entry_id,
+                entry_details="This file is an associated file consisting "
+                + "of local pairwise QA metrics. This is a partial mmCIF "
+                + "file and can be validated by merging with the main "
+                + "mmCIF file containing the model coordinates and other "
+                + "associated data.",
+                details="Predicted aligned error",
+            )
+        ]
+        if add_files:
+            arc_files.extend(add_files)
+
         return modelcif.associated.Repository(
             "",
-            [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=[qa_file])],
+            [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)],
         )
         # NOTE: by convention MA expects zip file with same name as model-cif
 
@@ -248,14 +255,14 @@ def _check_model_extra_files_present(model_dir, pdb_file):
 def _get_audit_authors():
     """Return the list of authors that produced this model."""
     return (
-        "Bartolec, T.",
+        "Bartolec, T.K.",
         "Vazquez-Campos, X.",
-        "Johnson, M.",
         "Norman, A.",
-        "Payne, R.",
-        "Wilkins, M.",
-        "Mackay, J.",
-        "Low, J.",
+        "Luong, C.",
+        "Payne, R.J.",
+        "Wilkins, M.R.",
+        "Mackay, J.P.",
+        "Low, J.K.K.",
     )
 
 
@@ -818,19 +825,19 @@ def _compress_cif_file(cif_file):
     os.remove(cif_file)
 
 
-def _package_associated_files(mdl_name):
+def _package_associated_files(repo):
     """Compress associated files into single zip file and delete original."""
-    # file names must match ones from add_scores
-    zip_path = f"{mdl_name}.zip"
-    files = [f"{mdl_name}_local_pairwise_qa.cif"]
     # zip settings tested for good speed vs compression
-    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_BZIP2) as myzip:
-        for file in files:
-            myzip.write(file)
-            os.remove(file)
+    for archive in repo.files:
+        with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
+            for zfile in archive.files:
+                cif_zip.write(zfile.path, arcname=zfile.path)
+                os.remove(zfile.path)
 
 
-def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress):
+def _store_as_modelcif(
+    data_json, ost_ent, out_dir, file_prfx, compress, add_files
+):
     """Mix all the data into a ModelCIF file."""
     print("    generating ModelCIF objects...", end="")
     pstart = timer()
@@ -876,18 +883,23 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress):
     print("    processing QA scores...", end="", flush=True)
     pstart = timer()
     mdl_name = os.path.basename(file_prfx)
-    system.repositories.append(model.add_scores(data_json, system.id, mdl_name))
+    system.repositories.append(
+        model.add_scores(data_json, system.id, mdl_name, add_files)
+    )
     print(f" ({timer()-pstart:.2f}s)")
 
     system.model_groups.append(
         modelcif.model.ModelGroup([model], name=data_json["model_group_name"])
     )
 
-    ref_dbs = _get_sequence_dbs(data_json["config_data"]["seq_dbs"])
-    protocol = _get_modelcif_protocol(
-        data_json["protocol"], system.target_entities, model, ref_dbs
+    system.protocols.append(
+        _get_modelcif_protocol(
+            data_json["protocol"],
+            system.target_entities,
+            model,
+            _get_sequence_dbs(data_json["config_data"]["seq_dbs"]),
+        )
     )
-    system.protocols.append(protocol)
 
     # write modelcif System to file
     print("    write to disk...", end="", flush=True)
@@ -896,17 +908,48 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress):
     # -> hence we cheat by changing path and back while being exception-safe...
     oldpwd = os.getcwd()
     os.chdir(out_dir)
+    mdl_fle = f"{mdl_name}.cif"
     try:
-        with open(f"{mdl_name}.cif", "w", encoding="ascii") as mmcif_fh:
+        with open(mdl_fle, "w", encoding="ascii") as mmcif_fh:
             modelcif.dumper.write(mmcif_fh, [system])
-        _package_associated_files(mdl_name)
+        _package_associated_files(system.repositories[0])
         if compress:
-            _compress_cif_file(f"{mdl_name}.cif")
+            _compress_cif_file(mdl_fle)
     finally:
         os.chdir(oldpwd)
-
     print(f" ({timer()-pstart:.2f}s)")
 
+    mdl_fle = _get_assoc_mdl_file(mdl_fle, data_json)
+    zip_fle = _get_assoc_zip_file(
+        system.repositories[0].files[0].path, data_json
+    )
+    return mdl_fle, zip_fle
+
+
+def _get_assoc_mdl_file(fle_path, data_json):
+    """Generate a modelcif.associated.File object that looks like a CIF file.
+    modelcif's dedicated CIFFile class is avoided as it would also write it.
+    """
+    cfile = modelcif.associated.File(
+        fle_path,
+        details=f"model {data_json['mdl_num']}; rank {data_json['rank_num']}",
+    )
+    cfile.file_format = "cif"
+    return cfile
+
+
+def _get_assoc_zip_file(fle_path, data_json):
+    """Create a modelcif.associated.File object that looks like a ZIP file.
+    This is NOT the PAE archive ZIP file itself but a File entry used to
+    store that archive inside the ZIP archive of the selected model."""
+    zfile = modelcif.associated.File(
+        fle_path,
+        details="archive with multiple files for model "
+        + f"{data_json['mdl_num']}; rank {data_json['rank_num']}",
+    )
+    zfile.file_format = "zip"
+    return zfile
+
 
 def _create_interaction_json(config_data):
     """Create a dictionary (mimicking JSON) that contains data which is the same
@@ -931,6 +974,41 @@ def _create_model_json(data, pdb_file, up_acs, block_id):
     return ost_ent
 
 
+def _translate2modelcif(up_acs, pdb_fle, config_data, opts, add_files):
+    """Convert a PDB file with its accompanying data to ModelCIF."""
+    pdb_start = timer()
+    file_prfx, uid = _check_model_extra_files_present(opts.model_dir, pdb_fle)
+    pdb_fle = os.path.join(opts.model_dir, pdb_fle)
+
+    # gather data into JSON-like structure
+    print("    preparing data...", end="")
+    pstart = timer()
+
+    mdlcf_json = _create_interaction_json(config_data)
+
+    # uid = ..._rank_X_model_Y.pdb
+    mdl_name_parts = uid.split("_")
+    assert mdl_name_parts[-4] == "rank"
+    assert mdl_name_parts[-2] == "model"
+    mdlcf_json["rank_num"] = int(mdl_name_parts[-3])
+    mdlcf_json["mdl_num"] = int(mdl_name_parts[-1])
+
+    ost_ent = _create_model_json(mdlcf_json, pdb_fle, up_acs, uid)
+
+    # read quality scores from JSON file
+    _get_scores(mdlcf_json, file_prfx)
+    print(f" ({timer()-pstart:.2f}s)")
+    mdlcf_fle, zip_fle = _store_as_modelcif(
+        mdlcf_json,
+        ost_ent,
+        opts.out_dir,
+        file_prfx,
+        opts.compress,
+        add_files,
+    )
+    return pdb_start, pdb_fle, mdlcf_fle, zip_fle
+
+
 def _main():
     """Run as script."""
     opts = _parse_args()
@@ -945,45 +1023,49 @@ def _main():
     config_data = _parse_colabfold_config(cnfg)
 
     # iterate model directory
-    found_ranked = False
+    # There is 1 representative for a modelling project, the other models are
+    # stored in its ZIP archive.
+    not_slctd_mdls = []
+    slctd_mdl = None
     for fle in sorted(os.listdir(opts.model_dir)):
         # iterate PDB files
         if not fle.endswith(".pdb"):
             continue
-        if opts.rank is not None and f"rank_{opts.rank}" not in fle:
+        if (
+            opts.selected_rank is not None
+            and f"rank_{opts.selected_rank}" in fle
+        ):
+            slctd_mdl = fle
             continue
-        found_ranked = True
         print(f"  translating {fle}...")
-        pdb_start = timer()
-        file_prfx, uid = _check_model_extra_files_present(opts.model_dir, fle)
-        fle = os.path.join(opts.model_dir, fle)
-
-        # gather data into JSON-like structure
-        print("    preparing data...", end="")
-        pstart = timer()
-
-        mdlcf_json = _create_interaction_json(config_data)
-
-        # uid = ..._rank_X_model_Y.pdb
-        mdl_name_parts = uid.split("_")
-        assert mdl_name_parts[-4] == "rank"
-        assert mdl_name_parts[-2] == "model"
-        mdlcf_json["rank_num"] = int(mdl_name_parts[-3])
-        mdlcf_json["mdl_num"] = int(mdl_name_parts[-1])
-
-        ost_ent = _create_model_json(mdlcf_json, fle, up_acs, uid)
-
-        # read quality scores from JSON file
-        _get_scores(mdlcf_json, file_prfx)
-        print(f" ({timer()-pstart:.2f}s)")
-
-        _store_as_modelcif(
-            mdlcf_json, ost_ent, opts.out_dir, file_prfx, opts.compress
+        pdb_start, fle, mdlcf_fle, zip_fle = _translate2modelcif(
+            up_acs,
+            fle,
+            config_data,
+            opts,
+            None,
         )
         print(f"  ... done with {fle} ({timer()-pdb_start:.2f}s).")
+        not_slctd_mdls.append(mdlcf_fle)
+        not_slctd_mdls.append(zip_fle)
+    if opts.selected_rank:
+        if slctd_mdl is None:
+            _abort_msg(
+                f"Could not find model of requested rank '{opts.selected_rank}'"
+            )
+        print(
+            f"  translating selected model {opts.selected_rank} "
+            + f"({slctd_mdl})..."
+        )
+        _translate2modelcif(
+            up_acs,
+            slctd_mdl,
+            config_data,
+            opts,
+            not_slctd_mdls,
+        )
+        print(f"  ... done with {slctd_mdl} ({timer()-pdb_start:.2f}s).")
 
-    if opts.rank and not found_ranked:
-        _abort_msg(f"Could not find model of requested rank '{opts.rank}'")
     print(f"... done with {opts.model_dir}.")
 
 
-- 
GitLab