Skip to content
Snippets Groups Projects
Commit b98d8878 authored by B13nch3n's avatar B13nch3n
Browse files

Update converter script.

parent 0ba72f69
No related branches found
No related tags found
No related merge requests found
...@@ -46,10 +46,12 @@ def _parse_args(): ...@@ -46,10 +46,12 @@ def _parse_args():
+ "'<UniProtKB AC>-<UniProtKB AC>'", + "'<UniProtKB AC>-<UniProtKB AC>'",
) )
parser.add_argument( parser.add_argument(
"--rank", "--selected_rank",
type=str, type=str,
default=None, default=None,
help="Only process the model with this rank.", help="If a certain model of a modelling project is selected by rank, "
+ "the other models are still translated to ModelCIF but stored as "
+ "accompanying files to the selected model.",
) )
parser.add_argument( parser.add_argument(
"--out_dir", "--out_dir",
...@@ -151,7 +153,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): ...@@ -151,7 +153,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
occupancy=atm.occupancy, occupancy=atm.occupancy,
) )
def add_scores(self, scores_json, entry_id, mdl_name): def add_scores(self, scores_json, entry_id, mdl_name, add_files):
"""Add QA metrics from AF2 scores.""" """Add QA metrics from AF2 scores."""
# global scores # global scores
self.qa_metrics.extend( self.qa_metrics.extend(
...@@ -196,21 +198,26 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): ...@@ -196,21 +198,26 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
self.qa_metrics.extend(lpae) self.qa_metrics.extend(lpae)
ac_file = f"{mdl_name}_local_pairwise_qa.cif" ac_file = f"{mdl_name}_local_pairwise_qa.cif"
qa_file = modelcif.associated.LocalPairwiseQAScoresFile( arc_files = [
ac_file, modelcif.associated.LocalPairwiseQAScoresFile(
categories=["_ma_qa_metric_local_pairwise"], ac_file,
copy_categories=["_ma_qa_metric"], categories=["_ma_qa_metric_local_pairwise"],
entry_id=entry_id, copy_categories=["_ma_qa_metric"],
entry_details="This file is an associated file consisting " entry_id=entry_id,
+ "of local pairwise QA metrics. This is a partial mmCIF " entry_details="This file is an associated file consisting "
+ "file and can be validated by merging with the main " + "of local pairwise QA metrics. This is a partial mmCIF "
+ "mmCIF file containing the model coordinates and other " + "file and can be validated by merging with the main "
+ "associated data.", + "mmCIF file containing the model coordinates and other "
details="Predicted aligned error", + "associated data.",
) details="Predicted aligned error",
)
]
if add_files:
arc_files.extend(add_files)
return modelcif.associated.Repository( return modelcif.associated.Repository(
"", "",
[modelcif.associated.ZipFile(f"{mdl_name}.zip", files=[qa_file])], [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)],
) )
# NOTE: by convention MA expects zip file with same name as model-cif # NOTE: by convention MA expects zip file with same name as model-cif
...@@ -248,14 +255,14 @@ def _check_model_extra_files_present(model_dir, pdb_file): ...@@ -248,14 +255,14 @@ def _check_model_extra_files_present(model_dir, pdb_file):
def _get_audit_authors(): def _get_audit_authors():
"""Return the list of authors that produced this model.""" """Return the list of authors that produced this model."""
return ( return (
"Bartolec, T.", "Bartolec, T.K.",
"Vazquez-Campos, X.", "Vazquez-Campos, X.",
"Johnson, M.",
"Norman, A.", "Norman, A.",
"Payne, R.", "Luong, C.",
"Wilkins, M.", "Payne, R.J.",
"Mackay, J.", "Wilkins, M.R.",
"Low, J.", "Mackay, J.P.",
"Low, J.K.K.",
) )
...@@ -818,19 +825,19 @@ def _compress_cif_file(cif_file): ...@@ -818,19 +825,19 @@ def _compress_cif_file(cif_file):
os.remove(cif_file) os.remove(cif_file)
def _package_associated_files(mdl_name): def _package_associated_files(repo):
"""Compress associated files into single zip file and delete original.""" """Compress associated files into single zip file and delete original."""
# file names must match ones from add_scores
zip_path = f"{mdl_name}.zip"
files = [f"{mdl_name}_local_pairwise_qa.cif"]
# zip settings tested for good speed vs compression # zip settings tested for good speed vs compression
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_BZIP2) as myzip: for archive in repo.files:
for file in files: with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
myzip.write(file) for zfile in archive.files:
os.remove(file) cif_zip.write(zfile.path, arcname=zfile.path)
os.remove(zfile.path)
def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): def _store_as_modelcif(
data_json, ost_ent, out_dir, file_prfx, compress, add_files
):
"""Mix all the data into a ModelCIF file.""" """Mix all the data into a ModelCIF file."""
print(" generating ModelCIF objects...", end="") print(" generating ModelCIF objects...", end="")
pstart = timer() pstart = timer()
...@@ -876,18 +883,23 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): ...@@ -876,18 +883,23 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress):
print(" processing QA scores...", end="", flush=True) print(" processing QA scores...", end="", flush=True)
pstart = timer() pstart = timer()
mdl_name = os.path.basename(file_prfx) mdl_name = os.path.basename(file_prfx)
system.repositories.append(model.add_scores(data_json, system.id, mdl_name)) system.repositories.append(
model.add_scores(data_json, system.id, mdl_name, add_files)
)
print(f" ({timer()-pstart:.2f}s)") print(f" ({timer()-pstart:.2f}s)")
system.model_groups.append( system.model_groups.append(
modelcif.model.ModelGroup([model], name=data_json["model_group_name"]) modelcif.model.ModelGroup([model], name=data_json["model_group_name"])
) )
ref_dbs = _get_sequence_dbs(data_json["config_data"]["seq_dbs"]) system.protocols.append(
protocol = _get_modelcif_protocol( _get_modelcif_protocol(
data_json["protocol"], system.target_entities, model, ref_dbs data_json["protocol"],
system.target_entities,
model,
_get_sequence_dbs(data_json["config_data"]["seq_dbs"]),
)
) )
system.protocols.append(protocol)
# write modelcif System to file # write modelcif System to file
print(" write to disk...", end="", flush=True) print(" write to disk...", end="", flush=True)
...@@ -896,17 +908,48 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): ...@@ -896,17 +908,48 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress):
# -> hence we cheat by changing path and back while being exception-safe... # -> hence we cheat by changing path and back while being exception-safe...
oldpwd = os.getcwd() oldpwd = os.getcwd()
os.chdir(out_dir) os.chdir(out_dir)
mdl_fle = f"{mdl_name}.cif"
try: try:
with open(f"{mdl_name}.cif", "w", encoding="ascii") as mmcif_fh: with open(mdl_fle, "w", encoding="ascii") as mmcif_fh:
modelcif.dumper.write(mmcif_fh, [system]) modelcif.dumper.write(mmcif_fh, [system])
_package_associated_files(mdl_name) _package_associated_files(system.repositories[0])
if compress: if compress:
_compress_cif_file(f"{mdl_name}.cif") _compress_cif_file(mdl_fle)
finally: finally:
os.chdir(oldpwd) os.chdir(oldpwd)
print(f" ({timer()-pstart:.2f}s)") print(f" ({timer()-pstart:.2f}s)")
mdl_fle = _get_assoc_mdl_file(mdl_fle, data_json)
zip_fle = _get_assoc_zip_file(
system.repositories[0].files[0].path, data_json
)
return mdl_fle, zip_fle
def _get_assoc_mdl_file(fle_path, data_json):
    """Generate a modelcif.associated.File object that looks like a CIF file.

    The dedicated CIFFile functionality in modelcif would also try to write it.

    :param fle_path: Path of the model mmCIF file.
    :param data_json: Model data; 'mdl_num' and 'rank_num' go into the
                      file details.
    :returns: modelcif.associated.File with file_format set to "cif".
    """
    cfile = modelcif.associated.File(
        fle_path,
        details=f"model {data_json['mdl_num']}; rank {data_json['rank_num']}",
    )
    # Tag as CIF manually instead of using CIFFile (see docstring why).
    cfile.file_format = "cif"
    return cfile
def _get_assoc_zip_file(fle_path, data_json):
    """Create a modelcif.associated.File object that looks like a ZIP file.

    This is NOT the archive ZIP file for the PAEs but to store that in the
    ZIP archive of the selected model.

    :param fle_path: Path of the ZIP archive of a non-selected model.
    :param data_json: Model data; 'mdl_num' and 'rank_num' go into the
                      file details.
    :returns: modelcif.associated.File with file_format set to "zip".
    """
    zfile = modelcif.associated.File(
        fle_path,
        details="archive with multiple files for model "
        + f"{data_json['mdl_num']}; rank {data_json['rank_num']}",
    )
    zfile.file_format = "zip"
    return zfile
def _create_interaction_json(config_data): def _create_interaction_json(config_data):
"""Create a dictionary (mimicking JSON) that contains data which is the same """Create a dictionary (mimicking JSON) that contains data which is the same
...@@ -931,6 +974,41 @@ def _create_model_json(data, pdb_file, up_acs, block_id): ...@@ -931,6 +974,41 @@ def _create_model_json(data, pdb_file, up_acs, block_id):
return ost_ent return ost_ent
def _translate2modelcif(up_acs, pdb_fle, config_data, opts, add_files):
    """Convert a PDB file with its accompanying data to ModelCIF.

    :param up_acs: UniProtKB ACs of the interacting proteins.
    :param pdb_fle: Model file name, relative to opts.model_dir.
    :param config_data: Parsed ColabFold config shared by all models of the
                        modelling project.
    :param opts: Parsed command-line options.
    :param add_files: Extra modelcif.associated.File objects to pack into the
                      model's ZIP archive (None for non-selected models).
    :returns: Tuple of (start time, full PDB path, model-CIF associated file,
              ZIP associated file).
    """
    pdb_start = timer()
    file_prfx, uid = _check_model_extra_files_present(opts.model_dir, pdb_fle)
    pdb_fle = os.path.join(opts.model_dir, pdb_fle)

    # gather data into JSON-like structure
    print("  preparing data...", end="")
    pstart = timer()

    mdlcf_json = _create_interaction_json(config_data)

    # uid = ..._rank_X_model_Y.pdb
    mdl_name_parts = uid.split("_")
    assert mdl_name_parts[-4] == "rank"
    assert mdl_name_parts[-2] == "model"
    mdlcf_json["rank_num"] = int(mdl_name_parts[-3])
    mdlcf_json["mdl_num"] = int(mdl_name_parts[-1])

    ost_ent = _create_model_json(mdlcf_json, pdb_fle, up_acs, uid)

    # read quality scores from JSON file
    _get_scores(mdlcf_json, file_prfx)
    print(f" ({timer()-pstart:.2f}s)")

    mdlcf_fle, zip_fle = _store_as_modelcif(
        mdlcf_json,
        ost_ent,
        opts.out_dir,
        file_prfx,
        opts.compress,
        add_files,
    )
    return pdb_start, pdb_fle, mdlcf_fle, zip_fle
def _main(): def _main():
"""Run as script.""" """Run as script."""
opts = _parse_args() opts = _parse_args()
...@@ -945,45 +1023,49 @@ def _main(): ...@@ -945,45 +1023,49 @@ def _main():
config_data = _parse_colabfold_config(cnfg) config_data = _parse_colabfold_config(cnfg)
# iterate model directory # iterate model directory
found_ranked = False # There is 1 representative for a modelling project, the other models are
# stored in its ZIP archive.
not_slctd_mdls = []
slctd_mdl = None
for fle in sorted(os.listdir(opts.model_dir)): for fle in sorted(os.listdir(opts.model_dir)):
# iterate PDB files # iterate PDB files
if not fle.endswith(".pdb"): if not fle.endswith(".pdb"):
continue continue
if opts.rank is not None and f"rank_{opts.rank}" not in fle: if (
opts.selected_rank is not None
and f"rank_{opts.selected_rank}" in fle
):
slctd_mdl = fle
continue continue
found_ranked = True
print(f" translating {fle}...") print(f" translating {fle}...")
pdb_start = timer() pdb_start, fle, mdlcf_fle, zip_fle = _translate2modelcif(
file_prfx, uid = _check_model_extra_files_present(opts.model_dir, fle) up_acs,
fle = os.path.join(opts.model_dir, fle) fle,
config_data,
# gather data into JSON-like structure opts,
print(" preparing data...", end="") None,
pstart = timer()
mdlcf_json = _create_interaction_json(config_data)
# uid = ..._rank_X_model_Y.pdb
mdl_name_parts = uid.split("_")
assert mdl_name_parts[-4] == "rank"
assert mdl_name_parts[-2] == "model"
mdlcf_json["rank_num"] = int(mdl_name_parts[-3])
mdlcf_json["mdl_num"] = int(mdl_name_parts[-1])
ost_ent = _create_model_json(mdlcf_json, fle, up_acs, uid)
# read quality scores from JSON file
_get_scores(mdlcf_json, file_prfx)
print(f" ({timer()-pstart:.2f}s)")
_store_as_modelcif(
mdlcf_json, ost_ent, opts.out_dir, file_prfx, opts.compress
) )
print(f" ... done with {fle} ({timer()-pdb_start:.2f}s).") print(f" ... done with {fle} ({timer()-pdb_start:.2f}s).")
not_slctd_mdls.append(mdlcf_fle)
not_slctd_mdls.append(zip_fle)
if opts.selected_rank:
if slctd_mdl is None:
_abort_msg(
f"Could not find model of requested rank '{opts.selected_rank}'"
)
print(
f" translating selected model {opts.selected_rank} "
+ f"({slctd_mdl})..."
)
_translate2modelcif(
up_acs,
slctd_mdl,
config_data,
opts,
not_slctd_mdls,
)
print(f" ... done with {slctd_mdl} ({timer()-pdb_start:.2f}s).")
if opts.rank and not found_ranked:
_abort_msg(f"Could not find model of requested rank '{opts.rank}'")
print(f"... done with {opts.model_dir}.") print(f"... done with {opts.model_dir}.")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment