From 5f2676e33a4318f6183fb8b9c02b7e5e9400d551 Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Tue, 22 Aug 2023 14:32:50 +0200 Subject: [PATCH] Get draft of ModelCIF 'header' --- convert_to_modelcif.py | 162 ++++++++++++++++++++++++++++++++++------- pyproject.toml | 8 ++ 2 files changed, 145 insertions(+), 25 deletions(-) diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index 398ff2c..bd2a21f 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -3,11 +3,15 @@ """Take the output of the AlphaPulldown pipeline and turn it into a ModelCIF file with a lot of metadata in place.""" +from typing import Tuple +import json import os import sys from absl import app, flags, logging +import modelcif +import modelcif.dumper # ToDo: Get options properly, best get the same names as used in existing # scripts, e.g. could '--monomer_objects_dir' be used as feature @@ -30,44 +34,144 @@ FLAGS = flags.FLAGS # exist as expected. -def alphapulldown_model_to_modelcif() -> None: +def _store_as_modelcif( + data_json: dict, + mdl_file: str, + out_dir: str, + # ost_ent, file_prfx, compress, add_files +) -> None: + """Create the actual ModelCIF file.""" + system = modelcif.System( + title=data_json["_struct.title"], + id=data_json["data_"].upper(), + model_details=data_json["_struct.pdbx_model_details"], + ) + + # write modelcif.System to file + # NOTE: this will dump PAE on path provided in add_scores + # -> hence we cheat by changing path and back while being exception-safe... 
+ oldpwd = os.getcwd() + os.chdir(out_dir) + try: + with open( + f"{os.path.splitext(os.path.basename(mdl_file))[0]}.cif", + "w", + encoding="ascii", + ) as mmcif_fh: + modelcif.dumper.write(mmcif_fh, [system]) + # _package_associated_files(system.repositories[0]) + # if compress: + # _compress_cif_file(mdl_fle) + finally: + os.chdir(oldpwd) + + +def _get_model_details(cmplx_name: str, data_json: dict) -> str: + """Get the model description.""" + ap_versions = [] + for mnmr in data_json["__meta__"]: + if data_json["__meta__"][mnmr]["ap_version"] not in ap_versions: + ap_versions.append(data_json["__meta__"][mnmr]["ap_version"]) + + # ToDo: fetch AF2 version/ have it in metadata JSON + return ( + f"Model generated for {' and '.join(cmplx_name)}, produced " + + "using AlphaFold-Multimer (<AF2 VERSION>) as implemented by " + + f"AlphaPulldown ({', '.join(ap_versions)})." + ) + + +def _get_feature_metadata( + modelcif_json: dict, cmplx_name: str, prj_dir: str +) -> list: + """Read metadata from a feature JSON file.""" + cmplx_name = cmplx_name.split("_and_") + prj_dir = os.path.join(prj_dir, "features_monomers") + if not os.path.isdir(prj_dir): + logging.info(f"No feature directory '{prj_dir}' found.") + sys.exit() + if "__meta__" not in modelcif_json: + modelcif_json["__meta__"] = {} + for mnmr in cmplx_name: + modelcif_json["__meta__"][mnmr] = {} + feature_json = os.path.join(prj_dir, f"{mnmr}_feature_metadata.json") + if not os.path.isfile(feature_json): + logging.info(f"No feature metadata file '{feature_json}' found.") + sys.exit() + # ToDo: make sure that its always ASCII + with open(feature_json, "r", encoding="ascii") as jfh: + jdata = json.load(jfh) + modelcif_json["__meta__"][mnmr]["ap_version"] = jdata["version"] + + return cmplx_name + + +def _get_data_block_id_and_struct_and_entry_categories( + cif_json: dict, cmplx_name: str +) -> None: + """Get 'data_' block ID and data for categories '_struct' and '_entry'.""" + cif_json["data_"] = "_".join(cmplx_name) 
+ cif_json["_struct.title"] = f"Prediction for {' and '.join(cmplx_name)}" + cif_json["_struct.pdbx_model_details"] = _get_model_details( + cmplx_name, cif_json + ) + + +def alphapulldown_model_to_modelcif( + cmplx_name: str, + mdl_file: str, + out_dir: str, + prj_dir: str, +) -> None: """Convert an AlphaPulldown model into a ModelCIF formatted mmCIF file. Metadata for the ModelCIF categories will be fetched from AlphaPulldown output as far as possible. This expects modelling projects to exists in AlphaPulldown's output directory structure.""" - - -def _get_model_list(ap_dir, model_selected) -> list: + # ToDo: ENABLE logging.info(f"Processing '{mdl_file}'...") + modelcif_json = {} + # fetch metadata + cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir) + # fetch/ assemble more data about the modelling experiment + _get_data_block_id_and_struct_and_entry_categories( + modelcif_json, cmplx_name + ) + _store_as_modelcif(modelcif_json, mdl_file, out_dir) + # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'") + + +def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: """Get the list of models to be converted. If `model_selected` is none, all models will be marked for conversion.""" # ToDo: Question - use 'ranked_*.pdb' or # 'unrelaxed_model_*_multimer_v3_pred_0.pdb' models? + mdl_path = os.path.join(ap_dir, "models") + cmplx = os.listdir(mdl_path) + # For now, exactly 1 complex is expected in the 'models' subdirectory. If + # there are more, the 'model_selected' mechanism needs to be further tuned + # to get to the right model. + assert len(cmplx) == 1 + cmplx = cmplx[0] + mdl_path = os.path.join(mdl_path, cmplx) + models = [] if model_selected is not None: - mdl_paths = os.path.join(ap_dir, "models") - cmplx = os.listdir(mdl_paths) - # For now, exactly 1 complex is expected in the 'models' subdirectory. - # If there are more, the 'model_selected' mechanism needs to be further - # tuned to get to the right model. 
- assert len(cmplx) == 1 - cmplx = cmplx[0] - mdl_paths = os.path.join( - mdl_paths, cmplx, f"ranked_{model_selected}.pdb" - ) - mdl_paths = [mdl_paths] + models.append(os.path.join(mdl_path, f"ranked_{model_selected}.pdb")) + else: + for mdl in os.listdir(mdl_path): + if mdl.startswith("ranked_"): + models.append(os.path.join(mdl_path, mdl)) - for mdl in mdl_paths: - if os.path.isfile(mdl): + # check that files actually exist + for mdl in models: + if not os.path.isfile(mdl): logging.info( f"Model file '{mdl}' does not exist or is not a regular file." ) sys.exit() - # check that files actually exist - - return [] + return cmplx, mdl_path, models def main(argv): @@ -92,10 +196,14 @@ def main(argv): # pylint: enable=pointless-string-statement del argv # Unused. - # make list of selected models - model_conversions = _get_model_list(FLAGS.ap_output, FLAGS.model_selected) - # assemble selected models into ModelCIF files + associated data archives - alphapulldown_model_to_modelcif() + # get list of selected models and assemble ModelCIF files + associated data + complex_name, model_dir, model_list = _get_model_list( + FLAGS.ap_output, FLAGS.model_selected + ) + for mdl in model_list: + alphapulldown_model_to_modelcif( + complex_name, mdl, model_dir, FLAGS.ap_output + ) if __name__ == "__main__": @@ -106,5 +214,9 @@ if __name__ == "__main__": # but we did that already in the past. Idea is to have all models # available for... reproducibility and whatnot, but show the selected # (representative) of the modelling experiment/ study more prominently. +# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details', +# 'data_', '_entry', maybe have a user-defined JSON document with things +# like that, including author names? +# ToDo: where to store which model was chosen? Should be in Tara's models. 
-# LocalWords: ToDo AlphaPulldown PAEs dir +# LocalWords: ToDo AlphaPulldown PAEs dir struct diff --git a/pyproject.toml b/pyproject.toml index c80443d..f9bbc98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,8 +9,16 @@ dependencies = [ [tool.black] line-length = 80 +[tool.pylint.MAIN] +load-plugins = "pylint.extensions.bad_builtin" + [tool.pylint.REPORTS] reports = "no" [tool.pylint.FORMAT] max-line-length = 81 + +[tool.pylint.deprecated_builtins] +# We want to use proper logging, so we can control *ALL* output by the Abseil +# logger, hence: deprecate 'print' +bad-functions = ["map", "filter", "print"] \ No newline at end of file -- GitLab