Commit 5f2676e3 authored by Bienchen

Get draft of ModelCIF 'header'

parent e4f5df09
@@ -3,11 +3,15 @@
"""Take the output of the AlphaPulldown pipeline and turn it into a ModelCIF
file with a lot of metadata in place."""

from typing import Tuple
import json
import os
import sys

from absl import app, flags, logging

import modelcif
import modelcif.dumper

# ToDo: Get options properly, best get the same names as used in existing
# scripts, e.g. could '--monomer_objects_dir' be used as feature

@@ -30,44 +34,144 @@ FLAGS = flags.FLAGS
# exist as expected.


def _store_as_modelcif(
    data_json: dict,
    mdl_file: str,
    out_dir: str,
    # ost_ent, file_prfx, compress, add_files
) -> None:
    """Create the actual ModelCIF file."""
    system = modelcif.System(
        title=data_json["_struct.title"],
        id=data_json["data_"].upper(),
        model_details=data_json["_struct.pdbx_model_details"],
    )

    # write modelcif.System to file
    # NOTE: this will dump PAE on path provided in add_scores
    # -> hence we cheat by changing path and back while being exception-safe...
    oldpwd = os.getcwd()
    os.chdir(out_dir)
    try:
        with open(
            f"{os.path.splitext(os.path.basename(mdl_file))[0]}.cif",
            "w",
            encoding="ascii",
        ) as mmcif_fh:
            modelcif.dumper.write(mmcif_fh, [system])
            # _package_associated_files(system.repositories[0])
            # if compress:
            #     _compress_cif_file(mdl_fle)
    finally:
        os.chdir(oldpwd)
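
The try/finally block above keeps the directory switch exception-safe so that
files written by the dumper (see the NOTE) end up in `out_dir`. As a sketch
only, not part of this commit, the same idea can be wrapped into a small
context manager (on Python 3.11+ `contextlib.chdir` behaves the same way):

import contextlib
import os


@contextlib.contextmanager
def _in_dir(path):
    """Temporarily switch the working directory, restoring it on exit."""
    oldpwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(oldpwd)

With such a helper, the write step above would reduce to
`with _in_dir(out_dir): modelcif.dumper.write(...)`.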


def _get_model_details(cmplx_name: list, data_json: dict) -> str:
    """Get the model description."""
    ap_versions = []
    for mnmr in data_json["__meta__"]:
        if data_json["__meta__"][mnmr]["ap_version"] not in ap_versions:
            ap_versions.append(data_json["__meta__"][mnmr]["ap_version"])
    # ToDo: fetch AF2 version/ have it in metadata JSON
    return (
        f"Model generated for {' and '.join(cmplx_name)}, produced "
        + "using AlphaFold-Multimer (<AF2 VERSION>) as implemented by "
        + f"AlphaPulldown ({', '.join(ap_versions)})."
    )


def _get_feature_metadata(
    modelcif_json: dict, cmplx_name: str, prj_dir: str
) -> list:
    """Read metadata from a feature JSON file."""
    cmplx_name = cmplx_name.split("_and_")
    prj_dir = os.path.join(prj_dir, "features_monomers")
    if not os.path.isdir(prj_dir):
        logging.info(f"No feature directory '{prj_dir}' found.")
        sys.exit()
    if "__meta__" not in modelcif_json:
        modelcif_json["__meta__"] = {}
    for mnmr in cmplx_name:
        modelcif_json["__meta__"][mnmr] = {}
        feature_json = os.path.join(prj_dir, f"{mnmr}_feature_metadata.json")
        if not os.path.isfile(feature_json):
            logging.info(f"No feature metadata file '{feature_json}' found.")
            sys.exit()
        # ToDo: make sure that it's always ASCII
        with open(feature_json, "r", encoding="ascii") as jfh:
            jdata = json.load(jfh)
        modelcif_json["__meta__"][mnmr]["ap_version"] = jdata["version"]

    return cmplx_name
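
For reference, the function above expects one metadata JSON per monomer under
'<project dir>/features_monomers/'. A minimal sketch of such a file, written
from Python (monomer name and version string are made up; real AlphaPulldown
feature JSONs may carry more keys, but only 'version' is consumed here):

import json
import os

os.makedirs("features_monomers", exist_ok=True)
with open(
    os.path.join("features_monomers", "protA_feature_metadata.json"),
    "w",
    encoding="ascii",
) as jfh:
    json.dump({"version": "1.0.4"}, jfh)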


def _get_data_block_id_and_struct_and_entry_categories(
    cif_json: dict, cmplx_name: list
) -> None:
    """Get 'data_' block ID and data for categories '_struct' and '_entry'."""
    cif_json["data_"] = "_".join(cmplx_name)
    cif_json["_struct.title"] = f"Prediction for {' and '.join(cmplx_name)}"
    cif_json["_struct.pdbx_model_details"] = _get_model_details(
        cmplx_name, cif_json
    )
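
Taken together, the helpers above draft the ModelCIF 'header' this commit is
after. A worked example with made-up monomer names and version, assuming the
functions above are in scope:

cif_json = {
    "__meta__": {
        "protA": {"ap_version": "1.0.4"},
        "protB": {"ap_version": "1.0.4"},
    }
}
_get_data_block_id_and_struct_and_entry_categories(
    cif_json, ["protA", "protB"]
)
# cif_json["data_"]         -> "protA_and_protB"
# cif_json["_struct.title"] -> "Prediction for protA and protB"
# cif_json["_struct.pdbx_model_details"] -> the sentence assembled by
#     _get_model_details(), naming AlphaFold-Multimer and AlphaPulldown 1.0.4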


def alphapulldown_model_to_modelcif(
    cmplx_name: str,
    mdl_file: str,
    out_dir: str,
    prj_dir: str,
) -> None:
    """Convert an AlphaPulldown model into a ModelCIF formatted mmCIF file.

    Metadata for the ModelCIF categories will be fetched from AlphaPulldown
    output as far as possible. This expects modelling projects to exist in
    AlphaPulldown's output directory structure."""
    # ToDo: ENABLE logging.info(f"Processing '{mdl_file}'...")
    modelcif_json = {}
    # fetch metadata
    cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir)
    # fetch/ assemble more data about the modelling experiment
    _get_data_block_id_and_struct_and_entry_categories(
        modelcif_json, cmplx_name
    )
    _store_as_modelcif(modelcif_json, mdl_file, out_dir)
    # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")


def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
    """Get the list of models to be converted.

    If `model_selected` is None, all models will be marked for conversion."""
    # ToDo: Question - use 'ranked_*.pdb' or
    #       'unrelaxed_model_*_multimer_v3_pred_0.pdb' models?
    mdl_path = os.path.join(ap_dir, "models")
    cmplx = os.listdir(mdl_path)
    # For now, exactly 1 complex is expected in the 'models' subdirectory. If
    # there are more, the 'model_selected' mechanism needs to be further tuned
    # to get to the right model.
    assert len(cmplx) == 1
    cmplx = cmplx[0]
    mdl_path = os.path.join(mdl_path, cmplx)

    models = []
    if model_selected is not None:
        models.append(os.path.join(mdl_path, f"ranked_{model_selected}.pdb"))
    else:
        for mdl in os.listdir(mdl_path):
            if mdl.startswith("ranked_"):
                models.append(os.path.join(mdl_path, mdl))

    # check that files actually exist
    for mdl in models:
        if not os.path.isfile(mdl):
            logging.info(
                f"Model file '{mdl}' does not exist or is not a regular file."
            )
            sys.exit()

    return cmplx, mdl_path, models


def main(argv):

@@ -92,10 +196,14 @@ def main(argv):
    # pylint: enable=pointless-string-statement
    del argv  # Unused.

    # get list of selected models and assemble ModelCIF files + associated data
    complex_name, model_dir, model_list = _get_model_list(
        FLAGS.ap_output, FLAGS.model_selected
    )
    for mdl in model_list:
        alphapulldown_model_to_modelcif(
            complex_name, mdl, model_dir, FLAGS.ap_output
        )


if __name__ == "__main__":

@@ -106,5 +214,9 @@
# but we did that already in the past. Idea is to have all models
# available for... reproducibility and whatnot, but show the selected
# (representative) of the modelling experiment/ study more prominently.
# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details',
# 'data_', '_entry', maybe have a user-defined JSON document with things
# like that, including author names?
# ToDo: where to store which model was chosen? Should be in Tara's models.

# LocalWords: ToDo AlphaPulldown PAEs dir struct
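
For orientation, the script assumes AlphaPulldown's output directory
structure; the paths built by `_get_feature_metadata()` and
`_get_model_list()` boil down to this sketch (project path and
monomer/complex names are made up):

import os

ap_output = "/path/to/ap_output"  # passed via FLAGS.ap_output

# one metadata JSON per monomer, read by _get_feature_metadata()
feature_json = os.path.join(
    ap_output, "features_monomers", "protA_feature_metadata.json"
)

# exactly one complex directory is expected below 'models/' (see the assert
# in _get_model_list()); its 'ranked_*.pdb' files are the conversion input
model_dir = os.path.join(ap_output, "models", "protA_and_protB")
ranked_pdb = os.path.join(model_dir, "ranked_0.pdb")
# _store_as_modelcif() writes the matching 'ranked_0.cif' into the same
# directory, since main() hands the model directory in as 'out_dir'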
@@ -9,8 +9,16 @@ dependencies = [
[tool.black]
line-length = 80

[tool.pylint.MAIN]
load-plugins = "pylint.extensions.bad_builtin"

[tool.pylint.REPORTS]
reports = "no"

[tool.pylint.FORMAT]
max-line-length = 81

[tool.pylint.deprecated_builtins]
# We want to use proper logging, so we can control *ALL* output by the Abseil
# logger, hence: deprecate 'print'
bad-functions = ["map", "filter", "print"]
\ No newline at end of file
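
The added pylint configuration loads the bad-builtin extension and marks
'print' (besides 'map' and 'filter') as deprecated, so output has to go
through the Abseil logger used in the conversion script. A minimal sketch of
the preferred pattern (the function name is made up):

from absl import logging


def report(msg: str) -> None:
    # print(msg)  # would be flagged by the deprecated_builtins checker
    logging.info(msg)  # keeps all output under the Abseil logger's control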