Skip to content
Snippets Groups Projects
Commit 45ffa009 authored by Bienchen's avatar Bienchen
Browse files

Get a proper model name

parent d45d73c3
Branches
No related tags found
No related merge requests found
...@@ -29,6 +29,15 @@ import modelcif.model ...@@ -29,6 +29,15 @@ import modelcif.model
# ToDo: Get options properly, best get the same names as used in existing # ToDo: Get options properly, best get the same names as used in existing
# scripts, e.g. could '--monomer_objects_dir' be used as feature # scripts, e.g. could '--monomer_objects_dir' be used as feature
# directory/ directory with the feature JSON files? # directory/ directory with the feature JSON files?
# ToDo: Monomers work separately - features may come from different set of
# software, databases... so target sequences may be connected to different
# versions of the same sequence database, may use different versions of
# software... can this still go into single protocol steps or would this
# prevent identifying which MSAs were produced with which software? E.g.
# can there still be a single "sequence search" step? (This is
# definitively for later, not the first working version of the converter
# script)
# ToDo: sort non-ModelCIF items in the main JSON object into '__meta__'
flags.DEFINE_string( flags.DEFINE_string(
"ap_output", None, "AlphaPulldown pipeline output directory." "ap_output", None, "AlphaPulldown pipeline output directory."
) )
...@@ -245,19 +254,16 @@ def _store_as_modelcif( ...@@ -245,19 +254,16 @@ def _store_as_modelcif(
system, system,
) )
# ToDo: get modelling-experiment auhtors # ToDo: get modelling-experiment authors
# audit_authors # audit_authors
# system.authors.extend(data_json["audit_authors"]) # system.authors.extend(data_json["audit_authors"])
# set up the model to produce coordinates # set up the model to produce coordinates
# ToDo: model file names are different than expected, got ranked_<N>.pdb
# and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something
# like model_<N>_rank_<M>.pdb, used for 'model list name'
model = _Biopython2ModelCIF( model = _Biopython2ModelCIF(
assembly=modelcif.Assembly(asym_units.values()), assembly=modelcif.Assembly(asym_units.values()),
asym=asym_units, asym=asym_units,
bio_pdb_structure=structure, bio_pdb_structure=structure,
name="ToDo: Model <N> (ranked #<M>)", name=data_json["_ma_model_list.model_name"],
) )
# create software list from feature metadata # create software list from feature metadata
...@@ -327,6 +333,7 @@ def _compress_cif_file(cif_file): ...@@ -327,6 +333,7 @@ def _compress_cif_file(cif_file):
def _get_model_details(cmplx_name: str, data_json: dict) -> str: def _get_model_details(cmplx_name: str, data_json: dict) -> str:
"""Get the model description.""" """Get the model description."""
ap_versions = [] ap_versions = []
af2_version = None
for mnmr in data_json["__meta__"]: # mnmr = monomer for mnmr in data_json["__meta__"]: # mnmr = monomer
if ( if (
data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"] data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"]
...@@ -337,11 +344,27 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str: ...@@ -337,11 +344,27 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str:
"version" "version"
] ]
) )
# AlphaFold-Multimer builds the model we are looking at, can only be a
# single version.
if af2_version is None:
af2_version = data_json["__meta__"][mnmr]["software"]["alphafold"][
"version"
]
else:
if (
data_json["__meta__"][mnmr]["software"]["alphafold"]["version"]
!= af2_version
):
# pylint: disable=line-too-long
raise RuntimeError(
"Different versions of AlphaFold-Multimer found: "
+ f"'{data_json['__meta__'][mnmr]['software']['alphafold']['version']}'"
+ f" vs. '{af2_version}'"
)
# ToDo: fetch AF2 version/ have it in metadata JSON
return ( return (
f"Model generated for {' and '.join(cmplx_name)}, produced " f"Model generated for {' and '.join(cmplx_name)}, produced "
+ "using AlphaFold-Multimer (<AF2 VERSION>) as implemented by " + f"using AlphaFold-Multimer ({af2_version}) as implemented by "
+ f"AlphaPulldown ({', '.join(ap_versions)})." + f"AlphaPulldown ({', '.join(ap_versions)})."
) )
...@@ -377,8 +400,11 @@ def _get_feature_metadata( ...@@ -377,8 +400,11 @@ def _get_feature_metadata(
return cmplx_name return cmplx_name
def _get_data_block_id_and_struct_and_entry_categories( def _get_model_info(
cif_json: dict, cmplx_name: str cif_json: dict,
cmplx_name: str,
mdl_id: str,
mdl_rank: int,
) -> None: ) -> None:
"""Get 'data_' block ID and data for categories '_struct' and '_entry'.""" """Get 'data_' block ID and data for categories '_struct' and '_entry'."""
cif_json["data_"] = "_".join(cmplx_name) cif_json["data_"] = "_".join(cmplx_name)
...@@ -386,6 +412,9 @@ def _get_data_block_id_and_struct_and_entry_categories( ...@@ -386,6 +412,9 @@ def _get_data_block_id_and_struct_and_entry_categories(
cif_json["_struct.pdbx_model_details"] = _get_model_details( cif_json["_struct.pdbx_model_details"] = _get_model_details(
cmplx_name, cif_json cmplx_name, cif_json
) )
cif_json[
"_ma_model_list.model_name"
] = f"Model {mdl_id} (ranked #{mdl_rank})"
def _get_entities( def _get_entities(
...@@ -525,8 +554,7 @@ def _get_software_data(meta_json: dict) -> list: ...@@ -525,8 +554,7 @@ def _get_software_data(meta_json: dict) -> list:
def alphapulldown_model_to_modelcif( def alphapulldown_model_to_modelcif(
cmplx_name: str, cmplx_name: str,
mdl_file: str, mdl: tuple,
scr_file: str,
out_dir: str, out_dir: str,
prj_dir: str, prj_dir: str,
compress: bool = False, compress: bool = False,
...@@ -536,31 +564,31 @@ def alphapulldown_model_to_modelcif( ...@@ -536,31 +564,31 @@ def alphapulldown_model_to_modelcif(
Metadata for the ModelCIF categories will be fetched from AlphaPulldown Metadata for the ModelCIF categories will be fetched from AlphaPulldown
output as far as possible. This expects modelling projects to exists in output as far as possible. This expects modelling projects to exists in
AlphaPulldown's output directory structure.""" AlphaPulldown's output directory structure."""
# ToDo: ENABLE logging.info(f"Processing '{mdl_file}'...") # ToDo: ENABLE logging.info(f"Processing '{mdl[0]}'...")
modelcif_json = {} modelcif_json = {}
# fetch metadata # fetch metadata
cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir) cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir)
# fetch/ assemble more data about the modelling experiment # fetch/ assemble more data about the modelling experiment
_get_data_block_id_and_struct_and_entry_categories( _get_model_info(
modelcif_json, cmplx_name modelcif_json,
cmplx_name,
mdl[2],
mdl[3],
) )
# gather target entities (sequences that have been modeled) info # gather target entities (sequences that have been modeled) info
structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir) structure = _get_entities(modelcif_json, mdl[0], cmplx_name, prj_dir)
# read quality scores from pickle file # read quality scores from pickle file
_get_scores(modelcif_json, scr_file) _get_scores(modelcif_json, mdl[1])
_store_as_modelcif(modelcif_json, structure, mdl_file, out_dir, compress) _store_as_modelcif(modelcif_json, structure, mdl[0], out_dir, compress)
# ToDo: ENABLE logging.info(f"... done with '{mdl_file}'") # ToDo: ENABLE logging.info(f"... done with '{mdl[0]}'")
def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
"""Get the list of models to be converted. """Get the list of models to be converted.
If `model_selected` is none, all models will be marked for conversion.""" If `model_selected` is none, all models will be marked for conversion."""
# ToDo: Question - use 'ranked_*.pdb' or
# 'unrelaxed_model_*_multimer_v3_pred_0.pdb' models?
mdl_path = os.path.join(ap_dir, "models") mdl_path = os.path.join(ap_dir, "models")
cmplx = os.listdir(mdl_path) cmplx = os.listdir(mdl_path)
# For now, exactly 1 complex is expected in the 'models' subdirectory. If # For now, exactly 1 complex is expected in the 'models' subdirectory. If
...@@ -576,7 +604,7 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: ...@@ -576,7 +604,7 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
ranking_dbg = os.path.join(mdl_path, "ranking_debug.json") ranking_dbg = os.path.join(mdl_path, "ranking_debug.json")
if not os.path.isfile(ranking_dbg): if not os.path.isfile(ranking_dbg):
logging.info( logging.info(
f"Ranking file '{ranking_dbg} doe snot exist or is no regular " f"Ranking file '{ranking_dbg} does not exist or is no regular "
+ "file." + "file."
) )
sys.exit() sys.exit()
...@@ -584,13 +612,24 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: ...@@ -584,13 +612,24 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
ranking_dbg = json.load(jfh) ranking_dbg = json.load(jfh)
score_files = {} score_files = {}
for i, fle in enumerate(ranking_dbg["order"]): for i, fle in enumerate(ranking_dbg["order"]):
score_files[i] = os.path.join(mdl_path, f"result_{fle}.pkl") if not fle.startswith("model_"):
raise RuntimeError(
"Filename does not start with 'model_', can "
+ f"not determine model ID: '{fle}'"
)
score_files[i] = (
os.path.join(mdl_path, f"result_{fle}.pkl"),
fle.split("_")[1],
i,
)
# match PDB files with pickle files # match PDB files with pickle files
if model_selected is not None: if model_selected is not None:
models.append( models.append(
( (
os.path.join(mdl_path, f"ranked_{model_selected}.pdb"), os.path.join(mdl_path, f"ranked_{model_selected}.pdb"),
score_files[model_selected], score_files[model_selected][0],
score_files[model_selected][1], # model ID
score_files[model_selected][2], # model rank
) )
) )
else: else:
...@@ -598,10 +637,15 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: ...@@ -598,10 +637,15 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
rank = re.match(r"ranked_(\d+)\.pdb", mdl) rank = re.match(r"ranked_(\d+)\.pdb", mdl)
if rank is not None: if rank is not None:
rank = int(rank.group(1)) rank = int(rank.group(1))
models.append( models.append(
(
os.path.join(mdl_path, mdl),
score_files[rank][0], score_files[rank][1],
)
)
# check that files actually exist # check that files actually exist
for mdl, scrs in models: for mdl, scrs, *_ in models:
if not os.path.isfile(mdl): if not os.path.isfile(mdl):
logging.info( logging.info(
f"Model file '{mdl}' does not exist or is not a regular file." f"Model file '{mdl}' does not exist or is not a regular file."
...@@ -642,11 +686,10 @@ def main(argv): ...@@ -642,11 +686,10 @@ def main(argv):
complex_name, model_dir, model_list = _get_model_list( complex_name, model_dir, model_list = _get_model_list(
FLAGS.ap_output, FLAGS.model_selected FLAGS.ap_output, FLAGS.model_selected
) )
for mdl, scrs in model_list: for mdl in model_list:
alphapulldown_model_to_modelcif( alphapulldown_model_to_modelcif(
complex_name, complex_name,
mdl, mdl,
scrs,
model_dir, model_dir,
FLAGS.ap_output, FLAGS.ap_output,
FLAGS.compress, FLAGS.compress,
...@@ -668,4 +711,4 @@ if __name__ == "__main__": ...@@ -668,4 +711,4 @@ if __name__ == "__main__":
# ToDo: make sure all functions come with types # ToDo: make sure all functions come with types
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling # LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: multimer sif Jupyter aa # LocalWords: multimer sif Jupyter aa MSAs
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment