From 5f2676e33a4318f6183fb8b9c02b7e5e9400d551 Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Tue, 22 Aug 2023 14:32:50 +0200 Subject: [PATCH] Get draft of ModelCIF 'header' --- convert_to_modelcif.py | 162 ++++++++++++++++++++++++++++++++++------- pyproject.toml | 8 ++ 2 files changed, 145 insertions(+), 25 deletions(-) diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index 398ff2c..bd2a21f 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -3,11 +3,15 @@ """Take the output of the AlphaPulldown pipeline and turn it into a ModelCIF file with a lot of metadata in place.""" +from typing import Tuple +import json import os import sys from absl import app, flags, logging +import modelcif +import modelcif.dumper # ToDo: Get options properly, best get the same names as used in existing # scripts, e.g. could '--monomer_objects_dir' be used as feature @@ -30,44 +34,144 @@ FLAGS = flags.FLAGS # exist as expected. -def alphapulldown_model_to_modelcif() -> None: +def _store_as_modelcif( + data_json: dict, + mdl_file: str, + out_dir: str, + # ost_ent, file_prfx, compress, add_files +) -> None: + """Create the actual ModelCIF file.""" + system = modelcif.System( + title=data_json["_struct.title"], + id=data_json["data_"].upper(), + model_details=data_json["_struct.pdbx_model_details"], + ) + + # write modelcif.System to file + # NOTE: this will dump PAE on path provided in add_scores + # -> hence we cheat by changing path and back while being exception-safe... 
+ oldpwd = os.getcwd() + os.chdir(out_dir) + try: + with open( + f"{os.path.splitext(os.path.basename(mdl_file))[0]}.cif", + "w", + encoding="ascii", + ) as mmcif_fh: + modelcif.dumper.write(mmcif_fh, [system]) + # _package_associated_files(system.repositories[0]) + # if compress: + # _compress_cif_file(mdl_fle) + finally: + os.chdir(oldpwd) + + +def _get_model_details(cmplx_name: str, data_json: dict) -> str: + """Get the model description.""" + ap_versions = [] + for mnmr in data_json["__meta__"]: + if data_json["__meta__"][mnmr]["ap_version"] not in ap_versions: + ap_versions.append(data_json["__meta__"][mnmr]["ap_version"]) + + # ToDo: fetch AF2 version/ have it in metadata JSON + return ( + f"Model generated for {' and '.join(cmplx_name)}, produced " + + "using AlphaFold-Multimer (<AF2 VERSION>) as implemented by " + + f"AlphaPulldown ({', '.join(ap_versions)})." + ) + + +def _get_feature_metadata( + modelcif_json: dict, cmplx_name: str, prj_dir: str +) -> list: + """Read metadata from a feature JSON file.""" + cmplx_name = cmplx_name.split("_and_") + prj_dir = os.path.join(prj_dir, "features_monomers") + if not os.path.isdir(prj_dir): + logging.info(f"No feature directory '{prj_dir}' found.") + sys.exit() + if "__meta__" not in modelcif_json: + modelcif_json["__meta__"] = {} + for mnmr in cmplx_name: + modelcif_json["__meta__"][mnmr] = {} + feature_json = os.path.join(prj_dir, f"{mnmr}_feature_metadata.json") + if not os.path.isfile(feature_json): + logging.info(f"No feature metadata file '{feature_json}' found.") + sys.exit() + # ToDo: make sure that its always ASCII + with open(feature_json, "r", encoding="ascii") as jfh: + jdata = json.load(jfh) + modelcif_json["__meta__"][mnmr]["ap_version"] = jdata["version"] + + return cmplx_name + + +def _get_data_block_id_and_struct_and_entry_categories( + cif_json: dict, cmplx_name: str +) -> None: + """Get 'data_' block ID and data for categories '_struct' and '_entry'.""" + cif_json["data_"] = "_".join(cmplx_name) 
+ cif_json["_struct.title"] = f"Prediction for {' and '.join(cmplx_name)}" + cif_json["_struct.pdbx_model_details"] = _get_model_details( + cmplx_name, cif_json + ) + + +def alphapulldown_model_to_modelcif( + cmplx_name: str, + mdl_file: str, + out_dir: str, + prj_dir: str, +) -> None: """Convert an AlphaPulldown model into a ModelCIF formatted mmCIF file. Metadata for the ModelCIF categories will be fetched from AlphaPulldown output as far as possible. This expects modelling projects to exists in AlphaPulldown's output directory structure.""" - - -def _get_model_list(ap_dir, model_selected) -> list: + # ToDo: ENABLE logging.info(f"Processing '{mdl_file}'...") + modelcif_json = {} + # fetch metadata + cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir) + # fetch/ assemble more data about the modelling experiment + _get_data_block_id_and_struct_and_entry_categories( + modelcif_json, cmplx_name + ) + _store_as_modelcif(modelcif_json, mdl_file, out_dir) + # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'") + + +def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: """Get the list of models to be converted. If `model_selected` is none, all models will be marked for conversion.""" # ToDo: Question - use 'ranked_*.pdb' or # 'unrelaxed_model_*_multimer_v3_pred_0.pdb' models? + mdl_path = os.path.join(ap_dir, "models") + cmplx = os.listdir(mdl_path) + # For now, exactly 1 complex is expected in the 'models' subdirectory. If + # there are more, the 'model_selected' mechanism needs to be further tuned + # to get to the right model. + assert len(cmplx) == 1 + cmplx = cmplx[0] + mdl_path = os.path.join(mdl_path, cmplx) + models = [] if model_selected is not None: - mdl_paths = os.path.join(ap_dir, "models") - cmplx = os.listdir(mdl_paths) - # For now, exactly 1 complex is expected in the 'models' subdirectory. - # If there are more, the 'model_selected' mechanism needs to be further - # tuned to get to the right model. 
- assert len(cmplx) == 1 - cmplx = cmplx[0] - mdl_paths = os.path.join( - mdl_paths, cmplx, f"ranked_{model_selected}.pdb" - ) - mdl_paths = [mdl_paths] + models.append(os.path.join(mdl_path, f"ranked_{model_selected}.pdb")) + else: + for mdl in os.listdir(mdl_path): + if mdl.startswith("ranked_"): + models.append(os.path.join(mdl_path, mdl)) - for mdl in mdl_paths: - if os.path.isfile(mdl): + # check that files actually exist + for mdl in models: + if not os.path.isfile(mdl): logging.info( f"Model file '{mdl}' does not exist or is not a regular file." ) sys.exit() - # check that files actually exist - - return [] + return cmplx, mdl_path, models def main(argv): @@ -92,10 +196,14 @@ def main(argv): # pylint: enable=pointless-string-statement del argv # Unused. - # make list of selected models - model_conversions = _get_model_list(FLAGS.ap_output, FLAGS.model_selected) - # assemble selected models into ModelCIF files + associated data archives - alphapulldown_model_to_modelcif() + # get list of selected models and assemble ModelCIF files + associated data + complex_name, model_dir, model_list = _get_model_list( + FLAGS.ap_output, FLAGS.model_selected + ) + for mdl in model_list: + alphapulldown_model_to_modelcif( + complex_name, mdl, model_dir, FLAGS.ap_output + ) if __name__ == "__main__": @@ -106,5 +214,9 @@ if __name__ == "__main__": # but we did that already in the past. Idea is to have all models # available for... reproducibility and whatnot, but show the selected # (representative) of the modelling experiment/ study more prominently. +# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details', +# 'data_', '_entry', maybe have a user-defined JSON document with things +# like that, including author names? +# ToDo: where to store which model was chosen? Should be in Tara's models. 
-# LocalWords: ToDo AlphaPulldown PAEs dir +# LocalWords: ToDo AlphaPulldown PAEs dir struct diff --git a/pyproject.toml b/pyproject.toml index c80443d..f9bbc98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,8 +9,16 @@ dependencies = [ [tool.black] line-length = 80 +[tool.pylint.MAIN] +load-plugins = "pylint.extensions.bad_builtin" + [tool.pylint.REPORTS] reports = "no" [tool.pylint.FORMAT] max-line-length = 81 + +[tool.pylint.deprecated_builtins] +# We want to use proper logging, so we can control *ALL* output by the Abseil +# logger, hence: deprecate 'print' +bad-functions = ["map", "filter", "print"] \ No newline at end of file -- GitLab