Commit 5f2676e3 authored by Bienchen

Get draft of ModelCIF 'header'

parent e4f5df09
@@ -3,11 +3,15 @@
"""Take the output of the AlphaPulldown pipeline and turn it into a ModelCIF
file with a lot of metadata in place."""

from typing import Tuple
import json
import os
import sys

from absl import app, flags, logging

import modelcif
import modelcif.dumper

# ToDo: Get options properly, best get the same names as used in existing
# scripts, e.g. could '--monomer_objects_dir' be used as feature

@@ -30,44 +34,144 @@ FLAGS = flags.FLAGS
# exist as expected.


def _store_as_modelcif(
    data_json: dict,
    mdl_file: str,
    out_dir: str,
    # ost_ent, file_prfx, compress, add_files
) -> None:
    """Create the actual ModelCIF file."""
    system = modelcif.System(
        title=data_json["_struct.title"],
        id=data_json["data_"].upper(),
        model_details=data_json["_struct.pdbx_model_details"],
    )

    # write modelcif.System to file
    # NOTE: this will dump PAE on path provided in add_scores
    # -> hence we cheat by changing path and back while being exception-safe...
    oldpwd = os.getcwd()
    os.chdir(out_dir)
    try:
        with open(
            f"{os.path.splitext(os.path.basename(mdl_file))[0]}.cif",
            "w",
            encoding="ascii",
        ) as mmcif_fh:
            modelcif.dumper.write(mmcif_fh, [system])
            # _package_associated_files(system.repositories[0])
            # if compress:
            #     _compress_cif_file(mdl_fle)
    finally:
        os.chdir(oldpwd)
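
The try/finally block above keeps the directory switch exception-safe so that
files written by the dumper (see the NOTE) end up in `out_dir`. As a sketch
only, not part of this commit, the same idea can be wrapped into a small
context manager (on Python 3.11+ `contextlib.chdir` behaves the same way):

import contextlib
import os


@contextlib.contextmanager
def _in_dir(path):
    """Temporarily switch the working directory, restoring it on exit."""
    oldpwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(oldpwd)

With such a helper, the write step above would reduce to
`with _in_dir(out_dir): modelcif.dumper.write(...)`.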


def _get_model_details(cmplx_name: list, data_json: dict) -> str:
    """Get the model description."""
    ap_versions = []
    for mnmr in data_json["__meta__"]:
        if data_json["__meta__"][mnmr]["ap_version"] not in ap_versions:
            ap_versions.append(data_json["__meta__"][mnmr]["ap_version"])
    # ToDo: fetch AF2 version/ have it in metadata JSON
    return (
        f"Model generated for {' and '.join(cmplx_name)}, produced "
        + "using AlphaFold-Multimer (<AF2 VERSION>) as implemented by "
        + f"AlphaPulldown ({', '.join(ap_versions)})."
    )


def _get_feature_metadata(
    modelcif_json: dict, cmplx_name: str, prj_dir: str
) -> list:
    """Read metadata from a feature JSON file."""
    cmplx_name = cmplx_name.split("_and_")
    prj_dir = os.path.join(prj_dir, "features_monomers")
    if not os.path.isdir(prj_dir):
        logging.info(f"No feature directory '{prj_dir}' found.")
        sys.exit()
    if "__meta__" not in modelcif_json:
        modelcif_json["__meta__"] = {}
    for mnmr in cmplx_name:
        modelcif_json["__meta__"][mnmr] = {}
        feature_json = os.path.join(prj_dir, f"{mnmr}_feature_metadata.json")
        if not os.path.isfile(feature_json):
            logging.info(f"No feature metadata file '{feature_json}' found.")
            sys.exit()
        # ToDo: make sure that it's always ASCII
        with open(feature_json, "r", encoding="ascii") as jfh:
            jdata = json.load(jfh)
        modelcif_json["__meta__"][mnmr]["ap_version"] = jdata["version"]

    return cmplx_name
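
For reference, the function above expects one metadata JSON per monomer under
'<project dir>/features_monomers/'. A minimal sketch of such a file, written
from Python (monomer name and version string are made up; real AlphaPulldown
feature JSONs may carry more keys, but only 'version' is consumed here):

import json
import os

os.makedirs("features_monomers", exist_ok=True)
with open(
    os.path.join("features_monomers", "protA_feature_metadata.json"),
    "w",
    encoding="ascii",
) as jfh:
    json.dump({"version": "1.0.4"}, jfh)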


def _get_data_block_id_and_struct_and_entry_categories(
    cif_json: dict, cmplx_name: list
) -> None:
    """Get 'data_' block ID and data for categories '_struct' and '_entry'."""
    cif_json["data_"] = "_".join(cmplx_name)
    cif_json["_struct.title"] = f"Prediction for {' and '.join(cmplx_name)}"
    cif_json["_struct.pdbx_model_details"] = _get_model_details(
        cmplx_name, cif_json
    )
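
Taken together, the helpers above draft the ModelCIF 'header' this commit is
after. A worked example with made-up monomer names and version, assuming the
functions above are in scope:

cif_json = {
    "__meta__": {
        "protA": {"ap_version": "1.0.4"},
        "protB": {"ap_version": "1.0.4"},
    }
}
_get_data_block_id_and_struct_and_entry_categories(
    cif_json, ["protA", "protB"]
)
# cif_json["data_"]         -> "protA_and_protB"
# cif_json["_struct.title"] -> "Prediction for protA and protB"
# cif_json["_struct.pdbx_model_details"] -> the sentence assembled by
#     _get_model_details(), naming AlphaFold-Multimer and AlphaPulldown 1.0.4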


def alphapulldown_model_to_modelcif(
    cmplx_name: str,
    mdl_file: str,
    out_dir: str,
    prj_dir: str,
) -> None:
    """Convert an AlphaPulldown model into a ModelCIF formatted mmCIF file.

    Metadata for the ModelCIF categories will be fetched from AlphaPulldown
    output as far as possible. This expects modelling projects to exist in
    AlphaPulldown's output directory structure."""
    # ToDo: ENABLE logging.info(f"Processing '{mdl_file}'...")
    modelcif_json = {}
    # fetch metadata
    cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir)
    # fetch/ assemble more data about the modelling experiment
    _get_data_block_id_and_struct_and_entry_categories(
        modelcif_json, cmplx_name
    )
    _store_as_modelcif(modelcif_json, mdl_file, out_dir)
    # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")


def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
    """Get the list of models to be converted.

    If `model_selected` is None, all models will be marked for conversion."""
    # ToDo: Question - use 'ranked_*.pdb' or
    #       'unrelaxed_model_*_multimer_v3_pred_0.pdb' models?
    mdl_path = os.path.join(ap_dir, "models")
    cmplx = os.listdir(mdl_path)
    # For now, exactly 1 complex is expected in the 'models' subdirectory. If
    # there are more, the 'model_selected' mechanism needs to be further tuned
    # to get to the right model.
    assert len(cmplx) == 1
    cmplx = cmplx[0]
    mdl_path = os.path.join(mdl_path, cmplx)

    models = []
    if model_selected is not None:
        models.append(os.path.join(mdl_path, f"ranked_{model_selected}.pdb"))
    else:
        for mdl in os.listdir(mdl_path):
            if mdl.startswith("ranked_"):
                models.append(os.path.join(mdl_path, mdl))

    # check that files actually exist
    for mdl in models:
        if not os.path.isfile(mdl):
            logging.info(
                f"Model file '{mdl}' does not exist or is not a regular file."
            )
            sys.exit()

    return cmplx, mdl_path, models


def main(argv):

@@ -92,10 +196,14 @@ def main(argv):
    # pylint: enable=pointless-string-statement
    del argv  # Unused.

    # get list of selected models and assemble ModelCIF files + associated data
    complex_name, model_dir, model_list = _get_model_list(
        FLAGS.ap_output, FLAGS.model_selected
    )
    for mdl in model_list:
        alphapulldown_model_to_modelcif(
            complex_name, mdl, model_dir, FLAGS.ap_output
        )


if __name__ == "__main__":

@@ -106,5 +214,9 @@
# but we did that already in the past. Idea is to have all models
# available for... reproducibility and whatnot, but show the selected
# (representative) of the modelling experiment/ study more prominently.
# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details',
# 'data_', '_entry', maybe have a user-defined JSON document with things
# like that, including author names?
# ToDo: where to store which model was chosen? Should be in Tara's models.

# LocalWords: ToDo AlphaPulldown PAEs dir struct
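
For orientation, the script assumes AlphaPulldown's output directory
structure; the paths built by `_get_feature_metadata()` and
`_get_model_list()` boil down to this sketch (project path and
monomer/complex names are made up):

import os

ap_output = "/path/to/ap_output"  # passed via FLAGS.ap_output

# one metadata JSON per monomer, read by _get_feature_metadata()
feature_json = os.path.join(
    ap_output, "features_monomers", "protA_feature_metadata.json"
)

# exactly one complex directory is expected below 'models/' (see the assert
# in _get_model_list()); its 'ranked_*.pdb' files are the conversion input
model_dir = os.path.join(ap_output, "models", "protA_and_protB")
ranked_pdb = os.path.join(model_dir, "ranked_0.pdb")
# _store_as_modelcif() writes the matching 'ranked_0.cif' into the same
# directory, since main() hands the model directory in as 'out_dir'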
@@ -9,8 +9,16 @@ dependencies = [
[tool.black]
line-length = 80

[tool.pylint.MAIN]
load-plugins = "pylint.extensions.bad_builtin"

[tool.pylint.REPORTS]
reports = "no"

[tool.pylint.FORMAT]
max-line-length = 81

[tool.pylint.deprecated_builtins]
# We want to use proper logging, so we can control *ALL* output by the Abseil
# logger, hence: deprecate 'print'
bad-functions = ["map", "filter", "print"]
\ No newline at end of file
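
The added pylint configuration loads the bad-builtin extension and marks
'print' (besides 'map' and 'filter') as deprecated, so output has to go
through the Abseil logger used in the conversion script. A minimal sketch of
the preferred pattern (the function name is made up):

from absl import logging


def report(msg: str) -> None:
    # print(msg)  # would be flagged by the deprecated_builtins checker
    logging.info(msg)  # keeps all output under the Abseil logger's control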