Skip to content
Snippets Groups Projects
Commit 45ffa009 authored by Bienchen's avatar Bienchen
Browse files

Get a proper model name

parent d45d73c3
Branches
No related tags found
No related merge requests found
...@@ -29,6 +29,15 @@ import modelcif.model ...@@ -29,6 +29,15 @@ import modelcif.model
# ToDo: Get options properly, best get the same names as used in existing # ToDo: Get options properly, best get the same names as used in existing
# scripts, e.g. could '--monomer_objects_dir' be used as feature # scripts, e.g. could '--monomer_objects_dir' be used as feature
# directory/ directory with the feature JSON files? # directory/ directory with the feature JSON files?
# ToDo: Monomers work separately - features may come from different set of
# software, databases... so target sequences may be connected to different
# versions of the same sequence database, may use different versions of
# software... can this still go into single protocol steps or would this
# prevent identifying which MSAs were produced with which software? E.g.
# can there still be a single "sequence search" step? (This is
# definitively for later, not the first working version of the converter
# script)
# ToDo: sort non-ModelCIF items in the main JSON object into '__meta__'
flags.DEFINE_string( flags.DEFINE_string(
"ap_output", None, "AlphaPulldown pipeline output directory." "ap_output", None, "AlphaPulldown pipeline output directory."
) )
...@@ -245,19 +254,16 @@ def _store_as_modelcif( ...@@ -245,19 +254,16 @@ def _store_as_modelcif(
system, system,
) )
# ToDo: get modelling-experiment auhtors # ToDo: get modelling-experiment authors
# audit_authors # audit_authors
# system.authors.extend(data_json["audit_authors"]) # system.authors.extend(data_json["audit_authors"])
# set up the model to produce coordinates # set up the model to produce coordinates
# ToDo: model file names are different than expected, got ranked_<N>.pdb
# and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something
# like model_<N>_rank_<M>.pdb, used for 'model list name'
model = _Biopython2ModelCIF( model = _Biopython2ModelCIF(
assembly=modelcif.Assembly(asym_units.values()), assembly=modelcif.Assembly(asym_units.values()),
asym=asym_units, asym=asym_units,
bio_pdb_structure=structure, bio_pdb_structure=structure,
name="ToDo: Model <N> (ranked #<M>)", name=data_json["_ma_model_list.model_name"],
) )
# create software list from feature metadata # create software list from feature metadata
...@@ -327,6 +333,7 @@ def _compress_cif_file(cif_file): ...@@ -327,6 +333,7 @@ def _compress_cif_file(cif_file):
def _get_model_details(cmplx_name: str, data_json: dict) -> str: def _get_model_details(cmplx_name: str, data_json: dict) -> str:
"""Get the model description.""" """Get the model description."""
ap_versions = [] ap_versions = []
af2_version = None
for mnmr in data_json["__meta__"]: # mnmr = monomer for mnmr in data_json["__meta__"]: # mnmr = monomer
if ( if (
data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"] data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"]
...@@ -337,11 +344,27 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str: ...@@ -337,11 +344,27 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str:
"version" "version"
] ]
) )
# AlphaFold-Multimer builds the model we are looking at, can only be a
# single version.
if af2_version is None:
af2_version = data_json["__meta__"][mnmr]["software"]["alphafold"][
"version"
]
else:
if (
data_json["__meta__"][mnmr]["software"]["alphafold"]["version"]
!= af2_version
):
# pylint: disable=line-too-long
raise RuntimeError(
"Different versions of AlphaFold-Multimer found: "
+ f"'{data_json['__meta__'][mnmr]['software']['alphafold']['version']}'"
+ f" vs. '{af2_version}'"
)
# ToDo: fetch AF2 version/ have it in metadata JSON
return ( return (
f"Model generated for {' and '.join(cmplx_name)}, produced " f"Model generated for {' and '.join(cmplx_name)}, produced "
+ "using AlphaFold-Multimer (<AF2 VERSION>) as implemented by " + f"using AlphaFold-Multimer ({af2_version}) as implemented by "
+ f"AlphaPulldown ({', '.join(ap_versions)})." + f"AlphaPulldown ({', '.join(ap_versions)})."
) )
...@@ -377,8 +400,11 @@ def _get_feature_metadata( ...@@ -377,8 +400,11 @@ def _get_feature_metadata(
return cmplx_name return cmplx_name
def _get_data_block_id_and_struct_and_entry_categories( def _get_model_info(
cif_json: dict, cmplx_name: str cif_json: dict,
cmplx_name: str,
mdl_id: str,
mdl_rank: int,
) -> None: ) -> None:
"""Get 'data_' block ID and data for categories '_struct' and '_entry'.""" """Get 'data_' block ID and data for categories '_struct' and '_entry'."""
cif_json["data_"] = "_".join(cmplx_name) cif_json["data_"] = "_".join(cmplx_name)
...@@ -386,6 +412,9 @@ def _get_data_block_id_and_struct_and_entry_categories( ...@@ -386,6 +412,9 @@ def _get_data_block_id_and_struct_and_entry_categories(
cif_json["_struct.pdbx_model_details"] = _get_model_details( cif_json["_struct.pdbx_model_details"] = _get_model_details(
cmplx_name, cif_json cmplx_name, cif_json
) )
cif_json[
"_ma_model_list.model_name"
] = f"Model {mdl_id} (ranked #{mdl_rank})"
def _get_entities( def _get_entities(
...@@ -525,8 +554,7 @@ def _get_software_data(meta_json: dict) -> list: ...@@ -525,8 +554,7 @@ def _get_software_data(meta_json: dict) -> list:
def alphapulldown_model_to_modelcif( def alphapulldown_model_to_modelcif(
cmplx_name: str, cmplx_name: str,
mdl_file: str, mdl: tuple,
scr_file: str,
out_dir: str, out_dir: str,
prj_dir: str, prj_dir: str,
compress: bool = False, compress: bool = False,
...@@ -536,31 +564,31 @@ def alphapulldown_model_to_modelcif( ...@@ -536,31 +564,31 @@ def alphapulldown_model_to_modelcif(
Metadata for the ModelCIF categories will be fetched from AlphaPulldown Metadata for the ModelCIF categories will be fetched from AlphaPulldown
output as far as possible. This expects modelling projects to exists in output as far as possible. This expects modelling projects to exists in
AlphaPulldown's output directory structure.""" AlphaPulldown's output directory structure."""
# ToDo: ENABLE logging.info(f"Processing '{mdl_file}'...") # ToDo: ENABLE logging.info(f"Processing '{mdl[0]}'...")
modelcif_json = {} modelcif_json = {}
# fetch metadata # fetch metadata
cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir) cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir)
# fetch/ assemble more data about the modelling experiment # fetch/ assemble more data about the modelling experiment
_get_data_block_id_and_struct_and_entry_categories( _get_model_info(
modelcif_json, cmplx_name modelcif_json,
cmplx_name,
mdl[2],
mdl[3],
) )
# gather target entities (sequences that have been modeled) info # gather target entities (sequences that have been modeled) info
structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir) structure = _get_entities(modelcif_json, mdl[0], cmplx_name, prj_dir)
# read quality scores from pickle file # read quality scores from pickle file
_get_scores(modelcif_json, scr_file) _get_scores(modelcif_json, mdl[1])
_store_as_modelcif(modelcif_json, structure, mdl_file, out_dir, compress) _store_as_modelcif(modelcif_json, structure, mdl[0], out_dir, compress)
# ToDo: ENABLE logging.info(f"... done with '{mdl_file}'") # ToDo: ENABLE logging.info(f"... done with '{mdl[0]}'")
def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
"""Get the list of models to be converted. """Get the list of models to be converted.
If `model_selected` is none, all models will be marked for conversion.""" If `model_selected` is none, all models will be marked for conversion."""
# ToDo: Question - use 'ranked_*.pdb' or
# 'unrelaxed_model_*_multimer_v3_pred_0.pdb' models?
mdl_path = os.path.join(ap_dir, "models") mdl_path = os.path.join(ap_dir, "models")
cmplx = os.listdir(mdl_path) cmplx = os.listdir(mdl_path)
# For now, exactly 1 complex is expected in the 'models' subdirectory. If # For now, exactly 1 complex is expected in the 'models' subdirectory. If
...@@ -576,7 +604,7 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: ...@@ -576,7 +604,7 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
ranking_dbg = os.path.join(mdl_path, "ranking_debug.json") ranking_dbg = os.path.join(mdl_path, "ranking_debug.json")
if not os.path.isfile(ranking_dbg): if not os.path.isfile(ranking_dbg):
logging.info( logging.info(
f"Ranking file '{ranking_dbg} doe snot exist or is no regular " f"Ranking file '{ranking_dbg} does not exist or is no regular "
+ "file." + "file."
) )
sys.exit() sys.exit()
...@@ -584,13 +612,24 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: ...@@ -584,13 +612,24 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
ranking_dbg = json.load(jfh) ranking_dbg = json.load(jfh)
score_files = {} score_files = {}
for i, fle in enumerate(ranking_dbg["order"]): for i, fle in enumerate(ranking_dbg["order"]):
score_files[i] = os.path.join(mdl_path, f"result_{fle}.pkl") if not fle.startswith("model_"):
raise RuntimeError(
"Filename does not start with 'model_', can "
+ f"not determine model ID: '{fle}'"
)
score_files[i] = (
os.path.join(mdl_path, f"result_{fle}.pkl"),
fle.split("_")[1],
i,
)
# match PDB files with pickle files # match PDB files with pickle files
if model_selected is not None: if model_selected is not None:
models.append( models.append(
( (
os.path.join(mdl_path, f"ranked_{model_selected}.pdb"), os.path.join(mdl_path, f"ranked_{model_selected}.pdb"),
score_files[model_selected], score_files[model_selected][0],
score_files[model_selected][1], # model ID
score_files[model_selected][2], # model rank
) )
) )
else: else:
...@@ -598,10 +637,15 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: ...@@ -598,10 +637,15 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
rank = re.match(r"ranked_(\d+)\.pdb", mdl) rank = re.match(r"ranked_(\d+)\.pdb", mdl)
if rank is not None: if rank is not None:
rank = int(rank.group(1)) rank = int(rank.group(1))
models.append( models.append(
(
os.path.join(mdl_path, mdl),
score_files[rank][0], score_files[rank][1],
)
)
# check that files actually exist # check that files actually exist
for mdl, scrs in models: for mdl, scrs, *_ in models:
if not os.path.isfile(mdl): if not os.path.isfile(mdl):
logging.info( logging.info(
f"Model file '{mdl}' does not exist or is not a regular file." f"Model file '{mdl}' does not exist or is not a regular file."
...@@ -642,11 +686,10 @@ def main(argv): ...@@ -642,11 +686,10 @@ def main(argv):
complex_name, model_dir, model_list = _get_model_list( complex_name, model_dir, model_list = _get_model_list(
FLAGS.ap_output, FLAGS.model_selected FLAGS.ap_output, FLAGS.model_selected
) )
for mdl, scrs in model_list: for mdl in model_list:
alphapulldown_model_to_modelcif( alphapulldown_model_to_modelcif(
complex_name, complex_name,
mdl, mdl,
scrs,
model_dir, model_dir,
FLAGS.ap_output, FLAGS.ap_output,
FLAGS.compress, FLAGS.compress,
...@@ -668,4 +711,4 @@ if __name__ == "__main__": ...@@ -668,4 +711,4 @@ if __name__ == "__main__":
# ToDo: make sure all functions come with types # ToDo: make sure all functions come with types
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling # LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: multimer sif Jupyter aa # LocalWords: multimer sif Jupyter aa MSAs
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment