diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index fc30f1cd719ceb2f55ce0404867df9f025f1472e..a20bea145d186b642f5099f60c1a2c6f4695e57c 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -29,6 +29,15 @@ import modelcif.model # ToDo: Get options properly, best get the same names as used in existing # scripts, e.g. could '--monomer_objects_dir' be used as feature # directory/ directory with the feature JSON files? +# ToDo: Monomers work separately - features may come from different set of +# software, databases... so target sequences may be connected to different +# versions of the same sequence database, may use different versions of +# software... can this still go into single protocol steps or would this +# prevent identifying which MSAs were produced with which software? E.g. +# can there still be a single "sequence search" step? (This is +# definitively for later, not the first working version of the converter +# script) +# ToDo: sort non-ModelCIF items in the main JSON object into '__meta__' flags.DEFINE_string( "ap_output", None, "AlphaPulldown pipeline output directory." 
) @@ -245,19 +254,16 @@ def _store_as_modelcif( system, ) - # ToDo: get modelling-experiment auhtors + # ToDo: get modelling-experiment authors # audit_authors # system.authors.extend(data_json["audit_authors"]) # set up the model to produce coordinates - # ToDo: model file names are different than expected, got ranked_<N>.pdb - # and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something - # like model_<N>_rank_<M>.pdb, used for 'model list name' model = _Biopython2ModelCIF( assembly=modelcif.Assembly(asym_units.values()), asym=asym_units, bio_pdb_structure=structure, - name="ToDo: Model <N> (ranked #<M>)", + name=data_json["_ma_model_list.model_name"], ) # create software list from feature metadata @@ -327,6 +333,7 @@ def _compress_cif_file(cif_file): def _get_model_details(cmplx_name: str, data_json: dict) -> str: """Get the model description.""" ap_versions = [] + af2_version = None for mnmr in data_json["__meta__"]: # mnmr = monomer if ( data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"] @@ -337,11 +344,27 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str: "version" ] ) + # AlphaFold-Multimer builds the model we are looking at, can only be a + # single version. + if af2_version is None: + af2_version = data_json["__meta__"][mnmr]["software"]["alphafold"][ + "version" + ] + else: + if ( + data_json["__meta__"][mnmr]["software"]["alphafold"]["version"] + != af2_version + ): + # pylint: disable=line-too-long + raise RuntimeError( + "Different versions of AlphaFold-Multimer found: " + + f"'{data_json['__meta__'][mnmr]['software']['alphafold']['version']}'" + + f" vs. '{af2_version}'" + ) - # ToDo: fetch AF2 version/ have it in metadata JSON return ( f"Model generated for {' and '.join(cmplx_name)}, produced " - + "using AlphaFold-Multimer (<AF2 VERSION>) as implemented by " + + f"using AlphaFold-Multimer ({af2_version}) as implemented by " + f"AlphaPulldown ({', '.join(ap_versions)})." 
) @@ -377,8 +400,11 @@ def _get_feature_metadata( return cmplx_name -def _get_data_block_id_and_struct_and_entry_categories( - cif_json: dict, cmplx_name: str +def _get_model_info( + cif_json: dict, + cmplx_name: str, + mdl_id: str, + mdl_rank: int, ) -> None: """Get 'data_' block ID and data for categories '_struct' and '_entry'.""" cif_json["data_"] = "_".join(cmplx_name) @@ -386,6 +412,9 @@ def _get_data_block_id_and_struct_and_entry_categories( cif_json["_struct.pdbx_model_details"] = _get_model_details( cmplx_name, cif_json ) + cif_json[ + "_ma_model_list.model_name" + ] = f"Model {mdl_id} (ranked #{mdl_rank})" def _get_entities( @@ -525,8 +554,7 @@ def _get_software_data(meta_json: dict) -> list: def alphapulldown_model_to_modelcif( cmplx_name: str, - mdl_file: str, - scr_file: str, + mdl: tuple, out_dir: str, prj_dir: str, compress: bool = False, @@ -536,31 +564,31 @@ def alphapulldown_model_to_modelcif( Metadata for the ModelCIF categories will be fetched from AlphaPulldown output as far as possible. This expects modelling projects to exists in AlphaPulldown's output directory structure.""" - # ToDo: ENABLE logging.info(f"Processing '{mdl_file}'...") + # ToDo: ENABLE logging.info(f"Processing '{mdl[0]}'...") modelcif_json = {} # fetch metadata cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir) # fetch/ assemble more data about the modelling experiment - _get_data_block_id_and_struct_and_entry_categories( - modelcif_json, cmplx_name + _get_model_info( + modelcif_json, + cmplx_name, + mdl[2], + mdl[3], ) # gather target entities (sequences that have been modeled) info - structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir) + structure = _get_entities(modelcif_json, mdl[0], cmplx_name, prj_dir) # read quality scores from pickle file - _get_scores(modelcif_json, scr_file) + _get_scores(modelcif_json, mdl[1]) - _store_as_modelcif(modelcif_json, structure, mdl_file, out_dir, compress) - # ToDo: ENABLE logging.info(f"... 
done with '{mdl_file}'") + _store_as_modelcif(modelcif_json, structure, mdl[0], out_dir, compress) + # ToDo: ENABLE logging.info(f"... done with '{mdl[0]}'") def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: """Get the list of models to be converted. If `model_selected` is none, all models will be marked for conversion.""" - # ToDo: Question - use 'ranked_*.pdb' or - # 'unrelaxed_model_*_multimer_v3_pred_0.pdb' models? - mdl_path = os.path.join(ap_dir, "models") cmplx = os.listdir(mdl_path) # For now, exactly 1 complex is expected in the 'models' subdirectory. If @@ -576,7 +604,7 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: ranking_dbg = os.path.join(mdl_path, "ranking_debug.json") if not os.path.isfile(ranking_dbg): logging.info( - f"Ranking file '{ranking_dbg} doe snot exist or is no regular " + f"Ranking file '{ranking_dbg}' does not exist or is not a regular " + "file." ) sys.exit() @@ -584,13 +612,24 @@ ranking_dbg = json.load(jfh) score_files = {} for i, fle in enumerate(ranking_dbg["order"]): - score_files[i] = os.path.join(mdl_path, f"result_{fle}.pkl") + if not fle.startswith("model_"): + raise RuntimeError( + "Filename does not start with 'model_', can " + + f"not determine model ID: '{fle}'" + ) + score_files[i] = ( + os.path.join(mdl_path, f"result_{fle}.pkl"), + fle.split("_")[1], + i, + ) # match PDB files with pickle files if model_selected is not None: models.append( ( os.path.join(mdl_path, f"ranked_{model_selected}.pdb"), - score_files[model_selected], + score_files[model_selected][0], + score_files[model_selected][1], # model ID + score_files[model_selected][2], # model rank ) ) else: @@ -598,10 +637,15 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: rank = re.match(r"ranked_(\d+)\.pdb", mdl) if rank is not None: rank = int(rank.group(1)) - 
models.append((os.path.join(mdl_path, mdl), score_files[rank])) + models.append( + ( + os.path.join(mdl_path, mdl), + score_files[rank][0], + score_files[rank][1], # model ID + score_files[rank][2], # model rank + ) + ) # check that files actually exist - for mdl, scrs in models: + for mdl, scrs, *_ in models: if not os.path.isfile(mdl): logging.info( f"Model file '{mdl}' does not exist or is not a regular file." ) @@ -642,11 +686,10 @@ def main(argv): complex_name, model_dir, model_list = _get_model_list( FLAGS.ap_output, FLAGS.model_selected ) - for mdl, scrs in model_list: + for mdl in model_list: alphapulldown_model_to_modelcif( complex_name, mdl, - scrs, model_dir, FLAGS.ap_output, FLAGS.compress, @@ -668,4 +711,4 @@ if __name__ == "__main__": # ToDo: make sure all functions come with types # LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling -# LocalWords: multimer sif Jupyter aa +# LocalWords: multimer sif Jupyter aa MSAs