From 1fc029a09bf1b6608b7f7609f27358c3b8c1943e Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Wed, 10 Jan 2024 16:41:20 +0100 Subject: [PATCH] Add software parameters --- convert_to_modelcif.py | 152 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 138 insertions(+), 14 deletions(-) diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index bb4557e..974d594 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -326,21 +326,23 @@ def _get_modelcif_protocol( ) # loop over software group and assemble software group from that sw_grp = modelcif.SoftwareGroup() - for pss in js_step["software_group"]: # protocol step software - if sw_dict[pss] is not None: + for ( + pss, + psp, + ) in zip( # protocol step software & protocol step parameters + js_step["software_group"], js_step["parameter_group"] + ): + plst = [] + for arg, val in psp.items(): + plst.append(modelcif.SoftwareParameter(arg, val)) + if len(plst) > 0: + # add software with individual parameters + sw_grp.append( + modelcif.SoftwareWithParameters(sw_dict[pss], plst) + ) + else: # add software w/o individual parameters sw_grp.append(sw_dict[pss]) - # add software with individual parameters - # Commented out code does not need spelling check, so disable - # it in Pylint - # pylint: disable=wrong-spelling-in-comment - # sw_grp.append( - # modelcif.SoftwareWithParameters( - # sw_dict[pss], - # [modelcif.SoftwareParameter(<name>, <value>)], - # ) - # ) - # pylint: enable=wrong-spelling-in-comment # ToDo: make sure AlphaPulldown is first in the SoftwareGroup() list, # AlphaFold second; that influences citation order in the ModelCIF # file. @@ -622,6 +624,117 @@ def _file_exists_or_exit(path, msg): sys.exit() +def _cast_param(val): + """Cast a string input val to its actual data type.""" + try: + return int(val) + except ValueError: + pass + try: + return float(val) + except ValueError: + pass + if val == "True": + return True + if val == "False": + return False + + return val + + +def _get_software_with_parameters(sw_dict, other_dict): + """Get software with versions and parameters.""" + # ToDo: deal with `use_mmseqs=True` + # ToDo: should all args go to AlphaPulldown? + known_args = { + "db_preset": ["AlphaFold"], + "max_template_date": ["AlphaFold"], + "model_preset": ["AlphaFold"], + "num_multimer_predictions_per_model": ["AlphaFold"], + "plddt_threshold": ["AlphaPulldown"], + "hb_allowance": ["AlphaPulldown"], + "threshold_clashes": ["AlphaPulldown"], + "job_index": ["AlphaPulldown"], + "use_mmseqs2": ["AlphaPulldown"], + "skip_existing": ["AlphaPulldown"], + "save_msa_files": ["AlphaPulldown"], + "num_predictions_per_model": ["AlphaPulldown"], + "benchmark": ["AlphaFold", "AlphaPulldown"], + "use_precomputed_msas": ["AlphaFold", "AlphaPulldown"], + "models_to_relax": ["AlphaFold", "AlphaPulldown"], + } + trans_args = { + "num_multimer_predictions_per_model": "num_predictions_per_model" + } + ignored_args = [ + "?", + "alsologtostderr", + "bfd_database_path", + "data_dir", + "delta_threshold", + "description_file", + "hbm_oom_exit", + "hhblits_binary_path", + "hhsearch_binary_path", + "hmmbuild_binary_path", + "hmmsearch_binary_path", + "jackhmmer_binary_path", + "kalign_binary_path", + "log_dir", + "logger_levels", + "logtostderr", + "mgnify_database_path", + "obsolete_pdbs_path", + "only_check_args", + "op_conversion_fallback_to_while_loop", + "output_dir", + "path_to_fasta", + "path_to_mmt", + "pdb", + "pdb70_database_path", + "pdb_post_mortem", + "pdb_seqres_database_path", + "run_with_pdb", + "run_with_profiling", + "runtime_oom_exit", + "showprefixforinfo", + "small_bfd_database_path", + "stderrthreshold", + "template_mmcif_dir", + "tt_check_filter", + "tt_single_core_summaries", + "uniprot_database_path", + "uniref30_database_path", + "uniref90_database_path", + "use_small_bfd", + "v", + "verbosity", + "xml_output_file", + ] + re_args = re.compile( + r"(?:fasta_path|multimeric_chains|multimeric_templates|protein)_\d+" + ) + swwp = sw_dict # Software With Parameters + for key, val in other_dict.items(): + if key in known_args: + for tool in known_args[key]: + if "params" not in swwp[tool]: + swwp[tool]["params"] = {} + swwp[tool]["params"][f"--{key}"] = _cast_param(val) + if key in trans_args: + key = trans_args[key] + for tool in known_args[key]: + if "params" not in swwp[tool]: + swwp[tool]["params"] = {} + swwp[tool]["params"][f"--{key}"] = _cast_param(val) + else: + if key not in ignored_args and re.match(re_args, key) is None: + logging.info(f"Found unknown key in 'other': {key}") + sys.exit() + + return swwp + + def _get_feature_metadata( modelcif_json: dict, cmplx_name: str, @@ -642,8 +755,10 @@ def _get_feature_metadata( # ToDo: make sure that its always ASCII with open(feature_json, "r", encoding="ascii") as jfh: jdata = json.load(jfh) - modelcif_json["__meta__"][mnmr]["software"] = jdata["software"] modelcif_json["__meta__"][mnmr]["databases"] = jdata["databases"] + modelcif_json["__meta__"][mnmr][ + "software" + ] = _get_software_with_parameters(jdata["software"], jdata["other"]) return cmplx_name @@ -944,12 +1059,17 @@ def _get_protocol_steps(modelcif_json): "input_data_group": ["target_sequences", "reference_dbs"], "output_data_group": "monomer_pickle_files", "software_group": [], + "parameter_group": [], } for sftwr in modelcif_json["__meta__"].values(): sftwr = sftwr["software"] for tool in sftwr: if tool not in step["software_group"]: step["software_group"].append(tool) + if "params" in sftwr[tool]: + step["parameter_group"].append(sftwr[tool]["params"]) + else: + step["parameter_group"].append({}) protocol.append(step) # modelling step @@ -963,6 +1083,10 @@ def _get_protocol_steps(modelcif_json): "input_data_group": ["target_sequences", "STEPTYPE$coevolution MSA"], "output_data_group": "model", "software_group": ["AlphaPulldown", "AlphaFold"], + "parameter_group": [ + sftwr["AlphaPulldown"]["params"], + sftwr["AlphaFold"]["params"], + ], } protocol.append(step) -- GitLab