From f3e4fac0cbbb3eead83d95eedb249dfd45c094bc Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Wed, 4 Oct 2023 11:57:49 +0200 Subject: [PATCH] First steps towards a protocol --- convert_to_modelcif.py | 264 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 239 insertions(+), 25 deletions(-) diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index 3e7b940..97a3f0e 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -25,6 +25,7 @@ import modelcif import modelcif.associated import modelcif.dumper import modelcif.model +import modelcif.protocol # ToDo: Get options properly, best get the same names as used in existing # scripts, e.g. could '--monomer_objects_dir' be used as feature @@ -38,6 +39,7 @@ import modelcif.model # definitively for later, not the first working version of the converter # script) # ToDo: sort non-ModelCIF items in the main JSON object into '__meta__' +# ToDo: protocol step software parameters flags.DEFINE_string( "ap_output", None, "AlphaPulldown pipeline output directory." ) @@ -132,11 +134,11 @@ class _Biopython2ModelCIF(modelcif.model.AbInitioModel): def add_scores(self, scores_json, entry_id, file_prefix, sw_dct): """Add QA metrics""" - _GlobalPLDDT.software = sw_dct["alphafold"] - _GlobalPTM.software = sw_dct["alphafold"] - _GlobalIPTM.software = sw_dct["alphafold"] - _LocalPLDDT.software = sw_dct["alphafold"] - _LocalPairwisePAE.software = sw_dct["alphafold"] + _GlobalPLDDT.software = sw_dct["AlphaFold"] + _GlobalPTM.software = sw_dct["AlphaFold"] + _GlobalIPTM.software = sw_dct["AlphaFold"] + _LocalPLDDT.software = sw_dct["AlphaFold"] + _LocalPairwisePAE.software = sw_dct["AlphaFold"] # global scores self.qa_metrics.extend( ( @@ -230,6 +232,113 @@ def _get_modelcif_entities(target_ents, asym_units, system): system.target_entities.append(mdlcif_ent) +def _get_modelcif_protocol_input( + input_data_group, target_entities, ref_dbs, model +): + """Assemble input data for a ModelCIF protocol step.""" + if input_data_group == "target_sequences": + input_data = modelcif.data.DataGroup(target_entities) + input_data.extend(ref_dbs) + elif input_data_group == "model": + input_data = model + else: + raise RuntimeError(f"Unknown protocol input: '{input_data_group}'") + return input_data + + +def _get_modelcif_protocol( + protocol_steps, target_entities, model, sw_dict, ref_dbs +): + """Create the protocol for the ModelCIF file.""" + protocol = modelcif.protocol.Protocol() + for js_step in protocol_steps: + # assemble input data + input_data = _get_modelcif_protocol_input( + js_step["input_data_group"], target_entities, ref_dbs, model + ) + + # loop over software group and assemble software group from that + sw_grp = modelcif.SoftwareGroup() + for pss in js_step["software_group"]: # protocol step software + if sw_dict[pss] is not None: + # add software w/o individual parameters + sw_grp.append(sw_dict[pss]) + # add software with individual parameters + # sw_grp.append( + # modelcif.SoftwareWithParameters( + # sw_dict[pss], + # [modelcif.SoftwareParameter(<name>, <value>)], + # ) + # ) + # ToDo: make sure AlphaPulldown is first in the SoftwareGroup() list, + # AlphaFold second; that influences citation order in the ModelCIF + # file. + # mix everything together in a protocol step + protocol.steps.append( + modelcif.protocol.Step( + input_data=input_data, + output_data=None, # output_data, + name=js_step["step_name"], + details=js_step["details"], + software=sw_grp, + ) + ) + print("modelcif.protocol.Step(") + print(f" input_data={input_data},") + # output_data=output_data, + print(f" name={js_step['step_name']},") + print(f" details=\"{js_step['details']}\",") + print(f" software={sw_grp},") + print(")") + + return protocol + + +def _get_modelcif_ref_dbs(meta_json): + """Get sequence DBs used for monomer features.""" + # vendor formatting for DB names/ URLs, extend on KeyError + db_info = { + "uniref90": { + "name": "UniRef90", + "url": "https://ftp.uniprot.org/pub/databases/uniprot/uniref/" + + "uniref90/", + }, + "mgnify": {"name": "MGnify", "url": None}, + "bfd": {"name": "BFD", "url": None}, + "small_bfd": {"name": "Reduced BFD", "url": None}, + "uniref30": {"name": "UniRef30", "url": None}, + "uniprot": {"name": "UniProt", "url": None}, + "pdb70": {"name": "PDB70", "url": None}, + "pdb_seqres": {"name": "PDB seqres", "url": None}, + "colabfold": {"name": "ColabFold", "url": None}, + } + sdb_dct = {} # 'sequence database list', starts as dict + for data in meta_json.values(): + for db_name, vdct in data["databases"].items(): + db_name = db_name.lower() + # if DB already exists, check URL and version + if db_name in sdb_dct: + print("ALREADY THERE", sdb_dct[db_name]) + # ToDo: switch URL to the actual URL read from JSON + if ( + sdb_dct[db_name].version == vdct["version"] + or sdb_dct[db_name].url != db_info[db_name]["url"] + ): + raise RuntimeError( + "Database versions or URLs differ for " + + f"'{db_name}': '{sdb_dct[db_name].version}/ " + + f"{sdb_dct[db_name].url}' vs. '{vdct['version']}/ " + + f"{db_info[db_name]['url']}'" + ) + sdb_dct[db_name] = modelcif.ReferenceDatabase( + db_info[db_name]["name"], + db_info[db_name]["url"], + version=vdct["version"], + ) + + return sdb_dct.values() + + def _store_as_modelcif( data_json: dict, structure: BioStructure, @@ -292,7 +401,22 @@ def _store_as_modelcif( # - what about including the tabular summary? # - model selection: only if just a certain model is translated to # ModelCIF, or mix it with scoring step? - # system.protocols.append() + # + # - cancer-PPI-domains has 'coevolutin MSA' + # - then modelling + # - do we have an example with a split MSA & modelling step? + # - model selection like for Tara + system.protocols.append( + _get_modelcif_protocol( + data_json["ma_protocol_step"], + system.target_entities, + model, + sw_dct, + # ToDo: _storte_as_modelcif should not use __meta__, __meta__ is + # tool specific + _get_modelcif_ref_dbs(data_json["__meta__"]), + ) + ) # write modelcif.System to file # NOTE: this will dump PAE on path provided in add_scores @@ -334,23 +458,23 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str: af2_version = None for mnmr in data_json["__meta__"]: # mnmr = monomer if ( - data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"] + data_json["__meta__"][mnmr]["software"]["AlphaPulldown"]["version"] not in ap_versions ): ap_versions.append( - data_json["__meta__"][mnmr]["software"]["alphapulldown"][ + data_json["__meta__"][mnmr]["software"]["AlphaPulldown"][ "version" ] ) # AlphaFold-Multimer builds the model we are looking at, can only be a # single version. if af2_version is None: - af2_version = data_json["__meta__"][mnmr]["software"]["alphafold"][ + af2_version = data_json["__meta__"][mnmr]["software"]["AlphaFold"][ "version" ] else: if ( - data_json["__meta__"][mnmr]["software"]["alphafold"]["version"] + data_json["__meta__"][mnmr]["software"]["AlphaFold"]["version"] != af2_version ): # pylint: disable=line-too-long @@ -387,13 +511,8 @@ def _get_feature_metadata( # ToDo: make sure that its always ASCII with open(feature_json, "r", encoding="ascii") as jfh: jdata = json.load(jfh) - modelcif_json["__meta__"][mnmr]["software"] = jdata["binaries"] - modelcif_json["__meta__"][mnmr]["software"]["alphapulldown"] = { - "version": jdata["version"] - } - modelcif_json["__meta__"][mnmr]["software"]["alphafold"] = { - "version": jdata["AlphaFold version"] - } + modelcif_json["__meta__"][mnmr]["software"] = jdata["software"] + modelcif_json["__meta__"][mnmr]["databases"] = jdata["databases"] return cmplx_name @@ -475,16 +594,27 @@ def _get_scores(cif_json: dict, scr_file: str) -> None: def _get_software_data(meta_json: dict) -> list: """Turn meta data about software into modelcif.Software objects.""" + cite_hhsuite = ihm.Citation( + pmid="31521110", + title="HH-suite3 for fast remote homology detection and deep " + + "protein annotation.", + journal="BMC Bioinformatics", + volume=20, + page_range=None, + year=2019, + authors=[ + "Steinegger, M.", + "Meier, M.", + "Mirdita, M.", + "Voehringer, H.", + "Haunsberger, S.J.", + "Soeding, J.", + ], + doi="10.1186/s12859-019-3019-7", + ) # {key from json: dict needed to produce sw entry plus internal key} sw_data = { - "jackhmmer": None, - "hhblits": None, - "hhsearch": None, - "hmmsearch": None, - "hmmbuild": None, - "kalign": None, - "alphapulldown": None, - "alphafold": modelcif.Software( + "AlphaFold": modelcif.Software( "AlphaFold-Multimer", "model building", "Structure prediction", @@ -525,7 +655,56 @@ def _get_software_data(meta_json: dict) -> list: doi="10.1101/2021.10.04.463034", ), ), + "AlphaPulldown": modelcif.Software( + "AlphaPulldown", + "model building", + "Structure prediction", + "https://github.com/KosinskiLab/AlphaPulldown", + "package", + None, + ihm.Citation( + pmid="36413069", + title="AlphaPulldown-a python package for protein-protein " + + "interaction screens using AlphaFold-Multimer.", + journal="Bioinformatics", + volume=39, + page_range=None, + year=2023, + authors=[ + "Yu, D.", + "Chojnowski, G.", + "Rosenthal, M.", + "Kosinski, J.", + ], + doi="10.1093/bioinformatics/btac749", + ), + ), + "hhblits": modelcif.Software( + "HHblits", + "data collection", + "Iterative protein sequence searching by HMM-HMM alignment", + "https://github.com/soedinglab/hh-suite", + "program", + None, + cite_hhsuite, + ), + "hhsearch": modelcif.Software( + "HHsearch", + "data collection", + "Protein sequence searching by HMM-HMM comparison", + "https://github.com/soedinglab/hh-suite", + "program", + None, + cite_hhsuite, + ), + "hmmbuild": None, + "hmmsearch": None, + "jackhmmer": None, + "kalign": None, } + # ToDo: refactor to only those SW objects created/ added that are actually + # in the dictionary. That is, instead of a pre-build dictionary, + # instantiate on the fly, there is anyway a hard-coded-tool-name. for data in meta_json.values(): for sftwr, version in data["software"].items(): if sftwr not in sw_data: @@ -533,6 +712,7 @@ def _get_software_data(meta_json: dict) -> list: "Unknown software found in meta data: " + f"'{sftwr}'" ) version = version["version"] + # ToDo: software should not be None, remove in final version if sw_data[sftwr] is not None: if sw_data[sftwr].version is not None: if sw_data[sftwr].version != version: @@ -546,6 +726,38 @@ def _get_software_data(meta_json: dict) -> list: return sw_data +def _get_protocol_steps(modelcif_json): + """Create the list of protocol steps with software and parameters used.""" + protocol = [] + # MSA/ monomer feature generation step + # ToDo: get software from modelcif_json + step = { + "method_type": "coevolution MSA", + "step_name": "MSA generation", + "details": "Create sequence features for corresponding monomers.", + "input_data_group": "target_sequences", + "output_data_group": "monomer_pickle_files", + "software_group": [ + "AlphaPulldown", + "AlphaFold", + "jackhmmer", + "hhblits", + "hhsearch", + "hmmsearch", + "hmmbuild", + "kalign", + ] + # _ma_protocol_step.protocol_id + } + protocol.append(step) + + # modelling step + + # model selection step + + return protocol + + def alphapulldown_model_to_modelcif( cmplx_name: str, mdl: tuple, @@ -575,6 +787,8 @@ def alphapulldown_model_to_modelcif( # read quality scores from pickle file _get_scores(modelcif_json, mdl[1]) + modelcif_json["ma_protocol_step"] = _get_protocol_steps(modelcif_json) + _store_as_modelcif(modelcif_json, structure, mdl[0], out_dir, compress) # ToDo: ENABLE logging.info(f"... done with '{mdl[0]}'") -- GitLab