diff --git a/.gitignore b/.gitignore index c527082c72996ed22aff6b5a497c366799093e9c..3610828a887ca275f0672225a55e13a55742a8de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,12 @@ # Don't have Emac's backup files *~ +# ignore test scripts +biop-test.py +junk.py + # ignore some files used for testing 1ake.1.pdb 3lre.3.A.pdb 6xne.pdb +cmp.cif diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index 91a5176e512f6087bc734d5133be8817bd897e2a..fc30f1cd719ceb2f55ce0404867df9f025f1472e 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -20,6 +20,7 @@ from Bio.PDB.Structure import Structure as BioStructure from absl import app, flags, logging import numpy as np +import ihm.citations import modelcif import modelcif.associated import modelcif.dumper @@ -120,8 +121,13 @@ class _Biopython2ModelCIF(modelcif.model.AbInitioModel): occupancy=atm.occupancy, ) - def add_scores(self, scores_json, entry_id, file_prefix): + def add_scores(self, scores_json, entry_id, file_prefix, sw_dct): """Add QA metrics""" + _GlobalPLDDT.software = sw_dct["alphafold"] + _GlobalPTM.software = sw_dct["alphafold"] + _GlobalIPTM.software = sw_dct["alphafold"] + _LocalPLDDT.software = sw_dct["alphafold"] + _LocalPairwisePAE.software = sw_dct["alphafold"] # global scores self.qa_metrics.extend( ( @@ -254,9 +260,14 @@ def _store_as_modelcif( name="ToDo: Model <N> (ranked #<M>)", ) + # create software list from feature metadata + sw_dct = _get_software_data(data_json["__meta__"]) + # process scores mdl_file = os.path.splitext(os.path.basename(mdl_file))[0] - system.repositories.append(model.add_scores(data_json, system.id, mdl_file)) + system.repositories.append( + model.add_scores(data_json, system.id, mdl_file, sw_dct) + ) system.model_groups.append(modelcif.model.ModelGroup([model])) @@ -316,9 +327,16 @@ def _compress_cif_file(cif_file): def _get_model_details(cmplx_name: str, data_json: dict) -> str: """Get the model description.""" ap_versions = [] - for mnmr in 
data_json["__meta__"]: - if data_json["__meta__"][mnmr]["ap_version"] not in ap_versions: - ap_versions.append(data_json["__meta__"][mnmr]["ap_version"]) + for mnmr in data_json["__meta__"]: # mnmr = monomer + if ( + data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"] + not in ap_versions + ): + ap_versions.append( + data_json["__meta__"][mnmr]["software"]["alphapulldown"][ + "version" + ] + ) # ToDo: fetch AF2 version/ have it in metadata JSON return ( @@ -348,7 +366,13 @@ def _get_feature_metadata( # ToDo: make sure that its always ASCII with open(feature_json, "r", encoding="ascii") as jfh: jdata = json.load(jfh) - modelcif_json["__meta__"][mnmr]["ap_version"] = jdata["version"] + modelcif_json["__meta__"][mnmr]["software"] = jdata["binaries"] + modelcif_json["__meta__"][mnmr]["software"]["alphapulldown"] = { + "version": jdata["version"] + } + modelcif_json["__meta__"][mnmr]["software"]["alphafold"] = { + "version": jdata["AlphaFold version"] + } return cmplx_name @@ -418,7 +442,6 @@ def _get_scores(cif_json: dict, scr_file: str) -> None: """Add scores to JSON data.""" with open(scr_file, "rb") as sfh: scr_dict = pickle.load(sfh) - # dict_keys(['distogram', 'experimentally_resolved', 'masked_msa', 'num_recycles', 'structure_module', 'aligned_confidence_probs', 'max_predicted_aligned_error', 'ranking_confidence']) # Get pLDDT as a list, the global pLDDT is the average, calculated on the # spot. 
def _get_software_data(meta_json: dict) -> dict:
    """Turn meta data about software into modelcif.Software objects.

    Args:
        meta_json: Per-monomer metadata. Every value must carry a "software"
            dict of the form ``{software name: {"version": ...}}``.

    Returns:
        Dict keyed by software name. Currently only "alphafold" maps to a
        modelcif.Software object, whose version is filled in from the
        metadata; every other known tool maps to None.

    Raises:
        RuntimeError: If the metadata names a software that is not listed in
            this function, or if two metadata entries report different
            versions for a software that has a modelcif.Software object.
    """
    # {key from json: dict needed to produce sw entry plus internal key}
    sw_data = {
        "jackhmmer": None,
        "hhblits": None,
        "hhsearch": None,
        "hmmsearch": None,
        "hmmbuild": None,
        "kalign": None,
        "alphapulldown": None,
        "alphafold": modelcif.Software(
            "AlphaFold-Multimer",
            "model building",
            "Structure prediction",
            "https://github.com/deepmind/alphafold",
            "package",
            # Starts out unset; the loop below assigns `.version` from the
            # metadata.
            None,
            ihm.Citation(
                pmid=None,
                title="Protein complex prediction with AlphaFold-Multimer.",
                journal="bioRxiv",
                volume=None,
                page_range=None,
                year=2021,
                authors=[
                    "Evans, R.",
                    "O'Neill, M.",
                    "Pritzel, A.",
                    "Antropova, N.",
                    "Senior, A.",
                    "Green, T.",
                    "Zidek, A.",
                    "Bates, R.",
                    "Blackwell, S.",
                    "Yim, J.",
                    "Ronneberger, O.",
                    "Bodenstein, S.",
                    "Zielinski, M.",
                    "Bridgland, A.",
                    "Potapenko, A.",
                    "Cowie, A.",
                    "Tunyasuvunakool, K.",
                    "Jain, R.",
                    "Clancy, E.",
                    "Kohli, P.",
                    "Jumper, J.",
                    "Hassabis, D.",
                ],
                doi="10.1101/2021.10.04.463034",
            ),
        ),
    }
    for data in meta_json.values():
        for sftwr, sw_meta in data["software"].items():
            if sftwr not in sw_data:
                raise RuntimeError(
                    f"Unknown software found in meta data: '{sftwr}'"
                )
            # Read the version before the None check so that malformed
            # metadata fails the same way for every software entry.
            version = sw_meta["version"]
            software = sw_data[sftwr]
            if software is None:
                # No Software object for this tool (yet), nothing to update.
                continue
            # All monomers of one model must agree on the software version.
            if software.version is not None and software.version != version:
                raise RuntimeError(
                    f"Software versions differ for '{sftwr}': "
                    f"'{software.version}' vs. '{version}'"
                )
            software.version = version

    return sw_data