From f3e4fac0cbbb3eead83d95eedb249dfd45c094bc Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Wed, 4 Oct 2023 11:57:49 +0200
Subject: [PATCH] First steps towards a protocol

---
 convert_to_modelcif.py | 264 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 239 insertions(+), 25 deletions(-)

diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py
index 3e7b940..97a3f0e 100755
--- a/convert_to_modelcif.py
+++ b/convert_to_modelcif.py
@@ -25,6 +25,7 @@ import modelcif
 import modelcif.associated
 import modelcif.dumper
 import modelcif.model
+import modelcif.protocol
 
 # ToDo: Get options properly, best get the same names as used in existing
 #       scripts, e.g. could '--monomer_objects_dir' be used as feature
@@ -38,6 +39,7 @@ import modelcif.model
 #       definitively for later, not the first working version of the converter
 #       script)
 # ToDo: sort non-ModelCIF items in the main JSON object into '__meta__'
+# ToDo: protocol step software parameters
 flags.DEFINE_string(
     "ap_output", None, "AlphaPulldown pipeline output directory."
 )
@@ -132,11 +134,11 @@ class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
 
     def add_scores(self, scores_json, entry_id, file_prefix, sw_dct):
         """Add QA metrics"""
-        _GlobalPLDDT.software = sw_dct["alphafold"]
-        _GlobalPTM.software = sw_dct["alphafold"]
-        _GlobalIPTM.software = sw_dct["alphafold"]
-        _LocalPLDDT.software = sw_dct["alphafold"]
-        _LocalPairwisePAE.software = sw_dct["alphafold"]
+        _GlobalPLDDT.software = sw_dct["AlphaFold"]
+        _GlobalPTM.software = sw_dct["AlphaFold"]
+        _GlobalIPTM.software = sw_dct["AlphaFold"]
+        _LocalPLDDT.software = sw_dct["AlphaFold"]
+        _LocalPairwisePAE.software = sw_dct["AlphaFold"]
         # global scores
         self.qa_metrics.extend(
             (
@@ -230,6 +232,113 @@ def _get_modelcif_entities(target_ents, asym_units, system):
         system.target_entities.append(mdlcif_ent)
 
 
+def _get_modelcif_protocol_input(
+    input_data_group, target_entities, ref_dbs, model
+):
+    """Assemble input data for a ModelCIF protocol step."""
+    if input_data_group == "target_sequences":
+        input_data = modelcif.data.DataGroup(target_entities)
+        input_data.extend(ref_dbs)
+    elif input_data_group == "model":
+        input_data = model
+    else:
+        raise RuntimeError(f"Unknown protocol input: '{input_data_group}'")
+    return input_data
+
+
+def _get_modelcif_protocol(
+    protocol_steps, target_entities, model, sw_dict, ref_dbs
+):
+    """Create the protocol for the ModelCIF file."""
+    protocol = modelcif.protocol.Protocol()
+    for js_step in protocol_steps:
+        # assemble input data
+        input_data = _get_modelcif_protocol_input(
+            js_step["input_data_group"], target_entities, ref_dbs, model
+        )
+
+        # loop over software group and assemble software group from that
+        sw_grp = modelcif.SoftwareGroup()
+        for pss in js_step["software_group"]:  # protocol step software
+            if sw_dict[pss] is not None:
+                # add software w/o individual parameters
+                sw_grp.append(sw_dict[pss])
+                # add software with individual parameters
+                # sw_grp.append(
+                #     modelcif.SoftwareWithParameters(
+                #         sw_dict[pss],
+                #         [modelcif.SoftwareParameter(<name>, <value>)],
+                #     )
+                # )
+        # ToDo: make sure AlphaPulldown is first in the SoftwareGroup() list,
+        #       AlphaFold second; that influences citation order in the ModelCIF
+        #       file.
+        # mix everything together in a protocol step
+        protocol.steps.append(
+            modelcif.protocol.Step(
+                input_data=input_data,
+                output_data=None,  # output_data,
+                name=js_step["step_name"],
+                details=js_step["details"],
+                software=sw_grp,
+            )
+        )
+        print("modelcif.protocol.Step(")
+        print(f"            input_data={input_data},")
+        #        output_data=output_data,
+        print(f"            name={js_step['step_name']},")
+        print(f"            details=\"{js_step['details']}\",")
+        print(f"            software={sw_grp},")
+        print(")")
+
+    return protocol
+
+
+def _get_modelcif_ref_dbs(meta_json):
+    """Get sequence DBs used for monomer features."""
+    # vendor formatting for DB names/ URLs, extend on KeyError
+    db_info = {
+        "uniref90": {
+            "name": "UniRef90",
+            "url": "https://ftp.uniprot.org/pub/databases/uniprot/uniref/"
+            + "uniref90/",
+        },
+        "mgnify": {"name": "MGnify", "url": None},
+        "bfd": {"name": "BFD", "url": None},
+        "small_bfd": {"name": "Reduced BFD", "url": None},
+        "uniref30": {"name": "UniRef30", "url": None},
+        "uniprot": {"name": "UniProt", "url": None},
+        "pdb70": {"name": "PDB70", "url": None},
+        "pdb_seqres": {"name": "PDB seqres", "url": None},
+        "colabfold": {"name": "ColabFold", "url": None},
+    }
+    sdb_dct = {}  # 'sequence database list', starts as dict
+    for data in meta_json.values():
+        for db_name, vdct in data["databases"].items():
+            db_name = db_name.lower()
+            # if DB already exists, check URL and version
+            if db_name in sdb_dct:
+                print("ALREADY THERE", sdb_dct[db_name])
+                # ToDo: switch URL to the actual URL read from JSON
+                if (
+                    sdb_dct[db_name].version == vdct["version"]
+                    or sdb_dct[db_name].url != db_info[db_name]["url"]
+                ):
+                    raise RuntimeError(
+                        "Database versions or URLs differ for "
+                        + f"'{db_name}': '{sdb_dct[db_name].version}/ "
+                        + f"{sdb_dct[db_name].url}' vs. '{vdct['version']}/ "
+                        + f"{db_info[db_name]['url']}'"
+                    )
+            sdb_dct[db_name] = modelcif.ReferenceDatabase(
+                db_info[db_name]["name"],
+                db_info[db_name]["url"],
+                version=vdct["version"],
+            )
+
+    return sdb_dct.values()
+
+
 def _store_as_modelcif(
     data_json: dict,
     structure: BioStructure,
@@ -292,7 +401,22 @@ def _store_as_modelcif(
     #      - what about including the tabular summary?
     #      - model selection: only if just a certain model is translated to
     #        ModelCIF, or mix it with scoring step?
-    # system.protocols.append()
+    #
+    #      - cancer-PPI-domains has 'coevolutin MSA'
+    #      - then modelling
+    #      - do we have an example with a split MSA & modelling step?
+    #      - model selection like for Tara
+    system.protocols.append(
+        _get_modelcif_protocol(
+            data_json["ma_protocol_step"],
+            system.target_entities,
+            model,
+            sw_dct,
+            # ToDo: _storte_as_modelcif should not use __meta__, __meta__ is
+            #       tool specific
+            _get_modelcif_ref_dbs(data_json["__meta__"]),
+        )
+    )
 
     # write modelcif.System to file
     # NOTE: this will dump PAE on path provided in add_scores
@@ -334,23 +458,23 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str:
     af2_version = None
     for mnmr in data_json["__meta__"]:  # mnmr = monomer
         if (
-            data_json["__meta__"][mnmr]["software"]["alphapulldown"]["version"]
+            data_json["__meta__"][mnmr]["software"]["AlphaPulldown"]["version"]
             not in ap_versions
         ):
             ap_versions.append(
-                data_json["__meta__"][mnmr]["software"]["alphapulldown"][
+                data_json["__meta__"][mnmr]["software"]["AlphaPulldown"][
                     "version"
                 ]
             )
         # AlphaFold-Multimer builds the model we are looking at, can only be a
         # single version.
         if af2_version is None:
-            af2_version = data_json["__meta__"][mnmr]["software"]["alphafold"][
+            af2_version = data_json["__meta__"][mnmr]["software"]["AlphaFold"][
                 "version"
             ]
         else:
             if (
-                data_json["__meta__"][mnmr]["software"]["alphafold"]["version"]
+                data_json["__meta__"][mnmr]["software"]["AlphaFold"]["version"]
                 != af2_version
             ):
                 # pylint: disable=line-too-long
@@ -387,13 +511,8 @@ def _get_feature_metadata(
         # ToDo: make sure that its always ASCII
         with open(feature_json, "r", encoding="ascii") as jfh:
             jdata = json.load(jfh)
-        modelcif_json["__meta__"][mnmr]["software"] = jdata["binaries"]
-        modelcif_json["__meta__"][mnmr]["software"]["alphapulldown"] = {
-            "version": jdata["version"]
-        }
-        modelcif_json["__meta__"][mnmr]["software"]["alphafold"] = {
-            "version": jdata["AlphaFold version"]
-        }
+        modelcif_json["__meta__"][mnmr]["software"] = jdata["software"]
+        modelcif_json["__meta__"][mnmr]["databases"] = jdata["databases"]
 
     return cmplx_name
 
@@ -475,16 +594,27 @@ def _get_scores(cif_json: dict, scr_file: str) -> None:
 
 def _get_software_data(meta_json: dict) -> list:
     """Turn meta data about software into modelcif.Software objects."""
+    cite_hhsuite = ihm.Citation(
+        pmid="31521110",
+        title="HH-suite3 for fast remote homology detection and deep "
+        + "protein annotation.",
+        journal="BMC Bioinformatics",
+        volume=20,
+        page_range=None,
+        year=2019,
+        authors=[
+            "Steinegger, M.",
+            "Meier, M.",
+            "Mirdita, M.",
+            "Voehringer, H.",
+            "Haunsberger, S.J.",
+            "Soeding, J.",
+        ],
+        doi="10.1186/s12859-019-3019-7",
+    )
     # {key from json: dict needed to produce sw entry plus internal key}
     sw_data = {
-        "jackhmmer": None,
-        "hhblits": None,
-        "hhsearch": None,
-        "hmmsearch": None,
-        "hmmbuild": None,
-        "kalign": None,
-        "alphapulldown": None,
-        "alphafold": modelcif.Software(
+        "AlphaFold": modelcif.Software(
             "AlphaFold-Multimer",
             "model building",
             "Structure prediction",
@@ -525,7 +655,56 @@ def _get_software_data(meta_json: dict) -> list:
                 doi="10.1101/2021.10.04.463034",
             ),
         ),
+        "AlphaPulldown": modelcif.Software(
+            "AlphaPulldown",
+            "model building",
+            "Structure prediction",
+            "https://github.com/KosinskiLab/AlphaPulldown",
+            "package",
+            None,
+            ihm.Citation(
+                pmid="36413069",
+                title="AlphaPulldown-a python package for protein-protein "
+                + "interaction screens using AlphaFold-Multimer.",
+                journal="Bioinformatics",
+                volume=39,
+                page_range=None,
+                year=2023,
+                authors=[
+                    "Yu, D.",
+                    "Chojnowski, G.",
+                    "Rosenthal, M.",
+                    "Kosinski, J.",
+                ],
+                doi="10.1093/bioinformatics/btac749",
+            ),
+        ),
+        "hhblits": modelcif.Software(
+            "HHblits",
+            "data collection",
+            "Iterative protein sequence searching by HMM-HMM alignment",
+            "https://github.com/soedinglab/hh-suite",
+            "program",
+            None,
+            cite_hhsuite,
+        ),
+        "hhsearch": modelcif.Software(
+            "HHsearch",
+            "data collection",
+            "Protein sequence searching by HMM-HMM comparison",
+            "https://github.com/soedinglab/hh-suite",
+            "program",
+            None,
+            cite_hhsuite,
+        ),
+        "hmmbuild": None,
+        "hmmsearch": None,
+        "jackhmmer": None,
+        "kalign": None,
     }
+    # ToDo: refactor to only those SW objects created/ added that are actually
+    #       in the dictionary. That is, instead of a pre-build dictionary,
+    #       instantiate on the fly, there is anyway a hard-coded-tool-name.
     for data in meta_json.values():
         for sftwr, version in data["software"].items():
             if sftwr not in sw_data:
@@ -533,6 +712,7 @@ def _get_software_data(meta_json: dict) -> list:
                     "Unknown software found in meta data: " + f"'{sftwr}'"
                 )
             version = version["version"]
+            # ToDo: software should not be None, remove in final version
             if sw_data[sftwr] is not None:
                 if sw_data[sftwr].version is not None:
                     if sw_data[sftwr].version != version:
@@ -546,6 +726,38 @@ def _get_software_data(meta_json: dict) -> list:
     return sw_data
 
 
+def _get_protocol_steps(modelcif_json):
+    """Create the list of protocol steps with software and parameters used."""
+    protocol = []
+    # MSA/ monomer feature generation step
+    # ToDo: get software from modelcif_json
+    step = {
+        "method_type": "coevolution MSA",
+        "step_name": "MSA generation",
+        "details": "Create sequence features for corresponding monomers.",
+        "input_data_group": "target_sequences",
+        "output_data_group": "monomer_pickle_files",
+        "software_group": [
+            "AlphaPulldown",
+            "AlphaFold",
+            "jackhmmer",
+            "hhblits",
+            "hhsearch",
+            "hmmsearch",
+            "hmmbuild",
+            "kalign",
+        ]
+        # _ma_protocol_step.protocol_id
+    }
+    protocol.append(step)
+
+    # modelling step
+
+    # model selection step
+
+    return protocol
+
+
 def alphapulldown_model_to_modelcif(
     cmplx_name: str,
     mdl: tuple,
@@ -575,6 +787,8 @@ def alphapulldown_model_to_modelcif(
     # read quality scores from pickle file
     _get_scores(modelcif_json, mdl[1])
 
+    modelcif_json["ma_protocol_step"] = _get_protocol_steps(modelcif_json)
+
     _store_as_modelcif(modelcif_json, structure, mdl[0], out_dir, compress)
     # ToDo: ENABLE logging.info(f"... done with '{mdl[0]}'")
 
-- 
GitLab