From 150a9cbf9728e27f4c18ecb0960cc63ee14ef7dc Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Mon, 28 Aug 2023 13:38:37 +0200
Subject: [PATCH] Notes on protocol steps

---
 convert_to_modelcif.py | 147 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 145 insertions(+), 2 deletions(-)

diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py
index bd2a21f..7518283 100755
--- a/convert_to_modelcif.py
+++ b/convert_to_modelcif.py
@@ -4,14 +4,19 @@
 file with a lot of metadata in place."""
 
 from typing import Tuple
+import hashlib
 import json
 import os
 import sys
 
+from Bio import SeqIO
+from Bio.PDB import PDBParser, PPBuilder
+from Bio.PDB.Structure import Structure as BioStructure
 from absl import app, flags, logging
 
 import modelcif
 import modelcif.dumper
+import modelcif.model
 
 # ToDo: Get options properly, best get the same names as used in existing
 #       scripts, e.g. could '--monomer_objects_dir' be used as feature
@@ -34,8 +39,49 @@ FLAGS = flags.FLAGS
 #       exist as expected.
 
 
+class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
+    """Map Biopython PDB.Structure object to ihm.model"""
+
+    def __init__(self, *args, **kwargs):
+        """Initialise a model"""
+        self.structure = kwargs.pop("bio_pdb_structure")
+        self.asym = kwargs.pop("asym")
+
+        super().__init__(*args, **kwargs)
+
+    def get_atoms(self):
+        for atm in self.structure.get_atoms():
+            yield modelcif.model.Atom(
+                asym_unit=self.asym[atm.parent.parent.id],
+                seq_id=atm.parent.id[1],
+                atom_id=atm.name,
+                type_symbol=atm.element,
+                x=atm.coord[0],
+                y=atm.coord[1],
+                z=atm.coord[2],
+                het=atm.parent.id[0] != " ",
+                biso=atm.bfactor,
+                occupancy=atm.occupancy,
+            )
+
+
+def _get_modelcif_entities(target_ents, asym_units, system):
+    """Create ModelCIF entities and asymmetric units."""
+    for cif_ent in target_ents:
+        mdlcif_ent = modelcif.Entity(
+            # 'pdb_sequence' can be used here, since AF2 always has the
+            # complete sequence.
+            cif_ent["pdb_sequence"],
+            description=cif_ent["description"],
+        )
+        for pdb_chain_id in cif_ent["pdb_chain_id"]:
+            asym_units[pdb_chain_id] = modelcif.AsymUnit(mdlcif_ent)
+        system.target_entities.append(mdlcif_ent)
+
+
 def _store_as_modelcif(
     data_json: dict,
+    structure: BioStructure,
     mdl_file: str,
     out_dir: str,
     # ost_ent, file_prfx, compress, add_files
@@ -47,6 +93,52 @@ def _store_as_modelcif(
         model_details=data_json["_struct.pdbx_model_details"],
     )
 
+    # create target entities, references, source, asymmetric units & assembly
+    # create an asymmetric unit and an entity per target sequence
+    asym_units = {}
+    _get_modelcif_entities(
+        data_json["target_entities"],
+        asym_units,
+        system,
+    )
+
+    # ToDo: get modelling-experiment auhtors
+    # audit_authors
+    # system.authors.extend(data_json["audit_authors"])
+
+    # set up the model to produce coordinates
+    # ToDo: model file names are different than expected, got ranked_<N>.pdb
+    #       and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something
+    #       like model_<N>_rank_<M>.pdb, used for 'model list name'
+    model = _Biopython2ModelCIF(
+        assembly=modelcif.Assembly(asym_units.values()),
+        asym=asym_units,
+        bio_pdb_structure=structure,
+        name="ToDo: Model <N> (ranked #<M>)",
+    )
+
+    # ToDO: check data_json["model_group_name"] at Tara's
+    system.model_groups.append(modelcif.model.ModelGroup([model]))
+
+    # ToDo: get protocol steps together
+    #       - 'coevolution MSA' (create_individual_features.py) step
+    #       - 'template search' is usually omitted for AF2 as that term more
+    #         relates to homology modelling, AF2 uses templates in a different
+    #         way and a template search usually results in a list of templates
+    #         which would need to be included, here, in theory
+    #      - for MSA/ template search, how to represent the outcome? Adding the
+    #        pickle files quickly exceeds reasonable storage use
+    #      - 'modeling' (run_multimer_jobs.py), are the four modes reflected by
+    #        the JSON data/ does the JSON data look different for each mode?
+    #      - are the scores only calculated by alpha-analysis.sif or do they
+    #        come out of run_multimer_jobs.py? Does this go into its own step?
+    #      - could Jupyter notebooks be included as associated file or do they
+    #        require the pickle files?
+    #      - what about including the tabular summary?
+    #      - model selection: only if just a certain model is translated to
+    #        ModelCIF, or mix it with scoring step?
+    # system.protocols.append()
+
     # write modelcif.System to file
     # NOTE: this will dump PAE on path provided in add_scores
     # -> hence we cheat by changing path and back while being exception-safe...
@@ -117,6 +209,53 @@ def _get_data_block_id_and_struct_and_entry_categories(
     )
 
 
+def _get_entities(
+    cif_json: dict, pdb_file: str, cmplx_name: list, prj_dir: str
+) -> BioStructure:
+    """Gather data for the mmCIF (target) entities."""
+    # read sequence from FastA, map to sequences from PDB file
+    sequences = {}
+    # Fetch FastA sequences to assign monomers to the molecular entities
+    prj_dir = os.path.join(prj_dir, "fastas")
+    if not os.path.isdir(prj_dir):
+        logging.info(f"FastA directory '{prj_dir}' does not exist.")
+        sys.exit()
+    for mnmr in cmplx_name:
+        fasta = os.path.join(prj_dir, f"{mnmr}.fa")
+        if not os.path.isfile(fasta):
+            logging.info(f"FastA file '{fasta}' does not exist.")
+            sys.exit()
+        with open(fasta, "r", encoding="ascii") as ffh:
+            seq = SeqIO.read(ffh, "fasta")
+            # Using MD5 sums for comparing sequences only works since AF2
+            # *ALWAYS* has the complete sequence in a chain.
+            sequences[hashlib.md5(seq.seq.encode()).hexdigest()] = mnmr
+    # gather molecular entities from PDB file
+    structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file)
+    cif_json["target_entities"] = []
+    already_seen = []
+    for chn, seq in zip(
+        structure.get_chains(), PPBuilder().build_peptides(structure)
+    ):
+        seq = seq.get_sequence()
+        seq_md5 = hashlib.md5(seq.encode()).hexdigest()
+        cif_ent = {}
+        try:
+            e_idx = already_seen.index(seq_md5)
+        except ValueError:
+            pass
+        else:
+            cif_json["target_entities"][e_idx]["pdb_chain_id"].append(chn.id)
+            continue
+        cif_ent["pdb_sequence"] = seq
+        cif_ent["pdb_chain_id"] = [chn.id]
+        cif_ent["description"] = sequences[seq_md5]
+        cif_json["target_entities"].append(cif_ent)
+        already_seen.append(sequences[seq_md5])
+
+    return structure
+
+
 def alphapulldown_model_to_modelcif(
     cmplx_name: str,
     mdl_file: str,
@@ -136,7 +275,10 @@ def alphapulldown_model_to_modelcif(
     _get_data_block_id_and_struct_and_entry_categories(
         modelcif_json, cmplx_name
     )
-    _store_as_modelcif(modelcif_json, mdl_file, out_dir)
+    # gather target entities (sequences that have been modeled) info
+    structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir)
+
+    _store_as_modelcif(modelcif_json, structure, mdl_file, out_dir)
     # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")
 
 
@@ -219,4 +361,5 @@ if __name__ == "__main__":
 #       like that, including author names?
 # ToDo: where to store which model was chosen? Should be in Tara's models.
 
-#  LocalWords:  ToDo AlphaPulldown PAEs dir struct
+#  LocalWords:  ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
+#  LocalWords:  multimer sif Jupyter
-- 
GitLab