Added global and local pLDDT

490cdfe0 · Bienchen · 09b5e132 · 490cdfe0
Commit 490cdfe0 authored 1 year ago by Bienchen
--- a/convert_to_modelcif.py
+++ b/convert_to_modelcif.py
@@ -7,12 +7,15 @@ from typing import Tuple
 import hashlib
 import json
 import os
+import pickle
+import re
 import sys

 from Bio import SeqIO
 from Bio.PDB import PDBParser, PPBuilder
 from Bio.PDB.Structure import Structure as BioStructure
 from absl import app, flags, logging
+import numpy as np

 import modelcif
 import modelcif.dumper
@@ -39,6 +42,41 @@ FLAGS = flags.FLAGS
 #       exist as expected.


+# pylint: disable=too-few-public-methods
+class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
+    name = "pLDDT"
+    software = None
+
+
+class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
+    """Predicted accuracy according to the TM-score score in [0,1]"""
+
+    name = "pTM"
+    software = None
+
+
+class _GlobalIPTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
+    # python-modelcif reads the first line of class-doc has description of a
+    # score, so we need to allow long lines, here.
+    # pylint: disable=line-too-long
+    """Predicted protein-protein interface score, based on the TM-score score in [0,1]."""
+
+    name = "ipTM"
+    software = None
+
+
+class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
+    name = "pLDDT"
+    software = None
+
+
+# pylint: enable=too-few-public-methods
+
+
 class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
    """Map Biopython PDB.Structure object to ihm.model"""

@@ -64,6 +102,44 @@ class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
                occupancy=atm.occupancy,
            )

+    def add_scores(self, scores_json, entry_id):
+        """Add QA metrics"""
+        # global scores
+        self.qa_metrics.extend(
+            (
+                _GlobalPLDDT(np.mean(scores_json["plddt"])),
+                _GlobalPTM(scores_json["ptm"]),
+                _GlobalIPTM(scores_json["iptm"]),
+            )
+        )
+
+        # local scores
+        # iterate polypetide chains
+        # local PLDDT
+        i = 0
+        # aa_only=False includes non-canonical amino acids but seems to skip
+        # non-peptide-linking residues like ions
+        for chn_i in PPBuilder().build_peptides(self.structure, aa_only=False):
+            for res_i in chn_i:
+                # local pLDDT
+                # Assertion assumes that pLDDT values are also stored in the
+                # B-factor column.
+                assert (
+                    round(scores_json["plddt"][i], 2)
+                    == next(res_i.get_atoms()).bfactor
+                )
+                self.qa_metrics.append(
+                    _LocalPLDDT(
+                        self.asym[res_i.parent.id].residue(res_i.id[1]),
+                        round(scores_json["plddt"][i], 2),
+                    )
+                )
+
+                # pairwise alignment error
+                j = 0
+
+                i += 1
+

 def _get_modelcif_entities(target_ents, asym_units, system):
    """Create ModelCIF entities and asymmetric units."""
@@ -84,7 +160,7 @@ def _store_as_modelcif(
    structure: BioStructure,
    mdl_file: str,
    out_dir: str,
-    # ost_ent, file_prfx, compress, add_files
+    # file_prfx, compress, add_files
 ) -> None:
    """Create the actual ModelCIF file."""
    system = modelcif.System(
@@ -118,10 +194,8 @@ def _store_as_modelcif(
    )

    # process scores
-    # ToDo: Get scores, but where are they stored? For other AF2 related
-    #       projects, there are JSON or Pickle files with all AF2 scores
    # system.repositories.append(
-    #     model.add_scores(data_json, system.id, mdl_name, add_files)
+    model.add_scores(data_json, system.id)
    # )

    system.model_groups.append(modelcif.model.ModelGroup([model]))
@@ -240,6 +314,9 @@ def _get_entities(
    structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file)
    cif_json["target_entities"] = []
    already_seen = []
+    # ToDo: skip using chains to make it safe for models with ligands. Chains
+    #       are only used for the chain ID. That can be obtained from the first
+    #       residue in the sequence: seq[0].parent.id
    for chn, seq in zip(
        structure.get_chains(), PPBuilder().build_peptides(structure)
    ):
@@ -262,9 +339,22 @@ def _get_entities(
    return structure


+def _get_scores(cif_json: dict, scr_file: str) -> None:
+    """Add scores to JSON data."""
+    with open(scr_file, "rb") as sfh:
+        scr_dict = pickle.load(sfh)
+    # dict_keys(['distogram', 'experimentally_resolved', 'masked_msa', 'num_recycles', 'predicted_aligned_error', 'structure_module', 'plddt', 'aligned_confidence_probs', 'max_predicted_aligned_error', 'ranking_confidence'])
+    # Get pLDDT as a list, the global pLDDT is the average, calculated on the
+    # spot.
+    cif_json["plddt"] = scr_dict["plddt"]
+    cif_json["ptm"] = float(scr_dict["ptm"])
+    cif_json["iptm"] = float(scr_dict["iptm"])
+
+
 def alphapulldown_model_to_modelcif(
    cmplx_name: str,
    mdl_file: str,
+    scr_file: str,
    out_dir: str,
    prj_dir: str,
 ) -> None:
@@ -284,6 +374,9 @@ def alphapulldown_model_to_modelcif(
    # gather target entities (sequences that have been modeled) info
    structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir)

+    # read quality scores from pickle file
+    _get_scores(modelcif_json, scr_file)
+
    _store_as_modelcif(modelcif_json, structure, mdl_file, out_dir)
    # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")

@@ -304,20 +397,48 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
    cmplx = cmplx[0]
    mdl_path = os.path.join(mdl_path, cmplx)
    models = []
+    # We are going for models with name "rank_?.pdb" as it does not depend on
+    # relaxation. But this means we have to match the pickle files via
+    # ranking_debug.json.
+    ranking_dbg = os.path.join(mdl_path, "ranking_debug.json")
+    if not os.path.isfile(ranking_dbg):
+        logging.info(
+            f"Ranking file '{ranking_dbg} doe snot exist or is no regular "
+            + "file."
+        )
+        sys.exit()
+    with open(ranking_dbg, "r", encoding="utf8") as jfh:
+        ranking_dbg = json.load(jfh)
+    score_files = {}
+    for i, fle in enumerate(ranking_dbg["order"]):
+        score_files[i] = os.path.join(mdl_path, f"result_{fle}.pkl")
+    # match PDB files with pickle files
    if model_selected is not None:
-        models.append(os.path.join(mdl_path, f"ranked_{model_selected}.pdb"))
+        models.append(
+            (
+                os.path.join(mdl_path, f"ranked_{model_selected}.pdb"),
+                score_files[model_selected],
+            )
+        )
    else:
        for mdl in os.listdir(mdl_path):
-            if mdl.startswith("ranked_"):
-                models.append(os.path.join(mdl_path, mdl))
+            rank = re.match(r"ranked_(\d+)\.pdb", mdl)
+            if rank is not None:
+                rank = int(rank.group(1))
+                models.append((os.path.join(mdl_path, mdl), score_files[rank]))

    # check that files actually exist
-    for mdl in models:
+    for mdl, scrs in models:
        if not os.path.isfile(mdl):
            logging.info(
                f"Model file '{mdl}' does not exist or is not a regular file."
            )
            sys.exit()
+        if not os.path.isfile(scrs):
+            logging.info(
+                f"Scores file '{scrs}' does not exist or is not a regular file."
+            )
+            sys.exit()

    return cmplx, mdl_path, models

@@ -348,9 +469,9 @@ def main(argv):
    complex_name, model_dir, model_list = _get_model_list(
        FLAGS.ap_output, FLAGS.model_selected
    )
-    for mdl in model_list:
+    for mdl, scrs in model_list:
        alphapulldown_model_to_modelcif(
-            complex_name, mdl, model_dir, FLAGS.ap_output
+            complex_name, mdl, scrs, model_dir, FLAGS.ap_output
        )


@@ -369,4 +490,4 @@ if __name__ == "__main__":
 # ToDo: make sure all functions come with types

 #  LocalWords:  ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
-#  LocalWords:  multimer sif Jupyter
+#  LocalWords:  multimer sif Jupyter aa