Skip to content
Snippets Groups Projects
Commit 490cdfe0 authored by Bienchen's avatar Bienchen
Browse files

Added global and local pLDDT

parent 09b5e132
Branches
No related tags found
No related merge requests found
......@@ -7,12 +7,15 @@ from typing import Tuple
import hashlib
import json
import os
import pickle
import re
import sys
from Bio import SeqIO
from Bio.PDB import PDBParser, PPBuilder
from Bio.PDB.Structure import Structure as BioStructure
from absl import app, flags, logging
import numpy as np
import modelcif
import modelcif.dumper
......@@ -39,6 +42,41 @@ FLAGS = flags.FLAGS
# exist as expected.
# pylint: disable=too-few-public-methods
class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
"""Predicted accuracy according to the CA-only lDDT in [0,100]"""
name = "pLDDT"
software = None
class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
"""Predicted accuracy according to the TM-score score in [0,1]"""
name = "pTM"
software = None
class _GlobalIPTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
# python-modelcif reads the first line of class-doc has description of a
# score, so we need to allow long lines, here.
# pylint: disable=line-too-long
"""Predicted protein-protein interface score, based on the TM-score score in [0,1]."""
name = "ipTM"
software = None
class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
"""Predicted accuracy according to the CA-only lDDT in [0,100]"""
name = "pLDDT"
software = None
# pylint: enable=too-few-public-methods
class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
"""Map Biopython PDB.Structure object to ihm.model"""
......@@ -64,6 +102,44 @@ class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
occupancy=atm.occupancy,
)
def add_scores(self, scores_json, entry_id):
"""Add QA metrics"""
# global scores
self.qa_metrics.extend(
(
_GlobalPLDDT(np.mean(scores_json["plddt"])),
_GlobalPTM(scores_json["ptm"]),
_GlobalIPTM(scores_json["iptm"]),
)
)
# local scores
# iterate polypetide chains
# local PLDDT
i = 0
# aa_only=False includes non-canonical amino acids but seems to skip
# non-peptide-linking residues like ions
for chn_i in PPBuilder().build_peptides(self.structure, aa_only=False):
for res_i in chn_i:
# local pLDDT
# Assertion assumes that pLDDT values are also stored in the
# B-factor column.
assert (
round(scores_json["plddt"][i], 2)
== next(res_i.get_atoms()).bfactor
)
self.qa_metrics.append(
_LocalPLDDT(
self.asym[res_i.parent.id].residue(res_i.id[1]),
round(scores_json["plddt"][i], 2),
)
)
# pairwise alignment error
j = 0
i += 1
def _get_modelcif_entities(target_ents, asym_units, system):
"""Create ModelCIF entities and asymmetric units."""
......@@ -84,7 +160,7 @@ def _store_as_modelcif(
structure: BioStructure,
mdl_file: str,
out_dir: str,
# ost_ent, file_prfx, compress, add_files
# file_prfx, compress, add_files
) -> None:
"""Create the actual ModelCIF file."""
system = modelcif.System(
......@@ -118,10 +194,8 @@ def _store_as_modelcif(
)
# process scores
# ToDo: Get scores, but where are they stored? For other AF2 related
# projects, there are JSON or Pickle files with all AF2 scores
# system.repositories.append(
# model.add_scores(data_json, system.id, mdl_name, add_files)
model.add_scores(data_json, system.id)
# )
system.model_groups.append(modelcif.model.ModelGroup([model]))
......@@ -240,6 +314,9 @@ def _get_entities(
structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file)
cif_json["target_entities"] = []
already_seen = []
# ToDo: skip using chains to make it safe for models with ligands. Chains
# are only used for the chain ID. That can be obtained from the first
# residue in the sequence: seq[0].parent.id
for chn, seq in zip(
structure.get_chains(), PPBuilder().build_peptides(structure)
):
......@@ -262,9 +339,22 @@ def _get_entities(
return structure
def _get_scores(cif_json: dict, scr_file: str) -> None:
"""Add scores to JSON data."""
with open(scr_file, "rb") as sfh:
scr_dict = pickle.load(sfh)
# dict_keys(['distogram', 'experimentally_resolved', 'masked_msa', 'num_recycles', 'predicted_aligned_error', 'structure_module', 'plddt', 'aligned_confidence_probs', 'max_predicted_aligned_error', 'ranking_confidence'])
# Get pLDDT as a list, the global pLDDT is the average, calculated on the
# spot.
cif_json["plddt"] = scr_dict["plddt"]
cif_json["ptm"] = float(scr_dict["ptm"])
cif_json["iptm"] = float(scr_dict["iptm"])
def alphapulldown_model_to_modelcif(
cmplx_name: str,
mdl_file: str,
scr_file: str,
out_dir: str,
prj_dir: str,
) -> None:
......@@ -284,6 +374,9 @@ def alphapulldown_model_to_modelcif(
# gather target entities (sequences that have been modeled) info
structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir)
# read quality scores from pickle file
_get_scores(modelcif_json, scr_file)
_store_as_modelcif(modelcif_json, structure, mdl_file, out_dir)
# ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")
......@@ -304,20 +397,48 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
cmplx = cmplx[0]
mdl_path = os.path.join(mdl_path, cmplx)
models = []
# We are going for models with name "rank_?.pdb" as it does not depend on
# relaxation. But this means we have to match the pickle files via
# ranking_debug.json.
ranking_dbg = os.path.join(mdl_path, "ranking_debug.json")
if not os.path.isfile(ranking_dbg):
logging.info(
f"Ranking file '{ranking_dbg} doe snot exist or is no regular "
+ "file."
)
sys.exit()
with open(ranking_dbg, "r", encoding="utf8") as jfh:
ranking_dbg = json.load(jfh)
score_files = {}
for i, fle in enumerate(ranking_dbg["order"]):
score_files[i] = os.path.join(mdl_path, f"result_{fle}.pkl")
# match PDB files with pickle files
if model_selected is not None:
models.append(os.path.join(mdl_path, f"ranked_{model_selected}.pdb"))
models.append(
(
os.path.join(mdl_path, f"ranked_{model_selected}.pdb"),
score_files[model_selected],
)
)
else:
for mdl in os.listdir(mdl_path):
if mdl.startswith("ranked_"):
models.append(os.path.join(mdl_path, mdl))
rank = re.match(r"ranked_(\d+)\.pdb", mdl)
if rank is not None:
rank = int(rank.group(1))
models.append((os.path.join(mdl_path, mdl), score_files[rank]))
# check that files actually exist
for mdl in models:
for mdl, scrs in models:
if not os.path.isfile(mdl):
logging.info(
f"Model file '{mdl}' does not exist or is not a regular file."
)
sys.exit()
if not os.path.isfile(scrs):
logging.info(
f"Scores file '{scrs}' does not exist or is not a regular file."
)
sys.exit()
return cmplx, mdl_path, models
......@@ -348,9 +469,9 @@ def main(argv):
complex_name, model_dir, model_list = _get_model_list(
FLAGS.ap_output, FLAGS.model_selected
)
for mdl in model_list:
for mdl, scrs in model_list:
alphapulldown_model_to_modelcif(
complex_name, mdl, model_dir, FLAGS.ap_output
complex_name, mdl, scrs, model_dir, FLAGS.ap_output
)
......@@ -369,4 +490,4 @@ if __name__ == "__main__":
# ToDo: make sure all functions come with types
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: multimer sif Jupyter
# LocalWords: multimer sif Jupyter aa
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment