diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index bf73e59f4c040cc839c7f7070a35e8c84de6dfe2..18820683004f04d21f1cfbbbb548dfa667a92c37 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -7,12 +7,15 @@ from typing import Tuple import hashlib import json import os +import pickle +import re import sys from Bio import SeqIO from Bio.PDB import PDBParser, PPBuilder from Bio.PDB.Structure import Structure as BioStructure from absl import app, flags, logging +import numpy as np import modelcif import modelcif.dumper @@ -39,6 +42,41 @@ FLAGS = flags.FLAGS # exist as expected. +# pylint: disable=too-few-public-methods +class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + + name = "pLDDT" + software = None + + +class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM): + """Predicted accuracy according to the TM-score score in [0,1]""" + + name = "pTM" + software = None + + +class _GlobalIPTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM): + # python-modelcif reads the first line of class-doc has description of a + # score, so we need to allow long lines, here. + # pylint: disable=line-too-long + """Predicted protein-protein interface score, based on the TM-score score in [0,1].""" + + name = "ipTM" + software = None + + +class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + + name = "pLDDT" + software = None + + +# pylint: enable=too-few-public-methods + + class _Biopython2ModelCIF(modelcif.model.AbInitioModel): """Map Biopython PDB.Structure object to ihm.model""" @@ -64,6 +102,44 @@ class _Biopython2ModelCIF(modelcif.model.AbInitioModel): occupancy=atm.occupancy, ) + def add_scores(self, scores_json, entry_id): + """Add QA metrics""" + # global scores + self.qa_metrics.extend( + ( + _GlobalPLDDT(np.mean(scores_json["plddt"])), + _GlobalPTM(scores_json["ptm"]), + _GlobalIPTM(scores_json["iptm"]), + ) + ) + + # local scores + # iterate polypetide chains + # local PLDDT + i = 0 + # aa_only=False includes non-canonical amino acids but seems to skip + # non-peptide-linking residues like ions + for chn_i in PPBuilder().build_peptides(self.structure, aa_only=False): + for res_i in chn_i: + # local pLDDT + # Assertion assumes that pLDDT values are also stored in the + # B-factor column. + assert ( + round(scores_json["plddt"][i], 2) + == next(res_i.get_atoms()).bfactor + ) + self.qa_metrics.append( + _LocalPLDDT( + self.asym[res_i.parent.id].residue(res_i.id[1]), + round(scores_json["plddt"][i], 2), + ) + ) + + # pairwise alignment error + j = 0 + + i += 1 + def _get_modelcif_entities(target_ents, asym_units, system): """Create ModelCIF entities and asymmetric units.""" @@ -84,7 +160,7 @@ def _store_as_modelcif( structure: BioStructure, mdl_file: str, out_dir: str, - # ost_ent, file_prfx, compress, add_files + # file_prfx, compress, add_files ) -> None: """Create the actual ModelCIF file.""" system = modelcif.System( @@ -118,10 +194,8 @@ def _store_as_modelcif( ) # process scores - # ToDo: Get scores, but where are they stored? For other AF2 related - # projects, there are JSON or Pickle files with all AF2 scores # system.repositories.append( - # model.add_scores(data_json, system.id, mdl_name, add_files) + model.add_scores(data_json, system.id) # ) system.model_groups.append(modelcif.model.ModelGroup([model])) @@ -240,6 +314,9 @@ def _get_entities( structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file) cif_json["target_entities"] = [] already_seen = [] + # ToDo: skip using chains to make it safe for models with ligands. Chains + # are only used for the chain ID. That can be obtained from the first + # residue in the sequence: seq[0].parent.id for chn, seq in zip( structure.get_chains(), PPBuilder().build_peptides(structure) ): @@ -262,9 +339,22 @@ def _get_entities( return structure +def _get_scores(cif_json: dict, scr_file: str) -> None: + """Add scores to JSON data.""" + with open(scr_file, "rb") as sfh: + scr_dict = pickle.load(sfh) + # dict_keys(['distogram', 'experimentally_resolved', 'masked_msa', 'num_recycles', 'predicted_aligned_error', 'structure_module', 'plddt', 'aligned_confidence_probs', 'max_predicted_aligned_error', 'ranking_confidence']) + # Get pLDDT as a list, the global pLDDT is the average, calculated on the + # spot. + cif_json["plddt"] = scr_dict["plddt"] + cif_json["ptm"] = float(scr_dict["ptm"]) + cif_json["iptm"] = float(scr_dict["iptm"]) + + def alphapulldown_model_to_modelcif( cmplx_name: str, mdl_file: str, + scr_file: str, out_dir: str, prj_dir: str, ) -> None: @@ -284,6 +374,9 @@ def alphapulldown_model_to_modelcif( # gather target entities (sequences that have been modeled) info structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir) + # read quality scores from pickle file + _get_scores(modelcif_json, scr_file) + _store_as_modelcif(modelcif_json, structure, mdl_file, out_dir) # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'") @@ -304,20 +397,48 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]: cmplx = cmplx[0] mdl_path = os.path.join(mdl_path, cmplx) models = [] + # We are going for models with name "rank_?.pdb" as it does not depend on + # relaxation. But this means we have to match the pickle files via + # ranking_debug.json. + ranking_dbg = os.path.join(mdl_path, "ranking_debug.json") + if not os.path.isfile(ranking_dbg): + logging.info( + f"Ranking file '{ranking_dbg} doe snot exist or is no regular " + + "file." + ) + sys.exit() + with open(ranking_dbg, "r", encoding="utf8") as jfh: + ranking_dbg = json.load(jfh) + score_files = {} + for i, fle in enumerate(ranking_dbg["order"]): + score_files[i] = os.path.join(mdl_path, f"result_{fle}.pkl") + # match PDB files with pickle files if model_selected is not None: - models.append(os.path.join(mdl_path, f"ranked_{model_selected}.pdb")) + models.append( + ( + os.path.join(mdl_path, f"ranked_{model_selected}.pdb"), + score_files[model_selected], + ) + ) else: for mdl in os.listdir(mdl_path): - if mdl.startswith("ranked_"): - models.append(os.path.join(mdl_path, mdl)) + rank = re.match(r"ranked_(\d+)\.pdb", mdl) + if rank is not None: + rank = int(rank.group(1)) + models.append((os.path.join(mdl_path, mdl), score_files[rank])) # check that files actually exist - for mdl in models: + for mdl, scrs in models: if not os.path.isfile(mdl): logging.info( f"Model file '{mdl}' does not exist or is not a regular file." ) sys.exit() + if not os.path.isfile(scrs): + logging.info( + f"Scores file '{scrs}' does not exist or is not a regular file." + ) + sys.exit() return cmplx, mdl_path, models @@ -348,9 +469,9 @@ def main(argv): complex_name, model_dir, model_list = _get_model_list( FLAGS.ap_output, FLAGS.model_selected ) - for mdl in model_list: + for mdl, scrs in model_list: alphapulldown_model_to_modelcif( - complex_name, mdl, model_dir, FLAGS.ap_output + complex_name, mdl, scrs, model_dir, FLAGS.ap_output ) @@ -369,4 +490,4 @@ if __name__ == "__main__": # ToDo: make sure all functions come with types # LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling -# LocalWords: multimer sif Jupyter +# LocalWords: multimer sif Jupyter aa