From 150a9cbf9728e27f4c18ecb0960cc63ee14ef7dc Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Mon, 28 Aug 2023 13:38:37 +0200 Subject: [PATCH] Notes on protocol steps --- convert_to_modelcif.py | 147 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 2 deletions(-) diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index bd2a21f..7518283 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -4,14 +4,19 @@ file with a lot of metadata in place.""" from typing import Tuple +import hashlib import json import os import sys +from Bio import SeqIO +from Bio.PDB import PDBParser, PPBuilder +from Bio.PDB.Structure import Structure as BioStructure from absl import app, flags, logging import modelcif import modelcif.dumper +import modelcif.model # ToDo: Get options properly, best get the same names as used in existing # scripts, e.g. could '--monomer_objects_dir' be used as feature @@ -34,8 +39,49 @@ FLAGS = flags.FLAGS # exist as expected. +class _Biopython2ModelCIF(modelcif.model.AbInitioModel): + """Map Biopython PDB.Structure object to ihm.model""" + + def __init__(self, *args, **kwargs): + """Initialise a model""" + self.structure = kwargs.pop("bio_pdb_structure") + self.asym = kwargs.pop("asym") + + super().__init__(*args, **kwargs) + + def get_atoms(self): + for atm in self.structure.get_atoms(): + yield modelcif.model.Atom( + asym_unit=self.asym[atm.parent.parent.id], + seq_id=atm.parent.id[1], + atom_id=atm.name, + type_symbol=atm.element, + x=atm.coord[0], + y=atm.coord[1], + z=atm.coord[2], + het=atm.parent.id[0] != " ", + biso=atm.bfactor, + occupancy=atm.occupancy, + ) + + +def _get_modelcif_entities(target_ents, asym_units, system): + """Create ModelCIF entities and asymmetric units.""" + for cif_ent in target_ents: + mdlcif_ent = modelcif.Entity( + # 'pdb_sequence' can be used here, since AF2 always has the + # complete sequence. + cif_ent["pdb_sequence"], + description=cif_ent["description"], + ) + for pdb_chain_id in cif_ent["pdb_chain_id"]: + asym_units[pdb_chain_id] = modelcif.AsymUnit(mdlcif_ent) + system.target_entities.append(mdlcif_ent) + + def _store_as_modelcif( data_json: dict, + structure: BioStructure, mdl_file: str, out_dir: str, # ost_ent, file_prfx, compress, add_files @@ -47,6 +93,52 @@ def _store_as_modelcif( model_details=data_json["_struct.pdbx_model_details"], ) + # create target entities, references, source, asymmetric units & assembly + # create an asymmetric unit and an entity per target sequence + asym_units = {} + _get_modelcif_entities( + data_json["target_entities"], + asym_units, + system, + ) + + # ToDo: get modelling-experiment auhtors + # audit_authors + # system.authors.extend(data_json["audit_authors"]) + + # set up the model to produce coordinates + # ToDo: model file names are different than expected, got ranked_<N>.pdb + # and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something + # like model_<N>_rank_<M>.pdb, used for 'model list name' + model = _Biopython2ModelCIF( + assembly=modelcif.Assembly(asym_units.values()), + asym=asym_units, + bio_pdb_structure=structure, + name="ToDo: Model <N> (ranked #<M>)", + ) + + # ToDO: check data_json["model_group_name"] at Tara's + system.model_groups.append(modelcif.model.ModelGroup([model])) + + # ToDo: get protocol steps together + # - 'coevolution MSA' (create_individual_features.py) step + # - 'template search' is usually omitted for AF2 as that term more + # relates to homology modelling, AF2 uses templates in a different + # way and a template search usually results in a list of templates + # which would need to be included, here, in theory + # - for MSA/ template search, how to represent the outcome? Adding the + # pickle files quickly exceeds reasonable storage use + # - 'modeling' (run_multimer_jobs.py), are the four modes reflected by + # the JSON data/ does the JSON data look different for each mode? + # - are the scores only calculated by alpha-analysis.sif or do they + # come out of run_multimer_jobs.py? Does this go into its own step? + # - could Jupyter notebooks be included as associated file or do they + # require the pickle files? + # - what about including the tabular summary? + # - model selection: only if just a certain model is translated to + # ModelCIF, or mix it with scoring step? + # system.protocols.append() + # write modelcif.System to file # NOTE: this will dump PAE on path provided in add_scores # -> hence we cheat by changing path and back while being exception-safe... @@ -117,6 +209,53 @@ def _get_data_block_id_and_struct_and_entry_categories( ) +def _get_entities( + cif_json: dict, pdb_file: str, cmplx_name: list, prj_dir: str +) -> BioStructure: + """Gather data for the mmCIF (target) entities.""" + # read sequence from FastA, map to sequences from PDB file + sequences = {} + # Fetch FastA sequences to assign monomers to the molecular entities + prj_dir = os.path.join(prj_dir, "fastas") + if not os.path.isdir(prj_dir): + logging.info(f"FastA directory '{prj_dir}' does not exist.") + sys.exit() + for mnmr in cmplx_name: + fasta = os.path.join(prj_dir, f"{mnmr}.fa") + if not os.path.isfile(fasta): + logging.info(f"FastA file '{fasta}' does not exist.") + sys.exit() + with open(fasta, "r", encoding="ascii") as ffh: + seq = SeqIO.read(ffh, "fasta") + # Using MD5 sums for comparing sequences only works since AF2 + # *ALWAYS* has the complete sequence in a chain. + sequences[hashlib.md5(seq.seq.encode()).hexdigest()] = mnmr + # gather molecular entities from PDB file + structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file) + cif_json["target_entities"] = [] + already_seen = [] + for chn, seq in zip( + structure.get_chains(), PPBuilder().build_peptides(structure) + ): + seq = seq.get_sequence() + seq_md5 = hashlib.md5(seq.encode()).hexdigest() + cif_ent = {} + try: + e_idx = already_seen.index(seq_md5) + except ValueError: + pass + else: + cif_json["target_entities"][e_idx]["pdb_chain_id"].append(chn.id) + continue + cif_ent["pdb_sequence"] = seq + cif_ent["pdb_chain_id"] = [chn.id] + cif_ent["description"] = sequences[seq_md5] + cif_json["target_entities"].append(cif_ent) + already_seen.append(sequences[seq_md5]) + + return structure + + def alphapulldown_model_to_modelcif( cmplx_name: str, mdl_file: str, @@ -136,7 +275,10 @@ def alphapulldown_model_to_modelcif( _get_data_block_id_and_struct_and_entry_categories( modelcif_json, cmplx_name ) - _store_as_modelcif(modelcif_json, mdl_file, out_dir) + # gather target entities (sequences that have been modeled) info + structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir) + + _store_as_modelcif(modelcif_json, structure, mdl_file, out_dir) # ToDo: ENABLE logging.info(f"... done with '{mdl_file}'") @@ -219,4 +361,5 @@ if __name__ == "__main__": # like that, including author names? # ToDo: where to store which model was chosen? Should be in Tara's models. -# LocalWords: ToDo AlphaPulldown PAEs dir struct +# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling +# LocalWords: multimer sif Jupyter -- GitLab