Skip to content
Snippets Groups Projects
Commit 150a9cbf authored by Bienchen's avatar Bienchen
Browse files

Notes on protocol steps

parent 5f2676e3
Branches
No related tags found
No related merge requests found
......@@ -4,14 +4,19 @@
file with a lot of metadata in place."""
from typing import Tuple
import hashlib
import json
import os
import sys
from Bio import SeqIO
from Bio.PDB import PDBParser, PPBuilder
from Bio.PDB.Structure import Structure as BioStructure
from absl import app, flags, logging
import modelcif
import modelcif.dumper
import modelcif.model
# ToDo: Get options properly, best get the same names as used in existing
# scripts, e.g. could '--monomer_objects_dir' be used as feature
......@@ -34,8 +39,49 @@ FLAGS = flags.FLAGS
# exist as expected.
class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
"""Map Biopython PDB.Structure object to ihm.model"""
def __init__(self, *args, **kwargs):
"""Initialise a model"""
self.structure = kwargs.pop("bio_pdb_structure")
self.asym = kwargs.pop("asym")
super().__init__(*args, **kwargs)
def get_atoms(self):
for atm in self.structure.get_atoms():
yield modelcif.model.Atom(
asym_unit=self.asym[atm.parent.parent.id],
seq_id=atm.parent.id[1],
atom_id=atm.name,
type_symbol=atm.element,
x=atm.coord[0],
y=atm.coord[1],
z=atm.coord[2],
het=atm.parent.id[0] != " ",
biso=atm.bfactor,
occupancy=atm.occupancy,
)
def _get_modelcif_entities(target_ents, asym_units, system):
"""Create ModelCIF entities and asymmetric units."""
for cif_ent in target_ents:
mdlcif_ent = modelcif.Entity(
# 'pdb_sequence' can be used here, since AF2 always has the
# complete sequence.
cif_ent["pdb_sequence"],
description=cif_ent["description"],
)
for pdb_chain_id in cif_ent["pdb_chain_id"]:
asym_units[pdb_chain_id] = modelcif.AsymUnit(mdlcif_ent)
system.target_entities.append(mdlcif_ent)
def _store_as_modelcif(
data_json: dict,
structure: BioStructure,
mdl_file: str,
out_dir: str,
# ost_ent, file_prfx, compress, add_files
......@@ -47,6 +93,52 @@ def _store_as_modelcif(
model_details=data_json["_struct.pdbx_model_details"],
)
# create target entities, references, source, asymmetric units & assembly
# create an asymmetric unit and an entity per target sequence
asym_units = {}
_get_modelcif_entities(
data_json["target_entities"],
asym_units,
system,
)
# ToDo: get modelling-experiment auhtors
# audit_authors
# system.authors.extend(data_json["audit_authors"])
# set up the model to produce coordinates
# ToDo: model file names are different than expected, got ranked_<N>.pdb
# and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something
# like model_<N>_rank_<M>.pdb, used for 'model list name'
model = _Biopython2ModelCIF(
assembly=modelcif.Assembly(asym_units.values()),
asym=asym_units,
bio_pdb_structure=structure,
name="ToDo: Model <N> (ranked #<M>)",
)
# ToDO: check data_json["model_group_name"] at Tara's
system.model_groups.append(modelcif.model.ModelGroup([model]))
# ToDo: get protocol steps together
# - 'coevolution MSA' (create_individual_features.py) step
# - 'template search' is usually omitted for AF2 as that term more
# relates to homology modelling, AF2 uses templates in a different
# way and a template search usually results in a list of templates
# which would need to be included, here, in theory
# - for MSA/ template search, how to represent the outcome? Adding the
# pickle files quickly exceeds reasonable storage use
# - 'modeling' (run_multimer_jobs.py), are the four modes reflected by
# the JSON data/ does the JSON data look different for each mode?
# - are the scores only calculated by alpha-analysis.sif or do they
# come out of run_multimer_jobs.py? Does this go into its own step?
# - could Jupyter notebooks be included as associated file or do they
# require the pickle files?
# - what about including the tabular summary?
# - model selection: only if just a certain model is translated to
# ModelCIF, or mix it with scoring step?
# system.protocols.append()
# write modelcif.System to file
# NOTE: this will dump PAE on path provided in add_scores
# -> hence we cheat by changing path and back while being exception-safe...
......@@ -117,6 +209,53 @@ def _get_data_block_id_and_struct_and_entry_categories(
)
def _get_entities(
cif_json: dict, pdb_file: str, cmplx_name: list, prj_dir: str
) -> BioStructure:
"""Gather data for the mmCIF (target) entities."""
# read sequence from FastA, map to sequences from PDB file
sequences = {}
# Fetch FastA sequences to assign monomers to the molecular entities
prj_dir = os.path.join(prj_dir, "fastas")
if not os.path.isdir(prj_dir):
logging.info(f"FastA directory '{prj_dir}' does not exist.")
sys.exit()
for mnmr in cmplx_name:
fasta = os.path.join(prj_dir, f"{mnmr}.fa")
if not os.path.isfile(fasta):
logging.info(f"FastA file '{fasta}' does not exist.")
sys.exit()
with open(fasta, "r", encoding="ascii") as ffh:
seq = SeqIO.read(ffh, "fasta")
# Using MD5 sums for comparing sequences only works since AF2
# *ALWAYS* has the complete sequence in a chain.
sequences[hashlib.md5(seq.seq.encode()).hexdigest()] = mnmr
# gather molecular entities from PDB file
structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file)
cif_json["target_entities"] = []
already_seen = []
for chn, seq in zip(
structure.get_chains(), PPBuilder().build_peptides(structure)
):
seq = seq.get_sequence()
seq_md5 = hashlib.md5(seq.encode()).hexdigest()
cif_ent = {}
try:
e_idx = already_seen.index(seq_md5)
except ValueError:
pass
else:
cif_json["target_entities"][e_idx]["pdb_chain_id"].append(chn.id)
continue
cif_ent["pdb_sequence"] = seq
cif_ent["pdb_chain_id"] = [chn.id]
cif_ent["description"] = sequences[seq_md5]
cif_json["target_entities"].append(cif_ent)
already_seen.append(sequences[seq_md5])
return structure
def alphapulldown_model_to_modelcif(
cmplx_name: str,
mdl_file: str,
......@@ -136,7 +275,10 @@ def alphapulldown_model_to_modelcif(
_get_data_block_id_and_struct_and_entry_categories(
modelcif_json, cmplx_name
)
_store_as_modelcif(modelcif_json, mdl_file, out_dir)
# gather target entities (sequences that have been modeled) info
structure = _get_entities(modelcif_json, mdl_file, cmplx_name, prj_dir)
_store_as_modelcif(modelcif_json, structure, mdl_file, out_dir)
# ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")
......@@ -219,4 +361,5 @@ if __name__ == "__main__":
# like that, including author names?
# ToDo: where to store which model was chosen? Should be in Tara's models.
# LocalWords: ToDo AlphaPulldown PAEs dir struct
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: multimer sif Jupyter
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment