diff --git a/projects/dark-matter-metagenomics/README.md b/projects/dark-matter-metagenomics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..845e23eca15a8b368ca914434c4076bae00104b0 --- /dev/null +++ b/projects/dark-matter-metagenomics/README.md @@ -0,0 +1,32 @@ +# Modelling of representative sequences from NMPFamsDB (DB on metagenomic families) + +[Link to project in ModelArchive](https://modelarchive.org/doi/10.5452/ma-nmpfamsdb) (incl. background on project itself) + +Input files for conversion: +- Directly from Sergey: + - all_ptm_plddt.txt.gz: "csv" file (' ' as separator) with pTM and global pLDDT values for each model + - all_pdb.zip: all produced models (5 models per modelled family) + - all_pae.zip: PAE matrices ("as a text file") for each model +- From [NMPFamsDB downloads](https://bib.fleming.gr/NMPFamsDB/downloads): + - "AlphaFold2 3D models (PDB)" (structures_all.tgz): selected PDB models for 80585 families (expected to match top ranked one in all_pdb.zip) + - "Consensus Sequences (FASTA)" (consensus_all.fasta.gz): representative sequences for each family (expected to match modelled sequence where model available; contains 25613 extra sequences which is fine) + - "AlphaFold2 Input Alignments (FASTA)" (alphafold_msa_all.tgz): MSA used for each AF run (first sequence expected to match modelled sequence) + +Modelling setup: +- Total of 106198 families identified from clustered JGI sequences; only subset with 3D structure (see paper for details) +- Using AlphaFold v2.0 for monomer predictions with ptm weights, producing 5 models, without model relaxation, without templates, ranked by pLDDT, starting from a custom MSA +- Custom MSA as extra modelling step ("created by calculating the central or "pivot" sequence of each seed MSA, and refining each alignment using that sequence as the guide") +- Sequence source set to NMPFamsDB as representative sequences do not have a direct match to JGI sequences + +Special features here: +- No separate data for pLDDT and hence taken from b-factors of PDB files +- Needed to deal with UNK in sequences (here: global pLDDT and pTM and local PAE take it into account but no data available for local pLDDT as no coordinates for UNK res.; example: F000347) +- Custom MSA stored in accompanying data +- Extra models stored in accompanying data (incl. PAE but without duplicating custom MSA; similar setup as in human-heterodimers-w-crosslinks but here listing pTM and pLDDT in description of acc. files) +- Note that on the NMPFamsDB web site they list max. pTM and display model with max. pLDDT. In ModelArchive, we show top-pLDDT-ranked model as main entry and others in acc. data, but list max. pTM and max. pLDDT in model description. +- Used optional family name prefix to be able to batch process families in parallel (did 1062 parallel jobs) +- Soft PDB comparison to allow for some numerical differences between PDB stored on web and one in ModelCIF (due to random reruns of AF2 in Sergey's data; differences small enough to ignore; largest one in F004657 with RMSD of 0.204) +- 11 extra families had models in all_pdb.zip but those were skipped (no custom MSA there) + +Content: +- translate2modelcif.py : script to do conversion (was in virtual environment with same setup as Docker container here but with OST 2.6 RC; used python-modelcif 0.9 and ModelCIF dict. 
1.4.5) diff --git a/projects/dark-matter-metagenomics/translate2modelcif.py b/projects/dark-matter-metagenomics/translate2modelcif.py new file mode 100644 index 0000000000000000000000000000000000000000..955df92a8c43e78040f8a2c07e459f7492fba558 --- /dev/null +++ b/projects/dark-matter-metagenomics/translate2modelcif.py @@ -0,0 +1,1223 @@ +#! /usr/local/bin/ost +# -*- coding: utf-8 -*- +"""Translate models for Sergey from PDB + extra data into ModelCIF.""" + +from timeit import default_timer as timer +import argparse +import filecmp +import gzip +import os +import shutil +import sys +import zipfile + +import pandas as pd +import numpy as np + +import ihm +import ihm.citations +import modelcif +import modelcif.associated +import modelcif.dumper +import modelcif.model +import modelcif.protocol +import modelcif.reference + +from ost import io, geom, mol + + +# EXAMPLE for running: +# ost translate2modelcif.py ./raw_data/all_ptm_plddt.txt ./raw_data/all_pdb \ +# ./all_pae ./web_dloads/pivot ./modelcif --prefix=F000347 \ +# --pdb-web-path=./web_dloads/pdb \ +# --refseq-path=./web_dloads/consensus_all.fasta +# NOTE: add "--compress --all-models --all-pae" for final runs + + +################################################################################ +# HELPERS (mostly copied from from ost/modules/io/tests/test_io_omf.py) +# to compare PDBs +def _compare_atoms(a1, a2, occupancy_thresh=0.01, bfactor_thresh=0.01, + dist_thresh=0.001): + if abs(a1.occupancy - a2.occupancy) > occupancy_thresh: + return False + if abs(a1.b_factor - a2.b_factor) > bfactor_thresh: + return False + # modification: look at x,y,z spearately + if abs(a1.pos.x - a2.pos.x) > dist_thresh: + return False + if abs(a1.pos.y - a2.pos.y) > dist_thresh: + return False + if abs(a1.pos.z - a2.pos.z) > dist_thresh: + return False + if a1.is_hetatom != a2.is_hetatom: + return False + if a1.element != a2.element: + return False + return True + +def _compare_residues(r1, r2, at_occupancy_thresh=0.01, + at_bfactor_thresh=0.01, at_dist_thresh=0.001, + skip_ss=False, skip_rnums=False): + if r1.GetName() != r2.GetName(): + return False + if skip_rnums is False: + if r1.GetNumber() != r2.GetNumber(): + return False + if skip_ss is False: + if str(r1.GetSecStructure()) != str(r2.GetSecStructure()): + return False + if r1.one_letter_code != r2.one_letter_code: + return False + if r1.chem_type != r2.chem_type: + return False + if r1.chem_class != r2.chem_class: + return False + anames1 = [a.GetName() for a in r1.atoms] + anames2 = [a.GetName() for a in r2.atoms] + if sorted(anames1) != sorted(anames2): + return False + anames = anames1 + for aname in anames: + a1 = r1.FindAtom(aname) + a2 = r2.FindAtom(aname) + if not _compare_atoms(a1, a2, + occupancy_thresh=at_occupancy_thresh, + bfactor_thresh=at_bfactor_thresh, + dist_thresh=at_dist_thresh): + return False + return True + +def _compare_chains(ch1, ch2, at_occupancy_thresh=0.01, + at_bfactor_thresh=0.01, at_dist_thresh=0.001, + skip_ss=False, skip_rnums=False): + if len(ch1.residues) != len(ch2.residues): + return False + for r1, r2 in zip(ch1.residues, ch2.residues): + if not _compare_residues(r1, r2, + at_occupancy_thresh=at_occupancy_thresh, + at_bfactor_thresh=at_bfactor_thresh, + at_dist_thresh=at_dist_thresh, + skip_ss=skip_ss, skip_rnums=skip_rnums): + return False + return True + +def _compare_bonds(ent1, ent2): + bonds1 = list() + for b in ent1.bonds: + bond_partners = [str(b.first), str(b.second)] + bonds1.append([min(bond_partners), max(bond_partners), b.bond_order]) + bonds2 = 
list() + for b in ent2.bonds: + bond_partners = [str(b.first), str(b.second)] + bonds2.append([min(bond_partners), max(bond_partners), b.bond_order]) + return sorted(bonds1) == sorted(bonds2) + +def _compare_ent(ent1, ent2, at_occupancy_thresh=0.01, + at_bfactor_thresh=0.01, at_dist_thresh=0.001, + skip_ss=False, skip_cnames=False, skip_bonds=False, + skip_rnums=False, bu_idx=None): + if bu_idx is not None: + if ent1.GetName() + ' ' + str(bu_idx) != ent2.GetName(): + return False + else: + if ent1.GetName() != ent2.GetName(): + return False + chain_names_one = [ch.GetName() for ch in ent1.chains] + chain_names_two = [ch.GetName() for ch in ent2.chains] + if skip_cnames: + # only check whether we have the same number of chains + if len(chain_names_one) != len(chain_names_two): + return False + else: + if chain_names_one != chain_names_two: + return False + for ch1, ch2 in zip(ent1.chains, ent2.chains): + if not _compare_chains(ch1, ch2, + at_occupancy_thresh=at_occupancy_thresh, + at_bfactor_thresh=at_bfactor_thresh, + at_dist_thresh=at_dist_thresh, + skip_ss=skip_ss, skip_rnums=skip_rnums): + return False + if not skip_bonds: + if not _compare_bonds(ent1, ent2): + return False + return True + +def _compare_pdbs(f_name, f1, f2): + """Use atom-by-atom comparison on PDB files allowing num. errors.""" + # first do simple file diff. + if filecmp.cmp(f1, f2): + return True + else: + ent1 = io.LoadPDB(f1) + ent2 = io.LoadPDB(f2) + # allow a bit more errors as input files can have rounding errors + if _compare_ent(ent1, ent2, 0.011, 0.011, 0.0011, True, False, True): + return True + else: + # check manually and give warning... + atom_names_1 = [a.qualified_name for a in ent1.atoms] + atom_names_2 = [a.qualified_name for a in ent2.atoms] + assert atom_names_1 == atom_names_2 + b_diffs = [abs(a1.b_factor - a2.b_factor) \ + for a1, a2 in zip(ent1.atoms, ent2.atoms)] + max_b_diff = max(b_diffs) + rmsd = mol.alg.CalculateRMSD(ent1.Select(""), ent2.Select("")) + _warn_msg( + f"PDB file mismatch web vs top-ranked for {f_name}: " + f"RMSD {rmsd:.3f}, max. b_factor diff {max_b_diff:.3f}" + ) + return False +################################################################################ + + +def _abort_msg(msg, exit_code=1): + """Write error message and exit with exit_code.""" + print(f"{msg}\nAborting.", file=sys.stderr) + sys.exit(exit_code) + + +def _warn_msg(msg): + """Write a warning message to stdout.""" + print(f"WARNING: {msg}") + + +def _check_file(file_path): + """Make sure a file exists and is actually a file.""" + if not os.path.exists(file_path): + _abort_msg(f"File not found: '{file_path}'.") + if not os.path.isfile(file_path): + _abort_msg(f"File path does not point to file: '{file_path}'.") + + +def _check_folder(dir_path): + """Make sure a file exists and is actually a file.""" + if not os.path.exists(dir_path): + _abort_msg(f"Path not found: '{dir_path}'.") + if not os.path.isdir(dir_path): + _abort_msg(f"Path does not point to a directory: '{dir_path}'.") + + +def _check_opts_folder(dir_path): + """Remove trailing '/' (return fixed one) and check if path valid.""" + if dir_path.endswith("/"): + dir_path = dir_path[:-1] + _check_folder(dir_path) + return dir_path + + +def _parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=__doc__, + ) + + parser.add_argument( + "metadata_file", + type=str, + metavar="<METADATA FILE>", + help='Path to table with metadata. 
Expected columns: "ID" (Family ID ' + + 'F...), "mdl" (AF2-model-name), "pTM", "pLDDT".', + ) + parser.add_argument( + "model_dir", + type=str, + metavar="<MODEL DIR>", + help='Directory with model PDBs named "{ID}_{mdl}.pdb" (with ID and ' + + 'mdl matching info in metadata).', + ) + parser.add_argument( + "pae_dir", + type=str, + metavar="<PAE DIR>", + help='Directory with PAE text files named "{ID}_{mdl}.txt.gz" (with ID ' + + 'and mdl matching info in metadata).', + ) + parser.add_argument( + "msa_data_dir", + type=str, + metavar="<MSA DIR>", + help="Directory with F*.fasta files for custom MSAs.", + ) + parser.add_argument( + "out_dir", + type=str, + metavar="<OUTPUT DIR>", + help="Path to directory to store results.", + ) + parser.add_argument( + "--prefix", + type=str, + metavar="<PREFIX>", + default="", + help="Only process families starting with given prefix. By default " + + "all families are processed.", + ) + parser.add_argument( + "--all-models", + default=False, + action="store_true", + help="Process all 5 models for each family (top ranked as main, others " + + "as accompanying data).", + ) + parser.add_argument( + "--all-pae", + default=False, + action="store_true", + help="Store PAE for all models instead of just top ranked one.", + ) + parser.add_argument( + "--all-msa", + default=False, + action="store_true", + help="Store MSA with all models instead of just once.", + ) + parser.add_argument( + "--compress", + default=False, + action="store_true", + help="Compress ModelCIF file with gzip " + + "(note that acc. data is zipped either way).", + ) + parser.add_argument( + "--pdb-web-path", + type=str, + metavar="<PDB WEB PATH>", + default=None, + help="Optional path to directory with F*.pdb files as available on " + + "NMPFamsDB web site. Used to check if the top ranked model is the same.", + ) + parser.add_argument( + "--refseq-path", + type=str, + metavar="<PDB REFSEQ PATH>", + default=None, + help="Optional path to fasta file with all ref. seq. as available on " + + "NMPFamsDB web site. 
Used to check if it matches with MSA.", + ) + opts = parser.parse_args() + + # check input + _check_file(opts.metadata_file) + opts.model_dir = _check_opts_folder(opts.model_dir) + opts.pae_dir = _check_opts_folder(opts.pae_dir) + opts.msa_data_dir = _check_opts_folder(opts.msa_data_dir) + # check out_dir + if opts.out_dir.endswith("/"): + opts.out_dir = opts.out_dir[:-1] + if not os.path.exists(opts.out_dir): + os.makedirs(opts.out_dir) + if not os.path.isdir(opts.out_dir): + _abort_msg(f"Path '{opts.out_dir}' does not point to a directory.") + # check optional paths + if opts.pdb_web_path is not None: + opts.pdb_web_path = _check_opts_folder(opts.pdb_web_path) + if opts.refseq_path is not None: + _check_file(opts.refseq_path) + return opts + + +# pylint: disable=too-few-public-methods +class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM): + """Predicted accuracy according to the TM-score score in [0,1]""" + + name = "pTM" + software = None + + +class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + + name = "pLDDT" + software = None + + +class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + + name = "pLDDT" + software = None + + +class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, modelcif.qa_metric.PAE): + """Predicted aligned error (in Angstroms)""" + + name = "PAE" + software = None + + +class _NmpfamsdbTrgRef(modelcif.reference.TargetReference): + """NMPFamsDB as target reference.""" + + name = "Other" + other_details = "NMPFamsDB" + + +class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet): + """Have the default amino acid alphabet plus 'X' for unknown residues.""" + + def __init__(self): + """Create the alphabet.""" + super().__init__() + self._comps["X"] = self._comps["UNK"] + + +# pylint: enable=too-few-public-methods + + +def _get_res_num(r, use_auth=False): + """Get res. num. from auth. IDs if reading from mmCIF files.""" + if use_auth: + return int(r.GetStringProp("pdb_auth_resnum")) + return r.number.num + + +def _get_ch_name(ch, use_auth=False): + """Get chain name from auth. IDs if reading from mmCIF files.""" + if use_auth: + return ch.GetStringProp("pdb_auth_chain_name") + return ch.name + + +class _OST2ModelCIF(modelcif.model.AbInitioModel): + """Map OST entity elements to ihm.model""" + + def __init__(self, *args, **kwargs): + """Initialise a model""" + for i in ["ost_entity", "asym", "scores_json"]: + if i not in kwargs: + raise TypeError(f"Required keyword argument '{i}' not found.") + self.ost_entity = kwargs.pop("ost_entity") + self.asym = kwargs.pop("asym") + self.scores_json = kwargs.pop("scores_json") + + # use auth IDs for res. nums and chain names + self.use_auth = False + # what accuracy to use for PAE? (writer uses 3 anyway) + self.pae_digits = 3 + + # fetch plddts per residue + self.plddts = [] + for res in self.ost_entity.residues: + b_factors = [a.b_factor for a in res.atoms] + assert len(set(b_factors)) == 1 # must all be equal! + self.plddts.append(b_factors[0]) + + super().__init__(*args, **kwargs) + + def get_atoms(self): + # ToDo [internal]: Take B-factor out since its not a B-factor? + # NOTE: this assumes that _get_res_num maps residue to pos. 
in seqres + # within asym + for atm in self.ost_entity.atoms: + yield modelcif.model.Atom( + asym_unit=self.asym[_get_ch_name(atm.chain, self.use_auth)], + seq_id=_get_res_num(atm.residue, self.use_auth), + atom_id=atm.name, + type_symbol=atm.element, + x=atm.pos[0], + y=atm.pos[1], + z=atm.pos[2], + het=atm.is_hetatom, + biso=atm.b_factor, + occupancy=atm.occupancy, + ) + + def add_scores(self): + """Add QA metrics from AF2 scores.""" + # global scores + self.qa_metrics.extend( + ( + _GlobalPLDDT(self.scores_json["plddt_global"]), + _GlobalPTM(self.scores_json["ptm"]), + ) + ) + + # local scores + lpae = [] + i = 0 + for chn_i in self.ost_entity.chains: + ch_name = _get_ch_name(chn_i, self.use_auth) + for res_i in chn_i.residues: + # local pLDDT + res_num_i = _get_res_num(res_i, self.use_auth) + self.qa_metrics.append( + _LocalPLDDT( + self.asym[ch_name].residue(res_num_i), + self.plddts[i], + ) + ) + i += 1 + + # PAE needs to go by residue index as it also stores ones + # for missing residues (i.e. X) + if "pae" in self.scores_json: + pae_i = self.scores_json["pae"][res_num_i - 1] + for chn_j in self.ost_entity.chains: + for res_j in chn_j.residues: + res_num_j = _get_res_num(res_j, self.use_auth) + pae_ij = pae_i[res_num_j - 1] + lpae.append( + _LocalPairwisePAE( + self.asym[chn_i.name].residue(res_num_i), + self.asym[chn_j.name].residue(res_num_j), + round(pae_ij, self.pae_digits), + ) + ) + + self.qa_metrics.extend(lpae) + + +def _get_audit_authors(): + """Return the list of authors that produced this model.""" + return ( + "Pavlopoulos, Georgios A.", + "Baltoumas, Fotis A.", + "Liu, Sirui", + "Selvitopi, Oguz", + "Camargo, Antonio Pedro", + "Nayfach, Stephen", + "Azad, Ariful", + "Roux, Simon", + "Call, Lee", + "Ivanova, Natalia N.", + "Chen, I-Min", + "Paez-Espino, David", + "Karatzas, Evangelos", + "Novel Metagenome Protein Families Consortium", + "Iliopoulos, Ioannis", + "Konstantinidi, Konstantinos", + "Tiedje, James M.", + "Pett-Ridge, Jennifer", + "Baker, David", + "Visel, Axel", + "Ouzounis, Christos A.", + "Ovchinnikov, Sergey", + "Buluc, Aydin", + "Kyrpides, Nikos C.", + ) + + +def _get_metadata(metadata_file): + """Read csv file with metedata and prepare for next steps.""" + metadata = pd.read_csv( + metadata_file, sep=" ", names=["ID", "mdl", "pTM", "pLDDT"] + ) + return metadata + + +def _get_pdb_files(model_dir): + """Collect PDB files from model_dir. + + Returns dict with key = family name and value = list of paths to PDB files. + """ + pdb_files_split = {} # to return + pdb_files = [ + f for f in os.listdir(model_dir) if not f.startswith(".") + ] + for f in pdb_files: + f_path = os.path.join(model_dir, f) + f_name = f.split("_")[0] + if f_name in pdb_files_split: + pdb_files_split[f_name].append(f_path) + else: + pdb_files_split[f_name] = [f_path] + return pdb_files_split + + +def _get_config(): + """Define AF setup.""" + msa_description = ( + 'MSA created by calculating the central or "pivot" ' + "sequence of each seed MSA, and refining each " + "alignment using that sequence as the guide." + ) + mdl_description = ( + "Model generated using AlphaFold (v2.0.0 with models " + "fine-tuned to return pTM weights) producing 5 models, " + "without model relaxation, without templates, ranked " + "by pLDDT, starting from a custom MSA." 
+ ) + af_config = {} + return { + "af_config": af_config, + "af_version": "2.0.0", + "mdl_description": mdl_description, + "msa_description": msa_description, + "use_templates": False, + "use_small_bfd": False, + "use_multimer": False, + } + + +def _get_protocol_steps_and_software(config_data): + """Create the list of protocol steps with software and parameters used.""" + protocol = [] + + # MSA step + step = { + "method_type": "coevolution MSA", + "name": None, + "details": config_data["msa_description"], + } + step["input"] = "target_sequences" + step["output"] = "MSA" + step["software"] = [] + step["software_parameters"] = {} + protocol.append(step) + + # modelling step + step = { + "method_type": "modeling", + "name": None, + "details": config_data["mdl_description"], + } + # get input data + # Must refer to data already in the JSON, so we try keywords + step["input"] = "target_sequences_and_MSA" + # get output data + # Must refer to existing data, so we try keywords + step["output"] = "model" + # get software + if config_data["use_multimer"]: + step["software"] = [ + { + "name": "AlphaFold-Multimer", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.Citation( + pmid=None, + title="Protein complex prediction with " + + "AlphaFold-Multimer.", + journal="bioRxiv", + volume=None, + page_range=None, + year=2021, + authors=[ + "Evans, R.", + "O'Neill, M.", + "Pritzel, A.", + "Antropova, N.", + "Senior, A.", + "Green, T.", + "Zidek, A.", + "Bates, R.", + "Blackwell, S.", + "Yim, J.", + "Ronneberger, O.", + "Bodenstein, S.", + "Zielinski, M.", + "Bridgland, A.", + "Potapenko, A.", + "Cowie, A.", + "Tunyasuvunakool, K.", + "Jain, R.", + "Clancy, E.", + "Kohli, P.", + "Jumper, J.", + "Hassabis, D.", + ], + doi="10.1101/2021.10.04.463034", + ), + "location": "https://github.com/deepmind/alphafold", + "type": "package", + "version": config_data["af_version"], + } + ] + else: + step["software"] = [ + { + "name": "AlphaFold", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.citations.alphafold2, + "location": "https://github.com/deepmind/alphafold", + "type": "package", + "version": config_data["af_version"], + } + ] + step["software_parameters"] = config_data["af_config"] + protocol.append(step) + + return protocol + + +def _get_title(fam_name): + """Get a title for this modelling experiment.""" + return f"AlphaFold model for NMPFamsDB Family {fam_name}" + + +def _get_model_details(fam_name, max_pLDDT, max_pTM): + """Get the model description.""" + db_url = f"https://bib.fleming.gr/NMPFamsDB/family?id={fam_name}" + return ( + f'Model generated using AlphaFold (v2.0.0) for the "Representative ' + f'Sequence" of NMPFamsDB Metagenome / Metatranscriptome Family ' + f"{fam_name}.\n\nThe 5 produced models reached a max. global pLDDT of " + f"{round(max_pLDDT, 3)} and max. pTM of {round(max_pTM, 3)}.\n\n" + f"See {db_url} for additional details." + ) + + +def _get_model_group_name(): + """Get a name for a model group.""" + return None + + +def _get_sequence(chn, use_auth=False): + """Get the sequence out of an OST chain incl. '-' for gaps in resnums.""" + # initialise (add gaps if first is not at num. 
1) + lst_rn = _get_res_num(chn.residues[0], use_auth) + idx = 1 + sqe = "-" * (lst_rn - 1) + chn.residues[0].one_letter_code + + for res in chn.residues[idx:]: + lst_rn += 1 + while lst_rn != _get_res_num(res, use_auth): + sqe += "-" + lst_rn += 1 + sqe += res.one_letter_code + return sqe + + +def _get_entities(pdb_file, ref_seq, fam_name): + """Gather data for the mmCIF (target) entities.""" + _check_file(pdb_file) + + ost_ent = io.LoadPDB(pdb_file) + if ost_ent.chain_count != 1: + raise RuntimeError(f"Unexpected oligomer in {pdb_file}") + chn = ost_ent.chains[0] + sqe_gaps = _get_sequence(chn) + + # NOTE: can have gaps to accommodate "X" in ref_seq + exp_seq = sqe_gaps.replace("-", "X") + len_diff = len(ref_seq.string) - len(exp_seq) + if len_diff > 0: + exp_seq += "X" * len_diff + if exp_seq != ref_seq.string: + raise RuntimeError(f"Sequence in {pdb_file} does not match ref_seq") + + cif_ent = { + "seqres": ref_seq.string, + "pdb_sequence": sqe_gaps, + "pdb_chain_id": [_get_ch_name(chn, False)], + "fam_name": fam_name, + "description": "Representative Sequence of NMPFamsDB Family " + + f"{fam_name}", + } + + return [cif_ent], ost_ent + + +def _get_modelcif_entities(target_ents, asym_units, system): + """Create ModelCIF entities and asymmetric units.""" + alphabet = _LPeptideAlphabetWithX() + for cif_ent in target_ents: + mdlcif_ent = modelcif.Entity( + # NOTE: sequence here defines residues in model! + cif_ent["seqres"], + alphabet=alphabet, + description=cif_ent["description"], + source=None, + references=[ + _NmpfamsdbTrgRef( + cif_ent["fam_name"], + cif_ent["fam_name"], + align_begin=1, + align_end=len(cif_ent["seqres"]), + ) + ], + ) + # NOTE: this assigns (potentially new) alphabetic chain names + for pdb_chain_id in cif_ent["pdb_chain_id"]: + asym_units[pdb_chain_id] = modelcif.AsymUnit( + mdlcif_ent, + strand_id=pdb_chain_id, + ) + system.target_entities.append(mdlcif_ent) + + +def _get_assoc_pae_file(entry_id, mdl_name): + """Generate a associated file object to extract PAE to extra file.""" + return modelcif.associated.LocalPairwiseQAScoresFile( + f"{mdl_name}_local_pairwise_qa.cif", + categories=["_ma_qa_metric_local_pairwise"], + copy_categories=["_ma_qa_metric"], + entry_id=entry_id, + entry_details="This file is an associated file consisting " + + "of local pairwise QA metrics. This is a partial mmCIF " + + "file and can be validated by merging with the main " + + "mmCIF file containing the model coordinates and other " + + "associated data.", + details="Predicted aligned error", + ) + + +def _get_aln_data(): + """Generate Data object for ALN.""" + aln_data = modelcif.data.Data("Custom MSA for modelling") + aln_data.data_content_type = "coevolution MSA" + return aln_data + + +def _get_assoc_aln_file(fle_path): + """Generate a modelcif.associated.File object pointing to FASTA formatted + file containing MSA. 
+ """ + cfile = modelcif.associated.File( + fle_path, + details="Custom MSA for modelling", + data=_get_aln_data(), + ) + cfile.file_format = "fasta" + cfile.file_content = "multiple sequence alignments" + return cfile + + +def _get_associated_files(mdl_name, arc_files): + """Create entry for associated files.""" + # package all into zip file + return modelcif.associated.Repository( + "", + [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)], + ) + # NOTE: by convention MA expects zip file with same name as model-cif + + +def _assemble_modelcif_software(soft_dict): + """Create a modelcif.Software instance from dictionary.""" + return modelcif.Software( + soft_dict["name"], + soft_dict["classification"], + soft_dict["description"], + soft_dict["location"], + soft_dict["type"], + soft_dict["version"], + citation=soft_dict["citation"], + ) + + +def _get_modelcif_protocol_software(js_step): + """Assemble software entries for a ModelCIF protocol step.""" + if js_step["software"]: + if len(js_step["software"]) == 1: + sftwre = _assemble_modelcif_software(js_step["software"][0]) + else: + sftwre = [] + for sft in js_step["software"]: + sftwre.append(_assemble_modelcif_software(sft)) + sftwre = modelcif.SoftwareGroup(elements=sftwre) + if js_step["software_parameters"]: + params = [] + for key, val in js_step["software_parameters"].items(): + params.append(modelcif.SoftwareParameter(key, val)) + if isinstance(sftwre, modelcif.SoftwareGroup): + sftwre.parameters = params + else: + sftwre = modelcif.SoftwareGroup( + elements=(sftwre,), parameters=params + ) + return sftwre + return None + + +def _get_modelcif_protocol_data(data_label, target_entities, aln_data, model): + """Assemble data for a ModelCIF protocol step.""" + if data_label == "target_sequences": + data = modelcif.data.DataGroup(target_entities) + elif data_label == "MSA": + data = aln_data + elif data_label == "target_sequences_and_MSA": + data = modelcif.data.DataGroup(target_entities) + data.append(aln_data) + elif data_label == "model": + data = model + else: + raise RuntimeError(f"Unknown protocol data: '{data_label}'") + return data + + +def _get_modelcif_protocol(protocol_steps, target_entities, aln_data, model): + """Create the protocol for the ModelCIF file.""" + protocol = modelcif.protocol.Protocol() + for js_step in protocol_steps: + sftwre = _get_modelcif_protocol_software(js_step) + input_data = _get_modelcif_protocol_data( + js_step["input"], target_entities, aln_data, model + ) + output_data = _get_modelcif_protocol_data( + js_step["output"], target_entities, aln_data, model + ) + + protocol.steps.append( + modelcif.protocol.Step( + input_data=input_data, + output_data=output_data, + name=js_step["name"], + details=js_step["details"], + software=sftwre, + ) + ) + protocol.steps[-1].method_type = js_step["method_type"] + return protocol + + +def _compress_cif_file(cif_file): + """Compress cif file and delete original.""" + with open(cif_file, "rb") as f_in: + with gzip.open(cif_file + ".gz", "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + os.remove(cif_file) + + +def _package_associated_files(repo): + """Compress associated files into single zip file and delete original.""" + # zip settings tested for good speed vs compression + for archive in repo.files: + with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip: + for zfile in archive.files: + cif_zip.write(zfile.path, arcname=zfile.path) + os.remove(zfile.path) + + +def _get_assoc_mdl_file(fle_path, data_json): + """Generate a 
modelcif.associated.File object that looks like a CIF file. + The dedicated CIFFile functionality in modelcif would also try to write it. + """ + cfile = modelcif.associated.File( + fle_path, + details=f"#{data_json['mdl_rank']} ranked model; " + + f"pTM {round(data_json['ptm'], 3)}, " + + f"pLDDT {round(data_json['plddt_global'], 3)}", + ) + cfile.file_format = "cif" + return cfile + + +def _get_assoc_zip_file(fle_path, data_json): + """Create a modelcif.associated.File object that looks like a ZIP file. + This is NOT the archive ZIP file for the PAEs but to store that in the + ZIP archive of the selected model.""" + zfile = modelcif.associated.File( + fle_path, + details="archive with multiple files for " + + f"#{data_json['mdl_rank']} ranked model", + ) + zfile.file_format = "other" + return zfile + + +def _store_as_modelcif( + data_json, ost_ent, out_dir, mdl_name, compress, add_pae, add_aln, add_files +): + """Mix all the data into a ModelCIF file.""" + print(" generating ModelCIF objects...", end="") + pstart = timer() + # create system to gather all the data + system = modelcif.System( + title=data_json["title"], + id=data_json["mdl_id"].upper(), + model_details=data_json["model_details"], + ) + + # add primary citation (not using from_pubmed_id to ensure that author names + # have no special chars) + system.citations.append(ihm.Citation( + pmid="37821698", + title="Unraveling the functional dark matter through global " + + "metagenomics.", + journal="Nature", + volume=622, + page_range=(594, 602), + year=2023, + authors=[ + 'Pavlopoulos, G.A.', 'Baltoumas, F.A.', 'Liu, S.', 'Selvitopi, O.', + 'Camargo, A.P.', 'Nayfach, S.', 'Azad, A.', 'Roux, S.', 'Call, L.', + 'Ivanova, N.N.', 'Chen, I.M.', 'Paez-Espino, D.', 'Karatzas, E.', + 'Iliopoulos, I.', 'Konstantinidis, K.', 'Tiedje, J.M.', + 'Pett-Ridge, J.', 'Baker, D.', 'Visel, A.', 'Ouzounis, C.A.', + 'Ovchinnikov, S.', 'Buluc, A.', 'Kyrpides, N.C.' + ], + doi="10.1038/s41586-023-06583-7", + is_primary=True, + )) + + # create an asymmetric unit and an entity per target sequence + asym_units = {} + _get_modelcif_entities(data_json["target_entities"], asym_units, system) + + # audit_authors + system.authors.extend(data_json["audit_authors"]) + + # set up the model to produce coordinates + model = _OST2ModelCIF( + assembly=modelcif.Assembly(asym_units.values()), + asym=asym_units, + ost_entity=ost_ent, + scores_json=data_json, + name=data_json["mdl_name"], + ) + print(f" ({timer()-pstart:.2f}s)") + print(" processing QA scores...", end="", flush=True) + pstart = timer() + model.add_scores() + print(f" ({timer()-pstart:.2f}s)") + + model_group = modelcif.model.ModelGroup( + [model], name=data_json["model_group_name"] + ) + system.model_groups.append(model_group) + + # handle additional files + arc_files = [] + if add_pae: + arc_files.append(_get_assoc_pae_file(system.id, mdl_name)) + if add_aln: + aln_file = _get_assoc_aln_file(data_json["aln_file_name"]) + arc_files.append(aln_file) + aln_data = aln_file.data + else: + aln_data = _get_aln_data() + aln_data.data_other_details = "MSA stored with parent entry" + arc_files.extend(add_files) + if arc_files: + system.repositories.append(_get_associated_files(mdl_name, arc_files)) + + # get data and steps + protocol = _get_modelcif_protocol( + data_json["protocol"], + system.target_entities, + aln_data, + model, + ) + system.protocols.append(protocol) + + # write modelcif System to file (NOTE: no PAE here!) 
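+    # PAE was registered above as a modelcif.associated.LocalPairwiseQAScoresFile,
+    # so the dumper writes _ma_qa_metric_local_pairwise to that separate file,
+    # which ends up in the accompanying ZIP archive rather than in this main file.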
+ print(" write to disk...", end="", flush=True) + pstart = timer() + # copy aln file to compress them + if add_aln: + shutil.copyfile( + data_json["aln_file_path"], + os.path.join(out_dir, data_json["aln_file_name"]), + ) + # NOTE: we change path and back while being exception-safe to handle zipfile + oldpwd = os.getcwd() + os.chdir(out_dir) + mdl_fle = f"{mdl_name}.cif" + try: + with open(mdl_fle, "w", encoding="ascii") as mmcif_fh: + modelcif.dumper.write(mmcif_fh, [system]) + if arc_files: + _package_associated_files(system.repositories[0]) + if compress: + _compress_cif_file(mdl_fle) + mdl_fle += ".gz" + finally: + os.chdir(oldpwd) + print(f" ({timer()-pstart:.2f}s)") + assoc_files = [_get_assoc_mdl_file(mdl_fle, data_json)] + if arc_files: + assoc_files.append( + _get_assoc_zip_file(system.repositories[0].files[0].path, data_json) + ) + return assoc_files + + +def _translate2modelcif_single( + f_name, opts, metadata, pdb_files, mdl_rank, aln_file, aln_path, ref_seq, + mdl_details, add_files=[] +): + """Convert a single model with its accompanying data to ModelCIF.""" + mdl_id = f_name + if mdl_rank > 1: + mdl_id += f"_rank_{mdl_rank}_{metadata.mdl}" + + print(f" translating {mdl_id}...") + pdb_start = timer() + + # gather data into JSON-like structure + print(" preparing data...", end="") + pstart = timer() + + config_data = _get_config() + mdlcf_json = {} + mdlcf_json["audit_authors"] = _get_audit_authors() + mdlcf_json["protocol"] = _get_protocol_steps_and_software(config_data) + mdlcf_json["config_data"] = config_data + mdlcf_json["mdl_id"] = mdl_id # used for entry ID + mdlcf_json["mdl_rank"] = mdl_rank + mdlcf_json["aln_file_name"] = aln_file + mdlcf_json["aln_file_path"] = aln_path + + # find model to process + pdb_list_sel = [f for f in pdb_files if metadata.mdl in f] + if len(pdb_list_sel) != 1: + # this should never happen + raise RuntimeError( + f"Multiple file matches found for {metadata.mdl} in {f_name}" + ) + if mdl_rank == 1: + mdlcf_json["mdl_name"] = f"Top ranked model ({metadata.mdl})" + else: + mdlcf_json["mdl_name"] = f"#{mdl_rank} ranked model ({metadata.mdl})" + + # process coordinates + pdb_file = pdb_list_sel[0] + target_entities, ost_ent = _get_entities(pdb_file, ref_seq, f_name) + mdlcf_json["target_entities"] = target_entities + # sanity check (only for top ranked model!) + if mdl_rank == 1 and opts.pdb_web_path is not None: + pdb_file_web = os.path.join(opts.pdb_web_path, f"{f_name}.pdb") + # warning handled in compare function... 
+ _compare_pdbs(f_name, pdb_file, pdb_file_web) + + # get scores for this entry + mdlcf_json["plddt_global"] = metadata.pLDDT + mdlcf_json["ptm"] = metadata.pTM + add_pae = (mdl_rank == 1 or opts.all_pae) + if add_pae: + pdb_basename = os.path.basename(pdb_file) + pae_basename = os.path.splitext(pdb_basename)[0] + ".txt.gz" + pae_file = os.path.join(opts.pae_dir, pae_basename) + _check_file(pae_file) + mdlcf_json["pae"] = np.loadtxt(pae_file) + exp_num_res = len(ref_seq.string) + if mdlcf_json["pae"].shape != (exp_num_res, exp_num_res): + raise RuntimeError(f"Unexpected PAE shape in {pae_file}") + + + # fill annotations + mdlcf_json["title"] = _get_title(f_name) + if mdl_rank != 1: + mdlcf_json["title"] += f" (#{mdl_rank} ranked model)" + mdlcf_json["model_details"] = mdl_details + mdlcf_json["model_group_name"] = _get_model_group_name() + print(f" ({timer()-pstart:.2f}s)") + + # save ModelCIF + assoc_files = _store_as_modelcif( + mdlcf_json, + ost_ent, + opts.out_dir, + mdl_id, + opts.compress and mdl_rank == 1, + add_pae, + opts.all_msa or mdl_rank == 1, + add_files + ) + + # check if result can be read and has expected seq. + mdl_path = os.path.join(opts.out_dir, assoc_files[0].path) + ent, ss = io.LoadMMCIF(mdl_path, seqres=True) + exp_seqs = [ + trg_ent["pdb_sequence"] for trg_ent in mdlcf_json["target_entities"] + ] + assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}" + # here we expect auth = label IDs + ent_seq = "".join([_get_sequence(chn, False) for chn in ent.chains]) + ent_seq_a = "".join([_get_sequence(chn, True) for chn in ent.chains]) + assert ent_seq == ent_seq_a + assert ent_seq == "".join(exp_seqs), f"Bad seq. {mdl_id}" + ent_seqres = "".join( + [ss.FindSequence(chn.name).string for chn in ent.chains] + ) + exp_seqres = "".join( + [trg_ent["seqres"] for trg_ent in mdlcf_json["target_entities"]] + ) + assert ent_seqres == exp_seqres, f"Bad seqres {mdl_id}" + + print(f" ... done with {mdl_id} ({timer()-pdb_start:.2f}s).") + + return assoc_files + + +def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check): + """Convert a family of models with their accompanying data to ModelCIF.""" + + # expected to have exactly 5 models per family + if len(metadata_fam) != 5: + raise RuntimeError( + f"Unexpected number of {len(metadata_fam)} models in " + f"metadata for family {f_name}." + ) + + # skip if done already + if opts.compress: + cifext = "cif.gz" + else: + cifext = "cif" + mdl_path = os.path.join(opts.out_dir, f"{f_name}.{cifext}") + if os.path.exists(mdl_path): + print(f" {f_name} already done...") + return + + # get aln_data and ref. seq. for this entry + aln_file = f"{f_name}.fasta" + aln_path = os.path.join(opts.msa_data_dir, aln_file) + # expected 11 extra families compared to web data but those don't have MSAs + # -> skipped for consistency and to keep code simple here + if not os.path.exists(aln_path): + _warn_msg(f"Missing MSA for {f_name}. 
Skipping...") + return + + aln = io.LoadAlignment(aln_path) # note: this checks that it's an actual MSA + ref_seq = aln.sequences[0] + if ref_seq_check is not None and ref_seq_check.string != ref_seq.string: + raise RuntimeError(f"Sequence mismatch for {f_name}") + + # get global model details + mdl_details = _get_model_details( + f_name, metadata_fam.pLDDT.max(), metadata_fam.pTM.max() + ) + # rank available models + metadata_sorted = metadata_fam.sort_values("pLDDT", ascending=False) + add_files = [] + if opts.all_models: + for idx in range(1, 5): + assoc_files = _translate2modelcif_single( + f_name, opts, metadata_sorted.iloc[idx], pdb_files, idx + 1, + aln_file, aln_path, ref_seq, mdl_details + ) + add_files.extend(assoc_files) + # process top ranked one + _translate2modelcif_single( + f_name, opts, metadata_sorted.iloc[0], pdb_files, 1, + aln_file, aln_path, ref_seq, mdl_details, add_files + ) + + +def _main(): + """Run as script.""" + opts = _parse_args() + + # parse/fetch global data + metadata_full = _get_metadata(opts.metadata_file) + pdb_files_split = _get_pdb_files(opts.model_dir) + if opts.refseq_path is not None: + refseqs = io.LoadSequenceList(opts.refseq_path) + else: + refseqs = None + + # get on with models + print(f"Working on {opts.prefix}*...") + + # iterate over models + for f_name in sorted(pdb_files_split): + if f_name.startswith(opts.prefix): + _translate2modelcif( + f_name, + opts, + metadata_full[metadata_full.ID == f_name], + pdb_files_split[f_name], + refseqs.FindSequence(f_name) if refseqs is not None else None, + ) + + print(f"... done with {opts.prefix}*.") + + +if __name__ == "__main__": + _main() diff --git a/projects/phytoplasma-effectors/README.md b/projects/phytoplasma-effectors/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1f7dec837ae5f3046451ee630ffd40b147cad075 --- /dev/null +++ b/projects/phytoplasma-effectors/README.md @@ -0,0 +1,18 @@ +# Modelling of phytoplasma effectors + +[Link to project in ModelArchive](https://modelarchive.org/doi/10.5452/ma-saps) (incl. 
background on project itself) + +Setup: +- Using AlphaFold v2.2 for monomer predictions with fairly default settings +- Conversion based only on PDB files without extra data +- CSV file with links to accession codes in UniProt + +Special features here: +- Generic code added to find best sequence match in UniProt history and report mismatches +- Sequences with remaining mismatches (here due to experimental assays) labelled in entity name +- One obsolete entry (SAP11_MBS linked to A0A1B3JKP4 in UniProt or UPI00083DE1DE in UniParc) +- pLDDT extracted from b-factors (simplest setup since no other QA scores anyway) + +Content: +- translate2modelcif.py : script to do conversion +- input_data: PDB files and CSV used for conversion diff --git a/projects/phytoplasma-effectors/input_data/accessions.csv b/projects/phytoplasma-effectors/input_data/accessions.csv new file mode 100644 index 0000000000000000000000000000000000000000..a9e351b9eab0a83341219089f8a63a1f07550142 --- /dev/null +++ b/projects/phytoplasma-effectors/input_data/accessions.csv @@ -0,0 +1,22 @@ +Accession,Name +Q2NJQ2,SAP54 +A0A0P7KHL3,SAP54_Peanut +A0A4Y5N0H8,SAP54_RP +Q2NJD7,SAP49 +Q2NJN9,SAP41 +Q2NJB0,SAP66 +A0A410HXL4,SAP11_WBDL +Q2NK94,SAP05 +Q2NJA6,SAP11 +Q2NII6,SAP13 +Q2NKA4,SAP21 +Q2NJZ9,SAP27 +Q2NIN8,SAP35 +Q2NJL8,SAP42 +Q2NJI2,SAP45 +Q2NJA8,SAP67 +Q2NJA7,SAP68 +Q2NK93,SAP06 +Q2NJJ6,SAP44 +Q2NJD6,SAP48 +A0A1B3JKP4,SAP11_MBS diff --git a/projects/phytoplasma-effectors/input_data/structures.zip b/projects/phytoplasma-effectors/input_data/structures.zip new file mode 100644 index 0000000000000000000000000000000000000000..91cc3f3f7bf7119669f78792ac9332ec2b467b66 Binary files /dev/null and b/projects/phytoplasma-effectors/input_data/structures.zip differ diff --git a/projects/phytoplasma-effectors/translate2modelcif.py b/projects/phytoplasma-effectors/translate2modelcif.py new file mode 100644 index 0000000000000000000000000000000000000000..e26ae244f202dc4d3ed8bfd173e4b19163fe916f --- /dev/null +++ b/projects/phytoplasma-effectors/translate2modelcif.py @@ -0,0 +1,977 @@ +#! 
/usr/local/bin/ost +# -*- coding: utf-8 -*- + +"""Translate models for Miguel from PDB + extra data into ModelCIF.""" + +# EXAMPLES for running: +""" +ost translate2modelcif.py ./structures ./accessions.csv ./modelcif \ + --compress > script_out.txt +""" + +import argparse +import datetime +import gzip +import os +import shutil +import sys +import zipfile +import pickle +import filecmp +import re +from timeit import default_timer as timer +import numpy as np +import requests +import ujson as json +import gemmi +import pandas as pd + +import ihm +import ihm.citations +import modelcif +import modelcif.associated +import modelcif.dumper +import modelcif.model +import modelcif.protocol +import modelcif.reference + +from ost import io, seq + + +def _parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=__doc__, + ) + + parser.add_argument( + "model_dir", + type=str, + metavar="<MODEL DIR>", + help="Directory with PDB files to be translated.", + ) + parser.add_argument( + "metadata_file", + type=str, + metavar="<METADATA FILE>", + help="Path to CSV file with metadata.", + ) + parser.add_argument( + "out_dir", + type=str, + metavar="<OUTPUT DIR>", + help="Path to directory to store results.", + ) + parser.add_argument( + "--compress", + default=False, + action="store_true", + help="Compress ModelCIF file with gzip.", + ) + + opts = parser.parse_args() + + # check that model dir exists + if opts.model_dir.endswith("/"): + opts.model_dir = opts.model_dir[:-1] + if not os.path.exists(opts.model_dir): + _abort_msg(f"Model directory '{opts.model_dir}' does not exist.") + if not os.path.isdir(opts.model_dir): + _abort_msg(f"Path '{opts.model_dir}' does not point to a directory.") + # check metadata_file + if not os.path.exists(opts.metadata_file): + _abort_msg(f"Metadata file '{opts.metadata_file}' does not exist.") + if not os.path.isfile(opts.metadata_file): + _abort_msg(f"Path '{opts.metadata_file}' does not point to a file.") + # check out_dir + if opts.out_dir.endswith("/"): + opts.out_dir = opts.out_dir[:-1] + if not os.path.exists(opts.out_dir): + os.makedirs(opts.out_dir) + if not os.path.isdir(opts.out_dir): + _abort_msg(f"Path '{opts.out_dir}' does not point to a directory.") + return opts + + +# pylint: disable=too-few-public-methods +class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + name = "pLDDT" + software = None + +class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + name = "pLDDT" + software = None +# pylint: enable=too-few-public-methods + + +def _get_res_num(r, use_auth=False): + """Get res. num. from auth. IDs if reading from mmCIF files.""" + if use_auth: + return int(r.GetStringProp("pdb_auth_resnum")) + else: + return r.number.num + + +def _get_ch_name(ch, use_auth=False): + """Get chain name from auth. IDs if reading from mmCIF files.""" + if use_auth: + return ch.GetStringProp("pdb_auth_chain_name") + else: + return ch.name + + +class _OST2ModelCIF(modelcif.model.AbInitioModel): + """Map OST entity elements to ihm.model""" + + def __init__(self, *args, **kwargs): + """Initialise a model""" + self.ost_entity = kwargs.pop("ost_entity") + self.asym = kwargs.pop("asym") + + # use auth IDs for res. 
nums and chain names + self.use_auth = False + + # fetch plddts per residue + self.plddts = [] + for res in self.ost_entity.residues: + b_factors = [a.b_factor for a in res.atoms] + assert len(set(b_factors)) == 1 # must all be equal! + self.plddts.append(b_factors[0]) + + super().__init__(*args, **kwargs) + + + + def get_atoms(self): + # ToDo [internal]: Take B-factor out since its not a B-factor? + # NOTE: this assumes that _get_res_num maps residue to pos. in seqres + # within asym + for atm in self.ost_entity.atoms: + yield modelcif.model.Atom( + asym_unit=self.asym[_get_ch_name(atm.chain, self.use_auth)], + seq_id=_get_res_num(atm.residue, self.use_auth), + atom_id=atm.name, + type_symbol=atm.element, + x=atm.pos[0], + y=atm.pos[1], + z=atm.pos[2], + het=atm.is_hetatom, + biso=atm.b_factor, + occupancy=atm.occupancy, + ) + + def add_scores(self): + """Add QA metrics from AF2 scores.""" + # global scores + self.qa_metrics.append( + _GlobalPLDDT(np.mean(self.plddts)) + ) + + # local scores + i = 0 + for chn_i in self.ost_entity.chains: + ch_name = _get_ch_name(chn_i, self.use_auth) + for res_i in chn_i.residues: + # local pLDDT + res_num = _get_res_num(res_i, self.use_auth) + self.qa_metrics.append( + _LocalPLDDT( + self.asym[ch_name].residue(res_num), + self.plddts[i], + ) + ) + i += 1 + + +def _abort_msg(msg, exit_code=1): + """Write error message and exit with exit_code.""" + print(f"{msg}\nAborting.", file=sys.stderr) + sys.exit(exit_code) + + +def _warn_msg(msg): + """Write a warning message to stdout.""" + print(f"WARNING: {msg}") + + +def _check_file(file_path): + """Make sure a file exists and is actually a file.""" + if not os.path.exists(file_path): + _abort_msg(f"File not found: '{file_path}'.") + if not os.path.isfile(file_path): + _abort_msg(f"File path does not point to file: '{file_path}'.") + + +def _get_audit_authors(): + """Return the list of authors that produced this model.""" + return ( + "Correa Marrero, Miguel", + "Capdevielle, Sylvain", + "Huang, Weijie", + "Al-Subhi, Ali M.", + "Busscher, Marco", + "Busscher-Lange, Jacqueline", + "van der Wal, Froukje", + "de Ridder, Dick", + "van Dijk, Aalt D.J.", + "Hogenhout, Saskia A.", + "Immink, Richard", + ) + + +def _get_metadata(metadata_file): + """Read csv file with metedata and prepare for next steps.""" + metadata = pd.read_csv(metadata_file) + # make sure names (== PDB file names) are unique + assert len(set(metadata.Name)) == metadata.shape[0] + # columns: [Accession, Name] + return metadata + + +def _get_config(): + """Define AF setup.""" + description = "Model generated using AlphaFold (v2.2.0) producing 5 " \ + "monomer models with 3 recycles each, without model " \ + "relaxation, using templates (up to Aug. 4 2022), ranked " \ + "by pLDDT, starting from an MSA with reduced_dbs setting." 
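+    # these key/value pairs are attached to the modelling protocol step as
+    # ModelCIF software parameters (see _get_protocol_steps_and_software)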
+ af_config = { + "model_preset": "monomer", + "db_preset": "reduced_dbs", + "max_template_date": "2022-08-04", + "run_relax": False, + } + return { + "af_config": af_config, + "af_version": "2.2.0", + "description": description, + "use_templates": True, + "use_small_bfd": True, + "use_multimer": False, + } + + +def _get_protocol_steps_and_software(config_data): + """Create the list of protocol steps with software and parameters used.""" + protocol = [] + + # modelling step + step = { + "method_type": "modeling", + "name": None, + "details": config_data["description"], + } + # get input data + # Must refer to data already in the JSON, so we try keywords + step["input"] = "target_sequences" + # get output data + # Must refer to existing data, so we try keywords + step["output"] = "model" + # get software + if config_data["use_multimer"]: + step["software"] = [{ + "name": "AlphaFold-Multimer", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.Citation( + pmid=None, + title="Protein complex prediction with " + + "AlphaFold-Multimer.", + journal="bioRxiv", + volume=None, + page_range=None, + year=2021, + authors=[ + "Evans, R.", + "O'Neill, M.", + "Pritzel, A.", + "Antropova, N.", + "Senior, A.", + "Green, T.", + "Zidek, A.", + "Bates, R.", + "Blackwell, S.", + "Yim, J.", + "Ronneberger, O.", + "Bodenstein, S.", + "Zielinski, M.", + "Bridgland, A.", + "Potapenko, A.", + "Cowie, A.", + "Tunyasuvunakool, K.", + "Jain, R.", + "Clancy, E.", + "Kohli, P.", + "Jumper, J.", + "Hassabis, D.", + ], + doi="10.1101/2021.10.04.463034", + ), + "location": "https://github.com/deepmind/alphafold", + "type": "package", + "version": config_data["af_version"], + }] + else: + step["software"] = [{ + "name": "AlphaFold", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.citations.alphafold2, + "location": "https://github.com/deepmind/alphafold", + "type": "package", + "version": config_data["af_version"], + }] + step["software_parameters"] = config_data["af_config"] + protocol.append(step) + + return protocol + + +def _get_title(prot_name): + """Get a title for this modelling experiment.""" + return f"AlphaFold2 model of Candidatus Phytoplasma ({prot_name})" + + +def _get_model_details(prot_name): + """Get the model description.""" + return f"The AlphaFold2 model of Candidatus Phytoplasma ({prot_name}) is " \ + f"part of a larger structural dataset. The complete dataset " \ + f"comprises AlphaFold2 models of 21 different phytoplasma " \ + f"effectors studied in a protein-protein interaction assay." + + +def _get_model_group_name(): + """Get a name for a model group.""" + return None + + +def _get_sequence(chn, use_auth=False): + """Get the sequence out of an OST chain incl. '-' for gaps in resnums.""" + # initialise (add gaps if first is not at num. 1) + lst_rn = _get_res_num(chn.residues[0], use_auth) + idx = 1 + sqe = "-" * (lst_rn - 1) \ + + chn.residues[0].one_letter_code + + for res in chn.residues[idx:]: + lst_rn += 1 + while lst_rn != _get_res_num(res, use_auth): + sqe += "-" + lst_rn += 1 + sqe += res.one_letter_code + return sqe + + +def _check_sequence(up_ac, sequence): + """Verify sequence to only contain standard olc.""" + ns_aa_pos = [] # positions of non-standard amino acids + for i, res in enumerate(sequence): + if res not in "ACDEFGHIKLMNPQRSTVWY": + if res == "U": + _warn_msg( + f"Selenocysteine found at position {i+1} of entry " + + f"'{up_ac}', this residue may be missing in the " + + "model." 
+ ) + ns_aa_pos.append(i) + continue + raise RuntimeError( + "Non-standard aa found in UniProtKB sequence " + + f"for entry '{up_ac}': {res}, position {i+1}" + ) + return ns_aa_pos + + +def _get_n_parse_up_entry(up_ac, up_url): + """Get data for an UniProtKB entry and parse it.""" + # This is a simple parser for UniProtKB txt format, instead of breaking it + # up into multiple functions, we just allow many many branches & statements, + # here. + # pylint: disable=too-many-branches,too-many-statements + data = {} + data["up_organism"] = "" + data["up_sequence"] = "" + data["up_ac"] = up_ac + rspns = requests.get(up_url, timeout=180) + for line in rspns.iter_lines(decode_unicode=True): + if line.startswith("ID "): + sline = line.split() + if len(sline) != 5: + raise RuntimeError(f"Unusual UniProtKB ID line found:\n" \ + f"'{line}'") + data["up_id"] = sline[1] + elif line.startswith("OX NCBI_TaxID="): + # Following strictly the UniProtKB format: 'OX NCBI_TaxID=<ID>;' + data["up_ncbi_taxid"] = line[len("OX NCBI_TaxID=") : -1] + data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip() + elif line.startswith("OS "): + if line[-1] == ".": + data["up_organism"] += line[len("OS ") : -1] + else: + data["up_organism"] += line[len("OS ") : -1] + " " + elif line.startswith("SQ "): + sline = line.split() + if len(sline) != 8: + raise RuntimeError(f"Unusual UniProtKB SQ line found:\n" \ + f"'{line}'") + data["up_seqlen"] = int(sline[2]) + data["up_crc64"] = sline[6] + elif line.startswith(" "): + sline = line.split() + if len(sline) > 6: + raise RuntimeError( + "Unusual UniProtKB sequence data line " + + f"found:\n'{line}'" + ) + data["up_sequence"] += "".join(sline) + elif line.startswith("RP "): + if "ISOFORM" in line.upper(): + raise RuntimeError( + f"First ISOFORM found for '{up_ac}', needs handling." + ) + elif line.startswith("DT "): + # 2012-10-03 + dt_flds = line[len("DT ") :].split(", ") + if dt_flds[1].upper().startswith("SEQUENCE VERSION "): + data["up_last_mod"] = datetime.datetime.strptime( + dt_flds[0], "%d-%b-%Y" + ) + elif dt_flds[1].upper().startswith("ENTRY VERSION "): + data["up_entry_version"] = dt_flds[1][len("ENTRY VERSION ") :] + if data["up_entry_version"][-1] == ".": + data["up_entry_version"] = data["up_entry_version"][:-1] + data["up_entry_version"] = int(data["up_entry_version"]) + elif line.startswith("GN Name="): + data["up_gn"] = line[len("GN Name=") :].split(";")[0] + data["up_gn"] = data["up_gn"].split("{")[0].strip() + + # we have not seen isoforms in the data set, yet, so we just set them to '.' + data["up_isoform"] = None + + # NOTE: no gene names in this set (use provided names instead) + # if "up_gn" not in data: + # _warn_msg( + # f"No gene name found for UniProtKB entry '{up_ac}', using " + # + "UniProtKB AC instead." 
+ # ) + # data["up_gn"] = up_ac + if "up_last_mod" not in data: + raise RuntimeError(f"No sequence version found for UniProtKB entry " \ + f"'{up_ac}'.") + if "up_crc64" not in data: + raise RuntimeError(f"No CRC64 value found for UniProtKB entry " \ + f"'{up_ac}'.") + if len(data["up_sequence"]) == 0: + raise RuntimeError(f"No sequence found for UniProtKB entry '{up_ac}'.") + # check that sequence length and CRC64 is correct + if data["up_seqlen"] != len(data["up_sequence"]): + raise RuntimeError( + "Sequence length of SQ line and sequence data differ for " + + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != " + + f"{len(data['up_sequence'])}" + ) + data["up_ns_aa"] = _check_sequence(data["up_ac"], data["up_sequence"]) + + if "up_id" not in data: + raise RuntimeError(f"No ID found for UniProtKB entry '{up_ac}'.") + if "up_ncbi_taxid" not in data: + raise RuntimeError(f"No NCBI taxonomy ID found for UniProtKB entry " \ + f"'{up_ac}'.") + if len(data["up_organism"]) == 0: + raise RuntimeError(f"No organism species found for UniProtKB entry " \ + f"'{up_ac}'.") + return data + + +def _fetch_upkb_entry(up_ac): + """Get an UniProtKB entry.""" + return _get_n_parse_up_entry( + up_ac, f"https://rest.uniprot.org/uniprotkb/{up_ac}.txt" + ) + + +def _fetch_unisave_entry(up_ac, version): + """Get an UniSave entry, in contrast to an UniProtKB entry, that allows us + to specify a version.""" + return _get_n_parse_up_entry( + up_ac, + f"https://rest.uniprot.org/unisave/{up_ac}?format=txt&" + + f"versions={version}", + ) + + +def _cmp_sequences(mdl, upkb, ns_aa_pos, deletion_mismatches=True): + """Compare sequence while paying attention on non-standard amino acids. + Returns list of mismatches (up_pos, olc_up, olc_mdl) and covered UP range. + UniProt positions and ranges are 1-indexed. + Negative "up_pos" relates to (1-indexed) pos. in model for added residues. + If deletion_mismatches is True, res. in UP seq. but not in mdl within UP + range are counted as mismatches (N/C-terminal ones are never counted). + """ + # We add a U to the sequence when necessary. AF2 does not model it. The PDB + # has selenocysteine as canonical aa, see PDB entry 7Z0T. + for pos in ns_aa_pos: + if mdl[pos] != "-": + _abort_msg( + f"Position {pos+1} of non-canonical amino acid should be " + "a gap!" + ) + mdl = mdl[0:pos] + "U" + mdl[pos + 1 :] + if mdl == upkb: + mismatches = [] + up_range = (1, len(mdl)) + else: + # align and report mismatches + up_seq = seq.CreateSequence("UP", upkb) + ch_seq = seq.CreateSequence("CH", mdl) + aln = seq.alg.SemiGlobalAlign(up_seq, ch_seq, seq.alg.BLOSUM62)[0] + # get range and mismatches + aligned_indices = [i for i, c in enumerate(aln) \ + if c[0] != '-' and c[1] != '-'] + up_range = ( + aln.GetResidueIndex(0, aligned_indices[0]) + 1, + aln.GetResidueIndex(0, aligned_indices[-1]) + 1, + ) + mismatches = [] + for idx, (olc_up, olc_mdl) in enumerate(aln): + if olc_up != olc_mdl: + # mismatches are either extra res. in mdl/UP or mismatch + if olc_up == '-': + up_pos = -(aln.GetResidueIndex(1, idx) + 1) + else: + up_pos = aln.GetResidueIndex(0, idx) + 1 + # ignore if out of UP range + if up_pos < up_range[0] or up_pos > up_range[1]: + continue + # optionally ignore extra res. in UP also otherwise + if not deletion_mismatches and olc_mdl == '-': + continue + mismatches.append((up_pos, olc_up, olc_mdl)) + return mismatches, up_range + + +def _get_upkb_for_sequence(sqe, up_ac, up_version=None): + """Get UniProtKB entry data for given sequence. 
+ If up_version given, we start from historical data in unisave. + Returns best possible hit (i.e. fewest mismatches between sqe and UP seq.) + as dict. with parsed UP data (see _get_n_parse_up_entry) with range covered + and mismatches (see _cmp_sequences) added as "up_range" and "mismatches". + """ + if up_version is None: + up_data = _fetch_upkb_entry(up_ac) + else: + up_data = _fetch_unisave_entry(up_ac, up_version) + min_up_data = None + while True: + mismatches, up_range = _cmp_sequences(sqe, up_data["up_sequence"], + up_data["up_ns_aa"]) + if min_up_data is None or \ + len(mismatches) < len(min_up_data["mismatches"]): + min_up_data = up_data + min_up_data["mismatches"] = mismatches + min_up_data["up_range"] = up_range + if len(mismatches) == 0: + # found hit; done + break + # fetch next one (skip if exceptions happen) + next_v = up_data["up_entry_version"] - 1 + while next_v > 0: + try: + # note: can fail to parse very old UP versions... + up_data = _fetch_unisave_entry(up_ac, next_v) + # can move on if no exception happened + break + except RuntimeError as ex: + #_warn_msg(f"Error in parsing v{next_v} of {up_ac}:\n{ex}") + # try next one + next_v -= 1 + if next_v == 0: + # warn user about failure to find match and abort + min_mismatches = min_up_data["mismatches"] + msg = f"Sequences not equal from file: {sqe}, from UniProtKB: " \ + f"{min_up_data['up_sequence']} ({up_ac}), checked entire " \ + f"entry history and best match had following mismatches " \ + f"in v{min_up_data['up_entry_version']} (range " \ + f"{min_up_data['up_range']}): {min_up_data['mismatches']}." + _warn_msg(msg) + # raise RuntimeError(msg) + break + return min_up_data + + +def _get_entities(pdb_file, up_ac, prot_name): + """Gather data for the mmCIF (target) entities.""" + _check_file(pdb_file) + ost_ent = io.LoadPDB(pdb_file) + if ost_ent.chain_count != 1: + raise RuntimeError( + f"Unexpected oligomer in {pdb_file}" + ) + chn = ost_ent.chains[0] + sqe_no_gaps = "".join([res.one_letter_code for res in chn.residues]) + sqe_gaps = _get_sequence(chn) + if sqe_no_gaps != sqe_gaps: + raise RuntimeError(f"Sequence in {pdb_file} has gaps for chain " \ + f"{chn.name}") + + # map to entities + # special case: A0A1B3JKP4 is obsolete (there may be better ways to catch that) + if up_ac == "A0A1B3JKP4": + up_version = 10 + else: + up_version = None + upkb = _get_upkb_for_sequence(sqe_no_gaps, up_ac, up_version) + description = f"Candidatus Phytoplasma ({prot_name})" + if len(upkb["mismatches"]) != 0: + description += " (sequence mismatches due to sequencing from experimental assay)" + cif_ent = { + "seqres": sqe_no_gaps, + "pdb_sequence": sqe_no_gaps, + "pdb_chain_id": [_get_ch_name(chn, False)], + "prot_name": prot_name, + "description": description + } + cif_ent.update(upkb) + + return [cif_ent], ost_ent + + +def _get_modelcif_entities(target_ents, asym_units, system): + """Create ModelCIF entities and asymmetric units.""" + for cif_ent in target_ents: + mdlcif_ent = modelcif.Entity( + # NOTE: sequence here defines residues in model! 
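+            # (it is the gap-free sequence extracted from the PDB chain; the
+            #  UniProt cross-reference added below records the covered range
+            #  and the sequence version used for the comparison)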
+ cif_ent["seqres"], + description=cif_ent["description"], + source=ihm.source.Natural( + ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"], + scientific_name=cif_ent["up_organism"], + ), + references=[ + modelcif.reference.UniProt( + cif_ent["up_id"], + cif_ent["up_ac"], + align_begin=cif_ent["up_range"][0], + align_end=cif_ent["up_range"][1], + isoform=cif_ent["up_isoform"], + ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"], + organism_scientific=cif_ent["up_organism"], + sequence_version_date=cif_ent["up_last_mod"], + sequence_crc64=cif_ent["up_crc64"], + ) + ], + ) + # NOTE: this assigns (potentially new) alphabetic chain names + for pdb_chain_id in cif_ent["pdb_chain_id"]: + asym_units[pdb_chain_id] = modelcif.AsymUnit( + mdlcif_ent, strand_id=pdb_chain_id, + ) + system.target_entities.append(mdlcif_ent) + + +def _assemble_modelcif_software(soft_dict): + """Create a modelcif.Software instance from dictionary.""" + return modelcif.Software( + soft_dict["name"], + soft_dict["classification"], + soft_dict["description"], + soft_dict["location"], + soft_dict["type"], + soft_dict["version"], + citation=soft_dict["citation"], + ) + + +def _get_sequence_dbs(config_data): + """Get AF seq. DBs.""" + # hard coded UniProt release (see https://www.uniprot.org/release-notes) + # (TO BE UPDATED FOR EVERY DEPOSITION!) + up_version = "2021_04" + up_rel_date = datetime.datetime(2021, 11, 17) + # fill list of DBs + seq_dbs = [] + if config_data["use_small_bfd"]: + seq_dbs.append(modelcif.ReferenceDatabase( + "Reduced BFD", + "https://storage.googleapis.com/alphafold-databases/" + + "reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz" + )) + else: + seq_dbs.append(modelcif.ReferenceDatabase( + "BFD", + "https://storage.googleapis.com/alphafold-databases/" + + "casp14_versions/" + + "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz", + version="6a634dc6eb105c2e9b4cba7bbae93412", + )) + if config_data["af_version"] < "2.3.0": + seq_dbs.append(modelcif.ReferenceDatabase( + "MGnify", + "https://storage.googleapis.com/alphafold-databases/" + + "casp14_versions/mgy_clusters_2018_12.fa.gz", + version="2018_12", + release_date=datetime.datetime(2018, 12, 6), + )) + seq_dbs.append(modelcif.ReferenceDatabase( + "Uniclust30", + "https://storage.googleapis.com/alphafold-databases/" + + "casp14_versions/uniclust30_2018_08_hhsuite.tar.gz", + version="2018_08", + release_date=None, + )) + else: + # NOTE: release date according to https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/ + seq_dbs.append(modelcif.ReferenceDatabase( + "MGnify", + "https://storage.googleapis.com/alphafold-databases/" + + "v2.3/mgy_clusters_2022_05.fa.gz", + version="2022_05", + release_date=datetime.datetime(2022, 5, 6), + )) + seq_dbs.append(modelcif.ReferenceDatabase( + "UniRef30", + "https://storage.googleapis.com/alphafold-databases/" + + "v2.3/UniRef30_2021_03.tar.gz", + version="2021_03", + release_date=None, + )) + if config_data["use_multimer"]: + seq_dbs.append(modelcif.ReferenceDatabase( + "TrEMBL", + "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/" + + "knowledgebase/complete/uniprot_trembl.fasta.gz", + version=up_version, + release_date=up_rel_date, + )) + seq_dbs.append(modelcif.ReferenceDatabase( + "Swiss-Prot", + "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/" + + "knowledgebase/complete/uniprot_sprot.fasta.gz", + version=up_version, + release_date=up_rel_date, + )) + seq_dbs.append(modelcif.ReferenceDatabase( + "UniRef90", + 
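+                # version and release date below refer to the hard-coded
+                # UniProt release at the top of _get_sequence_dbs (to be
+                # updated for every deposition)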
"ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/" + + "uniref90.fasta.gz", + version=up_version, + release_date=up_rel_date, + )) + if config_data["use_templates"]: + seq_dbs.append(modelcif.ReferenceDatabase( + "PDB70", + "http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/" + + "hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz", + release_date=datetime.datetime(2020, 4, 1) + )) + return seq_dbs + + +def _get_modelcif_protocol_software(js_step): + """Assemble software entries for a ModelCIF protocol step.""" + if js_step["software"]: + if len(js_step["software"]) == 1: + sftwre = _assemble_modelcif_software(js_step["software"][0]) + else: + sftwre = [] + for sft in js_step["software"]: + sftwre.append(_assemble_modelcif_software(sft)) + sftwre = modelcif.SoftwareGroup(elements=sftwre) + if js_step["software_parameters"]: + params = [] + for key, val in js_step["software_parameters"].items(): + params.append(modelcif.SoftwareParameter(key, val)) + if isinstance(sftwre, modelcif.SoftwareGroup): + sftwre.parameters = params + else: + sftwre = modelcif.SoftwareGroup( + elements=(sftwre,), parameters=params + ) + return sftwre + return None + + +def _get_modelcif_protocol_input(js_step, target_entities, ref_dbs, model): + """Assemble input data for a ModelCIF protocol step.""" + if js_step["input"] == "target_sequences": + input_data = modelcif.data.DataGroup(target_entities) + input_data.extend(ref_dbs) + elif js_step["input"] == "model": + input_data = model + else: + raise RuntimeError(f"Unknown protocol input: '{js_step['input']}'") + return input_data + + +def _get_modelcif_protocol_output(js_step, model): + """Assemble output data for a ModelCIF protocol step.""" + if js_step["output"] == "model": + output_data = model + else: + raise RuntimeError(f"Unknown protocol output: '{js_step['output']}'") + return output_data + + +def _get_modelcif_protocol(protocol_steps, target_entities, model, ref_dbs): + """Create the protocol for the ModelCIF file.""" + protocol = modelcif.protocol.Protocol() + for js_step in protocol_steps: + sftwre = _get_modelcif_protocol_software(js_step) + input_data = _get_modelcif_protocol_input( + js_step, target_entities, ref_dbs, model + ) + output_data = _get_modelcif_protocol_output(js_step, model) + + protocol.steps.append( + modelcif.protocol.Step( + input_data=input_data, + output_data=output_data, + name=js_step["name"], + details=js_step["details"], + software=sftwre, + ) + ) + protocol.steps[-1].method_type = js_step["method_type"] + return protocol + + +def _compress_cif_file(cif_file): + """Compress cif file and delete original.""" + with open(cif_file, 'rb') as f_in: + with gzip.open(cif_file + '.gz', 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + os.remove(cif_file) + + +def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress): + """Mix all the data into a ModelCIF file.""" + print(" generating ModelCIF objects...", end="") + pstart = timer() + # create system to gather all the data + system = modelcif.System( + title=data_json["title"], + id=data_json["mdl_id"].upper(), + model_details=data_json["model_details"], + ) + + # create an asymmetric unit and an entity per target sequence + asym_units = {} + _get_modelcif_entities( + data_json["target_entities"], asym_units, system + ) + + # audit_authors + system.authors.extend(data_json["audit_authors"]) + + # set up the model to produce coordinates + model = _OST2ModelCIF( + assembly=modelcif.Assembly(asym_units.values()), + asym=asym_units, + 
ost_entity=ost_ent, + name="Top ranked model", + ) + print(f" ({timer()-pstart:.2f}s)") + print(" processing QA scores...", end="", flush=True) + pstart = timer() + model.add_scores() + print(f" ({timer()-pstart:.2f}s)") + + model_group = modelcif.model.ModelGroup( + [model], name=data_json["model_group_name"] + ) + system.model_groups.append(model_group) + + ref_dbs = _get_sequence_dbs(data_json["config_data"]) + protocol = _get_modelcif_protocol( + data_json["protocol"], system.target_entities, model, ref_dbs + ) + system.protocols.append(protocol) + + # write modelcif System to file (NOTE: no PAE here!) + print(" write to disk...", end="", flush=True) + pstart = timer() + out_path = os.path.join(out_dir, f"{mdl_name}.cif") + with open(out_path, "w", encoding="ascii") as mmcif_fh: + modelcif.dumper.write(mmcif_fh, [system]) + if compress: + _compress_cif_file(out_path) + print(f" ({timer()-pstart:.2f}s)") + + +def _translate2modelcif(up_ac, prot_name, opts): + """Convert a model with its accompanying data to ModelCIF.""" + mdl_id = prot_name + # skip if done already (disabled here due to info to be returned) + if opts.compress: + cifext = "cif.gz" + else: + cifext = "cif" + mdl_path = os.path.join(opts.out_dir, f"{mdl_id}.{cifext}") + # if os.path.exists(mdl_path): + # print(f" {mdl_id} already done...") + # return + + # go for it... + print(f" translating {mdl_id}...") + pdb_start = timer() + + # gather data into JSON-like structure + print(" preparing data...", end="") + pstart = timer() + + # now we can fill all data + config_data = _get_config() + mdlcf_json = {} + mdlcf_json["audit_authors"] = _get_audit_authors() + mdlcf_json["protocol"] = _get_protocol_steps_and_software(config_data) + mdlcf_json["config_data"] = config_data + mdlcf_json["mdl_id"] = mdl_id + + # process coordinates + pdb_file = os.path.join(opts.model_dir, f"{prot_name}.pdb") + target_entities, ost_ent = _get_entities(pdb_file, up_ac, prot_name) + mdlcf_json["target_entities"] = target_entities + + # fill annotations + mdlcf_json["title"] = _get_title(prot_name) + mdlcf_json["model_details"] = _get_model_details(prot_name) + mdlcf_json["model_group_name"] = _get_model_group_name() + print(f" ({timer()-pstart:.2f}s)") + + # save ModelCIF + _store_as_modelcif(mdlcf_json, ost_ent, opts.out_dir, mdl_id, opts.compress) + + # check if result can be read and has expected seq. + ent = io.LoadMMCIF(mdl_path) + exp_seqs = [trg_ent["pdb_sequence"] \ + for trg_ent in mdlcf_json["target_entities"]] + assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}" + ent_seq = "".join(res.one_letter_code for res in ent.residues) + assert ent_seq == "".join(exp_seqs), f"Bad seq. {mdl_id}" + + print(f" ... done with {mdl_id} ({timer()-pdb_start:.2f}s).") + + +def _main(): + """Run as script.""" + opts = _parse_args() + + # parse/fetch global data + metadata = _get_metadata(opts.metadata_file) + + # get on with models + print(f"Working on {opts.metadata_file}...") + + # iterate over models + for _, mrow in metadata.iterrows(): + up_ac = mrow.Accession.strip() + prot_name = mrow.Name.strip() + _translate2modelcif(up_ac, prot_name, opts) + + print(f"... done with {opts.metadata_file}.") + + +if __name__ == "__main__": + _main() diff --git a/pyproject.toml b/pyproject.toml index f0328248e8503099ecd248f79dd9d2612c9a8ddf..f9f5fc8e012d1eac82be68c01fee02a046e23318 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,10 @@ reports='no' # A comma-separated list of package or module names from where C extensions may # be loaded. 
Extensions are loading into the active Python interpreter and may
 # run arbitrary code
-extension-pkg-allow-list='rapidjson'
+extension-pkg-allow-list=["rapidjson", "ost"]
+
+# [tool.pylint.typecheck]
+generated-members = ["FindSequence"]
 
 [tool.pylint.FORMAT]
 max-line-length=80