diff --git a/projects/cancer-PPI-domains/README.md b/projects/cancer-PPI-domains/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad90c6d92b647ee39e8acdc0e6e9525e5e681499
--- /dev/null
+++ b/projects/cancer-PPI-domains/README.md
@@ -0,0 +1,23 @@
+# Modelling of interacting domain pairs from the computed cancer interactome
+
+[Link to project in ModelArchive](https://modelarchive.org/doi/10.5452/ma-t3vr3) (incl. background on project itself)
+
+Setup:
+- Domains of interacting proteins extracted from full-length proteins (sequences from UniProtKB)
+- Models generated using sequences of domains, which can have a discontinuous mapping to the full-length sequence
+- Same protocol used as in [model set for core eukaryotic protein complexes](https://www.modelarchive.org/doi/10.5452/ma-bak-cepc)
+  - Paired multiple sequence alignment (MSA) generated for each dimer
+  - Models generated using AlphaFold ("model 3" parameters; pTM monomer version) with a 200 residue gap between the two chains, without templates and without model relaxation
+- Input from the depositors:
+  - one zip file with all the PDB files (no b-factor values, residue numbers matching position in UniProtKB sequence)
+  - one zip file with all the extra files (1 fasta file for alignment, 1 npz file with pLDDT, PAE and contact probabilities)
+  - a CSV file with description and UniProtKB links for each protein
+
+Special features here:
+- Custom MSA generation with intermediate result in accompanying data
+- PAE and contact probabilities only kept for inter-chain residue pairs
+- Author-provided residue numbers kept as auth_seq_num
+- Mapping to most recent UniProtKB sequence generated, checked and stored as fasta files (the ModelCIF file itself only has the covered range with respect to the originally used sequence)
+
+Content:
+- translate2modelcif.py : script performing the conversion; compatible with the Docker setup from [ma-wilkins-import](https://git.scicore.unibas.ch/schwede/ma-wilkins-import/-/tree/6bbd6fa7ec53e1a0971fba40c96fa971d1022f74) (and based on code from there)
diff --git a/projects/cancer-PPI-domains/translate2modelcif.py b/projects/cancer-PPI-domains/translate2modelcif.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5076cd6c684f277c2bb713f0d40a12e19a4253b
--- /dev/null
+++ b/projects/cancer-PPI-domains/translate2modelcif.py
@@ -0,0 +1,1106 @@
+#! 
/usr/local/bin/ost +"""Translate models from Jing Zhang from PDB + extra data into ModelCIF.""" + +# EXAMPLES for running: +""" +GT test setup: +ost scripts/translate2modelcif.py ./pdbs ./accompanying_data \ + ./modelarchive_submission.csv ./modelcif +# DO ALL +ost scripts/translate2modelcif.py ./pdbs ./accompanying_data \ + ./modelarchive_submission.csv ./modelcif --compress > script_out.txt +""" + +import argparse +import datetime +import gzip +import os +import shutil +import sys +import zipfile + +from timeit import default_timer as timer +import numpy as np +import requests +import ujson as json + +import ihm +import ihm.citations +import modelcif +import modelcif.associated +import modelcif.dumper +import modelcif.model +import modelcif.protocol +import modelcif.reference + +import pandas as pd +from ost import io, seq + +# global cache +up_cache = {} + +def _parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=__doc__, + ) + + parser.add_argument( + "model_dir", + type=str, + metavar="<MODEL DIR>", + help="Directory with models to be translated.", + ) + parser.add_argument( + "accompanying_data_dir", + type=str, + metavar="<ACC. DIR>", + help="Directory with zip files containing accompanying data.", + ) + parser.add_argument( + "metadata_file", + type=str, + metavar="<METADATA FILE>", + help="Path to CSV file with metadata.", + ) + parser.add_argument( + "out_dir", + type=str, + metavar="<OUTPUT DIR>", + help="Path to directory to store results.", + ) + parser.add_argument( + "--compress", + default=False, + action="store_true", + help="Compress ModelCIF file with gzip " + "(note that QA file is zipped either way).", + ) + opts = parser.parse_args() + + # check that model dir exists + if opts.model_dir.endswith("/"): + opts.model_dir = opts.model_dir[:-1] + if not os.path.exists(opts.model_dir): + _abort_msg(f"Model directory '{opts.model_dir}' does not exist.") + if not os.path.isdir(opts.model_dir): + _abort_msg(f"Path '{opts.model_dir}' does not point to a directory.") + # check that acc. 
dir exists + if opts.accompanying_data_dir.endswith("/"): + opts.accompanying_data_dir = opts.accompanying_data_dir[:-1] + if not os.path.exists(opts.accompanying_data_dir): + _abort_msg(f"Model directory '{opts.accompanying_data_dir}' does not exist.") + if not os.path.isdir(opts.accompanying_data_dir): + _abort_msg(f"Path '{opts.accompanying_data_dir}' does not point to a directory.") + # check metadata_file + if not os.path.exists(opts.metadata_file): + _abort_msg(f"Metadata file '{opts.metadata_file}' does not exist.") + if not os.path.isfile(opts.metadata_file): + _abort_msg(f"Path '{opts.metadata_file}' does not point to a file.") + # check out_dir + if opts.out_dir.endswith("/"): + opts.out_dir = opts.out_dir[:-1] + if not os.path.exists(opts.out_dir): + os.makedirs(opts.out_dir) + if not os.path.isdir(opts.out_dir): + _abort_msg(f"Path '{opts.out_dir}' does not point to a directory.") + return opts + + +# pylint: disable=too-few-public-methods +class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + + name = "pLDDT" + software = None + + +class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100]""" + + name = "pLDDT" + software = None + + +class _PAE(modelcif.qa_metric.MetricType): + """Predicted aligned error (in Angstroms)""" + + type = "PAE" + other_details = None + + +class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, _PAE): + """Predicted aligned error (in Angstroms)""" + + name = "PAE" + software = None + + +class _ContactProb(modelcif.qa_metric.MetricType): + """Contact probability of a pairwise interaction in [0,1]""" + + type = "contact probability" + other_details = None + + +class _LocalPairwiseCP(modelcif.qa_metric.LocalPairwise, _ContactProb): + """Contact probability of a pairwise interaction in [0,1]""" + + name = "contact probability" + software = None + + +# pylint: enable=too-few-public-methods + + +class _OST2ModelCIF(modelcif.model.AbInitioModel): + """Map OST entity elements to ihm.model""" + + def __init__(self, *args, **kwargs): + """Initialise a model""" + for i in ["ost_entity", "asym", "scores_json"]: + if i not in kwargs: + raise TypeError(f"Required keyword argument '{i}' not found.") + self.ost_entity = kwargs.pop("ost_entity") + self.asym = kwargs.pop("asym") + self.scores_json = kwargs.pop("scores_json") + # need reverse mapping from chain name + res num to seq. id + self.seq_id_map = { + ch.name: { + res.number.num: idx+1 for idx, res in enumerate(ch.residues) + } for ch in self.ost_entity.chains + } + # self.seq_id_map = { + # ch_id: {v: k for k, v in asym_unit.auth_seq_id_map.items()} \ + # for ch_id, asym_unit in self.asym.items() + # } + # fetch plddts for all residues (for mapping) + self.res_bfactors = { + r.qualified_name: self.scores_json["plddt"][i] \ + for i, r in enumerate(self.ost_entity.residues) + } + + super().__init__(*args, **kwargs) + + def get_atoms(self): + # note we blindly replace b-factors here! 
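+        # (the input PDB files carry no B-factors; the per-residue pLDDT
+        # values from the scores file are kept in self.res_bfactors and
+        # written into the B-factor/biso field of every atom instead)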
+ for atm in self.ost_entity.atoms: + yield modelcif.model.Atom( + asym_unit=self.asym[atm.chain.name], + seq_id=self.seq_id_map[atm.chain.name][atm.residue.number.num], + atom_id=atm.name, + type_symbol=atm.element, + x=atm.pos[0], + y=atm.pos[1], + z=atm.pos[2], + het=atm.is_hetatom, + biso=self.res_bfactors[atm.residue.qualified_name], + occupancy=atm.occupancy, + ) + + def add_scores(self): + """Add QA metrics from AF2 scores.""" + # global scores + self.qa_metrics.append( + _GlobalPLDDT(np.mean(self.scores_json["plddt"])) + ) + + # local scores + lpae = [] + lcp = [] + i = 0 + for chn_idx, chn_i in enumerate(self.ost_entity.chains): + for res_i in chn_i.residues: + # local pLDDT + seq_id_i = self.seq_id_map[chn_i.name][res_i.number.num] + asym_i = self.asym[chn_i.name].residue(seq_id_i) + self.qa_metrics.append( + _LocalPLDDT(asym_i, self.scores_json["plddt"][i]) + ) + + # per-residue scores + # NOTE: only off diagonal stored (i.e. chain A with chain B) + j = 0 + for chn_j in self.ost_entity.chains[chn_idx+1:]: + for res_j in chn_j.residues: + seq_id_j = self.seq_id_map[chn_j.name][res_j.number.num] + asym_j = self.asym[chn_j.name].residue(seq_id_j) + pae_ij = self.scores_json["pae"][i][j] + lpae.append( + _LocalPairwisePAE(asym_i, asym_j, pae_ij) + ) + cp_ij = self.scores_json["contact_prob"][i][j] + lcp.append( + _LocalPairwiseCP(asym_i, asym_j, round(cp_ij, 3)) + ) + j += 1 + + i += 1 + + self.qa_metrics.extend(lpae) + self.qa_metrics.extend(lcp) + + +def _abort_msg(msg, exit_code=1): + """Write error message and exit with exit_code.""" + print(f"{msg}\nAborting.", file=sys.stderr) + sys.exit(exit_code) + + +def _warn_msg(msg): + """Write a warning message to stdout.""" + print(f"WARNING: {msg}") + + +def _check_file(file_path): + """Make sure a file exists and is actually a file.""" + if not os.path.exists(file_path): + _abort_msg(f"File not found: '{file_path}'.") + if not os.path.isfile(file_path): + _abort_msg(f"File path does not point to file: '{file_path}'.") + + +def _get_audit_authors(): + """Return the list of authors that produced this model.""" + return ( + "Zhang, J.", + "Pei, J.", + "Durham, J.", + "Bos, T.", + "Cong, Q.", + ) + + +def _get_metadata(metadata_file): + """Read csv file with metedata and prepare for next steps.""" + # NEEDS LOTS OF CLEANUP!! + # -> ASSUMED "A" always chain A + df_in = pd.read_csv(metadata_file) + cols = None + mdl_part_txt = " This model is part of" + mdl_af_txt = "Model generated using AlphaFold2" + for idx, row in df_in.iterrows(): + if row[0].startswith("#"): + continue + elif cols is None: + cols = list(row[~row.isna()]) + df = pd.DataFrame(columns=cols) + else: + row_data = [row[0]] + # fix overview entry (ASSUME "A" is always chain 1) + # -> any ',' in oevreview was split into columns + # -> also new-lines missing between lines + # -> and need to remove "This model is part of ..." 
text + ch1_idx = row.to_list().index("A") + overview = ",".join(row[1:ch1_idx]) + mdl_af_idx = overview.index(mdl_af_txt) + mdl_rel_idx = overview.index(mdl_part_txt) + assert mdl_af_idx < mdl_rel_idx + ov_fixed = overview[:mdl_af_idx] + "\n\n" \ + + overview[mdl_af_idx:mdl_rel_idx] + row_data.append(ov_fixed) + data_end_idx = ch1_idx + len(cols) - 2 + assert data_end_idx <= len(row) + assert not any(row[ch1_idx:data_end_idx].isna()) + row_data.extend(row[ch1_idx:data_end_idx]) + df.loc[idx] = row_data + # sanity checks + assert len(set(df["model id"])) == len(df["model id"]) + assert all(df.alignment_file.str.endswith(".fas")) + return df.set_index("model id") + + +def _get_modeling_description(): + """Fixed text for model creation.""" + return "Model using AlphaFold ('model 3' parameters; pTM monomer version)" \ + " with a 200 residue gap between the two chains, without templates" \ + " and without model relaxation" + +def _get_protocol_steps_and_software(): + """Create the list of protocol steps with software and parameters used.""" + protocol = [] + # MSA step + step = { + "method_type": "coevolution MSA", + "name": None, + "details": "Create paired MSAs for the dimers", + } + step["input"] = "target_sequences" + step["output"] = "MSA" + step["software"] = [] + step["software_parameters"] = {} + protocol.append(step) + + # modelling step + step = { + "method_type": "modeling", + "name": None, + "details": _get_modeling_description(), + } + # get input data + # Must refer to data already in the JSON, so we try keywords + step["input"] = "target_sequences_and_MSA" + # get output data + # Must refer to existing data, so we try keywords + step["output"] = "model" + # get software + step["software"] = [ + { + "name": "AlphaFold", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.citations.alphafold2, + "location": "https://github.com/deepmind/alphafold", + "type": "package", + "version": "2.0.0", + }] + step["software_parameters"] = {} + protocol.append(step) + + return protocol + + +def _get_title(gene_name_infos): + """Get a title for this modelling experiment.""" + ranged_gene_names = [f"{gn} ({start_pos}-{end_pos})" + for gn, start_pos, end_pos in gene_name_infos] + return f"Predicted interaction between {' and '.join(ranged_gene_names)}" + + +def _get_model_group_name(): + """Get a name for a model group.""" + return None + + +def _get_sequence(chn): + """Get the sequence out of an OST chain incl. '-' for gaps in resnums.""" + # initialise + lst_rn = chn.residues[0].number.num + idx = 1 + sqe = chn.residues[0].one_letter_code + + for res in chn.residues[idx:]: + lst_rn += 1 + while lst_rn != res.number.num: + sqe += "-" + lst_rn += 1 + sqe += res.one_letter_code + return sqe + + +def _check_sequence(up_ac, sequence): + """Verify sequence to only contain standard olc.""" + ns_aa_pos = [] # positions of non-standard amino acids + for i, res in enumerate(sequence): + if res not in "ACDEFGHIKLMNPQRSTVWY": + if res == "U": + _warn_msg( + f"Selenocysteine found at position {i+1} of entry " + + f"'{up_ac}', this residue may be missing in the " + + "model." 
+ ) + ns_aa_pos.append(i) + continue + raise RuntimeError( + "Non-standard aa found in UniProtKB sequence " + + f"for entry '{up_ac}': {res}, position {i+1}" + ) + return ns_aa_pos + + +def _get_n_parse_up_entry(up_ac, up_url): + """Get data for an UniProtKB entry and parse it.""" + # This is a simple parser for UniProtKB txt format, instead of breaking it + # up into multiple functions, we just allow many many branches & statements, + # here. + # pylint: disable=too-many-branches,too-many-statements + data = {} + data["up_organism"] = "" + data["up_sequence"] = "" + data["up_ac"] = up_ac + rspns = requests.get(up_url, timeout=180) + for line in rspns.iter_lines(decode_unicode=True): + if line.startswith("ID "): + sline = line.split() + if len(sline) != 5: + _abort_msg(f"Unusual UniProtKB ID line found:\n'{line}'") + data["up_id"] = sline[1] + elif line.startswith("OX NCBI_TaxID="): + # Following strictly the UniProtKB format: 'OX NCBI_TaxID=<ID>;' + data["up_ncbi_taxid"] = line[len("OX NCBI_TaxID=") : -1] + data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip() + elif line.startswith("OS "): + if line[-1] == ".": + data["up_organism"] += line[len("OS ") : -1] + else: + data["up_organism"] += line[len("OS ") : -1] + " " + elif line.startswith("SQ "): + sline = line.split() + if len(sline) != 8: + _abort_msg(f"Unusual UniProtKB SQ line found:\n'{line}'") + data["up_seqlen"] = int(sline[2]) + data["up_crc64"] = sline[6] + elif line.startswith(" "): + sline = line.split() + if len(sline) > 6: + _abort_msg( + "Unusual UniProtKB sequence data line " + + f"found:\n'{line}'" + ) + data["up_sequence"] += "".join(sline) + elif line.startswith("RP "): + if "ISOFORM" in line.upper(): + RuntimeError( + f"First ISOFORM found for '{up_ac}', needs " + "handling." + ) + elif line.startswith("DT "): + # 2012-10-03 + dt_flds = line[len("DT ") :].split(", ") + if dt_flds[1].upper().startswith("SEQUENCE VERSION "): + data["up_last_mod"] = datetime.datetime.strptime( + dt_flds[0], "%d-%b-%Y" + ) + elif dt_flds[1].upper().startswith("ENTRY VERSION "): + data["up_entry_version"] = dt_flds[1][len("ENTRY VERSION ") :] + if data["up_entry_version"][-1] == ".": + data["up_entry_version"] = data["up_entry_version"][:-1] + data["up_entry_version"] = int(data["up_entry_version"]) + elif line.startswith("GN Name="): + data["up_gn"] = line[len("GN Name=") :].split(";")[0] + data["up_gn"] = data["up_gn"].split("{")[0].strip() + + # we have not seen isoforms in the data set, yet, so we just set them to '.' + data["up_isoform"] = None + + if "up_gn" not in data: + _warn_msg( + f"No gene name found for UniProtKB entry '{up_ac}', using " + + "UniProtKB AC instead." 
+ ) + data["up_gn"] = up_ac + if "up_last_mod" not in data: + _abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.") + if "up_crc64" not in data: + _abort_msg(f"No CRC64 value found for UniProtKB entry '{up_ac}'.") + if len(data["up_sequence"]) == 0: + _abort_msg(f"No sequence found for UniProtKB entry '{up_ac}'.") + # check that sequence length and CRC64 is correct + if data["up_seqlen"] != len(data["up_sequence"]): + _abort_msg( + "Sequence length of SQ line and sequence data differ for " + + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != " + + f"{len(data['up_sequence'])}" + ) + data["up_ns_aa"] = _check_sequence(data["up_ac"], data["up_sequence"]) + + if "up_id" not in data: + _abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.") + if "up_ncbi_taxid" not in data: + _abort_msg(f"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'.") + if len(data["up_organism"]) == 0: + _abort_msg(f"No organism species found for UniProtKB entry '{up_ac}'.") + return data + + +def _fetch_upkb_entry(up_ac): + """Get an UniProtKB entry.""" + return _get_n_parse_up_entry( + up_ac, f"https://rest.uniprot.org/uniprotkb/{up_ac}.txt" + ) + + +def _fetch_unisave_entry(up_ac, version): + """Get an UniSave entry, in contrast to an UniProtKB entry, that allows us + to specify a version.""" + return _get_n_parse_up_entry( + up_ac, + f"https://rest.uniprot.org/unisave/{up_ac}?format=txt&" + + f"versions={version}", + ) + + +def _build_and_check_up_aln(up_data, up_start, up_end, ch_seq): + # return: aln (UP (name: "up_ac|0-based-offset"), CH) if 100% match, + # None else + # up_start and up_end expected to be ready for 0-indexed slicing + # ch_seq expected to contain gaps to cover same length as up_end - up_start + if up_end > len(up_data["up_sequence"]): + # skip UP seq. versions with meaningless lengths + return None + s_up = seq.CreateSequence(f"{up_data['up_ac']}|{up_start}", + up_data["up_sequence"][up_start:up_end]) + s_ch = seq.CreateSequence("ch_seq", ch_seq) + aln = seq.CreateAlignment(s_up, s_ch) + seq_id = seq.alg.SequenceIdentity(aln) # ignores gaps + if seq_id < 100: + return None + else: + return aln + +def _build_and_cut_up_aln(up_data, ch_seq): + # return: aln (UP (name: "up_ac|0-based-offset"), CH) + # cut to remove terminal gaps in CH + s_ch = seq.CreateSequence("ch_seq", ch_seq) + s_up = seq.CreateSequence("up_seq", up_data["up_sequence"]) + aln = seq.alg.SemiGlobalAlign(s_up, s_ch, seq.alg.BLOSUM62)[0] + # cut to part covered by model and rename + offset = aln.sequences[1].first_non_gap + aln.Cut(0, aln.sequences[1].first_non_gap) + aln.Cut(aln.sequences[1].last_non_gap + 1, len(aln)) + aln.SetSequenceName(0, f"{up_data['up_ac']}|{offset}") + return aln + +def _get_upkb_for_sequence(label, up_ac, up_start, up_end, ch_seq, up_id, + up_cache={}): + # check length + if len(ch_seq) != up_end - up_start: + raise RuntimeError(f"BAD RES. 
COUNT FOR {label}") + # return matching up_data and aln to latest UP + up_data = up_cache.get(up_ac, None) + if up_data is None: + up_data = _fetch_upkb_entry(up_ac) + aln = _build_and_check_up_aln(up_data, up_start, up_end, ch_seq) + if aln is None: + # keep original version in cache and check other versions until hit + up_cache[up_ac + "-latest"] = up_data + while aln is None: + if up_data["up_entry_version"] > 1: + up_data = _fetch_unisave_entry( + up_ac, up_data["up_entry_version"] - 1 + ) + aln = _build_and_check_up_aln(up_data, up_start, up_end, ch_seq) + else: + raise RuntimeError( + f"Sequences not equal from file: {ch_seq}, from UniProtKB: " + f"{up_data['up_sequence']} ({label}), checked entire entry " + "history." + ) + # check UP ID + if up_id != up_data["up_id"]: + raise RuntimeError(f"UP ID MISMATCH for {label}; {up_id} vs {up_data['up_id']}") + # keep cached for matched data + up_cache[up_ac] = up_data + # align to latest sequence if needed + if up_ac + "-latest" in up_cache: + aln = _build_and_cut_up_aln(up_cache[up_ac + "-latest"], ch_seq) + seq_id = seq.alg.SequenceIdentity(aln) # ignores gaps + if len(aln) != up_end - up_start or seq_id < 95: + raise RuntimeError( + f"Failed to align {label} to latest {up_ac}!" + f" Lengths {len(aln)} vs {up_end - up_start};" + f" seq. id {seq_id}." + ) + return up_data, aln + + +def _get_entities(pdb_file, up_infos): + """Gather data for the mmCIF (target) entities.""" + # up_infos as dict {ch_name: (up_ac, up_start_idx, up_end_idx, up_id)} + # -> [up_start_idx:up_end_idx] meant to be for slicing up_sequence + entities = [] + _check_file(pdb_file) + ost_ent = io.LoadPDB(pdb_file) + # check assumption of dimer + if ost_ent.chain_count != len(up_infos): + raise RuntimeError(f"UNEXPECTED CHAIN COUNT IN {pdb_file}") + # check assumption that none of the PDB files have bfactors + unique_b_factors = sorted(set(a.b_factor for a in ost_ent.atoms)) + if unique_b_factors != [0.0]: + raise RuntimeError(f"B-FACTORS SPOTTED IN {pdb_file}: " + f"{unique_b_factors}") + already_seen = [] + for chn in ost_ent.chains: + cif_ent = {} + ch_label = f"{pdb_file}|CH-{chn.name}" + up_ac, up_start_idx, up_end_idx, up_id = up_infos[chn.name] + up_key = (up_ac, up_start_idx, up_end_idx) + sqe_no_gaps = "".join([res.one_letter_code for res in chn.residues]) + sqe_gaps = _get_sequence(chn) + try: + e_idx = already_seen.index(up_key) + except ValueError: + pass + else: + if sqe_no_gaps != entities[e_idx]["pdb_sequence"]: + _abort_msg( + "Sequences are different for two chains of the same " + "UniProtKB AC. This case is not implemented, yet." + ) + entities[e_idx]["pdb_chain_id"].append(chn.name) + continue + already_seen.append(up_key) + # note: upkb is for matching seq. version, aln is for latest seq. 
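+        # (if the modelled sequence no longer matches the current UniProtKB
+        # entry, older sequence versions are fetched from UniSave until an
+        # exact match is found; that hit is then re-aligned to the latest
+        # sequence for the accompanying fasta files)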
+ upkb, up_aln = _get_upkb_for_sequence( + ch_label, up_ac, up_start_idx, up_end_idx, sqe_gaps, + up_id, up_cache + ) + cif_ent["pdb_sequence"] = sqe_no_gaps + cif_ent["pdb_seqlen"] = len(sqe_no_gaps) + cif_ent["pdb_chain_id"] = [chn.name] + cif_ent["description"] = ( + f"{upkb['up_organism']} {upkb['up_gn']} ({upkb['up_ac']}; " + f"{up_start_idx + 1} - {up_end_idx})" + ) + cif_ent["up_aln"] = up_aln + # 1-indexed positions for closed interval + cif_ent["up_start_pos"] = up_start_idx + 1 + cif_ent["up_end_pos"] = up_end_idx + # alignment didn't check starting position in UP + if chn.residues[0].number.num != cif_ent["up_start_pos"]: + print(f"UNEXPECTED STARTING RESNUM IN {ch_label}") + # mapping between seq_id and auth_seq_id + # -> assume same for all chains of this entity + # (trivial here as hetero-dimer) + cif_ent["auth_seq_id_map"] = { + idx+1: res.number.num for idx, res in enumerate(chn.residues) + } + cif_ent.update(upkb) + entities.append(cif_ent) + return entities, ost_ent + + +def _get_scores(data, acc_path, cp_name, aln_name): + """Check that all files needed to process this model are present.""" + _check_file(acc_path) + with zipfile.ZipFile(acc_path) as zf: + if len(zf.namelist()) > 2: + print(f"TOO MANY FILES IN ACC {acc_path}") + if cp_name not in zf.namelist(): + raise RuntimeError(f"MISSING NPZ FILE {acc_path}") + if aln_name not in zf.namelist(): + raise RuntimeError(f"MISSING ALN FILE {acc_path}") + aln_data = zf.open(aln_name).read() + # NOTE: this also checks that all sequences in FASTA have same length + aln = io.AlignmentFromString(aln_data, "fasta") + cp_data = np.load(zf.open(cp_name)) + # check aln data + # -> should match combined length of gapless chain sequences + combined_seq = "".join(trg_ent["pdb_sequence"] \ + for trg_ent in data["target_entities"]) + if str(aln.sequences[0]) != combined_seq: + raise RuntimeError(f"BAD ALN SEQ {acc_path}") + # if aln.sequence_count < 10: + # print("SHALLOW ALN", acc_path, aln.sequence_count) + # # not an issue really, just curious + # check cp data + # -> expected pLDDT vector and PAE/CP matrices + # -> expected to either have one off diagonal or full matrix + seq_lens = [trg_ent["pdb_seqlen"] for trg_ent in data["target_entities"]] + assert len(seq_lens) == 2 # only dimers supported here + sum_lens = sum(seq_lens) + exp_shape_offdiag = (seq_lens[0], seq_lens[1]) + exp_shape_full = (sum_lens, sum_lens) + if len(cp_data) != 3: + print(f"UNEXPECTED NUMBER OF ITEMS IN NPZ FILE {acc_path}") + if "contact_prob" not in cp_data: + raise RuntimeError(f"MISSING CP IN NPZ FILE {acc_path}") + cp_matrix = cp_data["contact_prob"] + if "predicted_aligned_error" not in cp_data: + raise RuntimeError(f"MISSING PAE IN NPZ FILE {acc_path}") + pae_matrix = cp_data["predicted_aligned_error"] + if cp_matrix.shape == exp_shape_full: + # cut matrices + cp_matrix = cp_matrix[:seq_lens[0], seq_lens[0]:] + pae_matrix = pae_matrix[:seq_lens[0], seq_lens[0]:] + elif cp_matrix.shape != exp_shape_offdiag: + raise RuntimeError(f"BAD SHAPE OF CP DATA {acc_path}") + if pae_matrix.shape != cp_matrix.shape: + raise RuntimeError(f"BAD SHAPE OF PAE DATA {acc_path}") + if "plddt" not in cp_data: + raise RuntimeError(f"MISSING PLDDT IN NPZ FILE {acc_path}") + elif cp_data["plddt"].shape != (sum_lens, ): + raise RuntimeError(f"BAD SHAPE OF PLDDT DATA {acc_path}") + # update data + data["plddt"] = cp_data["plddt"] + data["pae"] = pae_matrix + data["contact_prob"] = cp_matrix + data["aln_data"] = aln_data + + +def _get_modelcif_entities(target_ents, asym_units, 
system): + """Create ModelCIF entities and asymmetric units.""" + for cif_ent in target_ents: + mdlcif_ent = modelcif.Entity( + # NOTE: here pdb_sequence is what has actually been modelled + # -> compared to UP this contains gaps! + cif_ent["pdb_sequence"], + description=cif_ent["description"], + source=ihm.source.Natural( + ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"], + scientific_name=cif_ent["up_organism"], + ), + references=[ + modelcif.reference.UniProt( + cif_ent["up_id"], + cif_ent["up_ac"], + align_begin=cif_ent["up_start_pos"], + align_end=cif_ent["up_end_pos"], + isoform=cif_ent["up_isoform"], + ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"], + organism_scientific=cif_ent["up_organism"], + sequence_version_date=cif_ent["up_last_mod"], + sequence_crc64=cif_ent["up_crc64"], + ) + ], + ) + for pdb_chain_id in cif_ent["pdb_chain_id"]: + asym_units[pdb_chain_id] = modelcif.AsymUnit( + mdlcif_ent, auth_seq_id_map=cif_ent["auth_seq_id_map"] + ) + system.target_entities.append(mdlcif_ent) + + +def _get_assoc_aln_file(fle_path): + """Generate a modelcif.associated.File object pointing to FASTA formatted + file containing MSA. + """ + cfile = modelcif.associated.File( + fle_path, + details="Paired MSAs" + ) + cfile.file_format = "fasta" + cfile.file_content = "multiple sequence alignments" + return cfile + + +def _get_associated_files(entry_id, mdl_name, add_files): + """Create entry for associated files.""" + # extract local pairwise into extra file + ac_file = f"{mdl_name}_local_pairwise_qa.cif" + arc_files = [ + modelcif.associated.LocalPairwiseQAScoresFile( + ac_file, + categories=["_ma_qa_metric_local_pairwise"], + copy_categories=["_ma_qa_metric"], + entry_id=entry_id, + entry_details="This file is an associated file consisting " + + "of local pairwise QA metrics. 
This is a partial mmCIF " + + "file and can be validated by merging with the main " + + "mmCIF file containing the model coordinates and other " + + "associated data.", + details="Predicted aligned error and contact probability", + ) + ] + # add any extra files if needed + if add_files: + arc_files.extend(add_files) + # package all into zip file + return modelcif.associated.Repository( + "", + [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)], + ) + # NOTE: by convention MA expects zip file with same name as model-cif + + +def _assemble_modelcif_software(soft_dict): + """Create a modelcif.Software instance from dictionary.""" + return modelcif.Software( + soft_dict["name"], + soft_dict["classification"], + soft_dict["description"], + soft_dict["location"], + soft_dict["type"], + soft_dict["version"], + citation=soft_dict["citation"], + ) + + +def _get_modelcif_protocol_software(js_step): + """Assemble software entries for a ModelCIF protocol step.""" + if js_step["software"]: + if len(js_step["software"]) == 1: + sftwre = _assemble_modelcif_software(js_step["software"][0]) + else: + sftwre = [] + for sft in js_step["software"]: + sftwre.append(_assemble_modelcif_software(sft)) + sftwre = modelcif.SoftwareGroup(elements=sftwre) + if js_step["software_parameters"]: + params = [] + for key, val in js_step["software_parameters"].items(): + params.append(modelcif.SoftwareParameter(key, val)) + if isinstance(sftwre, modelcif.SoftwareGroup): + sftwre.parameters = params + else: + sftwre = modelcif.SoftwareGroup( + elements=(sftwre,), parameters=params + ) + return sftwre + return None + + +def _get_modelcif_protocol_data(data_label, target_entities, aln_data, model): + """Assemble data for a ModelCIF protocol step.""" + if data_label == "target_sequences": + data = modelcif.data.DataGroup(target_entities) + elif data_label == "MSA": + data = aln_data + elif data_label == "target_sequences_and_MSA": + data = modelcif.data.DataGroup(target_entities) + data.append(aln_data) + elif data_label == "model": + data = model + else: + raise RuntimeError(f"Unknown protocol data: '{data_label}'") + return data + + +def _get_modelcif_protocol(protocol_steps, target_entities, aln_data, model): + """Create the protocol for the ModelCIF file.""" + protocol = modelcif.protocol.Protocol() + for js_step in protocol_steps: + sftwre = _get_modelcif_protocol_software(js_step) + input_data = _get_modelcif_protocol_data( + js_step["input"], target_entities, aln_data, model + ) + output_data = _get_modelcif_protocol_data( + js_step["output"], target_entities, aln_data, model + ) + + protocol.steps.append( + modelcif.protocol.Step( + input_data=input_data, + output_data=output_data, + name=js_step["name"], + details=js_step["details"], + software=sftwre, + ) + ) + protocol.steps[-1].method_type = js_step["method_type"] + return protocol + + +def _compress_cif_file(cif_file): + """Compress cif file and delete original.""" + with open(cif_file, "rb") as f_in: + with gzip.open(cif_file + ".gz", "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + os.remove(cif_file) + + +def _package_associated_files(repo): + """Compress associated files into single zip file and delete original.""" + # zip settings tested for good speed vs compression + for archive in repo.files: + with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip: + for zfile in archive.files: + cif_zip.write(zfile.path, arcname=zfile.path) + os.remove(zfile.path) + + +def _store_as_modelcif( + data_json, ost_ent, out_dir, mdl_name, 
compress, add_files +): + """Mix all the data into a ModelCIF file.""" + print(" generating ModelCIF objects...", end="") + pstart = timer() + # create system to gather all the data + system = modelcif.System( + title=data_json["title"], + id=data_json["mdl_id"].upper(), + model_details=data_json["model_details"], + ) + + # add primary citation (HC here but could also fetch via from_pubmed_id) + system.citations.append(ihm.Citation( + pmid="36261849", + title="Computed cancer interactome explains the effects of somatic " + "mutations in cancers.", + journal="Protein Sci", + volume=31, + page_range="e4479", + year=2022, + authors=['Zhang, J.', 'Pei, J.', 'Durham, J.', 'Bos, T.', 'Cong, Q.'], + doi="10.1002/pro.4479", + is_primary=True, + )) + + # create target entities, references, source, asymmetric units & assembly + # create an asymmetric unit and an entity per target sequence + asym_units = {} + _get_modelcif_entities( + data_json["target_entities"], + asym_units, + system, + ) + + # audit_authors + system.authors.extend(data_json["audit_authors"]) + + # set up the model to produce coordinates + mdl_list_name = "Model 3" # fixed set of params used here + model = _OST2ModelCIF( + assembly=modelcif.Assembly(asym_units.values()), + asym=asym_units, + ost_entity=ost_ent, + scores_json=data_json, + name=mdl_list_name, + ) + print(f" ({timer()-pstart:.2f}s)") + print(" processing QA scores...", end="", flush=True) + pstart = timer() + model.add_scores() + print(f" ({timer()-pstart:.2f}s)") + + system.model_groups.append( + modelcif.model.ModelGroup([model], name=data_json["model_group_name"]) + ) + + # handle additional files + system.repositories.append( + _get_associated_files(system.id, mdl_name, add_files) + ) + + # prepare aln_data info + aln_files = [f for f in add_files \ + if f.file_content == "multiple sequence alignments"] + assert len(aln_files) == 1 # no support for multiple one atm + aln_file_name = aln_files[0].path + aln_data = modelcif.data.Data( + "Paired MSA for the dimer", + f"MSA stored as {aln_file_name} in accompanying data" + ) + + system.protocols.append( + _get_modelcif_protocol( + data_json["protocol"], + system.target_entities, + aln_data, + model, + ) + ) + + # write modelcif System to file + print(" write to disk...", end="", flush=True) + pstart = timer() + # NOTE: this will dump PAE on path provided in add_scores + # -> hence we cheat by changing path and back while being exception-safe... 
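+    # -> (associated files use paths relative to the working directory, so we
+    #    chdir into out_dir for writing and restore the old cwd in 'finally')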
+ # -> also we need to write aln file to have it in zip file with rest + oldpwd = os.getcwd() + os.chdir(out_dir) + mdl_fle = f"{mdl_name}.cif" + try: + with open(mdl_fle, "w", encoding="ascii") as mmcif_fh: + modelcif.dumper.write(mmcif_fh, [system]) + with open(aln_file_name, "wb") as fh: + fh.write(data_json["aln_data"]) + _package_associated_files(system.repositories[0]) + if compress: + _compress_cif_file(mdl_fle) + mdl_fle += ".gz" + # also output alignments to latest UP + for cif_ent in data_json["target_entities"]: + ch_names = "-".join(chn for chn in cif_ent["pdb_chain_id"]) + up_aln_file_name = f"{mdl_name}-UP-ALN-{ch_names}.fasta" + io.SaveAlignment(cif_ent["up_aln"], up_aln_file_name) + finally: + os.chdir(oldpwd) + print(f" ({timer()-pstart:.2f}s)") + return os.path.join(out_dir, mdl_fle) + + +def _create_interaction_json(): + """Create a dictionary (mimicking JSON) that contains data which is the same + for all models.""" + data = {} + data["audit_authors"] = _get_audit_authors() + data["protocol"] = _get_protocol_steps_and_software() + return data + + +def _create_model_json(data, pdb_file, metadata): + """Create a dictionary (mimicking JSON) that contains all the data.""" + up_infos = { + metadata[f"chain{ch_idx}"]: ( + metadata[f"chain{ch_idx}_UniProtAC"], + int(metadata[f"chain{ch_idx}_startPosi"]) - 1, + int(metadata[f"chain{ch_idx}_endPosi"]), + metadata[f"chain{ch_idx}_UniProtKB"], + ) for ch_idx in [1, 2] + } + data["target_entities"], ost_ent = _get_entities(pdb_file, up_infos) + gns = [(i["up_gn"], i["up_start_pos"], i["up_end_pos"]) \ + for i in data["target_entities"]] + data["title"] = _get_title(gns) + data["model_details"] = metadata.overview + data["model_group_name"] = _get_model_group_name() + return ost_ent + + +def _translate2modelcif(mdl_id, metadata, opts): + """Convert a PDB file with its accompanying data to ModelCIF.""" + pdb_fle = os.path.join(opts.model_dir, mdl_id + ".pdb") + acc_path = os.path.join(opts.accompanying_data_dir, + mdl_id + "_accompanying_data.zip") + + # gather data into JSON-like structure + print(" preparing data...", end="") + pstart = timer() + + mdlcf_json = _create_interaction_json() + mdlcf_json["mdl_id"] = mdl_id + + ost_ent = _create_model_json(mdlcf_json, pdb_fle, metadata) + + # read quality scores from JSON file + _get_scores( + mdlcf_json, acc_path, + metadata.contact_probility_file, + metadata.alignment_file + ) + print(f" ({timer()-pstart:.2f}s)") + add_files = [_get_assoc_aln_file(metadata.alignment_file)] + mdl_path = _store_as_modelcif( + mdlcf_json, + ost_ent, + opts.out_dir, + mdl_id, + opts.compress, + add_files, + ) + # check if result can be read and has expected seq. + ent = io.LoadMMCIF(mdl_path) + exp_seqs = [trg_ent["pdb_sequence"] \ + for trg_ent in mdlcf_json["target_entities"]] + assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}" + ent_seq = "".join(res.one_letter_code for res in ent.residues) + assert ent_seq == "".join(exp_seqs), f"Bad seq. 
{mdl_id}" + + +def _main(): + """Run as script.""" + opts = _parse_args() + + # parse/fetch global data + metadata_full = _get_metadata(opts.metadata_file) + if opts.compress: + cifext = "cif.gz" + else: + cifext = "cif" + print(f"Working on {opts.metadata_file}...") + + # iterate models + # TEST: selected models + # for mdl_id in ["O00329-D1_P27986-D2", "O14602_P47813"]: + for mdl_id in ["O00329-D1_P27986-D2", "O14602_P47813", + "O75376-D12_Q14995-D5", "Q8NFD5-D13_Q92925-D2", + "P20248-D2_Q09472-D14", "O14744-D1_Q9Y618-D10", + "O00257-D2_Q6W2J9-D8"]: + metadata = metadata_full.loc[mdl_id] + # for mdl_id, metadata in metadata.iterrows(): + if os.path.exists(os.path.join(opts.out_dir, f"{mdl_id}.{cifext}")): + print(f" {mdl_id} already done...") + continue + print(f" translating {mdl_id}...") + pdb_start = timer() + _translate2modelcif(mdl_id, metadata, opts) + print(f" ... done with {mdl_id} ({timer()-pdb_start:.2f}s).") + + print(f"... done with {opts.metadata_file}.") + + +if __name__ == "__main__": + _main() + +# LocalWords: aa pdb