diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..3610828a887ca275f0672225a55e13a55742a8de
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+# Don't keep Emacs backup files
+*~
+
+# ignore test scripts
+biop-test.py
+junk.py
+
+# ignore some files used for testing
+1ake.1.pdb
+3lre.3.A.pdb
+6xne.pdb
+cmp.cif
diff --git a/.spelling b/.spelling
new file mode 100644
index 0000000000000000000000000000000000000000..239bed71dca5930640153c1ce988b7436bd0b2ab
--- /dev/null
+++ b/.spelling
@@ -0,0 +1,19 @@
+Biopython
+CIF
+DBs
+FastA
+Jupyter
+MSA
+ModelCIF
+PAE
+PDB
+PPI
+coevolution
+modeled
+modeling
+multimer
+polypeptide
+pre
+repo
+reproducibility
+subdirectory
\ No newline at end of file
diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py
new file mode 100755
index 0000000000000000000000000000000000000000..fbb6a846021a8d03dd53496b46b4c659b30cc67e
--- /dev/null
+++ b/convert_to_modelcif.py
@@ -0,0 +1,1117 @@
+#!/usr/bin/env python3
+
+"""Take the output of the AlphaPulldown pipeline and turn it into a ModelCIF
+file with a lot of metadata in place."""
+
+from typing import Tuple
+import datetime
+import gzip
+import hashlib
+import json
+import os
+import pickle
+import re
+import shutil
+import sys
+import tempfile
+import zipfile
+
+from Bio import SeqIO
+from Bio.PDB import PDBParser, PPBuilder
+from Bio.PDB.Structure import Structure as BioStructure
+from absl import app, flags, logging
+import numpy as np
+
+import ihm.citations
+import modelcif
+import modelcif.associated
+import modelcif.dumper
+import modelcif.model
+import modelcif.protocol
+
+from alphapulldown.utils import make_dir_monomer_dictionary
+
+# ToDo: DISCUSS Get options properly, best get the same names as used in
+#       existing scripts
+# ToDo: Monomers work separately - features may come from different sets of
+#       software, databases... so target sequences may be connected to
+#       different versions of the same sequence database, may use different
+#       versions of software... can this still go into single protocol steps
+#       or would this prevent identifying which MSAs were produced with which
+#       software? E.g. can there still be a single "sequence search" step?
+#       (This is definitely for later, not the first working version of the
+#       converter script.)
+# ToDo: sort non-ModelCIF items in the main JSON object into '__meta__'
+# ToDo: protocol step software parameters
+# ToDo: Example 1 from the GitHub repo mentions MMseqs2
+# ToDo: Discuss input of protocol steps, feature creation has baits and
+#       sequences; does modelling depend on mode?
+# ToDo: Option to add remaining models w/ PAE files to archive
+# ToDo: deal with `--max_template_date`, beta-barrel project has it as a
+#       software parameter
+flags.DEFINE_string(
+    "ap_output", None, "AlphaPulldown pipeline output directory"
+)
+flags.DEFINE_list(
+    "monomer_objects_dir",
+    None,
+    "a list of directories where monomer objects are stored",
+)
+flags.DEFINE_integer(
+    "model_selected",
+    None,
+    "model to be converted into ModelCIF, omit to convert all models found in "
+    + "'--ap_output'",
+)
+flags.DEFINE_bool(
+    "add_associated",
+    False,
+    "Add models not marked by "
+    + "'--model_selected' to the archive for associated files",
+)
+flags.DEFINE_bool("compress", False, "compress the ModelCIF file(s) using Gzip")
+flags.mark_flags_as_required(["ap_output", "monomer_objects_dir"])
+
+FLAGS = flags.FLAGS
+
+# ToDo: implement a flags.register_validator() checking that files/ directories
+#       exist as expected.
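+# A sketch of what that first validator could look like (commented out until
+# the ToDos are settled; the `os.path.isdir` check is an assumption about
+# what "exist as expected" should mean):
+# flags.register_validator(
+#     "ap_output",
+#     os.path.isdir,
+#     message="'--ap_output' must point to an existing directory.",
+# )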
+# ToDo: implement a flags.register_validator() to make sure that
+#       --add_associated is only activated if --model_selected is used, too
+
+
+# pylint: disable=too-few-public-methods
+class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
+    name = "pLDDT"
+    software = None
+
+
+class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
+    """Predicted accuracy according to the TM-score in [0,1]"""
+
+    name = "pTM"
+    software = None
+
+
+class _GlobalIPTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
+    # `python-modelcif` reads the first line of a class docstring as the
+    # description of a score, so we need to allow a long line here.
+    # pylint: disable=line-too-long
+    """Predicted protein-protein interface score, based on the TM-score in [0,1]."""
+
+    name = "ipTM"
+    software = None
+
+
+class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
+    name = "pLDDT"
+    software = None
+
+
+class _PAE(modelcif.qa_metric.MetricType):
+    """Predicted aligned error (in Angstroms)"""
+
+    type = "PAE"
+    other_details = None
+
+
+class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, _PAE):
+    """Predicted aligned error (in Angstroms)"""
+
+    name = "PAE"
+    software = None
+
+
+# pylint: enable=too-few-public-methods
+
+
+class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
+    """Map a Biopython `PDB.Structure()` object to `ihm.model()`."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialise a model"""
+        self.structure = kwargs.pop("bio_pdb_structure")
+        self.asym = kwargs.pop("asym")
+
+        super().__init__(*args, **kwargs)
+
+    def get_atoms(self):
+        for atm in self.structure.get_atoms():
+            yield modelcif.model.Atom(
+                asym_unit=self.asym[atm.parent.parent.id],
+                seq_id=atm.parent.id[1],
+                atom_id=atm.name,
+                type_symbol=atm.element,
+                x=atm.coord[0],
+                y=atm.coord[1],
+                z=atm.coord[2],
+                het=atm.parent.id[0] != " ",
+                biso=atm.bfactor,
+                occupancy=atm.occupancy,
+            )
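+
+    # Note on the Biopython hierarchy used in get_atoms() above: `atm.parent`
+    # is the residue and `atm.parent.parent` is the chain, hence the asym
+    # unit is looked up by chain ID and `seq_id` comes from the residue
+    # number.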
+
+    def add_scores(self, scores_json, entry_id, file_prefix, sw_dct, add_files):
+        """Add QA metrics"""
+        _GlobalPLDDT.software = sw_dct["AlphaFold"]
+        _GlobalPTM.software = sw_dct["AlphaFold"]
+        _GlobalIPTM.software = sw_dct["AlphaFold"]
+        _LocalPLDDT.software = sw_dct["AlphaFold"]
+        _LocalPairwisePAE.software = sw_dct["AlphaFold"]
+        # global scores
+        self.qa_metrics.extend(
+            (
+                _GlobalPLDDT(np.mean(scores_json["plddt"])),
+                _GlobalPTM(scores_json["ptm"]),
+                _GlobalIPTM(scores_json["iptm"]),
+            )
+        )
+
+        # local scores
+        # iterate polypeptide chains
+        # local PLDDT
+        i = 0
+        lpae = []
+        # aa_only=False includes non-canonical amino acids but seems to skip
+        # non-peptide-linking residues like ions
+        polypeptides = PPBuilder().build_peptides(self.structure, aa_only=False)
+        for chn_i in polypeptides:
+            for res_i in chn_i:
+                # local pLDDT
+                # Assertion assumes that pLDDT values are also stored in the
+                # B-factor column.
+                assert (
+                    round(scores_json["plddt"][i], 2)
+                    == next(res_i.get_atoms()).bfactor
+                )
+                self.qa_metrics.append(
+                    _LocalPLDDT(
+                        self.asym[res_i.parent.id].residue(res_i.id[1]),
+                        round(scores_json["plddt"][i], 2),
+                    )
+                )
+
+                # pairwise alignment error
+                j = 0
+                # We do a 2nd iteration over the structure instead of doing
+                # index magic because it keeps the code cleaner and should not
+                # be noticeably slower than iterating the array directly.
+                # Majority of time goes into writing files, anyway.
+                for chn_j in polypeptides:
+                    for res_j in chn_j:
+                        lpae.append(
+                            _LocalPairwisePAE(
+                                self.asym[res_i.parent.id].residue(res_i.id[1]),
+                                self.asym[res_j.parent.id].residue(res_j.id[1]),
+                                scores_json["pae"][i][j],
+                            )
+                        )
+                        j += 1
+
+                i += 1
+        self.qa_metrics.extend(lpae)
+
+        # outsource PAE to associated file
+        arc_files = [
+            modelcif.associated.LocalPairwiseQAScoresFile(
+                f"{file_prefix}_local_pairwise_qa.cif",
+                categories=["_ma_qa_metric_local_pairwise"],
+                copy_categories=["_ma_qa_metric"],
+                entry_id=entry_id,
+                entry_details="This file is an associated file consisting "
+                + "of local pairwise QA metrics. This is a partial mmCIF "
+                + "file and can be validated by merging with the main "
+                + "mmCIF file containing the model coordinates and other "
+                + "associated data.",
+                details="Predicted aligned error",
+            )
+        ]
+
+        if add_files:
+            arc_files.extend([x[1] for x in add_files.values()])
+
+        return modelcif.associated.Repository(
+            "",
+            [
+                modelcif.associated.ZipFile(
+                    f"{file_prefix}.zip", files=arc_files
+                )
+            ],
+        )
+
+
+def _get_modelcif_entities(target_ents, asym_units, system):
+    """Create ModelCIF entities and asymmetric units."""
+    for cif_ent in target_ents:
+        mdlcif_ent = modelcif.Entity(
+            # 'pdb_sequence' can be used here, since AF2 always has the
+            # complete sequence.
+            cif_ent["pdb_sequence"],
+            description=cif_ent["description"],
+        )
+        for pdb_chain_id in cif_ent["pdb_chain_id"]:
+            asym_units[pdb_chain_id] = modelcif.AsymUnit(mdlcif_ent)
+        system.target_entities.append(mdlcif_ent)
+
+
+def _get_step_output_method_type(method_type, protocol_steps):
+    """Get the output of a protocol step of a certain type."""
+    for step in protocol_steps:
+        if step.method_type == method_type:
+            # `modelcif.data.DataGroup()` is some kind of list
+            if isinstance(step.output_data, list):
+                return step.output_data
+            return modelcif.data.DataGroup(step.output_data)
+
+    raise RuntimeError(f"Step with 'method_type' '{method_type}' not found.")
+
+
+def _get_modelcif_protocol_input(
+    input_data_group, target_entities, ref_dbs, protocol_steps
+):
+    """Assemble input data for a ModelCIF protocol step."""
+    input_data = modelcif.data.DataGroup()
+    for inpt in input_data_group:
+        if inpt == "target_sequences":
+            input_data.extend(target_entities)
+        elif inpt == "reference_dbs":
+            input_data.extend(ref_dbs)
+        elif inpt.startswith("STEPTYPE$"):
+            input_data.extend(
+                _get_step_output_method_type(
+                    inpt[len("STEPTYPE$") :], protocol_steps
+                )
+            )
+        else:
+            raise RuntimeError(f"Unknown protocol input: '{inpt}'")
+
+    return input_data
+
+
+def _get_modelcif_protocol_output(output_data_group, model):
+    """Assemble output data for a ModelCIF protocol step."""
+    if output_data_group == "model":
+        output_data = model
+    elif output_data_group == "monomer_pickle_files":
+        output_data = modelcif.data.DataGroup(
+            [
+                modelcif.data.Data(
+                    "Pickle files", details="Monomer feature/ MSA pickle files"
+                )
+            ]
+        )
+    else:
+        raise RuntimeError(f"Unknown protocol output: '{output_data_group}'")
+    return output_data
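+
+
+# Note on the 'STEPTYPE$' convention resolved by
+# _get_modelcif_protocol_input() above: an 'input_data_group' entry like
+# "STEPTYPE$coevolution MSA" wires a step's input to the output_data of the
+# earlier step whose method_type is 'coevolution MSA' (see
+# _get_protocol_steps() below for an actual occurrence).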
+
+
+def _get_modelcif_protocol(
+    protocol_steps, target_entities, model, sw_dict, ref_dbs
+):
+    """Create the protocol for the ModelCIF file."""
+    protocol = modelcif.protocol.Protocol()
+    for js_step in protocol_steps:
+        # assemble input & output data
+        input_data = _get_modelcif_protocol_input(
+            js_step["input_data_group"],
+            target_entities,
+            ref_dbs,
+            protocol.steps,
+        )
+        output_data = _get_modelcif_protocol_output(
+            js_step["output_data_group"], model
+        )
+        # loop over the software group and assemble a SoftwareGroup() from it
+        sw_grp = modelcif.SoftwareGroup()
+        for pss in js_step["software_group"]:  # protocol step software
+            if sw_dict[pss] is not None:
+                # add software w/o individual parameters
+                sw_grp.append(sw_dict[pss])
+            # add software with individual parameters
+            # Commented out code does not need spelling check, so disable
+            # it in Pylint
+            # pylint: disable=wrong-spelling-in-comment
+            # sw_grp.append(
+            #     modelcif.SoftwareWithParameters(
+            #         sw_dict[pss],
+            #         [modelcif.SoftwareParameter(<name>, <value>)],
+            #     )
+            # )
+            # pylint: enable=wrong-spelling-in-comment
+        # ToDo: make sure AlphaPulldown is first in the SoftwareGroup() list,
+        #       AlphaFold second; that influences citation order in the
+        #       ModelCIF file.
+        # mix everything together in a protocol step
+        protocol.steps.append(
+            modelcif.protocol.Step(
+                input_data=input_data,
+                output_data=output_data,
+                name=js_step["step_name"],
+                details=js_step["details"],
+                software=sw_grp,
+            )
+        )
+        protocol.steps[-1].method_type = js_step["method_type"]
+
+    return protocol
+
+
+def _cast_release_date(release_date):
+    """Type cast a release date string into `datetime.datetime`."""
+    # "AF2" has a special meaning, those DBs did not change since the first
+    # release of AF2. This information is needed in the model-producing
+    # pipeline.
+    if release_date is None or release_date == "AF2":
+        return None
+
+    try:
+        return datetime.datetime.strptime(release_date, "%Y-%m-%d %H:%M:%S")
+    except ValueError:
+        logging.warning(
+            f"Unsupported release date format found: {release_date}"
+        )
+        raise
+
+
+def _cmp_ref_dbs(db_dct, db_objs):
+    """Compare a reference DB dict to a list of ReferenceDatabase objects.
+
+    Note: does not check the DB name!"""
+    for obj in db_objs:
+        if db_dct["release_date"] != obj.release_date:
+            continue
+        if db_dct["version"] != obj.version:
+            continue
+        for url in db_dct["location_url"]:
+            if url == obj.url:
+                return True
+
+    return False
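+
+
+# For reference, the shape of a single reference-DB entry as compared by
+# _cmp_ref_dbs() above (keys as used in the code, values made up):
+#   {"release_date": None, "version": "2021_03",
+#    "location_url": ["https://example.org/uniref90.fasta.gz"]}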
+
+
+def _get_modelcif_ref_dbs(meta_json):
+    """Get sequence databases used for monomer features."""
+    # vendor formatting for DB names/ URLs, extend on KeyError
+    # ToDo: adapt to new JSON input
+    sdb_lst = {}  # 'sequence database list' starts as dict since we need to
+    # compare DBs between the different monomers.
+    for data in meta_json.values():
+        for db_name, vdct in data["databases"].items():
+            vdct["release_date"] = _cast_release_date(vdct["release_date"])
+            if db_name in sdb_lst:
+                if _cmp_ref_dbs(vdct, sdb_lst[db_name]):
+                    continue
+            else:
+                sdb_lst[db_name] = []
+            for url in vdct["location_url"]:
+                sdb_lst[db_name].append(
+                    modelcif.ReferenceDatabase(
+                        db_name,
+                        url,
+                        version=vdct["version"],
+                        release_date=vdct["release_date"],
+                    )
+                )
+
+    return [x for sublist in sdb_lst.values() for x in sublist]
+
+
+def _store_as_modelcif(
+    data_json: dict,
+    structure: BioStructure,
+    mdl_file: str,
+    out_dir: str,
+    compress: bool = False,
+    add_files: dict = None,
+) -> dict:
+    """Create the actual ModelCIF file."""
+    system = modelcif.System(
+        title=data_json["_struct.title"],
+        id=data_json["data_"].upper(),
+        model_details=data_json["_struct.pdbx_model_details"],
+    )
+
+    # create target entities, references, source, asymmetric units & assembly
+    # create an asymmetric unit and an entity per target sequence
+    asym_units = {}
+    _get_modelcif_entities(
+        data_json["target_entities"],
+        asym_units,
+        system,
+    )
+
+    # ToDo: get modelling-experiment authors
+    # audit_authors
+    # system.authors.extend(data_json["audit_authors"])
+
+    # set up the model to produce coordinates
+    model = _Biopython2ModelCIF(
+        assembly=modelcif.Assembly(asym_units.values()),
+        asym=asym_units,
+        bio_pdb_structure=structure,
+        name=data_json["_ma_model_list.model_name"],
+    )
+
+    # create software list from feature metadata
+    # ToDo: _store_as_modelcif should not use __meta__
+    sw_dct = _get_software_data(data_json["__meta__"])
+
+    # process scores
+    mdl_file = os.path.splitext(os.path.basename(mdl_file))[0]
+    system.repositories.append(
+        model.add_scores(data_json, system.id, mdl_file, sw_dct, add_files)
+    )
+
+    system.model_groups.append(modelcif.model.ModelGroup([model]))
+
+    # ToDo: get protocol steps together
+    #       - 'coevolution MSA' (create_individual_features.py) step
+    #       - 'template search' is usually omitted for AF2 as that term more
+    #         relates to homology modelling, AF2 uses templates in a different
+    #         way and a template search usually results in a list of templates
+    #         which would need to be included, here, in theory
+    #       - for MSA/ template search, how to represent the outcome? Adding
+    #         the pickle files quickly exceeds reasonable storage use
+    #       - 'modeling' (run_multimer_jobs.py), are the four modes reflected
+    #         by the JSON data/ does the JSON data look different for each
+    #         mode?
+    #       - are the scores only calculated by `alpha-analysis.sif` or do
+    #         they come out of run_multimer_jobs.py? Does this go into its
+    #         own step?
+    #       - what about including the tabular summary?
+    #       - model selection: only if just a certain model is translated to
+    #         ModelCIF, or mix it with the scoring step?
+    #
+    #       - cancer-PPI-domains has 'coevolution MSA'
+    #       - then modelling
+    #       - do we have an example with a split MSA & modelling step?
+    #       - model selection like for Tara
+    system.protocols.append(
+        _get_modelcif_protocol(
+            data_json["ma_protocol_step"],
+            system.target_entities,
+            model,
+            sw_dct,
+            # ToDo: _store_as_modelcif should not use __meta__, __meta__ is
+            #       tool specific
+            _get_modelcif_ref_dbs(data_json["__meta__"]),
+        )
+    )
+
+    # write `modelcif.System()` to file
+    # NOTE: this will dump the PAE on the path provided in add_scores
+    # -> hence we cheat by changing path and back while being exception-safe...
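+    # A context-manager sketch of the same chdir trick (hypothetical helper,
+    # not used below; `contextlib.chdir()` would do this out of the box, but
+    # it needs Python >= 3.11 while this project pins 3.8):
+    # @contextlib.contextmanager  # needs `import contextlib`
+    # def _tmp_chdir(path):
+    #     cwd = os.getcwd()
+    #     os.chdir(path)
+    #     try:
+    #         yield
+    #     finally:
+    #         os.chdir(cwd)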
+    oldpwd = os.getcwd()
+    os.chdir(out_dir)
+    created_files = {}
+    try:
+        mdl_file = f"{mdl_file}.cif"
+        with open(
+            mdl_file,
+            "w",
+            encoding="ascii",
+        ) as mmcif_fh:
+            modelcif.dumper.write(mmcif_fh, [system])
+        if compress:
+            mdl_file = _compress_cif_file(mdl_file)
+        created_files[mdl_file] = (
+            os.path.join(out_dir, mdl_file),
+            _get_assoc_mdl_file(mdl_file, data_json),
+        )
+        # Create the associated archive
+        for archive in system.repositories[0].files:
+            with zipfile.ZipFile(
+                archive.path, "w", zipfile.ZIP_BZIP2
+            ) as cif_zip:
+                for zfile in archive.files:
+                    try:
+                        # Regardless of the error, fall back to `zfile.path`;
+                        # the other path is only needed as a special case.
+                        # pylint: disable=bare-except
+                        sys_path = add_files[zfile.path][0]
+                    except:
+                        sys_path = zfile.path
+                    cif_zip.write(sys_path, arcname=zfile.path)
+                    os.remove(sys_path)
+            created_files[archive.path] = (
+                os.path.join(out_dir, archive.path),
+                _get_assoc_zip_file(archive.path, data_json),
+            )
+    finally:
+        os.chdir(oldpwd)
+
+    return created_files
+
+
+def _get_assoc_mdl_file(fle_path, data_json):
+    """Generate a `modelcif.associated.File` object that looks like a CIF
+    file."""
+    cfile = modelcif.associated.File(
+        fle_path,
+        details=data_json["_ma_model_list.model_name"],
+    )
+    cfile.file_format = "cif"
+    return cfile
+
+
+def _get_assoc_zip_file(fle_path, data_json):
+    """Create a `modelcif.associated.File` object that looks like a ZIP file.
+
+    This is NOT the archive ZIP file for the PAEs but a ZIP file to be stored
+    in the archive of the selected model."""
+    zfile = modelcif.associated.File(
+        fle_path,
+        details="archive with multiple files for "
+        + data_json["_ma_model_list.model_name"],
+    )
+    zfile.file_format = "other"
+    return zfile
+
+
+def _compress_cif_file(cif_file):
+    """Compress a CIF file and delete the original."""
+    cif_gz_file = cif_file + ".gz"
+    with open(cif_file, "rb") as f_in:
+        with gzip.open(cif_gz_file, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    os.remove(cif_file)
+    return cif_gz_file
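+
+
+# For reference: _store_as_modelcif() above returns a dict that maps created
+# file names to (path on disk, `modelcif.associated.File`) tuples; the
+# `_get_assoc_*_file()` helpers above produce the `File` objects for it.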
+ ) + + +def _get_feature_metadata( + modelcif_json: dict, + cmplx_name: str, + monomer_objects_dir: list, +) -> list: + """Read metadata from a feature JSON file.""" + cmplx_name = cmplx_name.split("_and_") + mnmr_obj_fls = make_dir_monomer_dictionary(monomer_objects_dir) + if "__meta__" not in modelcif_json: + modelcif_json["__meta__"] = {} + for mnmr in cmplx_name: + modelcif_json["__meta__"][mnmr] = {} + feature_json = f"{mnmr}_feature_metadata.json" + feature_json = os.path.join(mnmr_obj_fls[feature_json], feature_json) + if not os.path.isfile(feature_json): + logging.info(f"No feature metadata file '{feature_json}' found.") + sys.exit() + # ToDo: make sure that its always ASCII + with open(feature_json, "r", encoding="ascii") as jfh: + jdata = json.load(jfh) + modelcif_json["__meta__"][mnmr]["software"] = jdata["software"] + modelcif_json["__meta__"][mnmr]["databases"] = jdata["databases"] + + return cmplx_name + + +def _get_model_info( + cif_json: dict, + cmplx_name: str, + mdl_id: str, + mdl_rank: int, +) -> None: + """Get 'data_' block ID and data for categories '_struct' and '_entry'.""" + cif_json["data_"] = "_".join(cmplx_name) + cif_json["_struct.title"] = f"Prediction for {' and '.join(cmplx_name)}" + cif_json["_struct.pdbx_model_details"] = _get_model_details( + cmplx_name, cif_json + ) + cif_json[ + "_ma_model_list.model_name" + ] = f"Model {mdl_id} (ranked #{mdl_rank})" + + +def _get_entities( + cif_json: dict, pdb_file: str, cmplx_name: list, prj_dir: str +) -> BioStructure: + """Gather data for the mmCIF (target) entities.""" + # read sequence from FastA, map to sequences from PDB file + sequences = {} + # Fetch FastA sequences to assign monomers to the molecular entities + prj_dir = os.path.join(prj_dir, "fastas") + if not os.path.isdir(prj_dir): + logging.info(f"FastA directory '{prj_dir}' does not exist.") + sys.exit() + for mnmr in cmplx_name: + fasta = os.path.join(prj_dir, f"{mnmr}.fa") + if not os.path.isfile(fasta): + logging.info(f"FastA file '{fasta}' does not exist.") + sys.exit() + with open(fasta, "r", encoding="ascii") as ffh: + seq = SeqIO.read(ffh, "fasta") + # Using MD5 sums for comparing sequences only works since AF2 + # *ALWAYS* has the complete sequence in a chain. + sequences[hashlib.md5(seq.seq.encode()).hexdigest()] = mnmr + # gather molecular entities from PDB file + structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file) + cif_json["target_entities"] = [] + already_seen = [] + for seq in PPBuilder().build_peptides(structure): + chn_id = seq[0].parent.id + seq = seq.get_sequence() + seq_md5 = hashlib.md5(seq.encode()).hexdigest() + cif_ent = {} + try: + e_idx = already_seen.index(seq_md5) + except ValueError: + pass + else: + cif_json["target_entities"][e_idx]["pdb_chain_id"].append(chn_id) + continue + cif_ent["pdb_sequence"] = seq + cif_ent["pdb_chain_id"] = [chn_id] + cif_ent["description"] = sequences[seq_md5] + cif_json["target_entities"].append(cif_ent) + already_seen.append(sequences[seq_md5]) + + return structure + + +def _get_scores(cif_json: dict, scr_file: str) -> None: + """Add scores to JSON data.""" + with open(scr_file, "rb") as sfh: + scr_dict = pickle.load(sfh) + # Get pLDDT as a list, the global pLDDT is the average, calculated on the + # spot. 
+ cif_json["plddt"] = scr_dict["plddt"] + cif_json["ptm"] = float(scr_dict["ptm"]) + cif_json["iptm"] = float(scr_dict["iptm"]) + cif_json["pae"] = scr_dict["predicted_aligned_error"] + + +def _get_software_data(meta_json: dict) -> list: + """Turn meta data about software into `modelcif.Software()` objects.""" + cite_hhsuite = ihm.Citation( + pmid="31521110", + title="HH-suite3 for fast remote homology detection and deep " + + "protein annotation.", + journal="BMC Bioinformatics", + volume=20, + page_range=None, + year=2019, + authors=[ + "Steinegger, M.", + "Meier, M.", + "Mirdita, M.", + "Voehringer, H.", + "Haunsberger, S.J.", + "Soeding, J.", + ], + doi="10.1186/s12859-019-3019-7", + ) + # {key from JSON: dict needed to produce software entry plus internal key} + sw_data = { + "AlphaFold": modelcif.Software( + "AlphaFold-Multimer", + "model building", + "Structure prediction", + "https://github.com/deepmind/alphafold", + "package", + None, + ihm.Citation( + pmid=None, + title="Protein complex prediction with AlphaFold-Multimer.", + journal="bioRxiv", + volume=None, + page_range=None, + year=2021, + authors=[ + "Evans, R.", + "O'Neill, M.", + "Pritzel, A.", + "Antropova, N.", + "Senior, A.", + "Green, T.", + "Zidek, A.", + "Bates, R.", + "Blackwell, S.", + "Yim, J.", + "Ronneberger, O.", + "Bodenstein, S.", + "Zielinski, M.", + "Bridgland, A.", + "Potapenko, A.", + "Cowie, A.", + "Tunyasuvunakool, K.", + "Jain, R.", + "Clancy, E.", + "Kohli, P.", + "Jumper, J.", + "Hassabis, D.", + ], + doi="10.1101/2021.10.04.463034", + ), + ), + "AlphaPulldown": modelcif.Software( + "AlphaPulldown", + "model building", + "Structure prediction", + "https://github.com/KosinskiLab/AlphaPulldown", + "package", + None, + ihm.Citation( + pmid="36413069", + title="AlphaPulldown-a python package for protein-protein " + + "interaction screens using AlphaFold-Multimer.", + journal="Bioinformatics", + volume=39, + page_range=None, + year=2023, + authors=[ + "Yu, D.", + "Chojnowski, G.", + "Rosenthal, M.", + "Kosinski, J.", + ], + doi="10.1093/bioinformatics/btac749", + ), + ), + "hhblits": modelcif.Software( + "HHblits", + "data collection", + "Iterative protein sequence searching by HMM-HMM alignment", + "https://github.com/soedinglab/hh-suite", + "program", + None, + cite_hhsuite, + ), + "hhsearch": modelcif.Software( + "HHsearch", + "data collection", + "Protein sequence searching by HMM-HMM comparison", + "https://github.com/soedinglab/hh-suite", + "program", + None, + cite_hhsuite, + ), + "hmmbuild": modelcif.Software( + "hmmbuild", + "data collection", + "Building HMM search profiles", + "http://hmmer.org/", + "program", + None, + None, + ), + "hmmsearch": None, + "jackhmmer": None, + "kalign": None, + } + # ToDo: refactor to only those SW objects created/ added that are actually + # in the dictionary. That is, instead of a pre-build dictionary, + # instantiate on the fly, there is anyway a hard-coded-tool-name. + for data in meta_json.values(): + for sftwr, version in data["software"].items(): + if sftwr not in sw_data: + raise RuntimeError( + "Unknown software found in meta data: " + f"'{sftwr}'" + ) + version = version["version"] + # ToDo: software should not be None, remove in final version + if sw_data[sftwr] is not None: + if sw_data[sftwr].version is not None: + if sw_data[sftwr].version != version: + raise RuntimeError( + "Software versions differ for " + + f"'{sftwr}': '{sw_data[sftwr].version}' vs. 
" + + f"'{version}'" + ) + sw_data[sftwr].version = version + + return sw_data + + +def _get_protocol_steps(modelcif_json): + """Create the list of protocol steps with software and parameters used.""" + protocol = [] + # MSA/ monomer feature generation step + # ToDo: Discuss input, manual has baits & sequences + step = { + "method_type": "coevolution MSA", + "step_name": "MSA generation", + "details": "Create sequence features for corresponding monomers.", + "input_data_group": ["target_sequences", "reference_dbs"], + "output_data_group": "monomer_pickle_files", + "software_group": [], + } + for sftwr in modelcif_json["__meta__"].values(): + sftwr = sftwr["software"] + for tool in sftwr: + if tool not in step["software_group"]: + step["software_group"].append(tool) + protocol.append(step) + + # modelling step + # ToDo: Discuss input, seem to depend on mode + # ToDo: what about step details? Would it be nice to add the AlphaPulldown + # mode here? + # ToDo: get software_group from external input + step = { + "method_type": "modeling", + "step_name": None, + "details": None, + "input_data_group": ["target_sequences", "STEPTYPE$coevolution MSA"], + "output_data_group": "model", + "software_group": ["AlphaPulldown", "AlphaFold"], + } + protocol.append(step) + + # model selection step <- ask if there is automated selection, if only + # manual, skip this step here? + + # ToDo: Example 1 in the GitHub repo has a 3rd step: "Evaluation and + # visualisation" + + return protocol + + +def alphapulldown_model_to_modelcif( + cmplx_name: str, + mdl: tuple, + out_dir: str, + prj_dir: str, + monomer_objects_dir: list, + compress: bool = False, + additional_assoc_files: list = None, +) -> None: + """Convert an AlphaPulldown model into a ModelCIF formatted mmCIF file. + + Metadata for the ModelCIF categories will be fetched from AlphaPulldown + output as far as possible. This expects modelling projects to exists in + AlphaPulldown's output directory structure.""" + # ToDo: ENABLE logging.info(f"Processing '{mdl[0]}'...") + modelcif_json = {} + # fetch metadata + cmplx_name = _get_feature_metadata( + modelcif_json, cmplx_name, monomer_objects_dir + ) + # fetch/ assemble more data about the modelling experiment + _get_model_info( + modelcif_json, + cmplx_name, + mdl[2], + mdl[3], + ) + # gather target entities (sequences that have been modeled) info + structure = _get_entities(modelcif_json, mdl[0], cmplx_name, prj_dir) + + # read quality scores from pickle file + _get_scores(modelcif_json, mdl[1]) + + modelcif_json["ma_protocol_step"] = _get_protocol_steps(modelcif_json) + cfs = _store_as_modelcif( + modelcif_json, + structure, + mdl[0], + out_dir, + compress, + additional_assoc_files, + ) + # ToDo: ENABLE logging.info(f"... done with '{mdl[0]}'") + return cfs + + +def _add_mdl_to_list(mdl, model_list, mdl_path, score_files): + """Fetch info from file name to add to list""" + rank = re.match(r"ranked_(\d+)\.pdb", mdl) + if rank is not None: + rank = int(rank.group(1)) + model_list.append( + ( + os.path.join(mdl_path, mdl), + score_files[rank][0], + score_files[rank][1], # model ID + score_files[rank][2], # model rank + ) + ) + + +def _get_model_list( + ap_dir: str, model_selected: str, get_non_selected: bool +) -> Tuple[str, str, list, list]: + """Get the list of models to be converted. + + If `model_selected` is none, all models will be marked for conversion.""" + mdl_path = os.path.join(ap_dir, "models") + cmplx = os.listdir(mdl_path) + # For now, exactly 1 complex is expected in the 'models' subdirectory. 
If + # there are more, the 'model_selected' mechanism needs to be further tuned + # to get to the right model. + assert len(cmplx) == 1 + cmplx = cmplx[0] + mdl_path = os.path.join(mdl_path, cmplx) + models = [] + # We are going for models with name "rank_?.pdb" as it does not depend on + # relaxation. But this means we have to match the pickle files via + # ranking_debug.json. + ranking_dbg = os.path.join(mdl_path, "ranking_debug.json") + if not os.path.isfile(ranking_dbg): + logging.info( + f"Ranking file '{ranking_dbg} does not exist or is no regular " + + "file." + ) + sys.exit() + with open(ranking_dbg, "r", encoding="utf8") as jfh: + ranking_dbg = json.load(jfh) + score_files = {} + for i, fle in enumerate(ranking_dbg["order"]): + if not fle.startswith("model_"): + raise RuntimeError( + "Filename does not start with 'model_', can " + + f"not determine model ID: '{fle}'" + ) + score_files[i] = ( + os.path.join(mdl_path, f"result_{fle}.pkl"), + fle.split("_")[1], + i, + ) + # match PDB files with pickle files + not_selected_models = [] + if model_selected is not None: + if model_selected not in score_files: + logging.info(f"Model of rank {model_selected} not found.") + sys.exit() + _add_mdl_to_list( + f"ranked_{model_selected}.pdb", models, mdl_path, score_files + ) + + if get_non_selected: + for mdl in os.listdir(mdl_path): + if mdl == f"ranked_{model_selected}.pdb": + continue + _add_mdl_to_list( + mdl, not_selected_models, mdl_path, score_files + ) + else: + for mdl in os.listdir(mdl_path): + _add_mdl_to_list(mdl, models, mdl_path, score_files) + + # check that files actually exist + for mdl, scrs, *_ in models: + if not os.path.isfile(mdl): + logging.info( + f"Model file '{mdl}' does not exist or is not a regular file." + ) + sys.exit() + if not os.path.isfile(scrs): + logging.info( + f"Scores file '{scrs}' does not exist or is not a regular file." + ) + sys.exit() + + return cmplx, mdl_path, models, not_selected_models + + +def main(argv): + """Run as script.""" + # pylint: disable=pointless-string-statement + """ + Here, the metadata json files for each feature are in features_monomers/ + directory. The models are in models/ directory, and usually there are many + complexes modelled using different permutations of the monomeric features. + For the sake of size, I send you the models of only one dimer + cage_B_and_cage_C/ that was generated using features_monomers/cage_B.pkl + and features_monomers/cage_C.pkl accordingly. + Please note that typically all the cage_?_feature_metadata.json files are + identical in terms of used databases and software versions and generated + in one go. + However, theoretically they could be generated using different binaries/DBs + versions, so maybe it makes sense to compare them and store both/all + versions if they are different. This merging can be done on our + AlphaPulldown side and may be added now or later on. Let me know if it is + critical for you now. + """ + # pylint: enable=pointless-string-statement + del argv # Unused. 
+
+    # get the list of selected models and assemble ModelCIF files + associated
+    # data
+    complex_name, model_dir, model_list, not_selected = _get_model_list(
+        FLAGS.ap_output,
+        FLAGS.model_selected,
+        FLAGS.add_associated,
+    )
+    add_assoc_files = {}
+    if len(not_selected) > 0:
+        # pylint: disable=consider-using-with
+        ns_tmpdir = tempfile.TemporaryDirectory(suffix="_modelcif")
+        for mdl in not_selected:
+            add_assoc_files.update(
+                alphapulldown_model_to_modelcif(
+                    complex_name,
+                    mdl,
+                    ns_tmpdir.name,
+                    FLAGS.ap_output,
+                    FLAGS.monomer_objects_dir,
+                    FLAGS.compress,
+                )
+            )
+    for mdl in model_list:
+        alphapulldown_model_to_modelcif(
+            complex_name,
+            mdl,
+            model_dir,
+            FLAGS.ap_output,
+            FLAGS.monomer_objects_dir,
+            FLAGS.compress,
+            add_assoc_files,
+        )
+
+
+if __name__ == "__main__":
+    app.run(main)
+
+# ToDo: Question - option to include all the non-selected models in the
+#       associated data archive? This blows up storage size (especially if
+#       PAEs are included), but we did that already in the past. The idea is
+#       to have all models available for... reproducibility and whatnot, but
+#       to show the selected (representative) model of the modelling
+#       experiment/ study more prominently.
+# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details',
+#       'data_', '_entry', maybe have a user-defined JSON document with things
+#       like that, including author names?
+# ToDo: where to store which model was chosen? Should be in Tara's models.
+# ToDo: make sure all functions come with type hints
+
+# From former discussions:
+# - including Jupyter notebooks would require adding the pickle files to the
+#   associated files (too much storage needed for that)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..39913b661df304953bcf89905003eda628206c19
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,39 @@
+[project]
+name = "alphapulldown-modelcif-converter"
+version = "0.0.1"
+requires-python = "==3.8"
+dependencies = [
+    "modelcif==0.8",
+]
+
+[tool.black]
+line-length = 80
+
+[tool.pylint.MAIN]
+load-plugins = "pylint.extensions.bad_builtin"
+
+[tool.pylint.REPORTS]
+reports = "no"
+
+[tool.pylint.FORMAT]
+max-line-length = 81
+
+[tool.pylint.deprecated_builtins]
+# We want to use proper logging, so we can control *ALL* output by the Abseil
+# logger, hence: deprecate 'print'
+bad-functions = ["map", "filter", "print"]
+
+# Run the spell check every once in a while; having it enabled all the time is
+# too annoying.
+#[tool.pylint.spelling]
+#max-spelling-suggestions = 4
+#
+#spelling-dict = "en_GB"
+#
+#spelling-ignore-comment-directives = "fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:"
+#
+#spelling-ignore-words = ""
+#
+#spelling-private-dict-file = ".spelling"
+#
+#spelling-store-unknown-words = false
\ No newline at end of file