From 27db3776407cf1ca84acc3635a6f5e428610e916 Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Thu, 12 Oct 2023 10:12:38 +0200 Subject: [PATCH] black * PEP8 --- .../translate2modelcif.py | 291 ++++++++++-------- 1 file changed, 161 insertions(+), 130 deletions(-) diff --git a/projects/dark-matter-metagenomics/translate2modelcif.py b/projects/dark-matter-metagenomics/translate2modelcif.py index cd3a91d..fd44a6d 100644 --- a/projects/dark-matter-metagenomics/translate2modelcif.py +++ b/projects/dark-matter-metagenomics/translate2modelcif.py @@ -1,27 +1,18 @@ #! /usr/local/bin/ost # -*- coding: utf-8 -*- - """Translate models for Sergey from PDB + extra data into ModelCIF.""" -# EXAMPLES for running: -""" -ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \ - ./web_dloads/pivot ./modelcif --prefix=F000347 \ - --pdb-web-path=./web_dloads/pdb \ - --refseq-path=./web_dloads/consensus_all.fasta -""" -# NOTE: add "--compress" for final runs - from timeit import default_timer as timer import argparse import filecmp import gzip import os -import pandas as pd import shutil import sys import zipfile +import pandas as pd + import ihm import ihm.citations import modelcif @@ -34,6 +25,16 @@ import modelcif.reference from ost import io +# EXAMPLES for running: +""" +ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \ + ./web_dloads/pivot ./modelcif --prefix=F000347 \ + --pdb-web-path=./web_dloads/pdb \ + --refseq-path=./web_dloads/consensus_all.fasta +""" +# NOTE: add "--compress" for final runs + + def _abort_msg(msg, exit_code=1): """Write error message and exit with exit_code.""" print(f"{msg}\nAborting.", file=sys.stderr) @@ -105,7 +106,7 @@ def _parse_args(): type=str, metavar="<PREFIX>", default="", - help="Only process families starting with given prefix. By default " \ + help="Only process families starting with given prefix. By default " + "all families are processed.", ) parser.add_argument( @@ -120,7 +121,7 @@ def _parse_args(): type=str, metavar="<PDB WEB PATH>", default=None, - help="Optional path to directory with F*.pdb files as available on " \ + help="Optional path to directory with F*.pdb files as available on " + "NMPFamsDB web site. Used to check if top ranked model the same.", ) parser.add_argument( @@ -128,7 +129,7 @@ def _parse_args(): type=str, metavar="<PDB REFSEQ PATH>", default=None, - help="Optional path to fasta file with all ref. seq. as available on " \ + help="Optional path to fasta file with all ref. seq. as available on " + "NMPFamsDB web site. 
Used to check if it matches with MSA.", ) opts = parser.parse_args() @@ -155,26 +156,32 @@ def _parse_args(): # pylint: disable=too-few-public-methods class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM): """Predicted accuracy according to the TM-score score in [0,1]""" + name = "pTM" software = None class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT): """Predicted accuracy according to the CA-only lDDT in [0,100]""" + name = "pLDDT" software = None class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT): """Predicted accuracy according to the CA-only lDDT in [0,100]""" + name = "pLDDT" software = None class _NmpfamsdbTrgRef(modelcif.reference.TargetReference): """PDB as target reference.""" + name = "Other" other_details = "NMPFamsDB" + + # pylint: enable=too-few-public-methods @@ -213,12 +220,11 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): self.plddts = [] for res in self.ost_entity.residues: b_factors = [a.b_factor for a in res.atoms] - assert len(set(b_factors)) == 1 # must all be equal! + assert len(set(b_factors)) == 1 # must all be equal! self.plddts.append(b_factors[0]) super().__init__(*args, **kwargs) - def get_atoms(self): # ToDo [internal]: Take B-factor out since its not a B-factor? # NOTE: this assumes that _get_res_num maps residue to pos. in seqres @@ -247,7 +253,6 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): ) ) - # local scores i = 0 for chn_i in self.ost_entity.chains: @@ -296,8 +301,9 @@ def _get_audit_authors(): def _get_metadata(metadata_file): """Read csv file with metedata and prepare for next steps.""" - metadata = pd.read_csv(metadata_file, sep=' ', - names=["ID", "mdl", "pTM", "pLDDT"]) + metadata = pd.read_csv( + metadata_file, sep=" ", names=["ID", "mdl", "pTM", "pLDDT"] + ) return metadata @@ -305,18 +311,20 @@ def _get_pdb_files(model_base_dir): """Collect PDB files from pub_data_* folders. Returns dict with key = family name and value = list of paths to PDB files. """ - pdb_files_split = dict() # to return - pdb_files_raw = set() # to check for duplicates - pub_paths = [f for f in os.listdir(model_base_dir) \ - if f.startswith("pub_data_")] + pdb_files_split = dict() # to return + pdb_files_raw = set() # to check for duplicates + pub_paths = [ + f for f in os.listdir(model_base_dir) if f.startswith("pub_data_") + ] # NOTE: we sort pub_paths to ensure that pub_data_02 is before _03 for pub_path in sorted(pub_paths): sub_path = os.path.join(model_base_dir, pub_path) - pdb_files_new = [f for f in os.listdir(sub_path) \ - if not f.startswith(".")] + pdb_files_new = [ + f for f in os.listdir(sub_path) if not f.startswith(".") + ] for f in pdb_files_new: f_path = os.path.join(sub_path, f) - f_name = f.split('_')[0] + f_name = f.split("_")[0] if f_name in pdb_files_split: pdb_files_split[f_name].append(f_path) else: @@ -325,21 +333,27 @@ def _get_pdb_files(model_base_dir): pdb_files_new_set = set(pdb_files_new) new_duplicates = pdb_files_raw.intersection(pdb_files_new_set) if new_duplicates: - _warn_msg(f"{len(new_duplicates)} duplicated files found in " \ - f"{sub_path}.") + _warn_msg( + f"{len(new_duplicates)} duplicated files found in " + f"{sub_path}." + ) pdb_files_raw = pdb_files_raw.union(pdb_files_new_set) return pdb_files_split def _get_config(): """Define AF setup.""" - msa_description = "MSA created by calculating the central or \"pivot\" " \ - "sequence of each seed MSA, and refining each " \ - "alignment using that sequence as the guide." 
- mdl_description = "Model generated using AlphaFold (v2.0.0 with models " \ - "finetuned to return ptm weights) producing 5 models, " \ - "without model relaxation, without templates, ranked " \ - "by pLDDT, starting from a custom MSA." + msa_description = ( + 'MSA created by calculating the central or "pivot" ' + "sequence of each seed MSA, and refining each " + "alignment using that sequence as the guide." + ) + mdl_description = ( + "Model generated using AlphaFold (v2.0.0 with models " + "finetuned to return ptm weights) producing 5 models, " + "without model relaxation, without templates, ranked " + "by pLDDT, starting from a custom MSA." + ) af_config = {} return { "af_config": af_config, @@ -382,58 +396,62 @@ def _get_protocol_steps_and_software(config_data): step["output"] = "model" # get software if config_data["use_multimer"]: - step["software"] = [{ - "name": "AlphaFold-Multimer", - "classification": "model building", - "description": "Structure prediction", - "citation": ihm.Citation( - pmid=None, - title="Protein complex prediction with " - + "AlphaFold-Multimer.", - journal="bioRxiv", - volume=None, - page_range=None, - year=2021, - authors=[ - "Evans, R.", - "O'Neill, M.", - "Pritzel, A.", - "Antropova, N.", - "Senior, A.", - "Green, T.", - "Zidek, A.", - "Bates, R.", - "Blackwell, S.", - "Yim, J.", - "Ronneberger, O.", - "Bodenstein, S.", - "Zielinski, M.", - "Bridgland, A.", - "Potapenko, A.", - "Cowie, A.", - "Tunyasuvunakool, K.", - "Jain, R.", - "Clancy, E.", - "Kohli, P.", - "Jumper, J.", - "Hassabis, D.", - ], - doi="10.1101/2021.10.04.463034", - ), - "location": "https://github.com/deepmind/alphafold", - "type": "package", - "version": config_data["af_version"], - }] + step["software"] = [ + { + "name": "AlphaFold-Multimer", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.Citation( + pmid=None, + title="Protein complex prediction with " + + "AlphaFold-Multimer.", + journal="bioRxiv", + volume=None, + page_range=None, + year=2021, + authors=[ + "Evans, R.", + "O'Neill, M.", + "Pritzel, A.", + "Antropova, N.", + "Senior, A.", + "Green, T.", + "Zidek, A.", + "Bates, R.", + "Blackwell, S.", + "Yim, J.", + "Ronneberger, O.", + "Bodenstein, S.", + "Zielinski, M.", + "Bridgland, A.", + "Potapenko, A.", + "Cowie, A.", + "Tunyasuvunakool, K.", + "Jain, R.", + "Clancy, E.", + "Kohli, P.", + "Jumper, J.", + "Hassabis, D.", + ], + doi="10.1101/2021.10.04.463034", + ), + "location": "https://github.com/deepmind/alphafold", + "type": "package", + "version": config_data["af_version"], + } + ] else: - step["software"] = [{ - "name": "AlphaFold", - "classification": "model building", - "description": "Structure prediction", - "citation": ihm.citations.alphafold2, - "location": "https://github.com/deepmind/alphafold", - "type": "package", - "version": config_data["af_version"], - }] + step["software"] = [ + { + "name": "AlphaFold", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.citations.alphafold2, + "location": "https://github.com/deepmind/alphafold", + "type": "package", + "version": config_data["af_version"], + } + ] step["software_parameters"] = config_data["af_config"] protocol.append(step) @@ -449,11 +467,13 @@ def _get_model_details(fam_name): """Get the model description.""" db_url = f"https://bib.fleming.gr/NMPFamsDB/family?id={fam_name}" # TODO: check if ok to use HTML for the URL - db_url = f"<a href=\"{db_url}\" target=\"_blank\">{db_url}</a>" - return f"Model generated 
using AlphaFold (v2.0.0) for the " \ - f"\"Representative Sequence\" of NMPFamsDB Metagenome / " \ - f"Metatranscriptome Family {fam_name}.\n\nSee {db_url} for " \ - f"additional details." + db_url = f'<a href="{db_url}" target="_blank">{db_url}</a>' + return ( + f"Model generated using AlphaFold (v2.0.0) for the " + f'"Representative Sequence" of NMPFamsDB Metagenome / ' + f"Metatranscriptome Family {fam_name}.\n\nSee {db_url} for " + f"additional details." + ) def _get_model_group_name(): @@ -466,8 +486,7 @@ def _get_sequence(chn, use_auth=False): # initialise (add gaps if first is not at num. 1) lst_rn = _get_res_num(chn.residues[0], use_auth) idx = 1 - sqe = "-" * (lst_rn - 1) \ - + chn.residues[0].one_letter_code + sqe = "-" * (lst_rn - 1) + chn.residues[0].one_letter_code for res in chn.residues[idx:]: lst_rn += 1 @@ -484,28 +503,26 @@ def _get_entities(pdb_file, ref_seq, fam_name): ost_ent = io.LoadPDB(pdb_file) if ost_ent.chain_count != 1: - raise RuntimeError( - f"Unexpected oligomer in {pdb_file}" - ) + raise RuntimeError(f"Unexpected oligomer in {pdb_file}") chn = ost_ent.chains[0] sqe_gaps = _get_sequence(chn) # NOTE: can have gaps to accommodate "X" in ref_seq - exp_seq = sqe_gaps.replace('-', 'X') + exp_seq = sqe_gaps.replace("-", "X") len_diff = len(ref_seq.string) - len(exp_seq) if len_diff > 0: - exp_seq += 'X' * len_diff + exp_seq += "X" * len_diff if exp_seq != ref_seq.string: raise RuntimeError(f"Sequence in {pdb_file} does not match ref_seq") # TODO: waiting for input on whether they change handling of "X" in seq # -> HERE: assuming that "X" were in modelled sequence and PDB can have gaps cif_ent = { - "seqres": ref_seq.string, # HACK for testing: .replace('X', 'A') + "seqres": ref_seq.string, # HACK for testing: .replace('X', 'A') "pdb_sequence": sqe_gaps, "pdb_chain_id": [_get_ch_name(chn, False)], "fam_name": fam_name, - "description": f"Representative Sequence of NMPFamsDB Family {fam_name}" + "description": f"Representative Sequence of NMPFamsDB Family {fam_name}", } return [cif_ent], ost_ent @@ -524,14 +541,15 @@ def _get_modelcif_entities(target_ents, asym_units, system): cif_ent["fam_name"], cif_ent["fam_name"], align_begin=1, - align_end=len(cif_ent["seqres"]) + align_end=len(cif_ent["seqres"]), ) ], ) # NOTE: this assigns (potentially new) alphabetic chain names for pdb_chain_id in cif_ent["pdb_chain_id"]: asym_units[pdb_chain_id] = modelcif.AsymUnit( - mdlcif_ent, strand_id=pdb_chain_id, + mdlcif_ent, + strand_id=pdb_chain_id, ) system.target_entities.append(mdlcif_ent) @@ -543,7 +561,7 @@ def _get_assoc_aln_file(fle_path): cfile = modelcif.associated.File( fle_path, details="Custom MSA for modelling", - data=modelcif.data.Data("Custom MSA for modelling") + data=modelcif.data.Data("Custom MSA for modelling"), ) cfile.file_format = "fasta" cfile.file_content = "multiple sequence alignments" @@ -640,8 +658,8 @@ def _get_modelcif_protocol(protocol_steps, target_entities, aln_data, model): def _compress_cif_file(cif_file): """Compress cif file and delete original.""" - with open(cif_file, 'rb') as f_in: - with gzip.open(cif_file + '.gz', 'wb') as f_out: + with open(cif_file, "rb") as f_in: + with gzip.open(cif_file + ".gz", "wb") as f_out: shutil.copyfileobj(f_in, f_out) os.remove(cif_file) @@ -669,9 +687,7 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress): # create an asymmetric unit and an entity per target sequence asym_units = {} - _get_modelcif_entities( - data_json["target_entities"], asym_units, system - ) + 
_get_modelcif_entities(data_json["target_entities"], asym_units, system) # audit_authors system.authors.extend(data_json["audit_authors"]) @@ -697,13 +713,14 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress): # handle additional files aln_file = _get_assoc_aln_file(data_json["aln_file_name"]) - system.repositories.append( - _get_associated_files(mdl_name, [aln_file]) - ) + system.repositories.append(_get_associated_files(mdl_name, [aln_file])) # get data and steps protocol = _get_modelcif_protocol( - data_json["protocol"], system.target_entities, aln_file.data, model, + data_json["protocol"], + system.target_entities, + aln_file.data, + model, ) system.protocols.append(protocol) @@ -713,7 +730,7 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress): # copy aln file to compress them shutil.copyfile( data_json["aln_file_path"], - os.path.join(out_dir, data_json["aln_file_name"]) + os.path.join(out_dir, data_json["aln_file_name"]), ) # NOTE: we change path and back while being exception-safe to handle zipfile oldpwd = os.getcwd() @@ -736,8 +753,10 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check): # TODO: unclear what to do with such cases; skipped for now if len(metadata_fam) != 5: - _warn_msg(f"Unexpected number of {len(metadata_fam)} models in " \ - f"metadata for family {f_name}. Skipping...") + _warn_msg( + f"Unexpected number of {len(metadata_fam)} models in " + f"metadata for family {f_name}. Skipping..." + ) return # @@ -762,18 +781,23 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check): # TODO: if we need to handle files without ALN, this needs fixing # -> e.g. 11 extra models in pub_data_* cannot be handled right now if not os.path.exists(aln_path): - _warn_msg(f"Cannot deal with missing MSA for {f_name} (yet). " \ - f"Skipping...") + _warn_msg( + f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..." + ) return # - aln = io.LoadAlignment(aln_path) # note: this checks that it's an actual MSA + aln = io.LoadAlignment( + aln_path + ) # note: this checks that it's an actual MSA ref_seq = aln.sequences[0] if ref_seq_check is not None and ref_seq_check.string != ref_seq.string: raise RuntimeError(f"Sequence mismatch for {f_name}") # TODO: allow "X" (or whatever can be used to label unknown AA) if needed if "X" in ref_seq.string: - _warn_msg(f"Cannot deal with 'X' in ref_seq for {f_name} (yet). " \ - f"Skipping...") + _warn_msg( + f"Cannot deal with 'X' in ref_seq for {f_name} (yet). " + f"Skipping..." + ) return # @@ -814,8 +838,9 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check): if opts.pdb_web_path is not None: pdb_file_web = os.path.join(opts.pdb_web_path, f"{f_name}.pdb") if not filecmp.cmp(pdb_file, pdb_file_web): - raise RuntimeError(f"PDB file mismatch web vs top-ranked for " \ - f"{f_name}") + raise RuntimeError( + f"PDB file mismatch web vs top-ranked for " f"{f_name}" + ) # fill annotations mdlcf_json["title"] = _get_title(f_name) @@ -824,22 +849,27 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check): print(f" ({timer()-pstart:.2f}s)") # save ModelCIF - _store_as_modelcif(mdlcf_json, ost_ent, opts.out_dir, mdl_id, opts.compress) + _store_as_modelcif( + mdlcf_json, ost_ent, opts.out_dir, mdl_id, opts.compress + ) # check if result can be read and has expected seq. 
ent, ss = io.LoadMMCIF(mdl_path, seqres=True) - exp_seqs = [trg_ent["pdb_sequence"] \ - for trg_ent in mdlcf_json["target_entities"]] + exp_seqs = [ + trg_ent["pdb_sequence"] for trg_ent in mdlcf_json["target_entities"] + ] assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}" # here we expect auth = label IDs ent_seq = "".join([_get_sequence(chn, False) for chn in ent.chains]) ent_seq_a = "".join([_get_sequence(chn, True) for chn in ent.chains]) assert ent_seq == ent_seq_a assert ent_seq == "".join(exp_seqs), f"Bad seq. {mdl_id}" - ent_seqres = "".join([ss.FindSequence(chn.name).string \ - for chn in ent.chains]) - exp_seqres = "".join([trg_ent["seqres"] \ - for trg_ent in mdlcf_json["target_entities"]]) + ent_seqres = "".join( + [ss.FindSequence(chn.name).string for chn in ent.chains] + ) + exp_seqres = "".join( + [trg_ent["seqres"] for trg_ent in mdlcf_json["target_entities"]] + ) assert ent_seqres == exp_seqres, f"Bad seqres {mdl_id}" print(f" ... done with {mdl_id} ({timer()-pdb_start:.2f}s).") @@ -864,7 +894,8 @@ def _main(): for f_name in sorted(pdb_files_split): if f_name.startswith(opts.prefix): _translate2modelcif( - f_name, opts, + f_name, + opts, metadata_full[metadata_full.ID == f_name], pdb_files_split[f_name], refseqs.FindSequence(f_name) if refseqs is not None else None, -- GitLab
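Notes on the conventions this reformat applies, each with a minimal illustrative sketch; the snippets below are constructed for illustration and are not part of the diff.

Most hunks replace backslash continuations and explicit "+" concatenation with black's preferred form: adjacent string literals inside parentheses. A before/after sketch using the --pdb-web-path help text from the diff (the variable name "msg" is illustrative only):

    # Before: line continuation with "\" plus an explicit "+" between literals.
    msg = "Optional path to directory with F*.pdb files as available on " \
        + "NMPFamsDB web site."

    # After: adjacent string literals inside parentheses are concatenated by
    # the parser itself, so neither "\" nor "+" is needed.
    msg = (
        "Optional path to directory with F*.pdb files as available on "
        "NMPFamsDB web site."
    )

The same rule covers f-strings, which is why wrapped lines such as f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..." carry no operator between the two literals.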
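The import hunk applies PEP 8 grouping: standard-library modules first, then third-party packages, with blank lines between groups. That is why "import pandas as pd" moves out of the alphabetized stdlib block. The resulting layout, abbreviated to a few representative names from the file:

    # Standard library, alphabetical.
    import argparse
    import gzip
    import sys

    # Third-party: data handling.
    import pandas as pd

    # Third-party: modelling stack.
    import ihm
    import modelcif
    from ost import io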
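Call sites such as _get_modelcif_protocol(...) and the _translate2modelcif(...) call in _main() end up with one argument per line. That is black's "magic trailing comma": a trailing comma after the last argument tells black to keep the call exploded even when it would fit on a single line. A self-contained sketch, where process() is a hypothetical stand-in for those calls:

    def process(name, opts, metadata, pdb_files):
        """Hypothetical stand-in for the wrapped calls in the diff."""
        return name

    # The trailing comma after the last argument keeps the call exploded,
    # one argument per line; without it, black would collapse the call.
    process(
        "F000347",
        None,
        None,
        None,
    )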
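Finally, the QA-metric classes each gain a blank line after their docstring: black separates a class docstring from the first class-level assignment. As applied to _GlobalPLDDT in the diff (sketch, with the import needed to run it stand-alone; requires the python-modelcif package):

    import modelcif.qa_metric

    class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
        """Predicted accuracy according to the CA-only lDDT in [0,100]"""

        name = "pLDDT"
        software = None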