diff --git a/projects/novelfams/translate2modelcif.py b/projects/novelfams/translate2modelcif.py
index 9c3f5f80a3e64e3d8057f42311c7cd3c15bb8b8f..2b4762c440b90ee18563b03818b7043b2c1d3ad5 100644
--- a/projects/novelfams/translate2modelcif.py
+++ b/projects/novelfams/translate2modelcif.py
@@ -1,12 +1,11 @@
 #! /usr/local/bin/ost
 # -*- coding: utf-8 -*-
 """Translate models for novelfams from PDB + extra data into ModelCIF."""
-# re-enable Pylint for final version
+# ToDo: re-enable Pylint
 # pylint: disable=too-many-lines
 
 from timeit import default_timer as timer
 import argparse
-import filecmp
 import gzip
 import os
 import shutil
@@ -14,7 +13,6 @@ import sys
 import zipfile
 
 import pandas as pd
-import numpy as np
 
 import ihm
 import ihm.citations
@@ -35,185 +33,6 @@ from ost import io, geom, mol
 
 # ost translate2modelcif.py ...
 
-################################################################################
-# HELPERS (mostly copied from from ost/modules/io/tests/test_io_omf.py)
-# to compare PDBs
-def _compare_atoms(
-    a1, a2, occupancy_thresh=0.01, bfactor_thresh=0.01, dist_thresh=0.001
-):
-    if abs(a1.occupancy - a2.occupancy) > occupancy_thresh:
-        return False
-    if abs(a1.b_factor - a2.b_factor) > bfactor_thresh:
-        return False
-    # modification: look at x,y,z spearately
-    if abs(a1.pos.x - a2.pos.x) > dist_thresh:
-        return False
-    if abs(a1.pos.y - a2.pos.y) > dist_thresh:
-        return False
-    if abs(a1.pos.z - a2.pos.z) > dist_thresh:
-        return False
-    if a1.is_hetatom != a2.is_hetatom:
-        return False
-    if a1.element != a2.element:
-        return False
-    return True
-
-
-def _compare_residues(
-    r1,
-    r2,
-    at_occupancy_thresh=0.01,
-    at_bfactor_thresh=0.01,
-    at_dist_thresh=0.001,
-    skip_ss=False,
-    skip_rnums=False,
-):
-    if r1.GetName() != r2.GetName():
-        return False
-    if skip_rnums is False:
-        if r1.GetNumber() != r2.GetNumber():
-            return False
-    if skip_ss is False:
-        if str(r1.GetSecStructure()) != str(r2.GetSecStructure()):
-            return False
-    if r1.one_letter_code != r2.one_letter_code:
-        return False
-    if r1.chem_type != r2.chem_type:
-        return False
-    if r1.chem_class != r2.chem_class:
-        return False
-    anames1 = [a.GetName() for a in r1.atoms]
-    anames2 = [a.GetName() for a in r2.atoms]
-    if sorted(anames1) != sorted(anames2):
-        return False
-    anames = anames1
-    for aname in anames:
-        a1 = r1.FindAtom(aname)
-        a2 = r2.FindAtom(aname)
-        if not _compare_atoms(
-            a1,
-            a2,
-            occupancy_thresh=at_occupancy_thresh,
-            bfactor_thresh=at_bfactor_thresh,
-            dist_thresh=at_dist_thresh,
-        ):
-            return False
-    return True
-
-
-def _compare_chains(
-    ch1,
-    ch2,
-    at_occupancy_thresh=0.01,
-    at_bfactor_thresh=0.01,
-    at_dist_thresh=0.001,
-    skip_ss=False,
-    skip_rnums=False,
-):
-    if len(ch1.residues) != len(ch2.residues):
-        return False
-    for r1, r2 in zip(ch1.residues, ch2.residues):
-        if not _compare_residues(
-            r1,
-            r2,
-            at_occupancy_thresh=at_occupancy_thresh,
-            at_bfactor_thresh=at_bfactor_thresh,
-            at_dist_thresh=at_dist_thresh,
-            skip_ss=skip_ss,
-            skip_rnums=skip_rnums,
-        ):
-            return False
-    return True
-
-
-def _compare_bonds(ent1, ent2):
-    bonds1 = list()
-    for b in ent1.bonds:
-        bond_partners = [str(b.first), str(b.second)]
-        bonds1.append([min(bond_partners), max(bond_partners), b.bond_order])
-    bonds2 = list()
-    for b in ent2.bonds:
-        bond_partners = [str(b.first), str(b.second)]
-        bonds2.append([min(bond_partners), max(bond_partners), b.bond_order])
-    return sorted(bonds1) == sorted(bonds2)
-
-
-def _compare_ent(
-    ent1,
-    ent2,
-    at_occupancy_thresh=0.01,
-    at_bfactor_thresh=0.01,
-    at_dist_thresh=0.001,
-    skip_ss=False,
-    skip_cnames=False,
-    skip_bonds=False,
-    skip_rnums=False,
-    bu_idx=None,
-):
-    if bu_idx is not None:
-        if ent1.GetName() + " " + str(bu_idx) != ent2.GetName():
-            return False
-    else:
-        if ent1.GetName() != ent2.GetName():
-            return False
-    chain_names_one = [ch.GetName() for ch in ent1.chains]
-    chain_names_two = [ch.GetName() for ch in ent2.chains]
-    if skip_cnames:
-        # only check whether we have the same number of chains
-        if len(chain_names_one) != len(chain_names_two):
-            return False
-    else:
-        if chain_names_one != chain_names_two:
-            return False
-    for ch1, ch2 in zip(ent1.chains, ent2.chains):
-        if not _compare_chains(
-            ch1,
-            ch2,
-            at_occupancy_thresh=at_occupancy_thresh,
-            at_bfactor_thresh=at_bfactor_thresh,
-            at_dist_thresh=at_dist_thresh,
-            skip_ss=skip_ss,
-            skip_rnums=skip_rnums,
-        ):
-            return False
-    if not skip_bonds:
-        if not _compare_bonds(ent1, ent2):
-            return False
-    return True
-
-
-def _compare_pdbs(f_name, f1, f2):
-    """Use atom-by-atom comparison on PDB files allowing num. errors."""
-    # first do simple file diff.
-    if filecmp.cmp(f1, f2):
-        return True
-    else:
-        ent1 = io.LoadPDB(f1)
-        ent2 = io.LoadPDB(f2)
-        # allow a bit more errors as input files can have rounding errors
-        if _compare_ent(ent1, ent2, 0.011, 0.011, 0.0011, True, False, True):
-            return True
-        else:
-            # check manually and give warning...
-            atom_names_1 = [a.qualified_name for a in ent1.atoms]
-            atom_names_2 = [a.qualified_name for a in ent2.atoms]
-            assert atom_names_1 == atom_names_2
-            b_diffs = [
-                abs(a1.b_factor - a2.b_factor)
-                for a1, a2 in zip(ent1.atoms, ent2.atoms)
-            ]
-            max_b_diff = max(b_diffs)
-            rmsd = mol.alg.CalculateRMSD(ent1.Select(""), ent2.Select(""))
-            _warn_msg(
-                f"PDB file mismatch web vs top-ranked for {f_name}: "
-                f"RMSD {rmsd:.3f}, max. b_factor diff {max_b_diff:.3f}"
-            )
-            return False
-
-
-################################################################################
-
-
 def _abort_msg(msg, exit_code=1):
     """Write error message and exit with exit_code."""
     print(f"{msg}\nAborting.", file=sys.stderr)
@@ -225,14 +44,6 @@ def _warn_msg(msg):
     print(f"WARNING: {msg}")
 
 
-def _check_file(file_path):
-    """Make sure a file exists and is actually a file."""
-    if not os.path.exists(file_path):
-        _abort_msg(f"File not found: '{file_path}'.")
-    if not os.path.isfile(file_path):
-        _abort_msg(f"File path does not point to file: '{file_path}'.")
-
-
 def _check_folder(dir_path):
     """Make sure a folder exists and is actually a folder."""
     if not os.path.exists(dir_path):
@@ -398,80 +209,6 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
                 occupancy=atm.occupancy,
             )
 
-    def add_scores(self):
-        """Add QA metrics from AF2 scores."""
-        # global scores
-        self.qa_metrics.extend(
-            (
-                _GlobalPLDDT(self.scores_json["plddt_global"]),
-                _GlobalPTM(self.scores_json["ptm"]),
-            )
-        )
-
-        # local scores
-        lpae = []
-        i = 0
-        for chn_i in self.ost_entity.chains:
-            ch_name = _get_ch_name(chn_i, self.use_auth)
-            for res_i in chn_i.residues:
-                # local pLDDT
-                res_num_i = _get_res_num(res_i, self.use_auth)
-                self.qa_metrics.append(
-                    _LocalPLDDT(
-                        self.asym[ch_name].residue(res_num_i),
-                        self.plddts[i],
-                    )
-                )
-                i += 1
-
-                # PAE needs to go by residue index as it also stores ones
-                # for missing residues (i.e. X)
-                if "pae" in self.scores_json:
-                    pae_i = self.scores_json["pae"][res_num_i - 1]
-                    for chn_j in self.ost_entity.chains:
-                        for res_j in chn_j.residues:
-                            res_num_j = _get_res_num(res_j, self.use_auth)
-                            pae_ij = pae_i[res_num_j - 1]
-                            lpae.append(
-                                _LocalPairwisePAE(
-                                    self.asym[chn_i.name].residue(res_num_i),
-                                    self.asym[chn_j.name].residue(res_num_j),
-                                    round(pae_ij, self.pae_digits),
-                                )
-                            )
-
-        self.qa_metrics.extend(lpae)
-
-
-def _get_audit_authors():
-    """Return the list of authors that produced this model."""
-    return (
-        "Pavlopoulos, Georgios A.",
-        "Baltoumas, Fotis A.",
-        "Liu, Sirui",
-        "Selvitopi, Oguz",
-        "Camargo, Antonio Pedro",
-        "Nayfach, Stephen",
-        "Azad, Ariful",
-        "Roux, Simon",
-        "Call, Lee",
-        "Ivanova, Natalia N.",
-        "Chen, I-Min",
-        "Paez-Espino, David",
-        "Karatzas, Evangelos",
-        "Novel Metagenome Protein Families Consortium",
-        "Iliopoulos, Ioannis",
-        "Konstantinidi, Konstantinos",
-        "Tiedje, James M.",
-        "Pett-Ridge, Jennifer",
-        "Baker, David",
-        "Visel, Axel",
-        "Ouzounis, Christos A.",
-        "Ovchinnikov, Sergey",
-        "Buluc, Aydin",
-        "Kyrpides, Nikos C.",
-    )
-
 
 def _get_metadata(metadata_file):
     """Read csv file with metadata and prepare for next steps."""
@@ -495,143 +232,16 @@ def _get_pdb_files(model_dir):
     return pdb_paths
 
 
-def _get_config():
-    """Define AF setup."""
-    msa_description = (
-        'MSA created by calculating the central or "pivot" '
-        "sequence of each seed MSA, and refining each "
-        "alignment using that sequence as the guide."
-    )
-    mdl_description = (
-        "Model generated using AlphaFold (v2.0.0 with models "
-        "fine-tuned to return pTM weights) producing 5 models, "
-        "without model relaxation, without templates, ranked "
-        "by pLDDT, starting from a custom MSA."
-    )
-    af_config = {}
-    return {
-        "af_config": af_config,
-        "af_version": "2.0.0",
-        "mdl_description": mdl_description,
-        "msa_description": msa_description,
-        "use_templates": False,
-        "use_small_bfd": False,
-        "use_multimer": False,
-    }
-
-
-def _get_protocol_steps_and_software(config_data):
-    """Create the list of protocol steps with software and parameters used."""
-    protocol = []
-
-    # MSA step
-    step = {
-        "method_type": "coevolution MSA",
-        "name": None,
-        "details": config_data["msa_description"],
-    }
-    step["input"] = "target_sequences"
-    step["output"] = "MSA"
-    step["software"] = []
-    step["software_parameters"] = {}
-    protocol.append(step)
-
-    # modelling step
-    step = {
-        "method_type": "modeling",
-        "name": None,
-        "details": config_data["mdl_description"],
-    }
-    # get input data
-    # Must refer to data already in the JSON, so we try keywords
-    step["input"] = "target_sequences_and_MSA"
-    # get output data
-    # Must refer to existing data, so we try keywords
-    step["output"] = "model"
-    # get software
-    if config_data["use_multimer"]:
-        step["software"] = [
-            {
-                "name": "AlphaFold-Multimer",
-                "classification": "model building",
-                "description": "Structure prediction",
-                "citation": ihm.Citation(
-                    pmid=None,
-                    title="Protein complex prediction with "
-                    + "AlphaFold-Multimer.",
-                    journal="bioRxiv",
-                    volume=None,
-                    page_range=None,
-                    year=2021,
-                    authors=[
-                        "Evans, R.",
-                        "O'Neill, M.",
-                        "Pritzel, A.",
-                        "Antropova, N.",
-                        "Senior, A.",
-                        "Green, T.",
-                        "Zidek, A.",
-                        "Bates, R.",
-                        "Blackwell, S.",
-                        "Yim, J.",
-                        "Ronneberger, O.",
-                        "Bodenstein, S.",
-                        "Zielinski, M.",
-                        "Bridgland, A.",
-                        "Potapenko, A.",
-                        "Cowie, A.",
-                        "Tunyasuvunakool, K.",
-                        "Jain, R.",
-                        "Clancy, E.",
-                        "Kohli, P.",
-                        "Jumper, J.",
-                        "Hassabis, D.",
-                    ],
-                    doi="10.1101/2021.10.04.463034",
-                ),
- "location": "https://github.com/deepmind/alphafold", - "type": "package", - "version": config_data["af_version"], - } - ] - else: - step["software"] = [ - { - "name": "AlphaFold", - "classification": "model building", - "description": "Structure prediction", - "citation": ihm.citations.alphafold2, - "location": "https://github.com/deepmind/alphafold", - "type": "package", - "version": config_data["af_version"], - } - ] - step["software_parameters"] = config_data["af_config"] - protocol.append(step) - - return protocol - - def _get_title(fam_name): """Get a title for this modelling experiment.""" return f"AlphaFold model for NMPFamsDB Family {fam_name}" -def _get_model_details(fam_name, max_pLDDT, max_pTM): +def _get_model_details(fam_name): """Get the model description.""" - db_url = f"https://bib.fleming.gr/NMPFamsDB/family?id={fam_name}" - return ( - f'Model generated using AlphaFold (v2.0.0) for the "Representative ' - f'Sequence" of NMPFamsDB Metagenome / Metatranscriptome Family ' - f"{fam_name}.\n\nThe 5 produced models reached a max. global pLDDT of " - f"{round(max_pLDDT, 3)} and max. pTM of {round(max_pTM, 3)}.\n\n" - f"See {db_url} for additional details." - ) - + db_url = f"https://novelfams.cgmlab.org/query/{fam_name}/" -def _get_model_group_name(): - """Get a name for a model group.""" - return None + return f"Model for {fam_name}.\nSee {db_url} for additional details." def _get_sequence(chn, use_auth=False): @@ -650,10 +260,8 @@ def _get_sequence(chn, use_auth=False): return sqe -def _get_entities(pdb_file, ref_seq, fam_name): +def _get_entities(pdb_file, fam_name): """Gather data for the mmCIF (target) entities.""" - _check_file(pdb_file) - ost_ent = io.LoadPDB(pdb_file) if ost_ent.chain_count != 1: raise RuntimeError(f"Unexpected oligomer in {pdb_file}") @@ -662,19 +270,13 @@ def _get_entities(pdb_file, ref_seq, fam_name): # NOTE: can have gaps to accommodate "X" in ref_seq exp_seq = sqe_gaps.replace("-", "X") - len_diff = len(ref_seq.string) - len(exp_seq) - if len_diff > 0: - exp_seq += "X" * len_diff - if exp_seq != ref_seq.string: - raise RuntimeError(f"Sequence in {pdb_file} does not match ref_seq") cif_ent = { - "seqres": ref_seq.string, + "seqres": exp_seq, "pdb_sequence": sqe_gaps, "pdb_chain_id": [_get_ch_name(chn, False)], "fam_name": fam_name, - "description": "Representative Sequence of NMPFamsDB Family " - + f"{fam_name}", + "description": f"Sequence of family {fam_name}", } return [cif_ent], ost_ent @@ -690,14 +292,7 @@ def _get_modelcif_entities(target_ents, asym_units, system): alphabet=alphabet, description=cif_ent["description"], source=None, - references=[ - _NmpfamsdbTrgRef( - cif_ent["fam_name"], - cif_ent["fam_name"], - align_begin=1, - align_end=len(cif_ent["seqres"]), - ) - ], + references=[], ) # NOTE: this assigns (potentially new) alphabetic chain names for pdb_chain_id in cif_ent["pdb_chain_id"]: @@ -851,20 +446,6 @@ def _package_associated_files(repo): os.remove(zfile.path) -def _get_assoc_mdl_file(fle_path, data_json): - """Generate a modelcif.associated.File object that looks like a CIF file. - The dedicated CIFFile functionality in modelcif would also try to write it. - """ - cfile = modelcif.associated.File( - fle_path, - details=f"#{data_json['mdl_rank']} ranked model; " - + f"pTM {round(data_json['ptm'], 3)}, " - + f"pLDDT {round(data_json['plddt_global'], 3)}", - ) - cfile.file_format = "cif" - return cfile - - def _get_assoc_zip_file(fle_path, data_json): """Create a modelcif.associated.File object that looks like a ZIP file. 
     This is NOT the archive ZIP file for the PAEs but to store that in the
@@ -884,13 +465,8 @@ def _store_as_modelcif(
     out_dir,
     mdl_name,
     compress,
-    add_pae,
-    add_aln,
-    add_files,
 ):
     """Mix all the data into a ModelCIF file."""
-    print("    generating ModelCIF objects...", end="")
-    pstart = timer()
     # create system to gather all the data
     system = modelcif.System(
         title=data_json["title"],
@@ -898,106 +474,23 @@ def _store_as_modelcif(
         model_details=data_json["model_details"],
     )
 
-    # add primary citation (not using from_pubmed_id to ensure that author names
-    # have no special chars)
-    system.citations.append(
-        ihm.Citation(
-            pmid="37821698",
-            title="Unraveling the functional dark matter through global "
-            + "metagenomics.",
-            journal="Nature",
-            volume=622,
-            page_range=(594, 602),
-            year=2023,
-            authors=[
-                "Pavlopoulos, G.A.",
-                "Baltoumas, F.A.",
-                "Liu, S.",
-                "Selvitopi, O.",
-                "Camargo, A.P.",
-                "Nayfach, S.",
-                "Azad, A.",
-                "Roux, S.",
-                "Call, L.",
-                "Ivanova, N.N.",
-                "Chen, I.M.",
-                "Paez-Espino, D.",
-                "Karatzas, E.",
-                "Iliopoulos, I.",
-                "Konstantinidis, K.",
-                "Tiedje, J.M.",
-                "Pett-Ridge, J.",
-                "Baker, D.",
-                "Visel, A.",
-                "Ouzounis, C.A.",
-                "Ovchinnikov, S.",
-                "Buluc, A.",
-                "Kyrpides, N.C.",
-            ],
-            doi="10.1038/s41586-023-06583-7",
-            is_primary=True,
-        )
-    )
-
     # create an asymmetric unit and an entity per target sequence
     asym_units = {}
     _get_modelcif_entities(data_json["target_entities"], asym_units, system)
 
-    # audit_authors
-    system.authors.extend(data_json["audit_authors"])
-
     # set up the model to produce coordinates
     model = _OST2ModelCIF(
         assembly=modelcif.Assembly(asym_units.values()),
         asym=asym_units,
         ost_entity=ost_ent,
        scores_json=data_json,
-        name=data_json["mdl_name"],
-    )
-    print(f" ({timer()-pstart:.2f}s)")
-    print("    processing QA scores...", end="", flush=True)
-    pstart = timer()
-    model.add_scores()
-    print(f" ({timer()-pstart:.2f}s)")
-
-    model_group = modelcif.model.ModelGroup(
-        [model], name=data_json["model_group_name"]
+        name=mdl_name,
     )
-    system.model_groups.append(model_group)
 
-    # handle additional files
-    arc_files = []
-    if add_pae:
-        arc_files.append(_get_assoc_pae_file(system.id, mdl_name))
-    if add_aln:
-        aln_file = _get_assoc_aln_file(data_json["aln_file_name"])
-        arc_files.append(aln_file)
-        aln_data = aln_file.data
-    else:
-        aln_data = _get_aln_data()
-        aln_data.data_other_details = "MSA stored with parent entry"
-    arc_files.extend(add_files)
-    if arc_files:
-        system.repositories.append(_get_associated_files(mdl_name, arc_files))
-
-    # get data and steps
-    protocol = _get_modelcif_protocol(
-        data_json["protocol"],
-        system.target_entities,
-        aln_data,
-        model,
-    )
-    system.protocols.append(protocol)
+    model_group = modelcif.model.ModelGroup([model])
+    system.model_groups.append(model_group)
 
     # write modelcif System to file (NOTE: no PAE here!)
- print(" write to disk...", end="", flush=True) - pstart = timer() - # copy aln file to compress them - if add_aln: - shutil.copyfile( - data_json["aln_file_path"], - os.path.join(out_dir, data_json["aln_file_name"]), - ) # NOTE: we change path and back while being exception-safe to handle zipfile oldpwd = os.getcwd() os.chdir(out_dir) @@ -1005,149 +498,49 @@ def _store_as_modelcif( try: with open(mdl_fle, "w", encoding="ascii") as mmcif_fh: modelcif.dumper.write(mmcif_fh, [system]) - if arc_files: - _package_associated_files(system.repositories[0]) if compress: _compress_cif_file(mdl_fle) mdl_fle += ".gz" finally: os.chdir(oldpwd) - print(f" ({timer()-pstart:.2f}s)") - assoc_files = [_get_assoc_mdl_file(mdl_fle, data_json)] - if arc_files: - assoc_files.append( - _get_assoc_zip_file( - system.repositories[0].files[0].path, data_json - ) - ) - return assoc_files def _translate2modelcif_single( f_name, opts, - metadata, - pdb_files, - mdl_rank, - aln_file, - aln_path, - ref_seq, mdl_details, - add_files=[], ): """Convert a single model with its accompanying data to ModelCIF.""" - mdl_id = f_name - if mdl_rank > 1: - mdl_id += f"_rank_{mdl_rank}_{metadata.mdl}" - - print(f" translating {mdl_id}...") - pdb_start = timer() + # ToDo: re-enable Pylint + # pylint: disable=too-many-statements + fam_name = os.path.splitext(os.path.basename(f_name))[0] # gather data into JSON-like structure - print(" preparing data...", end="") - pstart = timer() - - config_data = _get_config() mdlcf_json = {} - mdlcf_json["audit_authors"] = _get_audit_authors() - mdlcf_json["protocol"] = _get_protocol_steps_and_software(config_data) - mdlcf_json["config_data"] = config_data - mdlcf_json["mdl_id"] = mdl_id # used for entry ID - mdlcf_json["mdl_rank"] = mdl_rank - mdlcf_json["aln_file_name"] = aln_file - mdlcf_json["aln_file_path"] = aln_path - - # find model to process - pdb_list_sel = [f for f in pdb_files if metadata.mdl in f] - if len(pdb_list_sel) != 1: - # this should never happen - raise RuntimeError( - f"Multiple file matches found for {metadata.mdl} in {f_name}" - ) - if mdl_rank == 1: - mdlcf_json["mdl_name"] = f"Top ranked model ({metadata.mdl})" - else: - mdlcf_json["mdl_name"] = f"#{mdl_rank} ranked model ({metadata.mdl})" + mdlcf_json["mdl_id"] = fam_name # used for entry ID # process coordinates - pdb_file = pdb_list_sel[0] - target_entities, ost_ent = _get_entities(pdb_file, ref_seq, f_name) + target_entities, ost_ent = _get_entities(f_name, fam_name) mdlcf_json["target_entities"] = target_entities - # sanity check (only for top ranked model!) - if mdl_rank == 1 and opts.pdb_web_path is not None: - pdb_file_web = os.path.join(opts.pdb_web_path, f"{f_name}.pdb") - # warning handled in compare function... 
-        _compare_pdbs(f_name, pdb_file, pdb_file_web)
-
-    # get scores for this entry
-    mdlcf_json["plddt_global"] = metadata.pLDDT
-    mdlcf_json["ptm"] = metadata.pTM
-    add_pae = mdl_rank == 1 or opts.all_pae
-    if add_pae:
-        pdb_basename = os.path.basename(pdb_file)
-        pae_basename = os.path.splitext(pdb_basename)[0] + ".txt.gz"
-        pae_file = os.path.join(opts.pae_dir, pae_basename)
-        _check_file(pae_file)
-        mdlcf_json["pae"] = np.loadtxt(pae_file)
-        exp_num_res = len(ref_seq.string)
-        if mdlcf_json["pae"].shape != (exp_num_res, exp_num_res):
-            raise RuntimeError(f"Unexpected PAE shape in {pae_file}")
 
     # fill annotations
-    mdlcf_json["title"] = _get_title(f_name)
-    if mdl_rank != 1:
-        mdlcf_json["title"] += f" (#{mdl_rank} ranked model)"
+    mdlcf_json["title"] = _get_title(fam_name)
     mdlcf_json["model_details"] = mdl_details
-    mdlcf_json["model_group_name"] = _get_model_group_name()
-    print(f" ({timer()-pstart:.2f}s)")
 
     # save ModelCIF
-    assoc_files = _store_as_modelcif(
+    _store_as_modelcif(
        mdlcf_json,
         ost_ent,
         opts.out_dir,
-        mdl_id,
-        opts.compress and mdl_rank == 1,
-        add_pae,
-        opts.all_msa or mdl_rank == 1,
-        add_files,
+        fam_name,
+        opts.compress,
     )
-
-    # check if result can be read and has expected seq.
-    mdl_path = os.path.join(opts.out_dir, assoc_files[0].path)
-    ent, ss = io.LoadMMCIF(mdl_path, seqres=True)
-    exp_seqs = [
-        trg_ent["pdb_sequence"] for trg_ent in mdlcf_json["target_entities"]
-    ]
-    assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}"
-    # here we expect auth = label IDs
-    ent_seq = "".join([_get_sequence(chn, False) for chn in ent.chains])
-    ent_seq_a = "".join([_get_sequence(chn, True) for chn in ent.chains])
-    assert ent_seq == ent_seq_a
-    assert ent_seq == "".join(exp_seqs), f"Bad seq. {mdl_id}"
-    ent_seqres = "".join(
-        [ss.FindSequence(chn.name).string for chn in ent.chains]
-    )
-    exp_seqres = "".join(
-        [trg_ent["seqres"] for trg_ent in mdlcf_json["target_entities"]]
-    )
-    assert ent_seqres == exp_seqres, f"Bad seqres {mdl_id}"
-
-    print(f"  ... done with {mdl_id} ({timer()-pdb_start:.2f}s).")
-
-    return assoc_files
-
 
-def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
-    """Convert a family of models with their accompanying data to ModelCIF."""
-    # re-enable Pylint for final version
+def _translate2modelcif(f_name, opts):
+    """Convert a family model with its accompanying data to ModelCIF."""
+    # ToDo: re-enable Pylint
     # pylint: disable=too-many-locals
-    # expected to have exactly 5 models per family
-    if len(metadata_fam) != 5:
-        raise RuntimeError(
-            f"Unexpected number of {len(metadata_fam)} models in "
-            f"metadata for family {f_name}."
-        )
 
     # skip if done already
     if opts.compress:
@@ -1156,58 +549,20 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
         cifext = "cif.gz"
     else:
         cifext = "cif"
-    mdl_path = os.path.join(opts.out_dir, f"{f_name}.{cifext}")
+    fam_name = os.path.splitext(os.path.basename(f_name))[0]
+    mdl_path = os.path.join(opts.out_dir, f"{fam_name}.{cifext}")
     if os.path.exists(mdl_path):
-        print(f"  {f_name} already done...")
-        return
-
-    # get aln_data and ref. seq. for this entry
-    aln_file = f"{f_name}.fasta"
-    aln_path = os.path.join(opts.msa_data_dir, aln_file)
-    # expected 11 extra families compared to web data but those don't have MSAs
-    # -> skipped for consistency and to keep code simple here
-    if not os.path.exists(aln_path):
Skipping...") + print(f" {f_name} already done") return - aln = io.LoadAlignment( - aln_path - ) # note: this checks that it's an actual MSA - ref_seq = aln.sequences[0] - if ref_seq_check is not None and ref_seq_check.string != ref_seq.string: - raise RuntimeError(f"Sequence mismatch for {f_name}") - - # get global model details + # get model details mdl_details = _get_model_details( - f_name, metadata_fam.pLDDT.max(), metadata_fam.pTM.max() + os.path.splitext(os.path.basename(f_name))[0] ) - # rank available models - metadata_sorted = metadata_fam.sort_values("pLDDT", ascending=False) - add_files = [] - if opts.all_models: - for idx in range(1, 5): - assoc_files = _translate2modelcif_single( - f_name, - opts, - metadata_sorted.iloc[idx], - pdb_files, - idx + 1, - aln_file, - aln_path, - ref_seq, - mdl_details, - ) - add_files.extend(assoc_files) - # process top ranked one + + # Turn into ModelCIF _translate2modelcif_single( f_name, opts, - metadata_sorted.iloc[0], - pdb_files, - 1, - aln_file, - aln_path, - ref_seq, mdl_details, - add_files, ) @@ -1224,17 +578,12 @@ def _main(): print(f"Processing {n_mdls} models.") tmstmp = s_tmstmp for f_name in sorted(pdb_files): + _translate2modelcif( + f_name, + opts, + ) n_mdls -= 1 - """ - if f_name.startswith(opts.prefix): - _translate2modelcif( - f_name, - opts, - metadata_full[metadata_full.ID == f_name], - pdb_files_split[f_name], - refseqs.FindSequence(f_name) if refseqs is not None else None, - ) - """ + # report progress after a bit of time if timer() - tmstmp > 60: print(