From b98d887878d3ae118cde2ca726585b60451ee0bd Mon Sep 17 00:00:00 2001 From: B13nch3n <b13nch3n_01@theb-si.de> Date: Mon, 17 Oct 2022 16:05:55 +0200 Subject: [PATCH] Update converter script. --- .../translate2modelcif.py | 222 ++++++++++++------ 1 file changed, 152 insertions(+), 70 deletions(-) diff --git a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py index 4deb91e..900c848 100644 --- a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py +++ b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py @@ -46,10 +46,12 @@ def _parse_args(): + "'<UniProtKB AC>-<UniProtKB AC>'", ) parser.add_argument( - "--rank", + "--selected_rank", type=str, default=None, - help="Only process the model with this rank.", + help="If a certain model of a modelling project is selected by rank, " + + "the other models are still translated to ModelCIF but stored as " + + "accompanying files to the selected model.", ) parser.add_argument( "--out_dir", @@ -151,7 +153,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): occupancy=atm.occupancy, ) - def add_scores(self, scores_json, entry_id, mdl_name): + def add_scores(self, scores_json, entry_id, mdl_name, add_files): """Add QA metrics from AF2 scores.""" # global scores self.qa_metrics.extend( @@ -196,21 +198,26 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): self.qa_metrics.extend(lpae) ac_file = f"{mdl_name}_local_pairwise_qa.cif" - qa_file = modelcif.associated.LocalPairwiseQAScoresFile( - ac_file, - categories=["_ma_qa_metric_local_pairwise"], - copy_categories=["_ma_qa_metric"], - entry_id=entry_id, - entry_details="This file is an associated file consisting " - + "of local pairwise QA metrics. This is a partial mmCIF " - + "file and can be validated by merging with the main " - + "mmCIF file containing the model coordinates and other " - + "associated data.", - details="Predicted aligned error", - ) + arc_files = [ + modelcif.associated.LocalPairwiseQAScoresFile( + ac_file, + categories=["_ma_qa_metric_local_pairwise"], + copy_categories=["_ma_qa_metric"], + entry_id=entry_id, + entry_details="This file is an associated file consisting " + + "of local pairwise QA metrics. This is a partial mmCIF " + + "file and can be validated by merging with the main " + + "mmCIF file containing the model coordinates and other " + + "associated data.", + details="Predicted aligned error", + ) + ] + if add_files: + arc_files.extend(add_files) + return modelcif.associated.Repository( "", - [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=[qa_file])], + [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)], ) # NOTE: by convention MA expects zip file with same name as model-cif @@ -248,14 +255,14 @@ def _check_model_extra_files_present(model_dir, pdb_file): def _get_audit_authors(): """Return the list of authors that produced this model.""" return ( - "Bartolec, T.", + "Bartolec, T.K.", "Vazquez-Campos, X.", - "Johnson, M.", "Norman, A.", - "Payne, R.", - "Wilkins, M.", - "Mackay, J.", - "Low, J.", + "Luong, C.", + "Payne, R.J.", + "Wilkins, M.R.", + "Mackay, J.P.", + "Low, J.K.K.", ) @@ -818,19 +825,19 @@ def _compress_cif_file(cif_file): os.remove(cif_file) -def _package_associated_files(mdl_name): +def _package_associated_files(repo): """Compress associated files into single zip file and delete original.""" - # file names must match ones from add_scores - zip_path = f"{mdl_name}.zip" - files = [f"{mdl_name}_local_pairwise_qa.cif"] # zip settings tested for good speed vs compression - with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_BZIP2) as myzip: - for file in files: - myzip.write(file) - os.remove(file) + for archive in repo.files: + with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip: + for zfile in archive.files: + cif_zip.write(zfile.path, arcname=zfile.path) + os.remove(zfile.path) -def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): +def _store_as_modelcif( + data_json, ost_ent, out_dir, file_prfx, compress, add_files +): """Mix all the data into a ModelCIF file.""" print(" generating ModelCIF objects...", end="") pstart = timer() @@ -876,18 +883,23 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): print(" processing QA scores...", end="", flush=True) pstart = timer() mdl_name = os.path.basename(file_prfx) - system.repositories.append(model.add_scores(data_json, system.id, mdl_name)) + system.repositories.append( + model.add_scores(data_json, system.id, mdl_name, add_files) + ) print(f" ({timer()-pstart:.2f}s)") system.model_groups.append( modelcif.model.ModelGroup([model], name=data_json["model_group_name"]) ) - ref_dbs = _get_sequence_dbs(data_json["config_data"]["seq_dbs"]) - protocol = _get_modelcif_protocol( - data_json["protocol"], system.target_entities, model, ref_dbs + system.protocols.append( + _get_modelcif_protocol( + data_json["protocol"], + system.target_entities, + model, + _get_sequence_dbs(data_json["config_data"]["seq_dbs"]), + ) ) - system.protocols.append(protocol) # write modelcif System to file print(" write to disk...", end="", flush=True) @@ -896,17 +908,48 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): # -> hence we cheat by changing path and back while being exception-safe... oldpwd = os.getcwd() os.chdir(out_dir) + mdl_fle = f"{mdl_name}.cif" try: - with open(f"{mdl_name}.cif", "w", encoding="ascii") as mmcif_fh: + with open(mdl_fle, "w", encoding="ascii") as mmcif_fh: modelcif.dumper.write(mmcif_fh, [system]) - _package_associated_files(mdl_name) + _package_associated_files(system.repositories[0]) if compress: - _compress_cif_file(f"{mdl_name}.cif") + _compress_cif_file(mdl_fle) finally: os.chdir(oldpwd) - print(f" ({timer()-pstart:.2f}s)") + mdl_fle = _get_assoc_mdl_file(mdl_fle, data_json) + zip_fle = _get_assoc_zip_file( + system.repositories[0].files[0].path, data_json + ) + return mdl_fle, zip_fle + + +def _get_assoc_mdl_file(fle_path, data_json): + """Generate a modelcif.associated.File object that looks like a CIF file. + The dedicated CIFFile functionality in modelcif would also try to write it. + """ + cfile = modelcif.associated.File( + fle_path, + details=f"model {data_json['mdl_num']}; rank {data_json['rank_num']}", + ) + cfile.file_format = "cif" + return cfile + + +def _get_assoc_zip_file(fle_path, data_json): + """Create a modelcif.associated.File object that looks like a ZIP file. + This is NOT the archive ZIP file for the PAEs but to store that in the + ZIP archive of the selected model.""" + zfile = modelcif.associated.File( + fle_path, + details="archive with multiple files for model " + + f"{data_json['mdl_num']}; rank {data_json['rank_num']}", + ) + zfile.file_format = "zip" + return zfile + def _create_interaction_json(config_data): """Create a dictionary (mimicking JSON) that contains data which is the same @@ -931,6 +974,41 @@ def _create_model_json(data, pdb_file, up_acs, block_id): return ost_ent +def _translate2modelcif(up_acs, pdb_fle, config_data, opts, add_files): + """Convert a PDB file with its accompanying data to ModelCIF.""" + pdb_start = timer() + file_prfx, uid = _check_model_extra_files_present(opts.model_dir, pdb_fle) + pdb_fle = os.path.join(opts.model_dir, pdb_fle) + + # gather data into JSON-like structure + print(" preparing data...", end="") + pstart = timer() + + mdlcf_json = _create_interaction_json(config_data) + + # uid = ..._rank_X_model_Y.pdb + mdl_name_parts = uid.split("_") + assert mdl_name_parts[-4] == "rank" + assert mdl_name_parts[-2] == "model" + mdlcf_json["rank_num"] = int(mdl_name_parts[-3]) + mdlcf_json["mdl_num"] = int(mdl_name_parts[-1]) + + ost_ent = _create_model_json(mdlcf_json, pdb_fle, up_acs, uid) + + # read quality scores from JSON file + _get_scores(mdlcf_json, file_prfx) + print(f" ({timer()-pstart:.2f}s)") + mdlcf_fle, zip_fle = _store_as_modelcif( + mdlcf_json, + ost_ent, + opts.out_dir, + file_prfx, + opts.compress, + add_files, + ) + return pdb_start, pdb_fle, mdlcf_fle, zip_fle + + def _main(): """Run as script.""" opts = _parse_args() @@ -945,45 +1023,49 @@ def _main(): config_data = _parse_colabfold_config(cnfg) # iterate model directory - found_ranked = False + # There is 1 representative for a modelling project, the other models are + # stored in its ZIP archive. + not_slctd_mdls = [] + slctd_mdl = None for fle in sorted(os.listdir(opts.model_dir)): # iterate PDB files if not fle.endswith(".pdb"): continue - if opts.rank is not None and f"rank_{opts.rank}" not in fle: + if ( + opts.selected_rank is not None + and f"rank_{opts.selected_rank}" in fle + ): + slctd_mdl = fle continue - found_ranked = True print(f" translating {fle}...") - pdb_start = timer() - file_prfx, uid = _check_model_extra_files_present(opts.model_dir, fle) - fle = os.path.join(opts.model_dir, fle) - - # gather data into JSON-like structure - print(" preparing data...", end="") - pstart = timer() - - mdlcf_json = _create_interaction_json(config_data) - - # uid = ..._rank_X_model_Y.pdb - mdl_name_parts = uid.split("_") - assert mdl_name_parts[-4] == "rank" - assert mdl_name_parts[-2] == "model" - mdlcf_json["rank_num"] = int(mdl_name_parts[-3]) - mdlcf_json["mdl_num"] = int(mdl_name_parts[-1]) - - ost_ent = _create_model_json(mdlcf_json, fle, up_acs, uid) - - # read quality scores from JSON file - _get_scores(mdlcf_json, file_prfx) - print(f" ({timer()-pstart:.2f}s)") - - _store_as_modelcif( - mdlcf_json, ost_ent, opts.out_dir, file_prfx, opts.compress + pdb_start, fle, mdlcf_fle, zip_fle = _translate2modelcif( + up_acs, + fle, + config_data, + opts, + None, ) print(f" ... done with {fle} ({timer()-pdb_start:.2f}s).") + not_slctd_mdls.append(mdlcf_fle) + not_slctd_mdls.append(zip_fle) + if opts.selected_rank: + if slctd_mdl is None: + _abort_msg( + f"Could not find model of requested rank '{opts.selected_rank}'" + ) + print( + f" translating selected model {opts.selected_rank} " + + f"({slctd_mdl})..." + ) + _translate2modelcif( + up_acs, + slctd_mdl, + config_data, + opts, + not_slctd_mdls, + ) + print(f" ... done with {slctd_mdl} ({timer()-pdb_start:.2f}s).") - if opts.rank and not found_ranked: - _abort_msg(f"Could not find model of requested rank '{opts.rank}'") print(f"... done with {opts.model_dir}.") -- GitLab