Skip to content
Snippets Groups Projects
Commit b98d8878 authored by B13nch3n's avatar B13nch3n
Browse files

Update converter script.

parent 0ba72f69
No related branches found
No related tags found
No related merge requests found
...@@ -46,10 +46,12 @@ def _parse_args(): ...@@ -46,10 +46,12 @@ def _parse_args():
+ "'<UniProtKB AC>-<UniProtKB AC>'", + "'<UniProtKB AC>-<UniProtKB AC>'",
) )
parser.add_argument( parser.add_argument(
"--rank", "--selected_rank",
type=str, type=str,
default=None, default=None,
help="Only process the model with this rank.", help="If a certain model of a modelling project is selected by rank, "
+ "the other models are still translated to ModelCIF but stored as "
+ "accompanying files to the selected model.",
) )
parser.add_argument( parser.add_argument(
"--out_dir", "--out_dir",
...@@ -151,7 +153,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): ...@@ -151,7 +153,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
occupancy=atm.occupancy, occupancy=atm.occupancy,
) )
def add_scores(self, scores_json, entry_id, mdl_name): def add_scores(self, scores_json, entry_id, mdl_name, add_files):
"""Add QA metrics from AF2 scores.""" """Add QA metrics from AF2 scores."""
# global scores # global scores
self.qa_metrics.extend( self.qa_metrics.extend(
...@@ -196,21 +198,26 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): ...@@ -196,21 +198,26 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
self.qa_metrics.extend(lpae) self.qa_metrics.extend(lpae)
ac_file = f"{mdl_name}_local_pairwise_qa.cif" ac_file = f"{mdl_name}_local_pairwise_qa.cif"
qa_file = modelcif.associated.LocalPairwiseQAScoresFile( arc_files = [
ac_file, modelcif.associated.LocalPairwiseQAScoresFile(
categories=["_ma_qa_metric_local_pairwise"], ac_file,
copy_categories=["_ma_qa_metric"], categories=["_ma_qa_metric_local_pairwise"],
entry_id=entry_id, copy_categories=["_ma_qa_metric"],
entry_details="This file is an associated file consisting " entry_id=entry_id,
+ "of local pairwise QA metrics. This is a partial mmCIF " entry_details="This file is an associated file consisting "
+ "file and can be validated by merging with the main " + "of local pairwise QA metrics. This is a partial mmCIF "
+ "mmCIF file containing the model coordinates and other " + "file and can be validated by merging with the main "
+ "associated data.", + "mmCIF file containing the model coordinates and other "
details="Predicted aligned error", + "associated data.",
) details="Predicted aligned error",
)
]
if add_files:
arc_files.extend(add_files)
return modelcif.associated.Repository( return modelcif.associated.Repository(
"", "",
[modelcif.associated.ZipFile(f"{mdl_name}.zip", files=[qa_file])], [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)],
) )
# NOTE: by convention MA expects zip file with same name as model-cif # NOTE: by convention MA expects zip file with same name as model-cif
...@@ -248,14 +255,14 @@ def _check_model_extra_files_present(model_dir, pdb_file): ...@@ -248,14 +255,14 @@ def _check_model_extra_files_present(model_dir, pdb_file):
def _get_audit_authors(): def _get_audit_authors():
"""Return the list of authors that produced this model.""" """Return the list of authors that produced this model."""
return ( return (
"Bartolec, T.", "Bartolec, T.K.",
"Vazquez-Campos, X.", "Vazquez-Campos, X.",
"Johnson, M.",
"Norman, A.", "Norman, A.",
"Payne, R.", "Luong, C.",
"Wilkins, M.", "Payne, R.J.",
"Mackay, J.", "Wilkins, M.R.",
"Low, J.", "Mackay, J.P.",
"Low, J.K.K.",
) )
...@@ -818,19 +825,19 @@ def _compress_cif_file(cif_file): ...@@ -818,19 +825,19 @@ def _compress_cif_file(cif_file):
os.remove(cif_file) os.remove(cif_file)
def _package_associated_files(mdl_name): def _package_associated_files(repo):
"""Compress associated files into single zip file and delete original.""" """Compress associated files into single zip file and delete original."""
# file names must match ones from add_scores
zip_path = f"{mdl_name}.zip"
files = [f"{mdl_name}_local_pairwise_qa.cif"]
# zip settings tested for good speed vs compression # zip settings tested for good speed vs compression
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_BZIP2) as myzip: for archive in repo.files:
for file in files: with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
myzip.write(file) for zfile in archive.files:
os.remove(file) cif_zip.write(zfile.path, arcname=zfile.path)
os.remove(zfile.path)
def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): def _store_as_modelcif(
data_json, ost_ent, out_dir, file_prfx, compress, add_files
):
"""Mix all the data into a ModelCIF file.""" """Mix all the data into a ModelCIF file."""
print(" generating ModelCIF objects...", end="") print(" generating ModelCIF objects...", end="")
pstart = timer() pstart = timer()
...@@ -876,18 +883,23 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): ...@@ -876,18 +883,23 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress):
print(" processing QA scores...", end="", flush=True) print(" processing QA scores...", end="", flush=True)
pstart = timer() pstart = timer()
mdl_name = os.path.basename(file_prfx) mdl_name = os.path.basename(file_prfx)
system.repositories.append(model.add_scores(data_json, system.id, mdl_name)) system.repositories.append(
model.add_scores(data_json, system.id, mdl_name, add_files)
)
print(f" ({timer()-pstart:.2f}s)") print(f" ({timer()-pstart:.2f}s)")
system.model_groups.append( system.model_groups.append(
modelcif.model.ModelGroup([model], name=data_json["model_group_name"]) modelcif.model.ModelGroup([model], name=data_json["model_group_name"])
) )
ref_dbs = _get_sequence_dbs(data_json["config_data"]["seq_dbs"]) system.protocols.append(
protocol = _get_modelcif_protocol( _get_modelcif_protocol(
data_json["protocol"], system.target_entities, model, ref_dbs data_json["protocol"],
system.target_entities,
model,
_get_sequence_dbs(data_json["config_data"]["seq_dbs"]),
)
) )
system.protocols.append(protocol)
# write modelcif System to file # write modelcif System to file
print(" write to disk...", end="", flush=True) print(" write to disk...", end="", flush=True)
...@@ -896,17 +908,48 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress): ...@@ -896,17 +908,48 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, file_prfx, compress):
# -> hence we cheat by changing path and back while being exception-safe... # -> hence we cheat by changing path and back while being exception-safe...
oldpwd = os.getcwd() oldpwd = os.getcwd()
os.chdir(out_dir) os.chdir(out_dir)
mdl_fle = f"{mdl_name}.cif"
try: try:
with open(f"{mdl_name}.cif", "w", encoding="ascii") as mmcif_fh: with open(mdl_fle, "w", encoding="ascii") as mmcif_fh:
modelcif.dumper.write(mmcif_fh, [system]) modelcif.dumper.write(mmcif_fh, [system])
_package_associated_files(mdl_name) _package_associated_files(system.repositories[0])
if compress: if compress:
_compress_cif_file(f"{mdl_name}.cif") _compress_cif_file(mdl_fle)
finally: finally:
os.chdir(oldpwd) os.chdir(oldpwd)
print(f" ({timer()-pstart:.2f}s)") print(f" ({timer()-pstart:.2f}s)")
mdl_fle = _get_assoc_mdl_file(mdl_fle, data_json)
zip_fle = _get_assoc_zip_file(
system.repositories[0].files[0].path, data_json
)
return mdl_fle, zip_fle
def _get_assoc_mdl_file(fle_path, data_json):
    """Generate a modelcif.associated.File object that looks like a CIF file.

    The dedicated CIFFile functionality in modelcif would also try to write it.

    :param fle_path: Path of the model mmCIF file.
    :param data_json: Model data; 'mdl_num' and 'rank_num' go into the
                      file details.
    :returns: modelcif.associated.File with file_format set to "cif".
    """
    cfile = modelcif.associated.File(
        fle_path,
        details=f"model {data_json['mdl_num']}; rank {data_json['rank_num']}",
    )
    # Tag as CIF manually instead of using CIFFile (see docstring why).
    cfile.file_format = "cif"
    return cfile
def _get_assoc_zip_file(fle_path, data_json):
    """Create a modelcif.associated.File object that looks like a ZIP file.

    This is NOT the archive ZIP file for the PAEs but to store that in the
    ZIP archive of the selected model.

    :param fle_path: Path of the ZIP archive of a non-selected model.
    :param data_json: Model data; 'mdl_num' and 'rank_num' go into the
                      file details.
    :returns: modelcif.associated.File with file_format set to "zip".
    """
    zfile = modelcif.associated.File(
        fle_path,
        details="archive with multiple files for model "
        + f"{data_json['mdl_num']}; rank {data_json['rank_num']}",
    )
    zfile.file_format = "zip"
    return zfile
def _create_interaction_json(config_data): def _create_interaction_json(config_data):
"""Create a dictionary (mimicking JSON) that contains data which is the same """Create a dictionary (mimicking JSON) that contains data which is the same
...@@ -931,6 +974,41 @@ def _create_model_json(data, pdb_file, up_acs, block_id): ...@@ -931,6 +974,41 @@ def _create_model_json(data, pdb_file, up_acs, block_id):
return ost_ent return ost_ent
def _translate2modelcif(up_acs, pdb_fle, config_data, opts, add_files):
    """Convert a PDB file with its accompanying data to ModelCIF.

    :param up_acs: UniProtKB ACs of the interacting proteins.
    :param pdb_fle: Model file name, relative to opts.model_dir.
    :param config_data: Parsed ColabFold config shared by all models of the
                        modelling project.
    :param opts: Parsed command-line options.
    :param add_files: Extra modelcif.associated.File objects to pack into the
                      model's ZIP archive (None for non-selected models).
    :returns: Tuple of (start time, full PDB path, model-CIF associated file,
              ZIP associated file).
    """
    pdb_start = timer()
    file_prfx, uid = _check_model_extra_files_present(opts.model_dir, pdb_fle)
    pdb_fle = os.path.join(opts.model_dir, pdb_fle)

    # gather data into JSON-like structure
    print("  preparing data...", end="")
    pstart = timer()

    mdlcf_json = _create_interaction_json(config_data)

    # uid = ..._rank_X_model_Y.pdb
    mdl_name_parts = uid.split("_")
    assert mdl_name_parts[-4] == "rank"
    assert mdl_name_parts[-2] == "model"
    mdlcf_json["rank_num"] = int(mdl_name_parts[-3])
    mdlcf_json["mdl_num"] = int(mdl_name_parts[-1])

    ost_ent = _create_model_json(mdlcf_json, pdb_fle, up_acs, uid)

    # read quality scores from JSON file
    _get_scores(mdlcf_json, file_prfx)
    print(f" ({timer()-pstart:.2f}s)")

    mdlcf_fle, zip_fle = _store_as_modelcif(
        mdlcf_json,
        ost_ent,
        opts.out_dir,
        file_prfx,
        opts.compress,
        add_files,
    )
    return pdb_start, pdb_fle, mdlcf_fle, zip_fle
def _main(): def _main():
"""Run as script.""" """Run as script."""
opts = _parse_args() opts = _parse_args()
...@@ -945,45 +1023,49 @@ def _main(): ...@@ -945,45 +1023,49 @@ def _main():
config_data = _parse_colabfold_config(cnfg) config_data = _parse_colabfold_config(cnfg)
# iterate model directory # iterate model directory
found_ranked = False # There is 1 representative for a modelling project, the other models are
# stored in its ZIP archive.
not_slctd_mdls = []
slctd_mdl = None
for fle in sorted(os.listdir(opts.model_dir)): for fle in sorted(os.listdir(opts.model_dir)):
# iterate PDB files # iterate PDB files
if not fle.endswith(".pdb"): if not fle.endswith(".pdb"):
continue continue
if opts.rank is not None and f"rank_{opts.rank}" not in fle: if (
opts.selected_rank is not None
and f"rank_{opts.selected_rank}" in fle
):
slctd_mdl = fle
continue continue
found_ranked = True
print(f" translating {fle}...") print(f" translating {fle}...")
pdb_start = timer() pdb_start, fle, mdlcf_fle, zip_fle = _translate2modelcif(
file_prfx, uid = _check_model_extra_files_present(opts.model_dir, fle) up_acs,
fle = os.path.join(opts.model_dir, fle) fle,
config_data,
# gather data into JSON-like structure opts,
print(" preparing data...", end="") None,
pstart = timer()
mdlcf_json = _create_interaction_json(config_data)
# uid = ..._rank_X_model_Y.pdb
mdl_name_parts = uid.split("_")
assert mdl_name_parts[-4] == "rank"
assert mdl_name_parts[-2] == "model"
mdlcf_json["rank_num"] = int(mdl_name_parts[-3])
mdlcf_json["mdl_num"] = int(mdl_name_parts[-1])
ost_ent = _create_model_json(mdlcf_json, fle, up_acs, uid)
# read quality scores from JSON file
_get_scores(mdlcf_json, file_prfx)
print(f" ({timer()-pstart:.2f}s)")
_store_as_modelcif(
mdlcf_json, ost_ent, opts.out_dir, file_prfx, opts.compress
) )
print(f" ... done with {fle} ({timer()-pdb_start:.2f}s).") print(f" ... done with {fle} ({timer()-pdb_start:.2f}s).")
not_slctd_mdls.append(mdlcf_fle)
not_slctd_mdls.append(zip_fle)
if opts.selected_rank:
if slctd_mdl is None:
_abort_msg(
f"Could not find model of requested rank '{opts.selected_rank}'"
)
print(
f" translating selected model {opts.selected_rank} "
+ f"({slctd_mdl})..."
)
_translate2modelcif(
up_acs,
slctd_mdl,
config_data,
opts,
not_slctd_mdls,
)
print(f" ... done with {slctd_mdl} ({timer()-pdb_start:.2f}s).")
if opts.rank and not found_ranked:
_abort_msg(f"Could not find model of requested rank '{opts.rank}'")
print(f"... done with {opts.model_dir}.") print(f"... done with {opts.model_dir}.")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment