diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..a6bc05813438bab2b2b92554385905fdf5122c11 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,59 @@ +ARG VERSION_OST="2.3.0" +FROM registry.scicore.unibas.ch/schwede/openstructure:${VERSION_OST} +# We need to declare ARGs again which were declared before the build stage +# (FROM directive), otherwise they won't be available in this stage. +ARG VERSION_OST + +## Set up environment +ENV SRC_DIR="/tmp" \ + VERSION_OST=${VERSION_OST} \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 + + +LABEL org.openstructure.base-image="${VERSION_OST}" +LABEL org.openstructure.translate2modelcif="2022-04-08.1" +LABEL maintainer="Stefan Bienert <stefan.bienert@unibas.ch>" +LABEL vendor1="Schwede Group (schwedelab.org)" +LABEL vendor2="SIB - Swiss Institute of Bioinformatics (sib.swiss)" +LABEL vendor3="Biozentrum - University of Basel (biozentrum.unibas.ch)" + + +## Install python-modelcif and python-ihm +COPY requirements.txt ${SRC_DIR} +WORKDIR ${SRC_DIR} +RUN set -e pipefail; \ + apt-get update -y; \ + apt-get install -y git pip; \ + pip install -r requirements.txt; \ + git clone https://github.com/ihmwg/python-ihm.git ihm.git; \ + cd ihm.git; \ + python3 setup.py build; \ + python3 setup.py install; \ + rm -rf ${SRC_DIR}/ihm.git; \ + cd ${SRC_DIR}; \ + git clone https://github.com/ihmwg/python-modelcif.git modelcif.git; \ + cd modelcif.git; \ + python3 setup.py build; \ + python3 setup.py install; \ + rm -rf ${SRC_DIR}/modelcif.git; \ + apt-get remove -y git pip + +## Copy tool(s) +COPY --chmod=755 translate2modelcif.py /usr/local/bin/translate2modelcif + +## Add a dedicated user +## MMCIF_USER_ID can be used to avoid file permission issues. +ARG MMCIF_USER_ID=501 +RUN adduser --system -u ${MMCIF_USER_ID} mmcif + + +#COPY --chmod=755 docker-entrypoint.sh / + +USER mmcif + +#ENTRYPOINT ["/docker-entrypoint.sh"] + +# LocalWords: ARG OST ARGs ENV SRC tmp PYTHONUNBUFFERED Schwede schwedelab py +# LocalWords: PYTHONDONTWRITEBYTECODE Bioinformatics sib swiss Biozentrum ihm +# LocalWords: modelcif txt WORKDIR pipefail chmod adduser mmcif ENTRYPOINT diff --git a/pyproject.toml b/pyproject.toml index b95b5ec25a2eef2ff5629d07e649ba9b7f960547..90392da16a7316df546dc4d16157320679e4943f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,11 @@ [tool.black] line-length=80 +[tool.pylint.MASTER] +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-allow-list='ujson' + [tool.pylint.REPORTS] reports='no' diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2abd7d3a956e774cb5b3d3765de5e9b9d66f2d96 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests +ujson diff --git a/translate2modelcif.py b/translate2modelcif.py index a5572989bfa51aded4582e46a055180d351d9540..6a3ded6411c754526fbd519e878ef39151aab906 100644 --- a/translate2modelcif.py +++ b/translate2modelcif.py @@ -1,3 +1,4 @@ +#! /usr/local/bin/ost """Translate models from Tara/ Xabi from PDB + extra data into ModelCIF.""" import argparse @@ -5,10 +6,14 @@ import datetime import os import sys +from timeit import default_timer as timer +import numpy as np import requests +import ujson as json import ihm import modelcif +import modelcif.associated import modelcif.dumper import modelcif.model import modelcif.reference @@ -34,6 +39,8 @@ def _parse_args(): opts = parser.parse_args() # check that model dir exists + if opts.model_dir.endswith("/"): + opts.model_dir = opts.model_dir[:-1] if not os.path.exists(opts.model_dir): _abort_msg(f"Model directory '{opts.model_dir}' does not exist.") if not os.path.isdir(opts.model_dir): @@ -42,12 +49,52 @@ def _parse_args(): return opts +# pylint: disable=too-few-public-methods +class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM): + """Predicted accuracy according to the TM-score score in [0,1].""" + + # ToDo [input]: Add software + name = "pTM" + software = None + + +class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100].""" + + name = "pLDDT" + software = None + + +class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT): + """Predicted accuracy according to the CA-only lDDT in [0,100].""" + + name = "pLDDT" + software = None + + +class _PAE(modelcif.qa_metric.MetricType): + """Predicted aligned error (in Angstroms). + See :class:`MetricType` for more information.""" + + type = "PAE" + other_details = None + + +class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, _PAE): + """predicted aligned error (in Angstroms).""" + + name = "PAE" + software = None + + +# pylint: enable=too-few-public-methods + + class _OST2ModelCIF(modelcif.model.AbInitioModel): """Map OST entity elements to ihm.model""" def __init__(self, *args, **kwargs): """Initialise a model""" - # check that ost_entity exists, TypeError otherwise self.ost_entity = kwargs.pop("ost_entity") self.asym = kwargs.pop("asym") @@ -69,6 +116,70 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel): occupancy=atm.occupancy, ) + def add_scores(self, scores_json, entry_id, ac_file_prfx): + """Add QA metrics from AF2 scores.""" + # global scores + self.qa_metrics.extend( + ( + _GlobalPLDDT(np.mean(scores_json["plddt"])), + _GlobalPTM(scores_json["ptm"]), + ) + ) + + # local scores + lpae = [] + i = 0 + for chn_i in self.ost_entity.chains: + for res_i in chn_i.residues: + # local pLDDT + self.qa_metrics.append( + _LocalPLDDT( + self.asym[chn_i.name].residue(res_i.number.num), + scores_json["plddt"][i], + ) + ) + + # pairwise alignment error + j = 0 + # We do a 2nd iteration over the OST entity above doing index + # magic because it keeps the code cleaner and is only around + # 0.3s slower than iterating the array directly. Majority of + # time goes into writing files, anyway. + for chn_j in self.ost_entity.chains: + for res_j in chn_j.residues: + lpae.append( + _LocalPairwisePAE( + self.asym[chn_i.name].residue(res_i.number.num), + self.asym[chn_j.name].residue(res_j.number.num), + scores_json["pae"][i][j], + ) + ) + j += 1 + + i += 1 + + self.qa_metrics.extend(lpae) + + ac_file = f"{os.path.basename(ac_file_prfx)}_local_pairwise_qa.cif" + + return modelcif.associated.Repository( + "", + [ + modelcif.associated.LocalPairwiseQAScoresFile( + ac_file, + categories=["_ma_qa_metric_local_pairwise"], + copy_categories=["_ma_qa_metric"], + entry_id=entry_id, + entry_details="This file is an associated file consisting " + + "of local pairwise QA metrics. This is a partial mmCIF " + + "file and can be validated by merging with the main " + + "mmCIF file containing the model coordinates and other " + + "associated data.", + details="Predicted aligned error.", + ) + ], + ) + def _abort_msg(msg, exit_code=1): """Write error message and exit with exit_code.""" @@ -259,21 +370,35 @@ def _get_entities(pdb_file, up_acs): return entities, ost_ent +def _get_scores(data, prfx): + """Check that all files needed to process this model are present.""" + scrs_fle = f"{prfx}_scores.json" + with open(scrs_fle, encoding="utf8") as jfh: + scrs_json = json.load(jfh) + + data.update(scrs_json) + + def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx): """Mix all the data into a ModelCIF file.""" + print(" generating ModelCIF objects...", end="") + pstart = timer() + # ToDo [internal]: Get protocol/ software + # ToDo [internal]: Get QA metrics # create system to gather all the data system = modelcif.System( title=data_json["title"], id=interaction_name.upper(), model_details=data_json["model_details"], ) - # create target entities, references, source, asymmetric units... + # create target entities, references, source, asymmetric units & assembly # for source we assume all chains come from the same taxon source = ihm.source.Natural( ncbi_taxonomy_id=data_json["target_entities"][0]["up_ncbi_taxid"], scientific_name=data_json["target_entities"][0]["up_organism"], ) + # create an asymmetric unit and an entity per target sequence asym_units = {} for cif_ent in data_json["target_entities"]: # ToDo [input]: Get entity description @@ -321,6 +446,14 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx): ost_entity=ost_ent, name="ma_model_list.model_name", ) + print(f" ({timer()-pstart:.2f}s)") + print(" processing QA scores...", end="", flush=True) + pstart = timer() + system.repositories.append( + model.add_scores(data_json, system.id, file_prfx) + ) + print(f" ({timer()-pstart:.2f}s)") + # ToDo [input]: Get name model_group = modelcif.model.ModelGroup( [model], name="ma_model_list.model_group_name" @@ -328,8 +461,11 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx): system.model_groups.append(model_group) # write modelcif System to file + print(" write to disk...", end="", flush=True) + pstart = timer() with open(f"{file_prfx}.cif", "w", encoding="utf8") as mmcif_fh: modelcif.dumper.write(mmcif_fh, [system]) + print(f" ({timer()-pstart:.2f}s)") def _create_interaction_json(): @@ -370,14 +506,23 @@ def _main(): # iterate PDB files if not fle.endswith(".pdb"): continue + print(f" translating {fle}...") + pdb_start = timer() file_prfx, uid = _check_model_extra_files_present(opts.model_dir, fle) fle = os.path.join(opts.model_dir, fle) # gather data into JSON-like structure + print(" preparing data...", end="") + pstart = timer() ost_ent = _create_model_json(mdlcf_json, fle, up_acs) + # read quality scores from JSON file + _get_scores(mdlcf_json, file_prfx) + print(f" ({timer()-pstart:.2f}s)") + _store_as_modelcif(uid, mdlcf_json, ost_ent, file_prfx) # ToDo [internal]: wipe data or is it overwritten in mdlcf_json? + print(f" ... done with {fle} ({timer()-pdb_start:.2f}s).") print(f"... done with {opts.model_dir}.") @@ -392,4 +537,7 @@ if __name__ == "__main__": # LocalWords: sqe olc ACDEFGHIKLMNPQRSTVWY RuntimeError upkb txt pylint iter # LocalWords: rspns unicode startswith sline len elif NCBI TaxID ncbi taxid # LocalWords: seqlen crc ISOFORM DT dt flds isoforms isoform ent LoadPDB ihm -# LocalWords: mdlcf mdlcif asym AsymUnit +# LocalWords: mdlcf mdlcif asym AsymUnit init kwargs atm pos het hetatom pTM +# LocalWords: biso ujson GlobalPTM pLDDT ptm jfh numpy np GlobalPLDDT lDDT +# LocalWords: plddt LocalPLDDT timeit PAE MetricType LocalPairwisePAE lpae +# LocalWords: nd pae qa pstart