diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..60ce4b66d28d3f142abee00425cc53e98dc572f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +\.DS_Store diff --git a/README.md b/README.md index fe1fefb731d261952b138ed9b2ce3dc13136d86c..2d39a95e7e1ccca4660fcdfdfa6df77827ea1737 100644 --- a/README.md +++ b/README.md @@ -2,5 +2,22 @@ This is a collection of tools, code and examples used for converting user's modelling projects into [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) for upload to [ModelArchive](https://www.modelarchive.org) (MA). The whole repository has grown over time from bulk uploads to MA, we assisted users with. Do not expect everything to work out of the box, old conversion projects may be outdated by now. -<!-- LocalWords: modelcif ModelArchive + +## ModelCIF resources + +The official documentation of [ModelCIF](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) and [PDBx/mmCIF](https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Index/) can be found here: https://mmcif.wwpdb.org + +ModelCIF dictionary files can be found in the official repository: https://github.com/ihmwg/ModelCIF + + +## Directories + +A short overview of the directories in this repository and what they do. + +|Path |Content | +|-----------|----------------------------------------------------------------| +|[projects/](projects/) |Collection of model conversions done for various user projects. | +|[validation/](validation/)|A tool to check the formatting of ModelCIF files. | + +<!-- LocalWords: modelcif ModelArchive PDBx --> diff --git a/pyproject.toml b/pyproject.toml index b95b5ec25a2eef2ff5629d07e649ba9b7f960547..a70eddfb5aaf01d4e8e6aa40222a8213facbdf91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,3 +3,9 @@ line-length=80 [tool.pylint.REPORTS] reports='no' + +[tool.pylint.MASTER] +# A comma-separated list of package or module names from where C extensions may +# be loaded. 
Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-allow-list='rapidjson' \ No newline at end of file diff --git a/validation/Dockerfile b/validation/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..8b4e6baa1a7211b7f1fedd066e09e7787a000972 --- /dev/null +++ b/validation/Dockerfile @@ -0,0 +1,159 @@ +ARG VERSION_PYTHON="3.7" +ARG VERSION_BASE_IMAGE="python:${VERSION_PYTHON}-alpine3.16" +FROM ${VERSION_BASE_IMAGE} +# We need to declare ARGs again which were declared before the build stage +# (FROM directive), otherwise they won't be available in this stage. +ARG VERSION_PYTHON +ARG VERSION_BASE_IMAGE + +ARG VERSION_CPP_DICT_PACK="v2.500" +ARG VERSION_PY_MMCIF="0.76" + +## Set up environment +ENV MMCIF_DICTS_DIR="/usr/local/share/mmcif-dict-suite" \ + SRC_DIR="/tmp" \ + VERSION_CPP_DICT_PACK=${VERSION_CPP_DICT_PACK} \ + VERSION_BASE_IMAGE=${VERSION_BASE_IMAGE} \ + VERSION_PYTHON=${VERSION_PYTHON} \ + VERSION_PY_MMCIF=${VERSION_PY_MMCIF} \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 + + +LABEL org.modelarchive.base-image="${VERSION_BASE_IMAGE}" +LABEL org.modelarchive.cpp-dict-pack.version="${VERSION_CPP_DICT_PACK}" +LABEL maintainer="Stefan Bienert <stefan.bienert@unibas.ch>" +LABEL vendor1="Schwede Group (schwedelab.org)" +LABEL vendor2="SIB - Swiss Institute of Bioinformatics (sib.swiss)" +LABEL vendor3="Biozentrum - University of Basel (biozentrum.unibas.ch)" + +## Install the RCSB CPP Dict Suite (only the binaries we need) +WORKDIR ${SRC_DIR} +RUN set -e pipefail; \ + export DICT_PACK_SRC_DIR="${SRC_DIR}/cpp-dict-pack.git"; \ + apk update; \ + apk upgrade; \ + apk add abuild binutils bison build-base cmake flex git gcc \ + extra-cmake-modules tcsh; \ + # + ## Install the RCSB mmCIF Dict Suite + git clone -b ${VERSION_CPP_DICT_PACK} \ + --single-branch --recurse-submodules \ + https://github.com/rcsb/cpp-dict-pack.git \ + ${DICT_PACK_SRC_DIR}; \ + mkdir 
${DICT_PACK_SRC_DIR}/build; \ + cd ${DICT_PACK_SRC_DIR}; \ + cd ${DICT_PACK_SRC_DIR}/build; \ + cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON; \ + make; \ + for cif_tool in CifCheck DictToSdb; do \ + mv bin/${cif_tool} /usr/local/bin; \ + done; \ + cd ${SRC_DIR}; \ + rm -r ${DICT_PACK_SRC_DIR}; \ + # + ## Install the RCSB py-mmcif Python module + /usr/local/bin/python -m pip install --upgrade pip; \ + /usr/local/bin/python -m pip install mmcif==${VERSION_PY_MMCIF} \ + python-rapidjson; \ + # + ## Clean up/ remove unnecessary stuff + apk del abuild binutils bison build-base cmake flex git gcc \ + extra-cmake-modules tcsh; \ + apk add libstdc++ + + +## Copy tools/ entrypoint script +COPY --chmod=755 get-mmcif-dict-versions.py \ + /usr/local/bin/get-mmcif-dict-versions +COPY --chmod=755 entrypoint.sh / +COPY --chmod=755 validate-mmcif-file.py /usr/local/bin/validate-mmcif-file + + +## Create dictionaries for validating mmCIF files. +## The version to be created is controlled by build argument USE_DICT_VERSION. +## Use a versions vailable at +## https://github.com/ihmwg/ModelCIF/tree/master/archive or the special word +## "latest", that will use +## https://github.com/ihmwg/ModelCIF/blob/master/dist/mmcif_ma.dic. +## Dictionaries do not change that frequently therefore we skip the hassle of +## keeping them in an external volume. 
+ARG USE_DICT_VERSION="1.4.2" +ENV USE_DICT_VERSION=${USE_DICT_VERSION} +LABEL org.modelarchive.dict_release="${USE_DICT_VERSION}" +WORKDIR ${SRC_DIR} +RUN set -e pipefail; \ + apk add curl; \ + export _DICT_DIR="${SRC_DIR}/mmcif_dicts"; \ + export _DICT_URL="https://mmcif.wwpdb.org/dictionaries/ascii"; \ + export _GIT_URL="https://raw.github.com/ihmwg/ModelCIF/master"; \ + export _MA_DICT_URL="${_GIT_URL}/archive/mmcif_ma-v${USE_DICT_VERSION}.dic"; \ + mkdir ${_DICT_DIR}; \ + mkdir ${MMCIF_DICTS_DIR}; \ + cd ${_DICT_DIR}; \ + # + ## Fetch the dictionary definition language + curl ${_DICT_URL}/mmcif_ddl.dic.gz -s -o mmcif_ddl.dic.gz; \ + gunzip *.gz; \ + # + ## Fetch the merged ModelCIF dictionary + if test x${USE_DICT_VERSION} == xlatest; then \ + export _MA_DICT_URL="${_GIT_URL}/dist/mmcif_ma.dic"; \ + fi; \ + curl ${_MA_DICT_URL} -s -L -o mmcif_ma.dic; \ + # + ## Build the ModelCIF SDB + DictToSdb -ddlFile mmcif_ddl.dic \ + -dictFile mmcif_ma.dic \ + -dictSdbFile ${MMCIF_DICTS_DIR}/mmcif_ma.sdb; \ + # + ## Fetch the stable PDBx/mmCIF dictionary + curl ${_DICT_URL}/mmcif_pdbx_v50.dic -s -o mmcif_pdbx_v50.dic; \ + # + ## Build the PDBx/mmCIF SDB + DictToSdb -ddlFile mmcif_ddl.dic \ + -dictFile mmcif_pdbx_v50.dic \ + -dictSdbFile ${MMCIF_DICTS_DIR}/mmcif_pdbx_v50.dic.sdb; \ + # + ## Get versions of ModelCIF & PDBx/mmCIF dictionaries + get-mmcif-dict-versions --parent-location ${_GIT_URL}/base/mmcif_pdbx_v50.dic \ + --child-location ${_MA_DICT_URL} \ + mmcif_ma.dic; \ + mv mmcif_ma_version.json ${MMCIF_DICTS_DIR}/; \ + # + ## Make SDBs readable and keep possible error logs from building them + mv *.log ${MMCIF_DICTS_DIR}/ 2>/dev/null || :; \ + chmod o+r ${MMCIF_DICTS_DIR}/*; \ + # + ## Clean up + cd ${SRC_DIR}; \ + rm -r ${_DICT_DIR}; \ + apk del curl + + +# for development +#RUN set -e pipefail; \ +# apk add bash emacs gcc build-base; \ +# /usr/local/bin/python -m pip install pylint black; \ +# apk del gcc build-base + +## Add a dedicated user for mmCIF file 
validation +## MMCIF_USER_ID can be used to avoid file permission issues in development. +ARG MMCIF_USER_ID=501 +RUN adduser -S -u ${MMCIF_USER_ID} mmcif-vldtr + +USER mmcif-vldtr + +ENTRYPOINT ["/entrypoint.sh"] + +# have tool ready +# - entrypoint: validate... just runs validation, celery runs celery, CMD else +# write Python to run & check mmCIF +# - Note dictionary versions in the mmCIF file! +# for Celery: +# - depends_on without implementing the 'waits' in this entrypoint.sh: +# https://marcopeg.com/docker-compose-healthcheck/ + + +# LocalWords: ENV DICTS SRC tmp schwedelab RCSB WORKDIR pipefail apk dev ARG +# LocalWords: ARGs diff --git a/validation/README.md b/validation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5f6407c1fd1bf36f67e45e142adab5efe10a0403 --- /dev/null +++ b/validation/README.md @@ -0,0 +1,125 @@ +[[_TOC_]] + +# ModelCIF validation tool + +This is a tool to check that the formatting of ModelCIF files complies with the ModelCIF format declaration (aka "dictionary"). Upon successful validation, a ModelCIF file can be extended with the dictionary version the file was compared to (option [`--extend-validated-file`](#add-validation-dictionary-to-ModelCIF-file)). For more basic mmCIF validation, the dictionary of the underlying PDBx/mmCIF format is also available. + +The easiest way to run validation is from Docker container. + +The tool itself is a wrapper around the [`CifCheck`](https://github.com/rcsb/cpp-dict-pack) tool by [RCSB](https://www.rcsb.org/). + + +## How to run the validation tool + +This is just a description of the validation tool itself. When running it from inside a container, the command needs to be prefixed with the instructions to start the container. Find information for running the validation container in "[How to run the container](#how-to-run-the-container)". 
+ +Upon completion, if there hasn't been any error running the command, the validation tool returns a concise report in JSON format. That output is meant to be input to a website or any kind of nicely formatted report. Output can also be stored as a file. If the tested ModelCIF file is fully compliant with the ModelCIF format, the JSON output has + +- `status` "completed" +- no messages in the `diagnosis` list +- `versions` of the dictionaries the file was tested against + +Format violations will be listed in `diagnosis`. + +The most basic way to invoke the validation tool is just with a ModelCIF file (example shows the command plus possible output): + +```bash +$ validate-mmcif-file model.cif +{"status":"completed","diagnosis":[],"versions":[{"title":"mmcif_pdbx_v50.dic","version":"5.358","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/base/mmcif_pdbx_v50.dic"},{"title":"mmcif_ma.dic","version":"1.4.0","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/dist/mmcif_ma.dic"}]} +$ +``` + +### Add dictionary information used for validation to ModelCIF file + +Since both dictionaries, ModelCIF and PDBx/mmCIF, represent actively developed file formats, different versions exist. While extending them, quite some thinking goes into making only non-breaking changes. The idea is that a ModelCIF file formatted following dictionary 1.3, is still valid with version 1.4. But the version number also tells you which feature to expect from a ModelCIF file, so it seems like a good idea to keep the version inside the file. + +The validation tool can add the version upon positive validation, enabled by the `--extend-validated-file` (`-e`). + +`-e` can take an alternative file name to write the validated ModelCIF file to, e.g. 
if one wants to keep the original ModelCIF file unaltered: +```bash +$ validate-mmcif-file -e validated_model.cif model.cif +{"status":"completed","diagnosis":[],"versions":[{"title":"mmcif_pdbx_v50.dic","version":"5.358","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/base/mmcif_pdbx_v50.dic"},{"title":"mmcif_ma.dic","version":"1.4.0","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/dist/mmcif_ma.dic"}]} +$ +``` +The last command will generate a new file `validated_model.cf` upon positive validation (`diagnosis` points to an empty list), with the `versions` added to the `_audit_conform` list inside the file. + +To add the validation dictionaries to `_audit_conform` in the original ModelCIF file, just invoke `-e` without an alternative file name... well almost. By the way Python handles this kind of command line arguments, `-e` consumes everything after it, that does not start with a `-`, as a file name. So `validate-mmcif-file -e model.cif` would mean that `-e` assumes `model.cif` as its file name but then the command fails because it is missing the ModelCIF file to be validated. 
The solution is either putting `-e` at the beginning of the arguments list or after the ModelCIF file name at the very end, if there are no other command line arguments: + +```bash +$ validate-mmcif-file model.cif -e +{"status":"completed","diagnosis":[],"versions":[{"title":"mmcif_pdbx_v50.dic","version":"5.358","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/base/mmcif_pdbx_v50.dic"},{"title":"mmcif_ma.dic","version":"1.4.0","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/dist/mmcif_ma.dic"}]} +$ +``` + +### Base directory for associated files + +For a ModelCIF file using the `_ma_entry_associated_files` category, the validation tool tries to merge associated data into the ModelCIF file, if `_ma_entry_associated_files.file_format` is `cif`. That way the outsourced data is validated, too. + +Command line argument `--associates-dir` (`-a`) is used to declare the base directory associated files are stored in. Inside the directory, the path must follow what is defined in `_ma_entry_associated_files.file_url`. If the URL is just the file name, the file must be stored right in the associates directory. 
The following example works for `_ma_entry_associated_files.file_url model_pae.cif` + +```bash +$ grep _ma_entry_associated_files.file_url model.cif +_ma_entry_associated_files.file_url model_pae.cif +$ ls extra +model_pae.cif +$ validate-mmcif-file -a extra model.cif +{"status":"completed","diagnosis":[],"versions":[{"title":"mmcif_pdbx_v50.dic","version":"5.358","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/base/mmcif_pdbx_v50.dic"},{"title":"mmcif_ma.dic","version":"1.4.0","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/dist/mmcif_ma.dic"}]} +$ +``` + +If the URL points to a subdirectory, this must be reflected by the associates directory tree declared to the validation tool. The following example illustrates that the `extra` directory needs a `pae` directory storing the associated file as expected by `_ma_entry_associated_files.file_url`: + +```bash +$ grep _ma_entry_associated_files.file_url model.cif +_ma_entry_associated_files.file_url pae/model_pae.cif +$ ls extra +pae +$ ls extra/pae +model_pae.cif +$ validate-mmcif-file -a extra model.cif +{"status":"completed","diagnosis":[],"versions":[{"title":"mmcif_pdbx_v50.dic","version":"5.358","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/base/mmcif_pdbx_v50.dic"},{"title":"mmcif_ma.dic","version":"1.4.0","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/dist/mmcif_ma.dic"}]} +$ +``` + +### Misc. arguments + +**`--help`** (**`-h`**) Print a help/ usage page for the validation tool. + +**`--dict-sdb <SDB FILE>`** (**`-d`**) Format dictionary in (binary) SDB format used for validating a ModelCIF file. The container comes with a SDB for ModelCIF and one for the original PDBx/mmCIF format. + +**`--out-file <JSON FILE>`** (**`-o`**) Instead of printing the output to `stdout`, store it in a JSON file. 
+ +**`--verbose`** (**`-v`**) Write information from intermediate steps to `stdout`. This includes the raw output of `CifCheck`. + + +## How to run the container + +The call to the validation tool (almost) stays the same, it just needs instructions to start the Docker container as a prefix: + +```bash +$ docker run --rm -v /home/user/models:/data registry.scicore.unibas.ch/schwede/mabakerimport/mmcif-dict-suite:dev validate-mmcif-file /data/model.cif +{"status":"completed","diagnosis":[],"versions":[{"title":"mmcif_pdbx_v50.dic","version":"5.358","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/base/mmcif_pdbx_v50.dic"},{"title":"mmcif_ma.dic","version":"1.4.0","location":"https://raw.githubusercontent.com/ihmwg/ModelCIF/a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f/dist/mmcif_ma.dic"}]} +$ +``` + +`docker run` is the call to execute a certain command inside a container. `--rm` makes sure that the container is removed from the system once the job completed. + +Since the container has its own internal file system separated, `-v` is utilised to mount a directory from the host into the container. 
+ +- explain what command does +- explain volumes/ external mounts +- explain for -e +- explain for -a + +## How to pull a copy of the container from our registry + +- since we use it ourselves and are involved in the development, we usually notice when a new dictionary comes out + +## How to build the container from scratch + + +# Files in this directory + +<!-- LocalWords: PDBx ModelCIF TOC JSON CifCheck RCSB mmcif cif pdbx dic dir + LocalWords: url pae sdb SDB stdout + --> diff --git a/validation/entrypoint.sh b/validation/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..224d7d611ffc4bb09f1702607da0daa02e5721ec --- /dev/null +++ b/validation/entrypoint.sh @@ -0,0 +1,30 @@ +#!/bin/sh +## (We use sh since Alpine does not have Bash by default) + + +## exit immediately on commands with a non-zero exit status. +set -euo pipefail + +## When started without any arguments, "-h", "--help", "-help" or "help", print +## usage. +if [ $# -eq 0 ] || [ x$1 == x"-h" ] || [ x$1 == x"--help" ] || + [ x$1 == x"-help" ] || [ x$1 == x"help" ]; then + echo " mmCIF file format validation tool." + echo "------------------------------------------" + echo "Provided by SWISS-MODEL / Schwede group" + echo "(swissmodel.expasy.org / schwedelab.org)" + echo "" + echo "This container checks that mmCIF files are" + echo "properly formatted according to the" + echo "MAX/ mmCIF dictionary. At the moment," + echo "there is one tool available that acts as a" + echo "command line tool: validate-mmcif-file." + echo "For further usage information, call this" + echo "container executing" + echo "'validate-mmcif-file --help'." 
#! /usr/local/bin/python
"""Get version and location of relevant mmCIF dictionaries for ModelCIF.

Fetch the versions of the ModelCIF dictionary and the PDBx/mmCIF dictionary
used to build it into a JSON file.
"""
# pylint: disable=invalid-name
# pylint: enable=invalid-name

import argparse
import sys


def _parse_command_line():
    """Get arguments."""
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "dic_file",
        type=str,
        metavar="<DICTIONARY FILE>",
        help="The mmCIF dictionary file to read the versions from.",
    )
    parser.add_argument(
        "--parent",
        "-p",
        type=str,
        metavar="<NAME OF PARENT DICT>",
        # fixed typo in the help text: "Name of to the" -> "Name of the"
        help="Name of the 'parent' dictionary. This is the one the other "
        + "dictionary is appended to. This is usually the mmcif_pdbx_v50.dic.",
        default="mmcif_pdbx_v50.dic",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        metavar="<PATH TO VERSION FILE>",
        help="Path to store the JSON file with the version at.",
        default="mmcif_ma_version.json",
    )
    parser.add_argument(
        "--parent-location",
        "-u",
        type=str,
        metavar="<URL OF PARENT DICT FILE>",
        help="Download location of the parent dictionary file.",
        default=None,
    )
    parser.add_argument(
        "--child-location",
        "-l",
        type=str,
        metavar="<URL OF CHILD DICT FILE>",
        help="Download location of the child dictionary file.",
        default=None,
    )

    return parser.parse_args()


def _error(msg):
    """Print a final error message to stderr and exit with status 1."""
    print(msg + "\nAborting.", file=sys.stderr)
    sys.exit(1)


def _get_data_cat(cat, file_name, data):
    """Get a data category from a mmCIF data blob; abort if it is missing."""
    obj = data.getObj(cat)
    if obj is None:
        _error(f"No '{cat}' object found in '{file_name}'.")

    return obj


def _get_data_item(itm, cat, file_name, cat_data):
    """Get a single data item from a data category; abort unless there is
    exactly one value."""
    val = cat_data.getAttributeValueList(itm)
    if len(val) != 1:
        _error(
            f"Expected exactly 1 '{cat}.{itm}' in '{file_name}', "
            + f"found '{', '.join(val)}'."
        )

    return val[0]


def _get_versions(dic_file, parent_name, io_adapter):
    """Fetch the 'dictionary' category and assemble versions for the child
    (merged) dictionary and its parent dictionary.

    Returns a tuple (child_version, parent_version) of dicts with "title" and
    "version" keys. The parent's "version" is None if parent_name is not
    listed in pdbx_dictionary_component.
    """
    dic = io_adapter.readFile(inputFilePath=dic_file)

    # fetch a data container from the list returned by the adapter
    cntnr = None
    for obj in dic:
        if "dictionary" in obj.getObjNameList():
            cntnr = obj
            break

    if cntnr is None:
        _error(f"No 'dictionary' object found in '{dic_file}'.")

    dic = _get_data_cat("dictionary", dic_file, cntnr)

    vrsn = _get_data_item("version", "dictionary", dic_file, dic)
    ttl = _get_data_item("title", "dictionary", dic_file, dic)
    dic_version = {"title": ttl, "version": vrsn}

    cmp = _get_data_cat("pdbx_dictionary_component", dic_file, cntnr)
    dc_idx = cmp.getAttributeIndex("dictionary_component_id")
    vs_idx = cmp.getAttributeIndex("version")
    # Fix: prnt_version used to be assigned only inside the loop, raising
    # UnboundLocalError whenever parent_name never matched a row. Start from
    # a defined default instead.
    prnt_version = {"title": parent_name, "version": None}
    for row in cmp:
        if row[dc_idx] == parent_name:
            prnt_version = {"title": parent_name, "version": row[vs_idx]}
            break

    return dic_version, prnt_version


def _add_dict_location(parent, child, parent_loc, child_loc):
    """Add URLs to the dictionary versions; "." marks an unknown location."""
    parent["location"] = parent_loc if parent_loc is not None else "."
    child["location"] = child_loc if child_loc is not None else "."


def _main():
    """Run as script: write the dictionary versions to a JSON file."""
    # Non-stdlib imports are deferred so the helper functions above stay
    # importable in environments without the mmcif/rapidjson packages.
    import rapidjson as json
    from mmcif.io.IoAdapterPy import IoAdapterPy

    opts = _parse_command_line()

    io_adapter = IoAdapterPy(False, sys.stdout)
    c_vrsn, p_vrsn = _get_versions(opts.dic_file, opts.parent, io_adapter)

    _add_dict_location(
        p_vrsn, c_vrsn, opts.parent_location, opts.child_location
    )
    with open(opts.output, "w", encoding="utf8") as jfh:
        json.dump({"versions": [p_vrsn, c_vrsn]}, jfh)


if __name__ == "__main__":
    _main()

# LocalWords: DictToSdb SDB PDBx CifCheck pylint mmcif pdbx dic nAborting
# LocalWords: macromolecular utf
This is the 'main' cif file of a " + + "modelling project including coordinates.", + ) + parser.add_argument( + "--associates-dir", + "-a", + type=str, + metavar="<DIR>", + help="Path to associated files, needed when the mmCIF file has " + + "external files attached.", + default=None, + ) + parser.add_argument( + "--dict-sdb", + "-d", + type=str, + metavar="<SDB FILE>", + help="The dictionary in SDB format used for checking.", + default="/usr/local/share/mmcif-dict-suite/mmcif_ma.sdb", + ) + parser.add_argument( + "--out-file", + "-o", + type=str, + metavar="<JSON FILE>", + help="Write the JSON output to file. Default is to write to stdout.", + default=None, + ) + parser.add_argument( + "--extend-validated-file", + "-e", + nargs="?", + const=" same ", + metavar="<FILE>", + help="Extend a positively validated mmCIF file with dictionary " + + "versions. If invoked without argument, write to the input model " + + "mmCIF file, otherwise specify a file name. Please note, the " + + "dictionary will not be added to the mmCIF file if there are any " + + "issues.", + default=None, + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Write some messages to stdout instead of just having it as JSON. 
def _error(msg):
    """Report a fatal problem on stderr and terminate with exit status 1."""
    print(msg + "\nAborting.", file=sys.stderr)
    sys.exit(1)


def _warn(msg):
    """Print a warning message to stderr."""
    print(f"WARNING: {msg}", file=sys.stderr)


def _parse_cifcheck_stderr(stderr):
    """Turn the stderr stream of CifCheck into a list of error strings.

    Unknown line formats are fatal (handled via _error).
    """
    errors = []
    for raw in stderr.splitlines():
        if raw.startswith("Message: "):
            # drop the 'Message: "' prefix and the trailing quote
            errors.append(raw[10:].rsplit('"', 1)[0])
            continue
        if raw.startswith("cp: "):
            errors.append(raw[4:])
            continue
        _error(f"Unknown error output found: '{raw}'")

    return errors


def _parse_parser_file(filename):
    """Collect the lines of CifCheck's parser log file, then delete it.

    Returns an empty list when no parser log exists.
    """
    parserfile = filename + "-parser.log"
    if not os.path.exists(parserfile):
        return []

    with open(parserfile, encoding="utf-8") as pfh:
        errors = [line.strip() for line in pfh]

    # remove the parser log once consumed
    os.unlink(parserfile)

    return errors
class _CifCheckFailedError(RuntimeError):
    """Raised when a CifCheck run fails; carries the collected messages."""

    def __init__(self, cifcheck_cmd, error_lst):
        """Store the failed command line and its error messages."""
        super().__init__(f"CifCheck failed for {' '.join(cifcheck_cmd)}")
        self.cifcheck_cmd = cifcheck_cmd
        self.cifcheck_errors = error_lst


def _read_mmcif(filepath_or_object):
    """Parse an mmCIF file, given either a path or an open file object.

    Returns the list of data containers found in the file.
    """
    containers = []
    if isinstance(filepath_or_object, str):
        with open(filepath_or_object, encoding="utf-8") as ifh:
            PdbxReader(ifh).read(containers)
    else:
        PdbxReader(filepath_or_object).read(containers)

    return containers
def _get_indeces(data_category, attribute_list):
    """Resolve attribute names to their column indexes.

    Returns an empty dict as soon as any requested attribute is absent.
    """
    idxs = {}
    for attr in attribute_list:
        idx = data_category.getAttributeIndex(attr)
        if idx == -1:
            return {}
        idxs[attr] = idx

    return idxs


def _get_entry_id(cif_datablock, entry_id_map, datablock_idx):
    """Record each entry.id of a datablock as entry_id -> datablock index."""
    entry = cif_datablock.getObj("entry")
    if entry is None:
        return
    id_idx = entry.getAttributeIndex("id")
    if id_idx == -1:
        return
    for row in entry:
        entry_id_map[row[id_idx]] = datablock_idx


def _unzip_arc_cif(arc_file, cif_file, assoc_dir):
    """Read a cif file straight out of a ZIP archive (no on-disk extract)."""
    with zipfile.ZipFile(os.path.join(assoc_dir, arc_file)) as arc_zip:
        with TextIOWrapper(arc_zip.open(cif_file), encoding="utf-8") as cif_fh:
            return _read_mmcif(cif_fh)
def _cmp_cif_rows(a_row, b_row, a_idxs, b_idxs, attrs):
    """Check whether two cif rows agree on all of the given attributes."""
    return all(a_row[a_idxs[attr]] == b_row[b_idxs[attr]] for attr in attrs)
def _add_row(row, src_idxs, dest_idxs, dest, attrs_l):
    """Append a data row to dest, reordered to match dest's column order.

    Columns of dest that src does not provide keep the "?" placeholder.
    """
    # create a new row fitting dest's order
    new_row = list("?" * attrs_l)
    for attr, src_i in src_idxs.items():
        new_row[dest_idxs[attr]] = row[src_i]
    dest.append(new_row)


def _add_or_extend_rows(src, dest, common, not_in_dest):
    """Mix/ add rows from src into dest.

    Rows that match on the `common` attributes are merged; everything else is
    appended as a new row. dest is first extended with the `not_in_dest`
    attributes; dest rows that never match any src row get "?" for those.
    """
    # extend dest with new attributes
    for attr in not_in_dest:
        dest.appendAttribute(attr)
    s_idx = src.getAttributeIndexDict()
    d_idx = dest.getAttributeIndexDict()
    attrs_l = len(d_idx)
    d_rows = list(range(len(dest)))
    for src_row in src:
        match = False
        for i in d_rows:
            dest_row = dest[i]
            match = _cmp_cif_rows(src_row, dest_row, s_idx, d_idx, common)
            if match:
                # extend with missing data items
                for attr in not_in_dest:
                    dest_row.append(src_row[s_idx[attr]])
                # each dest row can be merged at most once; the immediate
                # `break` makes removing from d_rows safe here
                d_rows.remove(i)
                break
        if not match:
            _add_row(src_row, s_idx, d_idx, dest, attrs_l)
    # extend dest rows that never matched with "?" as default value
    for i in d_rows:
        dest_row = dest[i]
        for attr in not_in_dest:
            dest_row.append("?")


def _merge_cif_datacontainer(
    parent_datablock, datablock, exclude_categories=None
):
    """Merge datablock into parent_datablock, skipping exclude_categories.

    Fix: exclude_categories defaults to None but was used directly in a
    membership test, so calling this function without the keyword raised
    TypeError ("argument of type 'NoneType' is not iterable"). Normalise it
    to an empty tuple first.
    """
    skip = exclude_categories if exclude_categories is not None else ()
    for category in datablock.getObjNameList():
        if category in skip:
            continue
        db_ctgry = datablock.getObj(category)
        # check if the data category exists in parent
        if parent_datablock.exists(category):
            p_ctgry = parent_datablock.getObj(category)
            # compare items
            not_in_p, in_both, _ = db_ctgry.cmpAttributeNames(p_ctgry)
            _add_or_extend_rows(db_ctgry, p_ctgry, in_both, not_in_p)
        else:
            # data category does not exist in parent, append it to datablock
            parent_datablock.append(db_ctgry)


def _try_os_remove(path):
    """Try to remove a file; failures are deliberately ignored."""
    try:
        os.remove(path)
    except OSError:  # narrowed from a bare except: only swallow FS errors
        pass


def _merge_cif_data(model_cif_data, assoc_cif, row_entry_id, entry_id_map):
    """Merge the datablocks of an associated cif file into the model cif.

    Returns a dict with a "cifcheck-errors" list describing entry_link
    problems found on the way.
    """
    error_msgs = {"cifcheck-errors": []}

    # per datablock, check to which datablock it belongs in the parent cif
    for assoc_cntnr in assoc_cif:
        # check/ get 'entry_link'
        assoc_entry_link = assoc_cntnr.getObj("entry_link")
        if assoc_entry_link is None:
            error_msgs["cifcheck-errors"].append(
                'ERROR - category "entry_link" is mandatory, but it is not '
                + f'present in datablock "{assoc_cntnr.getName()}"'
            )
            continue
        # make sure entry_id exists for entry_link
        entry_id_idx = assoc_entry_link.getAttributeIndex("entry_id")
        if entry_id_idx == -1:
            error_msgs["cifcheck-errors"].append(
                f'ERROR - In block "{assoc_cntnr.getName()}", mandatory item '
                + '"entry_id" is not in category "entry_link"'
            )
            continue
        # For each entry_id, look up the corresponding datablock in
        # model_cif_data and merge with that datablock.
        for row in assoc_entry_link:
            entry_id = row[entry_id_idx]
            if entry_id != row_entry_id:
                error_msgs["cifcheck-errors"].append(
                    f'ERROR - In block "{assoc_cntnr.getName()}", item '
                    + '"entry_id" does not match item '
                    + '"ma_entry_associated_files.entry_id"'
                )
                continue
            # NOTE(review): assumes entry_id is present in entry_id_map; a
            # stray id would raise KeyError here - confirm upstream checks.
            _merge_cif_datacontainer(
                model_cif_data[entry_id_map[entry_id]],
                assoc_cntnr,
                exclude_categories=["entry_link"],
            )

    return error_msgs
    def add_general_error(self, msg):
        """Add a uncategorised error to the list.

        :param msg: message text to record under "errors".
        """
        # "errors" collects problems that are not tied to a CifCheck run,
        # e.g. a missing version file or non-ASCII input.
        if "errors" not in self.check_results:
            self.check_results["errors"] = [msg]
        else:
            self.check_results["errors"].append(msg)

    def _execute(self, filepath):
        """Execute the CifCheck tool on a model mmCIF file.

        :param filepath: path of the mmCIF file to check.
        :returns: list of diagnosis messages from CifCheck.
        :raises _CifCheckFailedError: if stderr or the parser file produced
                messages, i.e. CifCheck itself could not process the file.
        """
        # If permission errors occur with the source directory of the CIF file,
        # consider copying the file to a Python tempfile generated path. That
        # deals with missing $TMP, $TEMP, etc.... variables.
        # At the moment, cwd is switched to the source directory since CifCheck
        # copies the file, otherwise.
        cifcheck_filepath = os.path.basename(filepath)
        cifcheck_cmd = [
            "CifCheck",
            "-dictSdb",
            self.dict_sdb,
            "-f",
            cifcheck_filepath,
        ]
        cifcheck_wd = os.path.dirname(os.path.abspath(filepath))
        cps = subprocess.run(
            cifcheck_cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            # a non-zero exit is reported via the parsed messages, not raised
            check=False,
            universal_newlines=True,
            cwd=cifcheck_wd,
        )

        error_lst = []
        # get error messages on the command line
        error_lst.extend(_parse_cifcheck_stderr(cps.stderr))
        error_lst.extend(_parse_parser_file(filepath))
        if len(error_lst) > 0:
            # tool-level failure: abort before reading the diagnosis file
            raise _CifCheckFailedError(cifcheck_cmd, error_lst)

        # get messages from diagnosis file
        error_lst.extend(_parse_diag_file(filepath))

        return error_lst

    def run(self, cif_file):
        """Run CifCheck for a given file and catch the output.

        Returns False if the CifCheck execution itself failed."""
        try:
            format_errors = self._execute(cif_file)
        except _CifCheckFailedError as exc:
            # tool failure (not a validation finding): record it and bail out
            if self.verbose:
                _warn("failed to run CifCheck, Stopping.")
                for line in exc.args:
                    print(line, file=sys.stderr)
                print("CifCheck errors:", file=sys.stderr)
                for line in exc.cifcheck_errors:
                    print(" ", line, file=sys.stderr)

            self.check_results["status"] = "aborted"
            self.check_results["cifcheck-command"] = " ".join(exc.cifcheck_cmd)
            self.check_results["cifcheck-errors"] = exc.cifcheck_errors

            return False

        self.check_results["status"] = "completed"
        self.check_results["diagnosis"] = format_errors

        return True

    def got_issues(self):
        """Query if there are parser or diagnosis messages."""
        # findings of a completed CifCheck run
        if (
            "diagnosis" in self.check_results
            and len(self.check_results["diagnosis"]) > 0
        ):
            return True

        # errors gathered while running/ merging (see add_to_results())
        if (
            "cifcheck-errors" in self.check_results
            and len(self.check_results["cifcheck-errors"]) > 0
        ):
            return True

        return False

    def _update_audit_conform(self, ac_cat):
        """Update an existing audit_conform category entry.

        :param ac_cat: existing "audit_conform" DataCategory, updated in place
                       with the dictionary versions from self.version.
        """
        # check if name is there, if not, append
        nm_idx = ac_cat.getAttributeIndex("dict_name")
        vs_idx = ac_cat.getAttributeIndex("dict_version")
        lc_idx = ac_cat.getAttributeIndex("dict_location")
        # NOTE(review): getAttributeIndex() returns -1 for a missing
        # attribute; the index accesses below would then silently address the
        # last column. Presumably audit_conform always carries all three
        # items when it exists -- confirm against real input.
        for dct in self.version["versions"]:
            found = False
            for itm in ac_cat:
                if dct["title"] == itm[nm_idx]:
                    itm[vs_idx] = dct["version"]
                    itm[lc_idx] = dct["location"]
                    found = True
                    break
            if not found:
                # dictionary not listed yet, append a new row
                new_ac = [""] * 3
                new_ac[nm_idx] = dct["title"]
                new_ac[vs_idx] = dct["version"]
                new_ac[lc_idx] = dct["location"]
                ac_cat.append(new_ac)

    def _add_audit_conform(self, pdbx_cntnr, mdl_cif, container_idx):
        """Add audit_conform category entry to data container.

        :param pdbx_cntnr: data container lacking "audit_conform".
        :param mdl_cif: list of data containers; slot container_idx is
                        replaced with a rebuilt container.
        :param container_idx: index of pdbx_cntnr in mdl_cif.
        """
        ac_cat = DataCategory(
            "audit_conform",
            ["dict_name", "dict_version", "dict_location"],
            [
                [x["title"], x["version"], x["location"]]
                for x in self.version["versions"]
            ],
        )
        # We want nicely formatted cif files, so place audit_conform
        # after entry. The container is rebuilt because categories cannot be
        # inserted at a position directly.
        objs = pdbx_cntnr.getObjCatalog()
        names = list(objs.keys())
        pdbx_cntnr = DataContainer(pdbx_cntnr.getName())
        found = False
        while len(names) > 0:
            nme = names.pop(0)
            pdbx_cntnr.append(objs[nme])
            if nme == "entry":
                pdbx_cntnr.append(ac_cat)
                found = True
                break
        # copy the categories remaining after "entry"
        for nme in names:
            pdbx_cntnr.append(objs[nme])
        if not found:
            # no "entry" category: append audit_conform at the end
            pdbx_cntnr.append(ac_cat)
        mdl_cif[container_idx] = pdbx_cntnr
We want nicely formatted cif files, so place audit_conform + # after entry. + objs = pdbx_cntnr.getObjCatalog() + names = list(objs.keys()) + pdbx_cntnr = DataContainer(pdbx_cntnr.getName()) + found = False + while len(names) > 0: + nme = names.pop(0) + pdbx_cntnr.append(objs[nme]) + if nme == "entry": + pdbx_cntnr.append(ac_cat) + found = True + break + for nme in names: + pdbx_cntnr.append(objs[nme]) + if not found: + pdbx_cntnr.append(ac_cat) + mdl_cif[container_idx] = pdbx_cntnr + + def add_versions_to_mmcif_file(self, mdl_cif, dest_file): + """Add versions of mmCIF dictionaries to a mmCIF file. + + :param mdl_cif: CIF data to be equipped with version data. + :type mdl_cif: :class:`list` of DataContainer + :param dest_file: Path to write the modified file to. + :type dest_file: :class:`str` + """ + # add/ modify audit_conform category + for i, pdbx_cntnr in enumerate(mdl_cif): + ac_cat = pdbx_cntnr.getObj("audit_conform") + if ac_cat is not None: + self._update_audit_conform(ac_cat) + else: + self._add_audit_conform(pdbx_cntnr, mdl_cif, i) + + # write modified mmCIF containers to file + _write_mmcif(dest_file, mdl_cif) + + return mdl_cif + + def to_json(self): + """Get CifCheck results as JSON.""" + self.check_results.update(self.version) + return json.dumps(self.check_results) + + def make_json_output(self): + """Dump JSON results of CifCheck either as file or print to stdout.""" + if self.verbose: + print("=============== CifCheck Errors ==============") + if "cifcheck-errors" in self.check_results: + for line in self.check_results["cifcheck-errors"]: + print(line) + sys.stdout.write("\n") + print("============= CifCheck Diagnosis =============") + if "diagnosis" in self.check_results: + for line in self.check_results["diagnosis"]: + print(line) + sys.stdout.write("\n") + + json_data = self.to_json() + if self.verbose or not self.json_out_file: + print(json_data) + + if self.json_out_file is not None: + with open(self.json_out_file, "w", encoding="utf-8") as 
jfh: + jfh.write(json_data) + + def add_to_results(self, msgs): + """Add messages to the CifCheck results""" + if "cifcheck-errors" not in self.check_results: + self.check_results["cifcheck-errors"] = msgs["cifcheck-errors"] + else: + self.check_results["cifcheck-errors"].extend( + msgs["cifcheck-errors"] + ) + + +def _find_utf(line): + """Try to find a word with an UTF character in a string.""" + for i, ltr in enumerate(line): + try: + ltr.encode("ascii", "strict") + except UnicodeEncodeError: + return i + + return None + + +def _file_has_utf(filename, cifcheck): + """Check a file to not contain UTF characters as mmCIF only allows ASCII.""" + with open(filename, encoding="utf-8") as cfh: + for i, line in enumerate(cfh): + try: + line.encode("ascii", "strict") + except UnicodeEncodeError: + idx = _find_utf(line) + cifcheck.add_general_error( + "File is non-ascii as it has an UTF character in line " + + f"{i}, index {idx}." + ) + return True + + return False + + +def _main(): + """Run as script""" + opts = _parse_command_line() + + # set up the CifCheck tool + cifcheck = _CifCheck(opts.dict_sdb, opts.out_file, opts.verbose) + + if _file_has_utf(opts.model_cif, cifcheck): + cifcheck.make_json_output() + sys.exit(1) + + # check for associated files referenced by the model cif file + assoc_files, model_cif_data, entry_id_map = _get_associated_files( + opts.model_cif, + opts.associates_dir, + ) + # save original data for later + if opts.extend_validated_file is not None: + o_model_cif_data = copy.deepcopy(model_cif_data) + # make sure associated files exist and merge all of them into the model + for assoc, entry_id in assoc_files: + # merge the model.cif and the associated file + msgs = _merge_cif_data(model_cif_data, assoc, entry_id, entry_id_map) + cifcheck.add_to_results(msgs) + + validate_file = opts.model_cif + if assoc_files: + # write merged data to disk, create tmp file, clean up when done + cfh, cfn = tempfile.mkstemp(suffix=".cif", text=True) + # register for 
deletion here and in cwd + atexit.register(_try_os_remove, cfn) + os.close(cfh) + _write_mmcif(cfn, model_cif_data) + validate_file = cfn + + # validate file + success = cifcheck.run(validate_file) + if not success: + cifcheck.make_json_output() + sys.exit(1) + + # upon request (-e) extend the ORIGINAL file (not the merged one) + if not cifcheck.got_issues() and opts.extend_validated_file is not None: + cifcheck.add_versions_to_mmcif_file( + o_model_cif_data, opts.extend_validated_file + ) + + # return JSON as file or to stdout + if opts.out_file and opts.verbose: + print(f"Writing results of CifCheck to '{opts.out_file}'") + cifcheck.make_json_output() + + if cifcheck.got_issues(): + # If CifCheck found issues with the mmCIF file, exit with code 2. Exit + # code 1 is reserved for general issues running the command, like "file + # not found". + sys.exit(2) + + +if __name__ == "__main__": + _main() + +# LocalWords: cif MSA FASTA pylint stdout CifCheck param src str dest cwd