Skip to content
Snippets Groups Projects
Commit 65ec1620 authored by B13nch3n's avatar B13nch3n
Browse files

Add validation tool

parent 9f8adf37
No related branches found
No related tags found
No related merge requests found
# Build arguments consumed by FROM: together they select the Python/Alpine
# base image this container is built on.
ARG VERSION_PYTHON="3.6.15"
ARG VERSION_BASE_IMAGE="python:${VERSION_PYTHON}-alpine3.15"

FROM ${VERSION_BASE_IMAGE}

# ARGs declared ahead of FROM are out of scope inside the build stage, so they
# are re-declared here (without a value) to make them usable below.
ARG VERSION_PYTHON
ARG VERSION_BASE_IMAGE
ARG VERSION_CPP_DICT_PACK="v2.500"
ARG VERSION_PY_MMCIF="0.76"

## Runtime environment: directory holding the dictionary files, the scratch
## directory used while building, the versions the image was built with and the
## Python interpreter settings.
ENV MMCIF_DICTS_DIR="/usr/local/share/mmcif-dict-suite" \
    SRC_DIR="/tmp" \
    VERSION_CPP_DICT_PACK=${VERSION_CPP_DICT_PACK} \
    VERSION_BASE_IMAGE=${VERSION_BASE_IMAGE} \
    VERSION_PYTHON=${VERSION_PYTHON} \
    VERSION_PY_MMCIF=${VERSION_PY_MMCIF} \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

## Image metadata
LABEL org.modelarchive.base-image="${VERSION_BASE_IMAGE}" \
      org.modelarchive.cpp-dict-pack.version="${VERSION_CPP_DICT_PACK}" \
      maintainer="Stefan Bienert <stefan.bienert@unibas.ch>" \
      vendor1="Schwede Group (schwedelab.org)" \
      vendor2="SIB - Swiss Institute of Bioinformatics (sib.swiss)" \
      vendor3="Biozentrum - University of Basel (biozentrum.unibas.ch)"
## Install the RCSB CPP Dict Suite (only the binaries we need) and the RCSB
## py-mmcif Python module. Everything happens in a single RUN so the build
## toolchain never ends up in a layer of the final image.
WORKDIR ${SRC_DIR}
RUN set -eo pipefail; \
    export DICT_PACK_SRC_DIR="${SRC_DIR}/cpp-dict-pack.git"; \
    # The package index fetched by 'apk update' is left in place: any later
    # 'apk add' that is run without '--no-cache' depends on it.
    apk update; \
    apk upgrade; \
    #
    ## Runtime dependency of the compiled tools. Installed explicitly so it
    ## survives the removal of the build dependencies below.
    apk add libstdc++; \
    #
    ## Build dependencies, grouped in one virtual package so that installing
    ## and removing them cannot get out of sync.
    apk add --virtual .build-deps \
        abuild \
        binutils \
        bison \
        build-base \
        cmake \
        extra-cmake-modules \
        flex \
        gcc \
        git \
        tcsh; \
    #
    ## Install the RCSB mmCIF Dict Suite
    git clone -b "${VERSION_CPP_DICT_PACK}" \
        --single-branch --recurse-submodules \
        https://github.com/rcsb/cpp-dict-pack.git \
        "${DICT_PACK_SRC_DIR}"; \
    mkdir "${DICT_PACK_SRC_DIR}/build"; \
    cd "${DICT_PACK_SRC_DIR}/build"; \
    cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON; \
    make; \
    for cif_tool in CifCheck DictToSdb; do \
        mv "bin/${cif_tool}" /usr/local/bin/; \
    done; \
    cd "${SRC_DIR}"; \
    rm -r "${DICT_PACK_SRC_DIR}"; \
    #
    ## Install the RCSB py-mmcif Python module. This runs while the compiler is
    ## still installed; '--no-cache-dir' keeps pip's download cache out of the
    ## layer.
    /usr/local/bin/python -m pip install --no-cache-dir --upgrade pip; \
    /usr/local/bin/python -m pip install --no-cache-dir \
        mmcif==${VERSION_PY_MMCIF} \
        python-rapidjson; \
    #
    ## Clean up/ remove unnecessary stuff
    apk del .build-deps
## Unprivileged system account that runs the mmCIF file validation.
## Override MMCIF_USER_ID at build time to avoid file permission issues in
## development.
ARG MMCIF_USER_ID=501
RUN adduser -S -u "${MMCIF_USER_ID}" mmcif-vldtr

## Helper script, installed this early because it is already needed while the
## dictionary SDBs are created.
COPY --chmod=755 get-mmcif-dict-versions.py /usr/local/bin/get-mmcif-dict-versions
## Create dictionaries for validating mmCIF files. To rebuild dictionaries,
## rebuild the container with build argument DICT_FETCH_DATE="<DATE>.n" so
## only the RUN command for building the dictionary is triggered. The ".n"
## should be an increasing number to enable simple multiple builds in one
## day, in case something goes wrong.
## Dictionaries do not change that frequently therefore we skip the hassle of
## keeping them in an external volume.
## USE_DICT_RELEASE is meant to select the branch of
## https://github.com/ihmwg/ModelCIF the MAX/mmCIF dictionary is taken from
## ("master" by default, "dev" for development versions).
## NOTE(review): the RUN step below always fetches the dictionary from the
## commit pinned in _PATHSPEC and never reads USE_DICT_RELEASE; the value is
## only recorded in ENV and LABEL. Wire it in or drop it.
ARG DICT_FETCH_DATE="2022-05-02.1"
ARG USE_DICT_RELEASE="master"
ENV DICT_FETCH_DATE=${DICT_FETCH_DATE}
ENV USE_DICT_RELEASE=${USE_DICT_RELEASE}
LABEL org.modelarchive.dict-fetch-date="${DICT_FETCH_DATE}"
LABEL org.modelarchive.dict_release="${USE_DICT_RELEASE}"
WORKDIR ${SRC_DIR}
RUN set -eo pipefail; \
    # curl is only needed for the downloads in this step; '--no-cache' makes
    # the installation independent of a package index left in the image.
    apk add --no-cache curl; \
    export _DICT_DIR="${SRC_DIR}/mmcif_dicts"; \
    export _DICT_URL="https://mmcif.wwpdb.org/dictionaries/ascii"; \
    export _PATHSPEC="a24fcfa8d6c3ceb4b6f0676bcc341ac0cd24ff1f"; \
    export _REPO_URL="https://raw.githubusercontent.com/ihmwg/ModelCIF/${_PATHSPEC}"; \
    export _MA_DICT_URL="${_REPO_URL}/dist/mmcif_ma.dic"; \
    mkdir "${_DICT_DIR}"; \
    mkdir "${MMCIF_DICTS_DIR}"; \
    cd "${_DICT_DIR}"; \
    #
    ## All downloads use 'curl -f' so an HTTP error aborts the build instead of
    ## an error page being stored as dictionary file.
    #
    ## Fetch the dictionary definition language
    curl -fsSL "${_DICT_URL}/mmcif_ddl.dic.gz" -o mmcif_ddl.dic.gz; \
    gunzip mmcif_ddl.dic.gz; \
    #
    ## Fetch the merged ModelCIF dictionary from the pinned commit
    curl -fsSL "${_MA_DICT_URL}" -o mmcif_ma.dic; \
    #
    ## Build the ModelCIF SDB
    DictToSdb -ddlFile mmcif_ddl.dic \
              -dictFile mmcif_ma.dic \
              -dictSdbFile "${MMCIF_DICTS_DIR}/mmcif_ma.sdb"; \
    #
    ## Fetch the stable PDBx/mmCIF dictionary
    curl -fsSL "${_DICT_URL}/mmcif_pdbx_v50.dic" -o mmcif_pdbx_v50.dic; \
    #
    ## Build the PDBx/mmCIF SDB
    DictToSdb -ddlFile mmcif_ddl.dic \
              -dictFile mmcif_pdbx_v50.dic \
              -dictSdbFile "${MMCIF_DICTS_DIR}/mmcif_pdbx_v50.dic.sdb"; \
    #
    ## Get versions of ModelCIF & PDBx/mmCIF dictionaries
    get-mmcif-dict-versions \
        --parent-location "${_REPO_URL}/base/mmcif_pdbx_v50.dic" \
        --child-location "${_MA_DICT_URL}" \
        mmcif_ma.dic; \
    mv mmcif_ma_version.json "${MMCIF_DICTS_DIR}/"; \
    #
    ## Make SDBs readable and keep possible error logs from building them
    ## (there may be no log files at all, hence the '|| :')
    mv *.log "${MMCIF_DICTS_DIR}/" 2>/dev/null || :; \
    chmod o+r "${MMCIF_DICTS_DIR}"/*; \
    #
    ## Clean up
    cd "${SRC_DIR}"; \
    rm -r "${_DICT_DIR}"; \
    apk del curl
## Install the container entrypoint and the validation command line tool.
COPY --chmod=755 entrypoint.sh /entrypoint.sh
COPY --chmod=755 validate-mmcif-file.py /usr/local/bin/validate-mmcif-file

# Optional tooling for development, disabled by default:
#RUN set -e pipefail; \
#    apk add bash emacs gcc build-base; \
#    /usr/local/bin/python -m pip install pylint black; \
#    apk del gcc build-base

## Everything from here on, including the running container, uses the
## unprivileged validation account.
USER mmcif-vldtr
ENTRYPOINT ["/entrypoint.sh"]
# have tool ready
# - entrypoint: validate... just runs validation, celery runs celery, CMD else
# write Python to run & check mmCIF
# - Note dictionary versions in the mmCIF file!
# for Celery:
# - depends_on without implementing the 'waits' in this entrypoint.sh:
# https://marcopeg.com/docker-compose-healthcheck/
# LocalWords: ENV DICTS SRC tmp schwedelab RCSB WORKDIR pipefail apk dev ARG
# LocalWords: ARGs
#!/bin/sh
## (We use sh since Alpine does not have Bash by default)

## Exit immediately on commands with a non-zero exit status, on the use of
## unset variables and on a failure anywhere inside a pipeline.
set -euo pipefail

## Print the usage message of the container.
print_usage() {
    echo " mmCIF file format validation tool."
    echo "------------------------------------------"
    echo "Provided by SWISS-MODEL / Schwede group"
    echo "(swissmodel.expasy.org / schwedelab.org)"
    echo ""
    echo "This container checks that mmCIF files are"
    echo "properly formatted according to the"
    echo "MAX/ mmCIF dictionary. At the moment,"
    echo "there is one tool available that acts as a"
    echo "command line tool: validate-mmcif-file."
    echo "For further usage information, call this"
    echo "container executing"
    echo "'validate-mmcif-file --help'."
}

## When started without any arguments, "-h", "--help", "-help" or "help", print
## usage. The first argument is matched with a quoted 'case' so arguments
## containing whitespace or test(1) operators are handled safely.
if [ $# -eq 0 ]; then
    print_usage
    exit 1
fi
case "$1" in
    -h | --help | -help | help)
        print_usage
        exit 1
        ;;
esac

## Replace this shell with the requested command, so it receives signals
## directly.
exec "$@"
# LocalWords: euo pipefail eq Schwede schwedelab mmcif fi
#! /usr/local/bin/python
"""Get version and location of relevant mmCIF dictionaries for ModelCIF.
Fetch the versions of the ModelCIF dictionary and the PDBx/mmCIF dictionary used
to build it into a JSON file.
"""
# pylint: disable=invalid-name
# pylint: enable=invalid-name
import argparse
import sys
import rapidjson as json
from mmcif.io.IoAdapterPy import IoAdapterPy
def _parse_command_line():
"""Get arguments."""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"dic_file",
type=str,
metavar="<DICTIONARY FILE>",
help="The mmCIF dictionary file to read the versions from.",
)
parser.add_argument(
"--parent",
"-p",
type=str,
metavar="<NAME OF PARENT DICT>",
help="Name of to the 'parent' dictionary. This is the one the other "
+ "dictionary is appended to. This is usually the mmcif_pdbx_v50.dic.",
default="mmcif_pdbx_v50.dic",
)
parser.add_argument(
"--output",
"-o",
type=str,
metavar="<PATH TO VERSION FILE>",
help="Path to store the JSON file with the version at.",
default="mmcif_ma_version.json",
)
parser.add_argument(
"--parent-location",
"-u",
type=str,
metavar="<URL OF PARENT DICT FILE>",
help="Download location of the parent dictionary file.",
default=None,
)
parser.add_argument(
"--child-location",
"-l",
type=str,
metavar="<URL OF CHILD DICT FILE>",
help="Download location of the child dictionary file.",
default=None,
)
opts = parser.parse_args()
return opts
def _error(msg):
"""Print a final error message."""
print(msg + "\nAborting.", file=sys.stderr)
sys.exit(1)
def _get_data_cat(cat, file_name, data):
"""Get a data category from a mmCIF data blob."""
obj = data.getObj(cat)
if obj is None:
_error(f"No '{cat}' object found in '{file_name}'.")
return obj
def _get_data_item(itm, cat, file_name, cat_data):
"""Get a single data item from a data category."""
val = cat_data.getAttributeValueList(itm)
if len(val) != 1:
_error(
f"Expected exactly 1 '{cat}.{itm}' in '{file_name}', "
+ f"found '{', '.join(val)}'."
)
return val[0]
def _get_versions(dic_file, parent_name, io_adapter):
    """Read title and version of a dictionary and the version of its parent.

    `dic_file` is read with `io_adapter.readFile()`. The first data container
    holding a 'dictionary' category provides title and version of the
    dictionary itself. The version of the parent dictionary is taken from the
    row of the 'pdbx_dictionary_component' category whose
    'dictionary_component_id' equals `parent_name`.

    Returns a tuple of two dicts, (dictionary, parent), each with the keys
    "title" and "version".

    Aborts the script with an error message if the 'dictionary' category, the
    'pdbx_dictionary_component' category or the row for `parent_name` is
    missing.
    """
    containers = io_adapter.readFile(inputFilePath=dic_file)
    # fetch a data container from the list returned by the adapter
    cntnr = next(
        (obj for obj in containers if "dictionary" in obj.getObjNameList()),
        None,
    )
    if cntnr is None:
        _error(f"No 'dictionary' object found in '{dic_file}'.")

    dic = _get_data_cat("dictionary", dic_file, cntnr)
    vrsn = _get_data_item("version", "dictionary", dic_file, dic)
    ttl = _get_data_item("title", "dictionary", dic_file, dic)
    dic_version = {"title": ttl, "version": vrsn}

    cmp = _get_data_cat("pdbx_dictionary_component", dic_file, cntnr)
    dc_idx = cmp.getAttributeIndex("dictionary_component_id")
    vs_idx = cmp.getAttributeIndex("version")
    prnt_version = None
    for row in cmp:
        if row[dc_idx] == parent_name:
            prnt_version = {"title": parent_name, "version": row[vs_idx]}
            break
    # Without this check a missing parent entry would surface as an
    # UnboundLocalError at the return statement.
    if prnt_version is None:
        _error(
            f"No '{parent_name}' entry found in 'pdbx_dictionary_component' "
            + f"of '{dic_file}'."
        )

    return dic_version, prnt_version
def _add_dict_location(parent, child, parent_loc, child_loc):
"""Add URLs to the dictionary versions if available."""
if parent_loc is None:
parent["location"] = "."
else:
parent["location"] = parent_loc
if child_loc is None:
child["location"] = "."
else:
child["location"] = child_loc
def _main():
    """Entry point: read the dictionary versions and write them as JSON.

    The output file holds a single object with the key "versions", listing
    the parent dictionary first and the child dictionary second.
    """
    args = _parse_command_line()

    adapter = IoAdapterPy(False, sys.stdout)
    child, parent = _get_versions(args.dic_file, args.parent, adapter)
    _add_dict_location(
        parent, child, args.parent_location, args.child_location
    )

    payload = {"versions": [parent, child]}
    with open(args.output, "w", encoding="utf8") as jfh:
        json.dump(payload, jfh)


if __name__ == "__main__":
    _main()
# LocalWords: DictToSdb SDB PDBx CifCheck pylint mmcif pdbx dic nAborting
# LocalWords: macromolecular utf
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment