From be971a663192e73e6a0bca9124a6540ed0d538e9 Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Wed, 7 Dec 2022 15:49:49 +0100 Subject: [PATCH] Download associated archive from internet for validation. --- validation/Dockerfile | 2 ++ validation/requirements.txt | 2 ++ validation/validate-mmcif-file.py | 43 ++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 validation/requirements.txt diff --git a/validation/Dockerfile b/validation/Dockerfile index 7ec976f..3f20534 100644 --- a/validation/Dockerfile +++ b/validation/Dockerfile @@ -29,6 +29,7 @@ LABEL vendor3="Biozentrum - University of Basel (biozentrum.unibas.ch)" ## Install the RCSB CPP Dict Suite (only the binaries we need) WORKDIR ${SRC_DIR} +COPY requirements.txt ${SRC_DIR} RUN set -e pipefail; \ export DICT_PACK_SRC_DIR="${SRC_DIR}/cpp-dict-pack.git"; \ apk update; \ @@ -56,6 +57,7 @@ RUN set -e pipefail; \ /usr/local/bin/python -m pip install --upgrade pip; \ /usr/local/bin/python -m pip install mmcif==${VERSION_PY_MMCIF} \ python-rapidjson; \ + /usr/local/bin/python -m pip install -r requirements.txt; \ # ## Clean up/ remove unnecessary stuff apk del abuild binutils bison build-base cmake flex git gcc \ diff --git a/validation/requirements.txt b/validation/requirements.txt new file mode 100644 index 0000000..a5f0a8b --- /dev/null +++ b/validation/requirements.txt @@ -0,0 +1,2 @@ +python-rapidjson==1.9 +validators==0.20.0 diff --git a/validation/validate-mmcif-file.py b/validation/validate-mmcif-file.py index ad9fab4..252c334 100755 --- a/validation/validate-mmcif-file.py +++ b/validation/validate-mmcif-file.py @@ -20,7 +20,9 @@ import sys import tempfile import zipfile +from validators import url as is_url import rapidjson as json +import requests from mmcif.api.DataCategory import DataCategory @@ -215,12 +217,51 @@ def _get_entry_id(cif_datablock, entry_id_map, datablock_idx): entry_id_map[row[eidx]] = datablock_idx +def 
_download_file(file_url): + """Download a file into a temporary file, rewound to its start. The file + is removed automatically once closed.""" + rspns = requests.get(file_url, stream=True, timeout=600) + if rspns.status_code != 200: + raise RuntimeError(f"File not found by URL '{file_url}'.") + + dlf = tempfile.TemporaryFile() + for chunk in rspns.iter_content(chunk_size=1024): + dlf.write(chunk) + dlf.seek(0) + + return dlf + + +def _get_assoc_obj(file_or_url, assoc_dir): + """Get a path to an associated file. If there is no such local file and + the name is a URL, it is downloaded and an open temporary file object is + returned instead (removed automatically once closed).""" + if assoc_dir is None or not os.path.exists( + os.path.join(assoc_dir, file_or_url) + ): + if is_url(file_or_url): + return _download_file(file_or_url) + + raise RuntimeError( + "Associated file path does not point to actual file or URL: " + + f"'{assoc_dir}/{file_or_url}'" + ) + + return os.path.join(assoc_dir, file_or_url) + + def _unzip_arc_cif(arc_file, cif_file, assoc_dir): """Extract a cif file from a ZIP archive.""" assoc_data = [] - with zipfile.ZipFile(os.path.join(assoc_dir, arc_file)) as arc_zip: + assoc_obj = _get_assoc_obj(arc_file, assoc_dir) + with zipfile.ZipFile(assoc_obj) as arc_zip: with TextIOWrapper(arc_zip.open(cif_file), encoding="utf-8") as cif_fh: assoc_data = _read_mmcif(cif_fh) + # in case assoc_obj is a temporary file, we need to close it + try: + assoc_obj.close() + except AttributeError: + pass return assoc_data -- GitLab