diff --git a/projects/2024-09-ma-taas/README.md b/projects/2024-09-ma-taas/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..51f99dcd9328ba2c363e3d8a0d5fc137a1905262
--- /dev/null
+++ b/projects/2024-09-ma-taas/README.md
@@ -0,0 +1,44 @@
+# Modelling of full-length trimeric autotransporter adhesins (TAAs)
+
+[Link to project in ModelArchive](https://www.modelarchive.org/doi/10.5452/ma-taas) (incl. background on project itself)
+
+Modelling setup:
+- Step 1: AlphaFold-Multimer (AF) using v2.1.2 or v2.3.1 with different parameters for overlapping ranges of the full-length sequence
+- Step 2 (if needed): AF models truncated, superposed, and assembled using MODELLER
+- Step 3 (if multiple models combined): custom quality estimate (QE) calculated by combining pLDDT and the MODELLER energy score (see the sketch after this list)
+- Step 4 (for assemblies): domain annotation performed using a combination of DSSP, SamCC-Turbo, Foldseek, and US-align
+- Single-domain models only involved step 1 and a truncation step (usually a subset of another model in the same set)
+- Full-length sequences taken from NCBI.
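+
+The custom QE from step 3 combines the per-residue pLDDT (0-100, higher is
+better) with the per-residue MODELLER energy score (>0, lower is better), as
+detailed in the protocol steps written by translate2modelcif.py. A minimal
+sketch of the formula (array names are illustrative):
+```
+import numpy as np
+
+def combined_confidence(plddt, energy):
+    # penalty approaches 60 for energy >> 150 and ~0 for energy < 100
+    penalty = 60.0 / (1.0 + np.exp((150.0 - energy) / 20.0))
+    return np.clip(plddt - penalty, 0.0, 100.0)
+```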
+
+Input files for conversion:
+- Single ZIP file for each model
+- Each ZIP file contains:
+  - assembly_info.json or domain_info.json with metadata on the model
+  - assembly.cif or domain.cif with the model coordinates
+  - accompanying_data.zip or accompanying_data sub-folder with additional files as listed in the metadata
+  - image.png with an overview image for use in ModelArchive
+
+Special features here compared to past projects:
+- NCBI data loaded in bulk before running the conversion (helpful for processing the large set in parallel).
+- Generic `ZipFileHandler` class to simplify processing ZIP files.
+- Fills `not_modeled_residue_ranges`, which was recently added to python-modelcif.
+- Enables packing of associated files without dumping data to disk (avoids having to extract files from the provided ZIP files).
+- Used software includes AlphaFold-Multimer, MODELLER, DSSP, Foldseek, SamCC-Turbo, and US-align.
+- QE metrics linked to software used (fetched from protocol steps to avoid duplicated entries in ModelCIF).
+- Complex modelling pipeline with different variants of the same software (AF) producing distinct intermediate files. Objects are defined globally to avoid duplicated entries in ModelCIF; data input/output for protocol steps is handled with lists of keywords.
+- Provided assembly.cif files contain additional mmCIF data to keep (categories pdbx_domain, pdbx_feature_domain, and pdbx_domain_range are kept and re-added using gemmi).
+- Intermediate models kept as accompanying data in PDB format. AF models additionally come with scores in JSON format (processed from pickle files into more portable JSON) and the PAE matrix as a PNG file.
+- Use of modelcif.reference.SeqDif to handle mismatches between NCBI and entity sequence (simplified compared to 2024-08-ma-dm-hisrep).
+- Similar checking for issues as in 2024-04-ma-dm-prc.
+
+Content:
+- translate2modelcif.py : script to do the conversion (run in a virtual environment with the same setup as the Docker container here; using OST 2.8, python-modelcif 1.1 and python-ihm 1.6)
+- fetch_ncbi_entries.py : script used to obtain info from NCBI, stored in ncbi_data.json
+- ncbi_data.json.gz : version of ncbi_data.json used for this conversion (from 24.9.2024); to be unpacked before using translate2modelcif.py (see the note below the commands)
+- ma-taas-0272.zip : example for a small assembly in the set
+- ma-taas-9037.zip : example for a small domain in the set
+- example_modelcif.zip : output from running conversion of examples with the commands below:
+```
+ost translate2modelcif.py ma-taas-0272.zip modelcif
+ost translate2modelcif.py ma-taas-9037.zip modelcif
+```
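+
+Note: ncbi_data.json.gz needs to be unpacked to ncbi_data.json first, e.g. with
+`gunzip -k` or the minimal Python equivalent below (run in this folder):
+```
+import gzip, shutil
+with gzip.open("ncbi_data.json.gz", "rb") as fin, \
+     open("ncbi_data.json", "wb") as fout:
+    shutil.copyfileobj(fin, fout)
+```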
diff --git a/projects/2024-09-ma-taas/example_modelcif.zip b/projects/2024-09-ma-taas/example_modelcif.zip
new file mode 100644
index 0000000000000000000000000000000000000000..6c3992766e4ab6c36e0203299fb94be184308f27
Binary files /dev/null and b/projects/2024-09-ma-taas/example_modelcif.zip differ
diff --git a/projects/2024-09-ma-taas/fetch_ncbi_entries.py b/projects/2024-09-ma-taas/fetch_ncbi_entries.py
new file mode 100644
index 0000000000000000000000000000000000000000..16bdf577ccd021dd0f8d8cbaac0b6a337b7e479b
--- /dev/null
+++ b/projects/2024-09-ma-taas/fetch_ncbi_entries.py
@@ -0,0 +1,335 @@
+# get info from NCBI entries for the full set
+
+import requests, json, os, re, zipfile, time
+import pandas as pd
+import numpy as np
+from xml.etree import ElementTree as ET
+from ost import io
+
+
+# HC setup
+out_json_ncbi = "./ncbi_data.json"
+out_json_data = "./summary_data.json" # collection of all assembly/domain.json for testing
+assemblies_path = "./data_assemblies/taa_export_MA_180724"
+domains_path = "./data_domains/domain_export_merged_200624"
+#
+
+def _warn_msg(msg):
+    """Write a warning message to stdout."""
+    print(f"WARNING: {msg}")
+
+
+def _get_ncbi_base_url():
+    return "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+
+
+def _get_ncbi_docsums(params):
+    # params = parameters to pass to esummary
+    response = requests.get(_get_ncbi_base_url() + "esummary.fcgi",
+                            params=params)
+    root = ET.fromstring(response.text)
+    docsums = root.findall(".//DocSum")
+    ncbi_dicts = []
+    for docsum in docsums:
+        ncbi_dict = {}
+        for cn in docsum:
+            if cn.tag == "Item":
+                cn_name = cn.get("Name")
+                cn_type = cn.get("Type")
+                if cn.text:
+                    d = cn.text
+                    if cn_type == "String":
+                        ncbi_dict[cn_name] = d
+                    elif cn_type == "Integer":
+                        ncbi_dict[cn_name] = int(d)
+                    elif cn_type == "Date":
+                        # kept as string
+                        ncbi_dict[cn_name] = d
+                    else:
+                        raise RuntimeError(
+                            f"Unknown type {cn_type} for {ncbi_ac}"
+                        )
+                else:
+                    ncbi_dict[cn_name] = None
+        ncbi_dicts.append(ncbi_dict)
+    return ncbi_dicts
+
+
+def _get_ncbi_data(ncbi_acs):
+    """Fetch data from NCBI in bulk."""
+    max_num_ids = 2000 # recommended to do max. 5000 at once
+    ncbi_seq = {}
+    ncbi_info = {}
+    sel_ids = list(ncbi_acs)
+    print(f"FETCHING NCBI INFO FOR {len(sel_ids)} PROTEIN ACs")
+    while len(sel_ids) > 0:
+        # SOURCE: https://www.ncbi.nlm.nih.gov/books/NBK25501/
+        db = "protein"
+        # Upload the list of IDs using the epost endpoint
+        sel_ids = sel_ids[:max_num_ids]
+        params = {"db": db, "id": ",".join(sel_ids)}
+        response = requests.post(_get_ncbi_base_url() + "epost.fcgi",
+                                 data=params)
+        xml_string = response.text
+        root = ET.fromstring(xml_string)
+        query_key = root.find(".//QueryKey").text
+        web_env = root.find(".//WebEnv").text
+        # Fetch all sequences using the efetch endpoint via epost info
+        params = {"db": db, "query_key": query_key, "WebEnv": web_env,
+                  "rettype": "fasta", "retmode": "text"}
+        response = requests.get(_get_ncbi_base_url() + "efetch.fcgi",
+                                params=params)
+        ss = io.SequenceListFromString(response.text, "fasta")
+        for s in ss:
+            ncbi_ac = s.name.split()[0]
+            if ncbi_ac not in sel_ids:
+                # assume that we store them without version
+                ncbi_ac = ncbi_ac.rsplit('.', 1)[0]
+                assert ncbi_ac in sel_ids
+            ncbi_seq[ncbi_ac] = s
+        # Fetch all infos using the esummary endpoint via epost info
+        params = {"db": db, "query_key": query_key, "WebEnv": web_env}
+        for ncbi_dict in _get_ncbi_docsums(params):
+            ncbi_ac = ncbi_dict["AccessionVersion"]
+            if ncbi_ac not in sel_ids:
+                # assume that we store them without version
+                ncbi_ac = ncbi_ac.rsplit('.', 1)[0]
+                assert ncbi_ac in sel_ids
+            ncbi_info[ncbi_ac] = ncbi_dict
+        assert ncbi_info.keys() == ncbi_seq.keys()
+        # what's left?
+        # (random ones get dropped; reason unknown; so we try until all done)
+        sel_ids = [ncbi_ac for ncbi_ac in ncbi_acs \
+                   if ncbi_ac not in ncbi_info]
+        print(f"- {len(ncbi_info)} DONE; {len(sel_ids)} TODO")
+    assert sorted(ncbi_acs) == sorted(ncbi_info)
+
+    # combine nicely for further use
+    ncbi_data = {
+        ncbi_ac: {
+            "seq_name": ncbi_seq[ncbi_ac].name,
+            "seq_str": ncbi_seq[ncbi_ac].string,
+            "info": ncbi_info[ncbi_ac]
+        } \
+        for ncbi_ac in ncbi_acs
+    }
+    return ncbi_data
+
+
+def _get_ncbi_data_cached(ncbi_acs, ncbi_metadata_file):
+    """Fetch dict with info for all NCBI proteins from file or web."""
+    if ncbi_metadata_file:
+        if os.path.exists(ncbi_metadata_file):
+            return json.load(open(ncbi_metadata_file))
+        else:
+            ncbi_data = _get_ncbi_data(ncbi_acs)
+            json.dump(ncbi_data, open(ncbi_metadata_file, "w"))
+            return ncbi_data
+    else:
+        return _get_ncbi_data(ncbi_acs)
+
+
+def _get_ncbi_tax_info(tax_ids):
+    """Fetch data from NCBI species in bulk."""
+    max_num_ids = 2000 # recommended to do max. 5000 at once
+    ncbi_info = {}
+    sel_ids = list(tax_ids)
+    print(f"FETCHING NCBI INFO FOR {len(sel_ids)} TAX IDs")
+    while len(sel_ids) > 0:
+        # keep track of TODOs to abort if no progress made
+        last_num_todo = len(sel_ids)
+        # SOURCE: https://www.ncbi.nlm.nih.gov/books/NBK25501/
+        db = "taxonomy"
+        # Upload the list of IDs using the epost endpoint
+        sel_ids = sel_ids[:max_num_ids]
+        params = {"db": db, "id": ",".join(sel_ids)}
+        response = requests.post(_get_ncbi_base_url() + "epost.fcgi",
+                                 data=params)
+        xml_string = response.text
+        root = ET.fromstring(xml_string)
+        query_key = root.find(".//QueryKey").text
+        web_env = root.find(".//WebEnv").text
+        # Fetch all infos using the esummary endpoint via epost info
+        params = {"db": db, "query_key": query_key, "WebEnv": web_env}
+        for ncbi_dict in _get_ncbi_docsums(params):
+            tax_id = str(ncbi_dict["TaxId"])
+            assert tax_id in sel_ids
+            ncbi_info[tax_id] = ncbi_dict
+        # what's left?
+        # (random ones get dropped; reason unknown; so we try until all done)
+        sel_ids = [tax_id for tax_id in tax_ids \
+                   if tax_id not in ncbi_info]
+        print(f"- {len(ncbi_info)} DONE; {len(sel_ids)} TODO")
+        if last_num_todo == len(sel_ids):
+            print(f"ABORTING...failed to get {sel_ids}")
+            return ncbi_info
+    assert sorted(tax_ids) == sorted(ncbi_info)
+    return ncbi_info
+
+
+def _check_ncbi_data(ncbi_data):
+    """Run sanity checks on fetched data."""
+    non_live = []
+    outdated = [] # (AC, replaced-by)
+    for ncbi_ac, ncbi_item in ncbi_data.items():
+        ncbi_info = ncbi_item["info"]
+        if ncbi_info["Status"] != "live":
+            non_live.append(ncbi_ac)
+        if ncbi_info["ReplacedBy"]:
+            outdated.append((ncbi_ac, ncbi_info['ReplacedBy']))
+        if ncbi_info["AccessionVersion"] != ncbi_ac:
+            ncbi_info_ac = ncbi_info["AccessionVersion"].rsplit('.', 1)[0]
+            if ncbi_info_ac != ncbi_ac:
+                raise RuntimeError(f"NCBI AC is not AC")
+        mmcif_regex = "[][ \t_(),.;:\"&<>/\\\\{}'`~!@#$%?+=*A-Za-z0-9|^-]*"
+        description = ncbi_info["Title"]
+        if not re.fullmatch(mmcif_regex, description):
+            raise RuntimeError(f"Illegal characters found in title of " \
+                               f"{ncbi_ac}: {description}")
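+        # (accessing these keys also checks that they exist in every entry)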
+        tax_id = str(ncbi_info["TaxId"])
+        organism_scientific = ncbi_info["SpeciesName"]
+    if len(non_live) > 0:
+        msg = f"{len(non_live)} NCBI entries not live"
+        if len(non_live) < 20:
+            msg += f": {non_live}"
+        _warn_msg(msg)
+    if len(outdated) > 0:
+        ncbi_acs = [v[0] for v in outdated]
+        ncbi_rep = [v[1] for v in outdated]
+        msg = f"{len(ncbi_acs)} outdated NCBI entries"
+        if len(outdated) < 20:
+            msg += f": {ncbi_acs} replaced by {ncbi_rep}"
+        _warn_msg(msg)
+    return non_live, outdated
+
+
+def _check_summary_data(json_data):
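+    """Run large-scale sanity checks on summary data from all ZIP files."""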
+    # collect info for further large scale checks
+    json_dicts = {}
+    df_dict_list = []
+    for item in json_data:
+        # keep JSON dict for easy access
+        json_dicts[item["ma_id"]] = item["json_dict"]
+        # prepare data for DataFrame
+        keys_to_keep = [
+            "ma_id", "mdl_type", "has_accompanying_data", "has_cif", "has_image"
+        ]
+        new_item = {k: item[k] for k in keys_to_keep}
+        new_item["ncbi_ac"] = item["json_dict"]["source_sequence_RefSeq_ID"]
+        new_item["no_unparsed_files"] = len(item["unparsed_files"]) == 0
+        df_dict_list.append(new_item)
+    # keep data in DataFrame for further checks
+    df = pd.DataFrame(df_dict_list)
+    assert len(set(df.ma_id)) == len(df.ma_id)
+    df.set_index("ma_id", inplace=True)
+    for k, v in df.isna().any().to_dict().items():
+        if v:
+            _warn_msg(f"TO CHECK: NA found in summary data key {k}")
+    for k, v in df.all().to_dict().items():
+        if not v:
+            _warn_msg(f"TO CHECK: False values found in summary data key {k}")
+
+
+def _parse_zip_file(zip_file_path):
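+    """Collect basic info on the content of a single model ZIP file."""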
+    data_from_zip = {
+        "ma_id": os.path.splitext(os.path.basename(zip_file_path))[0],
+        "zip_file_path": zip_file_path,
+        "mdl_type": None,
+        "json_dict": None,
+        "has_accompanying_data": False,
+        "has_cif": False,
+        "has_image": False,
+        "unparsed_files": [],
+    }
+    with zipfile.ZipFile(zip_file_path) as zf:
+        for file_name in sorted(zf.namelist()):
+            base_name = os.path.basename(file_name)
+            if base_name == "":
+                # we ignore entries for paths
+                continue
+            elif base_name == "accompanying_data.zip" and not data_from_zip["has_accompanying_data"]:
+                data_from_zip["has_accompanying_data"] = True
+            elif "accompanying_data/" in file_name:
+                data_from_zip["has_accompanying_data"] = True
+            elif base_name == "assembly_info.json" and data_from_zip["json_dict"] is None and data_from_zip["mdl_type"] in [None, "assembly"]:
+                data_from_zip["mdl_type"] = "assembly"
+                data_from_zip["json_dict"] = json.load(zf.open(file_name))
+            elif base_name == "assembly.cif" and not data_from_zip["has_cif"] and data_from_zip["mdl_type"] in [None, "assembly"]:
+                data_from_zip["mdl_type"] = "assembly"
+                data_from_zip["has_cif"] = True
+            elif base_name == "domain_info.json" and data_from_zip["json_dict"] is None and data_from_zip["mdl_type"] in [None, "domain"]:
+                data_from_zip["mdl_type"] = "domain"
+                data_from_zip["json_dict"] = json.load(zf.open(file_name))
+            elif base_name == "domain.cif" and not data_from_zip["has_cif"] and data_from_zip["mdl_type"] in [None, "domain"]:
+                data_from_zip["mdl_type"] = "domain"
+                data_from_zip["has_cif"] = True
+            elif base_name == "image.png" and not data_from_zip["has_image"]:
+                data_from_zip["has_image"] = True
+            else:
+                data_from_zip["unparsed_files"].append(file_name)
+    return data_from_zip
+
+
+def _parse_path(path, exp_type):
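+    """Parse all ZIP files in path, expecting models of type exp_type."""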
+    t0 = time.time()
+    data_from_zips = []
+    for file_name in sorted(os.listdir(path)):
+        data_from_zip = _parse_zip_file(os.path.join(path, file_name))
+        if data_from_zip["mdl_type"] != exp_type:
+            print("WARNING:", path, file_name, data_from_zip["mdl_type"], exp_type)
+        data_from_zips.append(data_from_zip)
+        if len(data_from_zips) % 200 == 0:
+            print("DONE ZIP", path, len(data_from_zips), time.time() - t0)
+    return data_from_zips
+
+
+def main():
+    # get all JSONs
+    if os.path.exists(out_json_data):
+        data_from_zips = json.load(open(out_json_data))
+    else:
+        data_from_zips = _parse_path(assemblies_path, "assembly") \
+                       + _parse_path(domains_path, "domain")
+        json.dump(data_from_zips, open(out_json_data, "w"))
+
+    # collect all NCBI ACs
+    ncbi_acs = [item["json_dict"]["source_sequence_RefSeq_ID"] \
+                for item in data_from_zips]
+    unique_acs = sorted(set(ncbi_acs))
+    # fetch NCBI data for each
+    ncbi_data = _get_ncbi_data_cached(unique_acs, out_json_ncbi)
+    
+    # fetch all needed tax. infos (excluding overridden ones)
+    tax_ids = sorted(set(
+        str(ncbi_item["info"]["TaxId"]) \
+        for ncbi_item in ncbi_data.values()
+    ))
+    ncbi_tax_infos = _get_ncbi_tax_info(tax_ids)
+    # some checks (expect to have all active tax. ids)
+    for item, exp_vals in [("Status", ["active"])]:
+        observed_values = sorted(set(
+            ncbi_tax_info[item] \
+            for ncbi_tax_info in ncbi_tax_infos.values()
+        ))
+        if observed_values != exp_vals:
+            print(f"UNEXPECTED observed '{item}' values: {observed_values}")
+    # extract/keep scientific names
+    ncbi_species_names = {
+        ncbi_tax_id: ncbi_tax_info["ScientificName"] \
+        for ncbi_tax_id, ncbi_tax_info in ncbi_tax_infos.items()
+    }
+    
+    # apply to data
+    for ncbi_ac, ncbi_item in ncbi_data.items():
+        tax_id = str(ncbi_item["info"]["TaxId"])
+        ncbi_item["info"]["SpeciesName"] = ncbi_species_names[tax_id]
+    # dump file
+    json.dump(ncbi_data, open(out_json_ncbi, "w"))
+
+    # do some checks at the end
+    _check_summary_data(data_from_zips)
+    _check_ncbi_data(ncbi_data)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/2024-09-ma-taas/ma-taas-0272.zip b/projects/2024-09-ma-taas/ma-taas-0272.zip
new file mode 100644
index 0000000000000000000000000000000000000000..a2990dbee6630dca60eef8eae6876ac9568e7bee
Binary files /dev/null and b/projects/2024-09-ma-taas/ma-taas-0272.zip differ
diff --git a/projects/2024-09-ma-taas/ma-taas-9037.zip b/projects/2024-09-ma-taas/ma-taas-9037.zip
new file mode 100644
index 0000000000000000000000000000000000000000..42630098690d245ea4d8f5ab5f8f138057742813
Binary files /dev/null and b/projects/2024-09-ma-taas/ma-taas-9037.zip differ
diff --git a/projects/2024-09-ma-taas/ncbi_data.json.gz b/projects/2024-09-ma-taas/ncbi_data.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..eaa31245f2e0008b5aed9b721f60dfbf03355bc7
Binary files /dev/null and b/projects/2024-09-ma-taas/ncbi_data.json.gz differ
diff --git a/projects/2024-09-ma-taas/translate2modelcif.py b/projects/2024-09-ma-taas/translate2modelcif.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca6b41bc1248c0353e5206ed504d4b3492f68a5e
--- /dev/null
+++ b/projects/2024-09-ma-taas/translate2modelcif.py
@@ -0,0 +1,2327 @@
+#! /usr/local/bin/ost
+# -*- coding: utf-8 -*-
+
+"""Translate TAA models for ma-taas.
+
+Example for running:
+ost translate2modelcif.py ./sample_files/ma-taas-0272.zip ./modelcif
+-> due to the size of the set, each model is converted individually
+-> needs ncbi_data.json from fetch_ncbi_entries.py
+
+Expected output in ./modelcif for example above:
+- ma-taas-0272.cif as ModelCIF file
+- ma-taas-0272.zip with accompanying data
+- ma-taas-0272-image.png as image to use in ModelArchive
+- ma-taas-0272-issues.json listing issues for that conversion (if any)
+"""
+
+import argparse
+import datetime
+import gzip
+import os
+import shutil
+import sys
+import zipfile
+import pickle
+import re
+import traceback
+
+from io import StringIO
+from timeit import default_timer as timer
+import numpy as np
+import requests
+import ujson as json
+
+import ihm
+import ihm.citations
+
+import modelcif
+import modelcif.associated
+import modelcif.dumper
+import modelcif.model
+import modelcif.protocol
+import modelcif.reference
+import gemmi
+
+import pandas as pd
+from ost import io, seq
+
+################################################################################
+# GENERAL HELPER FUNCTIONS
+################################################################################
+def _abort_msg(msg, exit_code=1):
+    """Write error message and exit with exit_code."""
+    print(f"{msg}\nAborting.", file=sys.stderr)
+    sys.exit(exit_code)
+
+
+def _warn_msg(msg):
+    """Write a warning message to stdout."""
+    print(f"WARNING: {msg}")
+
+
+def _check_file(file_path):
+    """Make sure a file exists and is actually a file."""
+    if not os.path.exists(file_path):
+        _abort_msg(f"File not found: '{file_path}'.")
+    if not os.path.isfile(file_path):
+        _abort_msg(f"File path does not point to file: '{file_path}'.")
+
+
+def _check_folder(dir_path):
+    """Make sure a file exists and is actually a file."""
+    if not os.path.exists(dir_path):
+        _abort_msg(f"Path not found: '{dir_path}'.")
+    if not os.path.isdir(dir_path):
+        _abort_msg(f"Path does not point to a directory: '{dir_path}'.")
+
+
+def _check_opts_folder(dir_path):
+    """Remove trailing '/' (return fixed one) and check if path valid."""
+    if dir_path.endswith("/"):
+        dir_path = dir_path[:-1]
+    _check_folder(dir_path)
+    return dir_path
+
+
+def _get_res_num(r, use_auth=False):
+    """Get res. num. from auth. IDs if reading from mmCIF files."""
+    if use_auth:
+        return int(r.GetStringProp("pdb_auth_resnum"))
+    return r.number.num
+
+
+def _get_ch_name(ch, use_auth=False):
+    """Get chain name from auth. IDs if reading from mmCIF files."""
+    if use_auth:
+        return ch.GetStringProp("pdb_auth_chain_name")
+    return ch.name
+
+
+def _get_sequence(chn, use_auth=False):
+    """Get the sequence out of an OST chain incl. '-' for gaps in resnums."""
+    # initialise (add gaps if first is not at num. 1)
+    lst_rn = _get_res_num(chn.residues[0], use_auth)
+    idx = 1
+    sqe = "-" * (lst_rn - 1) + chn.residues[0].one_letter_code
+
+    for res in chn.residues[idx:]:
+        lst_rn += 1
+        while lst_rn != _get_res_num(res, use_auth):
+            sqe += "-"
+            lst_rn += 1
+        sqe += res.one_letter_code
+    return sqe
+
+
+class ZipFileHandler:
+    """
+    A class to handle ZIP files, including nested ZIP files, allowing for 
+    listing files and extracting or reading files on demand.
+    File names are represented as tuples where all besides the last item
+    are expected to be nested paths to ZIP files.
+    
+    Attributes:
+        zip_path (str): The path to the main ZIP file.
+        file_list (list): List of tuples representing the hierarchical file structure.
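+
+    Example (illustrative; file name as in this project folder):
+        handler = ZipFileHandler("ma-taas-0272.zip")
+        print(handler.file_list)            # tuples incl. nested ZIP paths
+        content = handler.read_all_files()  # {filename_tuple: raw bytes}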
+    """
+    
+    def __init__(self, zip_path):
+        """
+        Initializes the ZipFileHandler with the path to the main ZIP file.
+        
+        Args:
+            zip_path (str): The path to the main ZIP file.
+        """
+        self.zip_path = zip_path
+        self.file_list = self._build_file_list()
+    
+    def _build_file_list(self, zip_file=None, base_path=()):
+        """
+        Recursively builds a list representing the hierarchical file structure
+        within the ZIP file.
+        
+        Args:
+            zip_file (zipfile.ZipFile, optional): The current ZIP file object being processed.
+                If None, the main ZIP file is processed. Default is None.
+            base_path (tuple): The base path within the hierarchical structure.
+        
+        Returns:
+            list of tuples representing the hierarchical file structure.
+        """
+        file_list = []
+        if zip_file is None:
+            with zipfile.ZipFile(self.zip_path, 'r') as zip_ref:
+                file_list.extend(self._build_file_list(zip_ref))
+        else:
+            for file_info in zip_file.infolist():
+                file_name = file_info.filename
+                if file_name.endswith('/'):
+                    continue  # Skip directories
+                if file_name.endswith('.zip'):
+                    with zip_file.open(file_name) as nested_zip:
+                        with zipfile.ZipFile(nested_zip) as nested_zip_ref:
+                            nested_base_path = base_path + (file_name,)
+                            file_list.extend(self._build_file_list(nested_zip_ref, nested_base_path))
+                else:
+                    file_list.append(base_path + (file_name,))
+        return file_list
+    
+    def _read_all_files(self, zip_file=None, base_path=()):
+        """
+        Recursively builds a list representing the hierarchical file structure
+        within the ZIP file and reads all files.
+        
+        Args:
+            zip_file (zipfile.ZipFile, optional): The current ZIP file object being processed.
+                If None, the main ZIP file is processed. Default is None.
+            base_path (tuple): The base path within the hierarchical structure.
+        
+        Returns:
+            dictionary with key matching tuples in self.file_list and
+            value being raw read content of file as in self.read_file.
+        """
+        file_list = {}
+        if zip_file is None:
+            with zipfile.ZipFile(self.zip_path, 'r') as zip_ref:
+                file_list.update(self._read_all_files(zip_ref))
+        else:
+            for file_name in zip_file.namelist():
+                if file_name.endswith('/'):
+                    continue  # Skip directories
+                if file_name.endswith('.zip'):
+                    with zip_file.open(file_name) as nested_zip:
+                        with zipfile.ZipFile(nested_zip) as nested_zip_ref:
+                            nested_base_path = base_path + (file_name,)
+                            file_list.update(self._read_all_files(nested_zip_ref, nested_base_path))
+                else:
+                    file_list[base_path + (file_name,)] = zip_file.open(file_name).read()
+        return file_list
+
+    def _zip_parser(self, current_zip, filename_tuple, callback):
+        """
+        Hierarchical parsing of ZIP file through filename_tuple.
+        Calls callback(zip_ref, name) at final nesting level with
+        zip_ref being the open ZipFile and name the filename within it.
+        """
+        with zipfile.ZipFile(current_zip, 'r') as zip_ref:
+            name = filename_tuple[0]
+            if len(filename_tuple) == 1:
+                return callback(zip_ref, name)
+            else:
+                with zip_ref.open(name) as nested_zip:
+                    return self._zip_parser(
+                        nested_zip, filename_tuple[1:], callback
+                    )
+    
+    def extract_file(self, filename_tuple, extract_to='.'):
+        """
+        Extracts a specific file from the ZIP file based on the hierarchical path.
+        Note that the extracted file path will be according to filename_tuple[-1]
+        within the extract_to folder.
+        
+        Args:
+            filename_tuple (tuple): A tuple representing the hierarchical path to the file.
+            extract_to (str): The directory to extract the file to. Default is current directory ('.').
+        
+        Raises:
+            FileNotFoundError: If the specified file is not found in the ZIP file.
+        """
+        self._zip_parser(
+            self.zip_path,
+            filename_tuple,
+            lambda zip_ref, name: zip_ref.extract(name, path=extract_to)
+        )
+
+    def extract_file_to_path(self, filename_tuple, output_path):
+        """
+        Extracts a specific file from the ZIP file based on the hierarchical path and
+        saves it directly to the specified output path.
+        
+        Args:
+            filename_tuple (tuple): A tuple representing the hierarchical path to the file.
+            output_path (str): The desired output path for the extracted file.
+        
+        Raises:
+            FileNotFoundError: If the specified file is not found in the ZIP file.
+        """
+        def _extract_to_path(zip_ref, name):
+            with zip_ref.open(name) as source, open(output_path, 'wb') as target:
+                shutil.copyfileobj(source, target)
+        
+        self._zip_parser(self.zip_path, filename_tuple, _extract_to_path)
+
+    def read_file(self, filename_tuple):
+        """
+        Reads and returns the content of a specific file from the ZIP file based on the hierarchical path.
+        
+        Args:
+            filename_tuple (tuple): A tuple representing the hierarchical path to the file.
+        
+        Returns:
+            bytes: The content of the specified file.
+        
+        Raises:
+            FileNotFoundError: If the specified file is not found in the ZIP file.
+        """
+        return self._zip_parser(
+            self.zip_path,
+            filename_tuple,
+            lambda zip_ref, name: zip_ref.open(name).read()
+        )
+
+    def read_all_files(self):
+        """
+        Recursively builds a list representing the hierarchical file structure
+        within the ZIP file and reads all files.
+
+        NOTE: for large zip files with data in nested zip file, this is much,
+        much faster than calling read_file individually!
+        
+        Returns:
+            dictionary with key matching tuples in self.file_list and
+            value being raw read content of file as in self.read_file.
+        """
+        return self._read_all_files()
+
+    def process_file(self, filename_tuple, func):
+        """
+        Opens a specific file from the ZIP file based on the hierarchical path
+        and returns the result of func(fo) where fo is a file-like object.
+        
+        Args:
+            filename_tuple (tuple): A tuple representing the hierarchical path to the file.
+            func (function): Function to call on file-like object.
+        
+        Returns:
+            Result of func.
+        
+        Raises:
+            FileNotFoundError: If the specified file is not found in the ZIP file.
+        """
+        return self._zip_parser(
+            self.zip_path,
+            filename_tuple,
+            lambda zip_ref, name: func(zip_ref.open(name))
+        )
+################################################################################
+
+################################################################################
+# DATA HANDLING
+################################################################################
+def _parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        "input_zip_path",
+        type=str,
+        metavar="<INPUT ZIP PATH>",
+        help="Path to ZIP file provided by depositors. Expected to contain "
+        + "assembly_info.json or domain_info.json with metadata on model " \
+        + "assembly.cif or domain.cif with the model coordinates " \
+        + "accompanying_data.zip with additional files as listed in the " \
+        + " metadata and image.png with an image for ModelArchive.",
+    )
+    parser.add_argument(
+        "out_dir",
+        type=str,
+        metavar="<OUTPUT DIR>",
+        help="Path to directory to store results. For a ZIP file named X.zip " \
+        + "the output will contain: X.cif[.gz] as ModelCIF file, X.zip with " \
+        + "accompanying data, X-image.png as image to use in ModelArchive, " \
+        + "and X-issues.json listing issues for that conversion."
+    )
+    parser.add_argument(
+        "--ncbi-json",
+        type=str,
+        default="./ncbi_data.json",
+        help="Pre-fetched NCBI entries (from 2_fetch_ncbi_entries.py).",
+    )
+    parser.add_argument(
+        "--compress",
+        default=False,
+        action="store_true",
+        help="Compress ModelCIF file with gzip.",
+    )
+    parser.add_argument(
+        "--checks-only",
+        default=False,
+        action="store_true",
+        help="Only check for issues without producing ModelCIF files.",
+    )
+    parser.add_argument(
+        "--no-extra-files",
+        default=False,
+        action="store_true",
+        help="Skip writing accompanying data (for testing).",
+    )
+
+    opts = parser.parse_args()
+
+    # check input
+    _check_file(opts.input_zip_path)
+    if opts.out_dir.endswith("/"):
+        opts.out_dir = opts.out_dir[:-1]
+    if not os.path.exists(opts.out_dir):
+        os.makedirs(opts.out_dir, exist_ok=True)
+    _check_file(opts.ncbi_json)
+    return opts
+
+
+def _get_audit_authors():
+    """Return the list of authors that produced this model."""
+    return (
+        "Dobbelstein, Adrian",
+        "Alva, Vikram",
+    )
+
+
+def _get_file_list_dict(file_list):
+    """
+    Get dictionary for access to files for conversion.
+    
+    Args:
+        file_list (list): As returned by ZipFileHandler.
+    
+    Returns:
+        dict with following keys / values:
+        - "mdl_type" / "assembly" or "domain"
+        - "json_file" / filename_tuple for access in ZipFileHandler
+        - "cif_file" / filename_tuple for access in ZipFileHandler
+        - "image_file" / filename_tuple for access in ZipFileHandler
+        - "accompanying_data" / dict with
+          filename without path as key and
+          filename_tuple for access in ZipFileHandler as value
+        - "unparsed_files" / list of filename_tuple
+    """
+    result = {
+        "mdl_type": None,
+        "json_file": None,
+        "cif_file": None,
+        "image_file": None,
+        "accompanying_data": {},
+        "unparsed_files": [],
+    }
+    for filename_tuple in file_list:
+        basename = os.path.basename(filename_tuple[-1])
+        fs = filename_tuple[0].split(os.path.sep)
+        if fs[-2] == "accompanying_data" or fs[-1] == "accompanying_data.zip":
+            assert basename not in result["accompanying_data"]
+            result["accompanying_data"][basename] = filename_tuple
+        elif basename == "assembly_info.json":
+            assert result["mdl_type"] in [None, "assembly"]
+            assert result["json_file"] is None
+            result["mdl_type"] = "assembly"
+            result["json_file"] = filename_tuple
+        elif basename == "assembly.cif":
+            assert result["mdl_type"] in [None, "assembly"]
+            assert result["cif_file"] is None
+            result["mdl_type"] = "assembly"
+            result["cif_file"] = filename_tuple
+        elif basename == "domain_info.json":
+            assert result["mdl_type"] in [None, "domain"]
+            assert result["json_file"] is None
+            result["mdl_type"] = "domain"
+            result["json_file"] = filename_tuple
+        elif basename == "domain.cif":
+            assert result["mdl_type"] in [None, "domain"]
+            assert result["cif_file"] is None
+            result["mdl_type"] = "domain"
+            result["cif_file"] = filename_tuple
+        elif basename == "image.png":
+            assert result["image_file"] is None
+            result["image_file"] = filename_tuple
+        else:
+            result["unparsed_files"].append(filename_tuple)
+    for k, v in result.items():
+        has_content = bool(v)
+        if k != "unparsed_files" and not has_content:
+            raise RuntimeError(f"{k} not set in zip file.")
+    return result
+
+
+def _get_zip_content(zip_file_path):
+    """Same as _get_file_list_dict but reads zip file content."""
+    zip_handler = ZipFileHandler(zip_file_path)
+    zip_dict = _get_file_list_dict(zip_handler.file_list)
+    zip_content = zip_handler.read_all_files()
+    for k, v in zip_dict.items():
+        if k == "accompanying_data":
+            v = {ak: zip_content[av] for ak, av in v.items()}
+        elif k in ["json_file", "cif_file", "image_file"]:
+            v = zip_content[v]
+        zip_dict[k] = v
+    return zip_dict
+
+
+def _get_modeller_software(version=None):
+    """Get MODELLER as a dictionary, suitable to create a modelcif software
+    object."""
+    # Note: depositor had suggested use of Webb and Sali 2016 but decided
+    # against it given input from Ben Webb (the 1993 citation is still the
+    # canonical one: https://salilab.org/modeller/manual/node8.html).
+    return {
+        "name": "MODELLER",
+        "classification": "model building",
+        "description": "Comparative modeling by satisfaction of spatial " \
+                       "restraints",
+        "citation": ihm.citations.modeller,
+        "location": "https://salilab.org/modeller/",
+        "type": "program",
+        "version": version,
+    }
+
+
+# global definition to avoid duplicated entries in ModelCIF
+ihm.citations.af2_multimer = ihm.Citation(
+    pmid=None,
+    title="Protein complex prediction with AlphaFold-Multimer.",
+    journal="bioRxiv",
+    volume=None,
+    page_range=None,
+    year=2021,
+    authors=[
+        "Evans, R.",
+        "O'Neill, M.",
+        "Pritzel, A.",
+        "Antropova, N.",
+        "Senior, A.",
+        "Green, T.",
+        "Zidek, A.",
+        "Bates, R.",
+        "Blackwell, S.",
+        "Yim, J.",
+        "Ronneberger, O.",
+        "Bodenstein, S.",
+        "Zielinski, M.",
+        "Bridgland, A.",
+        "Potapenko, A.",
+        "Cowie, A.",
+        "Tunyasuvunakool, K.",
+        "Jain, R.",
+        "Clancy, E.",
+        "Kohli, P.",
+        "Jumper, J.",
+        "Hassabis, D.",
+    ],
+    doi="10.1101/2021.10.04.463034",
+)
+
+
+def _get_af2_software(version=None, is_multimer=False):
+    """Get AF2 as dictionary, suitable to create a modelcif software object."""
+    if is_multimer:
+        return {
+            "name": "AlphaFold-Multimer",
+            "classification": "model building",
+            "description": "Structure prediction",
+            "citation": ihm.citations.af2_multimer,
+            "location": "https://github.com/deepmind/alphafold",
+            "type": "package",
+            "version": version,
+        }
+    else:
+        return {
+            "name": "AlphaFold",
+            "classification": "model building",
+            "description": "Structure prediction",
+            "citation": ihm.citations.alphafold2,
+            "location": "https://github.com/deepmind/alphafold",
+            "type": "package",
+            "version": version,
+        }
+
+
+def _get_dssp_software(version=None):
+    """Get DSSP SW as dictionary, suitable to create a modelcif software
+    object."""
+    # NOTE: this is using the old repo and old citation (ok for v2.3)
+    return {
+        "name": "DSSP",
+        "classification": "data collection",
+        "description": "Secondary structure assignment",
+        "citation": ihm.Citation(
+            pmid="6667333",
+            title="Dictionary of protein secondary structure: pattern "
+            + "recognition of hydrogen-bonded and geometrical features.",
+            journal="Biopolymers",
+            volume=22,
+            page_range=(2577, 2637),
+            year=1983,
+            authors=['Kabsch, W.', 'Sander, C.'],
+            doi="10.1002/bip.360221211",
+        ),
+        "location": "https://github.com/cmbi/dssp",
+        "type": "program",
+        "version": version,
+    }
+
+
+def _get_foldseek_software(version=None):
+    """Get Foldseek SW as dictionary, suitable to create a modelcif software
+    object."""
+    return {
+        "name": "Foldseek",
+        "classification": "data collection",
+        "description": "Protein structure search",
+        "citation": ihm.Citation(
+            pmid="37156916",
+            title="Fast and accurate protein structure search with Foldseek.",
+            journal="Nat Biotechnol",
+            volume=42,
+            page_range=(243, 246),
+            year=2024,
+            authors=[
+                'van Kempen, M.', 'Kim, S.S.', 'Tumescheit, C.', 'Mirdita, M.',
+                'Lee, J.', 'Gilchrist, C.L.M.', 'Soeding, J.', 'Steinegger, M.'
+            ],
+            doi="10.1038/s41587-023-01773-0",
+        ),
+        "location": "https://search.foldseek.com/search",
+        "type": "package",
+        "version": version,
+    }
+
+
+def _get_sam_cc_software(version=None):
+    """Get SamCC-Turbo SW as dictionary, suitable to create a modelcif software
+    object."""
+    return {
+        "name": "SamCC-Turbo",
+        "classification": "data collection",
+        "description": "Detection and measurement of coiled coils",
+        "citation": ihm.Citation(
+            pmid="33325494",
+            title="A library of coiled-coil domains: from regular bundles to "
+            + "peculiar twists.",
+            journal="Bioinformatics",
+            volume=36,
+            page_range=(5368, 5376),
+            year=2020,
+            authors=[
+                'Szczepaniak, K.', 'Bukala, A.', 'da Silva Neto, A.M.',
+                'Ludwiczak, J.', 'Dunin-Horkawicz, S.'
+            ],
+            doi="10.1093/bioinformatics/btaa1041",
+        ),
+        "location": "https://github.com/labstructbioinf/samcc_turbo",
+        "type": "package",
+        "version": version,
+    }
+
+
+def _get_us_align_software(version=None):
+    """Get US-align SW as dictionary, suitable to create a modelcif software
+    object."""
+    return {
+        "name": "US-align",
+        "classification": "data collection",
+        "description": "Universal Structural alignment of macromolecules",
+        "citation": ihm.Citation(
+            pmid="36038728",
+            title="US-align: universal structure alignments of proteins, "
+            + "nucleic acids, and macromolecular complexes.",
+            journal="Nat Methods",
+            volume=19,
+            page_range=(1109, 1115),
+            year=2022,
+            authors=['Zhang, C.', 'Shine, M.', 'Pyle, A.M.', 'Zhang, Y.'],
+            doi="10.1038/s41592-022-01585-1",
+        ),
+        "location": "https://zhanggroup.org/US-align/",
+        "type": "package",
+        "version": version,
+    }
+
+
+def _get_af2_description(pred):
+    """Get description text based on parameters used."""
+    if pred["version"] == "2.1.2":
+        num_models_str = "producing 5 models with 3 recycles"
+    elif pred["version"] == "2.3.1":
+        num_models_str = "producing 10 models (2 random seeds per parameter " \
+                         "set) with up to 20 recycles"
+    else:
+        raise RuntimeError(f"Unexpected AF2 version: '{pred['version']}'")
+    return f"using AlphaFold v{pred['version']} {num_models_str} and " \
+           f"{pred['num_ensemble']} ensemble each, with AMBER relaxation, " \
+           f"using templates up to date {pred['tmplate_cutoff_date']}, " \
+           f"ranked by ipTM*0.8+pTM*0.2, starting from MSAs with reduced_dbs " \
+           f"setting"
+
+
+def _get_af2_merge_dict(pred):
+    """Get N/C-terminal padding lengths for artificial coiled-coils."""
+    pred_name = os.path.splitext(pred["prediction_p"])[0]
+    ps = pred_name.split('_')
+    pred_merge = {"N": 0, "C": 0}
+    if "mergeN" in ps:
+        pred_merge["N"] = int(ps[ps.index("mergeN") + 1])
+    if "mergeC" in ps:
+        pred_merge["C"] = int(ps[ps.index("mergeC") + 1])
+    return pred_merge
+
+
+def _get_af2_range_string(pred):
+    """Get string repr. of modelled range incl. coiled-coil padding."""
+    range_str = f"{pred['prediction_start']}-{pred['prediction_end']}"
+    pred_merge = _get_af2_merge_dict(pred)
+    if pred_merge["N"] or pred_merge["C"]:
+        if pred_merge["N"] and pred_merge["C"]:
+            seq_str = "sequences"
+        else:
+            seq_str = "sequence"
+        pad_strs = []
+        if pred_merge["N"]:
+            pad_strs.append(f"{pred_merge['N']} N-terminal residues")
+        if pred_merge["C"]:
+            pad_strs.append(f"{pred_merge['C']} C-terminal residues")
+        pad_str = ", ".join(pad_strs)
+        range_str += f" padded with artificial coiled-coil {seq_str} " \
+                     f"({pad_str}) to improve oligomerization"
+    return range_str
+
+
+def _get_preds_and_parts(json_dict, mdl_type):
+    """Get list of AF2 pred. and trucated parts in a consistent way.
+    For domains, the final model is the truncated part and gets "model"
+    as truncated_prediction_path.
+    """
+    if mdl_type == "assembly":
+        af2_preds = json_dict["AF_predictions"]
+        trunc_parts = json_dict["assembly_parts"]
+    elif mdl_type == "domain":
+        af2_pred = json_dict["source_AF_prediction"]
+        af2_preds = [af2_pred]
+        trunc_parts = [{
+            "part_start": json_dict["domain_sequence_start"],
+            "part_end": json_dict["domain_sequence_end"],
+            "truncated_prediction_path": "model",
+            "source_prediction": os.path.splitext(af2_pred["prediction_p"])[0]
+        }]
+    else:
+        raise RuntimeError(f"Unexpected mdl_type: '{mdl_type}'")
+    # sort them by range
+    af2_preds.sort(
+        key=lambda pred: (
+            int(pred["prediction_start"]), int(pred["prediction_end"])
+        )
+    )
+    trunc_parts.sort(
+        key=lambda part: (part["part_start"], part["part_end"])
+    )
+    return af2_preds, trunc_parts
+
+
+def _get_af2_files(af2_path):
+    """Return names for PDB file, PKL file, and PAE file."""
+    af_base = os.path.splitext(af2_path)[0]
+    return af2_path, f"{af_base}_info.pkl", f"PAE_{af_base}.png"
+
+
+def _get_protocol_steps_and_software(json_dict, mdl_type, mdl_id):
+    """Create the list of protocol steps with software and parameters used."""
+    protocol = []
+
+    # LOGIC:
+    # - First steps are AF2 predictions with different settings
+    # -> each setting has a number of ranges
+    # -> if only one AF pred., the text gets simplified
+    # - Next step: models truncated (and superposed) into parts
+    # -> one step for all ranges
+    # -> no superpos. if only one part
+    # - For assemblies with more than one part:
+    # -> MODELLER step
+    # -> QE step combining pLDDT and MODELLER energy score
+    # Note: sanity checks all done in _do_sanity_checks and not here!
+
+    # collect info on predictions and parts
+    af2_preds, trunc_parts = _get_preds_and_parts(json_dict, mdl_type)
+
+    # group AF2 predictions by SW config
+    af2_config_preds = {} # dict = version, tmplate_cutoff_date, num_ensemble
+    for pred in af2_preds:
+        config_tuple = (
+            pred["version"], pred["tmplate_cutoff_date"], pred["num_ensemble"]
+        )
+        if config_tuple not in af2_config_preds:
+            af2_config_preds[config_tuple] = [pred]
+        else:
+            af2_config_preds[config_tuple].append(pred)
+    
+    # create AF2 steps
+    for config_tuple in sorted(af2_config_preds):
+        version, max_template_date, num_ensemble = config_tuple
+        sw_params = {
+            "model_preset": "multimer",
+            "db_preset": "reduced_dbs",
+            "max_template_date": max_template_date,
+            "num_ensemble": num_ensemble,
+        }
+        if version == "2.3.1":
+            sw_params["num_multimer_predictions_per_model"] = 2
+        sw_plus_params = [(
+            _get_af2_software(version=version, is_multimer=True), sw_params
+        )]
+        # build up text
+        cur_preds = af2_config_preds[config_tuple]
+        ranges_txt = ", ".join([
+            _get_af2_range_string(pred) for pred in cur_preds
+        ])
+        af_txt = _get_af2_description(cur_preds[0]) # all cur_preds the same
+        # shortened text for single pred.
+        if len(af2_preds) == 1:
+            details = f"AlphaFold-Multimer prediction {af_txt} for range " \
+                      f"{ranges_txt}."
+        else:
+            details = f"AlphaFold-Multimer prediction of overlapping, " \
+                      f"partial sequences {af_txt}. The following residue " \
+                      f"ranges were predicted: {ranges_txt}."
+        # collect output files
+        output = []
+        for pred in cur_preds:
+            output.extend(_get_af2_files(pred["prediction_p"]))
+        protocol.append({
+            "method_type": "modeling",
+            "name": "AF2",
+            "details": details,
+            "input": [f"target_sequences_{mdl_id}", f"ref_dbs_AF2_{version}"],
+            "output": output,
+            "software_plus_params": sw_plus_params,
+        })
+
+    # create truncation step
+    ranges_txt = ", ".join([
+        f"{part['part_start']}-{part['part_end']}" for part in trunc_parts
+    ])
+    if len(trunc_parts) == 1:
+        details = f"Model truncated to predicted domain range {ranges_txt}."
+    else:
+        details = f"Models truncated and superposed based on the following " \
+                  f"overlapping residue ranges: {ranges_txt}."
+    trunc_data = [part["truncated_prediction_path"] for part in trunc_parts]
+    # add MDL ID suffix if needed
+    trunc_data = [
+        (f"model_{mdl_id}" if td == "model" else td) for td in trunc_data
+    ]
+    protocol.append({
+        "method_type": "other",
+        "name": "Truncation",
+        "details": details,
+        "input": sorted(set([
+            f"{part['source_prediction']}.pdb" for part in trunc_parts
+        ])),
+        "output": trunc_data,
+        "software_plus_params": [],
+    })
+
+    # create other steps to combine truncated parts
+    if len(trunc_parts) > 1:
+        # MODELLER step
+        protocol.append({
+            "method_type": "modeling",
+            "name": "MODELLER",
+            "details": "Assembly of the truncated models using MODELLER " \
+                       "executed using the \"AutoModel\" configuration.",
+            "input": trunc_data,
+            "output": [f"model_{mdl_id}"],
+            "software_plus_params": [(_get_modeller_software("10.4"), {})],
+        })
+        # QE step
+        protocol.append({
+            "method_type": "other",
+            "name": "QE",
+            "details": "Calculation of combined per-residue confidence " \
+                       "metric (range 0-100, higher is better) based on " \
+                       "per-residue AlphaFold confidence score (pLDDT) " \
+                       "(range 0-100, higher is better, for residues with " \
+                       "overlapping predictions, the higher pLDDT score is " \
+                       "used) and per-residue MODELLER energy score (>0, " \
+                       "lower is better) using the following formula: " \
+                       "confidence = clip(0,100, pLDDT - 60 / (1 + exp((150 " \
+                       "- energy)/20))). This score reduces the pLDDT score " \
+                       "by max. 60 (for energy values >> 150). For energy " \
+                       "values < 100, the score approximately follows the " \
+                       "pLDDT score.",
+            "input": [f"model_{mdl_id}"],
+            "output": [f"model_{mdl_id}"],
+            "software_plus_params": [],
+        })
+    if "software_used_for_domain_annotation" in json_dict:
+        # domain annotation step
+        protocol.append({
+            "method_type": "other",
+            "name": "domain annotation",
+            "details": "Domain annotation performed using a combination of " \
+                       "the following tools: DSSP, SamCC-Turbo, Foldseek, " \
+                       "US-align.",
+            "input": [f"model_{mdl_id}"],
+            "output": [f"model_{mdl_id}"],
+            "software_plus_params": [
+                (_get_dssp_software("2.3.0"), {}),
+                (_get_sam_cc_software(), {}),
+                (_get_foldseek_software(), {}),
+                (_get_us_align_software(), {})
+            ],
+        })
+
+    return protocol
+
+
+def _process_ent(ent):
+    """Helper to process OST entities for sanity checks.
+    Returns:
+    - ch_names: list of chain names in order as appearing in file
+    - mdlsqe: atomseq (no gaps) for model
+    -> incl. assertion that all chains have same seq.
+    - resnum_range: (min. res. num., max. res. num.)
+    -> incl. assertion that res. num. are continuous and w/o gaps
+    - ent_bfs: b-factor for each residue
+    -> incl. assertion that all atoms of res. have same
+    -> list of length len(ch_names) * len(mdlsqe)
+    """
+    # chain names in order
+    ch_names = [ch.name for ch in ent.chains]
+    # sequence
+    unique_atomseqs = sorted(set([
+        "".join(res.one_letter_code for res in ch.residues) \
+        for ch in ent.chains
+    ]))
+    assert len(unique_atomseqs) == 1
+    mdlsqe = unique_atomseqs[0]
+    # res. nums (required to be continuous)
+    resnum_ranges = []
+    for ch in ent.chains:
+        res_nums = [res.number.num for res in ch.residues]
+        assert res_nums == list(range(min(res_nums), max(res_nums) + 1))
+        resnum_ranges.append((min(res_nums), max(res_nums)))
+    unique_resnum_ranges = sorted(set(resnum_ranges))
+    assert len(unique_resnum_ranges) == 1
+    resnum_range = unique_resnum_ranges[0]
+    # b-factors
+    ent_bfs = []
+    for res in ent.residues:
+        b_factors = [a.b_factor for a in res.atoms]
+        assert len(set(b_factors)) == 1 # must all be equal!
+        ent_bfs.append(b_factors[0])
+    assert len(ent_bfs) == len(ch_names) * len(mdlsqe)
+    return ch_names, mdlsqe, resnum_range, ent_bfs
+
+
+def _load_cif_file(zip_dict):
+    """Read CIF file for given entry.
+    Returns OST entity and dictionary of categories to put back into resulting
+    ModelCIF file with gemmi (categories not supported by python-modelcif).
+    """
+    # fetch file and fix for OST
+    cif_str = zip_dict["cif_file"]
+    doc = gemmi.cif.read_string(cif_str)
+    block = doc.sole_block()
+    cat_dict = block.get_mmcif_category('_struct_ref.')
+    if cat_dict and "db_code" not in cat_dict:
+        if "pdbx_db_accession" in cat_dict:
+            cat_dict["db_code"] = cat_dict["pdbx_db_accession"]
+        else:
+            lengths = [len(v) for v in cat_dict.values()]
+            assert len(set(lengths)) == 1
+            cat_dict["db_code"] = [""] * lengths[0]
+        block.set_mmcif_category('_struct_ref.', cat_dict)
+    # kept mmCIF data: to check consistency later
+    kept_cif_cats = {}
+    for cat in ["_pdbx_domain.", "_pdbx_feature_domain.", "_pdbx_domain_range."]:
+        cat_dict = block.get_mmcif_category(cat)
+        if cat_dict:
+            kept_cif_cats[cat] = cat_dict
+    # fix data
+    if len(kept_cif_cats) == 3:
+        # fix IDs if possible
+        pd_ids = kept_cif_cats["_pdbx_domain."]["id"]
+        pfd_ids = kept_cif_cats["_pdbx_feature_domain."]["id"]
+        pfd_dom_ids = kept_cif_cats["_pdbx_feature_domain."]["domain_id"]
+        pdr_dom_ids = kept_cif_cats["_pdbx_domain_range."]["domain_id"]
+        if pdr_dom_ids != pd_ids and pfd_ids == pd_ids \
+           and pfd_dom_ids == pdr_dom_ids:
+            kept_cif_cats["_pdbx_feature_domain."]["domain_id"] = pd_ids
+            kept_cif_cats["_pdbx_domain_range."]["domain_id"] = pd_ids
+        # fix characters in expected places
+        exp_cat_items = ['_pdbx_domain.details', '_pdbx_feature_domain.feature']
+        for cat, cat_dict in kept_cif_cats.items():
+            for item in cat_dict.keys():
+                if f"{cat}{item}" in exp_cat_items:
+                    cat_dict[item] = [
+                        value.replace('°-', ' degree ') \
+                             .replace('°', ' degree') \
+                             .replace('αβ', 'alpha/beta') \
+                             .replace('α', 'alpha') \
+                             .replace('β', 'beta') \
+                             .replace('“', '"') \
+                             .replace('→', ' to ') \
+                        for value in cat_dict[item]
+                    ]
+    # get OST ent.
+    cif_ent, info, ss = io.MMCifStrToEntity(
+        doc.as_string(), profile=io.profiles["DEFAULT"], process=True
+    )
+    return cif_ent, kept_cif_cats
+
+
+def _do_sanity_checks(mdl_id, json_dict, zip_dict, cif_ent, kept_cif_cats):
+    """Check if everything in order and return issues for weird cases."""
+    issues = []
+    
+    # get some general info depending on model type
+    mdl_type = zip_dict["mdl_type"]
+    plddt_len = len(json_dict["pLDDT"])
+    if mdl_type == "assembly":
+        mdl_seq = json_dict["predicted_sequence"]
+        seq_range = (
+            json_dict["predicted_sequence_start"],
+            json_dict["predicted_sequence_end"],
+        )
+        ent_desc = json_dict["source_sequence_description"]
+        assert "software_used_for_domain_annotation" in json_dict
+        assert len(json_dict["per_res_conf"]) == plddt_len
+    else:
+        assert "predicted_sequence" not in json_dict
+        mdl_seq = json_dict["domain_sequence"]
+        seq_range = (
+            json_dict["domain_sequence_start"],
+            json_dict["domain_sequence_end"],
+        )
+        label = f'domain/motif "{json_dict["domain_name"]}"'
+        assert label in json_dict["abstract"]
+        ent_desc = f'{json_dict["source_sequence_description"]} ({label})'
+        assert "merge" not in json_dict["source_AF_prediction"]["prediction_p"]
+        assert "software_used_for_domain_annotation" not in json_dict
+        assert "per_res_conf" not in json_dict
+
+    # global checks
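+    # model sequence must be free of ambiguous/non-standard one-letter codes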
+    assert sum((c in "XOUBJZ") for c in mdl_seq) == 0
+    src_seq = json_dict["source_sequence"]
+    assert mdl_seq == src_seq[seq_range[0] - 1:seq_range[1]]
+    if len(mdl_seq) != plddt_len:
+        short_data = (len(mdl_seq), plddt_len)
+        long_data = (mdl_seq, json_dict["pLDDT"])
+        issues.append(("plddt_size_mismatch", short_data, long_data))
+    assert seq_range[1] > seq_range[0]
+    if seq_range[1] > len(src_seq):
+        short_data = (seq_range, len(src_seq))
+        issues.append(("range_source_mismatch", short_data, []))
+    for key in ["percent_confident_residues", "mean_per_res_conf"]:
+        assert key in json_dict
+    extra_keys = set(json_dict.keys()) - set([
+        'AF_predictions',
+        'MODELLER_energy_score',
+        'abstract',
+        'assembly_parts',
+        'domain_name',
+        'domain_sequence',
+        'domain_sequence_end',
+        'domain_sequence_start',
+        'mean_per_res_conf',
+        'pLDDT',
+        'per_res_conf',
+        'percent_confident_residues',
+        'predicted_sequence',
+        'predicted_sequence_end',
+        'predicted_sequence_start',
+        'software_used_for_domain_annotation',
+        'software_used_for_prediction',
+        'source_AF_prediction',
+        'source_sequence',
+        'source_sequence_RefSeq_ID',
+        'source_sequence_description',
+        'title',
+        'source_sequence_download_date',
+        'domain_id',
+        'coiled_coil_annot',
+    ])
+    if len(extra_keys) > 0:
+        issues.append(("extra_keys", sorted(extra_keys), []))
+    # unused/unknown coiled_coil_annot (expected to be None if present)
+    assert json_dict.get("coiled_coil_annot") is None
+
+    # for valid mmCIF...
+    mmcif_regex_ent = "[][ \t_(),.;:\"&<>/\\\\{}'`~!@#$%?+=*A-Za-z0-9|^-]*"
+    mmcif_regex_desc = "[][ \n\t()_,.;:\"&<>/\\\\{}'`~!@#$%?+=*A-Za-z0-9|^-]*"
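+    # character sets allowed for mmCIF values; the second regex additionally
+    # permits newlines for multi-line text fields (title/abstract)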
+    assert bool(re.fullmatch(mmcif_regex_ent, ent_desc))
+    assert bool(re.fullmatch(mmcif_regex_desc, json_dict["title"]))
+    assert bool(re.fullmatch(mmcif_regex_desc, json_dict["abstract"]))
+
+    # collect info on predictions and parts
+    af2_preds, trunc_parts = _get_preds_and_parts(json_dict, mdl_type)
+
+    # check AF pred.
+    exp_pred_keys = set([
+        "prediction_start", "prediction_end", "prediction_p", "version",
+        "tmplate_cutoff_date", "num_ensemble", "pTM", "ipTM"
+    ])
+    opt_pred_keys = set([
+        "merged_GCN4linker_len_N", "merged_GCN4linker_len_C"
+    ])
+    pred_ranges = {}
+    sw_kws_to_check = set()
+    for pred in af2_preds:
+        # check keys
+        extra_keys = set(pred.keys()) - exp_pred_keys - opt_pred_keys
+        assert len(extra_keys) == 0
+        noncovered_keys = exp_pred_keys - pred.keys()
+        assert len(noncovered_keys) == 0
+        assert pred["version"] in pred["prediction_p"]
+        # keep track of files and ranges for checks of parts
+        pred_name = os.path.splitext(pred["prediction_p"])[0]
+        assert pred_name not in pred_ranges
+        pred_range = (
+            int(pred["prediction_start"]), int(pred["prediction_end"])
+        )
+        pred_ranges[pred_name] = pred_range
+        # keep track of AF2 versions
+        sw_kws_to_check.add(("AlphaFold", pred["version"]))
+        # check merge stuff
+        pred_merge = _get_af2_merge_dict(pred)
+        dict_pred_merge = {"N": 0, "C": 0}
+        if "merged_GCN4linker_len_N" in pred.keys():
+            dict_pred_merge["N"] = pred["merged_GCN4linker_len_N"]
+        if "merged_GCN4linker_len_C" in pred.keys():
+            dict_pred_merge["C"] = pred["merged_GCN4linker_len_C"]
+        assert pred_merge == dict_pred_merge
+        # check acc. file content
+        pdb_file, pkl_file, pae_file = _get_af2_files(pred["prediction_p"])
+        pdb_ent = io.PDBStrToEntity(
+            zip_dict["accompanying_data"][pdb_file],
+            profile=io.profiles["DEFAULT"],
+            process=True
+        )
+        pkl_data = pickle.loads(zip_dict["accompanying_data"][pkl_file])
+        (
+            pdb_ch_names, pdb_mdlsqe, pdb_resnum_range, pdb_ent_bfs
+        ) = _process_ent(pdb_ent)
+        # pdb_ch_names can have random order and we don't care...
+        # we ignore pdb_resnum_range and just check seq.
+        exp_mdlsqe = src_seq[pred_range[0] - 1:pred_range[1]]
+        cut_end = len(pdb_mdlsqe) - pred_merge["C"]
+        pdb_mdlsqe_cut = pdb_mdlsqe[pred_merge["N"]:cut_end]
+        assert pdb_mdlsqe_cut == exp_mdlsqe
+        # check QE
+        exp_qe_len = len(pdb_ch_names) * len(pdb_mdlsqe)
+        assert len(pkl_data["pLDDT"]) == exp_qe_len
+        assert len(pkl_data["PAE"]) == exp_qe_len
+        assert len(pkl_data["PAE"][0]) == exp_qe_len
+        assert "pTM" in pkl_data
+        assert "ipTM" in pkl_data
+        qe_max_diff = np.max(abs(np.asarray(pkl_data["pLDDT"]) - pdb_ent_bfs))
+        if qe_max_diff > 0.01:
+            # 2nd option: chain names in PDB were reordered compared to pkl
+            bfs_arr = np.asarray(pdb_ent_bfs).reshape(len(pdb_ch_names), -1)
+            ch_indices = [pdb_ch_names.index(ch) for ch in ['A', 'C', 'B']]
+            bfs_new = bfs_arr[ch_indices,:].flatten()
+            qe_max_diff = np.max(abs(np.asarray(pkl_data["pLDDT"]) - bfs_new))
+        if qe_max_diff > 0.01:
+            short_data = [qe_max_diff, pkl_file]
+            long_data = [list(pkl_data["pLDDT"]), pdb_ent_bfs]
+            issues.append(("pkl_plddt_diff", short_data, long_data))
+        # check redundant data
+        assert str(pred["prediction_start"]) == pkl_data["pred_start"]
+        assert str(pred["prediction_end"]) == pkl_data["pred_end"]
+        assert pred["tmplate_cutoff_date"] == pkl_data["tmpl_max_date"]
+        assert pred["version"] == pkl_data["version"]
+        assert pred["num_ensemble"] == pkl_data["num_ensemble"]
+        if pred["pTM"] != pkl_data["pTM"]:
+            long_data = [pred["pTM"], float(pkl_data["pTM"])]
+            short_data = [abs(long_data[0] - long_data[1]), pkl_file]
+            issues.append(("pkl_ptm_diff", short_data, long_data))
+        if pred["ipTM"] != pkl_data["ipTM"]:
+            long_data = [pred["ipTM"], float(pkl_data["ipTM"])]
+            short_data = [abs(long_data[0] - long_data[1]), pkl_file]
+            issues.append(("pkl_iptm_diff", short_data, long_data))
+        if "merged_GCN4linker_len_N" in pred:
+            if "merge_len_N" in pkl_data.keys():
+                other_len = pkl_data["merge_len_N"]
+            else:
+                # HACK for 0898
+                assert "merge_len" in pkl_data.keys()
+                other_len = pkl_data["merge_len"]
+            assert pred["merged_GCN4linker_len_N"] == other_len
+        if "merged_GCN4linker_len_C" in pred:
+            assert pred["merged_GCN4linker_len_C"] == pkl_data["merge_len_C"]
+        # check expected fixed data
+        assert pkl_data["max_msa_size"] == 10000
+        assert pkl_data["db_preset"] == "reduced_dbs"
+        assert pkl_data["amber_relax"] == True
+
+    # check truncated parts
+    if len(trunc_parts) > 1:
+        assert len(json_dict["MODELLER_energy_score"]) == plddt_len
+        sw_kws_to_check.add(("MODELLER", "10.4"))
+    else:
+        assert "MODELLER_energy_score" not in json_dict
+    exp_part_keys = sorted([
+        "part_start", "part_end", "truncated_prediction_path",
+        "source_prediction"
+    ])
+    part_names = []
+    source_predictions = []
+    for part in trunc_parts:
+        # check metadata
+        assert sorted(part.keys()) == exp_part_keys
+        source_predictions.append(part["source_prediction"])
+        part_names.append(part["truncated_prediction_path"])
+        pred_range = pred_ranges[part["source_prediction"]]
+        assert part["part_start"] >= pred_range[0]
+        assert part["part_end"] <= pred_range[1]
+        # check acc. file content
+        part_path = part["truncated_prediction_path"]
+        if part_path != "model":
+            pdb_ent = io.PDBStrToEntity(
+                zip_dict["accompanying_data"][part_path],
+                profile=io.profiles["DEFAULT"],
+                process=True
+            )
+            (
+                pdb_ch_names, pdb_mdlsqe, pdb_resnum_range, pdb_ent_bfs
+            ) = _process_ent(pdb_ent)
+            part_range = (part["part_start"], part["part_end"])
+            assert pdb_resnum_range == part_range # not really important
+            exp_mdlsqe = src_seq[part_range[0] - 1:part_range[1]]
+            assert exp_mdlsqe == pdb_mdlsqe
+
+    # check CIF file
+    (
+        cif_ch_names, cif_mdlsqe, cif_resnum_range, cif_ent_bfs
+    ) = _process_ent(cif_ent)
+    assert seq_range == cif_resnum_range # NOTE: critical for kept_cif_cats
+    assert cif_mdlsqe == mdl_seq
+    if cif_ch_names != ['A', 'B', 'C']:
+        issues.append(("cif_ch_names", cif_ch_names, []))
+    # check b-factors (assume to match average or first chain)
+    if mdl_id == "ma-taas-2867":
+        # known to have faulty b-factors in CIF file; should use ones from PDB
+        bfs_arr = np.asarray(pdb_ent_bfs).reshape(len(cif_ch_names), -1)
+        bfs_avg = bfs_arr[0,:]
+    else:
+        bfs_arr = np.asarray(cif_ent_bfs).reshape(len(cif_ch_names), -1)
+        bfs_avg = bfs_arr.mean(axis=0)
+    assert len(bfs_avg) == len(cif_mdlsqe)
+    if "per_res_conf" in json_dict:
+        # for assemblies
+        jd_sc = json_dict["per_res_conf"]
+        max_diff = np.max(abs(bfs_avg - jd_sc))
+        if max_diff >= 0.01:
+            long_data = [bfs_avg.tolist(), jd_sc]
+            issues.append(("per_res_conf", max_diff, long_data))
+    else:
+        # for domains
+        jd_sc = json_dict["pLDDT"]
+    max_diff = np.max(abs(bfs_arr[0,:] - jd_sc))
+    # b-factors known to be faulty for some models...
+    if max_diff >= 0.01 and mdl_id[-4:] not in ["9293", "9344"]:
+        long_data = [bfs_arr[0,:].tolist(), jd_sc]
+        issues.append(("cif_b_factors", max_diff, long_data))
+
+    # make sure prediction covers everything
+    min_start = min(part["part_start"] for part in trunc_parts)
+    max_end = max(part["part_end"] for part in trunc_parts)
+    if min_start != seq_range[0] or max_end != seq_range[1]:
+        short_data = [(min_start, max_end), seq_range]
+        issues.append(("pred_range_mismatch", short_data, []))
+    assert len(set(part_names)) == len(trunc_parts)
+    assert sorted(set(source_predictions)) == sorted(pred_ranges)
+
+    # all files there?
+    exp_files = []
+    for pred in af2_preds:
+        exp_files.extend(_get_af2_files(pred["prediction_p"]))
+    for part in trunc_parts:
+        if part["truncated_prediction_path"] != "model":
+            exp_files.append(part["truncated_prediction_path"])
+    acc_files = sorted(zip_dict["accompanying_data"].keys())
+    assert len(set(exp_files) - set(acc_files)) == 0
+    extra_files = set(acc_files) - set(exp_files)
+    if len(extra_files) > 0:
+        long_data = [sorted(exp_files), acc_files]
+        issues.append(("extra_acc_files", sorted(extra_files), long_data))
+
+    # check SW
+    sw_checked = set()
+    claimed_sw = json_dict["software_used_for_prediction"]
+    for kw_to_check in sw_kws_to_check:
+        matching_sw = [
+            sw for sw in claimed_sw \
+            if all(kw in sw for kw in kw_to_check)
+        ]
+        assert len(matching_sw) == 1
+        sw_checked.add(matching_sw[0])
+    assert sorted(sw_checked) == sorted(claimed_sw)
+    if "software_used_for_domain_annotation" in json_dict:
+        claimed_sw = json_dict["software_used_for_domain_annotation"]
+        exp_sw = [
+            'DSSP 2.3.0 (Kabsch and Sander 1983)',
+            'Foldseek (van Kempen et al. 2024)',
+            'SamCC-Turbo (Szczepaniak et al. 2020)',
+            'US-align (Zhang et al. 2022)'
+        ]
+        assert sorted(claimed_sw) == exp_sw
+
+    # QE checks
+    plddts = json_dict["pLDDT"]
+    plddt_range = (min(plddts), max(plddts))
+    if plddt_range[0] < 0 or plddt_range[1] > 100:
+        issues.append(("plddt_range_mismatch", plddt_range, plddts))
+    if "MODELLER_energy_score" in json_dict:
+        energy = json_dict["MODELLER_energy_score"]
+        if min(energy) <= 0:
+            issues.append(("energy_range_mismatch", min(energy), energy))
+    if "per_res_conf" in json_dict:
+        per_res_confs = json_dict["per_res_conf"]
+        prc_range = (min(per_res_confs), max(per_res_confs))
+        if prc_range[0] < 0 or prc_range[1] > 100:
+            issues.append(("prc_range_mismatch", prc_range, per_res_confs))
+    if "MODELLER_energy_score" not in json_dict \
+       and "per_res_conf" in json_dict:
+        v1 = np.asarray(plddts)
+        v2 = np.asarray(per_res_confs)
+        qe_max_diff = np.max(abs(v1 - v2))
+        if qe_max_diff > 0.05:
+            long_data = [plddts, per_res_confs]
+            issues.append(("prc_plddt_diff", qe_max_diff, long_data))
+
+    # check domains
+    if mdl_type == "assembly":
+        exp_num_kept_cif_cats = 3
+    else:
+        exp_num_kept_cif_cats = 0
+    if exp_num_kept_cif_cats != len(kept_cif_cats):
+        short_data = (exp_num_kept_cif_cats, sorted(kept_cif_cats.keys()))
+        issues.append(("num_kept_cif_cats", short_data, kept_cif_cats))
+    # check categories
+    if len(kept_cif_cats) == 3:
+        # the following should all match after fixes applied in _load_cif_file
+        pd_ids = sorted(kept_cif_cats["_pdbx_domain."]["id"])
+        if len(set(pd_ids)) != len(pd_ids):
+            issues.append(("non_unique_pd_ids", [], pd_ids))
+        pfd_ids = kept_cif_cats["_pdbx_feature_domain."]["id"]
+        if len(set(pfd_ids)) != len(pfd_ids):
+            issues.append(("non_unique_pfd_ids", [], pfd_ids))
+        pfd_dom_ids = sorted(
+            kept_cif_cats["_pdbx_feature_domain."]["domain_id"]
+        )
+        if pd_ids != pfd_dom_ids:
+            issues.append(("pfd_dom_ids", [], [pd_ids, pfd_dom_ids]))
+        pdr_dom_ids = sorted(kept_cif_cats["_pdbx_domain_range."]["domain_id"])
+        if pd_ids != pdr_dom_ids:
+            issues.append(("pdr_dom_ids", [], [pd_ids, pdr_dom_ids]))
+        # special characters?
+        for cat, cat_dict in kept_cif_cats.items():
+            for item, values in cat_dict.items():
+                for value in values:
+                    # use the most permissive of the two regexes (allows newlines)
+                    if not re.fullmatch(mmcif_regex_desc, str(value)):
+                        invalid_chars = re.sub(mmcif_regex_desc, '', str(value))
+                        issues.append((
+                            "cif_invalid_chars",
+                            (f"{cat}{item}", invalid_chars),
+                            value
+                        ))
+        # matches the range?
+        cat_dict = kept_cif_cats["_pdbx_domain_range."]
+        res_list = list(zip(
+            cat_dict["beg_label_asym_id"],
+            cat_dict["beg_label_comp_id"],
+            cat_dict["beg_label_seq_id"],
+        )) + list(zip(
+            cat_dict["end_label_asym_id"],
+            cat_dict["end_label_comp_id"],
+            cat_dict["end_label_seq_id"],
+        ))
+        for ch_name, res_name, res_num in res_list:
+            res = cif_ent.FindResidue(ch_name, int(res_num))
+            if not res.IsValid() or not res.name == res_name:
+                issues.append((
+                    "invalid_dom_range_res", [ch_name, res_name, res_num], []
+                ))
+
+    return issues
+
+
+def _get_entities(json_dict, ncbi_data, cif_ent, mdl_type):
+    """Gather data for the mmCIF (target) entities.
+    Returns (list of target entities, list of issues)
+    """
+    issues = []
+    # get NCBI data
+    ncbi_ac = json_dict["source_sequence_RefSeq_ID"]
+    ncbi_item = ncbi_data[ncbi_ac]
+    ncbi_info = ncbi_item["info"]
+    assert ncbi_ac == ncbi_info["AccessionVersion"]
+    ncbi_seq = ncbi_item["seq_str"]
+    # get json_dict data
+    if mdl_type == "assembly":
+        ent_desc = json_dict["source_sequence_description"]
+        # full seq.
+        seqres = json_dict["source_sequence"]
+        ncbi_range = (1, len(ncbi_seq))
+        mdl_range = ncbi_range
+    else:
+        label = f'domain/motif "{json_dict["domain_name"]}"'
+        ent_desc = f'{json_dict["source_sequence_description"]} ({label})'
+        # cut seq.
+        seqres = json_dict["domain_sequence"]
+        ncbi_range = (
+            json_dict["domain_sequence_start"],
+            json_dict["domain_sequence_end"],
+        )
+        mdl_range = (1, len(seqres))
+    auth_seq_id_map = {
+        pdb_idx: ncbi_idx for pdb_idx, ncbi_idx in zip(
+            range(mdl_range[0], mdl_range[1] + 1),
+            range(ncbi_range[0], ncbi_range[1] + 1)
+        )
+    }
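+    # auth_seq_id_map maps entity seq. positions to NCBI residue numbering
+    # (identity for assemblies, offset by the domain start for domains)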
+    # compare with NCBI
+    ncbi_seq_cut = ncbi_seq[ncbi_range[0] - 1:ncbi_range[1]]
+    if seqres != ncbi_seq_cut:
+        # one case exists with a single mismatch
+        assert len(seqres) == len(ncbi_seq_cut)
+        mismatches = [
+            (idx + 1, c1, c2) \
+            for idx, (c1, c2) in enumerate(zip(seqres, ncbi_seq_cut)) \
+            if c1 != c2
+        ]
+        issues.append(("mismatches", mismatches, [seqres, ncbi_seq_cut]))
+    else:
+        mismatches = []
+    # fill data
+    cif_ch_names, cif_mdlsqe, _, _ = _process_ent(cif_ent)
+    entity = {
+        "seqres": seqres,
+        "ncbi_seq": ncbi_seq,
+        "sqe_no_gaps": cif_mdlsqe,
+        "mismatches": mismatches,
+        "ncbi_range": ncbi_range,
+        "mdl_range": mdl_range,
+        "auth_seq_id_map": auth_seq_id_map,
+        "pdb_chain_ids": cif_ch_names,
+        "description": ent_desc,
+        "ncbi_acv": ncbi_info["AccessionVersion"],
+        "ncbi_gi": str(ncbi_info["Gi"]),
+        "ncbi_taxid": str(ncbi_info["TaxId"]),
+        "ncbi_organism": ncbi_info["SpeciesName"],
+        "ncbi_last_mod": datetime.datetime.strptime(
+            ncbi_info["UpdateDate"], "%Y/%m/%d"
+        )
+    }
+    return [entity], issues
+################################################################################
+
+################################################################################
+# ModelCIF HANDLING
+################################################################################
+# pylint: disable=too-few-public-methods
+class _GlobalPCR(modelcif.qa_metric.Global, modelcif.qa_metric.NormalizedScore):
+    """Percentage of confident (score > 70) residues in [0,1]"""
+    name = "percent confident residues"
+    software = None
+
+
+class _GlobalPRC(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+    """Average per-residue confidence (pLDDT-like) in [0,1]"""
+    name = "average per-residue confidence (pLDDT-like)"
+    software = None
+
+
+class _GlobalPRC_pLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+    """Average per-residue confidence (pLDDT) in [0,1]"""
+    name = "average per-residue confidence (pLDDT)"
+    software = None
+
+
+class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+    name = "pLDDT"
+    software = None
+
+
+class _LocalMOD(modelcif.qa_metric.Local, modelcif.qa_metric.Energy):
+    """Per-residue MODELLER energy score (>0, lower is better)."""
+    name = "MODELLER energy score"
+    software = None
+
+
+class _LocalPRC(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+    """Per-residue confidence score in [0,100]."""
+    name = "confidence score (pLDDT-like)"
+    software = None
+
+
+class _NcbiTrgRef(modelcif.reference.TargetReference):
+    """NCBI as target reference."""
+    name = "NCBI"
+    other_details = None
+
+
+class _LPeptideAlphabetWithXO(ihm.LPeptideAlphabet):
+    """Have the default amino acid alphabet plus 'X' for unknown residues
+    and 'O' as allowed non-def. AA (U already in alphabet)."""
+
+    # extra entry added according to LPeptideAlphabet def. in
+    # https://python-ihm.readthedocs.io/en/latest/_modules/ihm.html
+    # and https://files.rcsb.org/view/1NTH.cif for values for 'O'.
+
+    def __init__(self):
+        """Create the alphabet."""
+        super().__init__()
+        self._comps["X"] = self._comps["UNK"]
+        self._comps['O'] = ihm.LPeptideChemComp(
+            "PYL", "O", "O", "PYRROLYSINE", "C12 H21 N3 O3"
+        )
+# pylint: enable=too-few-public-methods
+
+
+# TEST: _get_not_modeled_residues from modelcif/util/make_mmcif.py
+def _get_not_modeled_residues(model):
+    """Yield NotModeledResidueRange objects for all residue ranges in the
+       Model that are not referenced by Atom, Sphere, or pre-existing
+       NotModeledResidueRange objects"""
+    for assem in model.assembly:
+        asym = assem.asym if hasattr(assem, 'asym') else assem
+        if not asym.entity.is_polymeric():
+            continue
+        # Make a set of all residue indices of this asym "handled" either
+        # by being modeled (with Atom or Sphere objects) or by being
+        # explicitly marked as not-modeled
+        handled_residues = set()
+        for rr in model.not_modeled_residue_ranges:
+            if rr.asym_unit is asym:
+                for seq_id in range(rr.seq_id_begin, rr.seq_id_end + 1):
+                    handled_residues.add(seq_id)
+        for atom in model.get_atoms():
+            if atom.asym_unit is asym:
+                handled_residues.add(atom.seq_id)
+        # Convert set to a list of residue ranges
+        handled_residues = ihm.util._make_range_from_list(
+            sorted(handled_residues))
+        # Return not-modeled for each non-handled range
+        for r in ihm.util._invert_ranges(handled_residues,
+                                         end=assem.seq_id_range[1],
+                                         start=assem.seq_id_range[0]):
+            yield modelcif.model.NotModeledResidueRange(asym, r[0], r[1])
+
+
+class _OST2ModelCIF(modelcif.model.AbInitioModel):
+    """Map OST entity elements to ihm.model"""
+
+    def __init__(self, *args, **kwargs):
+        """Initialise a model"""
+        # process kwargs
+        for i in ["ost_entity", "asym", "scores_json"]:
+            if i not in kwargs:
+                raise TypeError(f"Required keyword argument '{i}' not found.")
+        self.ost_entity = kwargs.pop("ost_entity")
+        self.asym = kwargs.pop("asym")
+        self.scores_json = kwargs.pop("scores_json")
+
+        # call parent init (needs to pop new kwargs first!)
+        super().__init__(*args, **kwargs)
+
+        # use auth IDs for res. nums and chain names
+        self.use_auth = False
+
+        # fetch b-factors for all residues (for get_atoms)
+        self.res_bfactors = {
+            r.qualified_name: self.scores_json["b_factors"][i] \
+            for i, r in enumerate(self.ost_entity.residues)
+        }
+
+        # need reverse mapping from chain name + res num to seq. id
+        self.seq_id_map = {
+            ch_id: {v: k for k, v in asym_unit.auth_seq_id_map.items()} \
+            for ch_id, asym_unit in self.asym.items()
+        }
+
+        # explicitly add parts which were not modelled
+        for ch in self.ost_entity.chains:
+            # get covered range
+            asym_unit = self.asym[_get_ch_name(ch, self.use_auth)]
+            res_nums = [res.number.num for res in ch.residues]
+            res_first = min(res_nums)
+            res_last = max(res_nums)
+            # assertion true in this set (no gaps in modelled res. nums)
+            assert res_nums == list(range(res_first, res_last + 1))
+            # compare with ent seq.
+            mdl_seq_first = self.seq_id_map[ch.name][res_first]
+            mdl_seq_last = self.seq_id_map[ch.name][res_last]
+            ent_seq_first, ent_seq_last = asym_unit.seq_id_range
+            if mdl_seq_first != ent_seq_first:
+                assert mdl_seq_first > ent_seq_first
+                self.not_modeled_residue_ranges.append(
+                    modelcif.model.NotModeledResidueRange(
+                        asym_unit, ent_seq_first, mdl_seq_first - 1,
+                    )
+                )
+            # NOTE: the case below doesn't actually happen in this set...
+            if mdl_seq_last != ent_seq_last:
+                assert mdl_seq_last < ent_seq_last
+                self.not_modeled_residue_ranges.append(
+                    modelcif.model.NotModeledResidueRange(
+                        asym_unit, mdl_seq_last + 1, ent_seq_last,
+                    )
+                )
+        # -> note: could be auto-filled as in modelcif/util/make_mmcif.py
+        #    (see commented _get_not_modeled_residues TEST here)
+        # -> see https://github.com/ihmwg/python-modelcif/issues/37
+
+
+    def get_atoms(self):
+        # ToDo [internal]: Take B-factor out since it's not a B-factor?
+        # NOTE: this assumes that _get_res_num maps residue to pos. in seqres
+        #       within asym
+        for atm in self.ost_entity.atoms:
+            yield modelcif.model.Atom(
+                asym_unit=self.asym[_get_ch_name(atm.chain, self.use_auth)],
+                seq_id=self.seq_id_map[atm.chain.name][atm.residue.number.num],
+                atom_id=atm.name,
+                type_symbol=atm.element,
+                x=atm.pos[0],
+                y=atm.pos[1],
+                z=atm.pos[2],
+                het=atm.is_hetatom,
+                biso=self.res_bfactors[atm.residue.qualified_name],
+                occupancy=atm.occupancy,
+            )
+
+    def add_scores(self):
+        """Add QA metrics from AF2 scores."""
+        # global scores
+        for score, score_class in [
+            ("percent_confident_residues", _GlobalPCR),
+            ("mean_per_res_conf", _GlobalPRC),
+            ("mean_per_res_conf_plddt", _GlobalPRC_pLDDT)
+        ]:
+            if score in self.scores_json:
+                self.qa_metrics.append(score_class(self.scores_json[score]))
+
+        # local scores
+        for score, score_class in [
+            ("pLDDT", _LocalPLDDT),
+            ("MODELLER_energy_score", _LocalMOD),
+            ("per_res_conf", _LocalPRC),
+        ]:
+            if score in self.scores_json:
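+                # score lists are ordered chain-by-chain, residue-by-residue
+                # (replicated per chain in _translate2modelcif)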
+                i = 0
+                for chn in self.ost_entity.chains:
+                    for res in chn.residues:
+                        seq_id = self.seq_id_map[chn.name][res.number.num]
+                        asym = self.asym[chn.name].residue(seq_id)
+                        self.qa_metrics.append(
+                            score_class(asym, self.scores_json[score][i])
+                        )
+                        i += 1
+
+
+def _get_modelcif_entities(target_ents, asym_units, system):
+    """Create ModelCIF entities and asymmetric units."""
+    alphabet = _LPeptideAlphabetWithXO()
+    for cif_ent in target_ents:
+        # collect references
+        ncbi_ref = _NcbiTrgRef(
+            code=cif_ent["ncbi_acv"],
+            accession=cif_ent["ncbi_gi"],
+            ncbi_taxonomy_id=cif_ent["ncbi_taxid"],
+            organism_scientific=cif_ent["ncbi_organism"],
+            sequence_version_date=cif_ent["ncbi_last_mod"],
+            sequence=cif_ent["ncbi_seq"],
+        )
+        # add alignment incl. mismatches
+        ncbi_ref.alignments.append(modelcif.reference.Alignment(
+            db_begin=cif_ent["ncbi_range"][0],
+            db_end=cif_ent["ncbi_range"][1],
+            entity_begin=cif_ent["mdl_range"][0],
+            entity_end=cif_ent["mdl_range"][1],
+            seq_dif=[
+                ihm.reference.SeqDif(
+                    seq_id,
+                    alphabet[olc_db],
+                    alphabet[olc]
+                ) for seq_id, olc, olc_db in cif_ent["mismatches"]
+            ]
+        ))
+        #
+        references = [ncbi_ref]
+        # combine into ModelCIF entity
+        mdlcif_ent = modelcif.Entity(
+            cif_ent["seqres"],
+            description=cif_ent["description"],
+            alphabet=alphabet,
+            source=ihm.source.Natural(
+                ncbi_taxonomy_id=cif_ent["ncbi_taxid"],
+                scientific_name=cif_ent["ncbi_organism"],
+            ),
+            references=references,
+        )
+        # NOTE: this assigns (potentially new) alphabetic chain names
+        for pdb_chain_id in cif_ent["pdb_chain_ids"]:
+            asym_units[pdb_chain_id] = modelcif.AsymUnit(
+                mdlcif_ent, strand_id=pdb_chain_id,
+                auth_seq_id_map=cif_ent["auth_seq_id_map"]
+            )
+        system.entities.append(mdlcif_ent)
+
+
+def _get_assoc_af2_pdb_data(pred):
+    """Generate a modelcif.data.Data object for an AF2 prediction."""
+    return modelcif.data.Data(
+        name=f"AlphaFold-Multimer (v{pred['version']}) model for range "
+        + f"{_get_af2_range_string(pred)}; pTM {pred['pTM']}, "
+        + f"ipTM {pred['ipTM']}",
+    )
+
+
+def _get_assoc_af2_json_data(af2_path):
+    """Generate a modelcif.data.Data object for JSON file
+    with extra QE for PDB file in af2_path.
+    """
+    return modelcif.data.Data(
+        name=f"Detailed quality estimates (pLDDT, PAE) for {af2_path}",
+    )
+
+
+def _get_assoc_pae_png_data(af2_path):
+    """Generate a modelcif.data.Data object for PNG file
+    with PAE plot for PDB file in af2_path.
+    """
+    return modelcif.data.Data(
+        name=f"Plot showing PAE matrix for {af2_path}",
+    )
+
+
+def _get_assoc_trunc_pdb_data(part, is_single):
+    """Generate a modelcif.associated.File object pointing to PDB file
+    for truncated model an AF2 prediction.
+    """
+    if is_single:
+        mdl_txt = "model"
+    else:
+        mdl_txt = "and superposed model"
+    return modelcif.data.Data(
+        name=f"Truncated {mdl_txt} for range {part['part_start']}-"
+        + f"{part['part_end']} derived from AlphaFold-Multimer model "
+        + f"{part['source_prediction']}.pdb",
+    )
+
+
+def _get_associated_file(fle_path, data, file_format="other",
+                         file_content="other"):
+    """Generate a modelcif.associated.File object for given data."""
+    afile = modelcif.associated.File(
+        fle_path,
+        details=data.name,
+        data=data,
+    )
+    afile.file_format = file_format
+    afile.file_content = file_content
+    return afile
+
+
+def _get_associated_files(mdl_name, arc_files):
+    """Create entry for associated files."""
+    # package all into zip file
+    return modelcif.associated.Repository(
+        "",
+        [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)],
+    )
+    # NOTE: by convention MA expects zip file with same name as model-cif
+
+
+global_ref_dbs = {}
+def _get_ref_db_object(name, url, version=None, release_date=None):
+    """Cached access to modelcif.ReferenceDatabase objects.
+    Needed to remove duplicates in ModelCIF.
+    """
+    key = (name, url, version, release_date)
+    if key not in global_ref_dbs:
+        global_ref_dbs[key] = modelcif.ReferenceDatabase(
+            name, url, version, release_date
+        )
+    return global_ref_dbs[key]
+
+
+def _get_sequence_dbs(config_data):
+    """Get AF seq. DBs."""
+    # hard coded UniProt release (see https://www.uniprot.org/release-notes)
+    # (TO BE UPDATED FOR EVERY DEPOSITION!)
+    pdb_rel_date = config_data["pdb_rel_date"]
+    up_version = config_data["up_version"]
+    up_version_dates = {
+        "2021_03": datetime.datetime(2021,  6,  2),
+        "2022_05": datetime.datetime(2022, 12, 14),
+    }
+    up_rel_date = up_version_dates[up_version]
+    # fill list of DBs
+    seq_dbs = []
+    if config_data["use_small_bfd"]:
+        seq_dbs.append(_get_ref_db_object(
+            "Reduced BFD",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz"
+        ))
+    else:
+        seq_dbs.append(_get_ref_db_object(
+            "BFD",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "casp14_versions/"
+            + "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz",
+            version="6a634dc6eb105c2e9b4cba7bbae93412",
+        ))
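+    # NOTE: lexicographic version comparison is sufficient here as only
+    #       AF versions 2.1.2 and 2.3.1 occur in this set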
+    if config_data["af_version"] < "2.3.0":
+        seq_dbs.append(_get_ref_db_object(
+            "MGnify",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "casp14_versions/mgy_clusters_2018_12.fa.gz",
+            version="2018_12",
+            release_date=datetime.datetime(2018, 12, 6),
+        ))
+        seq_dbs.append(_get_ref_db_object(
+            "Uniclust30",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "casp14_versions/uniclust30_2018_08_hhsuite.tar.gz",
+            version="2018_08",
+            release_date=None,
+        ))
+    else:
+        # NOTE: release date according to https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/
+        seq_dbs.append(_get_ref_db_object(
+            "MGnify",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "v2.3/mgy_clusters_2022_05.fa.gz",
+            version="2022_05",
+            release_date=datetime.datetime(2022, 5, 6),
+        ))
+        seq_dbs.append(_get_ref_db_object(
+            "UniRef30",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "v2.3/UniRef30_2021_03.tar.gz",
+            version="2021_03",
+            release_date=None,
+        ))
+    if config_data["use_multimer"]:
+        seq_dbs.append(_get_ref_db_object(
+            "TrEMBL",
+            "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/"
+            + "knowledgebase/complete/uniprot_trembl.fasta.gz",
+            version=up_version,
+            release_date=up_rel_date,
+        ))
+        seq_dbs.append(_get_ref_db_object(
+            "Swiss-Prot",
+            "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/"
+            + "knowledgebase/complete/uniprot_sprot.fasta.gz",
+            version=up_version,
+            release_date=up_rel_date,
+        ))
+    seq_dbs.append(_get_ref_db_object(
+        "UniRef90",
+        "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/"
+        + "uniref90.fasta.gz",
+        version=up_version,
+        release_date=up_rel_date,
+    ))
+    if config_data["use_templates"]:
+        if config_data["use_multimer"]:
+            # uses whatever is latest set of PDB sequences
+            # see AF2 scripts/download_pdb_seqres.sh
+            seq_dbs.append(_get_ref_db_object(
+                "PDB seqres",
+                "https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt",
+                release_date=pdb_rel_date
+            ))
+        else:
+            # fixed version used in AF2 scripts/download_pdb70.sh
+            seq_dbs.append(_get_ref_db_object(
+                "PDB70",
+                "http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
+                + "hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz",
+                release_date=datetime.datetime(2020, 4, 1)
+            ))
+    return seq_dbs
+
+
+def _assemble_modelcif_software(soft_dict, params_dict):
+    """Create a modelcif.SoftwareWithParameters instance from dictionaries."""
+    # create SW object
+    sw = modelcif.Software(
+        soft_dict["name"],
+        soft_dict["classification"],
+        soft_dict["description"],
+        soft_dict["location"],
+        soft_dict["type"],
+        soft_dict["version"],
+        citation=soft_dict["citation"],
+    )
+    # assemble parameters
+    params = []
+    for key, val in params_dict.items():
+        params.append(modelcif.SoftwareParameter(key, val))
+    # put them together
+    return modelcif.SoftwareWithParameters(sw, params)
+
+
+def _get_modelcif_protocol_software(js_step):
+    """Assemble software entries for a ModelCIF protocol step."""
+    # new setup in python-modelcif (as of late 2023): params with each SW
+    sw_list = []
+    for sw, sw_params in js_step["software_plus_params"]:
+        sw_list.append(_assemble_modelcif_software(sw, sw_params))
+    # group and done...
+    if sw_list:
+        return modelcif.SoftwareGroup(sw_list)
+    else:
+        return None
+
+
+data_group_cache = {}
+def _get_modelcif_protocol_data(data_labels, target_entities, model, acc_data):
+    """Assemble data for a ModelCIF protocol step.
+    Cached access to objects needed to remove duplicates in ModelCIF.
+    """
+    cache_key = '|'.join(sorted(data_labels))
+    if cache_key in data_group_cache:
+        return data_group_cache[cache_key]
+    data = modelcif.data.DataGroup()
+    for data_label in data_labels:
+        if data_label.startswith("target_sequences_"):
+            data.extend(target_entities)
+        elif data_label == "ref_dbs_AF2_2.1.2":
+            # HC for that version
+            data.extend(_get_sequence_dbs({
+                "pdb_rel_date": datetime.datetime(2021, 11, 5),
+                "up_version": "2021_03",
+                "use_small_bfd": True,
+                "af_version": "2.1.2",
+                "use_multimer": True,
+                "use_templates": True,
+            }))
+        elif data_label == "ref_dbs_AF2_2.3.1":
+            # HC for that version
+            data.extend(_get_sequence_dbs({
+                "pdb_rel_date": datetime.datetime(2022, 12, 9),
+                "up_version": "2022_05",
+                "use_small_bfd": True,
+                "af_version": "2.3.1",
+                "use_multimer": True,
+                "use_templates": True,
+            }))
+        elif data_label.startswith("model_"):
+            data.append(model)
+        elif data_label in acc_data:
+            data.append(acc_data[data_label])
+        else:
+            raise RuntimeError(f"Unknown protocol data: '{data_label}'")
+    data_group_cache[cache_key] = data
+    return data
+
+
+def _get_modelcif_protocol(protocol_steps, target_entities, model, acc_data):
+    """Create the protocol for the ModelCIF file."""
+    protocol = modelcif.protocol.Protocol()
+    for js_step in protocol_steps:
+        sftwre = _get_modelcif_protocol_software(js_step)
+        input_data = _get_modelcif_protocol_data(
+            js_step["input"], target_entities, model, acc_data
+        )
+        output_data = _get_modelcif_protocol_data(
+            js_step["output"], target_entities, model, acc_data
+        )
+
+        protocol.steps.append(
+            modelcif.protocol.Step(
+                input_data=input_data,
+                output_data=output_data,
+                name=js_step["name"],
+                details=js_step["details"],
+                software=sftwre,
+            )
+        )
+        protocol.steps[-1].method_type = js_step["method_type"]
+    return protocol
+
+
+def _compress_cif_file(cif_file):
+    """Compress cif file and delete original."""
+    with open(cif_file, "rb") as f_in:
+        with gzip.open(cif_file + ".gz", "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    os.remove(cif_file)
+
+
+def _package_associated_files(repo, dict_data):
+    """Compress associated files into single zip file and delete original.
+    Data can either be on disk or available in dict_data (key = path).
+    """
+    # zip settings tested for good speed vs compression
+    for archive in repo.files:
+        with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
+            for zfile in archive.files:
+                zpath = zfile.path
+                if zpath in dict_data:
+                    cif_zip.writestr(zpath, dict_data[zpath])
+                else:
+                    cif_zip.write(zpath, arcname=zpath)
+                    os.remove(zpath)
+
+
+def _pkl_to_json(pkl_data, pae_handling="keep"):
+    """Make pkl_data JSON writable.
+    Options for pae_handling:
+    - keep = values kept as they are
+    - round = values kept as closest integers
+    - drop = drop whole table
+    """
+    pkl_json_data = {}
+    for k, v in pkl_data.items():
+        if k == "PAE":
+            if pae_handling == "keep":
+                pkl_json_data["PAE"] = pkl_data["PAE"].tolist()
+            elif pae_handling == "round":
+                pae = pkl_data["PAE"].round().astype(int).tolist()
+                pkl_json_data["PAE"] = pae
+            elif pae_handling == "drop":
+                pass
+            else:
+                raise RuntimeError(
+                    f"Unknown pae_handling value {pae_handling}"
+                )
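+        # NOTE: 'pLDDT_chains' is deliberately dropped; all other values are
+        #       converted to JSON-serializable types below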
+        elif k != "pLDDT_chains":
+            if isinstance(v, np.ndarray):
+                # note: scalars become scalars!
+                v = v.tolist()
+            pkl_json_data[k] = v
+    return pkl_json_data
+
+
+def _get_sw_for_qe(steps, step_name):
+    """Fetch suitable SW objects from protocol steps to use in QE."""
+    # to maximally reduce duplicates we reuse single groups
+    # otherwise new group created using same SoftwareWithParameters objects
+    sw_groups = [step.software for step in steps if step.name == step_name]
+    if len(sw_groups) == 0:
+        return None
+    elif len(sw_groups) == 1:
+        return sw_groups[0]
+    else:
+        # each sw_group is a list (SoftwareGroup) of SoftwareWithParameters
+        # ...and we remove duplicates...just in case
+        sw_dict = {}
+        for sw_group in sw_groups:
+            sw_dict.update({hash(swp): swp for swp in sw_group})
+        return modelcif.SoftwareGroup(sw_dict.values())
+
+
+def _store_as_modelcif(data_json, ost_ent, zip_dict, kept_cif_cats, out_dir,
+                       mdl_name, compress, add_extra_files):
+    """Mix all the data into a ModelCIF file."""
+    print("    generating ModelCIF objects...", end="")
+    pstart = timer()
+    # create system to gather all the data
+    system = modelcif.System(
+        title=data_json["title"],
+        id=data_json["mdl_id"].upper(),
+        model_details=data_json["model_details"],
+    )
+
+    # create an asymmetric unit and an entity per target sequence
+    asym_units = {}
+    _get_modelcif_entities(data_json["target_entities"], asym_units, system)
+
+    # audit_authors
+    system.authors.extend(data_json["audit_authors"])
+
+    # set up the model to produce coordinates
+    model = _OST2ModelCIF(
+        assembly=modelcif.Assembly(asym_units.values()),
+        asym=asym_units,
+        ost_entity=ost_ent,
+        scores_json=data_json["scores"],
+        name=None,
+    )
+    # TEST
+    # model.not_modeled_residue_ranges.extend(_get_not_modeled_residues(model))
+    #
+    print(f" ({timer()-pstart:.2f}s)")
+    print("    processing QA scores...", end="", flush=True)
+    pstart = timer()
+    model.add_scores()
+    print(f" ({timer()-pstart:.2f}s)")
+
+    model_group = modelcif.model.ModelGroup([model])
+    system.model_groups.append(model_group)
+
+    # handle additional files
+    acc_data = {}
+    acc_files = []
+    dict_data = {}
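+    # acc_data: modelcif.data.Data objects referenced by protocol steps
+    # (always created); acc_files/dict_data: associated file objects and
+    # their raw content, only filled if files are actually packed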
+    for pred in data_json["af2_preds"]:
+        # get data objects
+        pdb_file, pkl_file, pae_file = _get_af2_files(pred["prediction_p"])
+        acc_data[pdb_file] = _get_assoc_af2_pdb_data(pred)
+        # NOTE: we keep data object for mismatching PKL file even though we
+        # do not dump file itself
+        acc_data[pkl_file] = _get_assoc_af2_json_data(pdb_file)
+        acc_data[pae_file] = _get_assoc_pae_png_data(pdb_file)
+        if add_extra_files:
+            # get file objects
+            acc_files.append(_get_associated_file(pdb_file, acc_data[pdb_file]))
+            if pkl_file not in data_json["pkl_files_to_skip"]:
+                json_file = f"{os.path.splitext(pkl_file)[0]}.json"
+                acc_files.append(_get_associated_file(
+                    json_file, acc_data[pkl_file], file_format="json"
+                ))
+            acc_files.append(_get_associated_file(pae_file, acc_data[pae_file]))
+            # get file content
+            dict_data[pdb_file] = zip_dict["accompanying_data"][pdb_file]
+            if pkl_file not in data_json["pkl_files_to_skip"]:
+                pkl_data = pickle.loads(zip_dict["accompanying_data"][pkl_file])
+                pkl_json_data = _pkl_to_json(pkl_data, pae_handling="round")
+                dict_data[json_file] = json.dumps(pkl_json_data)
+            dict_data[pae_file] = zip_dict["accompanying_data"][pae_file]
+    for part in data_json["trunc_parts"]:
+        part_path = part["truncated_prediction_path"]
+        if part_path != "model":
+            acc_data[part_path] = _get_assoc_trunc_pdb_data(
+                part, len(data_json["trunc_parts"]) == 1
+            )
+            if add_extra_files:
+                acc_files.append(_get_associated_file(
+                    part_path, acc_data[part_path]
+                ))
+                dict_data[part_path] = zip_dict["accompanying_data"][part_path]
+    if acc_files:
+        system.repositories.append(_get_associated_files(mdl_name, acc_files))
+
+    # get data and steps
+    protocol = _get_modelcif_protocol(
+        data_json["protocol"], system.entities, model, acc_data,
+    )
+    system.protocols.append(protocol)
+
+    # set SW for QE
+    _LocalPLDDT.software = _get_sw_for_qe(protocol.steps, "AF2")
+    _GlobalPRC_pLDDT.software = _LocalPLDDT.software
+    _LocalMOD.software = _get_sw_for_qe(protocol.steps, "MODELLER")
+
+    # write modelcif System to file
+    print("    write to disk...", end="", flush=True)
+    pstart = timer()
+    # NOTE: this will dump PAE on path provided in add_scores
+    # -> hence we cheat by changing the working directory and back (exception-safe)
+    oldpwd = os.getcwd()
+    os.chdir(out_dir)
+    mdl_fle = f"{mdl_name}.cif"
+    try:
+        # dump to string and fix with gemmi
+        with StringIO() as mmcif_fh:
+            modelcif.dumper.write(mmcif_fh, [system])
+            modelcif_str = mmcif_fh.getvalue()
+        doc = gemmi.cif.read_string(modelcif_str)
+        block = doc.sole_block()
+        # HACK: set all label_alt_id to '-'
+        # -> see https://github.com/wwpdb-dictionaries/mmcif_pdbx/issues/60
+        if kept_cif_cats:
+            cat_dict = block.get_mmcif_category("_atom_site.")
+            cat_dict["label_alt_id"] = ['-'] * len(cat_dict["label_alt_id"])
+            block.set_mmcif_category("_atom_site.", cat_dict)
+            cat_dict = kept_cif_cats["_pdbx_domain_range."]
+            new_alt_ids = ['-'] * len(cat_dict["beg_label_alt_id"])
+            cat_dict["beg_label_alt_id"] = new_alt_ids
+            new_alt_ids = ['-'] * len(cat_dict["end_label_alt_id"])
+            cat_dict["end_label_alt_id"] = new_alt_ids
+        #
+        for cat, cat_dict in kept_cif_cats.items():
+            block.set_mmcif_category(cat, cat_dict)
+        doc.write_file(mdl_fle)
+        # extra files
+        with open(f"{mdl_name}-image.png", "wb") as fh:
+            fh.write(zip_dict["image_file"])
+        if acc_files:
+            _package_associated_files(system.repositories[0], dict_data)
+        if compress:
+            _compress_cif_file(mdl_fle)
+            mdl_fle += ".gz"
+    finally:
+        os.chdir(oldpwd)
+    print(f" ({timer()-pstart:.2f}s)")
+################################################################################
+
+################################################################################
+# HANDLE FULL DATA SET
+################################################################################
+def _translate2modelcif(ncbi_data, opts):
+    """Convert a model with its accompanying data to ModelCIF."""
+    # get main names
+    mdl_id = os.path.splitext(os.path.basename(opts.input_zip_path))[0]
+    if opts.compress:
+        cifext = "cif.gz"
+    else:
+        cifext = "cif"
+    mdl_path = os.path.join(opts.out_dir, f"{mdl_id}.{cifext}")
+    # skip if done already (disabled here)
+    # if os.path.exists(mdl_path):
+    #     print(f"  {mdl_id} already done...")
+
+    # prepare data for model to convert (also gets all issues)
+    issues = []
+    abort_after_checks = opts.checks_only
+    zip_dict = _get_zip_content(opts.input_zip_path)
+    mdl_type = zip_dict["mdl_type"]
+    if zip_dict["unparsed_files"]:
+        file_list = zip_dict["unparsed_files"]
+        issues.append(("unparsed_files", len(file_list), file_list))
+    json_dict = json.loads(zip_dict["json_file"])
+    # apply hard-coded fixes for some models
+    if mdl_id == "ma-taas-2330":
+        # find entry to fix
+        for pred in json_dict["AF_predictions"]:
+            if pred["prediction_p"] == "AFv2.3.1_389-515_mergeN_21.pdb":
+                pred["prediction_p"] = "AFv2.3.1_389-515.pdb"
+                del pred["merged_GCN4linker_len_N"]
+                del pred["merged_GCN4linker_len_C"]
+    elif mdl_id == "ma-taas-9424":
+        # len(json_dict["pLDDT"]) was 53 instead of 52
+        # -> also in wrong abstract...
+        json_dict["pLDDT"] = json_dict["pLDDT"][:52]
+        json_dict["abstract"] = json_dict["abstract"].replace(
+            "53 residues", "52 residues"
+        )
+    cif_ent, kept_cif_cats = _load_cif_file(zip_dict)
+    try:
+        issues.extend(_do_sanity_checks(
+            mdl_id, json_dict, zip_dict, cif_ent, kept_cif_cats
+        ))
+        entities, ent_issues = _get_entities(
+            json_dict, ncbi_data, cif_ent, mdl_type
+        )
+        issues.extend(ent_issues)
+    except Exception as ex:
+        short_txt = f"{type(ex).__name__}: {ex}"
+        long_txt = traceback.format_exc()
+        issues.append(("exception", short_txt, long_txt))
+        abort_after_checks = True
+
+    # dump issues
+    issues_file_path = os.path.join(opts.out_dir, f"{mdl_id}-issues.json")
+    json.dump(issues, open(issues_file_path, "w"))
+
+    # abort if needed
+    if abort_after_checks:
+        return
+
+    # do the actual conversion
+    print(f"  translating {mdl_id}...")
+    pdb_start = timer()
+
+    # gather data into JSON-like structure
+    print("    preparing data...", end="")
+    pstart = timer()
+    af2_preds, trunc_parts = _get_preds_and_parts(json_dict, mdl_type)
+    mdlcf_json = {
+        "title": json_dict["title"].strip(),
+        "model_details": json_dict["abstract"].strip(),
+        "audit_authors": _get_audit_authors(),
+        "af2_preds": af2_preds,
+        "trunc_parts": trunc_parts,
+        "protocol": _get_protocol_steps_and_software(
+            json_dict, mdl_type, mdl_id
+        ),
+        "mdl_id": mdl_id,  # used for entry ID
+        "target_entities": entities,
+        "scores": {
+            score: json_dict[score] \
+            for score in [
+                "pLDDT", "MODELLER_energy_score", "per_res_conf",
+                "percent_confident_residues", "mean_per_res_conf"
+            ] if score in json_dict
+        },
+        "pkl_files_to_skip": [],
+    }
+    # fix percentage to [0,1]
+    mdlcf_json["scores"]["percent_confident_residues"] /= 100.0
+    # replicate local score lists across all (identical) chains
+    for score in ["pLDDT", "MODELLER_energy_score", "per_res_conf"]:
+        if score in mdlcf_json["scores"]:
+            mdlcf_json["scores"][score] *= cif_ent.chain_count
+    # note: we overwrite b-factors from file!
+    # -> also choose which global score to use for mean_per_res_conf
+    if "per_res_conf" in mdlcf_json["scores"]:
+        mdlcf_json["scores"]["b_factors"] = mdlcf_json["scores"]["per_res_conf"]
+    else:
+        mdlcf_json["scores"]["b_factors"] = mdlcf_json["scores"]["pLDDT"]
+        mdlcf_json["scores"]["mean_per_res_conf_plddt"] = \
+            mdlcf_json["scores"].pop("mean_per_res_conf")
+    # check for inconsistent pkl files
+    for issue_type, short_data, long_data in issues:
+        if issue_type.startswith("pkl_") and issue_type.endswith("_diff"):
+            mdlcf_json["pkl_files_to_skip"].append(short_data[1])
+    print(f" ({timer()-pstart:.2f}s)")
+
+    # save ModelCIF
+    _store_as_modelcif(
+        data_json=mdlcf_json,
+        ost_ent=cif_ent,
+        zip_dict=zip_dict,
+        kept_cif_cats=kept_cif_cats,
+        out_dir=opts.out_dir,
+        mdl_name=mdl_id,
+        compress=opts.compress,
+        add_extra_files=(not opts.no_extra_files),
+    )
+
+    # check if result can be read and has expected seq.
+    ent, ss = io.LoadMMCIF(mdl_path, seqres=True)
+    ent_seqres = [ss.FindSequence(chn.name).string for chn in ent.chains]
+    exp_seqres = []
+    for trg_ent in mdlcf_json["target_entities"]:
+        exp_seqres += [trg_ent["seqres"]] * len(trg_ent["pdb_chain_ids"])
+    assert ent_seqres == exp_seqres, f"Bad seqres {mdl_id}"
+    # check model itself
+    ch_names, mdlsqe, resnum_range, ent_bfs = _process_ent(ent)
+    assert ch_names == ['A', 'B', 'C']
+    target_entity = mdlcf_json["target_entities"][0]
+    assert mdlsqe == target_entity["sqe_no_gaps"]
+    assert max(resnum_range) <= len(target_entity["seqres"])
+    if mdl_type == "assembly":
+        seq_range = (
+            json_dict["predicted_sequence_start"],
+            json_dict["predicted_sequence_end"],
+        )
+        assert resnum_range == seq_range
+    else:
+        assert resnum_range == (1, len(mdlsqe))
+    exp_bfs = mdlcf_json["scores"]["b_factors"]
+    bfs_max_diff = np.max(abs(np.asarray(ent_bfs) - np.asarray(exp_bfs)))
+    assert bfs_max_diff < 0.01
+
+    print(f"  ... done with {mdl_id} ({timer()-pdb_start:.2f}s).")
+
+
+def _main():
+    """Run as script."""
+
+    # parse/fetch global data
+    opts = _parse_args()
+    ncbi_data = json.load(open(opts.ncbi_json))
+
+    # handle single entry (print statements kept consistent with other sets)
+    print(f"Working on models...")
+    _translate2modelcif(ncbi_data, opts)
+    print(f"... done with models.")
+
+    # TEST: to judge res. needed on cluster
+    import resource
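+    # (on Linux, ru_maxrss is reported in kB, so this prints roughly MB)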
+    print('mem', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000)
+
+
+if __name__ == "__main__":
+    _main()