diff --git a/projects/PP2A-B55-design/expected_output_modelcif.zip b/projects/PP2A-B55-design/expected_output_modelcif.zip
new file mode 100644
index 0000000000000000000000000000000000000000..f73737bf5a1d9e4e3682525ba1162edee68b5524
Binary files /dev/null and b/projects/PP2A-B55-design/expected_output_modelcif.zip differ
diff --git a/projects/PP2A-B55-design/modelarchive_submission.zip b/projects/PP2A-B55-design/modelarchive_submission.zip
new file mode 100644
index 0000000000000000000000000000000000000000..48ec525bc0390aba8156f3754aec35224aade901
Binary files /dev/null and b/projects/PP2A-B55-design/modelarchive_submission.zip differ
diff --git a/projects/PP2A-B55-design/translate2modelcif.py b/projects/PP2A-B55-design/translate2modelcif.py
new file mode 100644
index 0000000000000000000000000000000000000000..5db56f0f43d6b5ebef2458f5658574d074fd7c94
--- /dev/null
+++ b/projects/PP2A-B55-design/translate2modelcif.py
@@ -0,0 +1,1899 @@
+#! /usr/local/bin/ost
+# -*- coding: utf-8 -*-
+
+"""Translate PRC models for Juntao from PDB + extra data into ModelCIF."""
+
+# EXAMPLES for running:
+# ost translate2modelcif.py ./modelarchive_submission ./modelcif
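+# optional flags (see _parse_args; "<MDL ID>" is just a placeholder here):
+# ost translate2modelcif.py ./modelarchive_submission ./modelcif --compress --single-model "<MDL ID>"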
+
+import argparse
+import datetime
+import gzip
+import os
+import shutil
+import sys
+import zipfile
+
+from timeit import default_timer as timer
+import numpy as np
+import requests
+import ujson as json
+
+import ihm
+import ihm.citations
+import modelcif
+import modelcif.associated
+import modelcif.dumper
+import modelcif.model
+import modelcif.protocol
+import modelcif.reference
+
+import pandas as pd
+from ost import io, seq
+
+
+################################################################################
+# GENERAL HELPER FUNCTIONS
+################################################################################
+def _abort_msg(msg, exit_code=1):
+    """Write error message and exit with exit_code."""
+    print(f"{msg}\nAborting.", file=sys.stderr)
+    sys.exit(exit_code)
+
+
+def _warn_msg(msg):
+    """Write a warning message to stdout."""
+    print(f"WARNING: {msg}")
+
+
+def _check_file(file_path):
+    """Make sure a file exists and is actually a file."""
+    if not os.path.exists(file_path):
+        _abort_msg(f"File not found: '{file_path}'.")
+    if not os.path.isfile(file_path):
+        _abort_msg(f"File path does not point to file: '{file_path}'.")
+
+
+def _check_folder(dir_path):
+    """Make sure a file exists and is actually a file."""
+    if not os.path.exists(dir_path):
+        _abort_msg(f"Path not found: '{dir_path}'.")
+    if not os.path.isdir(dir_path):
+        _abort_msg(f"Path does not point to a directory: '{dir_path}'.")
+
+
+def _check_opts_folder(dir_path):
+    """Remove trailing '/' (return fixed one) and check if path valid."""
+    if dir_path.endswith("/"):
+        dir_path = dir_path[:-1]
+    _check_folder(dir_path)
+    return dir_path
+
+
+def _get_res_num(r, use_auth=False):
+    """Get res. num. from auth. IDs if reading from mmCIF files."""
+    if use_auth:
+        return int(r.GetStringProp("pdb_auth_resnum"))
+    return r.number.num
+
+
+def _get_ch_name(ch, use_auth=False):
+    """Get chain name from auth. IDs if reading from mmCIF files."""
+    if use_auth:
+        return ch.GetStringProp("pdb_auth_chain_name")
+    return ch.name
+
+
+def _get_sequence(chn, use_auth=False):
+    """Get the sequence out of an OST chain incl. '-' for gaps in resnums."""
+    # initialise (add gaps if first is not at num. 1)
+    lst_rn = _get_res_num(chn.residues[0], use_auth)
+    idx = 1
+    sqe = "-" * (lst_rn - 1) + chn.residues[0].one_letter_code
+
+    for res in chn.residues[idx:]:
+        lst_rn += 1
+        while lst_rn != _get_res_num(res, use_auth):
+            sqe += "-"
+            lst_rn += 1
+        sqe += res.one_letter_code
+    return sqe
+################################################################################
+
+
+################################################################################
+# DATA HANDLING
+################################################################################
+def _parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        "input_data_path",
+        type=str,
+        metavar="<INPUT DATA PATH>",
+        help="Data as provided by depositors. Expected to contain files "
+        + "Annotations.csv and Annotations.json with metadata, Config_Files "
+        + "directory with config_[X].json files for all X listed in Config "
+        + "column of Annotations.csv, and Zip_Files directory with files named "
+        + "[X]-[NAME].zip for each X listed in the metadata files.",
+    )
+    parser.add_argument(
+        "out_dir",
+        type=str,
+        metavar="<OUTPUT DIR>",
+        help="Path to directory to store results ([X]-[NAME].* files and "
+        + "issues.json with any observed issues).",
+    )
+    parser.add_argument(
+        "--compress",
+        default=False,
+        action="store_true",
+        help="Compress ModelCIF file with gzip.",
+    )
+    parser.add_argument(
+        "--checks-only",
+        default=False,
+        action="store_true",
+        help="Perform only checks without producing ModelCIF files.",
+    )
+    parser.add_argument(
+        "--no-extra-files",
+        default=False,
+        action="store_true",
+        help="Skip writing extra models, PNGs, and PAE (for testing).",
+    )
+    parser.add_argument(
+        "--single-model",
+        type=str,
+        default=None,
+        help="If provided, only the model matching the provided string in the "
+        + "Annotations.json will be converted.",
+    )
+
+    opts = parser.parse_args()
+
+    # check input
+    opts.input_data_path = _check_opts_folder(opts.input_data_path)
+    _check_file(os.path.join(opts.input_data_path, "info_of_submitted_structures.csv"))
+    _check_folder(os.path.join(opts.input_data_path, "screen_256"))
+    if opts.out_dir.endswith("/"):
+        opts.out_dir = opts.out_dir[:-1]
+    if not os.path.exists(opts.out_dir):
+        os.makedirs(opts.out_dir, exist_ok=True)
+    return opts
+
+
+def _get_audit_authors():
+    """Return the list of authors that produced this model."""
+    return (
+        "Schueler-Furman, Ora",
+        "Varga, Julia Kornelia",
+    )
+
+
+def _check_scores(mdl_data, metadata):
+    """Check scores JSON.
+    Serious issues raise exceptions; minor ones are returned in a list
+    (compatible with the list returned by _get_entities).
+    """
+    issues = []
+    scores_json = mdl_data["scores"]
+    # NOTE: cannot deal with gapped sequences here as we cannot map
+    # multiple chains to scores
+    ost_ent = mdl_data["ent"]
+    exp_len = ost_ent.residue_count
+    assert "ptm" in scores_json
+    assert "iptm" in scores_json
+    assert len(scores_json["pae"]) == exp_len
+    assert len(scores_json["pae"][0]) == exp_len
+    # b-factor vs pLDDT in expected range?
+    ent_plddts = []
+    for i, res in enumerate(ost_ent.residues):
+        b_factors = [a.b_factor for a in res.atoms]
+        assert len(set(b_factors)) == 1 # must all be equal!
+        ent_plddts.append(b_factors[0])
+    scores_plddts = scores_json["plddt"]
+    assert len(ent_plddts) == len(scores_plddts)
+    plddt_max_diff = max([
+        abs(s1 - s2) for s1, s2 in zip(ent_plddts, scores_plddts)
+    ])
+    # threshold due to 0.01 accuracy in PDB file + numerical rounding
+    if plddt_max_diff > 0.0051:
+        issues.append((
+            metadata['mdl_id'],
+            "plddt_vs_bf_mismatch",
+            (plddt_max_diff),
+            ()
+        ))
+    return issues
+
+
+def _get_n_parse_up_entry(up_ac, up_txt_path):
+    """Get data for an UniProtKB entry and parse it."""
+    # This is a simple parser for UniProtKB txt format, instead of breaking it
+    # up into multiple functions, we just allow many many branches & statements,
+    # here.
+    # pylint: disable=too-many-branches,too-many-statements
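+    # The relevant UniProtKB flat-file lines look roughly as follows
+    # (placeholder values, for illustration only):
+    #   ID   SOME_HUMAN              Reviewed;         123 AA.
+    #   DT   01-JAN-2020, sequence version 1.
+    #   GN   Name=SOMEGENE;
+    #   OS   Homo sapiens (Human).
+    #   OX   NCBI_TaxID=9606;
+    #   SQ   SEQUENCE   123 AA;  13800 MW;  0123456789ABCDEF CRC64;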
+    data = {}
+    data["up_organism"] = ""
+    data["up_sequence"] = ""
+    data["up_ac"] = up_ac
+    # check if we read from file or URL
+    if up_txt_path.startswith("http"):
+        rspns = requests.get(up_txt_path, timeout=180)
+        lines = rspns.iter_lines(decode_unicode=True)
+    else:
+        lines = open(up_txt_path).readlines()
+    for line_ in lines:
+        # need to strip trailing characters if reading from file (doesn't hurt)
+        line = line_.rstrip()
+        if line.startswith("ID   "):
+            sline = line.split()
+            if len(sline) != 5:
+                raise RuntimeError(f"Unusual UniProtKB ID line found:\n" \
+                                   f"'{line}'")
+            data["up_id"] = sline[1]
+        elif line.startswith("OX   NCBI_TaxID="):
+            # Following strictly the UniProtKB format: 'OX   NCBI_TaxID=<ID>;'
+            data["up_ncbi_taxid"] = line[len("OX   NCBI_TaxID=") : -1]
+            data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip()
+        elif line.startswith("OS   "):
+            # multiple lines possible; last one ends in "."
+            if line[-1] == ".":
+                data["up_organism"] += line[len("OS   ") : -1]
+            else:
+                data["up_organism"] += line[len("OS   ") :] + " "
+        elif line.startswith("SQ   "):
+            sline = line.split()
+            if len(sline) != 8:
+                raise RuntimeError(f"Unusual UniProtKB SQ line found:\n" \
+                                   f"'{line}'")
+            data["up_seqlen"] = int(sline[2])
+            data["up_crc64"] = sline[6]
+        elif line.startswith("     "):
+            sline = line.split()
+            if len(sline) > 6:
+                raise RuntimeError(
+                    "Unusual UniProtKB sequence data line "
+                    + f"found:\n'{line}'"
+                )
+            data["up_sequence"] += "".join(sline)
+        elif line.startswith("DT   "):
+            dt_flds = line[len("DT   ") :].split(", ")
+            if dt_flds[1].upper().startswith("SEQUENCE VERSION "):
+                data["up_last_mod"] = datetime.datetime.strptime(
+                    dt_flds[0], "%d-%b-%Y"
+                )
+            elif dt_flds[1].upper().startswith("ENTRY VERSION "):
+                data["up_entry_version"] = dt_flds[1][len("ENTRY VERSION ") :]
+                if data["up_entry_version"][-1] == ".":
+                    data["up_entry_version"] = data["up_entry_version"][:-1]
+                data["up_entry_version"] = int(data["up_entry_version"])
+        elif line.startswith("GN   Name="):
+            data["up_gn"] = line[len("GN   Name=") :].split(";")[0]
+            data["up_gn"] = data["up_gn"].split("{")[0].strip()
+
+    # in UP isoforms are identified in the AC so no need for this...
+    # -> in PDB (e.g. 8TRE), we see unset _struct_ref.pdbx_db_isoform in such cases
+    data["up_isoform"] = None
+
+    # NOTE: no gene names in this set (use provided names instead)
+    if "up_gn" not in data:
+        _warn_msg(
+            f"No gene name found for UniProtKB entry '{up_ac}', using "
+            + "UniProtKB AC instead."
+        )
+        data["up_gn"] = up_ac
+    if "up_last_mod" not in data:
+        raise RuntimeError(f"No sequence version found for UniProtKB entry " \
+                           f"'{up_ac}'.")
+    if "up_crc64" not in data:
+        raise RuntimeError(f"No CRC64 value found for UniProtKB entry " \
+                           f"'{up_ac}'.")
+    if len(data["up_sequence"]) == 0:
+        raise RuntimeError(f"No sequence found for UniProtKB entry '{up_ac}'.")
+    # check that sequence length and CRC64 is correct
+    if data["up_seqlen"] != len(data["up_sequence"]):
+        raise RuntimeError(
+            "Sequence length of SQ line and sequence data differ for "
+            + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != "
+            + f"{len(data['up_sequence'])}"
+        )
+    data["up_ns_aa"] = _check_sequence(data["up_ac"], data["up_sequence"])
+
+    if "up_id" not in data:
+        raise RuntimeError(f"No ID found for UniProtKB entry '{up_ac}'.")
+    if "up_ncbi_taxid" not in data:
+        raise RuntimeError(f"No NCBI taxonomy ID found for UniProtKB entry "
+                           f"'{up_ac}'.")
+    if len(data["up_organism"]) == 0:
+        raise RuntimeError(f"No organism species found for UniProtKB entry "
+                           f"'{up_ac}'.")
+    return data
+
+
+def _fetch_upkb_entry(up_ac):
+    """Get an UniProtKB entry."""
+    return _get_n_parse_up_entry(
+        up_ac, f"https://rest.uniprot.org/uniprotkb/{up_ac}.txt"
+    )
+
+
+def _fetch_unisave_entry(up_ac, version):
+    """Get an UniSave entry, in contrast to an UniProtKB entry, that allows us
+    to specify a version."""
+    return _get_n_parse_up_entry(
+        up_ac,
+        f"https://rest.uniprot.org/unisave/{up_ac}?format=txt&"
+        + f"versions={version}",
+    )
+
+
+def _check_sequence(up_ac, sequence):
+    """Verify sequence to only contain standard olc."""
+    ns_aa_pos = []  # positions of non-standard amino acids
+    for i, res in enumerate(sequence):
+        if res not in "ACDEFGHIKLMNPQRSTVWY":
+            if res == "U":
+                _warn_msg(
+                    f"Selenocysteine found at position {i+1} of entry "
+                    + f"'{up_ac}', this residue may be missing in the "
+                    + "model."
+                )
+                ns_aa_pos.append(i)
+                continue
+            raise RuntimeError(
+                "Non-standard aa found in UniProtKB sequence "
+                + f"for entry '{up_ac}': {res}, position {i+1}"
+            )
+    return ns_aa_pos
+
+
+# for cache below
+upkb_entry_cache = {} # key = (up_ac, up_version, mdl_sequence)
+def _fetch_upkb_cached(sqe, up_ac, up_version=None):
+    """Get versioned UniProtKB entry (version=None means latest).
+    Get it from cache if already fetched.
+    Return None if failed to parse entry.
+    """
+    # check if in cache already
+    cache_key = (up_ac, up_version, sqe)
+    if cache_key in upkb_entry_cache:
+        return upkb_entry_cache[cache_key]
+    # fetch and parse
+    if up_version is None:
+        up_data = _fetch_upkb_entry(up_ac)
+    else:
+        try:
+            # note: can fail to parse very old UP versions...
+            up_data = _fetch_unisave_entry(up_ac, up_version)
+        except RuntimeError as ex:
+            _warn_msg(f"Error in parsing v{up_version} of {up_ac}:\n{ex}")
+            upkb_entry_cache[cache_key] = None
+            return None
+    min_up_data = None
+
+    while True:
+        mismatches, up_range, mdl_range, covered_aln,  mdl_seqres = _align_sequences(
+            sqe, up_data["up_sequence"], atomseq_aln=False)
+
+        if min_up_data is None or \
+           len(mismatches) < len(min_up_data["mismatches"]):
+            min_up_data = up_data
+            min_up_data["mismatches"] = mismatches
+            min_up_data["up_range"] = up_range
+            min_up_data["mdl_range"] = mdl_range
+            min_up_data["covered_aln"] = covered_aln
+            min_up_data["mdl_seqres"] = mdl_seqres
+
+        if len(mismatches) == 0:
+            # found hit; done
+            break
+        # fetch next one (skip if exceptions happen)
+        next_v = up_data["up_entry_version"] - 1
+        while next_v > 0:
+            try:
+                # note: can fail to parse very old UP versions...
+                up_data = _fetch_unisave_entry(up_ac, next_v)
+                # can move on if no exception happened
+                break
+            except RuntimeError as ex:
+                # _warn_msg(f"Error in parsing v{next_v} of {up_ac}:\n{ex}")
+                # try next one
+                next_v -= 1
+        if next_v == 0:
+            # warn user about failure to find match and abort
+            min_mismatches = min_up_data["mismatches"]
+            msg = f"Sequences not equal from file: {sqe}, from UniProtKB: " \
+                  f"{min_up_data['up_sequence']} ({up_ac}), checked entire " \
+                  f"entry history and best match had following mismatches " \
+                  f"in v{min_up_data['up_entry_version']} (range " \
+                  f"{min_up_data['up_range']}): {min_up_data['mismatches']}."
+            _warn_msg(msg)
+            # raise RuntimeError(msg)
+            break
+
+    # keep in cache
+    upkb_entry_cache[cache_key] = min_up_data
+    return min_up_data
+
+
+def _align_sequences(mdl_sqe, ref_sqe, atomseq_aln=True, ref_fixes=[],
+                     gapped_aa="XOUBJZ"):
+    """Compare sequence while paying attention on non-standard amino acids.
+    
+    Can pass list of tuples for OLCs expected to be changed between ref and mdl.
+    E.g. Jason set would have ref_fixes=[('B', 'D'), ('J', 'L'), ('Z', 'E')].
+    Non-def. AA (listed in gapped_aa) in ref_sqe are assumed to be gaps (-) in
+    mdl_sqe (this is def. in CF/AF for "XOUBJZ").
+    
+    Returns (mismatches, ref_range, mdl_range, covered_aln, mdl_seqres):
+    - mismatches = list of (ref_pos, mdl_pos, olc_ref, olc_mdl)
+      (positions are 1-indexed, None if gap and only if in range)
+    - ref_range / mdl_range = (start, end) tuples with 1-indexed positions of
+      start and end of covered range (mdl_range with respect to mdl_sqe!).
+      Extra non-covered residues in mdl or ref can be counted by comparing
+      ref_range / mdl_range with lengths of ref_sqe / mdl_sqe.
+    - covered_aln = alignment (seq. 0 = REF, seq. 1 = MDL) within covered range
+      (i.e. first and last column have no gaps). If atomseq_aln is True, the
+      alignment only includes non-gap residues of mdl_sqe. OST seq. offsets are
+      set with respect to mdl_sqe, ref_sqe (0-indexed). Note that offsets are
+      only guaranteed to fit ref_range / mdl_range if atomseq_aln is False.
+    - mdl_seqres = mdl_sqe with gaps (-) replaced with seq. from ref. if
+      non-def-AA there or with X otherwise (i.e. both have same length!).
+      Here guaranteed for mdl_seqres to match mdl_sqe if AA in gapped_aa and X
+      are replaced by gaps (-).
+    """
+    # add fixes if needed
+    ref_sqe_fixed = ref_sqe
+    for olc1, olc2 in ref_fixes:
+        ref_sqe_fixed = ref_sqe_fixed.replace(olc1, olc2)
+    # put gaps for parts not modelled by AF2 (i.e. any non-def-AA)
+    ref_sqe_fixed_gapped = ref_sqe_fixed
+    for olc in gapped_aa:
+        assert olc not in mdl_sqe
+        ref_sqe_fixed_gapped = ref_sqe_fixed_gapped.replace(olc, '-')
+    # easy and preferred case: mdl_sqe is subset of ref_sqe
+    ref_idx = ref_sqe_fixed_gapped.find(mdl_sqe)
+    if ref_idx >= 0:
+        mismatches = []
+        ref_range = (ref_idx + 1, ref_idx + len(mdl_sqe))
+        mdl_range = (1, len(mdl_sqe))
+        mdl_seqres = ref_sqe_fixed[ref_idx : ref_idx + len(mdl_sqe)]
+        # we handle covered_aln afterwards...
+        aln_s_ref = ref_sqe[ref_idx : ref_idx + len(mdl_sqe)]
+        aln_s_mdl = mdl_seqres
+    else:
+        # align and report mismatches
+        ref_seq = seq.CreateSequence("REF", ref_sqe_fixed)
+        # use X as first guess for gaps in model
+        mdl_seq = seq.CreateSequence("MDL", mdl_sqe.replace('-', 'x'))
+        aln = seq.alg.SemiGlobalAlign(ref_seq, mdl_seq, seq.alg.BLOSUM62)[0]
+        # get range
+        aligned_indices = [i for i, c in enumerate(aln) \
+                           if c[0] != '-' and c[1] != '-']
+        ref_range = (
+            aln.GetResidueIndex(0, aligned_indices[0]) + 1,
+            aln.GetResidueIndex(0, aligned_indices[-1]) + 1,
+        )
+        mdl_range = (
+            aln.GetResidueIndex(1, aligned_indices[0]) + 1,
+            aln.GetResidueIndex(1, aligned_indices[-1]) + 1,
+        )
+        # build up strings as we go
+        aln_s_ref = ""
+        aln_s_mdl = ""
+        mdl_seqres = ""
+        # collect mismatches and fix seqs as we go
+        mismatches = []
+        for idx, (olc_ref, olc_mdl) in enumerate(aln):
+            # fix seqres as needed
+            if olc_mdl == 'x' and olc_ref in gapped_aa:
+                olc_mdl = olc_ref
+            if olc_mdl != '-':
+                mdl_seqres += olc_mdl
+            if idx >= aligned_indices[0] and idx <= aligned_indices[-1]:
+                # fill aln_s_x as needed
+                if olc_ref != '-':
+                    # must fetch from ref_sqe
+                    ref_idx = aln.GetResidueIndex(0, idx)
+                    aln_s_ref += ref_sqe[ref_idx]
+                    ref_pos = ref_idx + 1
+                else:
+                    aln_s_ref += '-'
+                    ref_pos = None
+                if olc_mdl != '-':
+                    # fetch from mdl_seqres here
+                    # (revert back to mdl_sqe afterwards)
+                    mdl_idx = aln.GetResidueIndex(1, idx)
+                    aln_s_mdl += mdl_seqres[mdl_idx]
+                    mdl_pos = mdl_idx + 1
+                else:
+                    aln_s_mdl += '-'
+                    mdl_pos = None
+                if olc_ref != olc_mdl:
+                    mismatches.append((ref_pos, mdl_pos, olc_ref, olc_mdl))
+    # fix remaining x in mdl_seqres
+    mdl_seqres = mdl_seqres.replace('x', 'X')
+    # create covered_aln
+    s_ref_offset = ref_range[0] - 1
+    s_mdl_offset = mdl_range[0] - 1
+    covered_aln = seq.CreateAlignment(
+        seq.CreateSequence("REF", aln_s_ref),
+        seq.CreateSequence("MDL", aln_s_mdl.replace('x', 'X'))
+    )
+    # cut it once again if needed (only for atomseq_aln)
+    if atomseq_aln:
+        # revert
+        new_cols = [
+            (
+                olc_ref,
+                '-' if olc_mdl == 'x' or olc_mdl in gapped_aa else olc_mdl
+            ) for olc_ref, olc_mdl in zip(aln_s_ref, aln_s_mdl)
+        ]
+        aligned_indices = [i for i, c in enumerate(new_cols) \
+                           if c[0] != '-' and c[1] != '-']
+        s_ref_offset += covered_aln.GetResidueIndex(0, aligned_indices[0])
+        s_mdl_offset += covered_aln.GetResidueIndex(1, aligned_indices[0])
+        cut_cols = new_cols[aligned_indices[0]:aligned_indices[-1]+1]
+        aln_s_ref = "".join([olc_ref for olc_ref, _ in cut_cols])
+        aln_s_mdl = "".join([olc_mdl for _, olc_mdl in cut_cols])
+        covered_aln = seq.CreateAlignment(
+            seq.CreateSequence("REF", aln_s_ref),
+            seq.CreateSequence("MDL", aln_s_mdl)
+        )
+    covered_aln.SetSequenceOffset(0, s_ref_offset)
+    covered_aln.SetSequenceOffset(1, s_mdl_offset)
+    # check post assertions (as in docstring)
+    assert ref_sqe[covered_aln.GetSequenceOffset(0):]\
+        .startswith(covered_aln.sequences[0].gapless_string)
+    if atomseq_aln:
+        assert mdl_sqe[covered_aln.GetSequenceOffset(1)] \
+            == covered_aln.sequences[1].gapless_string[0]
+        assert mdl_sqe[covered_aln.GetSequenceOffset(1):].replace('-', '')\
+            .startswith(covered_aln.sequences[1].gapless_string)
+    else:
+        assert covered_aln.sequences[0].gapless_string \
+            == ref_sqe[ref_range[0]-1:ref_range[1]]
+        assert covered_aln.sequences[1].gapless_string \
+            == mdl_seqres[mdl_range[0]-1:mdl_range[1]]
+        assert mdl_seqres[covered_aln.GetSequenceOffset(1):]\
+            .startswith(covered_aln.sequences[1].gapless_string)
+    assert len(mdl_seqres) == len(mdl_sqe)
+    mdl_sqe_check = mdl_seqres.replace('X', '-')
+    for olc in gapped_aa:
+        mdl_sqe_check = mdl_sqe_check.replace(olc, '-')
+    assert mdl_sqe_check == mdl_sqe.replace('X', '-')
+    #
+    return mismatches, ref_range, mdl_range, covered_aln, mdl_seqres
+
+
+def _get_entities(mdl_data, metadata):
+    """Gather data for the mmCIF (target) entities.
+    Returns (list of cif_ents, list of issues)
+    """
+    # merge info for matching chains
+    unique_chains = {}  # key = sqe_gaps, value = partial cif_ent
+    chain_info = {ch["chain"]: {
+        "up_ac": ch["up_ac"], "up_range": ch["up_range"], "is_synthetic_construct": ch["is_synthetic_construct"]
+    } for ch in metadata["chains"]}
+    ost_ent = mdl_data["ent"]
+    for chn in ost_ent.chains:
+        pdb_chain_id = _get_ch_name(chn, False)
+        if pdb_chain_id not in chain_info:
+            raise RuntimeError(
+                f"Non-described chain {pdb_chain_id} in " \
+                f"{metadata['mdl_id']}"
+            )
+        sqe_gaps = _get_sequence(chn)
+        cif_ent = {
+            "pdb_sequence": sqe_gaps,
+            "pdb_chain_ids": [_get_ch_name(chn, False)],
+            "up_ac": chain_info[pdb_chain_id]["up_ac"],
+            # expected up range as parsed in metadata
+            "exp_up_range": chain_info[pdb_chain_id]["up_range"],
+            "is_synthetic_construct": chain_info[pdb_chain_id]["is_synthetic_construct"],
+        }
+        if sqe_gaps in unique_chains:
+            other_cif_ent = unique_chains[sqe_gaps]
+            # sanity checks
+            for key, value in other_cif_ent.items():
+                if key != "pdb_chain_ids" and value != cif_ent[key]:
+                    raise RuntimeError(
+                        f"Inconsistent info {key} for identical chains for " \
+                        f"chain {pdb_chain_id} vs chains " \
+                        f"{other_cif_ent['pdb_chain_ids']}."
+                    )
+            # add to list of chains
+            other_cif_ent['pdb_chain_ids'].append(pdb_chain_id)
+        else:
+            unique_chains[sqe_gaps] = cif_ent
+    # sort by model chain name (should ensure same order of chains in mmCIF)
+    entities = sorted(
+        unique_chains.values(),
+        key=lambda x: min(x["pdb_chain_ids"])
+    )
+    # compare with info from UP and complete data to return
+    issues = []
+    for cif_ent in entities:
+        sqe_gaps = cif_ent["pdb_sequence"]
+        if cif_ent["is_synthetic_construct"]:
+            cif_ent["seqres"] = sqe_gaps
+            cif_ent["description"] = f"Synthetic construct"
+            continue
+        up_ac = cif_ent["up_ac"]
+        up_data = _fetch_upkb_cached(sqe_gaps, up_ac)
+
+        num_extra_ref = len(up_data["up_sequence"]) - (up_data["up_range"][1] - up_data["up_range"][0] + 1)
+        len_mdl_covered = (up_data["mdl_range"][1] - up_data["mdl_range"][0] + 1)
+        num_extra_mdl = len(sqe_gaps) - len_mdl_covered
+        if len(up_data["mismatches"]) > 0 or num_extra_ref > 0 or num_extra_mdl > 0:
+            # ok to cover subset of UP usually (e.g. Ubiquitin), rest big issue
+            if len(up_data["mismatches"]) > 0 or num_extra_mdl > 0:
+                issue_type = "up_mismatch"
+            else:
+                issue_type = "up_extra"
+
+            if cif_ent['exp_up_range'] is None:
+                cif_up_range = (1, len(up_data["up_sequence"]))
+            else:
+                cif_up_range = tuple(
+                    map(int, cif_ent['exp_up_range'].split('-')))
+            if (issue_type == "up_extra" and up_data["up_range"] != cif_up_range) or issue_type == "up_mismatch":
+                chain_names = ",".join(cif_ent["pdb_chain_ids"])
+                short_data = (
+                    mdl_data['mdl_name'], chain_names, up_ac,
+                    len_mdl_covered, len(up_data["mismatches"]), num_extra_ref, num_extra_mdl
+                )
+                long_data = (up_data["mismatches"], up_data["up_range"], up_data["mdl_range"])
+                issues.append(
+                    (metadata['mdl_id'], issue_type, short_data, long_data)
+                )
+        # cannot deal with gapped sequences here as we cannot map to scores
+        if sqe_gaps != up_data["mdl_seqres"]:
+            issues.append((
+                metadata['mdl_id'],
+                "gapped_seq",
+                (cif_ent['pdb_chain_ids']),
+                (sqe_gaps, up_data["mdl_seqres"])
+            ))
+        cif_ent["seqres"] = up_data["mdl_seqres"]
+        cif_ent["description"] = f"{up_data['up_organism']} {up_data['up_gn']} ({up_data['up_ac']})"
+        cif_ent.update(up_data)
+    return entities, issues
+
+
+def _get_cf_config(cf_config, ur30_db_version=None, tpl_db=None,
+                   tpl_db_version=None):
+    """Define ColabFold setup.
+    Extra info needed from the depositor for the DBs used (these depend on the
+    MMseqs2 server):
+    - ur30_db_version options: see dict in _get_sequence_dbs
+    - tpl_db options: None, "PDB70", "PDB100"
+    - tpl_db_version options: see dict in _get_sequence_dbs
+    -> can be set to None if DB not used at all (incl. custom tpls)
+    Note on versions used over time
+    - first: 2021_03 version of UniRef30, unclear what PDB70
+    - after 13.7.22: updated the UniRef30 to 2022_02 and PDB70 to 220313
+    - after 12.6.23: UniRef30 2023_02, PDB100 (instead of PDB70) 230517
+    - these also define whether a DB was used at all for templates (incl.
+      custom tpls)
+    - DB versions are only relevant if the corresponding DB was actually used
+    """
+    # NOTES:
+    # - UP-TO-DATE (as of March 2024) generic parser given a config.json dict
+    # - custom MSA is assumed to be complemented with extra step (as for Jason)
+
+    # keep version indep. of params (and add commit since versions are meh)
+    cf_version = cf_config["version"]
+    if "commit" in cf_config and cf_config["commit"] is not None:
+        cf_version += f" ({cf_config['commit'][:7]})"
+    # drop fields which are not relevant for model building
+    cf_config = cf_config.copy()
+    for key in ["num_queries", "commit", "version", "user_agent"]:
+        if key in cf_config:
+            del cf_config[key]
+
+    # NOTE: following code from
+    # https://github.com/sokrypton/ColabFold/blob/main/colabfold/batch.py to
+    # understand config
+    # -> should be backward compatible with Tara and Niko sets
+    # -> see also https://github.com/sokrypton/ColabFold/wiki/v1.5.0
+
+    # deal with old names (some settings changed name in v1.5)
+    # -> code taken almost verbatim from https://github.com/sokrypton/ColabFold
+    old_names = {"MMseqs2 (UniRef+Environmental)": "mmseqs2_uniref_env",
+                 "MMseqs2 (UniRef only)": "mmseqs2_uniref",
+                 "unpaired+paired": "unpaired_paired",
+                 "AlphaFold2-multimer-v1": "alphafold2_multimer_v1",
+                 "AlphaFold2-multimer-v2": "alphafold2_multimer_v2",
+                 "AlphaFold2-multimer-v3": "alphafold2_multimer_v3",
+                 "AlphaFold2-ptm": "alphafold2_ptm",
+                 "AlphaFold2": "alphafold2"}
+    msa_mode = old_names.get(cf_config["msa_mode"], cf_config["msa_mode"])
+    if "pair_mode" in cf_config:
+        pair_mode = old_names.get(cf_config["pair_mode"], cf_config["pair_mode"])
+    model_type = old_names.get(cf_config["model_type"], cf_config["model_type"])
+
+    # fix v1.5 defaults for num_recycles and recycle_early_stop_tolerance
+    # -> def. (set as "null" in config):
+    #    - num_recycles == 20 if alphafold2_multimer_v3 else 3
+    #    - recycle_early_stop_tolerance == 0.5 if multimer else 0.0
+    # -> valid from 1.5.0 until 1.5.5 (and probably later)
+    # -> defined in alphafold/model/config.py of steineggerlab/alphafold repo
+    if "num_recycles" in cf_config and cf_config["num_recycles"] is None:
+        if "multimer" in model_type and model_type not in [
+            "alphafold2_multimer_v1", "alphafold2_multimer_v2"
+        ]:
+            cf_config["num_recycles"] = 20
+        else:
+            cf_config["num_recycles"] = 3
+    if "recycle_early_stop_tolerance" in cf_config \
+       and cf_config["recycle_early_stop_tolerance"] is None:
+        cf_config["recycle_early_stop_tolerance"] = \
+            0.5 if "multimer" in model_type else 0.0
+
+    # remove null config entries (ASSUME: None = use default)
+    cf_config = {k: v for k, v in cf_config.items() if v is not None}
+
+    # fetch relevant data
+    # -> MSA mode
+    if msa_mode == "mmseqs2_uniref_env":
+        seq_dbs = ["UniRef", "Environmental"]
+        use_mmseqs = True
+        use_msa = True
+    elif msa_mode == "mmseqs2_uniref":
+        seq_dbs = ["UniRef"]
+        use_mmseqs = True
+        use_msa = True
+    elif msa_mode == "single_sequence":
+        seq_dbs = []
+        use_mmseqs = False
+        use_msa = False
+    elif msa_mode == "custom":
+        seq_dbs = []
+        use_mmseqs = False
+        use_msa = True
+    else:
+        raise ValueError(f"Unknown msa_mode {cf_config['msa_mode']}")
+
+    # -> model type
+    if model_type == "alphafold2_multimer_v1":
+        # AF-Multimer as introduced in AlphaFold v2.1.0
+        use_multimer = True
+        multimer_version = 1
+    elif model_type == "alphafold2_multimer_v2":
+        # AF-Multimer as introduced in AlphaFold v2.2.0
+        use_multimer = True
+        multimer_version = 2
+    elif model_type == "alphafold2_multimer_v3":
+        # AF-Multimer as introduced in AlphaFold v2.3.0
+        use_multimer = True
+        multimer_version = 3
+    elif model_type == "alphafold2_ptm":
+        use_multimer = False
+        multimer_version = None
+    else:
+        raise ValueError(f"Unknown model_type {cf_config['model_type']}")
+
+    # write modeling description
+    mdl_description = f"Model generated using ColabFold v{cf_version}"
+    if use_multimer:
+        mdl_description += f" with AlphaFold-Multimer (v{multimer_version})"
+    else:
+        mdl_description += " with AlphaFold"
+    # early stopping feature of ColabFold
+    upto_mdl = ""
+    upto_rec = ""
+    if cf_config.get("stop_at_score", 100) < 100:
+        upto_mdl = "up to "
+        upto_rec = "up to "
+    if cf_config.get("recycle_early_stop_tolerance", 0) > 0:
+        upto_rec = "up to "
+    if cf_config.get("num_seeds", 1) > 1:
+        mdl_str = f"{cf_config['num_models'] * cf_config['num_seeds']} " \
+                  f"models ({cf_config['num_seeds']} random seeds per " \
+                  f"parameter set)"
+    else:
+        mdl_str = f"{cf_config['num_models']} models"
+    mdl_description += f" producing {upto_mdl}{mdl_str} with {upto_rec}" \
+                       f"{cf_config['num_recycles']} recycles each"
+    if cf_config.get("use_amber", False) or \
+       cf_config.get("num_relax", 0) > 0:
+        mdl_description += ", with AMBER relaxation"
+    else:
+        mdl_description += ", without model relaxation"
+    if cf_config["use_templates"]:
+        # tpl_db == None meant to mean that custom templates were used
+        # -> no need to stress it but just visible in search DBs
+        mdl_description += ", using templates"
+    else:
+        mdl_description += ", without templates"
+        tpl_db = None
+        tpl_db_version = None
+    if cf_config["rank_by"] == "plddt":
+        mdl_description += ", ranked by pLDDT"
+    elif cf_config["rank_by"] == "ptmscore":
+        mdl_description += ", ranked by pTM"
+    elif cf_config["rank_by"] == "multimer":
+        mdl_description += ", ranked by 80*ipTM+20*pTM"
+    else:
+        raise ValueError(f"Unknown rank_by {cf_config['rank_by']}")
+    if use_msa:
+        mdl_description += ", starting from"
+        if use_mmseqs:
+            msa_type = "MSA"
+        else:
+            msa_type = "custom MSA"
+        if use_multimer:
+            if pair_mode == "unpaired_paired":
+                mdl_description += f" paired and unpaired {msa_type}s"
+            elif pair_mode == "paired":
+                mdl_description += f" paired {msa_type}s"
+            elif pair_mode == "unpaired":
+                mdl_description += f" unpaired {msa_type}s"
+            else:
+                raise ValueError(f"Unknown pair_mode {cf_config['pair_mode']}")
+        elif msa_type.startswith('M'):
+            mdl_description += f" an {msa_type}"
+        else:
+            mdl_description += f" a {msa_type}"
+        if use_mmseqs:
+            mdl_description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
+    else:
+        mdl_description += " without an MSA"
+    mdl_description += "."
+
+    return {
+        "params": cf_config,
+        "version": cf_version,
+        "seq_dbs": seq_dbs,
+        "use_mmseqs": use_mmseqs,
+        "use_msa": use_msa,
+        "ur30_db_version": ur30_db_version,
+        "tpl_db": tpl_db,
+        "tpl_db_version": tpl_db_version,
+        "use_multimer": use_multimer,
+        "multimer_version": multimer_version,
+        "description": mdl_description,
+    }
+
+
+def _get_mmseqs2_software(version=None):
+    """Get MMseqs2 as a dictionary, suitable to create a modelcif software
+    object."""
+    return {
+        "name": "MMseqs2",
+        "classification": "data collection",
+        "description": "Many-against-Many sequence searching",
+        "citation": ihm.citations.mmseqs2,
+        "location": "https://github.com/soedinglab/mmseqs2",
+        "type": "package",
+        "version": version,
+    }
+
+
+def _get_colabfold_software(version=None):
+    """Get ColabFold as a dictionary, suitable to create a modelcif software
+    object."""
+    return {
+        "name": "ColabFold",
+        "classification": "model building",
+        "description": "Structure prediction",
+        "citation": ihm.citations.colabfold,
+        "location": "https://github.com/sokrypton/ColabFold",
+        "type": "package",
+        "version": version,
+    }
+
+
+def _get_af2_software(version=None, is_multimer=False):
+    """Get AF2 as dictionary, suitable to create a modelcif software object."""
+    if is_multimer:
+        return {
+            "name": "AlphaFold-Multimer",
+            "classification": "model building",
+            "description": "Structure prediction",
+            "citation": ihm.Citation(
+                pmid=None,
+                title="Protein complex prediction with "
+                + "AlphaFold-Multimer.",
+                journal="bioRxiv",
+                volume=None,
+                page_range=None,
+                year=2021,
+                authors=[
+                    "Evans, R.",
+                    "O'Neill, M.",
+                    "Pritzel, A.",
+                    "Antropova, N.",
+                    "Senior, A.",
+                    "Green, T.",
+                    "Zidek, A.",
+                    "Bates, R.",
+                    "Blackwell, S.",
+                    "Yim, J.",
+                    "Ronneberger, O.",
+                    "Bodenstein, S.",
+                    "Zielinski, M.",
+                    "Bridgland, A.",
+                    "Potapenko, A.",
+                    "Cowie, A.",
+                    "Tunyasuvunakool, K.",
+                    "Jain, R.",
+                    "Clancy, E.",
+                    "Kohli, P.",
+                    "Jumper, J.",
+                    "Hassabis, D.",
+                ],
+                doi="10.1101/2021.10.04.463034",
+            ),
+            "location": "https://github.com/deepmind/alphafold",
+            "type": "package",
+            "version": version,
+        }
+    else:
+        return {
+            "name": "AlphaFold",
+            "classification": "model building",
+            "description": "Structure prediction",
+            "citation": ihm.citations.alphafold2,
+            "location": "https://github.com/deepmind/alphafold",
+            "type": "package",
+            "version": version,
+        }
+
+
+def _get_protocol_steps_and_software(cf_config, model_selection_step=False, binding_test_result=None, refinement=None):
+    """Create the list of protocol steps with software and parameters used."""
+    protocol = []
+
+    # build up SW
+    sw_plus_params = [
+        (
+            _get_colabfold_software(cf_config["version"]), cf_config["params"]
+        )
+    ]
+    if cf_config["use_mmseqs"]:
+        sw_plus_params.append((_get_mmseqs2_software(), {}))
+    sw_plus_params.append((
+        _get_af2_software(is_multimer=cf_config["use_multimer"]), {}
+    ))
+
+    # modelling step
+    protocol.append({
+        "method_type": "modeling",
+        "name": None,
+        "details": cf_config["description"],
+        "input": "target_sequences_and_ref_DBs",
+        "output": "model",
+        "software_plus_params": sw_plus_params,
+    })
+
+    # model selection step
+    if model_selection_step:
+        step = {
+            "method_type": "model selection",
+            "name": None,
+            "details": "Best model selected according to average interface pLDDT.",
+        }
+        step["input"] = "model"
+        step["output"] = "model"
+        step["software_plus_params"] = {}
+        protocol.append(step)
+
+    if binding_test_result == 'nan' or (isinstance(binding_test_result, float) and np.isnan(binding_test_result)):
+        pass
+    elif binding_test_result == 'yes':
+        protocol.append({
+            "method_type": "other",
+            "name": None,
+            "details": "Experimental validation was performed and showed that two proteins are binding",
+            "input": "model",
+            "output": "model",
+            "software_plus_params": {}
+        })
+
+    elif binding_test_result == 'no':
+        protocol.append({
+            "method_type": "other",
+            "name": None,
+            "details": "Experimental validation was performed and showed that two proteins are not binding",
+            "input": "model",
+            "output": "model",
+            "software_plus_params": {}
+        })
+    else:
+        raise RuntimeError(f"Invalid result for binding experimental validation: "
+                           f"{binding_test_result}")
+
+    if refinement is not None:
+        if refinement == "cropped_and_relax":
+            refinement_details = "Selected full-length model cropped to the interface and relaxed with AMBER, using ColabFold's default protocol."
+        elif refinement == "relax":
+            refinement_details = "Selected model relaxed with AMBER, using ColabFold's default protocol."
+        else:
+            raise RuntimeError(
+                "Unexpect protocol step for model refinement: "
+                + f"'{refinement}'"
+            )
+        protocol.append({
+            "method_type": "other",
+            "name": None,
+            "details": refinement_details,
+            "input": "model",
+            "output": "model",
+            "software_plus_params": {}
+        })
+    return protocol
+
+
+def _get_title(metadata):
+    """Get a title for this modelling experiment."""
+    return metadata["title"].strip()
+
+
+def _get_model_details(metadata):
+    """Get the model description."""
+    return metadata["abstract"].strip()
+################################################################################
+
+
+################################################################################
+# ModelCIF HANDLING
+################################################################################
+# pylint: disable=too-few-public-methods
+class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
+    """Predicted accuracy according to the TM-score score in [0,1]"""
+
+    name = "pTM"
+    software = None
+
+
+class _GlobalIpTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
+    """Predicted protein-protein interface score based on TM-score in [0,1]"""
+
+    name = "ipTM"
+    software = None
+
+
+class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+    name = "pLDDT"
+    software = None
+
+
+class _GlobalIpLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+    """Average pLDDT for interface residues of binding partner of B55"""
+    name = "average interface pLDDT"
+    software = None
+
+
+class _GlobalIPAE(modelcif.qa_metric.Global, modelcif.qa_metric.PAE):
+    """Median PAE for interface residues between peptide (rows) and receptor (columns)"""
+    name = "median interface PAE"
+    software = None
+
+
+class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+    name = "pLDDT"
+    software = None
+
+
+class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, modelcif.qa_metric.PAE):
+    """Predicted aligned error (in Angstroms)"""
+    name = "PAE"
+    software = None
+
+
+class _LPeptideAlphabetWithXO(ihm.LPeptideAlphabet):
+    """Have the default amino acid alphabet plus 'X' for unknown residues
+    and 'O' as allowed non-def. AA (U already in alphabet)."""
+
+    # extra entry added according to LPeptideAlphabet def. in
+    # https://python-ihm.readthedocs.io/en/latest/_modules/ihm.html
+    # and https://files.rcsb.org/view/1NTH.cif for values for 'O'.
+
+    def __init__(self):
+        """Create the alphabet."""
+        super().__init__()
+        self._comps["X"] = self._comps["UNK"]
+        self._comps['O'] = ihm.LPeptideChemComp(
+            "PYL", "O", "O", "PYRROLYSINE", "C12 H21 N3 O3"
+        )
+# pylint: enable=too-few-public-methods
+
+
+class _OST2ModelCIF(modelcif.model.AbInitioModel):
+    """Map OST entity elements to ihm.model"""
+
+    def __init__(self, *args, **kwargs):
+        """Initialise a model"""
+        for i in ["ost_entity", "asym", "scores_json", "incl_pae"]:
+            if i not in kwargs:
+                raise TypeError(f"Required keyword argument '{i}' not found.")
+        self.ost_entity = kwargs.pop("ost_entity")
+        self.asym = kwargs.pop("asym")
+        self.scores_json = kwargs.pop("scores_json")
+        self.incl_pae = kwargs.pop("incl_pae")
+
+        # use auth IDs for res. nums and chain names
+        self.use_auth = False
+        # what accuracy to use for PAE? (writer uses 3 anyway)
+        self.pae_digits = 3
+
+        super().__init__(*args, **kwargs)
+
+    def get_atoms(self):
+        # ToDo [internal]: Take B-factor out since it's not a B-factor?
+        # NOTE: this assumes that _get_res_num maps residue to pos. in seqres
+        #       within asym
+        for atm in self.ost_entity.atoms:
+            yield modelcif.model.Atom(
+                asym_unit=self.asym[_get_ch_name(atm.chain, self.use_auth)],
+                seq_id=_get_res_num(atm.residue, self.use_auth),
+                atom_id=atm.name,
+                type_symbol=atm.element,
+                x=atm.pos[0],
+                y=atm.pos[1],
+                z=atm.pos[2],
+                het=atm.is_hetatom,
+                biso=atm.b_factor,
+                occupancy=atm.occupancy,
+            )
+
+    def add_scores(self):
+        """Add QA metrics from AF2 scores."""
+        # global scores
+        self.qa_metrics.extend(
+            (
+                _GlobalPLDDT(np.mean(self.scores_json["plddt"])),
+            )
+        )
+        if self.scores_json["ptm"] != None and self.scores_json["ptm"] != "None":
+            self.qa_metrics.extend(
+                (
+                    _GlobalPTM(self.scores_json["ptm"]),
+                )
+            )
+        if self.scores_json["iptm"] != None and self.scores_json["iptm"] != "None":
+            self.qa_metrics.extend(
+                (
+                    _GlobalIpTM(self.scores_json["iptm"]),
+                )
+            )
+        if self.scores_json["ipLDDT"] != None and self.scores_json["ipLDDT"] != "None":
+            self.qa_metrics.extend(
+                (
+                    _GlobalIpLDDT(self.scores_json["ipLDDT"]),
+                )
+            )
+        if self.scores_json["iPAE"] != None and self.scores_json["iPAE"] != "None":
+            self.qa_metrics.extend(
+                (
+                    _GlobalIPAE(self.scores_json["iPAE"]),
+                )
+            )
+
+        # NOTE: none of the below is expected to work if we have unmodelled gaps!
+
+        # local scores
+        lpae = []
+        i = 0
+        for chn_i in self.ost_entity.chains:
+            ch_name_i = _get_ch_name(chn_i, self.use_auth)
+            for res_i in chn_i.residues:
+                # local pLDDT
+                res_num_i = _get_res_num(res_i, self.use_auth)
+                self.qa_metrics.append(
+                    _LocalPLDDT(
+                        self.asym[ch_name_i].residue(res_num_i),
+                        self.scores_json["plddt"][i],
+                    )
+                )
+
+                # pairwise predicted aligned error (PAE)
+                if self.incl_pae:
+                    j = 0
+                    for chn_j in self.ost_entity.chains:
+                        ch_name_j = _get_ch_name(chn_j, self.use_auth)
+                        for res_j in chn_j.residues:
+                            res_num_j = _get_res_num(res_j, self.use_auth)
+                            pae_ij = self.scores_json["pae"][i][j]
+                            lpae.append(
+                                _LocalPairwisePAE(
+                                    self.asym[ch_name_i].residue(res_num_i),
+                                    self.asym[ch_name_j].residue(res_num_j),
+                                    round(pae_ij, self.pae_digits),
+                                )
+                            )
+                            j += 1
+
+                i += 1
+
+        if self.incl_pae:
+            self.qa_metrics.extend(lpae)
+
+
+def _get_modelcif_entities(target_ents, asym_units, system):
+    """Create ModelCIF entities and asymmetric units."""
+    alphabet = _LPeptideAlphabetWithXO()
+    for cif_ent in target_ents:
+        if cif_ent["is_synthetic_construct"]:
+            references = []
+            mdlcif_ent = modelcif.Entity(
+                cif_ent["seqres"],
+                description=cif_ent["description"],
+                alphabet=alphabet,
+                source=ihm.source.Synthetic(),
+                references=references,
+            )
+        else:
+            # collect references
+            up_ref = modelcif.reference.UniProt(
+                code=cif_ent["up_id"],
+                accession=cif_ent["up_ac"],
+                isoform=cif_ent["up_isoform"],
+                ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"],
+                organism_scientific=cif_ent["up_organism"],
+                sequence_version_date=cif_ent["up_last_mod"],
+                sequence_crc64=cif_ent["up_crc64"],
+                sequence=cif_ent["up_sequence"],
+            )
+            # ASSUME: full model covered w/o mismatches
+            # -> NOTE: sequence passed above is cut based on alignments!
+            up_ref.alignments.append(modelcif.reference.Alignment(
+                db_begin=cif_ent["up_range"][0],
+                db_end=cif_ent["up_range"][1],
+                entity_begin=1,
+                entity_end=len(cif_ent["seqres"]),
+                seq_dif=[
+                    ihm.reference.SeqDif(
+                        mismatch[1],
+                        alphabet[mismatch[2]],
+                        alphabet[mismatch[3]]
+                    ) for mismatch in cif_ent["mismatches"]
+                ]
+            ))
+            #
+            references = [up_ref]
+            # combine into ModelCIF entity
+            mdlcif_ent = modelcif.Entity(
+                cif_ent["seqres"],
+                description=cif_ent["description"],
+                alphabet=alphabet,
+                source=ihm.source.Natural(
+                    ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"],
+                    scientific_name=cif_ent["up_organism"],
+                ),
+                references=references,
+            )
+        # NOTE: this assigns (potentially new) alphabetic chain names
+        for pdb_chain_id in cif_ent["pdb_chain_ids"]:
+            asym_units[pdb_chain_id] = modelcif.AsymUnit(
+                mdlcif_ent, strand_id=pdb_chain_id,
+            )
+        system.entities.append(mdlcif_ent)
+
+
+def _get_assoc_pae_file(entry_id, mdl_name):
+    """Generate a associated file object to extract PAE to extra file."""
+    return modelcif.associated.LocalPairwiseQAScoresFile(
+        f"{mdl_name}_local_pairwise_qa.cif",
+        categories=["_ma_qa_metric_local_pairwise"],
+        copy_categories=["_ma_qa_metric"],
+        entry_id=entry_id,
+        entry_details="This file is an associated file consisting "
+        + "of local pairwise QA metrics. This is a partial mmCIF "
+        + "file and can be validated by merging with the main "
+        + "mmCIF file containing the model coordinates and other "
+        + "associated data.",
+        details="Predicted aligned error",
+    )
+
+
+def _get_assoc_png_file(fle_path, png_type):
+    """Generate a modelcif.associated.File object pointing to PNG file
+    with content defined by png_type (coverage, plddt, or pae).
+    """
+    details = {
+        "coverage": "PNG file showing number of sequences in the MSA covering "
+        + "each position in the target sequences",
+        "plddt": "PNG file showing pLDDT at each residue position for each "
+        + "of the 5 models produced",
+        "pae": "PNG file showing the PAE matrices for each of the 5 models "
+        + "produced",
+    }
+    afile = modelcif.associated.File(
+        fle_path,
+        details=details[png_type],
+    )
+    # NOTE: file_format can be set to "png" in future ModelCIF versions
+    # (i.e. when https://github.com/ihmwg/ModelCIF/issues/17 is resolved)
+    afile.file_format = "other"
+    afile.file_content = "other"
+    return afile
+
+
+def _get_assoc_mdl_file(fle_path, data_json):
+    """Generate a modelcif.associated.File object that looks like a CIF file.
+    The dedicated CIFFile functionality in modelcif is not used here as it
+    would also try to write that file.
+    """
+    cfile = modelcif.associated.File(
+        fle_path,
+        details=f"models-details",
+    )
+    cfile.file_format = "cif"
+    return cfile
+
+
+def _get_assoc_zip_file(fle_path, data_json):
+    """Create a modelcif.associated.File object that looks like a ZIP file.
+    This is NOT the archive ZIP file for the PAEs but a ZIP file to be stored
+    inside the ZIP archive of the selected model."""
+    zfile = modelcif.associated.File(
+        fle_path,
+        details="archive with multiple files for "
+        + f"#{data_json['mdl_name']}",
+    )
+    zfile.file_format = "other"
+    return zfile
+
+
+def _get_associated_files(mdl_name, arc_files):
+    """Create entry for associated files."""
+    # package all into zip file
+    # NOTE: by convention MA expects the ZIP file to have the same name as the
+    # model mmCIF file
+    return modelcif.associated.Repository(
+        "",
+        [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)],
+    )
+
+
+def _get_sequence_dbs(config_data):
+    """Get ColabFold seq. DBs."""
+    # Uses a hard-coded list of known DBs used in ColabFold
+    # -> see also notes in _get_cf_config
+    db_dict = {
+        "UniRef_2021_03": modelcif.ReferenceDatabase(
+            "UniRef30",
+            "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz",
+            version="2021_03",
+        ),
+        "UniRef_2022_02": modelcif.ReferenceDatabase(
+            "UniRef30",
+            "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2202.tar.gz",
+            version="2022_02",
+        ),
+        "UniRef_2023_02": modelcif.ReferenceDatabase(
+            "UniRef30",
+            "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2302.tar.gz",
+            version="2023_02",
+        ),
+        "Environmental": modelcif.ReferenceDatabase(
+            "ColabFold DB",
+            "https://wwwuser.gwdg.de/~compbiol/colabfold/"
+            + "colabfold_envdb_202108.tar.gz",
+            version="2021_08",
+        ),
+        "PDB100_230517": modelcif.ReferenceDatabase(
+            "PDB100",
+            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
+            + "hhsuite_dbs/pdb100_foldseek_230517.tar.gz",
+            release_date=datetime.datetime(2023, 5, 17)
+        ),
+        "PDB70_211027": modelcif.ReferenceDatabase(
+            "PDB70",
+            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
+            + "hhsuite_dbs/pdb70_from_mmcif_211027.tar.gz",
+            release_date=datetime.datetime(2021, 10, 27)
+        ),
+        "PDB70_211117": modelcif.ReferenceDatabase(
+            "PDB70",
+            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
+            + "hhsuite_dbs/pdb70_from_mmcif_211117.tar.gz",
+            release_date=datetime.datetime(2021, 11, 17)
+        ),
+        "PDB70_220313": modelcif.ReferenceDatabase(
+            "PDB70",
+            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
+            + "hhsuite_dbs/pdb70_from_mmcif_220313.tar.gz",
+            release_date=datetime.datetime(2022, 3, 13)
+        ),
+    }
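+    # search keys follow the "<name>_<version>" pattern assembled below,
+    # e.g. "UniRef_2023_02" or "PDB70_220313"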
+    # fill list of DBs
+    seq_dbs = []
+    search_keys = []
+    for seq_db in config_data["seq_dbs"]:
+        if seq_db == "UniRef":
+            if config_data['ur30_db_version'] is None:
+                raise ValueError("Cannot use UniRef without version")
+            search_key = f"UniRef_{config_data['ur30_db_version']}"
+        else:
+            search_key = seq_db
+        search_keys.append(search_key)
+    if config_data["tpl_db"] is not None:
+        if config_data["tpl_db_version"] is None:
+            raise ValueError("Cannot have tpl DB without version")
+        search_keys.append(
+            f"{config_data['tpl_db']}_{config_data['tpl_db_version']}"
+        )
+    for search_key in search_keys:
+        if search_key not in db_dict:
+            raise ValueError(f"Unknown seq. DB {search_key}")
+        seq_dbs.append(db_dict[search_key])
+    return seq_dbs
+
+
+def _assemble_modelcif_software(soft_dict, params_dict):
+    """Create a modelcif.SoftwareWithParameters instance from dictionaries."""
+    # create SW object
+    sw = modelcif.Software(
+        soft_dict["name"],
+        soft_dict["classification"],
+        soft_dict["description"],
+        soft_dict["location"],
+        soft_dict["type"],
+        soft_dict["version"],
+        citation=soft_dict["citation"],
+    )
+    # assemble parameters
+    params = []
+    for key, val in params_dict.items():
+        params.append(modelcif.SoftwareParameter(key, val))
+    # put them together
+    return modelcif.SoftwareWithParameters(sw, params)
+
+
+def _get_modelcif_protocol_software(js_step):
+    """Assemble software entries for a ModelCIF protocol step."""
+    # new setup in python-modelcif (as of late 2023): params with each SW
+    sw_list = []
+    for sw, sw_params in js_step["software_plus_params"]:
+        sw_list.append(_assemble_modelcif_software(sw, sw_params))
+    # group and done...
+    if sw_list:
+        return modelcif.SoftwareGroup(sw_list)
+    else:
+        return None
+
+
+def _get_modelcif_protocol_data(data_label, target_entities, model, ref_dbs):
+    """Assemble data for a ModelCIF protocol step."""
+    if data_label == "target_sequences_and_ref_DBs":
+        data = modelcif.data.DataGroup(target_entities)
+        data.extend(ref_dbs)
+    elif data_label == "model":
+        data = model
+    else:
+        raise RuntimeError(f"Unknown protocol data: '{data_label}'")
+    return data
+
+
+def _get_modelcif_protocol(protocol_steps, target_entities, model, ref_dbs):
+    """Create the protocol for the ModelCIF file."""
+    protocol = modelcif.protocol.Protocol()
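+    # each step in protocol_steps (from the JSON metadata) is expected to
+    # provide the keys "software_plus_params", "input", "output", "name",
+    # "details" and "method_type", as used below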
+    for js_step in protocol_steps:
+        sftwre = _get_modelcif_protocol_software(js_step)
+        input_data = _get_modelcif_protocol_data(
+            js_step["input"], target_entities, model, ref_dbs
+        )
+        output_data = _get_modelcif_protocol_data(
+            js_step["output"], target_entities, model, ref_dbs
+        )
+
+        protocol.steps.append(
+            modelcif.protocol.Step(
+                input_data=input_data,
+                output_data=output_data,
+                name=js_step["name"],
+                details=js_step["details"],
+                software=sftwre,
+            )
+        )
+        protocol.steps[-1].method_type = js_step["method_type"]
+    return protocol
+
+
+def _compress_cif_file(cif_file):
+    """Compress cif file and delete original."""
+    with open(cif_file, "rb") as f_in:
+        with gzip.open(cif_file + ".gz", "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    os.remove(cif_file)
+
+
+def _package_associated_files(repo):
+    """Compress associated files into single zip file and delete original."""
+    # zip settings tested for good speed vs compression
+    for archive in repo.files:
+        with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
+            for zfile in archive.files:
+                cif_zip.write(zfile.path, arcname=zfile.path)
+                os.remove(zfile.path)
+
+
+def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress,
+                       add_pae, add_files):
+    """Mix all the data into a ModelCIF file."""
+    print("    generating ModelCIF objects...", end="")
+    pstart = timer()
+    # create system to gather all the data
+    system = modelcif.System(
+        title=data_json["title"],
+        id=data_json["mdl_id"].upper(),
+        model_details=data_json["model_details"],
+    )
+
+    # create an asymmetric unit and an entity per target sequence
+    asym_units = {}
+    _get_modelcif_entities(data_json["target_entities"], asym_units, system)
+
+    # audit_authors
+    system.authors.extend(data_json["audit_authors"])
+
+    # set up the model to produce coordinates
+    model = _OST2ModelCIF(
+        assembly=modelcif.Assembly(asym_units.values()),
+        asym=asym_units,
+        ost_entity=ost_ent,
+        scores_json=data_json,
+        name=data_json["mdl_name"],
+        incl_pae=add_pae,
+    )
+    print(f" ({timer()-pstart:.2f}s)")
+    print("    processing QA scores...", end="", flush=True)
+    pstart = timer()
+    model.add_scores()
+    print(f" ({timer()-pstart:.2f}s)")
+
+    model_group = modelcif.model.ModelGroup([model])
+    system.model_groups.append(model_group)
+
+    # handle additional files
+    arc_files = []
+    if add_pae:
+        arc_files.append(_get_assoc_pae_file(system.id, mdl_name))
+    arc_files.extend(add_files)
+    if arc_files:
+        system.repositories.append(_get_associated_files(mdl_name, arc_files))
+
+    # get data and steps
+    ref_dbs = _get_sequence_dbs(data_json["cf_config"])
+    protocol = _get_modelcif_protocol(
+        data_json["protocol"], system.entities, model, ref_dbs,
+    )
+    system.protocols.append(protocol)
+
+    # write modelcif System to file
+    print("    write to disk...", end="", flush=True)
+    pstart = timer()
+    # NOTE: this will dump PAE on path provided in add_scores
+    # -> hence we cheat by changing path and back while being exception-safe...
+    oldpwd = os.getcwd()
+    os.chdir(out_dir)
+    mdl_fle = f"{mdl_name}.cif"
+    try:
+        with open(mdl_fle, "w", encoding="ascii") as mmcif_fh:
+            modelcif.dumper.write(mmcif_fh, [system])
+        if arc_files:
+            _package_associated_files(system.repositories[0])
+        if compress:
+            _compress_cif_file(mdl_fle)
+            mdl_fle += ".gz"
+    finally:
+        os.chdir(oldpwd)
+    print(f" ({timer()-pstart:.2f}s)")
+    return mdl_fle
+################################################################################
+
+
+# In[5]:
+
+
+################################################################################
+# HANDLE FULL DATA SET
+################################################################################
+
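+# Note on naming (derived from the code below): e.g. in the screen_256/,
+# design/ and holoenzyme/ directories, a "<prefix>_relaxed_<suffix>.pdb" or
+# "<prefix>_unrelaxed_<suffix>.pdb" file has its scores in
+# "<prefix>_scores_<suffix>.json" within the same directory.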
+def generate_json_filepath(pdb_filename, directory, input_data_path):
+    """Derive the path of the ColabFold scores JSON file belonging to a PDB
+    file, based on the naming scheme of the given input directory."""
+    # Split the pdb filename into parts
+    parts = pdb_filename.split('_')
+
+    if directory == "relaxed_figures/":
+        if parts[-2] == "seed":
+            parts[-10] = parts[-10].replace('relaxed', 'scores')
+            parts[-1] = parts[-1].replace('.pdb', '.json')
+            json_filename = '_'.join(parts)
+            json_file_path = input_data_path + '/' + directory + json_filename
+
+        if parts[-2] == "only":
+            parts[3] = parts[3].replace('relaxed', 'scores')
+            if parts[6] == 'v3':
+                parts[6] = 'alphafold2_multimer_v3'
+            else:
+                parts[6] = 'alphafold2_multimer_v3_' + parts[6]
+            parts[-2] = parts[-2].replace('only', 'seed')
+            parts[-1] = parts[-1].replace('pep.pdb', '000.json')
+            json_filename = '_'.join(parts)
+            json_file_path = input_data_path + '/' + "screen_256/" + json_filename
+    elif directory in ("screen_256/", "design/", "holoenzyme/"):
+        json_file_path = os.path.join(
+            input_data_path,
+            directory,
+            pdb_filename.replace("_relaxed_", "_scores_")
+            .replace("_unrelaxed_", "_scores_")
+            .replace(".pdb", ".json")
+        )
+    else:
+        raise ValueError(f"Unknown input directory '{directory}'")
+
+    return json_file_path
+
+
+def _translate2modelcif(metadata, opts, add_files=()):
+    """Convert a model with its accompanying data to ModelCIF."""
+    mdl_id = metadata["mdl_id"]  # here for example: ma-osf-ppp2r2a-00x
+    # skip if done already (done later here due to info to be returned)
+
+    if opts.compress:
+        cifext = "cif.gz"
+    else:
+        cifext = "cif"
+    mdl_path = os.path.join(opts.out_dir, f"{mdl_id}.{cifext}")
+
+    # prepare data for models to convert (also gets all issues)
+    issues = []
+
+    pdb_file_path = os.path.join(
+        opts.input_data_path, metadata['directory'], metadata['pdb_file_name']
+    )
+    with open(pdb_file_path, 'r') as file:
+        pdb_data = file.read()
+
+    mdl_dict = dict()
+    mdl_dict["ent"] = io.PDBStrToEntity(
+        pdb_data,
+        profile=io.profiles["DEFAULT"],
+        process=True
+    )
+    file_base, file_ext = os.path.splitext(metadata['pdb_file_name'])
+    mdl_dict["mdl_name"] = file_base
+    json_file_path = metadata['json_file_path']
+    with open(json_file_path, 'r') as file:
+        json_data = json.load(file)
+    json_data["iPAE"] = metadata['iPAE']
+    json_data["ipLDDT"] = metadata['ipLDDT']
+
+    # subset the scores to the modelled (up_range) residues where needed
+    chain_lengths = []
+    for ch in metadata["chains"]:
+        ost_chain = mdl_dict["ent"].FindChain(ch["chain"])
+        if ch["up_range"] is not None and "only_pep" in metadata['pdb_file_name']:
+            start, end = map(int, ch["up_range"].split('-'))
+            # I make the assumption that the chain with a subset is the last one
+            previous_chains_length = sum(chain_lengths)
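+            # plddt is a per-residue list; pae is an NxN matrix, so for pae
+            # both the rows and the columns are trimmed to the kept range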
+            json_data["plddt"] = json_data["plddt"][:previous_chains_length] + \
+                json_data["plddt"][(previous_chains_length+start-1)
+                                    :(previous_chains_length+end)]
+            json_data["pae"] = json_data["pae"][:previous_chains_length] + \
+                json_data["pae"][(previous_chains_length+start-1)
+                                  :(previous_chains_length+end)]
+            for i in range(len(json_data["pae"])):
+                row = json_data["pae"][i]
+                json_data["pae"][i] = row[:previous_chains_length] + \
+                    row[(previous_chains_length+start-1)
+                         :(previous_chains_length+end)]
+            json_data['ptm'] = None
+            json_data['iptm'] = None
+            continue
+        chain_lengths.append(ost_chain.residue_count)
+
+    mdl_dict["scores"] = json_data
+    metadata["plddt"] = json_data["plddt"]
+    entities, issues = _get_entities(mdl_dict, metadata)
+    scores_issues = _check_scores(mdl_dict, metadata)
+    issues.extend(scores_issues)
+
+    # abort here if only checks were requested or if the model is already done
+    if opts.checks_only:
+        return issues
+    if os.path.exists(mdl_path):
+        print(f"  {mdl_id} already done...")
+        return issues
+
+    # now the translation from the single function
+    mdlcf_json = {}
+    # the timepoint, the directory and chain_B are the elements that
+    # determine the config
+    config_dict = metadata["config"].copy()
+    cf_config = _get_cf_config(
+        config_dict, metadata['ur30_db_version'], metadata['tpl_db'], metadata['tpl_db_version'])
+
+    mdlcf_json["audit_authors"] = _get_audit_authors()
+    if (mdl_dict["scores"]["ipLDDT"] is not None) and (mdl_dict["scores"]["ipLDDT"] != 'None') and (float(mdl_dict["scores"]["ipLDDT"]) > 0):
+        mdlcf_json["protocol"] = _get_protocol_steps_and_software(
+            cf_config, model_selection_step=True, binding_test_result=metadata['binding_test_result'], refinement=metadata['refinement'])
+    else:
+        mdlcf_json["protocol"] = _get_protocol_steps_and_software(
+            cf_config, model_selection_step=False, binding_test_result=metadata['binding_test_result'], refinement=metadata['refinement'])
+    mdlcf_json["cf_config"] = cf_config
+    mdlcf_json["mdl_id"] = mdl_id  # used for entry ID
+    mdlcf_json["mdl_name"] = file_base
+    mdlcf_json["target_entities"] = entities
+    for scores_key in ["plddt", "pae", "ipLDDT", "iPAE"]:
+        mdlcf_json[scores_key] = mdl_dict["scores"][scores_key]
+    mdlcf_json["title"] = _get_title(metadata)
+    mdlcf_json["model_details"] = _get_model_details(metadata)
+    mdlcf_json["iptm"] = json_data['iptm']
+    mdlcf_json["ptm"] = json_data['ptm']
+
+    print(f"  translating {mdl_id}...")
+    pdb_start = timer()
+
+    file_name = _store_as_modelcif(
+        data_json=mdlcf_json,
+        ost_ent=mdl_dict["ent"],
+        out_dir=opts.out_dir,
+        mdl_name=mdl_id,
+        compress=opts.compress,
+        add_pae=not opts.no_extra_files,
+        add_files=add_files
+    )
+
+    # check if result can be read and has expected seq.
+    mdl_path = os.path.join(opts.out_dir, file_name)
+    ent, ss = io.LoadMMCIF(mdl_path, seqres=True)
+    exp_seqs = []
+    for trg_ent in mdlcf_json["target_entities"]:
+        exp_seqs += [trg_ent["pdb_sequence"]] * len(trg_ent["pdb_chain_ids"])
+    assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}"
+    # NOTE: here we expect auth = label IDs
+    ent_seq = "".join([_get_sequence(chn, False) for chn in ent.chains])
+    ent_seq_a = "".join([_get_sequence(chn, True) for chn in ent.chains])
+    assert ent_seq == ent_seq_a
+    assert ent_seq == "".join(exp_seqs), f"Bad seq. {mdl_id}"
+    ent_seqres = [ss.FindSequence(chn.name).string for chn in ent.chains]
+    exp_seqres = []
+    for trg_ent in mdlcf_json["target_entities"]:
+        exp_seqres += [trg_ent["seqres"]] * len(trg_ent["pdb_chain_ids"])
+    assert ent_seqres == exp_seqres, f"Bad seqres {mdl_id}"
+    print(f"  ... done with {mdl_id} ({timer()-pdb_start:.2f}s).")
+
+    return issues
+
+
+# In[6]:
+
+
+def _get_metadata(input_data_path, single_model=None):
+    """Collect the metadata for all models (or a single one) from the CSV
+    table of submitted structures and the per-directory config files."""
+    metadata_csv = pd.read_csv(os.path.join(
+        input_data_path, "info_of_submitted_structures.csv"), sep='\t')
+    metadata_full = []
+
+    # fetch configs
+    configs = {}
+    for directory in metadata_csv['directory'].unique():
+        config_path = os.path.join(input_data_path, directory)
+        with open(os.path.join(config_path, "config.json")) as config_file:
+            configs[directory] = json.load(config_file)
+    for mdl_idx, metadata in metadata_csv.iterrows():
+        mdl_num = mdl_idx + 1
+        if single_model is not None and mdl_num != int(single_model):
+            continue
+        data = dict()
+        data['mdl_num'] = mdl_num
+        data['mdl_id'] = "ma-osf-ppp2r2a-" + f"{mdl_num:03}"
+        data['title'] = metadata['title']
+
+        data['abstract'] = metadata['description']
+        chains = []
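+        # each chain_A..chain_D column holds either "synthetic construct", an
+        # accession (up_ac), or "<up_ac>:<range>" for a subsequence (as
+        # parsed below)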
+        for chain_id in "ABCD":
+            if isinstance(metadata["chain_" + chain_id], str):
+                chain_data = metadata["chain_" + chain_id]
+                if chain_data == "synthetic construct":
+                    chain = {
+                        "chain": chain_id,
+                        "up_ac": None,
+                                            "up_range": None,
+                                            "is_synthetic_construct": True
+                    }
+                else:
+                    if ':' in chain_data:
+                        up_ac, up_range = chain_data.split(':', 1)
+                    else:
+                        up_ac, up_range = (chain_data, None)
+
+                    chain = {
+                        "chain": chain_id,
+                        "up_ac": up_ac,
+                        "up_range": up_range,
+                        "is_synthetic_construct": False
+                    }
+                chains.append(chain)
+
+        data['chains'] = chains
+
+        data['ipLDDT'] = metadata['ipLDDT_in_conserved_binding_site']
+        data['iPAE'] = metadata['iPAE_in_conserved binding site']
+        data['json_file_path'] = generate_json_filepath(
+            metadata['pdb'], metadata['directory'], input_data_path)
+        data['pdb_file_name'] = metadata['pdb']
+        data['directory'] = metadata['directory']
+        if metadata['timepoint'] == "between 13.7.22 and 12.6.23":
+            data['ur30_db_version'] = "2022_02"
+            data['tpl_db'] = "PDB70"
+            data['tpl_db_version'] = "220313"
+        elif metadata['timepoint'] == "after 12.6.23":
+            data['ur30_db_version'] = "2023_02"
+            data['tpl_db'] = "PDB100"
+            data['tpl_db_version'] = "230517"
+        else:
+            raise RuntimeError(f"Unknown databases version for this timepoint :"
+                               f"{metadata['timepoint']}")
+        data['config'] = configs[metadata['directory']]
+
+        # protocol
+        if metadata['directory'] == "relaxed_figures/":
+            if "only_pep" in metadata['pdb']:
+                data['refinement'] = "cropped_and_relax"
+            else:
+                data['refinement'] = "relax"
+        else:
+            data['refinement'] = None
+        data['binding_test_result'] = metadata['binding']
+        metadata_full.append(data)
+    return metadata_full
+
+
+# In[7]:
+
+
+################################################################################
+# HANDLE ONE MODEL IN A NOTEBOOK
+################################################################################
+
+def _main():
+    """Run as script."""
+
+    # parse command line options
+    opts = _parse_args()
+
+    # parse/fetch global data
+    metadata_all = _get_metadata(opts.input_data_path, opts.single_model)
+
+    # iterate over models
+    print(f"Working on models in {opts.input_data_path}...")
+    issues = []
+    for metadata in metadata_all:
+        new_issues = _translate2modelcif(metadata, opts)
+        issues.extend(new_issues)
+    print(f"... done with models in {opts.input_data_path}.")
+
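+    # export to 3D-Beacons: models that passed the binding test plus all
+    # models from the relaxed_figures/ and holoenzyme/ directories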
+    if opts.single_model is None:
+        # dump issues
+        issues_file_path = os.path.join(opts.out_dir, "issues.json")
+        with open(issues_file_path, "w") as issues_file:
+            json.dump(issues, issues_file)
+        # dump info on which ones to export to 3D-Beacons
+        to_export_file_path = os.path.join(opts.out_dir, "MA_to_export.json")
+        to_export = {
+            metadata["mdl_num"]: (
+                metadata["binding_test_result"] == "yes"
+                or metadata["directory"] == "relaxed_figures/"
+                or metadata["directory"] == "holoenzyme/"
+            )
+            for metadata in metadata_all
+        }
+        with open(to_export_file_path, "w") as to_export_file:
+            json.dump(to_export, to_export_file)
+
+
+if __name__ == "__main__":
+    _main()