diff --git a/projects/cancer-PPI-domains/README.md b/projects/cancer-PPI-domains/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad90c6d92b647ee39e8acdc0e6e9525e5e681499
--- /dev/null
+++ b/projects/cancer-PPI-domains/README.md
@@ -0,0 +1,23 @@
+# Modelling of domains of interacting proteins from the computed cancer interactome
+
+[Link to project in ModelArchive](https://modelarchive.org/doi/10.5452/ma-t3vr3) (incl. background on the project itself)
+
+Setup:
+- Domains of interacting proteins extracted from the full-length proteins (sequences from UniProtKB)
+- Models generated from the domain sequences, which may map discontinuously onto the full-length sequence
+- Same protocol used as in [model set for core eukaryotic protein complexes](https://www.modelarchive.org/doi/10.5452/ma-bak-cepc)
+  - Paired multiple sequence alignment (MSA) generated for each dimer
+  - Model using AlphaFold ("model 3" parameters; pTM monomer version) with a 200 residue gap between the two chains, without templates and without model relaxation
+- Input received from the depositors:
+  - one zip file with all the PDB files (no b-factor values, residue numbers matching position in UniProtKB sequence)
+  - one zip file with all the extra files (1 FASTA file with the paired alignment, 1 NPZ file with pLDDT, PAE and contact probabilities; see the snippet below)
+  - a CSV file with description and UniProtKB links for each protein
+
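+As a quick orientation, a minimal sketch of how one of the per-model NPZ score files can be inspected (the file name is hypothetical; the key names are the ones the conversion script below expects):
+
+```python
+import numpy as np
+
+scores = np.load("O00329-D1_P27986-D2.npz")     # hypothetical file name
+print(scores["plddt"].shape)                    # per-residue pLDDT over both chains
+print(scores["predicted_aligned_error"].shape)  # PAE matrix
+print(scores["contact_prob"].shape)             # contact probabilities (same shape as PAE)
+```
+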
+Special features here:
+- Custom MSA generation, with the intermediate result (paired MSA) included in the accompanying data
+- PAE and contact probabilities only kept for inter-chain residue pairs (see the slicing sketch below)
+- Author-provided residue numbers kept as auth_seq_num
+- Mapping to most recent UniProtKB sequence generated, checked and stored as fasta files (ModelCIF file only has covered range with respect to the originally used sequence)
+
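+The inter-chain reduction mirrors what the conversion script does when it encounters a full PAE/contact-probability matrix; a sketch (file name and domain lengths are made up):
+
+```python
+import numpy as np
+
+len_a, len_b = 120, 95              # hypothetical domain lengths of chains A and B
+scores = np.load("some_model.npz")  # hypothetical file name
+pae = scores["predicted_aligned_error"]
+if pae.shape == (len_a + len_b, len_a + len_b):
+    # keep only the inter-chain block: chain A rows vs. chain B columns
+    pae = pae[:len_a, len_a:]
+assert pae.shape == (len_a, len_b)
+```
+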
+Content:
+- translate2modelcif.py : script to do the conversion; compatible with the Docker setup from [ma-wilkins-import](https://git.scicore.unibas.ch/schwede/ma-wilkins-import/-/tree/6bbd6fa7ec53e1a0971fba40c96fa971d1022f74) (and the script is based on code from there)
diff --git a/projects/cancer-PPI-domains/translate2modelcif.py b/projects/cancer-PPI-domains/translate2modelcif.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5076cd6c684f277c2bb713f0d40a12e19a4253b
--- /dev/null
+++ b/projects/cancer-PPI-domains/translate2modelcif.py
@@ -0,0 +1,1106 @@
+#! /usr/local/bin/ost
+"""Translate models from Jing Zhang from PDB + extra data into ModelCIF."""
+
+# EXAMPLES for running:
+"""
+GT test setup:
+ost scripts/translate2modelcif.py ./pdbs ./accompanying_data \
+    ./modelarchive_submission.csv ./modelcif
+# DO ALL
+ost scripts/translate2modelcif.py ./pdbs ./accompanying_data \
+    ./modelarchive_submission.csv ./modelcif --compress > script_out.txt
+"""
+
+import argparse
+import datetime
+import gzip
+import os
+import shutil
+import sys
+import zipfile
+
+from timeit import default_timer as timer
+import numpy as np
+import requests
+import ujson as json
+
+import ihm
+import ihm.citations
+import modelcif
+import modelcif.associated
+import modelcif.data
+import modelcif.dumper
+import modelcif.model
+import modelcif.protocol
+import modelcif.qa_metric
+import modelcif.reference
+
+import pandas as pd
+from ost import io, seq
+
+# global cache of parsed UniProtKB entries (keyed by accession)
+up_cache = {}
+
+def _parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        "model_dir",
+        type=str,
+        metavar="<MODEL DIR>",
+        help="Directory with models to be translated.",
+    )
+    parser.add_argument(
+        "accompanying_data_dir",
+        type=str,
+        metavar="<ACC. DIR>",
+        help="Directory with zip files containing accompanying data.",
+    )
+    parser.add_argument(
+        "metadata_file",
+        type=str,
+        metavar="<METADATA FILE>",
+        help="Path to CSV file with metadata.",
+    )
+    parser.add_argument(
+        "out_dir",
+        type=str,
+        metavar="<OUTPUT DIR>",
+        help="Path to directory to store results.",
+    )
+    parser.add_argument(
+        "--compress",
+        default=False,
+        action="store_true",
+        help="Compress ModelCIF file with gzip "
+        "(note that QA file is zipped either way).",
+    )
+    opts = parser.parse_args()
+
+    # check that model dir exists
+    if opts.model_dir.endswith("/"):
+        opts.model_dir = opts.model_dir[:-1]
+    if not os.path.exists(opts.model_dir):
+        _abort_msg(f"Model directory '{opts.model_dir}' does not exist.")
+    if not os.path.isdir(opts.model_dir):
+        _abort_msg(f"Path '{opts.model_dir}' does not point to a directory.")
+    # check that acc. dir exists
+    if opts.accompanying_data_dir.endswith("/"):
+        opts.accompanying_data_dir = opts.accompanying_data_dir[:-1]
+    if not os.path.exists(opts.accompanying_data_dir):
+        _abort_msg(f"Model directory '{opts.accompanying_data_dir}' does not exist.")
+    if not os.path.isdir(opts.accompanying_data_dir):
+        _abort_msg(f"Path '{opts.accompanying_data_dir}' does not point to a directory.")
+    # check metadata_file
+    if not os.path.exists(opts.metadata_file):
+        _abort_msg(f"Metadata file '{opts.metadata_file}' does not exist.")
+    if not os.path.isfile(opts.metadata_file):
+        _abort_msg(f"Path '{opts.metadata_file}' does not point to a file.")
+    # check out_dir
+    if opts.out_dir.endswith("/"):
+        opts.out_dir = opts.out_dir[:-1]
+    if not os.path.exists(opts.out_dir):
+        os.makedirs(opts.out_dir)
+    if not os.path.isdir(opts.out_dir):
+        _abort_msg(f"Path '{opts.out_dir}' does not point to a directory.")
+    return opts
+
+
+# pylint: disable=too-few-public-methods
+class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
+    name = "pLDDT"
+    software = None
+
+
+class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
+    name = "pLDDT"
+    software = None
+
+
+class _PAE(modelcif.qa_metric.MetricType):
+    """Predicted aligned error (in Angstroms)"""
+
+    type = "PAE"
+    other_details = None
+
+
+class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, _PAE):
+    """Predicted aligned error (in Angstroms)"""
+
+    name = "PAE"
+    software = None
+
+
+class _ContactProb(modelcif.qa_metric.MetricType):
+    """Contact probability of a pairwise interaction in [0,1]"""
+
+    type = "contact probability"
+    other_details = None
+
+
+class _LocalPairwiseCP(modelcif.qa_metric.LocalPairwise, _ContactProb):
+    """Contact probability of a pairwise interaction in [0,1]"""
+
+    name = "contact probability"
+    software = None
+
+
+# pylint: enable=too-few-public-methods
+
+
+class _OST2ModelCIF(modelcif.model.AbInitioModel):
+    """Map OST entity elements to ihm.model"""
+
+    def __init__(self, *args, **kwargs):
+        """Initialise a model"""
+        for i in ["ost_entity", "asym", "scores_json"]:
+            if i not in kwargs:
+                raise TypeError(f"Required keyword argument '{i}' not found.")
+        self.ost_entity = kwargs.pop("ost_entity")
+        self.asym = kwargs.pop("asym")
+        self.scores_json = kwargs.pop("scores_json")
+        # need reverse mapping from chain name + res num to seq. id
+        self.seq_id_map = {
+            ch.name: {
+                res.number.num: idx+1 for idx, res in enumerate(ch.residues)
+            } for ch in self.ost_entity.chains
+        }
+        # self.seq_id_map = {
+        #     ch_id: {v: k for k, v in asym_unit.auth_seq_id_map.items()} \
+        #     for ch_id, asym_unit in self.asym.items()
+        # }
+        # fetch plddts for all residues (for mapping)
+        self.res_bfactors = {
+            r.qualified_name: self.scores_json["plddt"][i] \
+            for i, r in enumerate(self.ost_entity.residues)
+        }
+
+        super().__init__(*args, **kwargs)
+
+    def get_atoms(self):
+        # note we blindly replace b-factors here!
+        for atm in self.ost_entity.atoms:
+            yield modelcif.model.Atom(
+                asym_unit=self.asym[atm.chain.name],
+                seq_id=self.seq_id_map[atm.chain.name][atm.residue.number.num],
+                atom_id=atm.name,
+                type_symbol=atm.element,
+                x=atm.pos[0],
+                y=atm.pos[1],
+                z=atm.pos[2],
+                het=atm.is_hetatom,
+                biso=self.res_bfactors[atm.residue.qualified_name],
+                occupancy=atm.occupancy,
+            )
+
+    def add_scores(self):
+        """Add QA metrics from AF2 scores."""
+        # global scores
+        self.qa_metrics.append(
+            _GlobalPLDDT(np.mean(self.scores_json["plddt"]))
+        )
+
+        # local scores
+        lpae = []
+        lcp = []
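+        # i: flat index over the residues of all chains (into plddt and the
+        #    rows of the inter-chain PAE/CP block); j: index of res_j within
+        #    the following chains (columns of that block)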
+        i = 0
+        for chn_idx, chn_i in enumerate(self.ost_entity.chains):
+            for res_i in chn_i.residues:
+                # local pLDDT
+                seq_id_i = self.seq_id_map[chn_i.name][res_i.number.num]
+                asym_i = self.asym[chn_i.name].residue(seq_id_i)
+                self.qa_metrics.append(
+                    _LocalPLDDT(asym_i, self.scores_json["plddt"][i])
+                )
+
+                # per-residue scores
+                # NOTE: only off diagonal stored (i.e. chain A with chain B)
+                j = 0
+                for chn_j in self.ost_entity.chains[chn_idx+1:]:
+                    for res_j in chn_j.residues:
+                        seq_id_j = self.seq_id_map[chn_j.name][res_j.number.num]
+                        asym_j = self.asym[chn_j.name].residue(seq_id_j)
+                        pae_ij = self.scores_json["pae"][i][j]
+                        lpae.append(
+                            _LocalPairwisePAE(asym_i, asym_j, pae_ij)
+                        )
+                        cp_ij = self.scores_json["contact_prob"][i][j]
+                        lcp.append(
+                            _LocalPairwiseCP(asym_i, asym_j, round(cp_ij, 3))
+                        )
+                        j += 1
+
+                i += 1
+
+        self.qa_metrics.extend(lpae)
+        self.qa_metrics.extend(lcp)
+
+
+def _abort_msg(msg, exit_code=1):
+    """Write error message and exit with exit_code."""
+    print(f"{msg}\nAborting.", file=sys.stderr)
+    sys.exit(exit_code)
+
+
+def _warn_msg(msg):
+    """Write a warning message to stdout."""
+    print(f"WARNING: {msg}")
+
+
+def _check_file(file_path):
+    """Make sure a file exists and is actually a file."""
+    if not os.path.exists(file_path):
+        _abort_msg(f"File not found: '{file_path}'.")
+    if not os.path.isfile(file_path):
+        _abort_msg(f"File path does not point to file: '{file_path}'.")
+
+
+def _get_audit_authors():
+    """Return the list of authors that produced this model."""
+    return (
+        "Zhang, J.",
+        "Pei, J.",
+        "Durham, J.",
+        "Bos, T.",
+        "Cong, Q.",
+    )
+
+
+def _get_metadata(metadata_file):
+    """Read csv file with metedata and prepare for next steps."""
+    # NEEDS LOTS OF CLEANUP!!
+    # -> ASSUMED "A" always chain A
+    df_in = pd.read_csv(metadata_file)
+    cols = None
+    mdl_part_txt = " This model is part of"
+    mdl_af_txt = "Model generated using AlphaFold2"
+    for idx, row in df_in.iterrows():
+        if row[0].startswith("#"):
+            continue
+        elif cols is None:
+            cols = list(row[~row.isna()])
+            df = pd.DataFrame(columns=cols)
+        else:
+            row_data = [row[0]]
+            # fix overview entry (ASSUME "A" is always chain 1)
+            # -> any ',' in overview was split into columns
+            # -> also new-lines missing between lines
+            # -> and need to remove "This model is part of ..." text
+            ch1_idx = row.to_list().index("A")
+            overview = ",".join(row[1:ch1_idx])
+            mdl_af_idx = overview.index(mdl_af_txt)
+            mdl_rel_idx = overview.index(mdl_part_txt)
+            assert mdl_af_idx < mdl_rel_idx
+            ov_fixed = overview[:mdl_af_idx] + "\n\n" \
+                     + overview[mdl_af_idx:mdl_rel_idx]
+            row_data.append(ov_fixed)
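+            # row_data already holds "model id" and the re-assembled
+            # overview, so len(cols) - 2 columns remain, starting at chain 1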
+            data_end_idx = ch1_idx + len(cols) - 2
+            assert data_end_idx <= len(row)
+            assert not any(row[ch1_idx:data_end_idx].isna())
+            row_data.extend(row[ch1_idx:data_end_idx])
+            df.loc[idx] = row_data
+    # sanity checks
+    assert len(set(df["model id"])) == len(df["model id"])
+    assert all(df.alignment_file.str.endswith(".fas"))
+    return df.set_index("model id")
+
+
+def _get_modeling_description():
+    """Fixed text for model creation."""
+    return "Model using AlphaFold ('model 3' parameters; pTM monomer version)" \
+           " with a 200 residue gap between the two chains, without templates" \
+           " and without model relaxation"
+
+def _get_protocol_steps_and_software():
+    """Create the list of protocol steps with software and parameters used."""
+    protocol = []
+    # MSA step
+    step = {
+        "method_type": "coevolution MSA",
+        "name": None,
+        "details": "Create paired MSAs for the dimers",
+    }
+    step["input"] = "target_sequences"
+    step["output"] = "MSA"
+    step["software"] = []
+    step["software_parameters"] = {}
+    protocol.append(step)
+
+    # modelling step
+    step = {
+        "method_type": "modeling",
+        "name": None,
+        "details": _get_modeling_description(),
+    }
+    # get input data
+    # Must refer to data already in the JSON, so we try keywords
+    step["input"] = "target_sequences_and_MSA"
+    # get output data
+    # Must refer to existing data, so we try keywords
+    step["output"] = "model"
+    # get software
+    step["software"] = [
+        {
+            "name": "AlphaFold",
+            "classification": "model building",
+            "description": "Structure prediction",
+            "citation": ihm.citations.alphafold2,
+            "location": "https://github.com/deepmind/alphafold",
+            "type": "package",
+            "version": "2.0.0",
+        }]
+    step["software_parameters"] = {}
+    protocol.append(step)
+
+    return protocol
+
+
+def _get_title(gene_name_infos):
+    """Get a title for this modelling experiment."""
+    ranged_gene_names = [f"{gn} ({start_pos}-{end_pos})" 
+                         for gn, start_pos, end_pos in gene_name_infos]
+    return f"Predicted interaction between {' and '.join(ranged_gene_names)}"
+
+
+def _get_model_group_name():
+    """Get a name for a model group."""
+    return None
+
+
+def _get_sequence(chn):
+    """Get the sequence out of an OST chain incl. '-' for gaps in resnums."""
+    # initialise
+    lst_rn = chn.residues[0].number.num
+    idx = 1
+    sqe = chn.residues[0].one_letter_code
+
+    for res in chn.residues[idx:]:
+        lst_rn += 1
+        while lst_rn != res.number.num:
+            sqe += "-"
+            lst_rn += 1
+        sqe += res.one_letter_code
+    return sqe
+
+
+def _check_sequence(up_ac, sequence):
+    """Verify sequence to only contain standard olc."""
+    ns_aa_pos = []  # positions of non-standard amino acids
+    for i, res in enumerate(sequence):
+        if res not in "ACDEFGHIKLMNPQRSTVWY":
+            if res == "U":
+                _warn_msg(
+                    f"Selenocysteine found at position {i+1} of entry "
+                    + f"'{up_ac}', this residue may be missing in the "
+                    + "model."
+                )
+                ns_aa_pos.append(i)
+                continue
+            raise RuntimeError(
+                "Non-standard aa found in UniProtKB sequence "
+                + f"for entry '{up_ac}': {res}, position {i+1}"
+            )
+    return ns_aa_pos
+
+
+def _get_n_parse_up_entry(up_ac, up_url):
+    """Get data for an UniProtKB entry and parse it."""
+    # This is a simple parser for UniProtKB txt format, instead of breaking it
+    # up into multiple functions, we just allow many many branches & statements,
+    # here.
+    # pylint: disable=too-many-branches,too-many-statements
+    data = {}
+    data["up_organism"] = ""
+    data["up_sequence"] = ""
+    data["up_ac"] = up_ac
+    rspns = requests.get(up_url, timeout=180)
+    for line in rspns.iter_lines(decode_unicode=True):
+        if line.startswith("ID   "):
+            sline = line.split()
+            if len(sline) != 5:
+                _abort_msg(f"Unusual UniProtKB ID line found:\n'{line}'")
+            data["up_id"] = sline[1]
+        elif line.startswith("OX   NCBI_TaxID="):
+            # Following strictly the UniProtKB format: 'OX   NCBI_TaxID=<ID>;'
+            data["up_ncbi_taxid"] = line[len("OX   NCBI_TaxID=") : -1]
+            data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip()
+        elif line.startswith("OS   "):
+            if line[-1] == ".":
+                data["up_organism"] += line[len("OS   ") : -1]
+            else:
+                data["up_organism"] += line[len("OS   ") : -1] + " "
+        elif line.startswith("SQ   "):
+            sline = line.split()
+            if len(sline) != 8:
+                _abort_msg(f"Unusual UniProtKB SQ line found:\n'{line}'")
+            data["up_seqlen"] = int(sline[2])
+            data["up_crc64"] = sline[6]
+        elif line.startswith("     "):
+            sline = line.split()
+            if len(sline) > 6:
+                _abort_msg(
+                    "Unusual UniProtKB sequence data line "
+                    + f"found:\n'{line}'"
+                )
+            data["up_sequence"] += "".join(sline)
+        elif line.startswith("RP   "):
+            if "ISOFORM" in line.upper():
+                raise RuntimeError(
+                    f"First ISOFORM found for '{up_ac}', needs handling."
+                )
+        elif line.startswith("DT   "):
+            # 2012-10-03
+            dt_flds = line[len("DT   ") :].split(", ")
+            if dt_flds[1].upper().startswith("SEQUENCE VERSION "):
+                data["up_last_mod"] = datetime.datetime.strptime(
+                    dt_flds[0], "%d-%b-%Y"
+                )
+            elif dt_flds[1].upper().startswith("ENTRY VERSION "):
+                data["up_entry_version"] = dt_flds[1][len("ENTRY VERSION ") :]
+                if data["up_entry_version"][-1] == ".":
+                    data["up_entry_version"] = data["up_entry_version"][:-1]
+                data["up_entry_version"] = int(data["up_entry_version"])
+        elif line.startswith("GN   Name="):
+            data["up_gn"] = line[len("GN   Name=") :].split(";")[0]
+            data["up_gn"] = data["up_gn"].split("{")[0].strip()
+
+    # we have not seen isoforms in the data set yet, so we just leave it unset
+    data["up_isoform"] = None
+
+    if "up_gn" not in data:
+        _warn_msg(
+            f"No gene name found for UniProtKB entry '{up_ac}', using "
+            + "UniProtKB AC instead."
+        )
+        data["up_gn"] = up_ac
+    if "up_last_mod" not in data:
+        _abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.")
+    if "up_crc64" not in data:
+        _abort_msg(f"No CRC64 value found for UniProtKB entry '{up_ac}'.")
+    if len(data["up_sequence"]) == 0:
+        _abort_msg(f"No sequence found for UniProtKB entry '{up_ac}'.")
+    # check that sequence length and CRC64 is correct
+    if data["up_seqlen"] != len(data["up_sequence"]):
+        _abort_msg(
+            "Sequence length of SQ line and sequence data differ for "
+            + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != "
+            + f"{len(data['up_sequence'])}"
+        )
+    data["up_ns_aa"] = _check_sequence(data["up_ac"], data["up_sequence"])
+
+    if "up_id" not in data:
+        _abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.")
+    if "up_ncbi_taxid" not in data:
+        _abort_msg(f"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'.")
+    if len(data["up_organism"]) == 0:
+        _abort_msg(f"No organism species found for UniProtKB entry '{up_ac}'.")
+    return data
+
+
+def _fetch_upkb_entry(up_ac):
+    """Get an UniProtKB entry."""
+    return _get_n_parse_up_entry(
+        up_ac, f"https://rest.uniprot.org/uniprotkb/{up_ac}.txt"
+    )
+
+
+def _fetch_unisave_entry(up_ac, version):
+    """Get an UniSave entry, in contrast to an UniProtKB entry, that allows us
+    to specify a version."""
+    return _get_n_parse_up_entry(
+        up_ac,
+        f"https://rest.uniprot.org/unisave/{up_ac}?format=txt&"
+        + f"versions={version}",
+    )
+
+
+def _build_and_check_up_aln(up_data, up_start, up_end, ch_seq):
+    # return: aln (UP (name: "up_ac|0-based-offset"), CH) if 100% match,
+    #         None else
+    # up_start and up_end expected to be ready for 0-indexed slicing
+    # ch_seq expected to contain gaps to cover same length as up_end - up_start
+    if up_end > len(up_data["up_sequence"]):
+        # skip UP seq. versions with meaningless lengths
+        return None
+    s_up = seq.CreateSequence(f"{up_data['up_ac']}|{up_start}",
+                              up_data["up_sequence"][up_start:up_end])
+    s_ch = seq.CreateSequence("ch_seq", ch_seq)
+    aln = seq.CreateAlignment(s_up, s_ch)
+    seq_id = seq.alg.SequenceIdentity(aln) # ignores gaps
+    if seq_id < 100:
+        return None
+    else:
+        return aln
+
+def _build_and_cut_up_aln(up_data, ch_seq):
+    # return: aln (UP (name: "up_ac|0-based-offset"), CH)
+    #         cut to remove terminal gaps in CH
+    s_ch = seq.CreateSequence("ch_seq", ch_seq)
+    s_up = seq.CreateSequence("up_seq", up_data["up_sequence"])
+    aln = seq.alg.SemiGlobalAlign(s_up, s_ch, seq.alg.BLOSUM62)[0]
+    # cut to part covered by model and rename
+    offset = aln.sequences[1].first_non_gap
+    aln.Cut(0, aln.sequences[1].first_non_gap)
+    aln.Cut(aln.sequences[1].last_non_gap + 1, len(aln))
+    aln.SetSequenceName(0, f"{up_data['up_ac']}|{offset}")
+    return aln
+
+def _get_upkb_for_sequence(label, up_ac, up_start, up_end, ch_seq, up_id,
+                           up_cache=None):
+    if up_cache is None:  # avoid a shared mutable default argument
+        up_cache = {}
+    # check length
+    if len(ch_seq) != up_end - up_start:
+        raise RuntimeError(f"BAD RES. COUNT FOR {label}")
+    # return matching up_data and aln to latest UP
+    up_data = up_cache.get(up_ac, None)
+    if up_data is None:
+        up_data = _fetch_upkb_entry(up_ac)
+    aln = _build_and_check_up_aln(up_data, up_start, up_end, ch_seq)
+    if aln is None:
+        # keep original version in cache and check other versions until hit
+        up_cache[up_ac + "-latest"] = up_data
+        while aln is None:
+            if up_data["up_entry_version"] > 1:
+                up_data = _fetch_unisave_entry(
+                    up_ac, up_data["up_entry_version"] - 1
+                )
+                aln = _build_and_check_up_aln(up_data, up_start, up_end, ch_seq)
+            else:
+                raise RuntimeError(
+                    f"Sequences not equal from file: {ch_seq}, from UniProtKB: "
+                    f"{up_data['up_sequence']} ({label}), checked entire entry "
+                    "history."
+                )
+    # check UP ID
+    if up_id != up_data["up_id"]:
+        raise RuntimeError(f"UP ID MISMATCH for {label}; {up_id} vs {up_data['up_id']}")
+    # keep cached for matched data
+    up_cache[up_ac] = up_data
+    # align to latest sequence if needed
+    if up_ac + "-latest" in up_cache:
+        aln = _build_and_cut_up_aln(up_cache[up_ac + "-latest"], ch_seq)
+        seq_id = seq.alg.SequenceIdentity(aln) # ignores gaps
+        if len(aln) != up_end - up_start or seq_id < 95:
+            raise RuntimeError(
+                f"Failed to align {label} to latest {up_ac}!"
+                f" Lengths {len(aln)} vs {up_end - up_start};"
+                f" seq. id {seq_id}."
+            )
+    return up_data, aln
+
+
+def _get_entities(pdb_file, up_infos):
+    """Gather data for the mmCIF (target) entities."""
+    # up_infos as dict {ch_name: (up_ac, up_start_idx, up_end_idx, up_id)}
+    # -> [up_start_idx:up_end_idx] meant to be for slicing up_sequence
+    entities = []
+    _check_file(pdb_file)
+    ost_ent = io.LoadPDB(pdb_file)
+    # check assumption of dimer
+    if ost_ent.chain_count != len(up_infos):
+        raise RuntimeError(f"UNEXPECTED CHAIN COUNT IN {pdb_file}")
+    # check assumption that none of the PDB files have bfactors
+    unique_b_factors = sorted(set(a.b_factor for a in ost_ent.atoms))
+    if unique_b_factors != [0.0]:
+        raise RuntimeError(f"B-FACTORS SPOTTED IN {pdb_file}: "
+                           f"{unique_b_factors}")
+    already_seen = []
+    for chn in ost_ent.chains:
+        cif_ent = {}
+        ch_label = f"{pdb_file}|CH-{chn.name}"
+        up_ac, up_start_idx, up_end_idx, up_id = up_infos[chn.name]
+        up_key = (up_ac, up_start_idx, up_end_idx)
+        sqe_no_gaps = "".join([res.one_letter_code for res in chn.residues])
+        sqe_gaps = _get_sequence(chn)
+        try:
+            e_idx = already_seen.index(up_key)
+        except ValueError:
+            pass
+        else:
+            if sqe_no_gaps != entities[e_idx]["pdb_sequence"]:
+                _abort_msg(
+                    "Sequences are different for two chains of the same "
+                    "UniProtKB AC. This case is not implemented, yet."
+                )
+            entities[e_idx]["pdb_chain_id"].append(chn.name)
+            continue
+        already_seen.append(up_key)
+        # note: upkb is for matching seq. version, aln is for latest seq.
+        upkb, up_aln = _get_upkb_for_sequence(
+            ch_label, up_ac, up_start_idx, up_end_idx, sqe_gaps,
+            up_id, up_cache
+        )
+        cif_ent["pdb_sequence"] = sqe_no_gaps
+        cif_ent["pdb_seqlen"] = len(sqe_no_gaps)
+        cif_ent["pdb_chain_id"] = [chn.name]
+        cif_ent["description"] = (
+            f"{upkb['up_organism']} {upkb['up_gn']} ({upkb['up_ac']}; "
+            f"{up_start_idx + 1} - {up_end_idx})"
+        )
+        cif_ent["up_aln"] = up_aln
+        # 1-indexed positions for closed interval
+        cif_ent["up_start_pos"] = up_start_idx + 1
+        cif_ent["up_end_pos"] = up_end_idx
+        # alignment didn't check starting position in UP
+        if chn.residues[0].number.num != cif_ent["up_start_pos"]:
+            print(f"UNEXPECTED STARTING RESNUM IN {ch_label}")
+        # mapping between seq_id and auth_seq_id
+        # -> assume same for all chains of this entity
+        #    (trivial here as hetero-dimer)
+        cif_ent["auth_seq_id_map"] = {
+            idx+1: res.number.num for idx, res in enumerate(chn.residues)
+        }
+        cif_ent.update(upkb)
+        entities.append(cif_ent)
+    return entities, ost_ent
+
+
+def _get_scores(data, acc_path, cp_name, aln_name):
+    """Check that all files needed to process this model are present."""
+    _check_file(acc_path)
+    with zipfile.ZipFile(acc_path) as zf:
+        if len(zf.namelist()) > 2:
+            print(f"TOO MANY FILES IN ACC {acc_path}")
+        if cp_name not in zf.namelist():
+            raise RuntimeError(f"MISSING NPZ FILE {acc_path}")
+        if aln_name not in zf.namelist():
+            raise RuntimeError(f"MISSING ALN FILE {acc_path}")
+        aln_data = zf.open(aln_name).read()
+        # NOTE: this also checks that all sequences in FASTA have same length
+        aln = io.AlignmentFromString(aln_data, "fasta")
+        cp_data = np.load(zf.open(cp_name))
+    # check aln data
+    # -> should match combined length of gapless chain sequences
+    combined_seq = "".join(trg_ent["pdb_sequence"] \
+                           for trg_ent in data["target_entities"])
+    if str(aln.sequences[0]) != combined_seq:
+        raise RuntimeError(f"BAD ALN SEQ {acc_path}")
+    # if aln.sequence_count < 10:
+    #     print("SHALLOW ALN", acc_path, aln.sequence_count)
+    #     # not an issue really, just curious
+    # check cp data
+    # -> expected pLDDT vector and PAE/CP matrices
+    # -> expected to either have one off diagonal or full matrix
+    seq_lens = [trg_ent["pdb_seqlen"] for trg_ent in data["target_entities"]]
+    assert len(seq_lens) == 2 # only dimers supported here
+    sum_lens = sum(seq_lens)
+    exp_shape_offdiag = (seq_lens[0], seq_lens[1])
+    exp_shape_full = (sum_lens, sum_lens)
+    if len(cp_data) != 3:
+        print(f"UNEXPECTED NUMBER OF ITEMS IN NPZ FILE {acc_path}")
+    if "contact_prob" not in cp_data:
+        raise RuntimeError(f"MISSING CP IN NPZ FILE {acc_path}")
+    cp_matrix = cp_data["contact_prob"]
+    if "predicted_aligned_error" not in cp_data:
+        raise RuntimeError(f"MISSING PAE IN NPZ FILE {acc_path}")
+    pae_matrix = cp_data["predicted_aligned_error"]
+    if cp_matrix.shape == exp_shape_full:
+        # keep only the inter-chain block (chain 1 rows vs. chain 2 columns)
+        cp_matrix = cp_matrix[:seq_lens[0], seq_lens[0]:]
+        pae_matrix = pae_matrix[:seq_lens[0], seq_lens[0]:]
+    elif cp_matrix.shape != exp_shape_offdiag:
+        raise RuntimeError(f"BAD SHAPE OF CP DATA {acc_path}")
+    if pae_matrix.shape != cp_matrix.shape:
+        raise RuntimeError(f"BAD SHAPE OF PAE DATA {acc_path}")
+    if "plddt" not in cp_data:
+        raise RuntimeError(f"MISSING PLDDT IN NPZ FILE {acc_path}")
+    elif cp_data["plddt"].shape != (sum_lens, ):
+        raise RuntimeError(f"BAD SHAPE OF PLDDT DATA {acc_path}")
+    # update data
+    data["plddt"] = cp_data["plddt"]
+    data["pae"] = pae_matrix
+    data["contact_prob"] = cp_matrix
+    data["aln_data"] = aln_data
+
+
+def _get_modelcif_entities(target_ents, asym_units, system):
+    """Create ModelCIF entities and asymmetric units."""
+    for cif_ent in target_ents:
+        mdlcif_ent = modelcif.Entity(
+            # NOTE: here pdb_sequence is what has actually been modelled
+            # -> compared to UP this contains gaps!
+            cif_ent["pdb_sequence"],
+            description=cif_ent["description"],
+            source=ihm.source.Natural(
+                ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"],
+                scientific_name=cif_ent["up_organism"],
+            ),
+            references=[
+                modelcif.reference.UniProt(
+                    cif_ent["up_id"],
+                    cif_ent["up_ac"],
+                    align_begin=cif_ent["up_start_pos"],
+                    align_end=cif_ent["up_end_pos"],
+                    isoform=cif_ent["up_isoform"],
+                    ncbi_taxonomy_id=cif_ent["up_ncbi_taxid"],
+                    organism_scientific=cif_ent["up_organism"],
+                    sequence_version_date=cif_ent["up_last_mod"],
+                    sequence_crc64=cif_ent["up_crc64"],
+                )
+            ],
+        )
+        for pdb_chain_id in cif_ent["pdb_chain_id"]:
+            asym_units[pdb_chain_id] = modelcif.AsymUnit(
+                mdlcif_ent, auth_seq_id_map=cif_ent["auth_seq_id_map"]
+            )
+        system.target_entities.append(mdlcif_ent)
+
+
+def _get_assoc_aln_file(fle_path):
+    """Generate a modelcif.associated.File object pointing to FASTA formatted
+    file containing MSA.
+    """
+    cfile = modelcif.associated.File(
+        fle_path,
+        details="Paired MSAs"
+    )
+    cfile.file_format = "fasta"
+    cfile.file_content = "multiple sequence alignments"
+    return cfile
+
+
+def _get_associated_files(entry_id, mdl_name, add_files):
+    """Create entry for associated files."""
+    # extract local pairwise into extra file
+    ac_file = f"{mdl_name}_local_pairwise_qa.cif"
+    arc_files = [
+        modelcif.associated.LocalPairwiseQAScoresFile(
+            ac_file,
+            categories=["_ma_qa_metric_local_pairwise"],
+            copy_categories=["_ma_qa_metric"],
+            entry_id=entry_id,
+            entry_details="This file is an associated file consisting "
+            + "of local pairwise QA metrics. This is a partial mmCIF "
+            + "file and can be validated by merging with the main "
+            + "mmCIF file containing the model coordinates and other "
+            + "associated data.",
+            details="Predicted aligned error and contact probability",
+        )
+    ]
+    # add any extra files if needed
+    if add_files:
+        arc_files.extend(add_files)
+    # package all into zip file
+    return modelcif.associated.Repository(
+        "",
+        [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)],
+    )
+    # NOTE: by convention MA expects zip file with same name as model-cif
+
+
+def _assemble_modelcif_software(soft_dict):
+    """Create a modelcif.Software instance from dictionary."""
+    return modelcif.Software(
+        soft_dict["name"],
+        soft_dict["classification"],
+        soft_dict["description"],
+        soft_dict["location"],
+        soft_dict["type"],
+        soft_dict["version"],
+        citation=soft_dict["citation"],
+    )
+
+
+def _get_modelcif_protocol_software(js_step):
+    """Assemble software entries for a ModelCIF protocol step."""
+    if js_step["software"]:
+        if len(js_step["software"]) == 1:
+            sftwre = _assemble_modelcif_software(js_step["software"][0])
+        else:
+            sftwre = []
+            for sft in js_step["software"]:
+                sftwre.append(_assemble_modelcif_software(sft))
+            sftwre = modelcif.SoftwareGroup(elements=sftwre)
+        if js_step["software_parameters"]:
+            params = []
+            for key, val in js_step["software_parameters"].items():
+                params.append(modelcif.SoftwareParameter(key, val))
+            if isinstance(sftwre, modelcif.SoftwareGroup):
+                sftwre.parameters = params
+            else:
+                sftwre = modelcif.SoftwareGroup(
+                    elements=(sftwre,), parameters=params
+                )
+        return sftwre
+    return None
+
+
+def _get_modelcif_protocol_data(data_label, target_entities, aln_data, model):
+    """Assemble data for a ModelCIF protocol step."""
+    if data_label == "target_sequences":
+        data = modelcif.data.DataGroup(target_entities)
+    elif data_label == "MSA":
+        data = aln_data
+    elif data_label == "target_sequences_and_MSA":
+        data = modelcif.data.DataGroup(target_entities)
+        data.append(aln_data)
+    elif data_label == "model":
+        data = model
+    else:
+        raise RuntimeError(f"Unknown protocol data: '{data_label}'")
+    return data
+
+
+def _get_modelcif_protocol(protocol_steps, target_entities, aln_data, model):
+    """Create the protocol for the ModelCIF file."""
+    protocol = modelcif.protocol.Protocol()
+    for js_step in protocol_steps:
+        sftwre = _get_modelcif_protocol_software(js_step)
+        input_data = _get_modelcif_protocol_data(
+            js_step["input"], target_entities, aln_data, model
+        )
+        output_data = _get_modelcif_protocol_data(
+            js_step["output"], target_entities, aln_data, model
+        )
+
+        protocol.steps.append(
+            modelcif.protocol.Step(
+                input_data=input_data,
+                output_data=output_data,
+                name=js_step["name"],
+                details=js_step["details"],
+                software=sftwre,
+            )
+        )
+        protocol.steps[-1].method_type = js_step["method_type"]
+    return protocol
+
+
+def _compress_cif_file(cif_file):
+    """Compress cif file and delete original."""
+    with open(cif_file, "rb") as f_in:
+        with gzip.open(cif_file + ".gz", "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    os.remove(cif_file)
+
+
+def _package_associated_files(repo):
+    """Compress associated files into single zip file and delete original."""
+    # zip settings tested for good speed vs compression
+    for archive in repo.files:
+        with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
+            for zfile in archive.files:
+                cif_zip.write(zfile.path, arcname=zfile.path)
+                os.remove(zfile.path)
+
+
+def _store_as_modelcif(
+    data_json, ost_ent, out_dir, mdl_name, compress, add_files
+):
+    """Mix all the data into a ModelCIF file."""
+    print("    generating ModelCIF objects...", end="")
+    pstart = timer()
+    # create system to gather all the data
+    system = modelcif.System(
+        title=data_json["title"],
+        id=data_json["mdl_id"].upper(),
+        model_details=data_json["model_details"],
+    )
+
+    # add primary citation (hard-coded here, but could also be fetched via
+    # from_pubmed_id)
+    system.citations.append(ihm.Citation(
+        pmid="36261849",
+        title="Computed cancer interactome explains the effects of somatic "
+              "mutations in cancers.",
+        journal="Protein Sci",
+        volume=31,
+        page_range="e4479",
+        year=2022,
+        authors=['Zhang, J.', 'Pei, J.', 'Durham, J.', 'Bos, T.', 'Cong, Q.'],
+        doi="10.1002/pro.4479",
+        is_primary=True,
+    ))
+
+    # create target entities, references, source, asymmetric units & assembly
+    # create an asymmetric unit and an entity per target sequence
+    asym_units = {}
+    _get_modelcif_entities(
+        data_json["target_entities"],
+        asym_units,
+        system,
+    )
+
+    # audit_authors
+    system.authors.extend(data_json["audit_authors"])
+
+    # set up the model to produce coordinates
+    mdl_list_name = "Model 3" # fixed set of params used here
+    model = _OST2ModelCIF(
+        assembly=modelcif.Assembly(asym_units.values()),
+        asym=asym_units,
+        ost_entity=ost_ent,
+        scores_json=data_json,
+        name=mdl_list_name,
+    )
+    print(f" ({timer()-pstart:.2f}s)")
+    print("    processing QA scores...", end="", flush=True)
+    pstart = timer()
+    model.add_scores()
+    print(f" ({timer()-pstart:.2f}s)")
+
+    system.model_groups.append(
+        modelcif.model.ModelGroup([model], name=data_json["model_group_name"])
+    )
+
+    # handle additional files
+    system.repositories.append(
+        _get_associated_files(system.id, mdl_name, add_files)
+    )
+
+    # prepare aln_data info
+    aln_files = [f for f in add_files \
+                 if f.file_content == "multiple sequence alignments"]
+    assert len(aln_files) == 1  # no support for multiple alignment files yet
+    aln_file_name = aln_files[0].path
+    aln_data = modelcif.data.Data(
+        "Paired MSA for the dimer",
+        f"MSA stored as {aln_file_name} in accompanying data"
+    )
+
+    system.protocols.append(
+        _get_modelcif_protocol(
+            data_json["protocol"],
+            system.target_entities,
+            aln_data,
+            model,
+        )
+    )
+
+    # write modelcif System to file
+    print("    write to disk...", end="", flush=True)
+    pstart = timer()
+    # NOTE: the dumper writes the associated QA file relative to the current
+    # working directory
+    # -> hence we cheat by changing the path and back while being exception-safe
+    # -> also the aln file needs to be written here to end up in the zip file
+    #    with the rest
+    oldpwd = os.getcwd()
+    os.chdir(out_dir)
+    mdl_fle = f"{mdl_name}.cif"
+    try:
+        with open(mdl_fle, "w", encoding="ascii") as mmcif_fh:
+            modelcif.dumper.write(mmcif_fh, [system])
+        with open(aln_file_name, "wb") as fh:
+            fh.write(data_json["aln_data"])
+        _package_associated_files(system.repositories[0])
+        if compress:
+            _compress_cif_file(mdl_fle)
+            mdl_fle += ".gz"
+        # also output alignments to latest UP
+        for cif_ent in data_json["target_entities"]:
+            ch_names = "-".join(chn for chn in cif_ent["pdb_chain_id"])
+            up_aln_file_name = f"{mdl_name}-UP-ALN-{ch_names}.fasta"
+            io.SaveAlignment(cif_ent["up_aln"], up_aln_file_name)
+    finally:
+        os.chdir(oldpwd)
+    print(f" ({timer()-pstart:.2f}s)")
+    return os.path.join(out_dir, mdl_fle)
+
+
+def _create_interaction_json():
+    """Create a dictionary (mimicking JSON) that contains data which is the same
+    for all models."""
+    data = {}
+    data["audit_authors"] = _get_audit_authors()
+    data["protocol"] = _get_protocol_steps_and_software()
+    return data
+
+
+def _create_model_json(data, pdb_file, metadata):
+    """Create a dictionary (mimicking JSON) that contains all the data."""
+    up_infos = {
+        metadata[f"chain{ch_idx}"]: (
+            metadata[f"chain{ch_idx}_UniProtAC"],
+            int(metadata[f"chain{ch_idx}_startPosi"]) - 1,
+            int(metadata[f"chain{ch_idx}_endPosi"]),
+            metadata[f"chain{ch_idx}_UniProtKB"],
+        ) for ch_idx in [1, 2]
+    }
+    data["target_entities"], ost_ent = _get_entities(pdb_file, up_infos)
+    gns = [(i["up_gn"], i["up_start_pos"], i["up_end_pos"]) \
+           for i in data["target_entities"]]
+    data["title"] = _get_title(gns)
+    data["model_details"] = metadata.overview
+    data["model_group_name"] = _get_model_group_name()
+    return ost_ent
+
+
+def _translate2modelcif(mdl_id, metadata, opts):
+    """Convert a PDB file with its accompanying data to ModelCIF."""
+    pdb_fle = os.path.join(opts.model_dir, mdl_id + ".pdb")
+    acc_path = os.path.join(opts.accompanying_data_dir,
+                            mdl_id + "_accompanying_data.zip")
+
+    # gather data into JSON-like structure
+    print("    preparing data...", end="")
+    pstart = timer()
+
+    mdlcf_json = _create_interaction_json()
+    mdlcf_json["mdl_id"] = mdl_id
+
+    ost_ent = _create_model_json(mdlcf_json, pdb_fle, metadata)
+
+    # read quality scores from JSON file
+    _get_scores(
+        mdlcf_json, acc_path,
+        metadata.contact_probility_file,
+        metadata.alignment_file
+    )
+    print(f" ({timer()-pstart:.2f}s)")
+    add_files = [_get_assoc_aln_file(metadata.alignment_file)]
+    mdl_path = _store_as_modelcif(
+        mdlcf_json,
+        ost_ent,
+        opts.out_dir,
+        mdl_id,
+        opts.compress,
+        add_files,
+    )
+    # check if result can be read and has expected seq.
+    ent = io.LoadMMCIF(mdl_path)
+    exp_seqs = [trg_ent["pdb_sequence"] \
+                for trg_ent in mdlcf_json["target_entities"]]
+    assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}"
+    ent_seq = "".join(res.one_letter_code for res in ent.residues)
+    assert ent_seq == "".join(exp_seqs), f"Bad seq. {mdl_id}"
+
+
+def _main():
+    """Run as script."""
+    opts = _parse_args()
+
+    # parse/fetch global data
+    metadata_full = _get_metadata(opts.metadata_file)
+    if opts.compress:
+        cifext = "cif.gz"
+    else:
+        cifext = "cif"
+    print(f"Working on {opts.metadata_file}...")
+
+    # iterate models
+    # TEST: selected models
+    # for mdl_id in ["O00329-D1_P27986-D2", "O14602_P47813"]:
+    for mdl_id in ["O00329-D1_P27986-D2", "O14602_P47813",
+                   "O75376-D12_Q14995-D5", "Q8NFD5-D13_Q92925-D2",
+                   "P20248-D2_Q09472-D14", "O14744-D1_Q9Y618-D10",
+                   "O00257-D2_Q6W2J9-D8"]:
+        metadata = metadata_full.loc[mdl_id]
+    # for mdl_id, metadata in metadata_full.iterrows():
+        if os.path.exists(os.path.join(opts.out_dir, f"{mdl_id}.{cifext}")):
+            print(f"  {mdl_id} already done...")
+            continue
+        print(f"  translating {mdl_id}...")
+        pdb_start = timer()
+        _translate2modelcif(mdl_id, metadata, opts)
+        print(f"  ... done with {mdl_id} ({timer()-pdb_start:.2f}s).")
+
+    print(f"... done with {opts.metadata_file}.")
+
+
+if __name__ == "__main__":
+    _main()
+
+#  LocalWords:  aa pdb