Updated translate2modelcif.py based on Niko-model-set

14a60b46 · Gerardo Tauriello · 6bbd6fa7 · 14a60b46
Commit 14a60b46 authored 2 years ago by Gerardo Tauriello
--- a/translate2modelcif.py
+++ b/translate2modelcif.py
 #! /usr/local/bin/ost
 """Translate models from Tara/ Xabi from PDB + extra data into ModelCIF."""
-# ToDo [internal]: get DB versions in - https://colabfold.mmseqs.com, scroll
-# down to "Database Information"
+
+# EXAMPLES for running:
+"""
+ost scripts/translate2modelcif.py "A0A1B0GTU1-O75152" \
+    --top_ranked_only --out_dir="./modelcif"
+"""

 import argparse
 import datetime
 import os
 import sys
+import gzip, shutil, zipfile

 from timeit import default_timer as timer
 import numpy as np
@@ -14,6 +19,7 @@ import requests
 import ujson as json

 import ihm
+import ihm.citations
 import modelcif
 import modelcif.associated
 import modelcif.dumper
@@ -38,6 +44,27 @@ def _parse_args():
        help="Directory with model(s) to be translated. Must be of form "
        + "'<UniProtKB AC>-<UniProtKB AC>'",
    )
+    parser.add_argument(
+        "--top_ranked_only",
+        default=False,
+        action="store_true",
+        help="Only process top ranked model."
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        metavar="<OUTPUT DIR>",
+        default="",
+        help="Path to separate path to store results " \
+             "(model_dir used, if none given).",
+    )
+    parser.add_argument(
+        "--compress",
+        default=False,
+        action="store_true",
+        help="Compress ModelCIF file with gzip " \
+             "(note that QA file is zipped either way).",
+    )

    opts = parser.parse_args()

@@ -48,42 +75,49 @@ def _parse_args():
        _abort_msg(f"Model directory '{opts.model_dir}' does not exist.")
    if not os.path.isdir(opts.model_dir):
        _abort_msg(f"Path '{opts.model_dir}' does not point to a directory.")
+    # check out_dir
+    if not opts.out_dir:
+        opts.out_dir = opts.model_dir
+    else:
+        if not os.path.exists(opts.out_dir):
+            _abort_msg(f"Output directory '{opts.out_dir}' does not exist.")
+        if not os.path.isdir(opts.out_dir):
+            _abort_msg(f"Path '{opts.out_dir}' does not point to a directory.")

    return opts


 # pylint: disable=too-few-public-methods
 class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
-    """Predicted accuracy according to the TM-score score in [0,1]."""
+    """Predicted accuracy according to the TM-score score in [0,1]"""

    name = "pTM"
    software = None


 class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
-    """Predicted accuracy according to the CA-only lDDT in [0,100]."""
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    name = "pLDDT"
    software = None


 class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
-    """Predicted accuracy according to the CA-only lDDT in [0,100]."""
+    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    name = "pLDDT"
    software = None


 class _PAE(modelcif.qa_metric.MetricType):
-    """Predicted aligned error (in Angstroms).
-    See :class:`MetricType` for more information."""
+    """Predicted aligned error (in Angstroms)"""

    type = "PAE"
    other_details = None


 class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, _PAE):
-    """predicted aligned error (in Angstroms)."""
+    """Predicted aligned error (in Angstroms)"""

    name = "PAE"
    software = None
@@ -118,7 +152,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
                occupancy=atm.occupancy,
            )

-    def add_scores(self, scores_json, entry_id, ac_file_prfx):
+    def add_scores(self, scores_json, entry_id, mdl_name):
        """Add QA metrics from AF2 scores."""
        # global scores
        self.qa_metrics.extend(
@@ -162,25 +196,28 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):

        self.qa_metrics.extend(lpae)

-        ac_file = f"{os.path.basename(ac_file_prfx)}_local_pairwise_qa.cif"
+        ac_file = f"{mdl_name}_local_pairwise_qa.cif"
+        qa_file = modelcif.associated.LocalPairwiseQAScoresFile(
+            ac_file,
+            categories=["_ma_qa_metric_local_pairwise"],
+            copy_categories=["_ma_qa_metric"],
+            entry_id=entry_id,
+            entry_details="This file is an associated file consisting "
+            + "of local pairwise QA metrics. This is a partial mmCIF "
+            + "file and can be validated by merging with the main "
+            + "mmCIF file containing the model coordinates and other "
+            + "associated data.",
+            details="Predicted aligned error",
+        )

        return modelcif.associated.Repository(
            "",
            [
-                modelcif.associated.LocalPairwiseQAScoresFile(
-                    ac_file,
-                    categories=["_ma_qa_metric_local_pairwise"],
-                    copy_categories=["_ma_qa_metric"],
-                    entry_id=entry_id,
-                    entry_details="This file is an associated file consisting "
-                    + "of local pairwise QA metrics. This is a partial mmCIF "
-                    + "file and can be validated by merging with the main "
-                    + "mmCIF file containing the model coordinates and other "
-                    + "associated data.",
-                    details="Predicted aligned error.",
-                )
+                modelcif.associated.ZipFile(f"{mdl_name}.zip",
+                                            files=[qa_file])
            ],
        )
+        # NOTE: by convention MA expects zip file with same name as model-cif


 def _abort_msg(msg, exit_code=1):
@@ -220,29 +257,131 @@ def _get_audit_authors():
    """Return the list of authors that produced this model."""
    # ToDo: tell Xabi that his name can't have a á in mmCIF
    return (
-        "Bartolec T",
-        "Vazquez-Campos X",
-        "Johnson M",
-        "Norman A",
-        "Payne R",
-        "Wilkins M",
-        "Mackay J",
-        "Low J",
+        "Bartolec, T.",
+        "Vazquez-Campos, X.",
+        "Johnson, M.",
+        "Norman, A.",
+        "Payne, R.",
+        "Wilkins, M.",
+        "Mackay, J.",
+        "Low, J.",
    )


-def _get_protocol_steps_and_software(cnfg_file):
+def _parse_colabfold_config(cnfg_file):
+    """Read ColabFold's config.json and derive the modelling setup from it.
+
+    Returns a dict with the config (minus "num_queries"), the MSA/model flags
+    and a protocol description. Raises ValueError on unknown config values."""
+    # NOTE: logic follows ColabFold's batch.py to interpret the config, see
+    # https://github.com/sokrypton/ColabFold/blob/main/colabfold/batch.py
+
+    # fetch and drop fields which are not relevant for model building
+    with open(cnfg_file, encoding="utf8") as jfh:
+        cf_config = json.load(jfh)
+    cf_config.pop("num_queries", None)
+    # -> MSA mode
+    if cf_config["msa_mode"] == "MMseqs2 (UniRef+Environmental)":
+        seq_dbs = ["UniRef", "Environmental"]
+        use_mmseqs = True
+        use_msa = True
+    elif cf_config["msa_mode"] == "MMseqs2 (UniRef only)":
+        seq_dbs = ["UniRef"]
+        use_mmseqs = True
+        use_msa = True
+    elif cf_config["msa_mode"] == "single_sequence":
+        seq_dbs = []
+        use_mmseqs = False
+        use_msa = False
+    elif cf_config["msa_mode"] == "custom":
+        print("WARNING: Custom MSA mode used. Not clear from config what to do here!")
+        seq_dbs = []
+        use_mmseqs = False
+        use_msa = True
+    else:
+        raise ValueError(f"Unknown msa_mode {cf_config['msa_mode']}")
+    # -> model type
+    if cf_config["model_type"] == "AlphaFold2-multimer-v1":
+        # AF-Multimer as introduced in AlphaFold v2.1.0
+        use_multimer = True
+        multimer_version = 1
+    elif cf_config["model_type"] == "AlphaFold2-multimer-v2":
+        # AF-Multimer as introduced in AlphaFold v2.2.0
+        use_multimer = True
+        multimer_version = 2
+    elif cf_config["model_type"] == "AlphaFold2-ptm":
+        use_multimer = False
+        multimer_version = None
+    else:
+        raise ValueError(f"Unknown model_type {cf_config['model_type']}")
+
+    # write description
+    description = f"Model generated using ColabFold v{cf_config['version']}"
+    if use_multimer:
+        description += f" with AlphaFold-Multimer (v{multimer_version})"
+    else:
+        description += " with AlphaFold"
+    description += f" producing {cf_config['num_models']} models" \
+                   f" with {cf_config['num_recycles']} recycles each"
+    if cf_config["use_amber"]:
+        description += ", with AMBER relaxation"
+    else:
+        description += ", without model relaxation"
+    if cf_config["use_templates"]:
+        print("WARNING: ColabFold may use PDB70 or custom templates. " \
+              "Not clear from config!")
+        description += ", using templates"
+    else:
+        description += ", without templates"
+    if cf_config["rank_by"] == "plddt":
+        description += ", ranked by pLDDT"
+    elif cf_config["rank_by"] == "ptmscore":
+        description += ", ranked by pTM"
+    elif cf_config["rank_by"] == "multimer":
+        description += ", ranked by ipTM*0.8+pTM*0.2"
+    else:
+        raise ValueError(f"Unknown rank_by {cf_config['rank_by']}")
+    if use_msa:
+        description += ", starting from"
+        # the article has to match the MSA type: "an MSA" vs. "a custom MSA"
+        msa_type, article = ("MSA", "an") if use_mmseqs else ("custom MSA", "a")
+        if use_multimer:
+            if cf_config["pair_mode"] == "unpaired+paired":
+                description += f" paired and unpaired {msa_type}s"
+            elif cf_config["pair_mode"] == "paired":
+                description += f" paired {msa_type}s"
+            elif cf_config["pair_mode"] == "unpaired":
+                description += f" unpaired {msa_type}s"
+            else:
+                raise ValueError(f"Unknown pair_mode {cf_config['pair_mode']}")
+        else:
+            description += f" {article} {msa_type}"
+        if use_mmseqs:
+            description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
+    else:
+        description += " without an MSA"
+    description += "."
+
+    return {
+        "config": cf_config,
+        "seq_dbs": seq_dbs,
+        "use_mmseqs": use_mmseqs,
+        "use_msa": use_msa,
+        "use_multimer": use_multimer,
+        "multimer_version": multimer_version,
+        "description": description
+    }
+
+
+def _get_protocol_steps_and_software(config_data):
    """Create the list of protocol steps with software and parameters used."""
    protocol = []

    # modelling step
    step = {
        "method_type": "modeling",
-        "name": "ma_protocol_step.step_name",
-        "details": "Model using AlphaFold-Multimer (AlphaFold v2.2.0), "
-        + "without amber relaxation and producing 5 models with up to 3 "
-        + "recycles each, starting from paired and unparied MSAs for the "
-        + "dimers using MMseqs2.",
+        "name": None,
+        "details": config_data["description"],
    }
    # get input data
    # Must refer to data already in the JSON, so we try keywords
@@ -255,116 +394,116 @@ def _get_protocol_steps_and_software(cnfg_file):
        {
            "name": "ColabFold",
            "classification": "model building",
-            # ToDo: Get description for ColabFold
-            "description": "software.description",
-            "citation": {
-                "pmid": None,
-                "title": "ColabFold - Making protein folding accessible to all",
-                "journal": "bioRxiv",
-                "volume": None,
-                "page_range": None,
-                "year": 2022,
-                "authors": [
-                    "Mirdita M",
-                    "Schuetze K",
-                    "Moriwaki Y",
-                    "Heo L",
-                    "Ovchinnikov S",
-                    "Steinegger M",
-                ],
-                "doi": "10.1101/2021.08.15.456425",
-            },
+            "description": "Structure prediction",
+            "citation": ihm.citations.colabfold,
            "location": "https://github.com/sokrypton/ColabFold",
            "type": "package",
            "version": "1.2.0",
-        },
-        {
+        }]
+    if config_data["use_mmseqs"]:
+        step["software"].append({
            "name": "MMseqs2",
            "classification": "data collection",
            "description": "Many-against-Many sequence searching",
-            "citation": {
-                "pmid": "30615063",
-                "title": "MMseqs2 desktop and local web server app for fast, "
-                + "interactive sequence searches",
-                "journal": "Bioinformatics",
-                "volume": 35,
-                "page_range": (2856, 2858),
-                "year": 2019,
-                "authors": [
-                    "Mirdita M",
-                    "Steinegger M",
-                    "Soeding J",
+            "citation": ihm.Citation(
+                pmid="30615063",
+                title="MMseqs2 desktop and local web server app for fast, "
+                + "interactive sequence searches.",
+                journal="Bioinformatics",
+                volume=35,
+                page_range=(2856, 2858),
+                year=2019,
+                authors=[
+                    "Mirdita, M.",
+                    "Steinegger, M.",
+                    "Soeding, J.",
                ],
-                "doi": "10.1093/bioinformatics/bty1057",
-            },
+                doi="10.1093/bioinformatics/bty1057",
+            ),
            "location": "https://github.com/soedinglab/mmseqs2",
            "type": "package",
            "version": None,
-        },
-        {
+        })
+    if config_data["use_multimer"]:
+        step["software"].append({
            "name": "AlphaFold-Multimer",
            "classification": "model building",
            "description": "Structure prediction",
-            "citation": {
-                "pmid": None,
-                "title": "Protein complex prediction with "
+            "citation": ihm.Citation(
+                pmid=None,
+                title="Protein complex prediction with "
                + "AlphaFold-Multimer.",
-                "journal": "bioRxiv",
-                "volume": None,
-                "page_range": None,
-                "year": 2021,
-                "authors": [
-                    "Evans R",
-                    "O'Neill M",
-                    "Pritzel A",
-                    "Antropova N",
-                    "Senior A",
-                    "Green T",
-                    "Zidek A",
-                    "Bates R",
-                    "Blackwell S",
-                    "Yim J",
-                    "Ronneberger O",
-                    "Bodenstein S",
-                    "Zielinski M",
-                    "Bridgland A",
-                    "Potapenko A",
-                    "Cowie A",
-                    "Tunyasuvunakool K",
-                    "Jain R",
-                    "Clancy E",
-                    "Kohli P",
-                    "Jumper J",
-                    "Hassabis D",
+                journal="bioRxiv",
+                volume=None,
+                page_range=None,
+                year=2021,
+                authors=[
+                    "Evans, R.",
+                    "O'Neill, M.",
+                    "Pritzel, A.",
+                    "Antropova, N.",
+                    "Senior, A.",
+                    "Green, T.",
+                    "Zidek, A.",
+                    "Bates, R.",
+                    "Blackwell, S.",
+                    "Yim, J.",
+                    "Ronneberger, O.",
+                    "Bodenstein, S.",
+                    "Zielinski, M.",
+                    "Bridgland, A.",
+                    "Potapenko, A.",
+                    "Cowie, A.",
+                    "Tunyasuvunakool, K.",
+                    "Jain, R.",
+                    "Clancy, E.",
+                    "Kohli, P.",
+                    "Jumper, J.",
+                    "Hassabis, D.",
                ],
-                "doi": "10.1101/2021.10.04.463034",
-            },
+                doi="10.1101/2021.10.04.463034",
+            ),
            "location": "https://github.com/deepmind/alphafold",
            "type": "package",
-            "version": "2.1.1",
-        },
-    ]
-    # get parameters
-    with open(cnfg_file, encoding="utf8") as jfh:
-        step["software_parameters"] = json.load(jfh)
+            "version": None,
+        })
+    else:
+        step["software"].append({
+            "name": "AlphaFold",
+            "classification": "model building",
+            "description": "Structure prediction",
+            "citation": ihm.citations.alphafold2,
+            "location": "https://github.com/deepmind/alphafold",
+            "type": "package",
+            "version": None,
+        })
+    step["software_parameters"] = config_data["config"]
    protocol.append(step)

    # model selection step
    # ToDo [input/ internal]: model selection step on a single model is a bit
    # silly, how do we get a list of models?
-    step = {
-        "method_type": "model selection",
-        "name": "ma_protocol_step.step_name",
-        "details": "Select best model, which is either the top-ranked model "
-        + "as determined by the ColabFold pipeline "
-        + "(iptmscore*0.8+ptmscore*0.2), or else the model with best "
-        + "congruence with crosslinks reported in the related study.",
-    }
-    step["input"] = "model"
-    step["output"] = "model"
-    step["software"] = []
-    step["software_parameters"] = {}
-    protocol.append(step)
+    # GT-NOTES:
+    # - input/output should be ok without list of models
+    # - rank of model is already stored in _ma_model_list.model_name and 
+    #   _ma_data.name (in _store_as_modelcif)
+    # - ColabFold ranking details are already in the details of the step above.
+    # - Suggestion: add extra step only if AF-ranking was overruled and
+    #   include it in step above.
+
+    # step = {
+    #     "method_type": "model selection",
+    #     "name": "ma_protocol_step.step_name",
+    #     "details": "Select best model, which is either the top-ranked model "
+    #     + "as determined by the ColabFold pipeline "
+    #     + "(iptmscore*0.8+ptmscore*0.2), or else the model with best "
+    #     + "congruence with crosslinks reported in the related study.",
+    # }
+    # step["input"] = "model"
+    # step["output"] = "model"
+    # step["software"] = []
+    # step["software_parameters"] = {}
+    # protocol.append(step)

    return protocol

@@ -387,7 +526,7 @@ def _get_model_details(gene_names):
 def _get_model_group_name():
    """Get a name for a model group."""

-    return "Crosslinked Heterodimer ALphaFold-Multimer v2 Models"
+    return "Crosslinked Heterodimer AlphaFold-Multimer v2 Models"


 def _get_sequence(chn):
@@ -529,7 +668,8 @@ def _get_entities(pdb_file, up_acs):
        upkb = _get_upkb_for_sequence(sqe, up_acs[i])
        cif_ent["pdb_sequence"] = sqe
        cif_ent["pdb_chain_id"] = chn.name
-        cif_ent["description"] = f"Model of {upkb['up_gn']} ({upkb['up_ac']})"
+        cif_ent["description"] = f"{upkb['up_organism']} {upkb['up_gn']} " \
+                                 f"({upkb['up_ac']})"
        cif_ent.update(upkb)
        entities.append(cif_ent)

@@ -542,8 +682,8 @@ def _get_scores(data, prfx):
    with open(scrs_fle, encoding="utf8") as jfh:
        scrs_json = json.load(jfh)

-    # ToDo: is dict.update still the way to go when iterating multiple model
-    # directories? Aka, does dict.update overwrite old scores?
+    # NOTE on reusing data across multiple models: update() overwrites scores
+    # already in data but keeps stale keys that only previous models had.
    data.update(scrs_json)


@@ -570,10 +710,8 @@ def _get_modelcif_entities(target_ents, source, asym_units, system):
                )
            ],
        )
-        # ToDo [input]: Add details
        asym_units[cif_ent["pdb_chain_id"]] = modelcif.AsymUnit(
-            mdlcif_ent,
-            details="struct_asym.details",
+            mdlcif_ent
        )
        system.target_entities.append(mdlcif_ent)

@@ -587,25 +725,34 @@ def _assemble_modelcif_software(soft_dict):
        soft_dict["location"],
        soft_dict["type"],
        soft_dict["version"],
-        citation=ihm.Citation(
-            pmid=soft_dict["citation"]["pmid"],
-            title=soft_dict["citation"]["title"],
-            journal=soft_dict["citation"]["journal"],
-            volume=soft_dict["citation"]["volume"],
-            page_range=soft_dict["citation"]["page_range"],
-            year=soft_dict["citation"]["year"],
-            authors=soft_dict["citation"]["authors"],
-            doi=soft_dict["citation"]["doi"],
-        ),
+        citation=soft_dict["citation"]
    )


-def _get_modelcif_protocol(protocol_steps, target_entities, model):
+def _get_sequence_dbs(seq_dbs):
+    """Return the modelcif reference DBs for the given ColabFold DB keys."""
+    # NOTE: hard coded for ColabFold versions before 2022/07/13
+    # -> afterwards UniRef30 updated to 2022_02 (and maybe more changes)
+    uniref30 = modelcif.ReferenceDatabase(
+        "UniRef30",
+        "http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz",
+        version="2021_03"
+    )
+    colabfold_db = modelcif.ReferenceDatabase(
+        "ColabFold DB",
+        "http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz",
+        version="2021_08"
+    )
+    known_dbs = {"UniRef": uniref30, "Environmental": colabfold_db}
+    # unknown keys raise KeyError, as only the two DBs above are supported
+    return [known_dbs[key] for key in seq_dbs]
+
+
+def _get_modelcif_protocol(protocol_steps, target_entities, model, ref_dbs):
    """Create the protocol for the ModelCIF file."""
    protocol = modelcif.protocol.Protocol()
    for js_step in protocol_steps:
        sftwre = None
-        # ToDo [input]: Turn into software group if parameters are available
        if js_step["software"]:
            if len(js_step["software"]) == 1:
                sftwre = _assemble_modelcif_software(js_step["software"][0])
@@ -616,7 +763,6 @@ def _get_modelcif_protocol(protocol_steps, target_entities, model):
                sftwre = modelcif.SoftwareGroup(elements=sftwre)
            if js_step["software_parameters"]:
                params = []
-                # ToDo [internal]: handle lists!
                for k, v in js_step["software_parameters"].items():
                    params.append(
                        modelcif.SoftwareParameter(k, v)
@@ -630,7 +776,7 @@ def _get_modelcif_protocol(protocol_steps, target_entities, model):

        if js_step["input"] == "target_sequences":
            input_data = modelcif.data.DataGroup(target_entities)
-        # ToDo: Add databases + versions
+            input_data.extend(ref_dbs)
        elif js_step["input"] == "model":
            input_data = model
        else:
@@ -655,7 +801,28 @@ def _get_modelcif_protocol(protocol_steps, target_entities, model):
    return protocol


-def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx):
+def _compress_cif_file(cif_file):
+    """Gzip cif_file into cif_file + '.gz', then delete the original."""
+    gz_path = cif_file + '.gz'
+    with open(cif_file, 'rb') as raw, gzip.open(gz_path, 'wb') as packed:
+        shutil.copyfileobj(raw, packed)
+    os.remove(cif_file)
+
+
+def _package_associated_files(mdl_name):
+    """Bundle the associated files of a model into one zip, deleting them."""
+    # names below must stay in sync with the ones used in add_scores
+    members = (f"{mdl_name}_local_pairwise_qa.cif",)
+    archive = f"{mdl_name}.zip"
+    # BZIP2 was picked as a good trade-off of speed vs compression
+    with zipfile.ZipFile(archive, 'w', zipfile.ZIP_BZIP2) as bundle:
+        for member in members:
+            bundle.write(member)
+            os.remove(member)
+
+
+def _store_as_modelcif(interaction_name, data_json, ost_ent, out_dir, file_prfx,
+                       compress):
    """Mix all the data into a ModelCIF file."""
    print("    generating ModelCIF objects...", end="")
    pstart = timer()
@@ -678,27 +845,31 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx):
        data_json["target_entities"], source, asym_units, system
    )

-    # ToDo [input]: Get Assembly name
    assembly = modelcif.Assembly(
-        asym_units.values(), name="ma_struct_assembly_details.assembly_name"
+        asym_units.values()
    )

    # audit_authors
    system.authors.extend(data_json["audit_authors"])

    # set up the model to produce coordinates
-    # ToDo [input]: Get ma_model_list.model_name
+    if data_json["rank_num"] == 1:
+        mdl_list_name = f"Model {data_json['mdl_num']} (top ranked model)"
+    else:
+        mdl_list_name = f"Model {data_json['mdl_num']} " \
+                        f"(#{data_json['rank_num']} ranked model)"
    model = _OST2ModelCIF(
        assembly=assembly,
        asym=asym_units,
        ost_entity=ost_ent,
-        name="ma_model_list.model_name",
+        name=mdl_list_name,
    )
    print(f" ({timer()-pstart:.2f}s)")
    print("    processing QA scores...", end="", flush=True)
    pstart = timer()
+    mdl_name = os.path.basename(file_prfx)
    system.repositories.append(
-        model.add_scores(data_json, system.id, file_prfx)
+        model.add_scores(data_json, system.id, mdl_name)
    )
    print(f" ({timer()-pstart:.2f}s)")

@@ -707,26 +878,39 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx):
    )
    system.model_groups.append(model_group)

+    ref_dbs = _get_sequence_dbs(data_json["config_data"]["seq_dbs"])
    protocol = _get_modelcif_protocol(
-        data_json["protocol"], system.target_entities, model
+        data_json["protocol"], system.target_entities, model, ref_dbs
    )
    system.protocols.append(protocol)

    # write modelcif System to file
    print("    write to disk...", end="", flush=True)
    pstart = timer()
-    with open(f"{file_prfx}.cif", "w", encoding="ascii") as mmcif_fh:
-        modelcif.dumper.write(mmcif_fh, [system])
+    # NOTE: this dumps the PAE file to the relative path set in add_scores
+    # -> so we chdir into out_dir for writing and always chdir back afterwards
+    oldpwd = os.getcwd()
+    os.chdir(out_dir)
+    try:
+        with open(f"{mdl_name}.cif", "w", encoding="ascii") as mmcif_fh:
+            modelcif.dumper.write(mmcif_fh, [system])
+        _package_associated_files(mdl_name)
+        if compress:
+            _compress_cif_file(f"{mdl_name}.cif")
+    finally:
+        os.chdir(oldpwd)
+
    print(f" ({timer()-pstart:.2f}s)")


-def _create_interaction_json(cnfg_file):
+def _create_interaction_json(config_data):
    """Create a dictionary (mimicking JSON) that contains data which is the same
    for all models."""
    data = {}

    data["audit_authors"] = _get_audit_authors()
-    data["protocol"] = _get_protocol_steps_and_software(cnfg_file)
+    data["protocol"] = _get_protocol_steps_and_software(config_data)
+    data["config_data"] = config_data

    return data

@@ -756,14 +940,15 @@ def _main():
    up_acs = interaction.split("-")

    cnfg = _check_interaction_extra_files_present(opts.model_dir)
-
-    mdlcf_json = _create_interaction_json(cnfg)
+    config_data = _parse_colabfold_config(cnfg)

    # iterate model directory
-    for fle in os.listdir(opts.model_dir):
+    for fle in sorted(os.listdir(opts.model_dir)):
        # iterate PDB files
        if not fle.endswith(".pdb"):
            continue
+        if opts.top_ranked_only and "rank_1" not in fle:
+            continue
        print(f"  translating {fle}...")
        pdb_start = timer()
        file_prfx, uid = _check_model_extra_files_present(opts.model_dir, fle)
@@ -772,14 +957,26 @@ def _main():
        # gather data into JSON-like structure
        print("    preparing data...", end="")
        pstart = timer()
+        
+        # NOTE: could be built once outside the loop if every per-model key
+        # were carefully overwritten, but that is not worth the trouble...
+        mdlcf_json = _create_interaction_json(config_data)
+
+        # uid = ..._rank_X_model_Y.pdb
+        mdl_name_parts = uid.split('_')
+        assert mdl_name_parts[-4] == "rank"
+        assert mdl_name_parts[-2] == "model"
+        mdlcf_json["rank_num"] = int(mdl_name_parts[-3])
+        mdlcf_json["mdl_num"] = int(mdl_name_parts[-1])
+
        ost_ent = _create_model_json(mdlcf_json, fle, up_acs)

        # read quality scores from JSON file
        _get_scores(mdlcf_json, file_prfx)
        print(f" ({timer()-pstart:.2f}s)")

-        _store_as_modelcif(uid, mdlcf_json, ost_ent, file_prfx)
-        # ToDo [internal]: wipe data or is it overwritten in mdlcf_json?
+        _store_as_modelcif(uid, mdlcf_json, ost_ent, opts.out_dir, file_prfx,
+                           opts.compress)
        print(f"  ... done with {fle} ({timer()-pdb_start:.2f}s).")

    print(f"... done with {opts.model_dir}.")