convert_to_modelcif.py

#!/usr/bin/env python3

"""Take the output of the AlphaPulldown pipeline and turn it into a ModelCIF
file with a lot of metadata in place."""

from typing import Tuple
import gzip
import hashlib
import json
import os
import pickle
import re
import shutil
import sys
import zipfile

from Bio import SeqIO
from Bio.PDB import PDBParser, PPBuilder
from Bio.PDB.Structure import Structure as BioStructure
from absl import app, flags, logging
import numpy as np

import ihm.citations
import modelcif
import modelcif.associated
import modelcif.dumper
import modelcif.model
import modelcif.protocol

# ToDo: Get options properly, best get the same names as used in existing
#       scripts, e.g. could '--monomer_objects_dir' be used as feature
#       directory/ directory with the feature JSON files?
# ToDo: Monomers work separately - features may come from different set of
#       software, databases... so target sequences may be connected to different
#       versions of the same sequence database, may use different versions of
#       software... can this still go into single protocol steps or would this
#       prevent identifying which MSAs were produced with which software? E.g.
#       can there still be a single "sequence search" step? (This is
#       definitively for later, not the first working version of the converter
#       script)
# ToDo: sort non-ModelCIF items in the main JSON object into '__meta__'
# ToDo: protocol step software parameters
flags.DEFINE_string(
    "ap_output", None, "AlphaPulldown pipeline output directory."
)
flags.DEFINE_integer(
    "model_selected",
    None,
    "Model to be converted into ModelCIF, use '--select_all' to convert all "
    + "models found in '--af2_output'",
)
flags.DEFINE_bool("compress", False, "Compress the ModelCIF file using Gzip")
flags.mark_flags_as_required(["ap_output"])

FLAGS = flags.FLAGS

# ToDo: implement a flags.register_validator() checking that files/ directories
#       exist as expected.


# pylint: disable=too-few-public-methods
class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    name = "pLDDT"
    software = None


class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
    """Predicted accuracy according to the TM-score score in [0,1]"""

    name = "pTM"
    software = None


class _GlobalIPTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
    # python-modelcif reads the first line of class-doc has description of a
    # score, so we need to allow long lines, here.
    # pylint: disable=line-too-long
    """Predicted protein-protein interface score, based on the TM-score score in [0,1]."""

    name = "ipTM"
    software = None


class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    name = "pLDDT"
    software = None


class _PAE(modelcif.qa_metric.MetricType):
    """Predicted aligned error (in Angstroms)"""

    type = "PAE"
    other_details = None


class _LocalPairwisePAE(modelcif.qa_metric.LocalPairwise, _PAE):
    """Predicted aligned error (in Angstroms)"""

    name = "PAE"
    software = None


# pylint: enable=too-few-public-methods


class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
    """Map Biopython PDB.Structure object to ihm.model"""

    def __init__(self, *args, **kwargs):
        """Initialise a model"""
        self.structure = kwargs.pop("bio_pdb_structure")
        self.asym = kwargs.pop("asym")

        super().__init__(*args, **kwargs)

    def get_atoms(self):
        for atm in self.structure.get_atoms():
            yield modelcif.model.Atom(
                asym_unit=self.asym[atm.parent.parent.id],
                seq_id=atm.parent.id[1],
                atom_id=atm.name,
                type_symbol=atm.element,
                x=atm.coord[0],
                y=atm.coord[1],
                z=atm.coord[2],
                het=atm.parent.id[0] != " ",
                biso=atm.bfactor,
                occupancy=atm.occupancy,
            )

    def add_scores(self, scores_json, entry_id, file_prefix, sw_dct):
        """Add QA metrics"""
        _GlobalPLDDT.software = sw_dct["AlphaFold"]
        _GlobalPTM.software = sw_dct["AlphaFold"]
        _GlobalIPTM.software = sw_dct["AlphaFold"]
        _LocalPLDDT.software = sw_dct["AlphaFold"]
        _LocalPairwisePAE.software = sw_dct["AlphaFold"]
        # global scores
        self.qa_metrics.extend(
            (
                _GlobalPLDDT(np.mean(scores_json["plddt"])),
                _GlobalPTM(scores_json["ptm"]),
                _GlobalIPTM(scores_json["iptm"]),
            )
        )

        # local scores
        # iterate polypetide chains
        # local PLDDT
        i = 0
        lpae = []
        # aa_only=False includes non-canonical amino acids but seems to skip
        # non-peptide-linking residues like ions
        polypeptides = PPBuilder().build_peptides(self.structure, aa_only=False)
        for chn_i in polypeptides:
            for res_i in chn_i:
                # local pLDDT
                # Assertion assumes that pLDDT values are also stored in the
                # B-factor column.
                assert (
                    round(scores_json["plddt"][i], 2)
                    == next(res_i.get_atoms()).bfactor
                )
                self.qa_metrics.append(
                    _LocalPLDDT(
                        self.asym[res_i.parent.id].residue(res_i.id[1]),
                        round(scores_json["plddt"][i], 2),
                    )
                )

                # pairwise alignment error
                j = 0
                # We do a 2nd iteration over the structure instead of doing
                # index magic because it keeps the code cleaner and should not
                # be noticeably slower than iterating the array directly.
                # Majority of time goes into writing files, anyway.
                for chn_j in polypeptides:
                    for res_j in chn_j:
                        lpae.append(
                            _LocalPairwisePAE(
                                self.asym[res_i.parent.id].residue(res_i.id[1]),
                                self.asym[res_j.parent.id].residue(res_j.id[1]),
                                scores_json["pae"][i][j],
                            )
                        )
                        j += 1

                i += 1
        self.qa_metrics.extend(lpae)

        # outsource PAE to associated file
        arc_files = [
            modelcif.associated.LocalPairwiseQAScoresFile(
                f"{file_prefix}_local_pairwise_qa.cif",
                categories=["_ma_qa_metric_local_pairwise"],
                copy_categories=["_ma_qa_metric"],
                entry_id=entry_id,
                entry_details="This file is an associated file consisting "
                + "of local pairwise QA metrics. This is a partial mmCIF "
                + "file and can be validated by merging with the main "
                + "mmCIF file containing the model coordinates and other "
                + "associated data.",
                details="Predicted aligned error",
            )
        ]

        return modelcif.associated.Repository(
            "",
            [
                modelcif.associated.ZipFile(
                    f"{file_prefix}.zip", files=arc_files
                )
            ],
        )


def _get_modelcif_entities(target_ents, asym_units, system):
    """Create ModelCIF entities and asymmetric units."""
    for cif_ent in target_ents:
        mdlcif_ent = modelcif.Entity(
            # 'pdb_sequence' can be used here, since AF2 always has the
            # complete sequence.
            cif_ent["pdb_sequence"],
            description=cif_ent["description"],
        )
        for pdb_chain_id in cif_ent["pdb_chain_id"]:
            asym_units[pdb_chain_id] = modelcif.AsymUnit(mdlcif_ent)
        system.target_entities.append(mdlcif_ent)


def _get_modelcif_protocol_input(
    input_data_group, target_entities, ref_dbs, model
):
    """Assemble input data for a ModelCIF protocol step."""
    if input_data_group == "target_sequences":
        input_data = modelcif.data.DataGroup(target_entities)
        input_data.extend(ref_dbs)
    elif input_data_group == "model":
        input_data = model
    else:
        raise RuntimeError(f"Unknown protocol input: '{input_data_group}'")
    return input_data


def _get_modelcif_protocol_output(output_data_group, model):
    """Assemble output data for a ModelCIF protocol step."""
    if output_data_group == "model":
        output_data = model
    elif output_data_group == "monomer_pickle_files":
        output_data = modelcif.data.DataGroup(
            [
                modelcif.data.Data(
                    "Pickle files", details="Monomer feature/ MSA pickle files"
                )
            ]
        )
    else:
        raise RuntimeError(f"Unknown protocol output: '{output_data_group}'")
    return output_data


def _get_modelcif_protocol(
    protocol_steps, target_entities, model, sw_dict, ref_dbs
):
    """Create the protocol for the ModelCIF file."""
    protocol = modelcif.protocol.Protocol()
    for js_step in protocol_steps:
        # assemble input & output data
        input_data = _get_modelcif_protocol_input(
            js_step["input_data_group"], target_entities, ref_dbs, model
        )
        output_data = _get_modelcif_protocol_output(
            js_step["output_data_group"], model
        )
        # loop over software group and assemble software group from that
        sw_grp = modelcif.SoftwareGroup()
        for pss in js_step["software_group"]:  # protocol step software
            if sw_dict[pss] is not None:
                # add software w/o individual parameters
                sw_grp.append(sw_dict[pss])
                # add software with individual parameters
                # sw_grp.append(
                #     modelcif.SoftwareWithParameters(
                #         sw_dict[pss],
                #         [modelcif.SoftwareParameter(<name>, <value>)],
                #     )
                # )
        # ToDo: make sure AlphaPulldown is first in the SoftwareGroup() list,
        #       AlphaFold second; that influences citation order in the ModelCIF
        #       file.
        # mix everything together in a protocol step
        protocol.steps.append(
            modelcif.protocol.Step(
                input_data=input_data,
                output_data=output_data,
                name=js_step["step_name"],
                details=js_step["details"],
                software=sw_grp,
            )
        )
        print("modelcif.protocol.Step(")
        print(f"            input_data={input_data},")
        print(f"            output_data={output_data},")
        print(f"            name={js_step['step_name']},")
        print(f"            details=\"{js_step['details']}\",")
        print(f"            software={sw_grp},")
        print(")")

    return protocol


def _get_modelcif_ref_dbs(meta_json):
    """Get sequence DBs used for monomer features."""
    # vendor formatting for DB names/ URLs, extend on KeyError
    db_info = {
        "uniref90": {
            "name": "UniRef90",
            "url": "https://ftp.uniprot.org/pub/databases/uniprot/uniref/"
            + "uniref90/",
        },
        "mgnify": {"name": "MGnify", "url": None},
        "bfd": {"name": "BFD", "url": None},
        "small_bfd": {"name": "Reduced BFD", "url": None},
        "uniref30": {"name": "UniRef30", "url": None},
        "uniprot": {"name": "UniProt", "url": None},
        "pdb70": {"name": "PDB70", "url": None},
        "pdb_seqres": {"name": "PDB seqres", "url": None},
        "colabfold": {"name": "ColabFold", "url": None},
    }
    sdb_dct = {}  # 'sequence database list', starts as dict
    for data in meta_json.values():
        for db_name, vdct in data["databases"].items():
            db_name = db_name.lower()
            # if DB already exists, check URL and version
            if db_name in sdb_dct:
                # ToDo: switch URL to the actual URL read from JSON
                if (
                    sdb_dct[db_name].version != vdct["version"]
                    or sdb_dct[db_name].url != db_info[db_name]["url"]
                ):
                    raise RuntimeError(
                        "Database versions or URLs differ for "
                        + f"'{db_name}': '{sdb_dct[db_name].version}/ "
                        + f"{sdb_dct[db_name].url}' vs. '{vdct['version']}/ "
                        + f"{db_info[db_name]['url']}'"
                    )
            sdb_dct[db_name] = modelcif.ReferenceDatabase(
                db_info[db_name]["name"],
                db_info[db_name]["url"],
                version=vdct["version"],
            )

    return sdb_dct.values()


def _store_as_modelcif(
    data_json: dict,
    structure: BioStructure,
    mdl_file: str,
    out_dir: str,
    compress: bool = False,
    # file_prfx, add_files
) -> None:
    """Create the actual ModelCIF file."""
    system = modelcif.System(
        title=data_json["_struct.title"],
        id=data_json["data_"].upper(),
        model_details=data_json["_struct.pdbx_model_details"],
    )

    # create target entities, references, source, asymmetric units & assembly
    # create an asymmetric unit and an entity per target sequence
    asym_units = {}
    _get_modelcif_entities(
        data_json["target_entities"],
        asym_units,
        system,
    )

    # ToDo: get modelling-experiment authors
    # audit_authors
    # system.authors.extend(data_json["audit_authors"])

    # set up the model to produce coordinates
    model = _Biopython2ModelCIF(
        assembly=modelcif.Assembly(asym_units.values()),
        asym=asym_units,
        bio_pdb_structure=structure,
        name=data_json["_ma_model_list.model_name"],
    )

    # create software list from feature metadata
    # ToDo: store_as_modelcif should not use __meta__
    sw_dct = _get_software_data(data_json["__meta__"])

    # process scores
    mdl_file = os.path.splitext(os.path.basename(mdl_file))[0]
    system.repositories.append(
        model.add_scores(data_json, system.id, mdl_file, sw_dct)
    )

    system.model_groups.append(modelcif.model.ModelGroup([model]))

    # ToDo: get protocol steps together
    #       - 'coevolution MSA' (create_individual_features.py) step
    #       - 'template search' is usually omitted for AF2 as that term more
    #         relates to homology modelling, AF2 uses templates in a different
    #         way and a template search usually results in a list of templates
    #         which would need to be included, here, in theory
    #      - for MSA/ template search, how to represent the outcome? Adding the
    #        pickle files quickly exceeds reasonable storage use
    #      - 'modeling' (run_multimer_jobs.py), are the four modes reflected by
    #        the JSON data/ does the JSON data look different for each mode?
    #      - are the scores only calculated by alpha-analysis.sif or do they
    #        come out of run_multimer_jobs.py? Does this go into its own step?
    #      - what about including the tabular summary?
    #      - model selection: only if just a certain model is translated to
    #        ModelCIF, or mix it with scoring step?
    #
    #      - cancer-PPI-domains has 'coevolutin MSA'
    #      - then modelling
    #      - do we have an example with a split MSA & modelling step?
    #      - model selection like for Tara
    system.protocols.append(
        _get_modelcif_protocol(
            data_json["ma_protocol_step"],
            system.target_entities,
            model,
            sw_dct,
            # ToDo: _storte_as_modelcif should not use __meta__, __meta__ is
            #       tool specific
            _get_modelcif_ref_dbs(data_json["__meta__"]),
        )
    )

    # write modelcif.System to file
    # NOTE: this will dump PAE on path provided in add_scores
    # -> hence we cheat by changing path and back while being exception-safe...
    oldpwd = os.getcwd()
    os.chdir(out_dir)
    try:
        with open(
            f"{mdl_file}.cif",
            "w",
            encoding="ascii",
        ) as mmcif_fh:
            modelcif.dumper.write(mmcif_fh, [system])
        if compress:
            _compress_cif_file(f"{mdl_file}.cif")
        # Create associated archive
        for archive in system.repositories[0].files:
            with zipfile.ZipFile(
                archive.path, "w", zipfile.ZIP_BZIP2
            ) as cif_zip:
                for zfile in archive.files:
                    cif_zip.write(zfile.path, arcname=zfile.path)
                    os.remove(zfile.path)
    finally:
        os.chdir(oldpwd)


def _compress_cif_file(cif_file):
    """Compress cif file and delete original."""
    with open(cif_file, "rb") as f_in:
        with gzip.open(cif_file + ".gz", "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(cif_file)


def _get_model_details(cmplx_name: str, data_json: dict) -> str:
    """Get the model description."""
    ap_versions = []
    af2_version = None
    for mnmr in data_json["__meta__"]:  # mnmr = monomer
        if (
            data_json["__meta__"][mnmr]["software"]["AlphaPulldown"]["version"]
            not in ap_versions
        ):
            ap_versions.append(
                data_json["__meta__"][mnmr]["software"]["AlphaPulldown"][
                    "version"
                ]
            )
        # AlphaFold-Multimer builds the model we are looking at, can only be a
        # single version.
        if af2_version is None:
            af2_version = data_json["__meta__"][mnmr]["software"]["AlphaFold"][
                "version"
            ]
        else:
            if (
                data_json["__meta__"][mnmr]["software"]["AlphaFold"]["version"]
                != af2_version
            ):
                # pylint: disable=line-too-long
                raise RuntimeError(
                    "Different versions of AlphaFold-Multimer found: "
                    + f"'{data_json['__meta__'][mnmr]['software']['alphafold']['version']}'"
                    + f" vs. '{af2_version}'"
                )

    return (
        f"Model generated for {' and '.join(cmplx_name)}, produced "
        + f"using AlphaFold-Multimer ({af2_version}) as implemented by "
        + f"AlphaPulldown ({', '.join(ap_versions)})."
    )


def _get_feature_metadata(
    modelcif_json: dict, cmplx_name: str, prj_dir: str
) -> list:
    """Read metadata from a feature JSON file."""
    cmplx_name = cmplx_name.split("_and_")
    prj_dir = os.path.join(prj_dir, "features_monomers")
    if not os.path.isdir(prj_dir):
        logging.info(f"No feature directory '{prj_dir}' found.")
        sys.exit()
    if "__meta__" not in modelcif_json:
        modelcif_json["__meta__"] = {}
    for mnmr in cmplx_name:
        modelcif_json["__meta__"][mnmr] = {}
        feature_json = os.path.join(prj_dir, f"{mnmr}_feature_metadata.json")
        if not os.path.isfile(feature_json):
            logging.info(f"No feature metadata file '{feature_json}' found.")
            sys.exit()
        # ToDo: make sure that its always ASCII
        with open(feature_json, "r", encoding="ascii") as jfh:
            jdata = json.load(jfh)
        modelcif_json["__meta__"][mnmr]["software"] = jdata["software"]
        modelcif_json["__meta__"][mnmr]["databases"] = jdata["databases"]

    return cmplx_name


def _get_model_info(
    cif_json: dict,
    cmplx_name: str,
    mdl_id: str,
    mdl_rank: int,
) -> None:
    """Get 'data_' block ID and data for categories '_struct' and '_entry'."""
    cif_json["data_"] = "_".join(cmplx_name)
    cif_json["_struct.title"] = f"Prediction for {' and '.join(cmplx_name)}"
    cif_json["_struct.pdbx_model_details"] = _get_model_details(
        cmplx_name, cif_json
    )
    cif_json[
        "_ma_model_list.model_name"
    ] = f"Model {mdl_id} (ranked #{mdl_rank})"


def _get_entities(
    cif_json: dict, pdb_file: str, cmplx_name: list, prj_dir: str
) -> BioStructure:
    """Gather data for the mmCIF (target) entities."""
    # read sequence from FastA, map to sequences from PDB file
    sequences = {}
    # Fetch FastA sequences to assign monomers to the molecular entities
    prj_dir = os.path.join(prj_dir, "fastas")
    if not os.path.isdir(prj_dir):
        logging.info(f"FastA directory '{prj_dir}' does not exist.")
        sys.exit()
    for mnmr in cmplx_name:
        fasta = os.path.join(prj_dir, f"{mnmr}.fa")
        if not os.path.isfile(fasta):
            logging.info(f"FastA file '{fasta}' does not exist.")
            sys.exit()
        with open(fasta, "r", encoding="ascii") as ffh:
            seq = SeqIO.read(ffh, "fasta")
            # Using MD5 sums for comparing sequences only works since AF2
            # *ALWAYS* has the complete sequence in a chain.
            sequences[hashlib.md5(seq.seq.encode()).hexdigest()] = mnmr
    # gather molecular entities from PDB file
    structure = PDBParser().get_structure("_".join(cmplx_name), pdb_file)
    cif_json["target_entities"] = []
    already_seen = []
    for seq in PPBuilder().build_peptides(structure):
        chn_id = seq[0].parent.id
        seq = seq.get_sequence()
        seq_md5 = hashlib.md5(seq.encode()).hexdigest()
        cif_ent = {}
        try:
            e_idx = already_seen.index(seq_md5)
        except ValueError:
            pass
        else:
            cif_json["target_entities"][e_idx]["pdb_chain_id"].append(chn_id)
            continue
        cif_ent["pdb_sequence"] = seq
        cif_ent["pdb_chain_id"] = [chn_id]
        cif_ent["description"] = sequences[seq_md5]
        cif_json["target_entities"].append(cif_ent)
        already_seen.append(sequences[seq_md5])

    return structure


def _get_scores(cif_json: dict, scr_file: str) -> None:
    """Add scores to JSON data."""
    with open(scr_file, "rb") as sfh:
        scr_dict = pickle.load(sfh)
    # Get pLDDT as a list, the global pLDDT is the average, calculated on the
    # spot.
    cif_json["plddt"] = scr_dict["plddt"]
    cif_json["ptm"] = float(scr_dict["ptm"])
    cif_json["iptm"] = float(scr_dict["iptm"])
    cif_json["pae"] = scr_dict["predicted_aligned_error"]


def _get_software_data(meta_json: dict) -> list:
    """Turn meta data about software into modelcif.Software objects."""
    cite_hhsuite = ihm.Citation(
        pmid="31521110",
        title="HH-suite3 for fast remote homology detection and deep "
        + "protein annotation.",
        journal="BMC Bioinformatics",
        volume=20,
        page_range=None,
        year=2019,
        authors=[
            "Steinegger, M.",
            "Meier, M.",
            "Mirdita, M.",
            "Voehringer, H.",
            "Haunsberger, S.J.",
            "Soeding, J.",
        ],
        doi="10.1186/s12859-019-3019-7",
    )
    # {key from json: dict needed to produce sw entry plus internal key}
    sw_data = {
        "AlphaFold": modelcif.Software(
            "AlphaFold-Multimer",
            "model building",
            "Structure prediction",
            "https://github.com/deepmind/alphafold",
            "package",
            None,
            ihm.Citation(
                pmid=None,
                title="Protein complex prediction with AlphaFold-Multimer.",
                journal="bioRxiv",
                volume=None,
                page_range=None,
                year=2021,
                authors=[
                    "Evans, R.",
                    "O'Neill, M.",
                    "Pritzel, A.",
                    "Antropova, N.",
                    "Senior, A.",
                    "Green, T.",
                    "Zidek, A.",
                    "Bates, R.",
                    "Blackwell, S.",
                    "Yim, J.",
                    "Ronneberger, O.",
                    "Bodenstein, S.",
                    "Zielinski, M.",
                    "Bridgland, A.",
                    "Potapenko, A.",
                    "Cowie, A.",
                    "Tunyasuvunakool, K.",
                    "Jain, R.",
                    "Clancy, E.",
                    "Kohli, P.",
                    "Jumper, J.",
                    "Hassabis, D.",
                ],
                doi="10.1101/2021.10.04.463034",
            ),
        ),
        "AlphaPulldown": modelcif.Software(
            "AlphaPulldown",
            "model building",
            "Structure prediction",
            "https://github.com/KosinskiLab/AlphaPulldown",
            "package",
            None,
            ihm.Citation(
                pmid="36413069",
                title="AlphaPulldown-a python package for protein-protein "
                + "interaction screens using AlphaFold-Multimer.",
                journal="Bioinformatics",
                volume=39,
                page_range=None,
                year=2023,
                authors=[
                    "Yu, D.",
                    "Chojnowski, G.",
                    "Rosenthal, M.",
                    "Kosinski, J.",
                ],
                doi="10.1093/bioinformatics/btac749",
            ),
        ),
        "hhblits": modelcif.Software(
            "HHblits",
            "data collection",
            "Iterative protein sequence searching by HMM-HMM alignment",
            "https://github.com/soedinglab/hh-suite",
            "program",
            None,
            cite_hhsuite,
        ),
        "hhsearch": modelcif.Software(
            "HHsearch",
            "data collection",
            "Protein sequence searching by HMM-HMM comparison",
            "https://github.com/soedinglab/hh-suite",
            "program",
            None,
            cite_hhsuite,
        ),
        "hmmbuild": modelcif.Software(
            "hmmbuild",
            "data collection",
            "Building HMM search profiles",
            "http://hmmer.org/",
            "program",
            None,
            None,
        ),
        "hmmsearch": None,
        "jackhmmer": None,
        "kalign": None,
    }
    # ToDo: refactor to only those SW objects created/ added that are actually
    #       in the dictionary. That is, instead of a pre-build dictionary,
    #       instantiate on the fly, there is anyway a hard-coded-tool-name.
    for data in meta_json.values():
        for sftwr, version in data["software"].items():
            if sftwr not in sw_data:
                raise RuntimeError(
                    "Unknown software found in meta data: " + f"'{sftwr}'"
                )
            version = version["version"]
            # ToDo: software should not be None, remove in final version
            if sw_data[sftwr] is not None:
                if sw_data[sftwr].version is not None:
                    if sw_data[sftwr].version != version:
                        raise RuntimeError(
                            "Software versions differ for "
                            + f"'{sftwr}': '{sw_data[sftwr].version}' vs. "
                            + f"'{version}'"
                        )
                sw_data[sftwr].version = version

    return sw_data


def _get_protocol_steps(modelcif_json):
    """Create the list of protocol steps with software and parameters used."""
    protocol = []
    # MSA/ monomer feature generation step
    step = {
        "method_type": "coevolution MSA",
        "step_name": "MSA generation",
        "details": "Create sequence features for corresponding monomers.",
        "input_data_group": "target_sequences",
        "output_data_group": "monomer_pickle_files",
        "software_group": []
        # _ma_protocol_step.protocol_id
    }
    for sftwr in modelcif_json["__meta__"].values():
        sftwr = sftwr["software"]
        for tool in sftwr:
            if tool not in step["software_group"]:
                step["software_group"].append(tool)
    protocol.append(step)

    # modelling step

    # model selection step

    return protocol


def alphapulldown_model_to_modelcif(
    cmplx_name: str,
    mdl: tuple,
    out_dir: str,
    prj_dir: str,
    compress: bool = False,
) -> None:
    """Convert an AlphaPulldown model into a ModelCIF formatted mmCIF file.

    Metadata for the ModelCIF categories will be fetched from AlphaPulldown
    output as far as possible. This expects modelling projects to exists in
    AlphaPulldown's output directory structure."""
    # ToDo: ENABLE logging.info(f"Processing '{mdl[0]}'...")
    modelcif_json = {}
    # fetch metadata
    cmplx_name = _get_feature_metadata(modelcif_json, cmplx_name, prj_dir)
    # fetch/ assemble more data about the modelling experiment
    _get_model_info(
        modelcif_json,
        cmplx_name,
        mdl[2],
        mdl[3],
    )
    # gather target entities (sequences that have been modeled) info
    structure = _get_entities(modelcif_json, mdl[0], cmplx_name, prj_dir)

    # read quality scores from pickle file
    _get_scores(modelcif_json, mdl[1])

    modelcif_json["ma_protocol_step"] = _get_protocol_steps(modelcif_json)

    _store_as_modelcif(modelcif_json, structure, mdl[0], out_dir, compress)
    # ToDo: ENABLE logging.info(f"... done with '{mdl[0]}'")


def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
    """Get the list of models to be converted.

    If `model_selected` is none, all models will be marked for conversion."""
    mdl_path = os.path.join(ap_dir, "models")
    cmplx = os.listdir(mdl_path)
    # For now, exactly 1 complex is expected in the 'models' subdirectory. If
    # there are more, the 'model_selected' mechanism needs to be further tuned
    # to get to the right model.
    assert len(cmplx) == 1
    cmplx = cmplx[0]
    mdl_path = os.path.join(mdl_path, cmplx)
    models = []
    # We are going for models with name "rank_?.pdb" as it does not depend on
    # relaxation. But this means we have to match the pickle files via
    # ranking_debug.json.
    ranking_dbg = os.path.join(mdl_path, "ranking_debug.json")
    if not os.path.isfile(ranking_dbg):
        logging.info(
            f"Ranking file '{ranking_dbg} does not exist or is no regular "
            + "file."
        )
        sys.exit()
    with open(ranking_dbg, "r", encoding="utf8") as jfh:
        ranking_dbg = json.load(jfh)
    score_files = {}
    for i, fle in enumerate(ranking_dbg["order"]):
        if not fle.startswith("model_"):
            raise RuntimeError(
                "Filename does not start with 'model_', can "
                + f"not determine model ID: '{fle}'"
            )
        score_files[i] = (
            os.path.join(mdl_path, f"result_{fle}.pkl"),
            fle.split("_")[1],
            i,
        )
    # match PDB files with pickle files
    if model_selected is not None:
        models.append(
            (
                os.path.join(mdl_path, f"ranked_{model_selected}.pdb"),
                score_files[model_selected][0],
                score_files[model_selected][1],  # model ID
                score_files[model_selected][2],  # model rank
            )
        )
    else:
        for mdl in os.listdir(mdl_path):
            rank = re.match(r"ranked_(\d+)\.pdb", mdl)
            if rank is not None:
                rank = int(rank.group(1))
                models.append(
                    (
                        os.path.join(mdl_path, mdl),
                        score_files[rank][0].score_files[rank][1],
                    )
                )

    # check that files actually exist
    for mdl, scrs, *_ in models:
        if not os.path.isfile(mdl):
            logging.info(
                f"Model file '{mdl}' does not exist or is not a regular file."
            )
            sys.exit()
        if not os.path.isfile(scrs):
            logging.info(
                f"Scores file '{scrs}' does not exist or is not a regular file."
            )
            sys.exit()

    return cmplx, mdl_path, models


def main(argv):
    """Run as script."""
    # pylint: disable=pointless-string-statement
    """
    Here, the metadata json files for each feature are in features_monomers/
    directory. The models are in models/ directory, and usually there are many
    complexes modelled using different permutations of the monomeric features.
    For the sake of size, I send you the models of only one dimer
    cage_B_and_cage_C/ that was generated using features_monomers/cage_B.pkl
    and features_monomers/cage_C.pkl accordingly.
    Please note that typically all the cage_?_feature_metadata.json files are
    identical in terms of used databases and software versions and generated
    in one go.
    However, theoretically they could be generated using different binaries/DBs
    versions, so maybe it makes sense to compare them and store both/all
    versions if they are different. This merging can be done on our
    AlphaPulldown side and may be added now or later on. Let me know if it is
    critical for you now.
    """
    # pylint: enable=pointless-string-statement
    del argv  # Unused.

    # get list of selected models and assemble ModelCIF files + associated data
    complex_name, model_dir, model_list = _get_model_list(
        FLAGS.ap_output, FLAGS.model_selected
    )
    for mdl in model_list:
        alphapulldown_model_to_modelcif(
            complex_name,
            mdl,
            model_dir,
            FLAGS.ap_output,
            FLAGS.compress,
        )


if __name__ == "__main__":
    app.run(main)

# ToDo: Question - option to include all the non-selected models in associated
#       data archive? This blows up storage size (especially if PAEs included),
#       but we did that already in the past. Idea is to have all models
#       available for... reproducibility and whatnot, but show the selected
#       (representative) of the modelling experiment/ study more prominently.
# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details',
#       'data_', '_entry', maybe have a user-defined JSON document with things
#       like that, including author names?
# ToDo: where to store which model was chosen? Should be in Tara's models.
# ToDo: make sure all functions come with types

# From former discussions:
# - including Jupyter notebooks would require adding the pickle files to the
#   associated files (too much storage needed for that)

#  LocalWords:  ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
#  LocalWords:  multimer sif Jupyter aa MSAs