From 68ad8fb62a76a63c9bb802b4b8215c837df9adfe Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Mon, 3 Oct 2022 14:10:59 +0200
Subject: [PATCH] Add selection step

---
 translate2modelcif.py | 116 +++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 63 deletions(-)

diff --git a/translate2modelcif.py b/translate2modelcif.py
index db57ff1..aa90588 100644
--- a/translate2modelcif.py
+++ b/translate2modelcif.py
@@ -249,7 +249,6 @@ def _check_model_extra_files_present(model_dir, pdb_file):
 
 def _get_audit_authors():
     """Return the list of authors that produced this model."""
-    # ToDo: tell Xabi that his name can't have a á in mmCIF
     return (
         "Bartolec, T.",
         "Vazquez-Campos, X.",
@@ -289,7 +288,8 @@ def _parse_colabfold_config(cnfg_file):
         use_msa = False
     elif cf_config["msa_mode"] == "custom":
         print(
-            "WARNING: Custom MSA mode used. Not clear from config what to do here!"
+            "WARNING: Custom MSA mode used. Not clear from config what to do "
+            + "here!"
         )
         seq_dbs = []
         use_mmseqs = False
@@ -311,63 +311,71 @@ def _parse_colabfold_config(cnfg_file):
     else:
         raise ValueError(f"Unknown model_type {cf_config['model_type']}")
 
-    # write description
-    description = f"Model generated using ColabFold v{cf_config['version']}"
+    # write modeling description
+    mdl_description = f"Model generated using ColabFold v{cf_config['version']}"
     if use_multimer:
-        description += f" with AlphaFold-Multimer (v{multimer_version})"
+        mdl_description += f" with AlphaFold-Multimer (v{multimer_version})"
     else:
-        description += " with AlphaFold"
+        mdl_description += " with AlphaFold"
     if cf_config["stop_at_score"] < 100:
         # early stopping feature of ColabFold
         upto = "up to "
     else:
         upto = ""
-    description += (
+    mdl_description += (
         f" producing {upto}{cf_config['num_models']} models"
         f" with {upto}{cf_config['num_recycles']} recycles each"
     )
     if cf_config["use_amber"]:
-        description += ", with AMBER relaxation"
+        mdl_description += ", with AMBER relaxation"
     else:
-        description += ", without model relaxation"
+        mdl_description += ", without model relaxation"
     if cf_config["use_templates"]:
         print(
             "WARNING: ColabFold may use PDB70 or custom templates. "
             "Not clear from config!"
         )
-        description += ", using templates"
+        mdl_description += ", using templates"
     else:
-        description += ", without templates"
+        mdl_description += ", without templates"
     if cf_config["rank_by"] == "plddt":
-        description += ", ranked by pLDDT"
+        mdl_description += ", ranked by pLDDT"
     elif cf_config["rank_by"] == "ptmscore":
-        description += ", ranked by pTM"
+        mdl_description += ", ranked by pTM"
     elif cf_config["rank_by"] == "multimer":
-        description += ", ranked by ipTM*0.8+pTM*0.2"
+        mdl_description += ", ranked by ipTM*0.8+pTM*0.2"
     else:
         raise ValueError(f"Unknown rank_by {cf_config['rank_by']}")
     if use_msa:
-        description += ", starting from"
+        mdl_description += ", starting from"
         if use_mmseqs:
             msa_type = "MSA"
         else:
             msa_type = "custom MSA"
         if use_multimer:
             if cf_config["pair_mode"] == "unpaired+paired":
-                description += f" paired and unpaired {msa_type}s"
+                mdl_description += f" paired and unpaired {msa_type}s"
             elif cf_config["pair_mode"] == "paired":
-                description += f" paired {msa_type}s"
+                mdl_description += f" paired {msa_type}s"
             elif cf_config["pair_mode"] == "unpaired":
-                description += f" unpaired {msa_type}s"
+                mdl_description += f" unpaired {msa_type}s"
             else:
                 raise ValueError(f"Unknown pair_mode {cf_config['pair_mode']}")
         else:
-            description += f" an {msa_type}"
+            mdl_description += f" an {msa_type}"
         if use_mmseqs:
-            description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
+            mdl_description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
     else:
-        description += " without an MSA"
-    description += "."
+        mdl_description += " without an MSA"
+    mdl_description += "."
+
+    # write selection description
+    slct_description = (
+        "Select best model, which is either the top-ranked model as "
+        + "determined by the ColabFold pipeline (ipTM*0.8+pTM*0.2), or else "
+        + "the model with best congruence with crosslinks reported in the "
+        + "related study."
+    )
 
     return {
         "config": cf_config,
@@ -376,7 +384,8 @@ def _parse_colabfold_config(cnfg_file):
         "use_msa": use_msa,
         "use_multimer": use_multimer,
         "multimer_version": multimer_version,
-        "description": description,
+        "modeling_description": mdl_description,
+        "selection_description": slct_description,
     }
 
 
@@ -388,7 +397,7 @@ def _get_protocol_steps_and_software(config_data):
     step = {
         "method_type": "modeling",
         "name": None,
-        "details": config_data["description"],
+        "details": config_data["modeling_description"],
     }
     # get input data
     # Must refer to data already in the JSON, so we try keywords
@@ -415,21 +424,7 @@ def _get_protocol_steps_and_software(config_data):
                 "classification": "data collection",
                 "description": "Many-against-Many sequence searching",
                 # ToDo: add citation to ihm.citations
-                "citation": ihm.Citation(
-                    pmid="30615063",
-                    title="MMseqs2 desktop and local web server app for fast, "
-                    + "interactive sequence searches.",
-                    journal="Bioinformatics",
-                    volume=35,
-                    page_range=(2856, 2858),
-                    year=2019,
-                    authors=[
-                        "Mirdita, M.",
-                        "Steinegger, M.",
-                        "Soeding, J.",
-                    ],
-                    doi="10.1093/bioinformatics/bty1057",
-                ),
+                "citation": ihm.citations.mmseqs2,
                 "location": "https://github.com/soedinglab/mmseqs2",
                 "type": "package",
                 "version": None,
@@ -496,29 +491,22 @@ def _get_protocol_steps_and_software(config_data):
     protocol.append(step)
 
     # model selection step
-    # ToDo [input/ internal]: model selection step on a single model is a bit
-    # silly, how do we get a list of models?
-    # GT-NOTES:
-    # - input/output should be ok without list of models
-    # - rank of model is already stored in _ma_model_list.model_name and
-    #   _ma_data.name (in _store_as_modelcif)
-    # - ColabFold ranking details is already in details of step above.
-    # - Suggestion: add extra step only if AF-ranking was overruled and
-    #   include it in step above.
-
-    # step = {
-    #     "method_type": "model selection",
-    #     "name": "ma_protocol_step.step_name",
-    #     "details": "Select best model, which is either the top-ranked model "
-    #     + "as determined by the ColabFold pipeline "
-    #     + "(ipTM*0.8+pTM*0.2), or else the model with best "
-    #     + "congruence with crosslinks reported in the related study.",
-    # }
-    # step["input"] = "model"
-    # step["output"] = "model"
-    # step["software"] = []
-    # step["software_parameters"] = {}
-    # protocol.append(step)
+    if (
+        "selection_description" not in config_data
+        or len(config_data["selection_description"]) == 0
+    ):
+        return protocol
+
+    step = {
+        "method_type": "model selection",
+        "name": None,
+        "details": config_data["selection_description"],
+    }
+    step["input"] = "model"
+    step["output"] = "model"
+    step["software"] = []
+    step["software_parameters"] = {}
+    protocol.append(step)
 
     return protocol
 
@@ -584,7 +572,9 @@ def _fetch_upkb_entry(up_ac):
     data["up_organism"] = ""
     data["up_sequence"] = ""
     data["up_ac"] = up_ac
-    rspns = requests.get(f"https://www.uniprot.org/uniprot/{up_ac}.txt")
+    rspns = requests.get(
+        f"https://www.uniprot.org/uniprot/{up_ac}.txt", timeout=180
+    )
     for line in rspns.iter_lines(decode_unicode=True):
         if line.startswith("ID   "):
             sline = line.split()
-- 
GitLab