From 743319e57337fb86ea5bccf9b68455efef84a5e4 Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Wed, 8 Nov 2023 16:18:48 +0100
Subject: [PATCH] Get reference databases from JSON

---
 convert_to_modelcif.py | 48 ++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py
index d1c18ea..a2c3747 100755
--- a/convert_to_modelcif.py
+++ b/convert_to_modelcif.py
@@ -4,6 +4,7 @@
 file with a lot of metadata in place."""
 
 from typing import Tuple
+import datetime
 import gzip
 import hashlib
 import json
@@ -346,45 +347,52 @@ def _get_modelcif_protocol(
     return protocol
 
 
+def _cast_release_date(release_date):
+    """Parse a release date string into datetime.datetime (None if unset)."""
+    # "AF2" has a special meaning: those DBs have not changed since the first
+    # release of AF2. The marker is needed by the model-producing pipeline
+    # but carries no actual date, so it is treated like a missing date here.
+    if release_date is None or release_date == "AF2":
+        return None
+
+    try:
+        return datetime.datetime.strptime(release_date, "%Y-%m-%d %H:%M:%S")
+    except ValueError:
+        logging.warning(
+            f"Unsupported release date format found: {release_date}"
+        )
+        raise
+
+
 def _get_modelcif_ref_dbs(meta_json):
     """Get sequence databases used for monomer features."""
     # vendor formatting for DB names/ URLs, extend on KeyError
-    db_info = {
-        "uniref90": {
-            "name": "UniRef90",
-            "url": "https://ftp.uniprot.org/pub/databases/uniprot/uniref/"
-            + "uniref90/",
-        },
-        "mgnify": {"name": "MGnify", "url": None},
-        "bfd": {"name": "BFD", "url": None},
-        "small_bfd": {"name": "Reduced BFD", "url": None},
-        "uniref30": {"name": "UniRef30", "url": None},
-        "uniprot": {"name": "UniProt", "url": None},
-        "pdb70": {"name": "PDB70", "url": None},
-        "pdb_seqres": {"name": "PDB seqres", "url": None},
-        "colabfold": {"name": "ColabFold", "url": None},
-    }
+    # ToDo: adapt to new JSON input
     sdb_dct = {}  # 'sequence database list', starts as dict
     for data in meta_json.values():
         for db_name, vdct in data["databases"].items():
-            db_name = db_name.lower()
+            if vdct["version"] == "NA":
+                vdct["version"] = None
+            vdct["release_date"] = _cast_release_date(vdct["release_date"])
             # if DB already exists, check URL and version
             if db_name in sdb_dct:
                 # ToDo: switch URL to the actual URL read from JSON
                 if (
                     sdb_dct[db_name].version != vdct["version"]
-                    or sdb_dct[db_name].url != db_info[db_name]["url"]
+                    or sdb_dct[db_name].url != vdct["location_url"][0]
                 ):
                     raise RuntimeError(
                         "Database versions or URLs differ for "
                         + f"'{db_name}': '{sdb_dct[db_name].version}/ "
                         + f"{sdb_dct[db_name].url}' vs. '{vdct['version']}/ "
-                        + f"{db_info[db_name]['url']}'"
+                        + f"{vdct['location_url'][0]}'"
                     )
+            # ToDo: deal with DBs with multiple URLs
             sdb_dct[db_name] = modelcif.ReferenceDatabase(
-                db_info[db_name]["name"],
-                db_info[db_name]["url"],
+                db_name,
+                vdct["location_url"][0],
                 version=vdct["version"],
+                release_date=vdct["release_date"],
             )
 
     return sdb_dct.values()
-- 
GitLab