Skip to content
Snippets Groups Projects
Commit 743319e5 authored by Bienchen
Browse files

Get reference databases from JSON

parent ddbb905c
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,7 @@
file with a lot of metadata in place."""
from typing import Tuple
import datetime
import gzip
import hashlib
import json
......@@ -346,45 +347,52 @@ def _get_modelcif_protocol(
return protocol
def _cast_release_date(release_date):
    """Parse a release date string into a ``datetime.datetime``.

    ``release_date`` is expected to be formatted as ``YYYY-MM-DD HH:MM:SS``.
    ``None`` and the special marker ``"AF2"`` are accepted as well and both
    yield ``None``.

    Returns the parsed ``datetime.datetime`` object (not a plain
    ``datetime.date``), or ``None`` if there is no date to parse.

    Raises ``ValueError`` if the string does not match the expected format;
    a warning naming the offending value is logged before re-raising.
    """
    # "AF2" has a special meaning, those DBs did not change since the first
    # release of AF2. This information is needed in the model-producing
    # pipeline.
    if release_date is None or release_date == "AF2":
        return None

    try:
        return datetime.datetime.strptime(release_date, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # Log the unparsable value for diagnosis, then let the caller decide
        # how to handle the failure.
        logging.warning(
            f"Unsupported release date format found: {release_date}"
        )
        raise
def _get_modelcif_ref_dbs(meta_json):
    """Get sequence databases used for monomer features.

    Iterates over all entries of ``meta_json`` and reads the ``"databases"``
    mapping of each one. Every database entry must provide the keys
    ``"version"``, ``"release_date"`` and ``"location_url"``; only the first
    element of ``"location_url"`` is used as the database URL.

    Note that the database entries in ``meta_json`` are modified in place: a
    version of ``"NA"`` is replaced by ``None`` and ``"release_date"`` is
    replaced by the result of ``_cast_release_date()``.

    Returns a view of ``modelcif.ReferenceDatabase`` objects, one per
    (lower-cased) database name.

    Raises ``RuntimeError`` if the same database shows up more than once
    with a different version or URL.
    """
    # NOTE(review): name and URL are taken from the JSON input rather than
    # from a hard-coded lookup table; this matches the "read from JSON" side
    # of the change - confirm that no vendor-formatted names are expected
    # downstream.
    sdb_dct = {}  # 'sequence database list', starts as dict
    for data in meta_json.values():
        for db_name, vdct in data["databases"].items():
            db_name = db_name.lower()
            if vdct["version"] == "NA":
                vdct["version"] = None
            vdct["release_date"] = _cast_release_date(vdct["release_date"])
            # ToDo: deal with DBs with multiple URLs
            db_url = vdct["location_url"][0]
            # if DB already exists, check URL and version
            if db_name in sdb_dct:
                known_db = sdb_dct[db_name]
                if (
                    known_db.version != vdct["version"]
                    or known_db.url != db_url
                ):
                    raise RuntimeError(
                        "Database versions or URLs differ for "
                        + f"'{db_name}': '{known_db.version}/ "
                        + f"{known_db.url}' vs. '{vdct['version']}/ "
                        + f"{db_url}'"
                    )
            # ReferenceDatabase takes name and URL positionally; version and
            # release date go in as keywords.
            sdb_dct[db_name] = modelcif.ReferenceDatabase(
                db_name,
                db_url,
                version=vdct["version"],
                release_date=vdct["release_date"],
            )

    return sdb_dct.values()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment