from typing import Tuple
import datetime
import gzip
import hashlib
import json


def _cast_release_date(release_date):
    """Convert a release date string into a ``datetime.datetime`` object.

    Args:
        release_date: Date string formatted as ``%Y-%m-%d %H:%M:%S``,
            ``None``, or the special marker ``"AF2"``.

    Returns:
        The parsed ``datetime.datetime``, or ``None`` when ``release_date``
        is ``None`` or ``"AF2"``.

    Raises:
        ValueError: If the string does not match the expected format. A
            warning is logged before the exception is re-raised.
    """
    # "AF2" has a special meaning, those DBs did not change since the first
    # release of AF2. This information is needed in the model-producing
    # pipeline.
    if release_date is None or release_date == "AF2":
        return None

    try:
        return datetime.datetime.strptime(release_date, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        logging.warning(
            "Unsupported release date format found: %s", release_date
        )
        raise


def _normalise_db_entry(vdct):
    """Return ``(version, url, release_date)`` for one database entry.

    The entry is only read, never modified, so the caller's JSON data can be
    processed repeatedly without a parsed date being fed back into
    ``_cast_release_date``.

    Args:
        vdct: Mapping with the keys ``"version"``, ``"location_url"`` and
            ``"release_date"``.

    Returns:
        A tuple of the version (``None`` if it was ``"NA"``), the first
        entry of ``"location_url"`` (``None`` if that value is empty) and
        the release date as returned by ``_cast_release_date``.

    Raises:
        KeyError: If one of the three keys is missing.
        ValueError: If the release date has an unsupported format.
    """
    version = vdct["version"]
    if version == "NA":
        version = None

    # ToDo: deal with DBs with multiple URLs
    urls = vdct["location_url"]
    # An empty URL value means "no known location", not a reason to fail
    # with an IndexError.
    url = urls[0] if urls else None

    return version, url, _cast_release_date(vdct["release_date"])


def _get_modelcif_ref_dbs(meta_json):
    """Get sequence databases used for monomer features.

    Args:
        meta_json: Mapping whose values each hold a ``"databases"`` mapping
            of database name to an entry with the keys ``"version"``,
            ``"location_url"`` and ``"release_date"``. It is not modified.

    Returns:
        A view on the ``modelcif.ReferenceDatabase`` objects created, one
        per database name.

    Raises:
        RuntimeError: If the same database name occurs with a different
            version or URL.
        ValueError: If a release date has an unsupported format.
    """
    # ToDo: adapt to new JSON input
    sdb_dct = {}  # 'sequence database list', starts as dict
    for data in meta_json.values():
        for db_name, vdct in data["databases"].items():
            version, url, release_date = _normalise_db_entry(vdct)
            # if DB already exists, check URL and version
            if db_name in sdb_dct:
                known = sdb_dct[db_name]
                if known.version != version or known.url != url:
                    raise RuntimeError(
                        "Database versions or URLs differ for "
                        + f"'{db_name}': '{known.version}/ "
                        + f"{known.url}' vs. '{version}/ "
                        + f"{url}'"
                    )
            sdb_dct[db_name] = modelcif.ReferenceDatabase(
                db_name,
                url,
                version=version,
                release_date=release_date,
            )

    return sdb_dct.values()