diff --git a/projects/human-heterodimers-w-crosslinks/README.md b/projects/human-heterodimers-w-crosslinks/README.md index 2ebf0be2aefaab061119a7cd500d6bac98c6149c..902d1d68c712d794f11d35eb0dcd6f5b2555478d 100644 --- a/projects/human-heterodimers-w-crosslinks/README.md +++ b/projects/human-heterodimers-w-crosslinks/README.md @@ -6,6 +6,9 @@ This project consists of around 800 heterodimer models for the human reference proteome. Modelling was done with [ColabFold](https://colabfold.mmseqs.com)/ [LocalColabFold](https://github.com/YoshitakaMo/localcolabfold). Model selection is special in a sense that for some heterodimers experimental crosslinking data is available guiding the choice, otherwise top-ranking models are used. +- special for this set: it contains a few homomers +- special for this set: sequences that no longer match the current UniProtKB entry are traced back through the entry history, using UniSave, to a matching entry version + <how are the ModelCIF files created using this software> These models qualify as "de novo modelling". diff --git a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py index 8b91dcfdbffe1bd6042f4a812d40e7b51dcf831b..df86631e37023797bcfb18f79d4e0cc1a49c979e 100755 --- a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py +++ b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py @@ -561,8 +561,8 @@ def _check_sequence(up_ac, sequence): ) -def _fetch_upkb_entry(up_ac): - """Fetch data for an UniProtKB entry.""" +def _get_n_parse_up_entry(up_ac, up_url): + """Get data for an UniProtKB entry and parse it.""" # This is a simple parser for UniProtKB txt format, instead of breaking it # up into multiple functions, we just allow many many branches & statements, # here. 
@@ -571,9 +571,7 @@ def _fetch_upkb_entry(up_ac): data["up_organism"] = "" data["up_sequence"] = "" data["up_ac"] = up_ac - rspns = requests.get( - f"https://www.uniprot.org/uniprot/{up_ac}.txt", timeout=180 - ) + rspns = requests.get(up_url, timeout=180) for line in rspns.iter_lines(decode_unicode=True): if line.startswith("ID "): sline = line.split() @@ -615,6 +613,11 @@ def _fetch_upkb_entry(up_ac): data["up_last_mod"] = datetime.datetime.strptime( dt_flds[0], "%d-%b-%Y" ) + elif dt_flds[1].upper().startswith("ENTRY VERSION "): + data["up_entry_version"] = dt_flds[1][len("ENTRY VERSION ") :] + if data["up_entry_version"][-1] == ".": + data["up_entry_version"] = data["up_entry_version"][:-1] + data["up_entry_version"] = int(data["up_entry_version"]) elif line.startswith("GN Name="): data["up_gn"] = line[len("GN Name=") :].split(";")[0] data["up_gn"] = data["up_gn"].split("{")[0].strip() @@ -648,28 +651,64 @@ def _fetch_upkb_entry(up_ac): return data +def _fetch_upkb_entry(up_ac): + """Get an UniProtKB entry.""" + return _get_n_parse_up_entry( + up_ac, f"https://rest.uniprot.org/uniprotkb/{up_ac}.txt" + ) + + +def _fetch_unisave_entry(up_ac, version): + """Get an UniSave entry, in contrast to an UniProtKB entry, that allows us + to specify a version.""" + return _get_n_parse_up_entry( + up_ac, + f"https://rest.uniprot.org/unisave/{up_ac}?format=txt&" + + f"versions={version}", + ) + + def _get_upkb_for_sequence(sqe, up_ac): """Get UniProtKB entry data for given sequence.""" up_data = _fetch_upkb_entry(up_ac) - if sqe != up_data["up_sequence"]: - raise RuntimeError( - f"Sequences not equal from file: {sqe}, from UniProtKB: " - + f"{up_data['up_sequence']}" - ) + while sqe != up_data["up_sequence"]: + if up_data["up_entry_version"] > 1: + up_data = _fetch_unisave_entry( + up_ac, up_data["up_entry_version"] - 1 + ) + else: + raise RuntimeError( + f"Sequences not equal from file: {sqe}, from UniProtKB: " + f"{up_data['up_sequence']} ({up_ac}), checked entire entry " + 
"history." + ) return up_data def _get_entities(pdb_file, up_acs): """Gather data for the mmCIF (target) entities.""" entities = [] - ost_ent = io.LoadPDB(pdb_file) + already_seen = [] for i, chn in enumerate(ost_ent.chains): cif_ent = {} sqe = _get_sequence(chn) + try: + e_idx = already_seen.index(up_acs[i]) + except ValueError: + pass + else: + if sqe != entities[e_idx]["pdb_sequence"]: + _abort_msg( + "Sequences are different for two chains of the same " + "UniProtKB AC. This case is not implemented, yet." + ) + entities[e_idx]["pdb_chain_id"].append(chn.name) + continue + already_seen.append(up_acs[i]) upkb = _get_upkb_for_sequence(sqe, up_acs[i]) cif_ent["pdb_sequence"] = sqe - cif_ent["pdb_chain_id"] = chn.name + cif_ent["pdb_chain_id"] = [chn.name] cif_ent["description"] = ( f"{upkb['up_organism']} {upkb['up_gn']} " f"({upkb['up_ac']})" ) @@ -712,7 +751,8 @@ def _get_modelcif_entities(target_ents, source, asym_units, system): ) ], ) - asym_units[cif_ent["pdb_chain_id"]] = modelcif.AsymUnit(mdlcif_ent) + for pdb_chain_id in cif_ent["pdb_chain_id"]: + asym_units[pdb_chain_id] = modelcif.AsymUnit(mdlcif_ent) system.target_entities.append(mdlcif_ent)