Skip to content
Snippets Groups Projects
Commit 4f19d88e authored by B13nch3n's avatar B13nch3n
Browse files

Fix homomers and outdated UniProtKB sequences

parent b89be40c
No related branches found
No related tags found
No related merge requests found
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
This project consists of around 800 heterodimer models for the human reference proteome. Modelling was done with [ColabFold](https://colabfold.mmseqs.com)/ [LocalColabFold](https://github.com/YoshitakaMo/localcolabfold). Model selection is special in a sense that for some heterodimers experimental crosslinking data is available guiding the choice, otherwise top-ranking models are used. This project consists of around 800 heterodimer models for the human reference proteome. Modelling was done with [ColabFold](https://colabfold.mmseqs.com)/ [LocalColabFold](https://github.com/YoshitakaMo/localcolabfold). Model selection is special in a sense that for some heterodimers experimental crosslinking data is available guiding the choice, otherwise top-ranking models are used.
- special for this set: has a few homomers
- special for this set: outdated UniProtKB sequences are traced back to a matching version in the entry history using UniSave
<how are the ModelCIF files created using this software> <how are the ModelCIF files created using this software>
These models qualify as "de novo modelling". These models qualify as "de novo modelling".
......
...@@ -561,8 +561,8 @@ def _check_sequence(up_ac, sequence): ...@@ -561,8 +561,8 @@ def _check_sequence(up_ac, sequence):
) )
def _fetch_upkb_entry(up_ac): def _get_n_parse_up_entry(up_ac, up_url):
"""Fetch data for an UniProtKB entry.""" """Get data for an UniProtKB entry and parse it."""
# This is a simple parser for UniProtKB txt format, instead of breaking it # This is a simple parser for UniProtKB txt format, instead of breaking it
# up into multiple functions, we just allow many many branches & statements, # up into multiple functions, we just allow many many branches & statements,
# here. # here.
...@@ -571,9 +571,7 @@ def _fetch_upkb_entry(up_ac): ...@@ -571,9 +571,7 @@ def _fetch_upkb_entry(up_ac):
data["up_organism"] = "" data["up_organism"] = ""
data["up_sequence"] = "" data["up_sequence"] = ""
data["up_ac"] = up_ac data["up_ac"] = up_ac
rspns = requests.get( rspns = requests.get(up_url, timeout=180)
f"https://www.uniprot.org/uniprot/{up_ac}.txt", timeout=180
)
for line in rspns.iter_lines(decode_unicode=True): for line in rspns.iter_lines(decode_unicode=True):
if line.startswith("ID "): if line.startswith("ID "):
sline = line.split() sline = line.split()
...@@ -615,6 +613,11 @@ def _fetch_upkb_entry(up_ac): ...@@ -615,6 +613,11 @@ def _fetch_upkb_entry(up_ac):
data["up_last_mod"] = datetime.datetime.strptime( data["up_last_mod"] = datetime.datetime.strptime(
dt_flds[0], "%d-%b-%Y" dt_flds[0], "%d-%b-%Y"
) )
elif dt_flds[1].upper().startswith("ENTRY VERSION "):
data["up_entry_version"] = dt_flds[1][len("ENTRY VERSION ") :]
if data["up_entry_version"][-1] == ".":
data["up_entry_version"] = data["up_entry_version"][:-1]
data["up_entry_version"] = int(data["up_entry_version"])
elif line.startswith("GN Name="): elif line.startswith("GN Name="):
data["up_gn"] = line[len("GN Name=") :].split(";")[0] data["up_gn"] = line[len("GN Name=") :].split(";")[0]
data["up_gn"] = data["up_gn"].split("{")[0].strip() data["up_gn"] = data["up_gn"].split("{")[0].strip()
...@@ -648,28 +651,64 @@ def _fetch_upkb_entry(up_ac): ...@@ -648,28 +651,64 @@ def _fetch_upkb_entry(up_ac):
return data return data
def _fetch_upkb_entry(up_ac):
    """Fetch and parse the UniProtKB entry for an accession.

    Builds the UniProt REST URL of the plain-text (``.txt``) entry for
    ``up_ac`` (no version is specified in the URL) and hands both the
    accession and the URL to ``_get_n_parse_up_entry``, returning its
    result unchanged.
    """
    entry_url = f"https://rest.uniprot.org/uniprotkb/{up_ac}.txt"
    return _get_n_parse_up_entry(up_ac, entry_url)
def _fetch_unisave_entry(up_ac, version):
    """Fetch and parse one specific version of an entry from UniSave.

    In contrast to the plain UniProtKB entry URL, the UniSave URL carries a
    ``versions`` query parameter, so a particular entry version can be
    requested. The accession and the URL are handed to
    ``_get_n_parse_up_entry``, whose result is returned unchanged.
    """
    # Implicit literal concatenation; yields the same URL as joining the two
    # parts with "+".
    unisave_url = (
        f"https://rest.uniprot.org/unisave/{up_ac}"
        f"?format=txt&versions={version}"
    )
    return _get_n_parse_up_entry(up_ac, unisave_url)
def _get_upkb_for_sequence(sqe, up_ac): def _get_upkb_for_sequence(sqe, up_ac):
"""Get UniProtKB entry data for given sequence.""" """Get UniProtKB entry data for given sequence."""
up_data = _fetch_upkb_entry(up_ac) up_data = _fetch_upkb_entry(up_ac)
if sqe != up_data["up_sequence"]: while sqe != up_data["up_sequence"]:
raise RuntimeError( if up_data["up_entry_version"] > 1:
f"Sequences not equal from file: {sqe}, from UniProtKB: " up_data = _fetch_unisave_entry(
+ f"{up_data['up_sequence']}" up_ac, up_data["up_entry_version"] - 1
) )
else:
raise RuntimeError(
f"Sequences not equal from file: {sqe}, from UniProtKB: "
f"{up_data['up_sequence']} ({up_ac}), checked entire entry "
"history."
)
return up_data return up_data
def _get_entities(pdb_file, up_acs): def _get_entities(pdb_file, up_acs):
"""Gather data for the mmCIF (target) entities.""" """Gather data for the mmCIF (target) entities."""
entities = [] entities = []
ost_ent = io.LoadPDB(pdb_file) ost_ent = io.LoadPDB(pdb_file)
already_seen = []
for i, chn in enumerate(ost_ent.chains): for i, chn in enumerate(ost_ent.chains):
cif_ent = {} cif_ent = {}
sqe = _get_sequence(chn) sqe = _get_sequence(chn)
try:
e_idx = already_seen.index(up_acs[i])
except ValueError:
pass
else:
if sqe != entities[e_idx]["pdb_sequence"]:
_abort_msg(
"Sequences are different for two chains of the same "
"UniProtKB AC. This case is not implemented, yet."
)
entities[e_idx]["pdb_chain_id"].append(chn.name)
continue
already_seen.append(up_acs[i])
upkb = _get_upkb_for_sequence(sqe, up_acs[i]) upkb = _get_upkb_for_sequence(sqe, up_acs[i])
cif_ent["pdb_sequence"] = sqe cif_ent["pdb_sequence"] = sqe
cif_ent["pdb_chain_id"] = chn.name cif_ent["pdb_chain_id"] = [chn.name]
cif_ent["description"] = ( cif_ent["description"] = (
f"{upkb['up_organism']} {upkb['up_gn']} " f"({upkb['up_ac']})" f"{upkb['up_organism']} {upkb['up_gn']} " f"({upkb['up_ac']})"
) )
...@@ -712,7 +751,8 @@ def _get_modelcif_entities(target_ents, source, asym_units, system): ...@@ -712,7 +751,8 @@ def _get_modelcif_entities(target_ents, source, asym_units, system):
) )
], ],
) )
asym_units[cif_ent["pdb_chain_id"]] = modelcif.AsymUnit(mdlcif_ent) for pdb_chain_id in cif_ent["pdb_chain_id"]:
asym_units[pdb_chain_id] = modelcif.AsymUnit(mdlcif_ent)
system.target_entities.append(mdlcif_ent) system.target_entities.append(mdlcif_ent)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment