Skip to content
Snippets Groups Projects
Commit 3fd151a3 authored by B13nch3n's avatar B13nch3n
Browse files

Update translationscript

parent de569684
No related branches found
No related tags found
No related merge requests found
...@@ -228,6 +228,11 @@ def _abort_msg(msg, exit_code=1): ...@@ -228,6 +228,11 @@ def _abort_msg(msg, exit_code=1):
sys.exit(exit_code) sys.exit(exit_code)
def _warn_msg(msg):
"""Write a warning message to stdout."""
print(f"WARNING: {msg}")
def _check_file(file_path): def _check_file(file_path):
"""Make sure a file exists and is actually a file.""" """Make sure a file exists and is actually a file."""
if not os.path.exists(file_path): if not os.path.exists(file_path):
...@@ -292,9 +297,8 @@ def _parse_colabfold_config(cnfg_file): ...@@ -292,9 +297,8 @@ def _parse_colabfold_config(cnfg_file):
use_mmseqs = False use_mmseqs = False
use_msa = False use_msa = False
elif cf_config["msa_mode"] == "custom": elif cf_config["msa_mode"] == "custom":
print( _warn_msg(
"WARNING: Custom MSA mode used. Not clear from config what to do " "Custom MSA mode used. Not clear from config what to do here!"
+ "here!"
) )
seq_dbs = [] seq_dbs = []
use_mmseqs = False use_mmseqs = False
...@@ -336,9 +340,9 @@ def _parse_colabfold_config(cnfg_file): ...@@ -336,9 +340,9 @@ def _parse_colabfold_config(cnfg_file):
else: else:
mdl_description += ", without model relaxation" mdl_description += ", without model relaxation"
if cf_config["use_templates"]: if cf_config["use_templates"]:
print( _warn_msg(
"WARNING: ColabFold may use PDB70 or custom templates. " "ColabFold may use PDB70 or custom templates. Not clear "
"Not clear from config!" + "from config!"
) )
mdl_description += ", using templates" mdl_description += ", using templates"
else: else:
...@@ -553,12 +557,22 @@ def _get_sequence(chn): ...@@ -553,12 +557,22 @@ def _get_sequence(chn):
def _check_sequence(up_ac, sequence): def _check_sequence(up_ac, sequence):
"""Verify sequence to only contain standard olc.""" """Verify sequence to only contain standard olc."""
for res in sequence: ns_aa_pos = [] # positions of non-standard amino acids
for i, res in enumerate(sequence):
if res not in "ACDEFGHIKLMNPQRSTVWY": if res not in "ACDEFGHIKLMNPQRSTVWY":
if res == "U":
_warn_msg(
f"Selenocysteine found at position {i+1} of entry "
+ f"'{up_ac}', this residue may be missing in the "
+ "model."
)
ns_aa_pos.append(i)
continue
raise RuntimeError( raise RuntimeError(
"Non-standard aa found in UniProtKB sequence " "Non-standard aa found in UniProtKB sequence "
+ f"for entry '{up_ac}': {res}" + f"for entry '{up_ac}': {res}, position {i+1}"
) )
return ns_aa_pos
def _get_n_parse_up_entry(up_ac, up_url): def _get_n_parse_up_entry(up_ac, up_url):
...@@ -626,7 +640,11 @@ def _get_n_parse_up_entry(up_ac, up_url): ...@@ -626,7 +640,11 @@ def _get_n_parse_up_entry(up_ac, up_url):
data["up_isoform"] = None data["up_isoform"] = None
if "up_gn" not in data: if "up_gn" not in data:
_abort_msg(f"No gene name found for UniProtKB entry '{up_ac}'.") _warn_msg(
f"No gene name found for UniProtKB entry '{up_ac}', using "
+ "UniProtKB AC instead."
)
data["up_gn"] = up_ac
if "up_last_mod" not in data: if "up_last_mod" not in data:
_abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.") _abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.")
if "up_crc64" not in data: if "up_crc64" not in data:
...@@ -640,7 +658,7 @@ def _get_n_parse_up_entry(up_ac, up_url): ...@@ -640,7 +658,7 @@ def _get_n_parse_up_entry(up_ac, up_url):
+ f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != " + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != "
+ f"{len(data['up_sequence'])}" + f"{len(data['up_sequence'])}"
) )
_check_sequence(data["up_ac"], data["up_sequence"]) data["up_ns_aa"] = _check_sequence(data["up_ac"], data["up_sequence"])
if "up_id" not in data: if "up_id" not in data:
_abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.") _abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.")
...@@ -668,10 +686,24 @@ def _fetch_unisave_entry(up_ac, version): ...@@ -668,10 +686,24 @@ def _fetch_unisave_entry(up_ac, version):
) )
def _cmp_sequences(mdl, upkb, ns_aa_pos):
"""Compare sequence while paying attention on non-standard amino acids."""
# We add a U to the sequence when necessary. AF2 does not model it. The PDB
# has selenocysteine as canonical aa, see PDB entry 7Z0T.
for pos in ns_aa_pos:
if mdl[pos] != "-":
_abort_msg(
f"Position {pos+1} of non-canonical amino acid should be "
"a gap!"
)
mdl = mdl[0:pos] + "U" + mdl[pos + 1 :]
return mdl == upkb
def _get_upkb_for_sequence(sqe, up_ac): def _get_upkb_for_sequence(sqe, up_ac):
"""Get UniProtKB entry data for given sequence.""" """Get UniProtKB entry data for given sequence."""
up_data = _fetch_upkb_entry(up_ac) up_data = _fetch_upkb_entry(up_ac)
while sqe != up_data["up_sequence"]: while not _cmp_sequences(sqe, up_data["up_sequence"], up_data["up_ns_aa"]):
if up_data["up_entry_version"] > 1: if up_data["up_entry_version"] > 1:
up_data = _fetch_unisave_entry( up_data = _fetch_unisave_entry(
up_ac, up_data["up_entry_version"] - 1 up_ac, up_data["up_entry_version"] - 1
...@@ -732,7 +764,10 @@ def _get_modelcif_entities(target_ents, source, asym_units, system): ...@@ -732,7 +764,10 @@ def _get_modelcif_entities(target_ents, source, asym_units, system):
"""Create ModelCIF entities and asymmetric units.""" """Create ModelCIF entities and asymmetric units."""
for cif_ent in target_ents: for cif_ent in target_ents:
mdlcif_ent = modelcif.Entity( mdlcif_ent = modelcif.Entity(
cif_ent["pdb_sequence"], # Use up_sequence above pdb_sequence as it contains no gaps but
# otherwise is equal to the pdb_sequence. Gaps in pdb_sequence
# originate from missing residues in the model.
cif_ent["up_sequence"],
description=cif_ent["description"], description=cif_ent["description"],
source=source, source=source,
references=[ references=[
...@@ -1111,3 +1146,5 @@ def _main(): ...@@ -1111,3 +1146,5 @@ def _main():
if __name__ == "__main__": if __name__ == "__main__":
_main() _main()
# LocalWords: aa pdb
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment