Skip to content
Snippets Groups Projects
Commit 68ad8fb6 authored by Bienchen
Browse files

Add selection step

parent 6b9c2544
No related branches found
No related tags found
No related merge requests found
......@@ -249,7 +249,6 @@ def _check_model_extra_files_present(model_dir, pdb_file):
def _get_audit_authors():
"""Return the list of authors that produced this model."""
# ToDo: tell Xabi that his name can't have a á in mmCIF
return (
"Bartolec, T.",
"Vazquez-Campos, X.",
......@@ -289,7 +288,8 @@ def _parse_colabfold_config(cnfg_file):
use_msa = False
elif cf_config["msa_mode"] == "custom":
print(
"WARNING: Custom MSA mode used. Not clear from config what to do here!"
"WARNING: Custom MSA mode used. Not clear from config what to do "
+ "here!"
)
seq_dbs = []
use_mmseqs = False
......@@ -311,63 +311,71 @@ def _parse_colabfold_config(cnfg_file):
else:
raise ValueError(f"Unknown model_type {cf_config['model_type']}")
# write description
description = f"Model generated using ColabFold v{cf_config['version']}"
# write modeling description
mdl_description = f"Model generated using ColabFold v{cf_config['version']}"
if use_multimer:
description += f" with AlphaFold-Multimer (v{multimer_version})"
mdl_description += f" with AlphaFold-Multimer (v{multimer_version})"
else:
description += " with AlphaFold"
mdl_description += " with AlphaFold"
if cf_config["stop_at_score"] < 100:
# early stopping feature of ColabFold
upto = "up to "
else:
upto = ""
description += (
mdl_description += (
f" producing {upto}{cf_config['num_models']} models"
f" with {upto}{cf_config['num_recycles']} recycles each"
)
if cf_config["use_amber"]:
description += ", with AMBER relaxation"
mdl_description += ", with AMBER relaxation"
else:
description += ", without model relaxation"
mdl_description += ", without model relaxation"
if cf_config["use_templates"]:
print(
"WARNING: ColabFold may use PDB70 or custom templates. "
"Not clear from config!"
)
description += ", using templates"
mdl_description += ", using templates"
else:
description += ", without templates"
mdl_description += ", without templates"
if cf_config["rank_by"] == "plddt":
description += ", ranked by pLDDT"
mdl_description += ", ranked by pLDDT"
elif cf_config["rank_by"] == "ptmscore":
description += ", ranked by pTM"
mdl_description += ", ranked by pTM"
elif cf_config["rank_by"] == "multimer":
description += ", ranked by ipTM*0.8+pTM*0.2"
mdl_description += ", ranked by ipTM*0.8+pTM*0.2"
else:
raise ValueError(f"Unknown rank_by {cf_config['rank_by']}")
if use_msa:
description += ", starting from"
mdl_description += ", starting from"
if use_mmseqs:
msa_type = "MSA"
else:
msa_type = "custom MSA"
if use_multimer:
if cf_config["pair_mode"] == "unpaired+paired":
description += f" paired and unpaired {msa_type}s"
mdl_description += f" paired and unpaired {msa_type}s"
elif cf_config["pair_mode"] == "paired":
description += f" paired {msa_type}s"
mdl_description += f" paired {msa_type}s"
elif cf_config["pair_mode"] == "unpaired":
description += f" unpaired {msa_type}s"
mdl_description += f" unpaired {msa_type}s"
else:
raise ValueError(f"Unknown pair_mode {cf_config['pair_mode']}")
else:
description += f" an {msa_type}"
mdl_description += f" an {msa_type}"
if use_mmseqs:
description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
mdl_description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
else:
description += " without an MSA"
description += "."
mdl_description += " without an MSA"
mdl_description += "."
# write selection description
slct_description = (
"Select best model, which is either the top-ranked model as "
+ "determined by the ColabFold pipeline (ipTM*0.8+pTM*0.2), or else "
+ "the model with best congruence with crosslinks reported in the "
+ "related study."
)
return {
"config": cf_config,
......@@ -376,7 +384,8 @@ def _parse_colabfold_config(cnfg_file):
"use_msa": use_msa,
"use_multimer": use_multimer,
"multimer_version": multimer_version,
"description": description,
"modeling_description": mdl_description,
"selection_description": slct_description,
}
......@@ -388,7 +397,7 @@ def _get_protocol_steps_and_software(config_data):
step = {
"method_type": "modeling",
"name": None,
"details": config_data["description"],
"details": config_data["modeling_description"],
}
# get input data
# Must refer to data already in the JSON, so we try keywords
......@@ -415,21 +424,7 @@ def _get_protocol_steps_and_software(config_data):
"classification": "data collection",
"description": "Many-against-Many sequence searching",
# ToDo: add citation to ihm.citations
"citation": ihm.Citation(
pmid="30615063",
title="MMseqs2 desktop and local web server app for fast, "
+ "interactive sequence searches.",
journal="Bioinformatics",
volume=35,
page_range=(2856, 2858),
year=2019,
authors=[
"Mirdita, M.",
"Steinegger, M.",
"Soeding, J.",
],
doi="10.1093/bioinformatics/bty1057",
),
"citation": ihm.citations.mmseqs2,
"location": "https://github.com/soedinglab/mmseqs2",
"type": "package",
"version": None,
......@@ -496,29 +491,22 @@ def _get_protocol_steps_and_software(config_data):
protocol.append(step)
# model selection step
# ToDo [input/ internal]: model selection step on a single model is a bit
# silly, how do we get a list of models?
# GT-NOTES:
# - input/output should be ok without list of models
# - rank of model is already stored in _ma_model_list.model_name and
# _ma_data.name (in _store_as_modelcif)
# - ColabFold ranking details is already in details of step above.
# - Suggestion: add extra step only if AF-ranking was overruled and
# include it in step above.
# step = {
# "method_type": "model selection",
# "name": "ma_protocol_step.step_name",
# "details": "Select best model, which is either the top-ranked model "
# + "as determined by the ColabFold pipeline "
# + "(ipTM*0.8+pTM*0.2), or else the model with best "
# + "congruence with crosslinks reported in the related study.",
# }
# step["input"] = "model"
# step["output"] = "model"
# step["software"] = []
# step["software_parameters"] = {}
# protocol.append(step)
if (
"selection_description" not in config_data
or len(config_data["selection_description"]) == 0
):
return protocol
step = {
"method_type": "model selection",
"name": None,
"details": config_data["selection_description"],
}
step["input"] = "model"
step["output"] = "model"
step["software"] = []
step["software_parameters"] = {}
protocol.append(step)
return protocol
......@@ -584,7 +572,9 @@ def _fetch_upkb_entry(up_ac):
data["up_organism"] = ""
data["up_sequence"] = ""
data["up_ac"] = up_ac
rspns = requests.get(f"https://www.uniprot.org/uniprot/{up_ac}.txt")
rspns = requests.get(
f"https://www.uniprot.org/uniprot/{up_ac}.txt", timeout=180
)
for line in rspns.iter_lines(decode_unicode=True):
if line.startswith("ID "):
sline = line.split()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment