Skip to content
Snippets Groups Projects
Commit 07cf74e2 authored by Bienchen's avatar Bienchen
Browse files

Protocol for ColabFold

parent 27be2ce6
Branches
No related tags found
No related merge requests found
......@@ -301,8 +301,6 @@ def _get_sequence(chn, use_auth=False):
lst_rn += 1
sqe += res.one_letter_code
if "-" in sqe:
print("GAP")
return sqe
......@@ -347,7 +345,11 @@ def _get_entities(pdb_file, fam_name, trg_seq):
if len_diff > 0:
exp_seq += "X" * len_diff
if exp_seq != trg_seq.string:
print(f"Sequence in {pdb_file} does not match target.")
print(
f"Sequence in {os.path.splitext(os.path.basename(pdb_file))[0]} "
+ "does not match target.",
exp_seq,
)
# ToDo: re-enable check
# raise RuntimeError(f"Sequence in {pdb_file} does not match target.")
......@@ -474,27 +476,59 @@ def _get_modelcif_protocol_software(js_step):
return None
def _get_modelcif_protocol_data(data_label, target_entities, model):
def _get_sequence_dbs_colabfold(seq_dbs):
    """Translate ColabFold sequence DB keys into ModelCIF reference DBs.

    :param seq_dbs: Iterable of database keys; the known keys are
                    "UniRef" and "Environmental".
    :return: List of modelcif.ReferenceDatabase objects, one per entry of
             seq_dbs, in the same order.
    :raises KeyError: If a key in seq_dbs is not one of the known keys.
    """
    # Both database objects are created on every call, whether or not they
    # end up being requested.
    uniref30 = modelcif.ReferenceDatabase(
        "UniRef30",
        "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2202.tar.gz",
        version="2022_02",
    )
    environmental = modelcif.ReferenceDatabase(
        "ColabFold DB",
        "http://wwwuser.gwdg.de/~compbiol/colabfold/"
        + "colabfold_envdb_202108.tar.gz",
        version="2021_08",
    )
    known_dbs = {"UniRef": uniref30, "Environmental": environmental}

    selected = []
    for db_key in seq_dbs:
        selected.append(known_dbs[db_key])
    return selected
def _get_modelcif_protocol_data(data_labels, target_entities, model, msa):
"""Assemble data for a ModelCIF protocol step."""
if data_label == "target_sequences":
data = modelcif.data.DataGroup(target_entities)
elif data_label == "model":
data = model
else:
raise RuntimeError(f"Unknown protocol data: '{data_label}'")
data = modelcif.data.DataGroup()
for label in data_labels:
if label == "target_sequences":
data.extend(target_entities)
elif label == "model":
data = model
elif label == "colabfold_reference_dbs":
data.extend(
_get_sequence_dbs_colabfold(["UniRef", "Environmental"])
)
elif label == "msas":
data.append(msa)
else:
raise RuntimeError(f"Unknown protocol data: '{label}'")
return data
def _get_modelcif_protocol(protocol_steps, target_entities, model):
"""Create the protocol for the ModelCIF file."""
protocol = modelcif.protocol.Protocol()
# This is a bit inelegant, but we need a single MSA object that can serve
# as output & input and is only referenced once in the ModelCIF file.
msa = modelcif.data.Data(
"MSA", details="MSAs of the target sequence and search DBs."
)
for js_step in protocol_steps:
sftwre = _get_modelcif_protocol_software(js_step)
input_data = _get_modelcif_protocol_data(
js_step["input"], target_entities, model
js_step["input"], target_entities, model, msa
)
output_data = _get_modelcif_protocol_data(
js_step["output"], target_entities, model
js_step["output"], target_entities, model, msa
)
protocol.steps.append(
......@@ -618,7 +652,21 @@ def _store_as_modelcif(
os.chdir(oldpwd)
def _get_af2_software():
def _get_colabfold_software(version=None):
    """Describe ColabFold as a dictionary, suitable to create a modelcif
    software object.

    :param version: Value stored under the "version" key; defaults to None.
    :return: Dictionary with the keys "name", "classification",
             "description", "citation", "location", "type" and "version".
    """
    software = dict(
        name="ColabFold",
        classification="model building",
        description="Structure prediction",
        citation=ihm.citations.colabfold,
        location="https://github.com/sokrypton/ColabFold",
        type="package",
    )
    # The version is the only caller-dependent entry; it is added last so the
    # key order stays the same as in a literal listing it at the end.
    software["version"] = version
    return software
def _get_af2_software(version=None):
"""Get AF2 as dictionary, suitable to create a modelcif software object."""
return {
"name": "AlphaFold",
......@@ -627,7 +675,7 @@ def _get_af2_software():
"citation": ihm.citations.alphafold2,
"location": "https://github.com/deepmind/alphafold",
"type": "package",
"version": None,
"version": version,
}
......@@ -635,32 +683,32 @@ def _get_protocol_steps_and_software_colabfold(config_data):
"""Get protocol steps for ColabFold models."""
protocol = []
# MSA step
# Step 1 - MSA: Using default Colabfold databases with default parameters (colabfold_envdb_202108, uniref30_2202)
step = {
"method_type": "coevolution MSA",
"name": None,
"details": config_data["msa_description"],
"input": ["target_sequences", "colabfold_reference_dbs"],
"output": ["msas"],
"software": [_get_colabfold_software(config_data["cf_version"])],
"software_parameters": None,
}
protocol.append(step)
# modelling step
step = {
"method_type": "modeling",
"name": None,
"details": config_data["description"],
"details": config_data["mdl_description"],
"input": ["target_sequences", "msas"],
"output": ["model"],
"software": [
_get_colabfold_software(config_data["cf_version"]),
_get_af2_software(config_data["af2_version"]),
],
"software_parameters": None,
}
# get input data
# Must refer to data already in the JSON, so we try keywords
step["input"] = "target_sequences"
# get output data
# Must refer to existing data, so we try keywords
step["output"] = "model"
# get software
step["software"] = [
{
"name": "ColabFold",
"classification": "model building",
"description": "Structure prediction",
"citation": ihm.citations.colabfold,
"location": "https://github.com/sokrypton/ColabFold",
"type": "package",
"version": None,
}
]
step["software"].append(_get_af2_software())
step["software_parameters"] = None
protocol.append(step)
return protocol
......@@ -668,14 +716,29 @@ def _get_protocol_steps_and_software_colabfold(config_data):
def _get_config_colabfold():
"""Get config variables for ColabFold"""
description = "Model generation using ColabFold."
af2_version = "2.1.14"
cf_version = "1.3.0"
msa_description = (
"MSAs created for corresponding target sequence with ColabFold using "
+ "default parameters."
)
mdl_description = (
f"Model generated using AlphaFold ({af2_version}, "
+ f"executed within ColabFold {cf_version}) producing 5 models, "
+ "ranked by pLDDT, starting from the ColabFold produced MSA."
)
return {"description": description}
return {
"cf_version": cf_version,
"af2_version": af2_version,
"msa_description": msa_description,
"mdl_description": mdl_description,
}
def _get_config_alphafold():
"""Get config variables for AlphaFold"""
description = "Model generation using AlphaFold."
description = "Predict model coordinates using AlphaFold."
return {"description": description}
......@@ -692,12 +755,12 @@ def _get_protocol_steps_and_software_alphafold(config_data):
}
# get input data
# Must refer to data already in the JSON, so we try keywords
step["input"] = "target_sequences"
step["input"] = ["target_sequences"]
# get output data
# Must refer to existing data, so we try keywords
step["output"] = "model"
step["output"] = ["model"]
# get software
step["software"] = [_get_af2_software()]
step["software"] = [_get_af2_software("2.2.0")]
step["software_parameters"] = None
protocol.append(step)
......@@ -877,6 +940,11 @@ def _main():
)
except (_InvalidCoordinateError, _NoEntitiesError):
continue
except Exception as exc:
# ToDo: remove catching ALL exceptions
_warn_msg(f"Uncaught exception for '{f_name}':")
print(str(exc))
continue
# report progress after a bit of time
if timer() - tmstmp > 60:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment