diff --git a/projects/novelfams/translate2modelcif.py b/projects/novelfams/translate2modelcif.py
index 22b22b7fea948c3fff1ba70952abe27c88b8509c..8cfbf839eb16606be308e80ee7949e45b33df531 100644
--- a/projects/novelfams/translate2modelcif.py
+++ b/projects/novelfams/translate2modelcif.py
@@ -6,6 +6,7 @@
 
 from timeit import default_timer as timer
 import argparse
+import datetime
 import gzip
 import os
 import shutil
@@ -494,9 +495,45 @@ def _get_sequence_dbs_colabfold(seq_dbs):
     return [db_dict[seq_db] for seq_db in seq_dbs]
 
 
+def _get_sequence_dbs_alphafold(seq_dbs):
+    """Get ModelCIF reference DB objects for the named AlphaFold seq. DBs."""
+    db_dict = {
+        "MGnify": modelcif.ReferenceDatabase(
+            "MGnify",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "casp14_versions/mgy_clusters_2018_12.fa.gz",
+            version="2018_12",
+            release_date=datetime.datetime(2018, 12, 6),
+        ),
+        "UniRef90": modelcif.ReferenceDatabase(
+            "UniRef90",
+            "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/"
+            + "uniref90.fasta.gz",
+            version=None,
+            release_date=None,
+        ),
+        "BFD": modelcif.ReferenceDatabase(
+            "BFD",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "casp14_versions/"
+            + "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz",
+            version="6a634dc6eb105c2e9b4cba7bbae93412",
+        ),
+        "Uniclust30": modelcif.ReferenceDatabase(
+            "Uniclust30",
+            "https://storage.googleapis.com/alphafold-databases/"
+            + "casp14_versions/uniclust30_2018_08_hhsuite.tar.gz",
+            version="2018_08",
+            release_date=None,
+        ),
+    }
+    return [db_dict[seq_db] for seq_db in seq_dbs]
+
+
 def _get_modelcif_protocol_data(data_labels, target_entities, model, msa):
     """Assemble data for a ModelCIF protocol step."""
     data = modelcif.data.DataGroup()
+
     for label in data_labels:
         if label == "target_sequences":
             data.extend(target_entities)
@@ -506,6 +543,12 @@ def _get_modelcif_protocol_data(data_labels, target_entities, model, msa):
             data.extend(
                 _get_sequence_dbs_colabfold(["UniRef", "Environmental"])
             )
+        elif label == "alphafold_reference_dbs":
+            data.extend(
+                _get_sequence_dbs_alphafold(
+                    ["MGnify", "UniRef90", "BFD", "Uniclust30"]
+                )
+            )
         elif label == "msas":
             data.append(msa)
         else:
@@ -684,7 +727,6 @@ def _get_protocol_steps_and_software_colabfold(config_data):
     protocol = []
 
     # MSA step
-    # Step 1 - MSA: Using default Colabfold databases with default parameters (colabfold_envdb_202108, uniref30_2202)
     step = {
         "method_type": "coevolution MSA",
         "name": None,
@@ -738,30 +780,50 @@ def _get_config_colabfold():
 
 def _get_config_alphafold():
     """Get config variables for AlphaFold"""
-    description = "Predict model coordinates using AlphaFold."
+    af2_version = "2.2.0"
+    msa_description = (
+        "MSAs created for corresponding target sequence with AlphaFold using "
+        + "default parameters."
+    )
 
-    return {"description": description}
+    mdl_description = (
+        f"Model generated using AlphaFold ({af2_version} with default "
+        + "parameters) producing 5 models, ranked by pLDDT, starting from the "
+        + f"AlphaFold {af2_version} produced MSA."
+    )
+
+    return {
+        "af2_version": af2_version,
+        "msa_description": msa_description,
+        "mdl_description": mdl_description,
+    }
 
 
 def _get_protocol_steps_and_software_alphafold(config_data):
     """Get protocol steps for AF2 based models."""
     protocol = []
 
+    # MSA generation
+    step = {
+        "method_type": "coevolution MSA",
+        "name": None,
+        "details": config_data["msa_description"],
+        "input": ["target_sequences", "alphafold_reference_dbs"],
+        "output": ["msas"],
+        "software": [_get_af2_software(config_data["af2_version"])],
+        "software_parameters": None,
+    }
+    protocol.append(step)
     # modelling step
     step = {
         "method_type": "modeling",
         "name": None,
-        "details": config_data["description"],
+        "details": config_data["mdl_description"],
+        "input": ["target_sequences"],
+        "output": ["model"],
+        "software": [_get_af2_software(config_data["af2_version"])],
+        "software_parameters": None,
     }
-    # get input data
-    # Must refer to data already in the JSON, so we try keywords
-    step["input"] = ["target_sequences"]
-    # get output data
-    # Must refer to existing data, so we try keywords
-    step["output"] = ["model"]
-    # get software
-    step["software"] = [_get_af2_software("2.2.0")]
-    step["software_parameters"] = None
     protocol.append(step)
 
     return protocol