diff --git a/projects/novelfams/translate2modelcif.py b/projects/novelfams/translate2modelcif.py index 10b61a0b50c0d34b528dbafda9d78e3e6ae44d57..22b22b7fea948c3fff1ba70952abe27c88b8509c 100644 --- a/projects/novelfams/translate2modelcif.py +++ b/projects/novelfams/translate2modelcif.py @@ -301,8 +301,6 @@ def _get_sequence(chn, use_auth=False): lst_rn += 1 sqe += res.one_letter_code - if "-" in sqe: - print("GAP") return sqe @@ -347,7 +345,11 @@ def _get_entities(pdb_file, fam_name, trg_seq): if len_diff > 0: exp_seq += "X" * len_diff if exp_seq != trg_seq.string: - print(f"Sequence in {pdb_file} does not match target.") + print( + f"Sequence in {os.path.splitext(os.path.basename(pdb_file))[0]} " + + "does not match target.", + exp_seq, + ) # ToDo: re-enable check # raise RuntimeError(f"Sequence in {pdb_file} does not match target.") @@ -474,27 +476,59 @@ def _get_modelcif_protocol_software(js_step): return None -def _get_modelcif_protocol_data(data_label, target_entities, model): +def _get_sequence_dbs_colabfold(seq_dbs): + """Get ColabFold seq. DBs.""" + db_dict = { + "UniRef": modelcif.ReferenceDatabase( + "UniRef30", + "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2202.tar.gz", + version="2022_02", + ), + "Environmental": modelcif.ReferenceDatabase( + "ColabFold DB", + "http://wwwuser.gwdg.de/~compbiol/colabfold/" + + "colabfold_envdb_202108.tar.gz", + version="2021_08", + ), + } + return [db_dict[seq_db] for seq_db in seq_dbs] + + +def _get_modelcif_protocol_data(data_labels, target_entities, model, msa): """Assemble data for a ModelCIF protocol step.""" - if data_label == "target_sequences": - data = modelcif.data.DataGroup(target_entities) - elif data_label == "model": - data = model - else: - raise RuntimeError(f"Unknown protocol data: '{data_label}'") + data = modelcif.data.DataGroup() + for label in data_labels: + if label == "target_sequences": + data.extend(target_entities) + elif label == "model": + data = model + elif label == "colabfold_reference_dbs": + data.extend( + _get_sequence_dbs_colabfold(["UniRef", "Environmental"]) + ) + elif label == "msas": + data.append(msa) + else: + raise RuntimeError(f"Unknown protocol data: '{label}'") + return data def _get_modelcif_protocol(protocol_steps, target_entities, model): """Create the protocol for the ModelCIF file.""" protocol = modelcif.protocol.Protocol() + # This is a bit unelegant, but we need a single MSA object, that can serve + # as output & input and is only referenced once in the ModelCIF file. + msa = modelcif.data.Data( + "MSA", details="MSAs of the target sequence and search DBs." + ) for js_step in protocol_steps: sftwre = _get_modelcif_protocol_software(js_step) input_data = _get_modelcif_protocol_data( - js_step["input"], target_entities, model + js_step["input"], target_entities, model, msa ) output_data = _get_modelcif_protocol_data( - js_step["output"], target_entities, model + js_step["output"], target_entities, model, msa ) protocol.steps.append( @@ -618,7 +652,21 @@ def _store_as_modelcif( os.chdir(oldpwd) -def _get_af2_software(): +def _get_colabfold_software(version=None): + """Get ColabFold as a dictionary, suitable to create a modelcif software + object.""" + return { + "name": "ColabFold", + "classification": "model building", + "description": "Structure prediction", + "citation": ihm.citations.colabfold, + "location": "https://github.com/sokrypton/ColabFold", + "type": "package", + "version": version, + } + + +def _get_af2_software(version=None): """Get AF2 as dictionary, suitable to create a modelcif software object.""" return { "name": "AlphaFold", @@ -627,7 +675,7 @@ def _get_af2_software(): "citation": ihm.citations.alphafold2, "location": "https://github.com/deepmind/alphafold", "type": "package", - "version": None, + "version": version, } @@ -635,32 +683,32 @@ def _get_protocol_steps_and_software_colabfold(config_data): """Get protocol steps for ColabFold models.""" protocol = [] + # MSA step + # Step 1 - MSA: Using default Colabfold databases with default parameters (colabfold_envdb_202108, uniref30_2202) + step = { + "method_type": "coevolution MSA", + "name": None, + "details": config_data["msa_description"], + "input": ["target_sequences", "colabfold_reference_dbs"], + "output": ["msas"], + "software": [_get_colabfold_software(config_data["cf_version"])], + "software_parameters": None, + } + protocol.append(step) + # modelling step step = { "method_type": "modeling", "name": None, - "details": config_data["description"], + "details": config_data["mdl_description"], + "input": ["target_sequences", "msas"], + "output": ["model"], + "software": [ + _get_colabfold_software(config_data["cf_version"]), + _get_af2_software(config_data["af2_version"]), + ], + "software_parameters": None, } - # get input data - # Must refer to data already in the JSON, so we try keywords - step["input"] = "target_sequences" - # get output data - # Must refer to existing data, so we try keywords - step["output"] = "model" - # get software - step["software"] = [ - { - "name": "ColabFold", - "classification": "model building", - "description": "Structure prediction", - "citation": ihm.citations.colabfold, - "location": "https://github.com/sokrypton/ColabFold", - "type": "package", - "version": None, - } - ] - step["software"].append(_get_af2_software()) - step["software_parameters"] = None protocol.append(step) return protocol @@ -668,14 +716,29 @@ def _get_protocol_steps_and_software_colabfold(config_data): def _get_config_colabfold(): """Get config variables for ColabFold""" - description = "Model generation using ColabFold." + af2_version = "2.1.14" + cf_version = "1.3.0" + msa_description = ( + "MSAs created for corresponding target sequence with ColabFold using " + + "default parameters." + ) + mdl_description = ( + f"Model generated using AlphaFold ({af2_version}, " + + f"executed within ColabFold {cf_version}) producing 5 models, " + + "ranked by pLDDT, starting from the ColabFold produced MSA." + ) - return {"description": description} + return { + "cf_version": cf_version, + "af2_version": af2_version, + "msa_description": msa_description, + "mdl_description": mdl_description, + } def _get_config_alphafold(): """Get config variables for AlphaFold""" - description = "Model generation using AlphaFold." + description = "Predict model coordinates using AlphaFold." return {"description": description} @@ -692,12 +755,12 @@ def _get_protocol_steps_and_software_alphafold(config_data): } # get input data # Must refer to data already in the JSON, so we try keywords - step["input"] = "target_sequences" + step["input"] = ["target_sequences"] # get output data # Must refer to existing data, so we try keywords - step["output"] = "model" + step["output"] = ["model"] # get software - step["software"] = [_get_af2_software()] + step["software"] = [_get_af2_software("2.2.0")] step["software_parameters"] = None protocol.append(step) @@ -877,6 +940,11 @@ def _main(): ) except (_InvalidCoordinateError, _NoEntitiesError): continue + except Exception as exc: + # ToDo: remove catching ALL exceptions + _warn_msg(f"Uncaught exception for '{f_name}':") + print(str(exc)) + continue # report progress after a bit of time if timer() - tmstmp > 60: