From 68ad8fb62a76a63c9bb802b4b8215c837df9adfe Mon Sep 17 00:00:00 2001 From: Stefan Bienert <stefan.bienert@unibas.ch> Date: Mon, 3 Oct 2022 14:10:59 +0200 Subject: [PATCH] Add selection step --- translate2modelcif.py | 116 +++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 63 deletions(-) diff --git a/translate2modelcif.py b/translate2modelcif.py index db57ff1..aa90588 100644 --- a/translate2modelcif.py +++ b/translate2modelcif.py @@ -249,7 +249,6 @@ def _check_model_extra_files_present(model_dir, pdb_file): def _get_audit_authors(): """Return the list of authors that produced this model.""" - # ToDo: tell Xabi that his name can't have a á in mmCIF return ( "Bartolec, T.", "Vazquez-Campos, X.", @@ -289,7 +288,8 @@ def _parse_colabfold_config(cnfg_file): use_msa = False elif cf_config["msa_mode"] == "custom": print( - "WARNING: Custom MSA mode used. Not clear from config what to do here!" + "WARNING: Custom MSA mode used. Not clear from config what to do " + + "here!" 
) seq_dbs = [] use_mmseqs = False @@ -311,63 +311,71 @@ def _parse_colabfold_config(cnfg_file): else: raise ValueError(f"Unknown model_type {cf_config['model_type']}") - # write description - description = f"Model generated using ColabFold v{cf_config['version']}" + # write modeling description + mdl_description = f"Model generated using ColabFold v{cf_config['version']}" if use_multimer: - description += f" with AlphaFold-Multimer (v{multimer_version})" + mdl_description += f" with AlphaFold-Multimer (v{multimer_version})" else: - description += " with AlphaFold" + mdl_description += " with AlphaFold" if cf_config["stop_at_score"] < 100: # early stopping feature of ColabFold upto = "up to " else: upto = "" - description += ( + mdl_description += ( f" producing {upto}{cf_config['num_models']} models" f" with {upto}{cf_config['num_recycles']} recycles each" ) if cf_config["use_amber"]: - description += ", with AMBER relaxation" + mdl_description += ", with AMBER relaxation" else: - description += ", without model relaxation" + mdl_description += ", without model relaxation" if cf_config["use_templates"]: print( "WARNING: ColabFold may use PDB70 or custom templates. " "Not clear from config!" 
) - description += ", using templates" + mdl_description += ", using templates" else: - description += ", without templates" + mdl_description += ", without templates" if cf_config["rank_by"] == "plddt": - description += ", ranked by pLDDT" + mdl_description += ", ranked by pLDDT" elif cf_config["rank_by"] == "ptmscore": - description += ", ranked by pTM" + mdl_description += ", ranked by pTM" elif cf_config["rank_by"] == "multimer": - description += ", ranked by ipTM*0.8+pTM*0.2" + mdl_description += ", ranked by ipTM*0.8+pTM*0.2" else: raise ValueError(f"Unknown rank_by {cf_config['rank_by']}") if use_msa: - description += ", starting from" + mdl_description += ", starting from" if use_mmseqs: msa_type = "MSA" else: msa_type = "custom MSA" if use_multimer: if cf_config["pair_mode"] == "unpaired+paired": - description += f" paired and unpaired {msa_type}s" + mdl_description += f" paired and unpaired {msa_type}s" elif cf_config["pair_mode"] == "paired": - description += f" paired {msa_type}s" + mdl_description += f" paired {msa_type}s" elif cf_config["pair_mode"] == "unpaired": - description += f" unpaired {msa_type}s" + mdl_description += f" unpaired {msa_type}s" else: raise ValueError(f"Unknown pair_mode {cf_config['pair_mode']}") else: - description += f" an {msa_type}" + mdl_description += f" an {msa_type}" if use_mmseqs: - description += f" from MMseqs2 ({'+'.join(seq_dbs)})" + mdl_description += f" from MMseqs2 ({'+'.join(seq_dbs)})" else: - description += " without an MSA" - description += "." + mdl_description += " without an MSA" + mdl_description += "." + + # write selection description + slct_description = ( + "Select best model, which is either the top-ranked model as " + + "determined by the ColabFold pipeline (ipTM*0.8+pTM*0.2), or else " + + "the model with best congruence with crosslinks reported in the " + + "related study." 
+ ) return { "config": cf_config, @@ -376,7 +384,8 @@ def _parse_colabfold_config(cnfg_file): "use_msa": use_msa, "use_multimer": use_multimer, "multimer_version": multimer_version, - "description": description, + "modeling_description": mdl_description, + "selection_description": slct_description, } @@ -388,7 +397,7 @@ def _get_protocol_steps_and_software(config_data): step = { "method_type": "modeling", "name": None, - "details": config_data["description"], + "details": config_data["modeling_description"], } # get input data # Must refer to data already in the JSON, so we try keywords @@ -415,21 +424,7 @@ def _get_protocol_steps_and_software(config_data): "classification": "data collection", "description": "Many-against-Many sequence searching", # ToDo: add citation to ihm.citations - "citation": ihm.Citation( - pmid="30615063", - title="MMseqs2 desktop and local web server app for fast, " - + "interactive sequence searches.", - journal="Bioinformatics", - volume=35, - page_range=(2856, 2858), - year=2019, - authors=[ - "Mirdita, M.", - "Steinegger, M.", - "Soeding, J.", - ], - doi="10.1093/bioinformatics/bty1057", - ), + "citation": ihm.citations.mmseqs2, "location": "https://github.com/soedinglab/mmseqs2", "type": "package", "version": None, @@ -496,29 +491,22 @@ def _get_protocol_steps_and_software(config_data): protocol.append(step) # model selection step - # ToDo [input/ internal]: model selection step on a single model is a bit - # silly, how do we get a list of models? - # GT-NOTES: - # - input/output should be ok without list of models - # - rank of model is already stored in _ma_model_list.model_name and - # _ma_data.name (in _store_as_modelcif) - # - ColabFold ranking details is already in details of step above. - # - Suggestion: add extra step only if AF-ranking was overruled and - # include it in step above. 
- - # step = { - # "method_type": "model selection", - # "name": "ma_protocol_step.step_name", - # "details": "Select best model, which is either the top-ranked model " - # + "as determined by the ColabFold pipeline " - # + "(ipTM*0.8+pTM*0.2), or else the model with best " - # + "congruence with crosslinks reported in the related study.", - # } - # step["input"] = "model" - # step["output"] = "model" - # step["software"] = [] - # step["software_parameters"] = {} - # protocol.append(step) + if ( + "selection_description" not in config_data + or len(config_data["selection_description"]) == 0 + ): + return protocol + + step = { + "method_type": "model selection", + "name": None, + "details": config_data["selection_description"], + } + step["input"] = "model" + step["output"] = "model" + step["software"] = [] + step["software_parameters"] = {} + protocol.append(step) return protocol @@ -584,7 +572,9 @@ def _fetch_upkb_entry(up_ac): data["up_organism"] = "" data["up_sequence"] = "" data["up_ac"] = up_ac - rspns = requests.get(f"https://www.uniprot.org/uniprot/{up_ac}.txt") + rspns = requests.get( + f"https://www.uniprot.org/uniprot/{up_ac}.txt", timeout=180 + ) for line in rspns.iter_lines(decode_unicode=True): if line.startswith("ID "): sline = line.split() -- GitLab