diff --git a/.gitignore b/.gitignore index a60f727325ff9d5e54be75aeec29ea1118ff5f8f..a81a1ca17283cda57e3c34be881874e31dea4384 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ A0A1B0GTU1-O75152* -.*~ \ No newline at end of file +.*~ +.docker-bash-history +.DS_Store diff --git a/translate2modelcif.py b/translate2modelcif.py index 6a3ded6411c754526fbd519e878ef39151aab906..4e9e9b2620a8c3d8058ad0fecd39f7f3b3da0be8 100644 --- a/translate2modelcif.py +++ b/translate2modelcif.py @@ -16,6 +16,7 @@ import modelcif import modelcif.associated import modelcif.dumper import modelcif.model +import modelcif.protocol import modelcif.reference from ost import io @@ -218,6 +219,69 @@ def _get_audit_authors(): return ("Foo B", "Bar F") +def _get_protocol_steps_and_software(trg_ents): + """Create the list of protocol steps with software and parameters used.""" + protocol = [] + + # modelling step + step = { + "method_type": "modeling", + "name": "ma_protocol_step.step_name", + "details": "ma_protocol_step.details", + } + # get input data + # Must refer to data already in the JSON, so we try keywords + step["input"] = "target_sequences" + # get output data + # Must refer to existing data, so we try keywords + step["output"] = "model" + # get software + step["software"] = { + "name": "ColabFold", + "classification": "model building", + "description": "software.description", + "citation": { + "pmid": None, + "title": "ColabFold - Making protein folding accessible to all", + "journal": "bioRxiv", + "volume": None, + "page_range": None, + "year": 2022, + "authors": [ + "Mirdita M", + "Schütze K", + "Moriwaki Y", + "Heo L", + "Ovchinnikov S", + "Steinegger M", + ], + "doi": "10.1101/2021.08.15.456425", + }, + "location": "https://github.com/sokrypton/ColabFold", + "type": "package", + "version": "software.version", + } + # get parameters + step["software_parameters"] = {} + protocol.append(step) + + # model selection step + # ToDo [input/ internal]: model selection step on a single model is a bit + # silly, how do we get a list of models? + step = { + "method_type": "model selection", + "name": "ma_protocol_step.step_name", + "details": "ma_protocol_step.details", + } + step["input"] = "model" + step["output"] = "model" + step["software"] = {} + step["software_parameters"] = {} + protocol.append(step) + + return protocol + + def _get_title(): """Get a title for this modelling experiment.""" # ToDo [input]: Add title @@ -379,28 +443,9 @@ def _get_scores(data, prfx): data.update(scrs_json) -def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx): - """Mix all the data into a ModelCIF file.""" - print(" generating ModelCIF objects...", end="") - pstart = timer() - # ToDo [internal]: Get protocol/ software - # ToDo [internal]: Get QA metrics - # create system to gather all the data - system = modelcif.System( - title=data_json["title"], - id=interaction_name.upper(), - model_details=data_json["model_details"], - ) - # create target entities, references, source, asymmetric units & assembly - # for source we assume all chains come from the same taxon - source = ihm.source.Natural( - ncbi_taxonomy_id=data_json["target_entities"][0]["up_ncbi_taxid"], - scientific_name=data_json["target_entities"][0]["up_organism"], - ) - - # create an asymmetric unit and an entity per target sequence - asym_units = {} - for cif_ent in data_json["target_entities"]: +def _get_modelcif_entities(target_ents, source, asym_units, system): + """Create ModelCIF entities and asymmetric units.""" + for cif_ent in target_ents: # ToDo [input]: Get entity description mdlcif_ent = modelcif.Entity( cif_ent["pdb_sequence"], @@ -430,6 +475,31 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx): ) system.target_entities.append(mdlcif_ent) + +def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx): + """Mix all the data into a ModelCIF file.""" + print(" generating ModelCIF objects...", end="") + pstart = timer() + # ToDo [internal]: Get protocol/ software + # create system to gather all the data + system = modelcif.System( + title=data_json["title"], + id=interaction_name.upper(), + model_details=data_json["model_details"], + ) + # create target entities, references, source, asymmetric units & assembly + # for source we assume all chains come from the same taxon + source = ihm.source.Natural( + ncbi_taxonomy_id=data_json["target_entities"][0]["up_ncbi_taxid"], + scientific_name=data_json["target_entities"][0]["up_organism"], + ) + + # create an asymmetric unit and an entity per target sequence + asym_units = {} + _get_modelcif_entities( + data_json["target_entities"], source, asym_units, system + ) + # ToDo [input]: Get Assembly name assembly = modelcif.Assembly( asym_units.values(), name="ma_struct_assembly_details.assembly_name" @@ -460,6 +530,56 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx): ) system.model_groups.append(model_group) + # Add protocol + protocol = modelcif.protocol.Protocol() + for js_step in data_json["protocol"]: + sftwre = None + # ToDo [input]: Turn into software group if parameters are available + # ToDo [input]: Get software.description + # ToDo [input]: Get software.version + if js_step["software"]: + sftwre = modelcif.Software( + js_step["software"]["name"], + js_step["software"]["classification"], + js_step["software"]["description"], + js_step["software"]["location"], + js_step["software"]["type"], + js_step["software"]["version"], + citation=ihm.Citation( + pmid=js_step["software"]["citation"]["pmid"], + title=js_step["software"]["citation"]["title"], + journal=js_step["software"]["citation"]["journal"], + volume=js_step["software"]["citation"]["volume"], + page_range=js_step["software"]["citation"]["page_range"], + year=js_step["software"]["citation"]["year"], + authors=js_step["software"]["citation"]["authors"], + doi=js_step["software"]["citation"]["doi"], + ), + ) + if js_step["input"] == "target_sequences": + input_data = modelcif.data.DataGroup(system.target_entities) + elif js_step["input"] == "model": + input_data = model + else: + raise RuntimeError(f"Unknown protocol input: '{js_step['input']}'") + if js_step["output"] == "model": + output_data = model + else: + raise RuntimeError( + f"Unknown protocol output: '{js_step['output']}'" + ) + protocol.steps.append( + modelcif.protocol.Step( + input_data=input_data, + output_data=output_data, + name=js_step["name"], + details=js_step["details"], + software=sftwre, + ) + ) + protocol.steps[-1].method_type = js_step["method_type"] + system.protocols.append(protocol) + # write modelcif System to file print(" write to disk...", end="", flush=True) pstart = timer() @@ -484,6 +604,7 @@ def _create_model_json(data, pdb_file, up_acs): """Create a dictionary (mimicking JSON) that contains all the data.""" data["target_entities"], ost_ent = _get_entities(pdb_file, up_acs) + data["protocol"] = _get_protocol_steps_and_software(data["target_entities"]) return ost_ent