diff --git a/.spelling b/.spelling index 239bed71dca5930640153c1ce988b7436bd0b2ab..2fbaabd24abefdf135975d298fdb7b9de2efbee5 100644 --- a/.spelling +++ b/.spelling @@ -2,12 +2,14 @@ Biopython CIF DBs FastA +HH-suite Jupyter MSA ModelCIF PAE PDB PPI +Prefilled coevolution modeled modeling diff --git a/convert_to_modelcif.py b/convert_to_modelcif.py index 0612cc8fe8b1b9ff3bc791a68e30f593866a1e6a..5f5bfc093bea19e0a43977545929cdc72b26e367 100755 --- a/convert_to_modelcif.py +++ b/convert_to_modelcif.py @@ -31,6 +31,7 @@ import modelcif.protocol from alphapulldown.utils import make_dir_monomer_dictionary +# ToDo: Software versions can not have a white space, e.g. ColabFold (drop time) # ToDo: DISCUSS Get options properly, best get the same names as used in # existing scripts # ToDo: Monomers work separately - features may come from different set of @@ -738,6 +739,64 @@ def _get_software_data(meta_json: dict) -> list: ], doi="10.1186/s12859-019-3019-7", ) + + class _HHsuiteSW(modelcif.Software): + """Prefilled software object for HH-suite tools.""" + + # We keep the parameter names from the parent class here, so let Pylint + # ignore redefining the 'type' builtin. + # pylint: disable=redefined-builtin + + def __init__( + self, + name, + classification="data collection", + description="Iterative protein sequence searching by HMM-HMM " + + "alignment", + location="https://github.com/soedinglab/hh-suite", + type="program", + version=None, + citation=cite_hhsuite, + ): + """Initialise a model""" + super().__init__( + name, + classification, + description, + location, + type, + version, + citation, + ) + + class _HmmerSW(modelcif.Software): + """Prefilled software object for HMMER tools.""" + + # We keep the parameter names from the parent class here, so let Pylint + # ignore redefining the 'type' builtin. + # pylint: disable=redefined-builtin + + def __init__( + self, + name, + classification="data collection", + description="Building HMM search profiles", + location="http://hmmer.org/", + type="program", + version=None, + citation=None, + ): + """Initialise a model""" + super().__init__( + name, + classification, + description, + location, + type, + version, + citation, + ) + # {key from JSON: dict needed to produce software entry plus internal key} sw_data = { "AlphaFold": modelcif.Software( @@ -805,36 +864,41 @@ def _get_software_data(meta_json: dict) -> list: doi="10.1093/bioinformatics/btac749", ), ), - "hhblits": modelcif.Software( - "HHblits", - "data collection", - "Iterative protein sequence searching by HMM-HMM alignment", - "https://github.com/soedinglab/hh-suite", - "program", - None, - cite_hhsuite, - ), - "hhsearch": modelcif.Software( + "hhblits": _HHsuiteSW("HHblits"), + "hhsearch": _HHsuiteSW( "HHsearch", - "data collection", - "Protein sequence searching by HMM-HMM comparison", - "https://github.com/soedinglab/hh-suite", - "program", - None, - cite_hhsuite, + description="Protein sequence searching by HMM-HMM comparison", + ), + "hmmbuild": _HmmerSW("hmmbuild"), + "hmmsearch": _HmmerSW( + "hmmsearch", + description="Search profile(s) against a sequence database", ), - "hmmbuild": modelcif.Software( - "hmmbuild", + "jackhmmer": _HmmerSW( + "jackhmmer", + description="Iteratively search sequence(s) against a sequence " + + "database", + ), + "kalign": modelcif.Software( + "kalign", "data collection", - "Building HMM search profiles", - "http://hmmer.org/", + "Kalign is a fast multiple sequence alignment program for " + + "biological sequences", + "https://github.com/timolassmann/kalign", "program", None, - None, + ihm.Citation( + pmid="31665271", + title="Kalign 3: multiple sequence alignment of large data " + + "sets", + journal="Bioinformatics", + volume=36, + page_range=(1928, 1929), + year=2019, + authors=["Lassmann, T."], + doi="10.1093/bioinformatics/btz795", + ), ), - "hmmsearch": None, - "jackhmmer": None, - "kalign": None, } # ToDo: refactor to only those SW objects created/ added that are actually # in the dictionary. That is, instead of a pre-build dictionary, @@ -862,6 +926,9 @@ def _get_software_data(meta_json: dict) -> list: def _get_protocol_steps(modelcif_json): """Create the list of protocol steps with software and parameters used.""" + # ToDo: Get software_group from external input, right now the protocol steps + # are hard-coded here with the software per step. The JSON input does + # not list steps, only software. protocol = [] # MSA/ monomer feature generation step # ToDo: Discuss input, manual has baits & sequences @@ -884,7 +951,6 @@ def _get_protocol_steps(modelcif_json): # ToDo: Discuss input, seem to depend on mode # ToDo: what about step details? Would it be nice to add the AlphaPulldown # mode here? - # ToDo: get software_group from external input step = { "method_type": "modeling", "step_name": None,