Merge branch 'develop' into main

8fb7c027 · Bienchen · a3f62f6e · 02038207 · 8fb7c027 · 8fb7c027
Commit 8fb7c027 authored 1 year ago by Bienchen
--- a/.spelling
+++ b/.spelling
@@ -2,12 +2,14 @@ Biopython
 CIF
 DBs
 FastA
+HH-suite
 Jupyter
 MSA
 ModelCIF
 PAE
 PDB
 PPI
+Prefilled
 coevolution
 modeled
 modeling

--- a/convert_to_modelcif.py
+++ b/convert_to_modelcif.py
@@ -31,6 +31,7 @@ import modelcif.protocol

 from alphapulldown.utils import make_dir_monomer_dictionary

+# ToDo: Software versions cannot contain whitespace, e.g. ColabFold (drop time)
 # ToDo: DISCUSS Get options properly, best get the same names as used in
 #       existing scripts
 # ToDo: Monomers work separately - features may come from different set of
@@ -46,7 +47,6 @@ from alphapulldown.utils import make_dir_monomer_dictionary
 # ToDo: Example 1 from the GitHub repo mentions MMseqs2
 # ToDo: Discuss input of protocol steps, feature creation has baits, sequences
 #       does modelling depend on mode?
-# ToDo: Option to add remaining models w PAE files to archive
 # ToDo: deal with `--max_template_date`, beta-barrel project has it as software
 #       parameter
 flags.DEFINE_string(
@@ -394,7 +394,6 @@ def _cmp_ref_dbs(db_dct, db_objs):
 def _get_modelcif_ref_dbs(meta_json):
    """Get sequence databases used for monomer features."""
    # vendor formatting for DB names/ URLs, extend on KeyError
-    # ToDo: adapt to new JSON input
    sdb_lst = {}  # 'sequence database list' starts as dict since we need to
    # compare DBs between the different monomers.
    i = 0
@@ -740,6 +739,64 @@ def _get_software_data(meta_json: dict) -> list:
        ],
        doi="10.1186/s12859-019-3019-7",
    )
+
+    class _HHsuiteSW(modelcif.Software):
+        """Prefilled software object for HH-suite tools."""
+
+        # We keep the parameter names from the parent class here, so let Pylint
+        # ignore redefining the 'type' builtin.
+        # pylint: disable=redefined-builtin
+
+        def __init__(
+            self,
+            name,
+            classification="data collection",
+            description="Iterative protein sequence searching by HMM-HMM "
+            + "alignment",
+            location="https://github.com/soedinglab/hh-suite",
+            type="program",
+            version=None,
+            citation=cite_hhsuite,
+        ):
+            """Initialise the software object, HH-suite values as defaults."""
+            super().__init__(
+                name,
+                classification,
+                description,
+                location,
+                type,
+                version,
+                citation,
+            )
+
+    class _HmmerSW(modelcif.Software):
+        """Prefilled software object for HMMER tools."""
+
+        # We keep the parameter names from the parent class here, so let Pylint
+        # ignore redefining the 'type' builtin.
+        # pylint: disable=redefined-builtin
+
+        def __init__(
+            self,
+            name,
+            classification="data collection",
+            description="Building HMM search profiles",
+            location="http://hmmer.org/",
+            type="program",
+            version=None,
+            citation=None,
+        ):
+            """Initialise the software object, HMMER values as defaults."""
+            super().__init__(
+                name,
+                classification,
+                description,
+                location,
+                type,
+                version,
+                citation,
+            )
+
    # {key from JSON: dict needed to produce software entry plus internal key}
    sw_data = {
        "AlphaFold": modelcif.Software(
@@ -807,36 +864,41 @@ def _get_software_data(meta_json: dict) -> list:
                doi="10.1093/bioinformatics/btac749",
            ),
        ),
-        "hhblits": modelcif.Software(
-            "HHblits",
-            "data collection",
-            "Iterative protein sequence searching by HMM-HMM alignment",
-            "https://github.com/soedinglab/hh-suite",
-            "program",
-            None,
-            cite_hhsuite,
-        ),
-        "hhsearch": modelcif.Software(
+        "hhblits": _HHsuiteSW("HHblits"),
+        "hhsearch": _HHsuiteSW(
            "HHsearch",
-            "data collection",
-            "Protein sequence searching by HMM-HMM comparison",
-            "https://github.com/soedinglab/hh-suite",
-            "program",
-            None,
-            cite_hhsuite,
+            description="Protein sequence searching by HMM-HMM comparison",
+        ),
+        "hmmbuild": _HmmerSW("hmmbuild"),
+        "hmmsearch": _HmmerSW(
+            "hmmsearch",
+            description="Search profile(s) against a sequence database",
        ),
-        "hmmbuild": modelcif.Software(
-            "hmmbuild",
+        "jackhmmer": _HmmerSW(
+            "jackhmmer",
+            description="Iteratively search sequence(s) against a sequence "
+            + "database",
+        ),
+        "kalign": modelcif.Software(
+            "kalign",
            "data collection",
-            "Building HMM search profiles",
-            "http://hmmer.org/",
+            "Kalign is a fast multiple sequence alignment program for "
+            + "biological sequences",
+            "https://github.com/timolassmann/kalign",
            "program",
            None,
-            None,
+            ihm.Citation(
+                pmid="31665271",
+                title="Kalign 3: multiple sequence alignment of large data "
+                + "sets",
+                journal="Bioinformatics",
+                volume=36,
+                page_range=(1928, 1929),
+                year=2019,
+                authors=["Lassmann, T."],
+                doi="10.1093/bioinformatics/btz795",
+            ),
        ),
-        "hmmsearch": None,
-        "jackhmmer": None,
-        "kalign": None,
    }
    # ToDo: refactor to only those SW objects created/ added that are actually
    #       in the dictionary. That is, instead of a pre-build dictionary,
@@ -864,6 +926,9 @@ def _get_software_data(meta_json: dict) -> list:

 def _get_protocol_steps(modelcif_json):
    """Create the list of protocol steps with software and parameters used."""
+    # ToDo: Get software_group from external input; right now the protocol steps
+    #       are hard-coded here with the software per step. The JSON input does
+    #       not list steps, only software.
    protocol = []
    # MSA/ monomer feature generation step
    # ToDo: Discuss input, manual has baits & sequences
@@ -886,7 +951,6 @@ def _get_protocol_steps(modelcif_json):
    # ToDo: Discuss input, seem to depend on mode
    # ToDo: what about step details? Would it be nice to add the AlphaPulldown
    #       mode here?
-    # ToDo: get software_group from external input
    step = {
        "method_type": "modeling",
        "step_name": None,
@@ -1101,11 +1165,6 @@ def main(argv):
 if __name__ == "__main__":
    app.run(main)

-# ToDo: Question - option to include all the non-selected models in associated
-#       data archive? This blows up storage size (especially if PAEs included),
-#       but we did that already in the past. Idea is to have all models
-#       available for... reproducibility and whatnot, but show the selected
-#       (representative) of the modelling experiment/ study more prominently.
 # ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details',
 #       'data_', '_entry', maybe have a user-defined JSON document with things
 #       like that, including author names?