Skip to content
Snippets Groups Projects
Commit 8fb7c027 authored by Bienchen's avatar Bienchen
Browse files

Merge branch 'develop' into main

parents a3f62f6e 02038207
No related branches found
No related tags found
No related merge requests found
......@@ -2,12 +2,14 @@ Biopython
CIF
DBs
FastA
HH-suite
Jupyter
MSA
ModelCIF
PAE
PDB
PPI
Prefilled
coevolution
modeled
modeling
......
......@@ -31,6 +31,7 @@ import modelcif.protocol
from alphapulldown.utils import make_dir_monomer_dictionary
# ToDo: Software versions cannot contain whitespace, e.g. ColabFold (drop time)
# ToDo: DISCUSS Get options properly, best get the same names as used in
# existing scripts
# ToDo: Monomers work separately - features may come from different set of
......@@ -46,7 +47,6 @@ from alphapulldown.utils import make_dir_monomer_dictionary
# ToDo: Example 1 from the GitHub repo mentions MMseqs2
# ToDo: Discuss input of protocol steps, feature creation has baits, sequences
# does modelling depend on mode?
# ToDo: Option to add remaining models w PAE files to archive
# ToDo: deal with `--max_template_date`, beta-barrel project has it as software
# parameter
flags.DEFINE_string(
......@@ -394,7 +394,6 @@ def _cmp_ref_dbs(db_dct, db_objs):
def _get_modelcif_ref_dbs(meta_json):
"""Get sequence databases used for monomer features."""
# vendor formatting for DB names/ URLs, extend on KeyError
# ToDo: adapt to new JSON input
sdb_lst = {} # 'sequence database list' starts as dict since we need to
# compare DBs between the different monomers.
i = 0
......@@ -740,6 +739,64 @@ def _get_software_data(meta_json: dict) -> list:
],
doi="10.1186/s12859-019-3019-7",
)
class _HHsuiteSW(modelcif.Software):
    """Software entry with the HH-suite defaults already filled in.

    Only ``name`` has to be given; all remaining arguments default to the
    values used for HH-suite tools and can be overridden per tool.
    """

    # Argument names mirror those of the parent class on purpose, which means
    # 'type' shadows the builtin - tell Pylint to accept that.
    # pylint: disable=redefined-builtin
    def __init__(
        self,
        name,
        classification="data collection",
        description=(
            "Iterative protein sequence searching by HMM-HMM "
            "alignment"
        ),
        location="https://github.com/soedinglab/hh-suite",
        type="program",
        version=None,
        citation=cite_hhsuite,
    ):
        """Initialise the software object, forwarding all values unchanged to
        the parent class."""
        super().__init__(
            name=name,
            classification=classification,
            description=description,
            location=location,
            type=type,
            version=version,
            citation=citation,
        )
class _HmmerSW(modelcif.Software):
    """Software entry with the HMMER defaults already filled in.

    Only ``name`` has to be given; all remaining arguments default to the
    values used for HMMER tools and can be overridden per tool.
    """

    # Argument names mirror those of the parent class on purpose, which means
    # 'type' shadows the builtin - tell Pylint to accept that.
    # pylint: disable=redefined-builtin
    def __init__(
        self,
        name,
        classification="data collection",
        description="Building HMM search profiles",
        location="http://hmmer.org/",
        type="program",
        version=None,
        citation=None,
    ):
        """Initialise the software object, forwarding all values unchanged to
        the parent class."""
        super().__init__(
            name=name,
            classification=classification,
            description=description,
            location=location,
            type=type,
            version=version,
            citation=citation,
        )
# {key from JSON: dict needed to produce software entry plus internal key}
sw_data = {
"AlphaFold": modelcif.Software(
......@@ -807,36 +864,41 @@ def _get_software_data(meta_json: dict) -> list:
doi="10.1093/bioinformatics/btac749",
),
),
"hhblits": modelcif.Software(
"HHblits",
"data collection",
"Iterative protein sequence searching by HMM-HMM alignment",
"https://github.com/soedinglab/hh-suite",
"program",
None,
cite_hhsuite,
),
"hhsearch": modelcif.Software(
"hhblits": _HHsuiteSW("HHblits"),
"hhsearch": _HHsuiteSW(
"HHsearch",
"data collection",
"Protein sequence searching by HMM-HMM comparison",
"https://github.com/soedinglab/hh-suite",
"program",
None,
cite_hhsuite,
description="Protein sequence searching by HMM-HMM comparison",
),
"hmmbuild": _HmmerSW("hmmbuild"),
"hmmsearch": _HmmerSW(
"hmmsearch",
description="Search profile(s) against a sequence database",
),
"hmmbuild": modelcif.Software(
"hmmbuild",
"jackhmmer": _HmmerSW(
"jackhmmer",
description="Iteratively search sequence(s) against a sequence "
+ "database",
),
"kalign": modelcif.Software(
"kalign",
"data collection",
"Building HMM search profiles",
"http://hmmer.org/",
"Kalign is a fast multiple sequence alignment program for "
+ "biological sequences",
"https://github.com/timolassmann/kalign",
"program",
None,
None,
ihm.Citation(
pmid="31665271",
title="Kalign 3: multiple sequence alignment of large data "
+ "sets",
journal="Bioinformatics",
volume=36,
page_range=(1928, 1929),
year=2019,
authors=["Lassmann, T."],
doi="10.1093/bioinformatics/btz795",
),
),
"hmmsearch": None,
"jackhmmer": None,
"kalign": None,
}
# ToDo: refactor to only those SW objects created/ added that are actually
# in the dictionary. That is, instead of a pre-build dictionary,
......@@ -864,6 +926,9 @@ def _get_software_data(meta_json: dict) -> list:
def _get_protocol_steps(modelcif_json):
"""Create the list of protocol steps with software and parameters used."""
# ToDo: Get software_group from external input, right now the protocol steps
# are hard-coded here with the software per step. The JSON input does
# not list steps, only software.
protocol = []
# MSA/ monomer feature generation step
# ToDo: Discuss input, manual has baits & sequences
......@@ -886,7 +951,6 @@ def _get_protocol_steps(modelcif_json):
# ToDo: Discuss input, seems to depend on mode
# ToDo: what about step details? Would it be nice to add the AlphaPulldown
# mode here?
# ToDo: get software_group from external input
step = {
"method_type": "modeling",
"step_name": None,
......@@ -1101,11 +1165,6 @@ def main(argv):
if __name__ == "__main__":
app.run(main)
# ToDo: Question - option to include all the non-selected models in associated
# data archive? This blows up storage size (especially if PAEs included),
# but we did that already in the past. Idea is to have all models
# available for... reproducibility and whatnot, but show the selected
# (representative) of the modelling experiment/ study more prominently.
# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details',
# 'data_', '_entry', maybe have a user-defined JSON document with things
# like that, including author names?
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment