Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
AlphaPulldown-ModelCIF-Conversion
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Bienchen
AlphaPulldown-ModelCIF-Conversion
Commits
8fb7c027
Commit
8fb7c027
authored
1 year ago
by
Bienchen
Browse files
Options
Downloads
Plain Diff
Merge branch 'develop' into main
parents
a3f62f6e
02038207
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
.spelling
+2
-0
2 additions, 0 deletions
.spelling
convert_to_modelcif.py
+91
-32
91 additions, 32 deletions
convert_to_modelcif.py
with
93 additions
and
32 deletions
.spelling
+
2
−
0
View file @
8fb7c027
...
...
@@ -2,12 +2,14 @@ Biopython
CIF
DBs
FastA
HH-suite
Jupyter
MSA
ModelCIF
PAE
PDB
PPI
Prefilled
coevolution
modeled
modeling
...
...
This diff is collapsed.
Click to expand it.
convert_to_modelcif.py
+
91
−
32
View file @
8fb7c027
...
...
@@ -31,6 +31,7 @@ import modelcif.protocol
from
alphapulldown.utils
import
make_dir_monomer_dictionary
# ToDo: Software versions cannot contain whitespace, e.g. ColabFold (drop time)
# ToDo: DISCUSS Get options properly, best get the same names as used in
# existing scripts
# ToDo: Monomers work separately - features may come from different set of
...
...
@@ -46,7 +47,6 @@ from alphapulldown.utils import make_dir_monomer_dictionary
# ToDo: Example 1 from the GitHub repo mentions MMseqs2
# ToDo: Discuss input of protocol steps, feature creation has baits, sequences
# does modelling depend on mode?
# ToDo: Option to add remaining models w PAE files to archive
# ToDo: deal with `--max_template_date`, beta-barrel project has it as software
# parameter
flags
.
DEFINE_string
(
...
...
@@ -394,7 +394,6 @@ def _cmp_ref_dbs(db_dct, db_objs):
def
_get_modelcif_ref_dbs
(
meta_json
):
"""
Get sequence databases used for monomer features.
"""
# vendor formatting for DB names/ URLs, extend on KeyError
# ToDo: adapt to new JSON input
sdb_lst
=
{}
# 'sequence database list' starts as dict since we need to
# compare DBs between the different monomers.
i
=
0
...
...
@@ -740,6 +739,64 @@ def _get_software_data(meta_json: dict) -> list:
],
doi
=
"
10.1186/s12859-019-3019-7
"
,
)
class _HHsuiteSW(modelcif.Software):
    """
    Software entry pre-populated with the values shared by HH-suite tools.

    Only the tool name has to be supplied; every other argument falls back to
    an HH-suite default and can be overridden per tool.
    """

    # The signature mirrors the parent class parameter names, which means the
    # 'type' builtin gets shadowed - tell Pylint that this is intended.
    # pylint: disable=redefined-builtin
    def __init__(
        self,
        name,
        classification="data collection",
        description=(
            "Iterative protein sequence searching by HMM-HMM alignment"
        ),
        location="https://github.com/soedinglab/hh-suite",
        type="program",
        version=None,
        citation=cite_hhsuite,
    ):
        """
        Set up the software object, handing all values to the parent class.

        :param name: Name of the HH-suite tool.
        :param classification: Classification of the software.
        :param description: Short description of what the tool does.
        :param location: URL where the software can be found.
        :param type: Type of the software.
        :param version: Version of the software, None if not known.
        :param citation: Citation for the software, defaults to the HH-suite
                         citation.
        """
        super().__init__(
            name=name,
            classification=classification,
            description=description,
            location=location,
            type=type,
            version=version,
            citation=citation,
        )
class _HmmerSW(modelcif.Software):
    """
    Software entry pre-populated with the values shared by HMMER tools.

    Only the tool name has to be supplied; every other argument falls back to
    a HMMER default and can be overridden per tool.
    """

    # The signature mirrors the parent class parameter names, which means the
    # 'type' builtin gets shadowed - tell Pylint that this is intended.
    # pylint: disable=redefined-builtin
    def __init__(
        self,
        name,
        classification="data collection",
        description="Building HMM search profiles",
        location="http://hmmer.org/",
        type="program",
        version=None,
        citation=None,
    ):
        """
        Set up the software object, handing all values to the parent class.

        :param name: Name of the HMMER tool.
        :param classification: Classification of the software.
        :param description: Short description of what the tool does.
        :param location: URL where the software can be found.
        :param type: Type of the software.
        :param version: Version of the software, None if not known.
        :param citation: Citation for the software, None by default.
        """
        super().__init__(
            name=name,
            classification=classification,
            description=description,
            location=location,
            type=type,
            version=version,
            citation=citation,
        )
# {key from JSON: dict needed to produce software entry plus internal key}
sw_data
=
{
"
AlphaFold
"
:
modelcif
.
Software
(
...
...
@@ -807,36 +864,41 @@ def _get_software_data(meta_json: dict) -> list:
doi
=
"
10.1093/bioinformatics/btac749
"
,
),
),
"
hhblits
"
:
modelcif
.
Software
(
"
HHblits
"
,
"
data collection
"
,
"
Iterative protein sequence searching by HMM-HMM alignment
"
,
"
https://github.com/soedinglab/hh-suite
"
,
"
program
"
,
None
,
cite_hhsuite
,
),
"
hhsearch
"
:
modelcif
.
Software
(
"
hhblits
"
:
_HHsuiteSW
(
"
HHblits
"
),
"
hhsearch
"
:
_HHsuiteSW
(
"
HHsearch
"
,
"
data collecti
on
"
,
"
Protein sequence searching by HMM-HMM comparison
"
,
"
https://github.com/soedinglab/hh-suite
"
,
"
program
"
,
None
,
cite_hhsuite
,
description
=
"
Protein sequence searching by HMM-HMM comparis
on
"
,
)
,
"
hmmbuild
"
:
_HmmerSW
(
"
hmmbuild
"
)
,
"
hmmsearch
"
:
_HmmerSW
(
"
hmmsearch
"
,
description
=
"
Search profile(s) against a sequence database
"
,
),
"
hmmbuild
"
:
modelcif
.
Software
(
"
hmmbuild
"
,
"
jackhmmer
"
:
_HmmerSW
(
"
jackhmmer
"
,
description
=
"
Iteratively search sequence(s) against a sequence
"
+
"
database
"
,
),
"
kalign
"
:
modelcif
.
Software
(
"
kalign
"
,
"
data collection
"
,
"
Building HMM search profiles
"
,
"
http://hmmer.org/
"
,
"
Kalign is a fast multiple sequence alignment program for
"
+
"
biological sequences
"
,
"
https://github.com/timolassmann/kalign
"
,
"
program
"
,
None
,
None
,
ihm
.
Citation
(
pmid
=
"
31665271
"
,
title
=
"
Kalign 3: multiple sequence alignment of large data
"
+
"
sets
"
,
journal
=
"
Bioinformatics
"
,
volume
=
36
,
page_range
=
(
1928
,
1929
),
year
=
2019
,
authors
=
[
"
Lassmann, T.
"
],
doi
=
"
10.1093/bioinformatics/btz795
"
,
),
),
"
hmmsearch
"
:
None
,
"
jackhmmer
"
:
None
,
"
kalign
"
:
None
,
}
# ToDo: refactor to only those SW objects created/ added that are actually
# in the dictionary. That is, instead of a pre-build dictionary,
...
...
@@ -864,6 +926,9 @@ def _get_software_data(meta_json: dict) -> list:
def
_get_protocol_steps
(
modelcif_json
):
"""
Create the list of protocol steps with software and parameters used.
"""
# ToDo: Get software_group from external input, right now the protocol steps
# are hard-coded here with the software per step. The JSON input does
# not list steps, only software.
protocol
=
[]
# MSA/ monomer feature generation step
# ToDo: Discuss input, manual has baits & sequences
...
...
@@ -886,7 +951,6 @@ def _get_protocol_steps(modelcif_json):
# ToDo: Discuss input, seem to depend on mode
# ToDo: what about step details? Would it be nice to add the AlphaPulldown
# mode here?
# ToDo: get software_group from external input
step
=
{
"
method_type
"
:
"
modeling
"
,
"
step_name
"
:
None
,
...
...
@@ -1101,11 +1165,6 @@ def main(argv):
# Script entry point: only start the application when this file is executed
# directly, not when it is imported as a module. 'main' is handed to
# 'app.run' as the callable to execute.
if __name__ == "__main__":
    app.run(main)
# ToDo: Question - option to include all the non-selected models in associated
# data archive? This blows up storage size (especially if PAEs included),
# but we did that already in the past. Idea is to have all models
# available for... reproducibility and whatnot, but show the selected
# (representative) of the modelling experiment/ study more prominently.
# ToDo: Things to look at: '_struct.title', '_struct.pdbx_model_details',
# 'data_', '_entry', maybe have a user-defined JSON document with things
# like that, including author names?
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment