Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
ma-wilkins-import
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container registry
Model registry
Analyze
Contributor analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
schwede
ma-wilkins-import
Commits
14a60b46
Commit
14a60b46
authored
2 years ago
by
Gerardo Tauriello
Browse files
Options
Downloads
Patches
Plain Diff
Updated translate2modelcif.py based on Niko-model-set
parent
6bbd6fa7
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
translate2modelcif.py
+358
-161
358 additions, 161 deletions
translate2modelcif.py
with
358 additions
and
161 deletions
translate2modelcif.py
+
358
−
161
View file @
14a60b46
#! /usr/local/bin/ost
"""
Translate models from Tara/ Xabi from PDB + extra data into ModelCIF.
"""
# ToDo [internal]: get DB versions in - https://colabfold.mmseqs.com, scroll
# down to "Database Information"
# EXAMPLES for running:
"""
ost scripts/translate2modelcif.py
"
A0A1B0GTU1-O75152
"
\
--top_ranked_only --out_dir=
"
./modelcif
"
"""
import
argparse
import
datetime
import
os
import
sys
import
gzip
,
shutil
,
zipfile
from
timeit
import
default_timer
as
timer
import
numpy
as
np
...
...
@@ -14,6 +19,7 @@ import requests
import
ujson
as
json
import
ihm
import
ihm.citations
import
modelcif
import
modelcif.associated
import
modelcif.dumper
...
...
@@ -38,6 +44,27 @@ def _parse_args():
help
=
"
Directory with model(s) to be translated. Must be of form
"
+
"'
<UniProtKB AC>-<UniProtKB AC>
'"
,
)
parser
.
add_argument
(
"
--top_ranked_only
"
,
default
=
False
,
action
=
"
store_true
"
,
help
=
"
Only process top ranked model.
"
)
parser
.
add_argument
(
"
--out_dir
"
,
type
=
str
,
metavar
=
"
<OUTPUT DIR>
"
,
default
=
""
,
help
=
"
Path to separate path to store results
"
\
"
(model_dir used, if none given).
"
,
)
parser
.
add_argument
(
"
--compress
"
,
default
=
False
,
action
=
"
store_true
"
,
help
=
"
Compress ModelCIF file with gzip
"
\
"
(note that QA file is zipped either way).
"
,
)
opts
=
parser
.
parse_args
()
...
...
@@ -48,42 +75,49 @@ def _parse_args():
_abort_msg
(
f
"
Model directory
'
{
opts
.
model_dir
}
'
does not exist.
"
)
if
not
os
.
path
.
isdir
(
opts
.
model_dir
):
_abort_msg
(
f
"
Path
'
{
opts
.
model_dir
}
'
does not point to a directory.
"
)
# check out_dir
if
not
opts
.
out_dir
:
opts
.
out_dir
=
opts
.
model_dir
else
:
if
not
os
.
path
.
exists
(
opts
.
out_dir
):
_abort_msg
(
f
"
Output directory
'
{
opts
.
out_dir
}
'
does not exist.
"
)
if
not
os
.
path
.
isdir
(
opts
.
out_dir
):
_abort_msg
(
f
"
Path
'
{
opts
.
out_dir
}
'
does not point to a directory.
"
)
return
opts
# pylint: disable=too-few-public-methods
class
_GlobalPTM
(
modelcif
.
qa_metric
.
Global
,
modelcif
.
qa_metric
.
PTM
):
"""
Predicted accuracy according to the TM-score score in [0,1]
.
"""
"""
Predicted accuracy according to the TM-score score in [0,1]
"""
name
=
"
pTM
"
software
=
None
class
_GlobalPLDDT
(
modelcif
.
qa_metric
.
Global
,
modelcif
.
qa_metric
.
PLDDT
):
"""
Predicted accuracy according to the CA-only lDDT in [0,100]
.
"""
"""
Predicted accuracy according to the CA-only lDDT in [0,100]
"""
name
=
"
pLDDT
"
software
=
None
class
_LocalPLDDT
(
modelcif
.
qa_metric
.
Local
,
modelcif
.
qa_metric
.
PLDDT
):
"""
Predicted accuracy according to the CA-only lDDT in [0,100]
.
"""
"""
Predicted accuracy according to the CA-only lDDT in [0,100]
"""
name
=
"
pLDDT
"
software
=
None
class
_PAE
(
modelcif
.
qa_metric
.
MetricType
):
"""
Predicted aligned error (in Angstroms).
See :class:`MetricType` for more information.
"""
"""
Predicted aligned error (in Angstroms)
"""
type
=
"
PAE
"
other_details
=
None
class
_LocalPairwisePAE
(
modelcif
.
qa_metric
.
LocalPairwise
,
_PAE
):
"""
p
redicted aligned error (in Angstroms)
.
"""
"""
P
redicted aligned error (in Angstroms)
"""
name
=
"
PAE
"
software
=
None
...
...
@@ -118,7 +152,7 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
occupancy
=
atm
.
occupancy
,
)
def
add_scores
(
self
,
scores_json
,
entry_id
,
ac_file_prfx
):
def
add_scores
(
self
,
scores_json
,
entry_id
,
mdl_name
):
"""
Add QA metrics from AF2 scores.
"""
# global scores
self
.
qa_metrics
.
extend
(
...
...
@@ -162,25 +196,28 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
self
.
qa_metrics
.
extend
(
lpae
)
ac_file
=
f
"
{
os
.
path
.
basename
(
ac_file_prfx
)
}
_local_pairwise_qa.cif
"
ac_file
=
f
"
{
mdl_name
}
_local_pairwise_qa.cif
"
qa_file
=
modelcif
.
associated
.
LocalPairwiseQAScoresFile
(
ac_file
,
categories
=
[
"
_ma_qa_metric_local_pairwise
"
],
copy_categories
=
[
"
_ma_qa_metric
"
],
entry_id
=
entry_id
,
entry_details
=
"
This file is an associated file consisting
"
+
"
of local pairwise QA metrics. This is a partial mmCIF
"
+
"
file and can be validated by merging with the main
"
+
"
mmCIF file containing the model coordinates and other
"
+
"
associated data.
"
,
details
=
"
Predicted aligned error
"
,
)
return
modelcif
.
associated
.
Repository
(
""
,
[
modelcif
.
associated
.
LocalPairwiseQAScoresFile
(
ac_file
,
categories
=
[
"
_ma_qa_metric_local_pairwise
"
],
copy_categories
=
[
"
_ma_qa_metric
"
],
entry_id
=
entry_id
,
entry_details
=
"
This file is an associated file consisting
"
+
"
of local pairwise QA metrics. This is a partial mmCIF
"
+
"
file and can be validated by merging with the main
"
+
"
mmCIF file containing the model coordinates and other
"
+
"
associated data.
"
,
details
=
"
Predicted aligned error.
"
,
)
modelcif
.
associated
.
ZipFile
(
f
"
{
mdl_name
}
.zip
"
,
files
=
[
qa_file
])
],
)
# NOTE: by convention MA expects zip file with same name as model-cif
def
_abort_msg
(
msg
,
exit_code
=
1
):
...
...
@@ -220,29 +257,131 @@ def _get_audit_authors():
"""
Return the list of authors that produced this model.
"""
# ToDo: tell Xabi that his name can't have a á in mmCIF
return
(
"
Bartolec T
"
,
"
Vazquez-Campos X
"
,
"
Johnson M
"
,
"
Norman A
"
,
"
Payne R
"
,
"
Wilkins M
"
,
"
Mackay J
"
,
"
Low J
"
,
"
Bartolec
,
T
.
"
,
"
Vazquez-Campos
,
X
.
"
,
"
Johnson
,
M
.
"
,
"
Norman
,
A
.
"
,
"
Payne
,
R
.
"
,
"
Wilkins
,
M
.
"
,
"
Mackay
,
J
.
"
,
"
Low
,
J
.
"
,
)
def
_get_protocol_steps_and_software
(
cnfg_file
):
def
_parse_colabfold_config
(
cnfg_file
):
"""
Read config.json and fetch relevant data from it.
"""
# NOTE: following code from https://github.com/sokrypton/ColabFold/blob/main/colabfold/batch.py to understand config
# fetch and drop fields which are not relevant for model building
with
open
(
cnfg_file
,
encoding
=
"
utf8
"
)
as
jfh
:
cf_config
=
json
.
load
(
jfh
)
if
"
num_queries
"
in
cf_config
:
del
cf_config
[
"
num_queries
"
]
# fetch relevant data
# -> MSA mode
if
cf_config
[
"
msa_mode
"
]
==
"
MMseqs2 (UniRef+Environmental)
"
:
seq_dbs
=
[
"
UniRef
"
,
"
Environmental
"
]
use_mmseqs
=
True
use_msa
=
True
elif
cf_config
[
"
msa_mode
"
]
==
"
MMseqs2 (UniRef only)
"
:
seq_dbs
=
[
"
UniRef
"
]
use_mmseqs
=
True
use_msa
=
True
elif
cf_config
[
"
msa_mode
"
]
==
"
single_sequence
"
:
seq_dbs
=
[]
use_mmseqs
=
False
use_msa
=
False
elif
cf_config
[
"
msa_mode
"
]
==
"
custom
"
:
print
(
"
WARNING: Custom MSA mode used. Not clear from config what to do here!
"
)
seq_dbs
=
[]
use_mmseqs
=
False
use_msa
=
True
else
:
raise
ValueError
(
f
"
Unknown msa_mode
{
cf_config
[
'
msa_mode
'
]
}
"
)
# -> model type
if
cf_config
[
"
model_type
"
]
==
"
AlphaFold2-multimer-v1
"
:
# AF-Multimer as introduced in AlphaFold v2.1.0
use_multimer
=
True
multimer_version
=
1
elif
cf_config
[
"
model_type
"
]
==
"
AlphaFold2-multimer-v2
"
:
# AF-Multimer as introduced in AlphaFold v2.2.0
use_multimer
=
True
multimer_version
=
2
elif
cf_config
[
"
model_type
"
]
==
"
AlphaFold2-ptm
"
:
use_multimer
=
False
multimer_version
=
None
else
:
raise
ValueError
(
f
"
Unknown model_type
{
cf_config
[
'
model_type
'
]
}
"
)
# write description
description
=
f
"
Model generated using ColabFold v
{
cf_config
[
'
version
'
]
}
"
if
use_multimer
:
description
+=
f
"
with AlphaFold-Multimer (v
{
multimer_version
}
)
"
else
:
description
+=
f
"
with AlphaFold
"
description
+=
f
"
producing
{
cf_config
[
'
num_models
'
]
}
models
"
\
f
"
with
{
cf_config
[
'
num_recycles
'
]
}
recycles each
"
if
cf_config
[
"
use_amber
"
]:
description
+=
"
, with AMBER relaxation
"
else
:
description
+=
"
, without model relaxation
"
if
cf_config
[
"
use_templates
"
]:
print
(
"
WARNING: ColabFold may use PDB70 or custom templates.
"
\
"
Not clear from config!
"
)
description
+=
"
, using templates
"
else
:
description
+=
"
, without templates
"
if
cf_config
[
"
rank_by
"
]
==
"
plddt
"
:
description
+=
"
, ranked by pLDDT
"
elif
cf_config
[
"
rank_by
"
]
==
"
ptmscore
"
:
description
+=
"
, ranked by pTM
"
elif
cf_config
[
"
rank_by
"
]
==
"
multimer
"
:
description
+=
"
, ranked by ipTM*0.8+pTM*0.2
"
else
:
raise
ValueError
(
f
"
Unknown rank_by
{
cf_config
[
'
rank_by
'
]
}
"
)
if
use_msa
:
description
+=
"
, starting from
"
if
use_mmseqs
:
msa_type
=
"
MSA
"
else
:
msa_type
=
"
custom MSA
"
if
use_multimer
:
if
cf_config
[
"
pair_mode
"
]
==
"
unpaired+paired
"
:
description
+=
f
"
paired and unpaired
{
msa_type
}
s
"
elif
cf_config
[
"
pair_mode
"
]
==
"
paired
"
:
description
+=
f
"
paired
{
msa_type
}
s
"
elif
cf_config
[
"
pair_mode
"
]
==
"
unpaired
"
:
description
+=
f
"
unpaired
{
msa_type
}
s
"
else
:
raise
ValueError
(
f
"
Unknown pair_mode
{
cf_config
[
'
pair_mode
'
]
}
"
)
else
:
description
+=
f
"
an
{
msa_type
}
"
if
use_mmseqs
:
description
+=
f
"
from MMseqs2 (
{
'
+
'
.
join
(
seq_dbs
)
}
)
"
else
:
description
+=
"
without an MSA
"
description
+=
"
.
"
return
{
"
config
"
:
cf_config
,
"
seq_dbs
"
:
seq_dbs
,
"
use_mmseqs
"
:
use_mmseqs
,
"
use_msa
"
:
use_msa
,
"
use_multimer
"
:
use_multimer
,
"
multimer_version
"
:
multimer_version
,
"
description
"
:
description
}
def
_get_protocol_steps_and_software
(
config_data
):
"""
Create the list of protocol steps with software and parameters used.
"""
protocol
=
[]
# modelling step
step
=
{
"
method_type
"
:
"
modeling
"
,
"
name
"
:
"
ma_protocol_step.step_name
"
,
"
details
"
:
"
Model using AlphaFold-Multimer (AlphaFold v2.2.0),
"
+
"
without amber relaxation and producing 5 models with up to 3
"
+
"
recycles each, starting from paired and unparied MSAs for the
"
+
"
dimers using MMseqs2.
"
,
"
name
"
:
None
,
"
details
"
:
config_data
[
"
description
"
],
}
# get input data
# Must refer to data already in the JSON, so we try keywords
...
...
@@ -255,116 +394,116 @@ def _get_protocol_steps_and_software(cnfg_file):
{
"
name
"
:
"
ColabFold
"
,
"
classification
"
:
"
model building
"
,
# ToDo: Get description for ColabFold
"
description
"
:
"
software.description
"
,
"
citation
"
:
{
"
pmid
"
:
None
,
"
title
"
:
"
ColabFold - Making protein folding accessible to all
"
,
"
journal
"
:
"
bioRxiv
"
,
"
volume
"
:
None
,
"
page_range
"
:
None
,
"
year
"
:
2022
,
"
authors
"
:
[
"
Mirdita M
"
,
"
Schuetze K
"
,
"
Moriwaki Y
"
,
"
Heo L
"
,
"
Ovchinnikov S
"
,
"
Steinegger M
"
,
],
"
doi
"
:
"
10.1101/2021.08.15.456425
"
,
},
"
description
"
:
"
Structure prediction
"
,
"
citation
"
:
ihm
.
citations
.
colabfold
,
"
location
"
:
"
https://github.com/sokrypton/ColabFold
"
,
"
type
"
:
"
package
"
,
"
version
"
:
"
1.2.0
"
,
},
{
}]
if
config_data
[
"
use_mmseqs
"
]:
step
[
"
software
"
].
append
({
"
name
"
:
"
MMseqs2
"
,
"
classification
"
:
"
data collection
"
,
"
description
"
:
"
Many-against-Many sequence searching
"
,
"
citation
"
:
{
"
pmid
"
:
"
30615063
"
,
"
title
"
:
"
MMseqs2 desktop and local web server app for fast,
"
+
"
interactive sequence searches
"
,
"
journal
"
:
"
Bioinformatics
"
,
"
volume
"
:
35
,
"
page_range
"
:
(
2856
,
2858
),
"
year
"
:
2019
,
"
authors
"
:
[
"
Mirdita M
"
,
"
Steinegger M
"
,
"
Soeding J
"
,
"
citation
"
:
ihm
.
Citation
(
pmid
=
"
30615063
"
,
title
=
"
MMseqs2 desktop and local web server app for fast,
"
+
"
interactive sequence searches
.
"
,
journal
=
"
Bioinformatics
"
,
volume
=
35
,
page_range
=
(
2856
,
2858
),
year
=
2019
,
authors
=
[
"
Mirdita
,
M
.
"
,
"
Steinegger
,
M
.
"
,
"
Soeding
,
J
.
"
,
],
"
doi
"
:
"
10.1093/bioinformatics/bty1057
"
,
}
,
doi
=
"
10.1093/bioinformatics/bty1057
"
,
)
,
"
location
"
:
"
https://github.com/soedinglab/mmseqs2
"
,
"
type
"
:
"
package
"
,
"
version
"
:
None
,
},
{
})
if
config_data
[
"
use_multimer
"
]:
step
[
"
software
"
].
append
({
"
name
"
:
"
AlphaFold-Multimer
"
,
"
classification
"
:
"
model building
"
,
"
description
"
:
"
Structure prediction
"
,
"
citation
"
:
{
"
pmid
"
:
None
,
"
title
"
:
"
Protein complex prediction with
"
"
citation
"
:
ihm
.
Citation
(
pmid
=
None
,
title
=
"
Protein complex prediction with
"
+
"
AlphaFold-Multimer.
"
,
"
journal
"
:
"
bioRxiv
"
,
"
volume
"
:
None
,
"
page_range
"
:
None
,
"
year
"
:
2021
,
"
authors
"
:
[
"
Evans R
"
,
"
O
'
Neill M
"
,
"
Pritzel A
"
,
"
Antropova N
"
,
"
Senior A
"
,
"
Green T
"
,
"
Zidek A
"
,
"
Bates R
"
,
"
Blackwell S
"
,
"
Yim J
"
,
"
Ronneberger O
"
,
"
Bodenstein S
"
,
"
Zielinski M
"
,
"
Bridgland A
"
,
"
Potapenko A
"
,
"
Cowie A
"
,
"
Tunyasuvunakool K
"
,
"
Jain R
"
,
"
Clancy E
"
,
"
Kohli P
"
,
"
Jumper J
"
,
"
Hassabis D
"
,
journal
=
"
bioRxiv
"
,
volume
=
None
,
page_range
=
None
,
year
=
2021
,
authors
=
[
"
Evans
,
R
.
"
,
"
O
'
Neill
,
M
.
"
,
"
Pritzel
,
A
.
"
,
"
Antropova
,
N
.
"
,
"
Senior
,
A
.
"
,
"
Green
,
T
.
"
,
"
Zidek
,
A
.
"
,
"
Bates
,
R
.
"
,
"
Blackwell
,
S
.
"
,
"
Yim
,
J
.
"
,
"
Ronneberger
,
O
.
"
,
"
Bodenstein
,
S
.
"
,
"
Zielinski
,
M
.
"
,
"
Bridgland
,
A
.
"
,
"
Potapenko
,
A
.
"
,
"
Cowie
,
A
.
"
,
"
Tunyasuvunakool
,
K
.
"
,
"
Jain
,
R
.
"
,
"
Clancy
,
E
.
"
,
"
Kohli
,
P
.
"
,
"
Jumper
,
J
.
"
,
"
Hassabis
,
D
.
"
,
],
"
doi
"
:
"
10.1101/2021.10.04.463034
"
,
}
,
doi
=
"
10.1101/2021.10.04.463034
"
,
)
,
"
location
"
:
"
https://github.com/deepmind/alphafold
"
,
"
type
"
:
"
package
"
,
"
version
"
:
"
2.1.1
"
,
},
]
# get parameters
with
open
(
cnfg_file
,
encoding
=
"
utf8
"
)
as
jfh
:
step
[
"
software_parameters
"
]
=
json
.
load
(
jfh
)
"
version
"
:
None
,
})
else
:
step
[
"
software
"
].
append
({
"
name
"
:
"
AlphaFold
"
,
"
classification
"
:
"
model building
"
,
"
description
"
:
"
Structure prediction
"
,
"
citation
"
:
ihm
.
citations
.
alphafold2
,
"
location
"
:
"
https://github.com/deepmind/alphafold
"
,
"
type
"
:
"
package
"
,
"
version
"
:
None
,
})
step
[
"
software_parameters
"
]
=
config_data
[
"
config
"
]
protocol
.
append
(
step
)
# model selection step
# ToDo [input/ internal]: model selection step on a single model is a bit
# silly, how do we get a list of models?
step
=
{
"
method_type
"
:
"
model selection
"
,
"
name
"
:
"
ma_protocol_step.step_name
"
,
"
details
"
:
"
Select best model, which is either the top-ranked model
"
+
"
as determined by the ColabFold pipeline
"
+
"
(iptmscore*0.8+ptmscore*0.2), or else the model with best
"
+
"
congruence with crosslinks reported in the related study.
"
,
}
step
[
"
input
"
]
=
"
model
"
step
[
"
output
"
]
=
"
model
"
step
[
"
software
"
]
=
[]
step
[
"
software_parameters
"
]
=
{}
protocol
.
append
(
step
)
# GT-NOTES:
# - input/output should be ok without list of models
# - rank of model is already stored in _ma_model_list.model_name and
# _ma_data.name (in _store_as_modelcif)
# - ColabFold ranking details is already in details of step above.
# - Suggestion: add extra step only if AF-ranking was overruled and
# include it in step above.
# step = {
# "method_type": "model selection",
# "name": "ma_protocol_step.step_name",
# "details": "Select best model, which is either the top-ranked model "
# + "as determined by the ColabFold pipeline "
# + "(iptmscore*0.8+ptmscore*0.2), or else the model with best "
# + "congruence with crosslinks reported in the related study.",
# }
# step["input"] = "model"
# step["output"] = "model"
# step["software"] = []
# step["software_parameters"] = {}
# protocol.append(step)
return
protocol
...
...
@@ -387,7 +526,7 @@ def _get_model_details(gene_names):
def
_get_model_group_name
():
"""
Get a name for a model group.
"""
return
"
Crosslinked Heterodimer A
L
phaFold-Multimer v2 Models
"
return
"
Crosslinked Heterodimer A
l
phaFold-Multimer v2 Models
"
def
_get_sequence
(
chn
):
...
...
@@ -529,7 +668,8 @@ def _get_entities(pdb_file, up_acs):
upkb
=
_get_upkb_for_sequence
(
sqe
,
up_acs
[
i
])
cif_ent
[
"
pdb_sequence
"
]
=
sqe
cif_ent
[
"
pdb_chain_id
"
]
=
chn
.
name
cif_ent
[
"
description
"
]
=
f
"
Model of
{
upkb
[
'
up_gn
'
]
}
(
{
upkb
[
'
up_ac
'
]
}
)
"
cif_ent
[
"
description
"
]
=
f
"
{
upkb
[
'
up_organism
'
]
}
{
upkb
[
'
up_gn
'
]
}
"
\
f
"
(
{
upkb
[
'
up_ac
'
]
}
)
"
cif_ent
.
update
(
upkb
)
entities
.
append
(
cif_ent
)
...
...
@@ -542,8 +682,8 @@ def _get_scores(data, prfx):
with
open
(
scrs_fle
,
encoding
=
"
utf8
"
)
as
jfh
:
scrs_json
=
json
.
load
(
jfh
)
#
ToDo: is dict.update still the way to go
when iterating multiple model
#
direct
or
i
es
? Aka, does dict.update overwrite old scores?
#
NOTE for reuse of data
when iterating multiple model
s: this will overwrite
#
sc
ores
in data but will not delete any scores if prev. models had more...
data
.
update
(
scrs_json
)
...
...
@@ -570,10 +710,8 @@ def _get_modelcif_entities(target_ents, source, asym_units, system):
)
],
)
# ToDo [input]: Add details
asym_units
[
cif_ent
[
"
pdb_chain_id
"
]]
=
modelcif
.
AsymUnit
(
mdlcif_ent
,
details
=
"
struct_asym.details
"
,
mdlcif_ent
)
system
.
target_entities
.
append
(
mdlcif_ent
)
...
...
@@ -587,25 +725,34 @@ def _assemble_modelcif_software(soft_dict):
soft_dict
[
"
location
"
],
soft_dict
[
"
type
"
],
soft_dict
[
"
version
"
],
citation
=
ihm
.
Citation
(
pmid
=
soft_dict
[
"
citation
"
][
"
pmid
"
],
title
=
soft_dict
[
"
citation
"
][
"
title
"
],
journal
=
soft_dict
[
"
citation
"
][
"
journal
"
],
volume
=
soft_dict
[
"
citation
"
][
"
volume
"
],
page_range
=
soft_dict
[
"
citation
"
][
"
page_range
"
],
year
=
soft_dict
[
"
citation
"
][
"
year
"
],
authors
=
soft_dict
[
"
citation
"
][
"
authors
"
],
doi
=
soft_dict
[
"
citation
"
][
"
doi
"
],
),
citation
=
soft_dict
[
"
citation
"
]
)
def
_get_modelcif_protocol
(
protocol_steps
,
target_entities
,
model
):
def
_get_sequence_dbs
(
seq_dbs
):
"""
Get ColabFold seq. DBs.
"""
# NOTE: hard coded for ColabFold versions before 2022/07/13
# -> afterwards UniRef30 updated to 2022_02 (and maybe more changes)
db_dict
=
{
"
UniRef
"
:
modelcif
.
ReferenceDatabase
(
"
UniRef30
"
,
"
http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz
"
,
version
=
"
2021_03
"
),
"
Environmental
"
:
modelcif
.
ReferenceDatabase
(
"
ColabFold DB
"
,
"
http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz
"
,
version
=
"
2021_08
"
)
}
return
[
db_dict
[
seq_db
]
for
seq_db
in
seq_dbs
]
def
_get_modelcif_protocol
(
protocol_steps
,
target_entities
,
model
,
ref_dbs
):
"""
Create the protocol for the ModelCIF file.
"""
protocol
=
modelcif
.
protocol
.
Protocol
()
for
js_step
in
protocol_steps
:
sftwre
=
None
# ToDo [input]: Turn into software group if parameters are available
if
js_step
[
"
software
"
]:
if
len
(
js_step
[
"
software
"
])
==
1
:
sftwre
=
_assemble_modelcif_software
(
js_step
[
"
software
"
][
0
])
...
...
@@ -616,7 +763,6 @@ def _get_modelcif_protocol(protocol_steps, target_entities, model):
sftwre
=
modelcif
.
SoftwareGroup
(
elements
=
sftwre
)
if
js_step
[
"
software_parameters
"
]:
params
=
[]
# ToDo [internal]: handle lists!
for
k
,
v
in
js_step
[
"
software_parameters
"
].
items
():
params
.
append
(
modelcif
.
SoftwareParameter
(
k
,
v
)
...
...
@@ -630,7 +776,7 @@ def _get_modelcif_protocol(protocol_steps, target_entities, model):
if
js_step
[
"
input
"
]
==
"
target_sequences
"
:
input_data
=
modelcif
.
data
.
DataGroup
(
target_entities
)
# ToDo: Add databases + versions
input_data
.
extend
(
ref_dbs
)
elif
js_step
[
"
input
"
]
==
"
model
"
:
input_data
=
model
else
:
...
...
@@ -655,7 +801,28 @@ def _get_modelcif_protocol(protocol_steps, target_entities, model):
return
protocol
def
_store_as_modelcif
(
interaction_name
,
data_json
,
ost_ent
,
file_prfx
):
def
_compress_cif_file
(
cif_file
):
"""
Compress cif file and delete original.
"""
with
open
(
cif_file
,
'
rb
'
)
as
f_in
:
with
gzip
.
open
(
cif_file
+
'
.gz
'
,
'
wb
'
)
as
f_out
:
shutil
.
copyfileobj
(
f_in
,
f_out
)
os
.
remove
(
cif_file
)
def
_package_associated_files
(
mdl_name
):
"""
Compress associated files into single zip file and delete original.
"""
# file names must match ones from add_scores
zip_path
=
f
"
{
mdl_name
}
.zip
"
files
=
[
f
"
{
mdl_name
}
_local_pairwise_qa.cif
"
]
# zip settings tested for good speed vs compression
with
zipfile
.
ZipFile
(
zip_path
,
'
w
'
,
zipfile
.
ZIP_BZIP2
)
as
myzip
:
for
file
in
files
:
myzip
.
write
(
file
)
os
.
remove
(
file
)
def
_store_as_modelcif
(
interaction_name
,
data_json
,
ost_ent
,
out_dir
,
file_prfx
,
compress
):
"""
Mix all the data into a ModelCIF file.
"""
print
(
"
generating ModelCIF objects...
"
,
end
=
""
)
pstart
=
timer
()
...
...
@@ -678,27 +845,31 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx):
data_json
[
"
target_entities
"
],
source
,
asym_units
,
system
)
# ToDo [input]: Get Assembly name
assembly
=
modelcif
.
Assembly
(
asym_units
.
values
()
,
name
=
"
ma_struct_assembly_details.assembly_name
"
asym_units
.
values
()
)
# audit_authors
system
.
authors
.
extend
(
data_json
[
"
audit_authors
"
])
# set up the model to produce coordinates
# ToDo [input]: Get ma_model_list.model_name
if
data_json
[
"
rank_num
"
]
==
1
:
mdl_list_name
=
f
"
Model
{
data_json
[
'
mdl_num
'
]
}
(top ranked model)
"
else
:
mdl_list_name
=
f
"
Model
{
data_json
[
'
mdl_num
'
]
}
"
\
f
"
(#
{
data_json
[
'
rank_num
'
]
}
ranked model)
"
model
=
_OST2ModelCIF
(
assembly
=
assembly
,
asym
=
asym_units
,
ost_entity
=
ost_ent
,
name
=
"
ma_model_list.model
_name
"
,
name
=
mdl_list
_name
,
)
print
(
f
"
(
{
timer
()
-
pstart
:
.
2
f
}
s)
"
)
print
(
"
processing QA scores...
"
,
end
=
""
,
flush
=
True
)
pstart
=
timer
()
mdl_name
=
os
.
path
.
basename
(
file_prfx
)
system
.
repositories
.
append
(
model
.
add_scores
(
data_json
,
system
.
id
,
file_prfx
)
model
.
add_scores
(
data_json
,
system
.
id
,
mdl_name
)
)
print
(
f
"
(
{
timer
()
-
pstart
:
.
2
f
}
s)
"
)
...
...
@@ -707,26 +878,39 @@ def _store_as_modelcif(interaction_name, data_json, ost_ent, file_prfx):
)
system
.
model_groups
.
append
(
model_group
)
ref_dbs
=
_get_sequence_dbs
(
data_json
[
"
config_data
"
][
"
seq_dbs
"
])
protocol
=
_get_modelcif_protocol
(
data_json
[
"
protocol
"
],
system
.
target_entities
,
model
data_json
[
"
protocol
"
],
system
.
target_entities
,
model
,
ref_dbs
)
system
.
protocols
.
append
(
protocol
)
# write modelcif System to file
print
(
"
write to disk...
"
,
end
=
""
,
flush
=
True
)
pstart
=
timer
()
with
open
(
f
"
{
file_prfx
}
.cif
"
,
"
w
"
,
encoding
=
"
ascii
"
)
as
mmcif_fh
:
modelcif
.
dumper
.
write
(
mmcif_fh
,
[
system
])
# NOTE: this will dump PAE on path provided in add_scores
# -> hence we cheat by changing path and back while being exception-safe...
oldpwd
=
os
.
getcwd
()
os
.
chdir
(
out_dir
)
try
:
with
open
(
f
"
{
mdl_name
}
.cif
"
,
"
w
"
,
encoding
=
"
ascii
"
)
as
mmcif_fh
:
modelcif
.
dumper
.
write
(
mmcif_fh
,
[
system
])
_package_associated_files
(
mdl_name
)
if
compress
:
_compress_cif_file
(
f
"
{
mdl_name
}
.cif
"
)
finally
:
os
.
chdir
(
oldpwd
)
print
(
f
"
(
{
timer
()
-
pstart
:
.
2
f
}
s)
"
)
def
_create_interaction_json
(
cnfg_
file
):
def
_create_interaction_json
(
c
o
nf
i
g_
data
):
"""
Create a dictionary (mimicking JSON) that contains data which is the same
for all models.
"""
data
=
{}
data
[
"
audit_authors
"
]
=
_get_audit_authors
()
data
[
"
protocol
"
]
=
_get_protocol_steps_and_software
(
cnfg_file
)
data
[
"
protocol
"
]
=
_get_protocol_steps_and_software
(
config_data
)
data
[
"
config_data
"
]
=
config_data
return
data
...
...
@@ -756,14 +940,15 @@ def _main():
up_acs
=
interaction
.
split
(
"
-
"
)
cnfg
=
_check_interaction_extra_files_present
(
opts
.
model_dir
)
mdlcf_json
=
_create_interaction_json
(
cnfg
)
config_data
=
_parse_colabfold_config
(
cnfg
)
# iterate model directory
for
fle
in
os
.
listdir
(
opts
.
model_dir
):
for
fle
in
sorted
(
os
.
listdir
(
opts
.
model_dir
)
)
:
# iterate PDB files
if
not
fle
.
endswith
(
"
.pdb
"
):
continue
if
opts
.
top_ranked_only
and
"
rank_1
"
not
in
fle
:
continue
print
(
f
"
translating
{
fle
}
...
"
)
pdb_start
=
timer
()
file_prfx
,
uid
=
_check_model_extra_files_present
(
opts
.
model_dir
,
fle
)
...
...
@@ -772,14 +957,26 @@ def _main():
# gather data into JSON-like structure
print
(
"
preparing data...
"
,
end
=
""
)
pstart
=
timer
()
# NOTE: could also be prepared globally if all carefully overwritten
# but not worth the trouble...
mdlcf_json
=
_create_interaction_json
(
config_data
)
# uid = ..._rank_X_model_Y.pdb
mdl_name_parts
=
uid
.
split
(
'
_
'
)
assert
mdl_name_parts
[
-
4
]
==
"
rank
"
assert
mdl_name_parts
[
-
2
]
==
"
model
"
mdlcf_json
[
"
rank_num
"
]
=
int
(
mdl_name_parts
[
-
3
])
mdlcf_json
[
"
mdl_num
"
]
=
int
(
mdl_name_parts
[
-
1
])
ost_ent
=
_create_model_json
(
mdlcf_json
,
fle
,
up_acs
)
# read quality scores from JSON file
_get_scores
(
mdlcf_json
,
file_prfx
)
print
(
f
"
(
{
timer
()
-
pstart
:
.
2
f
}
s)
"
)
_store_as_modelcif
(
uid
,
mdlcf_json
,
ost_ent
,
file_prfx
)
# ToDo [internal]: wipe data or is it overwritten in mdlcf_json?
_store_as_modelcif
(
uid
,
mdlcf_json
,
ost_ent
,
opts
.
out_dir
,
file_prfx
,
opts
.
compress
)
print
(
f
"
... done with
{
fle
}
(
{
timer
()
-
pdb_start
:
.
2
f
}
s).
"
)
print
(
f
"
... done with
{
opts
.
model_dir
}
.
"
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment