Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
AlphaPulldown-ModelCIF-Conversion
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Bienchen
AlphaPulldown-ModelCIF-Conversion
Commits
45ffa009
Commit
45ffa009
authored
1 year ago
by
Bienchen
Browse files
Options
Downloads
Patches
Plain Diff
Get a proper model name
parent
d45d73c3
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
convert_to_modelcif.py
+72
-29
72 additions, 29 deletions
convert_to_modelcif.py
with
72 additions
and
29 deletions
convert_to_modelcif.py
+
72
−
29
View file @
45ffa009
...
@@ -29,6 +29,15 @@ import modelcif.model
...
@@ -29,6 +29,15 @@ import modelcif.model
# ToDo: Get options properly, best get the same names as used in existing
# ToDo: Get options properly, best get the same names as used in existing
# scripts, e.g. could '--monomer_objects_dir' be used as feature
# scripts, e.g. could '--monomer_objects_dir' be used as feature
# directory/ directory with the feature JSON files?
# directory/ directory with the feature JSON files?
# ToDo: Monomers work separately - features may come from different set of
# software, databases... so target sequences may be connected to different
# versions of the same sequence database, may use different versions of
# software... can this still go into single protocol steps or would this
# prevent identifying which MSAs were produced with which software? E.g.
# can there still be a single "sequence search" step? (This is
# definitely for later, not the first working version of the converter
# script)
# ToDo: sort non-ModelCIF items in the main JSON object into '__meta__'
flags
.
DEFINE_string
(
flags
.
DEFINE_string
(
"
ap_output
"
,
None
,
"
AlphaPulldown pipeline output directory.
"
"
ap_output
"
,
None
,
"
AlphaPulldown pipeline output directory.
"
)
)
...
@@ -245,19 +254,16 @@ def _store_as_modelcif(
...
@@ -245,19 +254,16 @@ def _store_as_modelcif(
system
,
system
,
)
)
# ToDo: get modelling-experiment au
h
tors
# ToDo: get modelling-experiment aut
h
ors
# audit_authors
# audit_authors
# system.authors.extend(data_json["audit_authors"])
# system.authors.extend(data_json["audit_authors"])
# set up the model to produce coordinates
# set up the model to produce coordinates
# ToDo: model file names are different than expected, got ranked_<N>.pdb
# and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something
# like model_<N>_rank_<M>.pdb, used for 'model list name'
model
=
_Biopython2ModelCIF
(
model
=
_Biopython2ModelCIF
(
assembly
=
modelcif
.
Assembly
(
asym_units
.
values
()),
assembly
=
modelcif
.
Assembly
(
asym_units
.
values
()),
asym
=
asym_units
,
asym
=
asym_units
,
bio_pdb_structure
=
structure
,
bio_pdb_structure
=
structure
,
name
=
"
ToDo: Model <N> (ranked #<M>)
"
,
name
=
data_json
[
"
_ma_model_list.model_name
"
]
,
)
)
# create software list from feature metadata
# create software list from feature metadata
...
@@ -327,6 +333,7 @@ def _compress_cif_file(cif_file):
...
@@ -327,6 +333,7 @@ def _compress_cif_file(cif_file):
def
_get_model_details
(
cmplx_name
:
str
,
data_json
:
dict
)
->
str
:
def
_get_model_details
(
cmplx_name
:
str
,
data_json
:
dict
)
->
str
:
"""
Get the model description.
"""
"""
Get the model description.
"""
ap_versions
=
[]
ap_versions
=
[]
af2_version
=
None
for
mnmr
in
data_json
[
"
__meta__
"
]:
# mnmr = monomer
for
mnmr
in
data_json
[
"
__meta__
"
]:
# mnmr = monomer
if
(
if
(
data_json
[
"
__meta__
"
][
mnmr
][
"
software
"
][
"
alphapulldown
"
][
"
version
"
]
data_json
[
"
__meta__
"
][
mnmr
][
"
software
"
][
"
alphapulldown
"
][
"
version
"
]
...
@@ -337,11 +344,27 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str:
...
@@ -337,11 +344,27 @@ def _get_model_details(cmplx_name: str, data_json: dict) -> str:
"
version
"
"
version
"
]
]
)
)
# AlphaFold-Multimer builds the model we are looking at, can only be a
# single version.
if
af2_version
is
None
:
af2_version
=
data_json
[
"
__meta__
"
][
mnmr
][
"
software
"
][
"
alphafold
"
][
"
version
"
]
else
:
if
(
data_json
[
"
__meta__
"
][
mnmr
][
"
software
"
][
"
alphafold
"
][
"
version
"
]
!=
af2_version
):
# pylint: disable=line-too-long
raise
RuntimeError
(
"
Different versions of AlphaFold-Multimer found:
"
+
f
"'
{
data_json
[
'
__meta__
'
][
mnmr
][
'
software
'
][
'
alphafold
'
][
'
version
'
]
}
'"
+
f
"
vs.
'
{
af2_version
}
'"
)
# ToDo: fetch AF2 version/ have it in metadata JSON
return
(
return
(
f
"
Model generated for
{
'
and
'
.
join
(
cmplx_name
)
}
, produced
"
f
"
Model generated for
{
'
and
'
.
join
(
cmplx_name
)
}
, produced
"
+
"
using AlphaFold-Multimer (
<AF2 VERSION>
) as implemented by
"
+
f
"
using AlphaFold-Multimer (
{
af2_version
}
) as implemented by
"
+
f
"
AlphaPulldown (
{
'
,
'
.
join
(
ap_versions
)
}
).
"
+
f
"
AlphaPulldown (
{
'
,
'
.
join
(
ap_versions
)
}
).
"
)
)
...
@@ -377,8 +400,11 @@ def _get_feature_metadata(
...
@@ -377,8 +400,11 @@ def _get_feature_metadata(
return
cmplx_name
return
cmplx_name
def
_get_data_block_id_and_struct_and_entry_categories
(
def
_get_model_info
(
cif_json
:
dict
,
cmplx_name
:
str
cif_json
:
dict
,
cmplx_name
:
str
,
mdl_id
:
str
,
mdl_rank
:
int
,
)
->
None
:
)
->
None
:
"""
Get
'
data_
'
block ID and data for categories
'
_struct
'
and
'
_entry
'
.
"""
"""
Get
'
data_
'
block ID and data for categories
'
_struct
'
and
'
_entry
'
.
"""
cif_json
[
"
data_
"
]
=
"
_
"
.
join
(
cmplx_name
)
cif_json
[
"
data_
"
]
=
"
_
"
.
join
(
cmplx_name
)
...
@@ -386,6 +412,9 @@ def _get_data_block_id_and_struct_and_entry_categories(
...
@@ -386,6 +412,9 @@ def _get_data_block_id_and_struct_and_entry_categories(
cif_json
[
"
_struct.pdbx_model_details
"
]
=
_get_model_details
(
cif_json
[
"
_struct.pdbx_model_details
"
]
=
_get_model_details
(
cmplx_name
,
cif_json
cmplx_name
,
cif_json
)
)
cif_json
[
"
_ma_model_list.model_name
"
]
=
f
"
Model
{
mdl_id
}
(ranked #
{
mdl_rank
}
)
"
def
_get_entities
(
def
_get_entities
(
...
@@ -525,8 +554,7 @@ def _get_software_data(meta_json: dict) -> list:
...
@@ -525,8 +554,7 @@ def _get_software_data(meta_json: dict) -> list:
def
alphapulldown_model_to_modelcif
(
def
alphapulldown_model_to_modelcif
(
cmplx_name
:
str
,
cmplx_name
:
str
,
mdl_file
:
str
,
mdl
:
tuple
,
scr_file
:
str
,
out_dir
:
str
,
out_dir
:
str
,
prj_dir
:
str
,
prj_dir
:
str
,
compress
:
bool
=
False
,
compress
:
bool
=
False
,
...
@@ -536,31 +564,31 @@ def alphapulldown_model_to_modelcif(
...
@@ -536,31 +564,31 @@ def alphapulldown_model_to_modelcif(
Metadata for the ModelCIF categories will be fetched from AlphaPulldown
Metadata for the ModelCIF categories will be fetched from AlphaPulldown
output as far as possible. This expects modelling projects to exist in
output as far as possible. This expects modelling projects to exist in
AlphaPulldown
'
s output directory structure.
"""
AlphaPulldown
'
s output directory structure.
"""
# ToDo: ENABLE logging.info(f"Processing '{mdl
_file
}'...")
# ToDo: ENABLE logging.info(f"Processing '{mdl
[0]
}'...")
modelcif_json
=
{}
modelcif_json
=
{}
# fetch metadata
# fetch metadata
cmplx_name
=
_get_feature_metadata
(
modelcif_json
,
cmplx_name
,
prj_dir
)
cmplx_name
=
_get_feature_metadata
(
modelcif_json
,
cmplx_name
,
prj_dir
)
# fetch/ assemble more data about the modelling experiment
# fetch/ assemble more data about the modelling experiment
_get_data_block_id_and_struct_and_entry_categories
(
_get_model_info
(
modelcif_json
,
cmplx_name
modelcif_json
,
cmplx_name
,
mdl
[
2
],
mdl
[
3
],
)
)
# gather target entities (sequences that have been modeled) info
# gather target entities (sequences that have been modeled) info
structure
=
_get_entities
(
modelcif_json
,
mdl
_file
,
cmplx_name
,
prj_dir
)
structure
=
_get_entities
(
modelcif_json
,
mdl
[
0
]
,
cmplx_name
,
prj_dir
)
# read quality scores from pickle file
# read quality scores from pickle file
_get_scores
(
modelcif_json
,
scr_file
)
_get_scores
(
modelcif_json
,
mdl
[
1
]
)
_store_as_modelcif
(
modelcif_json
,
structure
,
mdl
_file
,
out_dir
,
compress
)
_store_as_modelcif
(
modelcif_json
,
structure
,
mdl
[
0
]
,
out_dir
,
compress
)
# ToDo: ENABLE logging.info(f"... done with '{mdl
_file
}'")
# ToDo: ENABLE logging.info(f"... done with '{mdl
[0]
}'")
def
_get_model_list
(
ap_dir
:
str
,
model_selected
:
str
)
->
Tuple
[
str
,
str
,
list
]:
def
_get_model_list
(
ap_dir
:
str
,
model_selected
:
str
)
->
Tuple
[
str
,
str
,
list
]:
"""
Get the list of models to be converted.
"""
Get the list of models to be converted.
If `model_selected` is None, all models will be marked for conversion.
"""
If `model_selected` is None, all models will be marked for conversion.
"""
# ToDo: Question - use 'ranked_*.pdb' or
# 'unrelaxed_model_*_multimer_v3_pred_0.pdb' models?
mdl_path
=
os
.
path
.
join
(
ap_dir
,
"
models
"
)
mdl_path
=
os
.
path
.
join
(
ap_dir
,
"
models
"
)
cmplx
=
os
.
listdir
(
mdl_path
)
cmplx
=
os
.
listdir
(
mdl_path
)
# For now, exactly 1 complex is expected in the 'models' subdirectory. If
# For now, exactly 1 complex is expected in the 'models' subdirectory. If
...
@@ -576,7 +604,7 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
...
@@ -576,7 +604,7 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
ranking_dbg
=
os
.
path
.
join
(
mdl_path
,
"
ranking_debug.json
"
)
ranking_dbg
=
os
.
path
.
join
(
mdl_path
,
"
ranking_debug.json
"
)
if
not
os
.
path
.
isfile
(
ranking_dbg
):
if
not
os
.
path
.
isfile
(
ranking_dbg
):
logging
.
info
(
logging
.
info
(
f
"
Ranking file
'
{
ranking_dbg
}
doe
snot exist or is no regular
"
f
"
Ranking file
'
{
ranking_dbg
}
does
not exist or is no regular
"
+
"
file.
"
+
"
file.
"
)
)
sys
.
exit
()
sys
.
exit
()
...
@@ -584,13 +612,24 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
...
@@ -584,13 +612,24 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
ranking_dbg
=
json
.
load
(
jfh
)
ranking_dbg
=
json
.
load
(
jfh
)
score_files
=
{}
score_files
=
{}
for
i
,
fle
in
enumerate
(
ranking_dbg
[
"
order
"
]):
for
i
,
fle
in
enumerate
(
ranking_dbg
[
"
order
"
]):
score_files
[
i
]
=
os
.
path
.
join
(
mdl_path
,
f
"
result_
{
fle
}
.pkl
"
)
if
not
fle
.
startswith
(
"
model_
"
):
raise
RuntimeError
(
"
Filename does not start with
'
model_
'
, can
"
+
f
"
not determine model ID:
'
{
fle
}
'"
)
score_files
[
i
]
=
(
os
.
path
.
join
(
mdl_path
,
f
"
result_
{
fle
}
.pkl
"
),
fle
.
split
(
"
_
"
)[
1
],
i
,
)
# match PDB files with pickle files
# match PDB files with pickle files
if
model_selected
is
not
None
:
if
model_selected
is
not
None
:
models
.
append
(
models
.
append
(
(
(
os
.
path
.
join
(
mdl_path
,
f
"
ranked_
{
model_selected
}
.pdb
"
),
os
.
path
.
join
(
mdl_path
,
f
"
ranked_
{
model_selected
}
.pdb
"
),
score_files
[
model_selected
],
score_files
[
model_selected
][
0
],
score_files
[
model_selected
][
1
],
# model ID
score_files
[
model_selected
][
2
],
# model rank
)
)
)
)
else
:
else
:
...
@@ -598,10 +637,15 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
...
@@ -598,10 +637,15 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
rank
=
re
.
match
(
r
"
ranked_(\d+)\.pdb
"
,
mdl
)
rank
=
re
.
match
(
r
"
ranked_(\d+)\.pdb
"
,
mdl
)
if
rank
is
not
None
:
if
rank
is
not
None
:
rank
=
int
(
rank
.
group
(
1
))
rank
=
int
(
rank
.
group
(
1
))
models
.
append
((
os
.
path
.
join
(
mdl_path
,
mdl
),
score_files
[
rank
]))
models
.
append
(
(
os
.
path
.
join
(
mdl_path
,
mdl
),
score_files
[
rank
][
0
].
score_files
[
rank
][
1
],
)
)
# check that files actually exist
# check that files actually exist
for
mdl
,
scrs
in
models
:
for
mdl
,
scrs
,
*
_
in
models
:
if
not
os
.
path
.
isfile
(
mdl
):
if
not
os
.
path
.
isfile
(
mdl
):
logging
.
info
(
logging
.
info
(
f
"
Model file
'
{
mdl
}
'
does not exist or is not a regular file.
"
f
"
Model file
'
{
mdl
}
'
does not exist or is not a regular file.
"
...
@@ -642,11 +686,10 @@ def main(argv):
...
@@ -642,11 +686,10 @@ def main(argv):
complex_name
,
model_dir
,
model_list
=
_get_model_list
(
complex_name
,
model_dir
,
model_list
=
_get_model_list
(
FLAGS
.
ap_output
,
FLAGS
.
model_selected
FLAGS
.
ap_output
,
FLAGS
.
model_selected
)
)
for
mdl
,
scrs
in
model_list
:
for
mdl
in
model_list
:
alphapulldown_model_to_modelcif
(
alphapulldown_model_to_modelcif
(
complex_name
,
complex_name
,
mdl
,
mdl
,
scrs
,
model_dir
,
model_dir
,
FLAGS
.
ap_output
,
FLAGS
.
ap_output
,
FLAGS
.
compress
,
FLAGS
.
compress
,
...
@@ -668,4 +711,4 @@ if __name__ == "__main__":
...
@@ -668,4 +711,4 @@ if __name__ == "__main__":
# ToDo: make sure all functions come with types
# ToDo: make sure all functions come with types
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: multimer sif Jupyter aa
# LocalWords: multimer sif Jupyter aa
MSAs
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment