Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
AlphaPulldown-ModelCIF-Conversion
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Bienchen
AlphaPulldown-ModelCIF-Conversion
Commits
490cdfe0
Commit
490cdfe0
authored
1 year ago
by
Bienchen
Browse files
Options
Downloads
Patches
Plain Diff
Added global and local pLDDT
parent
09b5e132
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
convert_to_modelcif.py
+132
-11
132 additions, 11 deletions
convert_to_modelcif.py
with
132 additions
and
11 deletions
convert_to_modelcif.py
+
132
−
11
View file @
490cdfe0
...
...
@@ -7,12 +7,15 @@ from typing import Tuple
import
hashlib
import
json
import
os
import
pickle
import
re
import
sys
from
Bio
import
SeqIO
from
Bio.PDB
import
PDBParser
,
PPBuilder
from
Bio.PDB.Structure
import
Structure
as
BioStructure
from
absl
import
app
,
flags
,
logging
import
numpy
as
np
import
modelcif
import
modelcif.dumper
...
...
@@ -39,6 +42,41 @@ FLAGS = flags.FLAGS
# exist as expected.
# pylint: disable=too-few-public-methods
class
_GlobalPLDDT
(
modelcif
.
qa_metric
.
Global
,
modelcif
.
qa_metric
.
PLDDT
):
"""
Predicted accuracy according to the CA-only lDDT in [0,100]
"""
name
=
"
pLDDT
"
software
=
None
class
_GlobalPTM
(
modelcif
.
qa_metric
.
Global
,
modelcif
.
qa_metric
.
PTM
):
"""
Predicted accuracy according to the TM-score score in [0,1]
"""
name
=
"
pTM
"
software
=
None
class
_GlobalIPTM
(
modelcif
.
qa_metric
.
Global
,
modelcif
.
qa_metric
.
IpTM
):
# python-modelcif reads the first line of class-doc has description of a
# score, so we need to allow long lines, here.
# pylint: disable=line-too-long
"""
Predicted protein-protein interface score, based on the TM-score score in [0,1].
"""
name
=
"
ipTM
"
software
=
None
class
_LocalPLDDT
(
modelcif
.
qa_metric
.
Local
,
modelcif
.
qa_metric
.
PLDDT
):
"""
Predicted accuracy according to the CA-only lDDT in [0,100]
"""
name
=
"
pLDDT
"
software
=
None
# pylint: enable=too-few-public-methods
class
_Biopython2ModelCIF
(
modelcif
.
model
.
AbInitioModel
):
"""
Map Biopython PDB.Structure object to ihm.model
"""
...
...
@@ -64,6 +102,44 @@ class _Biopython2ModelCIF(modelcif.model.AbInitioModel):
occupancy
=
atm
.
occupancy
,
)
def
add_scores
(
self
,
scores_json
,
entry_id
):
"""
Add QA metrics
"""
# global scores
self
.
qa_metrics
.
extend
(
(
_GlobalPLDDT
(
np
.
mean
(
scores_json
[
"
plddt
"
])),
_GlobalPTM
(
scores_json
[
"
ptm
"
]),
_GlobalIPTM
(
scores_json
[
"
iptm
"
]),
)
)
# local scores
# iterate polypetide chains
# local PLDDT
i
=
0
# aa_only=False includes non-canonical amino acids but seems to skip
# non-peptide-linking residues like ions
for
chn_i
in
PPBuilder
().
build_peptides
(
self
.
structure
,
aa_only
=
False
):
for
res_i
in
chn_i
:
# local pLDDT
# Assertion assumes that pLDDT values are also stored in the
# B-factor column.
assert
(
round
(
scores_json
[
"
plddt
"
][
i
],
2
)
==
next
(
res_i
.
get_atoms
()).
bfactor
)
self
.
qa_metrics
.
append
(
_LocalPLDDT
(
self
.
asym
[
res_i
.
parent
.
id
].
residue
(
res_i
.
id
[
1
]),
round
(
scores_json
[
"
plddt
"
][
i
],
2
),
)
)
# pairwise alignment error
j
=
0
i
+=
1
def
_get_modelcif_entities
(
target_ents
,
asym_units
,
system
):
"""
Create ModelCIF entities and asymmetric units.
"""
...
...
@@ -84,7 +160,7 @@ def _store_as_modelcif(
structure
:
BioStructure
,
mdl_file
:
str
,
out_dir
:
str
,
#
ost_ent,
file_prfx, compress, add_files
# file_prfx, compress, add_files
)
->
None
:
"""
Create the actual ModelCIF file.
"""
system
=
modelcif
.
System
(
...
...
@@ -118,10 +194,8 @@ def _store_as_modelcif(
)
# process scores
# ToDo: Get scores, but where are they stored? For other AF2 related
# projects, there are JSON or Pickle files with all AF2 scores
# system.repositories.append(
#
model.add_scores(data_json, system.id
, mdl_name, add_files
)
model
.
add_scores
(
data_json
,
system
.
id
)
# )
system
.
model_groups
.
append
(
modelcif
.
model
.
ModelGroup
([
model
]))
...
...
@@ -240,6 +314,9 @@ def _get_entities(
structure
=
PDBParser
().
get_structure
(
"
_
"
.
join
(
cmplx_name
),
pdb_file
)
cif_json
[
"
target_entities
"
]
=
[]
already_seen
=
[]
# ToDo: skip using chains to make it safe for models with ligands. Chains
# are only used for the chain ID. That can be obtained from the first
# residue in the sequence: seq[0].parent.id
for
chn
,
seq
in
zip
(
structure
.
get_chains
(),
PPBuilder
().
build_peptides
(
structure
)
):
...
...
@@ -262,9 +339,22 @@ def _get_entities(
return
structure
def
_get_scores
(
cif_json
:
dict
,
scr_file
:
str
)
->
None
:
"""
Add scores to JSON data.
"""
with
open
(
scr_file
,
"
rb
"
)
as
sfh
:
scr_dict
=
pickle
.
load
(
sfh
)
# dict_keys(['distogram', 'experimentally_resolved', 'masked_msa', 'num_recycles', 'predicted_aligned_error', 'structure_module', 'plddt', 'aligned_confidence_probs', 'max_predicted_aligned_error', 'ranking_confidence'])
# Get pLDDT as a list, the global pLDDT is the average, calculated on the
# spot.
cif_json
[
"
plddt
"
]
=
scr_dict
[
"
plddt
"
]
cif_json
[
"
ptm
"
]
=
float
(
scr_dict
[
"
ptm
"
])
cif_json
[
"
iptm
"
]
=
float
(
scr_dict
[
"
iptm
"
])
def
alphapulldown_model_to_modelcif
(
cmplx_name
:
str
,
mdl_file
:
str
,
scr_file
:
str
,
out_dir
:
str
,
prj_dir
:
str
,
)
->
None
:
...
...
@@ -284,6 +374,9 @@ def alphapulldown_model_to_modelcif(
# gather target entities (sequences that have been modeled) info
structure
=
_get_entities
(
modelcif_json
,
mdl_file
,
cmplx_name
,
prj_dir
)
# read quality scores from pickle file
_get_scores
(
modelcif_json
,
scr_file
)
_store_as_modelcif
(
modelcif_json
,
structure
,
mdl_file
,
out_dir
)
# ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")
...
...
@@ -304,20 +397,48 @@ def _get_model_list(ap_dir: str, model_selected: str) -> Tuple[str, str, list]:
cmplx
=
cmplx
[
0
]
mdl_path
=
os
.
path
.
join
(
mdl_path
,
cmplx
)
models
=
[]
# We are going for models with name "rank_?.pdb" as it does not depend on
# relaxation. But this means we have to match the pickle files via
# ranking_debug.json.
ranking_dbg
=
os
.
path
.
join
(
mdl_path
,
"
ranking_debug.json
"
)
if
not
os
.
path
.
isfile
(
ranking_dbg
):
logging
.
info
(
f
"
Ranking file
'
{
ranking_dbg
}
doe snot exist or is no regular
"
+
"
file.
"
)
sys
.
exit
()
with
open
(
ranking_dbg
,
"
r
"
,
encoding
=
"
utf8
"
)
as
jfh
:
ranking_dbg
=
json
.
load
(
jfh
)
score_files
=
{}
for
i
,
fle
in
enumerate
(
ranking_dbg
[
"
order
"
]):
score_files
[
i
]
=
os
.
path
.
join
(
mdl_path
,
f
"
result_
{
fle
}
.pkl
"
)
# match PDB files with pickle files
if
model_selected
is
not
None
:
models
.
append
(
os
.
path
.
join
(
mdl_path
,
f
"
ranked_
{
model_selected
}
.pdb
"
))
models
.
append
(
(
os
.
path
.
join
(
mdl_path
,
f
"
ranked_
{
model_selected
}
.pdb
"
),
score_files
[
model_selected
],
)
)
else
:
for
mdl
in
os
.
listdir
(
mdl_path
):
if
mdl
.
startswith
(
"
ranked_
"
):
models
.
append
(
os
.
path
.
join
(
mdl_path
,
mdl
))
rank
=
re
.
match
(
r
"
ranked_(\d+)\.pdb
"
,
mdl
)
if
rank
is
not
None
:
rank
=
int
(
rank
.
group
(
1
))
models
.
append
((
os
.
path
.
join
(
mdl_path
,
mdl
),
score_files
[
rank
]))
# check that files actually exist
for
mdl
in
models
:
for
mdl
,
scrs
in
models
:
if
not
os
.
path
.
isfile
(
mdl
):
logging
.
info
(
f
"
Model file
'
{
mdl
}
'
does not exist or is not a regular file.
"
)
sys
.
exit
()
if
not
os
.
path
.
isfile
(
scrs
):
logging
.
info
(
f
"
Scores file
'
{
scrs
}
'
does not exist or is not a regular file.
"
)
sys
.
exit
()
return
cmplx
,
mdl_path
,
models
...
...
@@ -348,9 +469,9 @@ def main(argv):
complex_name
,
model_dir
,
model_list
=
_get_model_list
(
FLAGS
.
ap_output
,
FLAGS
.
model_selected
)
for
mdl
in
model_list
:
for
mdl
,
scrs
in
model_list
:
alphapulldown_model_to_modelcif
(
complex_name
,
mdl
,
model_dir
,
FLAGS
.
ap_output
complex_name
,
mdl
,
scrs
,
model_dir
,
FLAGS
.
ap_output
)
...
...
@@ -369,4 +490,4 @@ if __name__ == "__main__":
# ToDo: make sure all functions come with types
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: multimer sif Jupyter
# LocalWords: multimer sif Jupyter
aa
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment