Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
AlphaPulldown-ModelCIF-Conversion
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Bienchen
AlphaPulldown-ModelCIF-Conversion
Commits
150a9cbf
Commit
150a9cbf
authored
1 year ago
by
Bienchen
Browse files
Options
Downloads
Patches
Plain Diff
Notes on protocol steps
parent
5f2676e3
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
convert_to_modelcif.py
+145
-2
145 additions, 2 deletions
convert_to_modelcif.py
with
145 additions
and
2 deletions
convert_to_modelcif.py
+
145
−
2
View file @
150a9cbf
...
...
@@ -4,14 +4,19 @@
file with a lot of metadata in place.
"""
from
typing
import
Tuple
import
hashlib
import
json
import
os
import
sys
from
Bio
import
SeqIO
from
Bio.PDB
import
PDBParser
,
PPBuilder
from
Bio.PDB.Structure
import
Structure
as
BioStructure
from
absl
import
app
,
flags
,
logging
import
modelcif
import
modelcif.dumper
import
modelcif.model
# ToDo: Get options properly, best get the same names as used in existing
# scripts, e.g. could '--monomer_objects_dir' be used as feature
...
...
@@ -34,8 +39,49 @@ FLAGS = flags.FLAGS
# exist as expected.
class
_Biopython2ModelCIF
(
modelcif
.
model
.
AbInitioModel
):
"""
Map Biopython PDB.Structure object to ihm.model
"""
def
__init__
(
self
,
*
args
,
**
kwargs
):
"""
Initialise a model
"""
self
.
structure
=
kwargs
.
pop
(
"
bio_pdb_structure
"
)
self
.
asym
=
kwargs
.
pop
(
"
asym
"
)
super
().
__init__
(
*
args
,
**
kwargs
)
def
get_atoms
(
self
):
for
atm
in
self
.
structure
.
get_atoms
():
yield
modelcif
.
model
.
Atom
(
asym_unit
=
self
.
asym
[
atm
.
parent
.
parent
.
id
],
seq_id
=
atm
.
parent
.
id
[
1
],
atom_id
=
atm
.
name
,
type_symbol
=
atm
.
element
,
x
=
atm
.
coord
[
0
],
y
=
atm
.
coord
[
1
],
z
=
atm
.
coord
[
2
],
het
=
atm
.
parent
.
id
[
0
]
!=
"
"
,
biso
=
atm
.
bfactor
,
occupancy
=
atm
.
occupancy
,
)
def
_get_modelcif_entities
(
target_ents
,
asym_units
,
system
):
"""
Create ModelCIF entities and asymmetric units.
"""
for
cif_ent
in
target_ents
:
mdlcif_ent
=
modelcif
.
Entity
(
# 'pdb_sequence' can be used here, since AF2 always has the
# complete sequence.
cif_ent
[
"
pdb_sequence
"
],
description
=
cif_ent
[
"
description
"
],
)
for
pdb_chain_id
in
cif_ent
[
"
pdb_chain_id
"
]:
asym_units
[
pdb_chain_id
]
=
modelcif
.
AsymUnit
(
mdlcif_ent
)
system
.
target_entities
.
append
(
mdlcif_ent
)
def
_store_as_modelcif
(
data_json
:
dict
,
structure
:
BioStructure
,
mdl_file
:
str
,
out_dir
:
str
,
# ost_ent, file_prfx, compress, add_files
...
...
@@ -47,6 +93,52 @@ def _store_as_modelcif(
model_details
=
data_json
[
"
_struct.pdbx_model_details
"
],
)
# create target entities, references, source, asymmetric units & assembly
# create an asymmetric unit and an entity per target sequence
asym_units
=
{}
_get_modelcif_entities
(
data_json
[
"
target_entities
"
],
asym_units
,
system
,
)
# ToDo: get modelling-experiment auhtors
# audit_authors
# system.authors.extend(data_json["audit_authors"])
# set up the model to produce coordinates
# ToDo: model file names are different than expected, got ranked_<N>.pdb
# and unrelaxed_model_<N>_multimer_v3_pred_0.pdb, expected something
# like model_<N>_rank_<M>.pdb, used for 'model list name'
model
=
_Biopython2ModelCIF
(
assembly
=
modelcif
.
Assembly
(
asym_units
.
values
()),
asym
=
asym_units
,
bio_pdb_structure
=
structure
,
name
=
"
ToDo: Model <N> (ranked #<M>)
"
,
)
# ToDO: check data_json["model_group_name"] at Tara's
system
.
model_groups
.
append
(
modelcif
.
model
.
ModelGroup
([
model
]))
# ToDo: get protocol steps together
# - 'coevolution MSA' (create_individual_features.py) step
# - 'template search' is usually omitted for AF2 as that term more
# relates to homology modelling, AF2 uses templates in a different
# way and a template search usually results in a list of templates
# which would need to be included, here, in theory
# - for MSA/ template search, how to represent the outcome? Adding the
# pickle files quickly exceeds reasonable storage use
# - 'modeling' (run_multimer_jobs.py), are the four modes reflected by
# the JSON data/ does the JSON data look different for each mode?
# - are the scores only calculated by alpha-analysis.sif or do they
# come out of run_multimer_jobs.py? Does this go into its own step?
# - could Jupyter notebooks be included as associated file or do they
# require the pickle files?
# - what about including the tabular summary?
# - model selection: only if just a certain model is translated to
# ModelCIF, or mix it with scoring step?
# system.protocols.append()
# write modelcif.System to file
# NOTE: this will dump PAE on path provided in add_scores
# -> hence we cheat by changing path and back while being exception-safe...
...
...
@@ -117,6 +209,53 @@ def _get_data_block_id_and_struct_and_entry_categories(
)
def
_get_entities
(
cif_json
:
dict
,
pdb_file
:
str
,
cmplx_name
:
list
,
prj_dir
:
str
)
->
BioStructure
:
"""
Gather data for the mmCIF (target) entities.
"""
# read sequence from FastA, map to sequences from PDB file
sequences
=
{}
# Fetch FastA sequences to assign monomers to the molecular entities
prj_dir
=
os
.
path
.
join
(
prj_dir
,
"
fastas
"
)
if
not
os
.
path
.
isdir
(
prj_dir
):
logging
.
info
(
f
"
FastA directory
'
{
prj_dir
}
'
does not exist.
"
)
sys
.
exit
()
for
mnmr
in
cmplx_name
:
fasta
=
os
.
path
.
join
(
prj_dir
,
f
"
{
mnmr
}
.fa
"
)
if
not
os
.
path
.
isfile
(
fasta
):
logging
.
info
(
f
"
FastA file
'
{
fasta
}
'
does not exist.
"
)
sys
.
exit
()
with
open
(
fasta
,
"
r
"
,
encoding
=
"
ascii
"
)
as
ffh
:
seq
=
SeqIO
.
read
(
ffh
,
"
fasta
"
)
# Using MD5 sums for comparing sequences only works since AF2
# *ALWAYS* has the complete sequence in a chain.
sequences
[
hashlib
.
md5
(
seq
.
seq
.
encode
()).
hexdigest
()]
=
mnmr
# gather molecular entities from PDB file
structure
=
PDBParser
().
get_structure
(
"
_
"
.
join
(
cmplx_name
),
pdb_file
)
cif_json
[
"
target_entities
"
]
=
[]
already_seen
=
[]
for
chn
,
seq
in
zip
(
structure
.
get_chains
(),
PPBuilder
().
build_peptides
(
structure
)
):
seq
=
seq
.
get_sequence
()
seq_md5
=
hashlib
.
md5
(
seq
.
encode
()).
hexdigest
()
cif_ent
=
{}
try
:
e_idx
=
already_seen
.
index
(
seq_md5
)
except
ValueError
:
pass
else
:
cif_json
[
"
target_entities
"
][
e_idx
][
"
pdb_chain_id
"
].
append
(
chn
.
id
)
continue
cif_ent
[
"
pdb_sequence
"
]
=
seq
cif_ent
[
"
pdb_chain_id
"
]
=
[
chn
.
id
]
cif_ent
[
"
description
"
]
=
sequences
[
seq_md5
]
cif_json
[
"
target_entities
"
].
append
(
cif_ent
)
already_seen
.
append
(
sequences
[
seq_md5
])
return
structure
def
alphapulldown_model_to_modelcif
(
cmplx_name
:
str
,
mdl_file
:
str
,
...
...
@@ -136,7 +275,10 @@ def alphapulldown_model_to_modelcif(
_get_data_block_id_and_struct_and_entry_categories
(
modelcif_json
,
cmplx_name
)
_store_as_modelcif
(
modelcif_json
,
mdl_file
,
out_dir
)
# gather target entities (sequences that have been modeled) info
structure
=
_get_entities
(
modelcif_json
,
mdl_file
,
cmplx_name
,
prj_dir
)
_store_as_modelcif
(
modelcif_json
,
structure
,
mdl_file
,
out_dir
)
# ToDo: ENABLE logging.info(f"... done with '{mdl_file}'")
...
...
@@ -219,4 +361,5 @@ if __name__ == "__main__":
# like that, including author names?
# ToDo: where to store which model was chosen? Should be in Tara's models.
# LocalWords: ToDo AlphaPulldown PAEs dir struct
# LocalWords: ToDo AlphaPulldown PAEs dir struct coevolution MSA py modeling
# LocalWords: multimer sif Jupyter
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment