Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
modelcif-converters
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
schwede
modelcif-converters
Commits
842bfa30
Commit
842bfa30
authored
1 year ago
by
Bienchen
Browse files
Options
Downloads
Patches
Plain Diff
SCHWED6036: Deal with X/ UNK residues
parent
afaf4b2b
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
projects/dark-matter-metagenomics/translate2modelcif.py
+26
-23
26 additions, 23 deletions
projects/dark-matter-metagenomics/translate2modelcif.py
pyproject.toml
+1
-1
1 addition, 1 deletion
pyproject.toml
with
27 additions
and
24 deletions
projects/dark-matter-metagenomics/translate2modelcif.py
+
26
−
23
View file @
842bfa30
...
...
@@ -26,12 +26,10 @@ from ost import io
# EXAMPLES for running:
"""
ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt
\
./web_dloads/pivot ./modelcif --prefix=F000347
\
--pdb-web-path=./web_dloads/pdb
\
--refseq-path=./web_dloads/consensus_all.fasta
"""
# ost translate2modelcif.py ./raw_data ./raw_data/all_ptm_plddt.txt \
# ./web_dloads/pivot ./modelcif --prefix=F000347 \
# --pdb-web-path=./web_dloads/pdb \
# --refseq-path=./web_dloads/consensus_all.fasta
# NOTE: add "--compress" for final runs
...
...
@@ -182,6 +180,15 @@ class _NmpfamsdbTrgRef(modelcif.reference.TargetReference):
other_details
=
"
NMPFamsDB
"
class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet):
    """L-peptide alphabet that additionally accepts the one-letter code 'X'.

    'X' is registered as an alias for the component already stored under
    "UNK", so sequences containing unknown residues can be looked up.
    """

    def __init__(self):
        """Set up the base alphabet, then register 'X'."""
        super().__init__()
        # NOTE(review): if `_comps` is a class-level dict on the base class,
        # this assignment is visible to every alphabet instance -- confirm.
        unknown_comp = self._comps["UNK"]
        self._comps["X"] = unknown_comp
# pylint: enable=too-few-public-methods
...
...
@@ -189,16 +196,14 @@ def _get_res_num(r, use_auth=False):
"""
Get res. num. from auth. IDs if reading from mmCIF files.
"""
if
use_auth
:
return
int
(
r
.
GetStringProp
(
"
pdb_auth_resnum
"
))
else
:
return
r
.
number
.
num
return
r
.
number
.
num
def _get_ch_name(ch, use_auth=False):
    """Get chain name from auth. IDs if reading from mmCIF files.

    :param ch: Chain object providing a ``name`` attribute and a
               ``GetStringProp()`` method.
    :param use_auth: If True, return the value of the chain's
                     "pdb_auth_chain_name" string property instead of
                     ``ch.name``.
    :return: The chain name.
    """
    if use_auth:
        return ch.GetStringProp("pdb_auth_chain_name")
    # Single fall-through return: no `else` after `return`, and no duplicated
    # (unreachable) second `return ch.name`.
    return ch.name
class
_OST2ModelCIF
(
modelcif
.
model
.
AbInitioModel
):
...
...
@@ -307,14 +312,17 @@ def _get_metadata(metadata_file):
return
metadata
def
_get_pdb_files
(
model_base_dir
):
def
_get_pdb_files
(
model_base_dir
,
model_dir_prfx
=
"
all_pdb
"
):
"""
Collect PDB files from pub_data_* folders.
model_dir_prfx was
"
pub_data
"
for Sergey
'
s old data.
Returns dict with key = family name and value = list of paths to PDB files.
"""
pdb_files_split
=
dict
()
# to return
pdb_files_split
=
{}
# to return
pdb_files_raw
=
set
()
# to check for duplicates
pub_paths
=
[
f
for
f
in
os
.
listdir
(
model_base_dir
)
if
f
.
startswith
(
"
pub_data_
"
)
f
for
f
in
os
.
listdir
(
model_base_dir
)
if
f
.
startswith
(
model_dir_prfx
)
]
# NOTE: we sort pub_paths to ensure that pub_data_02 is before _03
for
pub_path
in
sorted
(
pub_paths
):
...
...
@@ -522,7 +530,8 @@ def _get_entities(pdb_file, ref_seq, fam_name):
"
pdb_sequence
"
:
sqe_gaps
,
"
pdb_chain_id
"
:
[
_get_ch_name
(
chn
,
False
)],
"
fam_name
"
:
fam_name
,
"
description
"
:
f
"
Representative Sequence of NMPFamsDB Family
{
fam_name
}
"
,
"
description
"
:
"
Representative Sequence of NMPFamsDB Family
"
+
f
"
{
fam_name
}
"
,
}
return
[
cif_ent
],
ost_ent
...
...
@@ -530,10 +539,12 @@ def _get_entities(pdb_file, ref_seq, fam_name):
def
_get_modelcif_entities
(
target_ents
,
asym_units
,
system
):
"""
Create ModelCIF entities and asymmetric units.
"""
alphabet
=
_LPeptideAlphabetWithX
()
for
cif_ent
in
target_ents
:
mdlcif_ent
=
modelcif
.
Entity
(
# NOTE: sequence here defines residues in model!
cif_ent
[
"
seqres
"
],
alphabet
=
alphabet
,
description
=
cif_ent
[
"
description
"
],
source
=
None
,
references
=
[
...
...
@@ -785,21 +796,13 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
f
"
Cannot deal with missing MSA for
{
f_name
}
(yet).
"
f
"
Skipping...
"
)
return
#
aln
=
io
.
LoadAlignment
(
aln_path
)
# note: this checks that it's an actual MSA
ref_seq
=
aln
.
sequences
[
0
]
if
ref_seq_check
is
not
None
and
ref_seq_check
.
string
!=
ref_seq
.
string
:
raise
RuntimeError
(
f
"
Sequence mismatch for
{
f_name
}
"
)
# TODO: allow "X" (or whatever can be used to label unknown AA) if needed
if
"
X
"
in
ref_seq
.
string
:
_warn_msg
(
f
"
Cannot deal with
'
X
'
in ref_seq for
{
f_name
}
(yet).
"
f
"
Skipping...
"
)
return
#
# gather data into JSON-like structure
print
(
"
preparing data...
"
,
end
=
""
)
...
...
This diff is collapsed.
Click to expand it.
pyproject.toml
+
1
−
1
View file @
842bfa30
...
...
@@ -11,7 +11,7 @@ reports='no'
extension-pkg-allow-list
=[
"rapidjson"
,
"ost"
]
# [tool.pylint.typecheck]
#
generated-members = ["
Loa
dSequence
List
"]
generated-members
=
[
"
Fin
dSequence"
]
[tool.pylint.FORMAT]
max-line-length
=
80
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment