Skip to content
Snippets Groups Projects
Commit 842bfa30 authored by Bienchen's avatar Bienchen
Browse files

SCHWED6036: Deal with X/ UNK residues

parent afaf4b2b
No related branches found
No related tags found
No related merge requests found
......@@ -26,12 +26,10 @@ from ost import io
# EXAMPLES for running:
"""
ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \
./web_dloads/pivot ./modelcif --prefix=F000347 \
--pdb-web-path=./web_dloads/pdb \
--refseq-path=./web_dloads/consensus_all.fasta
"""
# ost translate2modelcif.py ./raw_data ./raw_data/all_ptm_plddt.txt \
# ./web_dloads/pivot ./modelcif --prefix=F000347 \
# --pdb-web-path=./web_dloads/pdb \
# --refseq-path=./web_dloads/consensus_all.fasta
# NOTE: add "--compress" for final runs
......@@ -182,6 +180,15 @@ class _NmpfamsdbTrgRef(modelcif.reference.TargetReference):
other_details = "NMPFamsDB"
class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet):
"""Have the default amino acid alphabet plus 'X' for unknown residues."""
def __init__(self):
"""Create the alphabet."""
super().__init__()
self._comps["X"] = self._comps["UNK"]
# pylint: enable=too-few-public-methods
......@@ -189,16 +196,14 @@ def _get_res_num(r, use_auth=False):
"""Get res. num. from auth. IDs if reading from mmCIF files."""
if use_auth:
return int(r.GetStringProp("pdb_auth_resnum"))
else:
return r.number.num
return r.number.num
def _get_ch_name(ch, use_auth=False):
"""Get chain name from auth. IDs if reading from mmCIF files."""
if use_auth:
return ch.GetStringProp("pdb_auth_chain_name")
else:
return ch.name
return ch.name
class _OST2ModelCIF(modelcif.model.AbInitioModel):
......@@ -307,14 +312,17 @@ def _get_metadata(metadata_file):
return metadata
def _get_pdb_files(model_base_dir):
def _get_pdb_files(model_base_dir, model_dir_prfx="all_pdb"):
"""Collect PDB files from pub_data_* folders.
model_dir_prfx was "pub_data" for Sergey's old data.
Returns dict with key = family name and value = list of paths to PDB files.
"""
pdb_files_split = dict() # to return
pdb_files_split = {} # to return
pdb_files_raw = set() # to check for duplicates
pub_paths = [
f for f in os.listdir(model_base_dir) if f.startswith("pub_data_")
f for f in os.listdir(model_base_dir) if f.startswith(model_dir_prfx)
]
# NOTE: we sort pub_paths to ensure that pub_data_02 is before _03
for pub_path in sorted(pub_paths):
......@@ -522,7 +530,8 @@ def _get_entities(pdb_file, ref_seq, fam_name):
"pdb_sequence": sqe_gaps,
"pdb_chain_id": [_get_ch_name(chn, False)],
"fam_name": fam_name,
"description": f"Representative Sequence of NMPFamsDB Family {fam_name}",
"description": "Representative Sequence of NMPFamsDB Family "
+ f"{fam_name}",
}
return [cif_ent], ost_ent
......@@ -530,10 +539,12 @@ def _get_entities(pdb_file, ref_seq, fam_name):
def _get_modelcif_entities(target_ents, asym_units, system):
"""Create ModelCIF entities and asymmetric units."""
alphabet = _LPeptideAlphabetWithX()
for cif_ent in target_ents:
mdlcif_ent = modelcif.Entity(
# NOTE: sequence here defines residues in model!
cif_ent["seqres"],
alphabet=alphabet,
description=cif_ent["description"],
source=None,
references=[
......@@ -785,21 +796,13 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..."
)
return
#
aln = io.LoadAlignment(
aln_path
) # note: this checks that it's an actual MSA
ref_seq = aln.sequences[0]
if ref_seq_check is not None and ref_seq_check.string != ref_seq.string:
raise RuntimeError(f"Sequence mismatch for {f_name}")
# TODO: allow "X" (or whatever can be used to label unknown AA) if needed
if "X" in ref_seq.string:
_warn_msg(
f"Cannot deal with 'X' in ref_seq for {f_name} (yet). "
f"Skipping..."
)
return
#
# gather data into JSON-like structure
print(" preparing data...", end="")
......
......@@ -11,7 +11,7 @@ reports='no'
extension-pkg-allow-list=["rapidjson", "ost"]
# [tool.pylint.typecheck]
# generated-members = ["LoadSequenceList"]
generated-members = ["FindSequence"]
[tool.pylint.FORMAT]
max-line-length=80
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment