Skip to content
Snippets Groups Projects
Commit 842bfa30 authored by Bienchen's avatar Bienchen
Browse files

SCHWED6036: Deal with X/ UNK residues

parent afaf4b2b
No related branches found
No related tags found
No related merge requests found
...@@ -26,12 +26,10 @@ from ost import io ...@@ -26,12 +26,10 @@ from ost import io
# EXAMPLES for running: # EXAMPLES for running:
""" # ost translate2modelcif.py ./raw_data ./raw_data/all_ptm_plddt.txt \
ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \ # ./web_dloads/pivot ./modelcif --prefix=F000347 \
./web_dloads/pivot ./modelcif --prefix=F000347 \ # --pdb-web-path=./web_dloads/pdb \
--pdb-web-path=./web_dloads/pdb \ # --refseq-path=./web_dloads/consensus_all.fasta
--refseq-path=./web_dloads/consensus_all.fasta
"""
# NOTE: add "--compress" for final runs # NOTE: add "--compress" for final runs
...@@ -182,6 +180,15 @@ class _NmpfamsdbTrgRef(modelcif.reference.TargetReference): ...@@ -182,6 +180,15 @@ class _NmpfamsdbTrgRef(modelcif.reference.TargetReference):
other_details = "NMPFamsDB" other_details = "NMPFamsDB"
class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet):
"""Have the default amino acid alphabet plus 'X' for unknown residues."""
def __init__(self):
"""Create the alphabet."""
super().__init__()
self._comps["X"] = self._comps["UNK"]
# pylint: enable=too-few-public-methods # pylint: enable=too-few-public-methods
...@@ -189,16 +196,14 @@ def _get_res_num(r, use_auth=False): ...@@ -189,16 +196,14 @@ def _get_res_num(r, use_auth=False):
"""Get res. num. from auth. IDs if reading from mmCIF files.""" """Get res. num. from auth. IDs if reading from mmCIF files."""
if use_auth: if use_auth:
return int(r.GetStringProp("pdb_auth_resnum")) return int(r.GetStringProp("pdb_auth_resnum"))
else: return r.number.num
return r.number.num
def _get_ch_name(ch, use_auth=False): def _get_ch_name(ch, use_auth=False):
"""Get chain name from auth. IDs if reading from mmCIF files.""" """Get chain name from auth. IDs if reading from mmCIF files."""
if use_auth: if use_auth:
return ch.GetStringProp("pdb_auth_chain_name") return ch.GetStringProp("pdb_auth_chain_name")
else: return ch.name
return ch.name
class _OST2ModelCIF(modelcif.model.AbInitioModel): class _OST2ModelCIF(modelcif.model.AbInitioModel):
...@@ -307,14 +312,17 @@ def _get_metadata(metadata_file): ...@@ -307,14 +312,17 @@ def _get_metadata(metadata_file):
return metadata return metadata
def _get_pdb_files(model_base_dir): def _get_pdb_files(model_base_dir, model_dir_prfx="all_pdb"):
"""Collect PDB files from pub_data_* folders. """Collect PDB files from pub_data_* folders.
model_dir_prfx was "pub_data" for Sergey's old data.
Returns dict with key = family name and value = list of paths to PDB files. Returns dict with key = family name and value = list of paths to PDB files.
""" """
pdb_files_split = dict() # to return pdb_files_split = {} # to return
pdb_files_raw = set() # to check for duplicates pdb_files_raw = set() # to check for duplicates
pub_paths = [ pub_paths = [
f for f in os.listdir(model_base_dir) if f.startswith("pub_data_") f for f in os.listdir(model_base_dir) if f.startswith(model_dir_prfx)
] ]
# NOTE: we sort pub_paths to ensure that pub_data_02 is before _03 # NOTE: we sort pub_paths to ensure that pub_data_02 is before _03
for pub_path in sorted(pub_paths): for pub_path in sorted(pub_paths):
...@@ -522,7 +530,8 @@ def _get_entities(pdb_file, ref_seq, fam_name): ...@@ -522,7 +530,8 @@ def _get_entities(pdb_file, ref_seq, fam_name):
"pdb_sequence": sqe_gaps, "pdb_sequence": sqe_gaps,
"pdb_chain_id": [_get_ch_name(chn, False)], "pdb_chain_id": [_get_ch_name(chn, False)],
"fam_name": fam_name, "fam_name": fam_name,
"description": f"Representative Sequence of NMPFamsDB Family {fam_name}", "description": "Representative Sequence of NMPFamsDB Family "
+ f"{fam_name}",
} }
return [cif_ent], ost_ent return [cif_ent], ost_ent
...@@ -530,10 +539,12 @@ def _get_entities(pdb_file, ref_seq, fam_name): ...@@ -530,10 +539,12 @@ def _get_entities(pdb_file, ref_seq, fam_name):
def _get_modelcif_entities(target_ents, asym_units, system): def _get_modelcif_entities(target_ents, asym_units, system):
"""Create ModelCIF entities and asymmetric units.""" """Create ModelCIF entities and asymmetric units."""
alphabet = _LPeptideAlphabetWithX()
for cif_ent in target_ents: for cif_ent in target_ents:
mdlcif_ent = modelcif.Entity( mdlcif_ent = modelcif.Entity(
# NOTE: sequence here defines residues in model! # NOTE: sequence here defines residues in model!
cif_ent["seqres"], cif_ent["seqres"],
alphabet=alphabet,
description=cif_ent["description"], description=cif_ent["description"],
source=None, source=None,
references=[ references=[
...@@ -785,21 +796,13 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check): ...@@ -785,21 +796,13 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..." f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..."
) )
return return
#
aln = io.LoadAlignment( aln = io.LoadAlignment(
aln_path aln_path
) # note: this checks that it's an actual MSA ) # note: this checks that it's an actual MSA
ref_seq = aln.sequences[0] ref_seq = aln.sequences[0]
if ref_seq_check is not None and ref_seq_check.string != ref_seq.string: if ref_seq_check is not None and ref_seq_check.string != ref_seq.string:
raise RuntimeError(f"Sequence mismatch for {f_name}") raise RuntimeError(f"Sequence mismatch for {f_name}")
# TODO: allow "X" (or whatever can be used to label unknown AA) if needed
if "X" in ref_seq.string:
_warn_msg(
f"Cannot deal with 'X' in ref_seq for {f_name} (yet). "
f"Skipping..."
)
return
#
# gather data into JSON-like structure # gather data into JSON-like structure
print(" preparing data...", end="") print(" preparing data...", end="")
......
...@@ -11,7 +11,7 @@ reports='no' ...@@ -11,7 +11,7 @@ reports='no'
extension-pkg-allow-list=["rapidjson", "ost"] extension-pkg-allow-list=["rapidjson", "ost"]
# [tool.pylint.typecheck] # [tool.pylint.typecheck]
# generated-members = ["LoadSequenceList"] generated-members = ["FindSequence"]
[tool.pylint.FORMAT] [tool.pylint.FORMAT]
max-line-length=80 max-line-length=80
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment