From 842bfa30ab7dc0ea9582708f130a6fab39c5f05b Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Wed, 18 Oct 2023 15:52:21 +0200
Subject: [PATCH] SCHWED6036: Deal with X/ UNK residues

---
 .../translate2modelcif.py                     | 49 ++++++++++---------
 pyproject.toml                                |  2 +-
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/projects/dark-matter-metagenomics/translate2modelcif.py b/projects/dark-matter-metagenomics/translate2modelcif.py
index fd44a6d..68455db 100644
--- a/projects/dark-matter-metagenomics/translate2modelcif.py
+++ b/projects/dark-matter-metagenomics/translate2modelcif.py
@@ -26,12 +26,10 @@ from ost import io
 
 
 # EXAMPLES for running:
-"""
-ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \
-    ./web_dloads/pivot ./modelcif --prefix=F000347 \
-    --pdb-web-path=./web_dloads/pdb \
-    --refseq-path=./web_dloads/consensus_all.fasta
-"""
+# ost translate2modelcif.py ./raw_data ./raw_data/all_ptm_plddt.txt  \
+#     ./web_dloads/pivot ./modelcif --prefix=F000347 \
+#     --pdb-web-path=./web_dloads/pdb \
+#     --refseq-path=./web_dloads/consensus_all.fasta
 # NOTE: add "--compress" for final runs
 
 
@@ -182,6 +180,15 @@ class _NmpfamsdbTrgRef(modelcif.reference.TargetReference):
     other_details = "NMPFamsDB"
 
 
+class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet):
+    """Have the default amino acid alphabet plus 'X' for unknown residues."""
+
+    def __init__(self):
+        """Create the alphabet."""
+        super().__init__()
+        self._comps["X"] = self._comps["UNK"]
+
+
 # pylint: enable=too-few-public-methods
 
 
@@ -189,16 +196,14 @@ def _get_res_num(r, use_auth=False):
     """Get res. num. from auth. IDs if reading from mmCIF files."""
     if use_auth:
         return int(r.GetStringProp("pdb_auth_resnum"))
-    else:
-        return r.number.num
+    return r.number.num
 
 
 def _get_ch_name(ch, use_auth=False):
     """Get chain name from auth. IDs if reading from mmCIF files."""
     if use_auth:
         return ch.GetStringProp("pdb_auth_chain_name")
-    else:
-        return ch.name
+    return ch.name
 
 
 class _OST2ModelCIF(modelcif.model.AbInitioModel):
@@ -307,14 +312,17 @@ def _get_metadata(metadata_file):
     return metadata
 
 
-def _get_pdb_files(model_base_dir):
+def _get_pdb_files(model_base_dir, model_dir_prfx="all_pdb"):
     """Collect PDB files from pub_data_* folders.
+
+    model_dir_prfx was "pub_data" for Sergey's old data.
+
     Returns dict with key = family name and value = list of paths to PDB files.
     """
-    pdb_files_split = dict()  # to return
+    pdb_files_split = {}  # to return
     pdb_files_raw = set()  # to check for duplicates
     pub_paths = [
-        f for f in os.listdir(model_base_dir) if f.startswith("pub_data_")
+        f for f in os.listdir(model_base_dir) if f.startswith(model_dir_prfx)
     ]
     # NOTE: we sort pub_paths to ensure that pub_data_02 is before _03
     for pub_path in sorted(pub_paths):
@@ -522,7 +530,8 @@ def _get_entities(pdb_file, ref_seq, fam_name):
         "pdb_sequence": sqe_gaps,
         "pdb_chain_id": [_get_ch_name(chn, False)],
         "fam_name": fam_name,
-        "description": f"Representative Sequence of NMPFamsDB Family {fam_name}",
+        "description": "Representative Sequence of NMPFamsDB Family "
+        + f"{fam_name}",
     }
 
     return [cif_ent], ost_ent
@@ -530,10 +539,12 @@ def _get_entities(pdb_file, ref_seq, fam_name):
 
 def _get_modelcif_entities(target_ents, asym_units, system):
     """Create ModelCIF entities and asymmetric units."""
+    alphabet = _LPeptideAlphabetWithX()
     for cif_ent in target_ents:
         mdlcif_ent = modelcif.Entity(
             # NOTE: sequence here defines residues in model!
             cif_ent["seqres"],
+            alphabet=alphabet,
             description=cif_ent["description"],
             source=None,
             references=[
@@ -785,21 +796,13 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
             f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..."
         )
         return
-    #
+
     aln = io.LoadAlignment(
         aln_path
     )  # note: this checks that it's an actual MSA
     ref_seq = aln.sequences[0]
     if ref_seq_check is not None and ref_seq_check.string != ref_seq.string:
         raise RuntimeError(f"Sequence mismatch for {f_name}")
-    # TODO: allow "X" (or whatever can be used to label unknown AA) if needed
-    if "X" in ref_seq.string:
-        _warn_msg(
-            f"Cannot deal with 'X' in ref_seq for {f_name} (yet). "
-            f"Skipping..."
-        )
-        return
-    #
 
     # gather data into JSON-like structure
     print("    preparing data...", end="")
diff --git a/pyproject.toml b/pyproject.toml
index 39c78b3..6c0fe14 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ reports='no'
 extension-pkg-allow-list=["rapidjson", "ost"]
 
 # [tool.pylint.typecheck]
-# generated-members = ["LoadSequenceList"]
+generated-members = ["FindSequence"]
 
 [tool.pylint.FORMAT]
 max-line-length=80
-- 
GitLab