SCHWED6036: Deal with X/ UNK residues

842bfa30 · Bienchen · afaf4b2b · 842bfa30 · 842bfa30
Commit 842bfa30 authored 1 year ago by Bienchen
--- a/projects/dark-matter-metagenomics/translate2modelcif.py
+++ b/projects/dark-matter-metagenomics/translate2modelcif.py
@@ -26,12 +26,10 @@ from ost import io


 # EXAMPLES for running:
-"""
-ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \
-    ./web_dloads/pivot ./modelcif --prefix=F000347 \
-    --pdb-web-path=./web_dloads/pdb \
-    --refseq-path=./web_dloads/consensus_all.fasta
-"""
+# ost translate2modelcif.py ./raw_data ./raw_data/all_ptm_plddt.txt  \
+#     ./web_dloads/pivot ./modelcif --prefix=F000347 \
+#     --pdb-web-path=./web_dloads/pdb \
+#     --refseq-path=./web_dloads/consensus_all.fasta
 # NOTE: add "--compress" for final runs


@@ -182,6 +180,15 @@ class _NmpfamsdbTrgRef(modelcif.reference.TargetReference):
    other_details = "NMPFamsDB"


+class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet):
+    """Have the default amino acid alphabet plus 'X' for unknown residues."""
+
+    def __init__(self):
+        """Create the alphabet."""
+        super().__init__()
+        self._comps["X"] = self._comps["UNK"]
+
+
 # pylint: enable=too-few-public-methods


@@ -189,16 +196,14 @@ def _get_res_num(r, use_auth=False):
    """Get res. num. from auth. IDs if reading from mmCIF files."""
    if use_auth:
        return int(r.GetStringProp("pdb_auth_resnum"))
-    else:
-        return r.number.num
+    return r.number.num


 def _get_ch_name(ch, use_auth=False):
    """Get chain name from auth. IDs if reading from mmCIF files."""
    if use_auth:
        return ch.GetStringProp("pdb_auth_chain_name")
-    else:
-        return ch.name
+    return ch.name


 class _OST2ModelCIF(modelcif.model.AbInitioModel):
@@ -307,14 +312,17 @@ def _get_metadata(metadata_file):
    return metadata


-def _get_pdb_files(model_base_dir):
+def _get_pdb_files(model_base_dir, model_dir_prfx="all_pdb"):
    """Collect PDB files from pub_data_* folders.
+
+    model_dir_prfx was "pub_data" for Sergey's old data.
+
    Returns dict with key = family name and value = list of paths to PDB files.
    """
-    pdb_files_split = dict()  # to return
+    pdb_files_split = {}  # to return
    pdb_files_raw = set()  # to check for duplicates
    pub_paths = [
-        f for f in os.listdir(model_base_dir) if f.startswith("pub_data_")
+        f for f in os.listdir(model_base_dir) if f.startswith(model_dir_prfx)
    ]
    # NOTE: we sort pub_paths to ensure that pub_data_02 is before _03
    for pub_path in sorted(pub_paths):
@@ -522,7 +530,8 @@ def _get_entities(pdb_file, ref_seq, fam_name):
        "pdb_sequence": sqe_gaps,
        "pdb_chain_id": [_get_ch_name(chn, False)],
        "fam_name": fam_name,
-        "description": f"Representative Sequence of NMPFamsDB Family {fam_name}",
+        "description": "Representative Sequence of NMPFamsDB Family "
+        + f"{fam_name}",
    }

    return [cif_ent], ost_ent
@@ -530,10 +539,12 @@ def _get_entities(pdb_file, ref_seq, fam_name):

 def _get_modelcif_entities(target_ents, asym_units, system):
    """Create ModelCIF entities and asymmetric units."""
+    alphabet = _LPeptideAlphabetWithX()
    for cif_ent in target_ents:
        mdlcif_ent = modelcif.Entity(
            # NOTE: sequence here defines residues in model!
            cif_ent["seqres"],
+            alphabet=alphabet,
            description=cif_ent["description"],
            source=None,
            references=[
@@ -785,21 +796,13 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
            f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..."
        )
        return
-    #
+
    aln = io.LoadAlignment(
        aln_path
    )  # note: this checks that it's an actual MSA
    ref_seq = aln.sequences[0]
    if ref_seq_check is not None and ref_seq_check.string != ref_seq.string:
        raise RuntimeError(f"Sequence mismatch for {f_name}")
-    # TODO: allow "X" (or whatever can be used to label unknown AA) if needed
-    if "X" in ref_seq.string:
-        _warn_msg(
-            f"Cannot deal with 'X' in ref_seq for {f_name} (yet). "
-            f"Skipping..."
-        )
-        return
-    #

    # gather data into JSON-like structure
    print("    preparing data...", end="")

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ reports='no'
 extension-pkg-allow-list=["rapidjson", "ost"]

 # [tool.pylint.typecheck]
-# generated-members = ["LoadSequenceList"]
+generated-members = ["FindSequence"]

 [tool.pylint.FORMAT]
 max-line-length=80