diff --git a/projects/dark-matter-metagenomics/translate2modelcif.py b/projects/dark-matter-metagenomics/translate2modelcif.py index fd44a6d510e4efff14883ab866c56c131a1169c5..68455db196d7526debfe46e63486bbf471266d52 100644 --- a/projects/dark-matter-metagenomics/translate2modelcif.py +++ b/projects/dark-matter-metagenomics/translate2modelcif.py @@ -26,12 +26,10 @@ from ost import io # EXAMPLES for running: -""" -ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \ - ./web_dloads/pivot ./modelcif --prefix=F000347 \ - --pdb-web-path=./web_dloads/pdb \ - --refseq-path=./web_dloads/consensus_all.fasta -""" +# ost translate2modelcif.py ./raw_data ./raw_data/all_ptm_plddt.txt \ +# ./web_dloads/pivot ./modelcif --prefix=F000347 \ +# --pdb-web-path=./web_dloads/pdb \ +# --refseq-path=./web_dloads/consensus_all.fasta # NOTE: add "--compress" for final runs @@ -182,6 +180,15 @@ class _NmpfamsdbTrgRef(modelcif.reference.TargetReference): other_details = "NMPFamsDB" +class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet): + """Have the default amino acid alphabet plus 'X' for unknown residues.""" + + def __init__(self): + """Create the alphabet.""" + super().__init__() + self._comps["X"] = self._comps["UNK"] + + # pylint: enable=too-few-public-methods @@ -189,16 +196,14 @@ def _get_res_num(r, use_auth=False): """Get res. num. from auth. IDs if reading from mmCIF files.""" if use_auth: return int(r.GetStringProp("pdb_auth_resnum")) - else: - return r.number.num + return r.number.num def _get_ch_name(ch, use_auth=False): """Get chain name from auth. IDs if reading from mmCIF files.""" if use_auth: return ch.GetStringProp("pdb_auth_chain_name") - else: - return ch.name + return ch.name class _OST2ModelCIF(modelcif.model.AbInitioModel): @@ -307,14 +312,17 @@ def _get_metadata(metadata_file): return metadata -def _get_pdb_files(model_base_dir): +def _get_pdb_files(model_base_dir, model_dir_prfx="all_pdb"): """Collect PDB files from pub_data_* folders. + + model_dir_prfx was "pub_data" for Sergey's old data. + Returns dict with key = family name and value = list of paths to PDB files. """ - pdb_files_split = dict() # to return + pdb_files_split = {} # to return pdb_files_raw = set() # to check for duplicates pub_paths = [ - f for f in os.listdir(model_base_dir) if f.startswith("pub_data_") + f for f in os.listdir(model_base_dir) if f.startswith(model_dir_prfx) ] # NOTE: we sort pub_paths to ensure that pub_data_02 is before _03 for pub_path in sorted(pub_paths): @@ -522,7 +530,8 @@ def _get_entities(pdb_file, ref_seq, fam_name): "pdb_sequence": sqe_gaps, "pdb_chain_id": [_get_ch_name(chn, False)], "fam_name": fam_name, - "description": f"Representative Sequence of NMPFamsDB Family {fam_name}", + "description": "Representative Sequence of NMPFamsDB Family " + + f"{fam_name}", } return [cif_ent], ost_ent @@ -530,10 +539,12 @@ def _get_entities(pdb_file, ref_seq, fam_name): def _get_modelcif_entities(target_ents, asym_units, system): """Create ModelCIF entities and asymmetric units.""" + alphabet = _LPeptideAlphabetWithX() for cif_ent in target_ents: mdlcif_ent = modelcif.Entity( # NOTE: sequence here defines residues in model! cif_ent["seqres"], + alphabet=alphabet, description=cif_ent["description"], source=None, references=[ @@ -785,21 +796,13 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check): f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..." ) return - # + aln = io.LoadAlignment( aln_path ) # note: this checks that it's an actual MSA ref_seq = aln.sequences[0] if ref_seq_check is not None and ref_seq_check.string != ref_seq.string: raise RuntimeError(f"Sequence mismatch for {f_name}") - # TODO: allow "X" (or whatever can be used to label unknown AA) if needed - if "X" in ref_seq.string: - _warn_msg( - f"Cannot deal with 'X' in ref_seq for {f_name} (yet). " - f"Skipping..." - ) - return - # # gather data into JSON-like structure print(" preparing data...", end="") diff --git a/pyproject.toml b/pyproject.toml index 39c78b39be79fba7adb7152a43851a25fb3ac035..6c0fe14fe0b1e29cfd2e6d6bc8e5a3c341838202 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ reports='no' extension-pkg-allow-list=["rapidjson", "ost"] # [tool.pylint.typecheck] -# generated-members = ["LoadSequenceList"] +generated-members = ["FindSequence"] [tool.pylint.FORMAT] max-line-length=80