diff --git a/projects/novelfams/translate2modelcif.py b/projects/novelfams/translate2modelcif.py index 9188daca8267d9b96ce9eea92edcef58506bd919..10b61a0b50c0d34b528dbafda9d78e3e6ae44d57 100644 --- a/projects/novelfams/translate2modelcif.py +++ b/projects/novelfams/translate2modelcif.py @@ -12,7 +12,6 @@ import shutil import sys import zipfile -import numpy as np import pandas as pd import ihm @@ -25,7 +24,7 @@ import modelcif.protocol import modelcif.reference # pylint: disable=unused-import,wrong-import-order -from ost import io, geom, mol +from ost import io, geom, mol, seq # pylint: enable=unused-import,wrong-import-order @@ -68,6 +67,13 @@ def _parse_args(): description=__doc__, ) + parser.add_argument( + "trgt_sequences", + type=str, + metavar="<MULTI FASTA FILE>", + help="Multi-FastA file with sequences for targets, each sequence " + + "needs the target ID as name.", + ) parser.add_argument( "model_dir", type=str, @@ -148,6 +154,13 @@ class _LocalPairwisePAE( software = None +class _FesnovcatalogTrgRef(modelcif.reference.TargetReference): + """FESNov catalog as target reference.""" + + name = "Other" + other_details = "FESNov catalog" + + class _LPeptideAlphabetWithX(ihm.LPeptideAlphabet): """Have the default amino acid alphabet plus 'X' for unknown residues.""" @@ -287,6 +300,9 @@ def _get_sequence(chn, use_auth=False): sqe += "-" lst_rn += 1 sqe += res.one_letter_code + + if "-" in sqe: + print("GAP") return sqe @@ -300,7 +316,7 @@ class _NoEntitiesError(RuntimeError): such PDB files.""" -def _get_entities(pdb_file, fam_name): +def _get_entities(pdb_file, fam_name, trg_seq): """Gather data for the mmCIF (target) entities.""" try: ost_ent = io.LoadPDB(pdb_file) @@ -327,13 +343,20 @@ def _get_entities(pdb_file, fam_name): # NOTE: can have gaps to accommodate "X" in ref_seq exp_seq = sqe_gaps.replace("-", "X") + len_diff = len(trg_seq.string) - len(exp_seq) + if len_diff > 0: + exp_seq += "X" * len_diff + if exp_seq != trg_seq.string: + print(f"Sequence in {pdb_file} does not match target.") + # ToDo: re-enable check + # raise RuntimeError(f"Sequence in {pdb_file} does not match target.") cif_ent = { - "seqres": exp_seq, + "seqres": trg_seq.string, "pdb_sequence": sqe_gaps, "pdb_chain_id": [_get_ch_name(chn, False)], "fam_name": fam_name, - "description": f"Sequence of family {fam_name}", + "description": f"Representative sequence of FESNov family {fam_name}", } return [cif_ent], ost_ent @@ -349,7 +372,14 @@ def _get_modelcif_entities(target_ents, asym_units, system): alphabet=alphabet, description=cif_ent["description"], source=None, - references=[], + references=[ + _FesnovcatalogTrgRef( + cif_ent["fam_name"], + cif_ent["fam_name"], + align_begin=1, + align_end=len(cif_ent["seqres"]), + ) + ], ) # NOTE: this assigns (potentially new) alphabetic chain names for pdb_chain_id in cif_ent["pdb_chain_id"]: @@ -693,6 +723,7 @@ def _translate2modelcif_single( mdl_details, af2_lst, global_plddts, + trg_seq, ): """Convert a single model with its accompanying data to ModelCIF.""" # ToDo: re-enable Pylint @@ -709,7 +740,7 @@ def _translate2modelcif_single( mdlcf_json["global_plddts"] = global_plddts # process coordinates - target_entities, ost_ent = _get_entities(f_name, fam_name) + target_entities, ost_ent = _get_entities(f_name, fam_name, trg_seq) mdlcf_json["target_entities"] = target_entities # fill annotations @@ -726,7 +757,7 @@ def _translate2modelcif_single( ) -def _translate2modelcif(f_name, af2_lst, global_plddts, opts): +def _translate2modelcif(f_name, af2_lst, global_plddts, trg_seq, opts): """Convert a family of models with their accompanying data to ModelCIF.""" # ToDo: re-enable Pylint # pylint: disable=too-many-locals @@ -754,6 +785,7 @@ def _translate2modelcif(f_name, af2_lst, global_plddts, opts): mdl_details, af2_lst, global_plddts, + trg_seq, ) @@ -777,10 +809,10 @@ def _read_af2_model_list(path): def _read_global_plddt_list(path): """Read global pLDDT values from file.""" - plddt = {} if path is None: - return plddt + return None + plddt = {} with open(path, encoding="ascii") as pfh: for line in pfh: line = line.strip() @@ -790,11 +822,34 @@ def _read_global_plddt_list(path): return plddt +def _read_sequences(path): + """The sequence list provided by depositor contains "*" as stop codon in + some sequences, those need to be removed.""" + sqnz_lst = seq.CreateSequenceList() + with open(path, encoding="ascii") as ffh: + sqnz = "" + for line in ffh: + line = line.strip() + if line.startswith(">"): + if len(sqnz) > 0: + sqnz = sqnz.rstrip("*") + sqnz_lst.AddSequence(seq.CreateSequence(name, sqnz)) + sqnz = "" + name = line.split(">")[1].strip() + continue + sqnz += line + + return sqnz_lst + + def _main(): """Run as script.""" s_tmstmp = timer() opts = _parse_args() + # read target sequences + trg_seqs = _read_sequences(opts.trgt_sequences) + # get a list of PDB files with the path to load them. pdb_files = _get_pdb_files(opts.model_dir) n_mdls = len(pdb_files) @@ -815,6 +870,9 @@ def _main(): f_name, af2_mdls, global_plddts, + trg_seqs.FindSequence( + os.path.splitext(os.path.basename(f_name))[0] + ), opts, ) except (_InvalidCoordinateError, _NoEntitiesError):