From 3537f0783b373224b201b39a9b5ce64327bde65d Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Tue, 20 Sep 2022 15:08:30 +0200 Subject: [PATCH] chain mapping: make structure processing functionality available --- modules/mol/alg/pymod/chain_mapping.py | 219 ++++++++++++------------- 1 file changed, 108 insertions(+), 111 deletions(-) diff --git a/modules/mol/alg/pymod/chain_mapping.py b/modules/mol/alg/pymod/chain_mapping.py index 145a882c1..234c6f631 100644 --- a/modules/mol/alg/pymod/chain_mapping.py +++ b/modules/mol/alg/pymod/chain_mapping.py @@ -423,7 +423,6 @@ class ReprResult: return bb_pos - class ChainMapper: """ Class to compute chain mappings @@ -521,19 +520,6 @@ class ChainMapper: min_pep_length = 10, min_nuc_length = 4, n_max_naive = 1e8): - # target structure preprocessing - self._target, self._polypep_seqs, self._polynuc_seqs = \ - _ProcessStructure(target, min_pep_length, min_nuc_length) - - # helper class to generate pairwise alignments - self.aligner = _Aligner(resnum_aln = resnum_alignments, - pep_subst_mat = pep_subst_mat, - pep_gap_open = pep_gap_open, - pep_gap_ext = pep_gap_ext, - nuc_subst_mat = nuc_subst_mat, - nuc_gap_open = nuc_gap_open, - nuc_gap_ext = nuc_gap_ext) - # attributes self.pep_seqid_thr = pep_seqid_thr self.pep_gap_thr = pep_gap_thr @@ -549,6 +535,19 @@ class ChainMapper: self._chem_group_ref_seqs = None self._chem_group_types = None + # helper class to generate pairwise alignments + self.aligner = _Aligner(resnum_aln = resnum_alignments, + pep_subst_mat = pep_subst_mat, + pep_gap_open = pep_gap_open, + pep_gap_ext = pep_gap_ext, + nuc_subst_mat = nuc_subst_mat, + nuc_gap_open = nuc_gap_open, + nuc_gap_ext = nuc_gap_ext) + + # target structure preprocessing + self._target, self._polypep_seqs, self._polynuc_seqs = \ + self.ProcessStructure(target) + @property def target(self): """Target structure that only contains peptides/nucleotides @@ -604,7 +603,7 @@ class ChainMapper: """MSA for each group in :attr:`~chem_groups` Sequences in MSAs exhibit same order as in :attr:`~chem_groups` and - have the respective :class:`EntityView` from *target* attached. + have the respective :class:`ost.mol.EntityView` from *target* attached. :getter: Computed on first use (cached) :type: :class:`ost.seq.AlignmentList` @@ -678,10 +677,7 @@ class ChainMapper: polynucleotides whose ATOMSEQ exactly matches the sequence info in the returned alignments. """ - mdl, mdl_pep_seqs, mdl_nuc_seqs = _ProcessStructure(model, - self.min_pep_length, - self.min_nuc_length) - + mdl, mdl_pep_seqs, mdl_nuc_seqs = self.ProcessStructure(model) mapping = [list() for x in self.chem_groups] alns = [seq.AlignmentList() for x in self.chem_groups] @@ -1291,105 +1287,106 @@ class ChainMapper: chem_mapping, chem_group_alns, mdl = self.GetChemMapping(model) return _NMappings(self.chem_groups, chem_mapping) + def ProcessStructure(self, ent): + """ Entity processing for chain mapping + + * Selects view containing peptide and nucleotide residues which have + required backbone atoms present - for peptide residues thats + N, CA, C and CB (no CB for GLY), for nucleotide residues thats + O5', C5', C4', C3' and O3'. + * filters view by chain lengths, see *min_pep_length* and + *min_nuc_length* in constructor + * Extracts atom sequences for each chain in that view + * Attaches corresponding :class:`ost.mol.EntityView` to each sequence + * Ensures strictly increasing residue numbers without insertion codes + in each chain + + + :param ent: Entity to process + :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle` + :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView` + containing peptide and nucleotide residues 2) + :class:`ost.seq.SequenceList` containing ATOMSEQ sequences + for each polypeptide chain in returned view, sequences have + :class:`ost.mol.EntityView` of according chains attached + 3) same for polynucleotide chains + """ + view = ent.CreateEmptyView() + exp_pep_atoms = ["N", "CA", "C", "CB"] + exp_nuc_atoms = ["\"O5'\"", "\"C5'\"", "\"C4'\"", "\"C3'\"", "\"O3'\""] + pep_query = "peptide=true and aname=" + ','.join(exp_pep_atoms) + nuc_query = "nucleotide=true and aname=" + ','.join(exp_nuc_atoms) -# INTERNAL HELPERS -################## - -def _ProcessStructure(ent, min_pep_length, min_nuc_length): - """ Entity processing for chain mapping - - * Selects view containing peptide and nucleotide residues which have - required backbone atoms present - for peptide residues thats - N, CA, C and CB (no CB for GLY), for nucleotide residues thats - O5', C5', C4', C3' and O3'. - * Extracts atom sequences for each chain in that view - * Attaches corresponding :class:`ost.mol.EntityView` to each sequence - * Ensures strictly increasing residue numbers without insertion codes - in each chain - * filters chains by length - - :param ent: Entity to process - :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle` - :returns: Tuple with 3 elements: 1) :class:`EntityView` containing - peptide and nucleotide residues 2) :class:`seq.SequenceList` - containing ATOMSEQ sequences for each polypeptide chain in - returned view, sequences have :class:`ost.mol.EntityView` of - according chains attached 3) same for polynucleotide chains - """ - view = ent.CreateEmptyView() - exp_pep_atoms = ["N", "CA", "C", "CB"] - exp_nuc_atoms = ["\"O5'\"", "\"C5'\"", "\"C4'\"", "\"C3'\"", "\"O3'\""] - pep_query = "peptide=true and aname=" + ','.join(exp_pep_atoms) - nuc_query = "nucleotide=true and aname=" + ','.join(exp_nuc_atoms) - - pep_sel = ent.Select(pep_query) - for r in pep_sel.residues: - if len(r.atoms) == 4: - view.AddResidue(r.handle, mol.INCLUDE_ALL) - elif r.name == "GLY" and len(r.atoms) == 3: - atom_names = [a.GetName() for a in r.atoms] - if sorted(atom_names) == ["C", "CA", "N"]: + pep_sel = ent.Select(pep_query) + for r in pep_sel.residues: + if len(r.atoms) == 4: + view.AddResidue(r.handle, mol.INCLUDE_ALL) + elif r.name == "GLY" and len(r.atoms) == 3: + atom_names = [a.GetName() for a in r.atoms] + if sorted(atom_names) == ["C", "CA", "N"]: + view.AddResidue(r.handle, mol.INCLUDE_ALL) + + nuc_sel = ent.Select(nuc_query) + for r in nuc_sel.residues: + if len(r.atoms) == 5: view.AddResidue(r.handle, mol.INCLUDE_ALL) - nuc_sel = ent.Select(nuc_query) - for r in nuc_sel.residues: - if len(r.atoms) == 5: - view.AddResidue(r.handle, mol.INCLUDE_ALL) + polypep_seqs = seq.CreateSequenceList() + polynuc_seqs = seq.CreateSequenceList() + + for ch in view.chains: + n_res = len(ch.residues) + n_pep = sum([r.IsPeptideLinking() for r in ch.residues]) + n_nuc = sum([r.IsNucleotideLinking() for r in ch.residues]) + + # guarantee that we have either pep or nuc (no mix of the two) + if n_pep > 0 and n_nuc > 0: + raise RuntimeError(f"Must not mix peptide and nucleotide linking " + f"residues in same chain ({ch.GetName()})") + + if (n_pep + n_nuc) != n_res: + raise RuntimeError("All residues must either be peptide_linking " + "or nucleotide_linking") + + # filter out short chains + if n_pep > 0 and n_pep < self.min_pep_length: + continue + + if n_nuc > 0 and n_nuc < self.min_nuc_length: + continue + + # check if no insertion codes are present in residue numbers + ins_codes = [r.GetNumber().GetInsCode() for r in ch.residues] + if len(set(ins_codes)) != 1 or ins_codes[0] != '\0': + raise RuntimeError("Residue numbers in input structures must not " + "contain insertion codes") + + # check if residue numbers are strictly increasing + nums = [r.GetNumber().GetNum() for r in ch.residues] + if not all(i < j for i, j in zip(nums, nums[1:])): + raise RuntimeError("Residue numbers in input structures must be " + "strictly increasing for each chain") + + s = ''.join([r.one_letter_code for r in ch.residues]) + s = seq.CreateSequence(ch.GetName(), s) + s.AttachView(view.Select(f"cname={ch.GetName()}")) + if n_pep == n_res: + polypep_seqs.AddSequence(s) + elif n_nuc == n_res: + polynuc_seqs.AddSequence(s) + else: + raise RuntimeError("This shouldnt happen") - polypep_seqs = seq.CreateSequenceList() - polynuc_seqs = seq.CreateSequenceList() + # select for chains for which we actually extracted the sequence + chain_names = [s.GetAttachedView().chains[0].name for s in polypep_seqs] + chain_names += [s.GetAttachedView().chains[0].name for s in polynuc_seqs] + view = view.Select(f"cname={','.join(chain_names)}") - for ch in view.chains: - n_res = len(ch.residues) - n_pep = sum([r.IsPeptideLinking() for r in ch.residues]) - n_nuc = sum([r.IsNucleotideLinking() for r in ch.residues]) - - # guarantee that we have either pep or nuc (no mix of the two) - if n_pep > 0 and n_nuc > 0: - raise RuntimeError(f"Must not mix peptide and nucleotide linking " - f"residues in same chain ({ch.GetName()})") - - if (n_pep + n_nuc) != n_res: - raise RuntimeError("All residues must either be peptide_linking " - "or nucleotide_linking") - - # filter out short chains - if n_pep > 0 and n_pep < min_pep_length: - continue - - if n_nuc > 0 and n_nuc < min_nuc_length: - continue - - # check if no insertion codes are present in residue numbers - ins_codes = [r.GetNumber().GetInsCode() for r in ch.residues] - if len(set(ins_codes)) != 1 or ins_codes[0] != '\0': - raise RuntimeError("Residue numbers in input structures must not " - "contain insertion codes") - - # check if residue numbers are strictly increasing - nums = [r.GetNumber().GetNum() for r in ch.residues] - if not all(i < j for i, j in zip(nums, nums[1:])): - raise RuntimeError("Residue numbers in input structures must be " - "strictly increasing for each chain") - - s = ''.join([r.one_letter_code for r in ch.residues]) - s = seq.CreateSequence(ch.GetName(), s) - s.AttachView(view.Select(f"cname={ch.GetName()}")) - if n_pep == n_res: - polypep_seqs.AddSequence(s) - elif n_nuc == n_res: - polynuc_seqs.AddSequence(s) - else: - raise RuntimeError("This shouldnt happen") - - # select for chains for which we actually extracted the sequence - chain_names = [s.GetAttachedView().chains[0].name for s in polypep_seqs] - chain_names += [s.GetAttachedView().chains[0].name for s in polynuc_seqs] - query = f"cname={','.join(chain_names)}" - view = view.Select(query) + return (view, polypep_seqs, polynuc_seqs) - return (view, polypep_seqs, polynuc_seqs) +# INTERNAL HELPERS +################## class _Aligner: def __init__(self, pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -5, pep_gap_ext = -2, nuc_subst_mat = seq.alg.NUC44, -- GitLab