diff --git a/modules/mol/alg/doc/scoring_base.rst b/modules/mol/alg/doc/scoring_base.rst new file mode 100644 index 0000000000000000000000000000000000000000..1df6993399bb421953d1e7d11f5ad9ebb2f87a1f --- /dev/null +++ b/modules/mol/alg/doc/scoring_base.rst @@ -0,0 +1,7 @@ +:mod:`~ost.mol.alg.scoring_base` -- Helper functions for scoring +--------------------------------------------------------------------------------- + +.. automodule:: ost.mol.alg.scoring_base + :members: + :member-order: bysource + :synopsis: Helper functions for scoring \ No newline at end of file diff --git a/modules/mol/alg/pymod/scoring_base.py b/modules/mol/alg/pymod/scoring_base.py index 3aa58a5d04ad26137f14d65eb9d4501d5c9a7332..32351e079c0b08f330e7c5c98fa4160f2e34e0ba 100644 --- a/modules/mol/alg/pymod/scoring_base.py +++ b/modules/mol/alg/pymod/scoring_base.py @@ -2,6 +2,7 @@ import ost from ost import io from ost import conop from ost import mol +from ost import seq def CleanHydrogens(ent, clib): @@ -30,21 +31,30 @@ def CleanHydrogens(ent, clib): def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, - fault_tolerant=False, allow_heuristic_conn=False): + fault_tolerant=False, allow_heuristic_conn=False, + extract_seqres_mapping=False): """ Scoring helper - Prepares input from mmCIF Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated to scoring classes. + Depending on input flags, the following outputs can be retrieved: + + * poly_ent (:class:`ost.mol.EntityHandle`): An OpenStructure entity with only + polymer chains. + * non_poly_entities (:class:`list` of :class:`ost.mol.EntityHandle`): + OpenStructure entities representing all non-polymer (ligand) entities. + * seqres (:class:`ost.seq.SequenceList`): Seqres sequences with entity id + as sequence names and the respective canonical seqres as sequence. + * trg_seqres_mapping (:class:`dict`): Dictionary with chain names in + poly_ent as keys and the respective entity ids as values. + :param mmcif_path: Path to mmCIF file that contains polymer and optionally non-polymer entities :type mmcif_path: :class:`str` :param biounit: If given, construct specified biounit from mmCIF AU :type biounit: :class:`str` - :param extract_nonpoly: Additionally returns a list of - :class:`ost.mol.EntityHandle` - objects representing all non-polymer (ligand) - entities. + :param extract_nonpoly: Controls return value :type extract_nonpoly: :class:`bool` :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF` :type fault_tolerant: :class:`bool` @@ -57,11 +67,14 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, of a distance based heuristic as fallback. With all its consequences in ligand matching. :type allow_heuristic_conn: :class:`bool` - :returns: :class:`ost.mol.EntityHandle` which only contains polymer - entities representing the receptor structure. If *extract_nonpoly* - is True, a tuple is returned which additionally contains a - :class:`list` of :class:`ost.mol.EntityHandle`, where each - :class:`ost.mol.EntityHandle` represents a non-polymer (ligand). + :param extract_seqres_mapping: Controls return value + :type extract_seqres_mapping: :class:`bool` + :returns: poly_ent if *extract_nonpoly*/*extract_seqres_mapping* are False. + (poly_ent, non_poly_entities) if *extract_nonpoly* is True. + (poly_ent, seqres, trg_seqres_mapping) if *extract_seqres_mapping* + is True. + (poly_ent, non_poly_entities, seqres, trg_seqres_mapping) if both + flags are True. """ clib = conop.GetDefaultLib() if not clib: @@ -70,8 +83,15 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, "https://openstructure.org/docs/conop/compoundlib/.") raise RuntimeError("No compound library found") - mmcif_entity, mmcif_info = io.LoadMMCIF(mmcif_path, info=True, - fault_tolerant=fault_tolerant) + # return variables that will be defined depending on input flags + poly_ent = None + non_poly_entities = None + seqres = None + trg_seqres_mapping = None + + + mmcif_entity, mmcif_seqres, mmcif_info = io.LoadMMCIF(mmcif_path, seqres=True, info=True, + fault_tolerant=fault_tolerant) mmcif_entity = CleanHydrogens(mmcif_entity, clib) # get AU chain names representing polymer entities @@ -125,30 +145,63 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, poly_sel = mmcif_entity.Select("gcpoly:0=1") poly_ent = mol.CreateEntityFromView(poly_sel, True) - if extract_nonpoly == False: + if extract_nonpoly: + non_poly_sel = mmcif_entity.Select("gcnonpoly:0=1") + non_poly_entities = list() + for i in range(non_poly_id): + view = mmcif_entity.Select(f"gcnonpolyid:{non_poly_id}={i}") + if view.GetResidueCount() != 1: + raise RuntimeError(f"Expect non-polymer entities in " + f"{mmcif_path} to contain exactly 1 " + f"residue. Got {ch.GetResidueCount()} " + f"in chain {ch.name}") + if not allow_heuristic_conn: + compound = clib.FindCompound(view.residues[0].name) + if compound is None: + raise RuntimeError(f"Can only extract non-polymer entities if " + f"respective residues are available in PDB " + f"component dictionary. Can't find " + f"\"{view.residues[0].name}\"") + + non_poly_entities.append(mol.CreateEntityFromView(view, True)) + + if extract_seqres_mapping: + # mmcif seqres is a list of sequences that relates to + # chain names in the assymetric unit. What we want is a list + # of sequences that relate to the underlying entities. + seqres = seq.CreateSequenceList() + seqres_processed = set() + + for s in mmcif_seqres: + entity_id = mmcif_info.GetMMCifEntityIdTr(s.GetName()) + if entity_id not in seqres_processed: + seqres_processed.add(entity_id) + seqres.AddSequence(seq.CreateSequence(entity_id, s.GetGaplessString())) + + trg_seqres_mapping = dict() + if biounit is None: + cnames = [ch.name for ch in poly_ent.chains] + for cname in cnames: + trg_seqres_mapping[cname] = mmcif_info.GetMMCifEntityIdTr(cname) + else: + bu_cnames = [ch.name for ch in poly_ent.chains] + au_cnames = list() + for bu_cname in bu_cnames: + dot_idx = bu_cname.index(".") + au_cnames.append(bu_cname[dot_idx + 1 :]) + for au_cname, bu_cname in zip(au_cnames, bu_cnames): + trg_seqres_mapping[bu_cname] = mmcif_info.GetMMCifEntityIdTr(au_cname) + + + if extract_nonpoly and extract_seqres_mapping: + return (poly_ent, non_poly_entities, seqres, trg_seqres_mapping) + elif extract_nonpoly: + return (poly_ent, non_poly_entities) + elif extract_seqres_mapping: + return (poly_ent, seqres, trg_seqres_mapping) + else: return poly_ent - non_poly_sel = mmcif_entity.Select("gcnonpoly:0=1") - non_poly_entities = list() - for i in range(non_poly_id): - view = mmcif_entity.Select(f"gcnonpolyid:{non_poly_id}={i}") - if view.GetResidueCount() != 1: - raise RuntimeError(f"Expect non-polymer entities in " - f"{mmcif_path} to contain exactly 1 " - f"residue. Got {ch.GetResidueCount()} " - f"in chain {ch.name}") - if not allow_heuristic_conn: - compound = clib.FindCompound(view.residues[0].name) - if compound is None: - raise RuntimeError(f"Can only extract non-polymer entities if " - f"respective residues are available in PDB " - f"component dictionary. Can't find " - f"\"{view.residues[0].name}\"") - - non_poly_entities.append(mol.CreateEntityFromView(view, True)) - - return (poly_ent, non_poly_entities) - def PDBPrep(pdb_path, fault_tolerant=False): """ Scoring helper - Prepares scoring input from PDB diff --git a/modules/mol/alg/tests/CMakeLists.txt b/modules/mol/alg/tests/CMakeLists.txt index 1c030fec3041690dfaa756a8931d9d97f7e21eba..319e86fe85d13348ac0ec6758c9d3b607608295e 100644 --- a/modules/mol/alg/tests/CMakeLists.txt +++ b/modules/mol/alg/tests/CMakeLists.txt @@ -22,7 +22,8 @@ if (COMPOUND_LIB) test_nonstandard.py test_chain_mapping.py test_ligand_scoring.py - test_scoring.py) + test_scoring.py + test_scoring_base.py) endif() ost_unittest(MODULE mol_alg SOURCES "${OST_MOL_ALG_UNIT_TESTS}" LINK ost_io) diff --git a/modules/mol/alg/tests/test_scoring_base.py b/modules/mol/alg/tests/test_scoring_base.py new file mode 100644 index 0000000000000000000000000000000000000000..bc917f80196e1e85daec69eb9f17580ae9aa668b --- /dev/null +++ b/modules/mol/alg/tests/test_scoring_base.py @@ -0,0 +1,93 @@ +import unittest +import os + +import ost +from ost import io +from ost import seq +from ost.mol.alg import scoring_base + +def _GetTestfilePath(filename): + """Get the path to the test file given filename""" + return os.path.join('testfiles', filename) + +class TestLigandScoringFancy(unittest.TestCase): + + def test_MMCIFPrep(self): + + poly_ent = scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz")) + self.assertEqual(poly_ent.GetChainCount(), 4) + cnames = [ch.name for ch in poly_ent.chains] + self.assertEqual(cnames, ["A", "B", "C", "D"]) + + + # test enabling extract_nonpoly flag + poly_ent, non_poly_entities = scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"), + extract_nonpoly=True) + self.assertEqual(poly_ent.GetChainCount(), 4) + cnames = [ch.name for ch in poly_ent.chains] + self.assertEqual(cnames, ["A", "B", "C", "D"]) + self.assertEqual(len(non_poly_entities), 7) + nonpoly_names = [ent.residues[0].name for ent in non_poly_entities] + self.assertEqual(nonpoly_names, ["MG", "G3D", "AFB", "ZN", "MG", "G3D", "AFB"]) + + # test enabling extract_seqres_mapping flag + poly_ent, seqres, trg_seqres_mapping = scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"), + extract_seqres_mapping=True) + self.assertEqual(poly_ent.GetChainCount(), 4) + cnames = [ch.name for ch in poly_ent.chains] + self.assertEqual(cnames, ["A", "B", "C", "D"]) + + self.assertEqual(len(seqres), 2) + seqres_1 = seq.CreateSequence("1", "MGNIFANLFKGLFGKKEMRILMVGLDAAGKTTIL" + "YKLKLGEIVTTIPTIGFNVETVEYKNISFTVWDV" + "GGQDKIRPLWRHYFQNTQGLIFVVDSNDRERVNE" + "AREELMRMLAEDELRDAVLLVFANKQDLPNAMNA" + "AEITDKLGLHSLRHRNWYIQATCATSGDGLYEGL" + "DWLSNQLRNQK") + seqres_2 = seq.CreateSequence("2", "LEANEGSKTLQRNRKMAMGRKKFNMDPKKGIQFL" + "VENELLQNTPEEIARFLYKGEGLNKTAIGDYLGE" + "REELNLAVLHAFVDLHEFTDLNLVQALRQFLWSF" + "RLPGEAQKIDRMMEAFAQRYCLCNPGVFQSTDTC" + "YVLSYSVIMLNTDLHNPNVRDKMGLERFVAMNRG" + "INEGGDLPEELLRNLYDSIRNEPFKIPEDDGND") + + self.assertEqual(seqres[0].GetName(), seqres_1.GetName()) + self.assertEqual(seqres[0].GetGaplessString(), seqres_1.GetGaplessString()) + self.assertEqual(seqres[1].GetName(), seqres_2.GetName()) + self.assertEqual(seqres[1].GetGaplessString(), seqres_2.GetGaplessString()) + expected_trg_seqres_mapping = {"A": "1", + "B": "2", + "C": "1", + "D": "2"} + self.assertEqual(len(expected_trg_seqres_mapping), len(trg_seqres_mapping)) + for k,v in trg_seqres_mapping.items(): + self.assertEqual(expected_trg_seqres_mapping[k], v) + + # specify biounit and enable EVERYTHING + poly_ent, non_poly_entities, seqres, trg_seqres_mapping =\ + scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"), + extract_nonpoly=True, + extract_seqres_mapping=True, + biounit="1") + self.assertEqual(poly_ent.GetChainCount(), 2) + cnames = [ch.name for ch in poly_ent.chains] + self.assertEqual(cnames, ["1.A", "1.B"]) + self.assertEqual(len(non_poly_entities), 4) + nonpoly_names = [ent.residues[0].name for ent in non_poly_entities] + self.assertEqual(nonpoly_names, ["MG", "G3D", "AFB", "ZN"]) + self.assertEqual(seqres[0].GetName(), seqres_1.GetName()) + self.assertEqual(seqres[0].GetGaplessString(), seqres_1.GetGaplessString()) + self.assertEqual(seqres[1].GetName(), seqres_2.GetName()) + self.assertEqual(seqres[1].GetGaplessString(), seqres_2.GetGaplessString()) + expected_trg_seqres_mapping = {"1.A": "1", + "1.B": "2"} + self.assertEqual(len(expected_trg_seqres_mapping), len(trg_seqres_mapping)) + for k,v in trg_seqres_mapping.items(): + self.assertEqual(expected_trg_seqres_mapping[k], v) + +if __name__ == "__main__": + from ost import testutils + if testutils.DefaultCompoundLibIsSet(): + testutils.RunTests() + else: + print('No compound lib available. Ignoring test_scoring_base.py tests.') \ No newline at end of file