Skip to content
Snippets Groups Projects
Commit bb8ad358 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

MMCIFPrep on steroids

parent fbeac0db
Branches
Tags
No related merge requests found
:mod:`~ost.mol.alg.scoring_base` -- Helper functions for scoring
---------------------------------------------------------------------------------
.. automodule:: ost.mol.alg.scoring_base
:members:
:member-order: bysource
:synopsis: Helper functions for scoring
\ No newline at end of file
...@@ -2,6 +2,7 @@ import ost ...@@ -2,6 +2,7 @@ import ost
from ost import io from ost import io
from ost import conop from ost import conop
from ost import mol from ost import mol
from ost import seq
def CleanHydrogens(ent, clib): def CleanHydrogens(ent, clib):
...@@ -30,21 +31,30 @@ def CleanHydrogens(ent, clib): ...@@ -30,21 +31,30 @@ def CleanHydrogens(ent, clib):
def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
fault_tolerant=False, allow_heuristic_conn=False): fault_tolerant=False, allow_heuristic_conn=False,
extract_seqres_mapping=False):
""" Scoring helper - Prepares input from mmCIF """ Scoring helper - Prepares input from mmCIF
Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated
to scoring classes. to scoring classes.
Depending on input flags, the following outputs can be retrieved:
* poly_ent (:class:`ost.mol.EntityHandle`): An OpenStructure entity with only
polymer chains.
* non_poly_entities (:class:`list` of :class:`ost.mol.EntityHandle`):
OpenStructure entities representing all non-polymer (ligand) entities.
* seqres (:class:`ost.seq.SequenceList`): Seqres sequences with entity id
as sequence names and the respective canonical seqres as sequence.
* trg_seqres_mapping (:class:`dict`): Dictionary with chain names in
poly_ent as keys and the respective entity ids as values.
:param mmcif_path: Path to mmCIF file that contains polymer and optionally :param mmcif_path: Path to mmCIF file that contains polymer and optionally
non-polymer entities non-polymer entities
:type mmcif_path: :class:`str` :type mmcif_path: :class:`str`
:param biounit: If given, construct specified biounit from mmCIF AU :param biounit: If given, construct specified biounit from mmCIF AU
:type biounit: :class:`str` :type biounit: :class:`str`
:param extract_nonpoly: Additionally returns a list of :param extract_nonpoly: Controls return value
:class:`ost.mol.EntityHandle`
objects representing all non-polymer (ligand)
entities.
:type extract_nonpoly: :class:`bool` :type extract_nonpoly: :class:`bool`
:param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF` :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF`
:type fault_tolerant: :class:`bool` :type fault_tolerant: :class:`bool`
...@@ -57,11 +67,14 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, ...@@ -57,11 +67,14 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
of a distance based heuristic as fallback. of a distance based heuristic as fallback.
With all its consequences in ligand matching. With all its consequences in ligand matching.
:type allow_heuristic_conn: :class:`bool` :type allow_heuristic_conn: :class:`bool`
:returns: :class:`ost.mol.EntityHandle` which only contains polymer :param extract_seqres_mapping: Controls return value
entities representing the receptor structure. If *extract_nonpoly* :type extract_seqres_mapping: :class:`bool`
is True, a tuple is returned which additionally contains a :returns: poly_ent if *extract_nonpoly*/*extract_seqres_mapping* are False.
:class:`list` of :class:`ost.mol.EntityHandle`, where each (poly_ent, non_poly_entities) if *extract_nonpoly* is True.
:class:`ost.mol.EntityHandle` represents a non-polymer (ligand). (poly_ent, seqres, trg_seqres_mapping) if *extract_seqres_mapping*
is True.
(poly_ent, non_poly_entities, seqres, trg_seqres_mapping) if both
flags are True.
""" """
clib = conop.GetDefaultLib() clib = conop.GetDefaultLib()
if not clib: if not clib:
...@@ -70,8 +83,15 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, ...@@ -70,8 +83,15 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
"https://openstructure.org/docs/conop/compoundlib/.") "https://openstructure.org/docs/conop/compoundlib/.")
raise RuntimeError("No compound library found") raise RuntimeError("No compound library found")
mmcif_entity, mmcif_info = io.LoadMMCIF(mmcif_path, info=True, # return variables that will be defined depending on input flags
fault_tolerant=fault_tolerant) poly_ent = None
non_poly_entities = None
seqres = None
trg_seqres_mapping = None
mmcif_entity, mmcif_seqres, mmcif_info = io.LoadMMCIF(mmcif_path, seqres=True, info=True,
fault_tolerant=fault_tolerant)
mmcif_entity = CleanHydrogens(mmcif_entity, clib) mmcif_entity = CleanHydrogens(mmcif_entity, clib)
# get AU chain names representing polymer entities # get AU chain names representing polymer entities
...@@ -125,30 +145,63 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, ...@@ -125,30 +145,63 @@ def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
poly_sel = mmcif_entity.Select("gcpoly:0=1") poly_sel = mmcif_entity.Select("gcpoly:0=1")
poly_ent = mol.CreateEntityFromView(poly_sel, True) poly_ent = mol.CreateEntityFromView(poly_sel, True)
if extract_nonpoly == False: if extract_nonpoly:
non_poly_sel = mmcif_entity.Select("gcnonpoly:0=1")
non_poly_entities = list()
for i in range(non_poly_id):
view = mmcif_entity.Select(f"gcnonpolyid:{non_poly_id}={i}")
if view.GetResidueCount() != 1:
raise RuntimeError(f"Expect non-polymer entities in "
f"{mmcif_path} to contain exactly 1 "
f"residue. Got {ch.GetResidueCount()} "
f"in chain {ch.name}")
if not allow_heuristic_conn:
compound = clib.FindCompound(view.residues[0].name)
if compound is None:
raise RuntimeError(f"Can only extract non-polymer entities if "
f"respective residues are available in PDB "
f"component dictionary. Can't find "
f"\"{view.residues[0].name}\"")
non_poly_entities.append(mol.CreateEntityFromView(view, True))
if extract_seqres_mapping:
# mmcif seqres is a list of sequences that relates to
# chain names in the assymetric unit. What we want is a list
# of sequences that relate to the underlying entities.
seqres = seq.CreateSequenceList()
seqres_processed = set()
for s in mmcif_seqres:
entity_id = mmcif_info.GetMMCifEntityIdTr(s.GetName())
if entity_id not in seqres_processed:
seqres_processed.add(entity_id)
seqres.AddSequence(seq.CreateSequence(entity_id, s.GetGaplessString()))
trg_seqres_mapping = dict()
if biounit is None:
cnames = [ch.name for ch in poly_ent.chains]
for cname in cnames:
trg_seqres_mapping[cname] = mmcif_info.GetMMCifEntityIdTr(cname)
else:
bu_cnames = [ch.name for ch in poly_ent.chains]
au_cnames = list()
for bu_cname in bu_cnames:
dot_idx = bu_cname.index(".")
au_cnames.append(bu_cname[dot_idx + 1 :])
for au_cname, bu_cname in zip(au_cnames, bu_cnames):
trg_seqres_mapping[bu_cname] = mmcif_info.GetMMCifEntityIdTr(au_cname)
if extract_nonpoly and extract_seqres_mapping:
return (poly_ent, non_poly_entities, seqres, trg_seqres_mapping)
elif extract_nonpoly:
return (poly_ent, non_poly_entities)
elif extract_seqres_mapping:
return (poly_ent, seqres, trg_seqres_mapping)
else:
return poly_ent return poly_ent
non_poly_sel = mmcif_entity.Select("gcnonpoly:0=1")
non_poly_entities = list()
for i in range(non_poly_id):
view = mmcif_entity.Select(f"gcnonpolyid:{non_poly_id}={i}")
if view.GetResidueCount() != 1:
raise RuntimeError(f"Expect non-polymer entities in "
f"{mmcif_path} to contain exactly 1 "
f"residue. Got {ch.GetResidueCount()} "
f"in chain {ch.name}")
if not allow_heuristic_conn:
compound = clib.FindCompound(view.residues[0].name)
if compound is None:
raise RuntimeError(f"Can only extract non-polymer entities if "
f"respective residues are available in PDB "
f"component dictionary. Can't find "
f"\"{view.residues[0].name}\"")
non_poly_entities.append(mol.CreateEntityFromView(view, True))
return (poly_ent, non_poly_entities)
def PDBPrep(pdb_path, fault_tolerant=False): def PDBPrep(pdb_path, fault_tolerant=False):
""" Scoring helper - Prepares scoring input from PDB """ Scoring helper - Prepares scoring input from PDB
......
...@@ -22,7 +22,8 @@ if (COMPOUND_LIB) ...@@ -22,7 +22,8 @@ if (COMPOUND_LIB)
test_nonstandard.py test_nonstandard.py
test_chain_mapping.py test_chain_mapping.py
test_ligand_scoring.py test_ligand_scoring.py
test_scoring.py) test_scoring.py
test_scoring_base.py)
endif() endif()
ost_unittest(MODULE mol_alg SOURCES "${OST_MOL_ALG_UNIT_TESTS}" LINK ost_io) ost_unittest(MODULE mol_alg SOURCES "${OST_MOL_ALG_UNIT_TESTS}" LINK ost_io)
import unittest
import os
import ost
from ost import io
from ost import seq
from ost.mol.alg import scoring_base
def _GetTestfilePath(filename):
"""Get the path to the test file given filename"""
return os.path.join('testfiles', filename)
class TestLigandScoringFancy(unittest.TestCase):
def test_MMCIFPrep(self):
poly_ent = scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"))
self.assertEqual(poly_ent.GetChainCount(), 4)
cnames = [ch.name for ch in poly_ent.chains]
self.assertEqual(cnames, ["A", "B", "C", "D"])
# test enabling extract_nonpoly flag
poly_ent, non_poly_entities = scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
extract_nonpoly=True)
self.assertEqual(poly_ent.GetChainCount(), 4)
cnames = [ch.name for ch in poly_ent.chains]
self.assertEqual(cnames, ["A", "B", "C", "D"])
self.assertEqual(len(non_poly_entities), 7)
nonpoly_names = [ent.residues[0].name for ent in non_poly_entities]
self.assertEqual(nonpoly_names, ["MG", "G3D", "AFB", "ZN", "MG", "G3D", "AFB"])
# test enabling extract_seqres_mapping flag
poly_ent, seqres, trg_seqres_mapping = scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
extract_seqres_mapping=True)
self.assertEqual(poly_ent.GetChainCount(), 4)
cnames = [ch.name for ch in poly_ent.chains]
self.assertEqual(cnames, ["A", "B", "C", "D"])
self.assertEqual(len(seqres), 2)
seqres_1 = seq.CreateSequence("1", "MGNIFANLFKGLFGKKEMRILMVGLDAAGKTTIL"
"YKLKLGEIVTTIPTIGFNVETVEYKNISFTVWDV"
"GGQDKIRPLWRHYFQNTQGLIFVVDSNDRERVNE"
"AREELMRMLAEDELRDAVLLVFANKQDLPNAMNA"
"AEITDKLGLHSLRHRNWYIQATCATSGDGLYEGL"
"DWLSNQLRNQK")
seqres_2 = seq.CreateSequence("2", "LEANEGSKTLQRNRKMAMGRKKFNMDPKKGIQFL"
"VENELLQNTPEEIARFLYKGEGLNKTAIGDYLGE"
"REELNLAVLHAFVDLHEFTDLNLVQALRQFLWSF"
"RLPGEAQKIDRMMEAFAQRYCLCNPGVFQSTDTC"
"YVLSYSVIMLNTDLHNPNVRDKMGLERFVAMNRG"
"INEGGDLPEELLRNLYDSIRNEPFKIPEDDGND")
self.assertEqual(seqres[0].GetName(), seqres_1.GetName())
self.assertEqual(seqres[0].GetGaplessString(), seqres_1.GetGaplessString())
self.assertEqual(seqres[1].GetName(), seqres_2.GetName())
self.assertEqual(seqres[1].GetGaplessString(), seqres_2.GetGaplessString())
expected_trg_seqres_mapping = {"A": "1",
"B": "2",
"C": "1",
"D": "2"}
self.assertEqual(len(expected_trg_seqres_mapping), len(trg_seqres_mapping))
for k,v in trg_seqres_mapping.items():
self.assertEqual(expected_trg_seqres_mapping[k], v)
# specify biounit and enable EVERYTHING
poly_ent, non_poly_entities, seqres, trg_seqres_mapping =\
scoring_base.MMCIFPrep(_GetTestfilePath("1r8q.cif.gz"),
extract_nonpoly=True,
extract_seqres_mapping=True,
biounit="1")
self.assertEqual(poly_ent.GetChainCount(), 2)
cnames = [ch.name for ch in poly_ent.chains]
self.assertEqual(cnames, ["1.A", "1.B"])
self.assertEqual(len(non_poly_entities), 4)
nonpoly_names = [ent.residues[0].name for ent in non_poly_entities]
self.assertEqual(nonpoly_names, ["MG", "G3D", "AFB", "ZN"])
self.assertEqual(seqres[0].GetName(), seqres_1.GetName())
self.assertEqual(seqres[0].GetGaplessString(), seqres_1.GetGaplessString())
self.assertEqual(seqres[1].GetName(), seqres_2.GetName())
self.assertEqual(seqres[1].GetGaplessString(), seqres_2.GetGaplessString())
expected_trg_seqres_mapping = {"1.A": "1",
"1.B": "2"}
self.assertEqual(len(expected_trg_seqres_mapping), len(trg_seqres_mapping))
for k,v in trg_seqres_mapping.items():
self.assertEqual(expected_trg_seqres_mapping[k], v)
if __name__ == "__main__":
from ost import testutils
if testutils.DefaultCompoundLibIsSet():
testutils.RunTests()
else:
print('No compound lib available. Ignoring test_scoring_base.py tests.')
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment