From 6347e9bfeba9765fb51e563546862a02c2433c4a Mon Sep 17 00:00:00 2001
From: Marco Biasini <marco.biasini@unibas.ch>
Date: Sun, 3 Feb 2013 12:05:45 +0100
Subject: [PATCH] move the bulk of PDBize to C++

---
 modules/geom/pymod/export_mat4.cc     |   7 ++
 modules/geom/src/mat4.hh              |   3 +
 modules/io/pymod/__init__.py          | 107 ++----------------
 modules/io/tests/test_io_mmcif.py     |   6 +-
 modules/mol/alg/pymod/wrap_mol_alg.cc |   3 +
 modules/mol/alg/src/CMakeLists.txt    |   2 +
 modules/mol/alg/src/pdbize.cc         | 157 ++++++++++++++++++++++++++
 modules/mol/alg/src/pdbize.hh         |  40 +++++++
 8 files changed, 224 insertions(+), 101 deletions(-)
 create mode 100644 modules/mol/alg/src/pdbize.cc
 create mode 100644 modules/mol/alg/src/pdbize.hh

diff --git a/modules/geom/pymod/export_mat4.cc b/modules/geom/pymod/export_mat4.cc
index ec003a0e3..c457eb261 100644
--- a/modules/geom/pymod/export_mat4.cc
+++ b/modules/geom/pymod/export_mat4.cc
@@ -18,10 +18,12 @@
 //------------------------------------------------------------------------------
 #define BOOST_PYTHON_MAX_ARITY 20
 #include <boost/python.hpp>
+#include <boost/python/suite/indexing/vector_indexing_suite.hpp>
 #include <boost/python/slice.hpp>
 using namespace boost::python;
 
 #include <ost/geom/geom.hh>
+#include <ost/geom/export_helper/vector.hh>
 using namespace geom;
 
 
@@ -138,4 +140,9 @@ void export_Mat4()
     .def("PasteTranslation",&Mat4::PasteTranslation)
     .add_property("data",mat4_data)
   ;
+
+  class_<Mat4List>("Mat4List", init<>())
+    .def(vector_indexing_suite<Mat4List>())
+    .def(geom::VectorAdditions<Mat4List>())
+  ;
 }
diff --git a/modules/geom/src/mat4.hh b/modules/geom/src/mat4.hh
index 998bbc485..95ca3d632 100644
--- a/modules/geom/src/mat4.hh
+++ b/modules/geom/src/mat4.hh
@@ -22,6 +22,7 @@
 #include <cassert>
 #include <cstddef> // for size_t
 #include <ostream>
+#include <vector>
 
 #include <boost/operators.hpp>
 
@@ -136,6 +137,8 @@ double i32, double i33);
 
 };
 
+typedef std::vector<Mat4> Mat4List;
+
 DLLEXPORT_OST_GEOM std::ostream& operator<<(std::ostream& os, const Mat4& m);
 
 } // ns geom
diff --git a/modules/io/pymod/__init__.py b/modules/io/pymod/__init__.py
index 2d402eeac..741b9572f 100644
--- a/modules/io/pymod/__init__.py
+++ b/modules/io/pymod/__init__.py
@@ -19,7 +19,7 @@
 import os, tempfile, ftplib, httplib
 
 from _ost_io import *
-from ost import mol, geom, conop
+from ost import mol, geom, conop, seq
 
 profiles=None
 
@@ -358,29 +358,12 @@ def LoadMMCIF(filename, restrict_chains="", fault_tolerant=None, calpha_only=Non
 # MMCifInfoBioUnit.PDBize, since this function is not included in SPHINX.
 def _PDBize(biounit, asu, seqres=None, min_polymer_size=10,
             transformation=False):
-  def _CopyAtoms(src_res, dst_res, edi, trans=geom.Mat4()):
-    atom_pos_wrong = False
-    for atom in src_res.atoms:
-      tmp_pos = geom.Vec4(atom.pos)
-      new_atom=edi.InsertAtom(dst_res, atom.name, geom.Vec3(trans*tmp_pos), 
-                              element=atom.element,
-                              occupancy=atom.occupancy, 
-                              b_factor=atom.b_factor,
-                              is_hetatm=atom.is_hetatom)
-      for p in range(0,3):
-        if new_atom.pos[p] <= -1000:
-          atom_pos_wrong = True
-        elif new_atom.pos[p] >= 10000:
-          atom_pos_wrong = True
-    return atom_pos_wrong
-
-  chain_names='ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz'
   # create list of operations
   # for cartesian products, operations are stored in a list, multiplied with
   # the next list of operations and re-stored... until all lists of operations
   # are multiplied in an all-against-all manner.
   operations = biounit.GetOperations()
-  trans_matrices = list()
+  trans_matrices = geom.Mat4List()
   if len(operations) > 0:
     for op in operations[0]:
       rot = geom.Mat4()
@@ -407,86 +390,14 @@ def _PDBize(biounit, asu, seqres=None, min_polymer_size=10,
   assu = asu.Select('cname=' + ','.join(biounit.GetChainList()))
   # use each transformation on the view, store as entity and transform, PDBize
   # the result while adding everything to one large entity
-  pdb_bu = mol.CreateEntity()
-  edi = pdb_bu.EditXCS(mol.BUFFERED_EDIT)
-  cur_chain_name = 0
-  water_chain = mol.ChainHandle()
-  ligand_chain = mol.ChainHandle()
-  a_pos_wrong = False
-  for tr in trans_matrices:
-    # do a PDBize, add each new entity to the end product
-    for chain in assu.chains:
-      residue_count = len(chain.residues)
-      if seqres:
-        seqres_chain = seqres.FindSequence(chain.name)
-        if seqres_chain.IsValid():
-          residue_count = len(seqres_chain)
-      if chain.is_polymer and residue_count >= min_polymer_size:
-        if len(chain_names) == cur_chain_name:
-          raise RuntimeError('Running out of chain names')
-        new_chain = edi.InsertChain(chain_names[cur_chain_name])
-        cur_chain_name += 1
-        edi.SetChainDescription(new_chain, chain.description)
-        edi.SetChainType(new_chain, chain.type)
-        new_chain.SetStringProp('original_name', chain.name)
-        if chain.HasProp("pdb_auth_chain_name"):
-          new_chain.SetStringProp("pdb_auth_chain_name", 
-                                  chain.GetStringProp("pdb_auth_chain_name"))
-        for res in chain.residues:
-          new_res = edi.AppendResidue(new_chain, res.name, res.number)
-          a_b = _CopyAtoms(res, new_res, edi, tr)
-          if not a_pos_wrong:
-            a_pos_wrong = a_b
-      elif chain.type == mol.CHAINTYPE_WATER:
-        if not water_chain.IsValid():
-          # water gets '-' as name
-          water_chain = edi.InsertChain('-')
-          edi.SetChainDescription(water_chain, chain.description)
-          edi.SetChainType(water_chain, chain.type)
-        for res in chain.residues:
-          new_res = edi.AppendResidue(water_chain, res.name)
-          new_res.SetStringProp('type', mol.StringFromChainType(chain.type))
-          new_res.SetStringProp('description', chain.description)
-          a_b = _CopyAtoms(res, new_res, edi, tr)
-          if not a_pos_wrong:
-            a_pos_wrong = a_b
-      else:
-        if not ligand_chain.IsValid():
-          # all ligands, put in one chain, are named '_'
-          ligand_chain = edi.InsertChain('_')
-          last_rnum = 0
-        else:
-          last_rnum = ligand_chain.residues[-1].number.num
-        residues=chain.residues
-        ins_code='\0'
-        if len(residues)>1:
-          ins_code='A'
-        for res in chain.residues:
-          new_res = edi.AppendResidue(ligand_chain, res.name, 
-                                      mol.ResNum(last_rnum+1, ins_code))
-          new_res.SetStringProp('description', chain.description)
-          new_res.SetStringProp('type', mol.StringFromChainType(chain.type))
-          new_res.SetStringProp("original_name", chain.name)
-          if chain.HasProp("pdb_auth_chain_name"):
-            new_res.SetStringProp("pdb_auth_chain_name",
-                                  chain.GetStringProp("pdb_auth_chain_name"))
-          ins_code = chr(ord(ins_code)+1)
-          _CopyAtoms(res, new_res, edi, tr)
-          a_b = _CopyAtoms(res, new_res, edi, tr)
-          if not a_pos_wrong:
-            a_pos_wrong = a_b
-  move_to_origin = None
-  if a_pos_wrong:
-    start = pdb_bu.bounds.min
-    move_to_origin = geom.Mat4(1,0,0,(-999 - start[0]),
-                               0,1,0,(-999 - start[1]),
-                               0,0,1,(-999 - start[2]),
-                               0,0,0,1)
-    edi = pdb_bu.EditXCS(mol.UNBUFFERED_EDIT)
-    edi.ApplyTransform(move_to_origin)
-  conop.ConnectAll(pdb_bu)
+  ss = seqres
+  if not ss:
+    ss = seq.SequenceList()
+
+  pdb_bu = mol.alg.PDBize(assu, trans_matrices, ss, min_polymer_size=min_polymer_size, 
+                          shift_to_fit=transformation)
   if transformation:
-    return pdb_bu, move_to_origin
+    return pdb_bu, pdb_bu.GetTransformationMatrix()
   return pdb_bu
 
 MMCifInfoBioUnit.PDBize = _PDBize
diff --git a/modules/io/tests/test_io_mmcif.py b/modules/io/tests/test_io_mmcif.py
index bc8a35aa2..6753b48e2 100644
--- a/modules/io/tests/test_io_mmcif.py
+++ b/modules/io/tests/test_io_mmcif.py
@@ -171,9 +171,9 @@ class TestMMCifInfo(unittest.TestCase):
                                      seqres=True,
                                      info=True)
     pdb_ent, t = info.GetBioUnits()[0].PDBize(ent, transformation=True)
-    self.assertAlmostEquals(pdb_ent.GetCenterOfAtoms()[0], -915.759, 2)
-    self.assertAlmostEquals(pdb_ent.GetCenterOfAtoms()[1], -952.345, 2)
-    self.assertAlmostEquals(pdb_ent.GetCenterOfAtoms()[2], 3221.75, 2)
+    self.assertAlmostEquals(pdb_ent.GetCenterOfAtoms()[0], -915.759, 1)
+    self.assertAlmostEquals(pdb_ent.GetCenterOfAtoms()[1], -952.345, 1)
+    self.assertAlmostEquals(pdb_ent.GetCenterOfAtoms()[2], 3221.75, 1)
     self.assertEquals(geom.Equal(t,
                                  geom.Mat4(1,0,0,-920.462,
                                            0,1,0,-966.654,
diff --git a/modules/mol/alg/pymod/wrap_mol_alg.cc b/modules/mol/alg/pymod/wrap_mol_alg.cc
index bdf141aa5..f4bbb5f29 100644
--- a/modules/mol/alg/pymod/wrap_mol_alg.cc
+++ b/modules/mol/alg/pymod/wrap_mol_alg.cc
@@ -24,6 +24,7 @@
 #include <ost/mol/alg/superpose_frames.hh>
 #include <ost/mol/alg/filter_clashes.hh>
 #include <ost/mol/alg/consistency_checks.hh>
+#include <ost/mol/alg/pdbize.hh>
 #include <ost/export_helper/pair_to_tuple_conv.hh>
 
 using namespace boost::python;
@@ -168,6 +169,8 @@ BOOST_PYTHON_MODULE(_ost_mol_alg)
   def("PrintResidueRDMap",&mol::alg::PrintResidueRDMap);
  
   def("ResidueNamesMatch",&mol::alg::ResidueNamesMatch);
+  def("PDBize", &mol::alg::PDBize, (arg("asu"), arg("transformations"), arg("seqres"), 
+      arg("min_polymer_size")=10, arg("shift_to_fit")=false));
 
  
 }
diff --git a/modules/mol/alg/src/CMakeLists.txt b/modules/mol/alg/src/CMakeLists.txt
index eca9aa6e8..8f3450e88 100644
--- a/modules/mol/alg/src/CMakeLists.txt
+++ b/modules/mol/alg/src/CMakeLists.txt
@@ -10,6 +10,7 @@ set(OST_MOL_ALG_HEADERS
   trajectory_analysis.hh
   structure_analysis.hh
   consistency_checks.hh
+  pdbize.hh
 )
 
 set(OST_MOL_ALG_SOURCES
@@ -23,6 +24,7 @@ set(OST_MOL_ALG_SOURCES
   trajectory_analysis.cc
   structure_analysis.cc
   consistency_checks.cc
+  pdbize.cc
 )
 
 set(MOL_ALG_DEPS ost_mol ost_seq)
diff --git a/modules/mol/alg/src/pdbize.cc b/modules/mol/alg/src/pdbize.cc
new file mode 100644
index 000000000..093abe629
--- /dev/null
+++ b/modules/mol/alg/src/pdbize.cc
@@ -0,0 +1,157 @@
+//------------------------------------------------------------------------------
+// This file is part of the OpenStructure project <www.openstructure.org>
+//
+// Copyright (C) 2008-2011 by the OpenStructure authors
+//
+// This library is free software; you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 3.0 of the License, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+//------------------------------------------------------------------------------
+#include "pdbize.hh"
+#include <ost/mol/residue_handle.hh>
+#include <ost/mol/residue_view.hh>
+#include <ost/mol/atom_handle.hh>
+#include <ost/mol/atom_view.hh>
+#include <ost/mol/transfer_connectivity.hh>
+#include <ost/mol/chain_handle.hh>
+#include <ost/mol/chain_view.hh>
+#include <ost/mol/xcs_editor.hh>
+#include <ost/seq/sequence_handle.hh>
+
+namespace ost { namespace mol { namespace alg {
+
+const char* POLYPEPTIDE_CHAIN_NAMES="ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz";
+const char* LIGAND_CHAIN_NAME="_";
+const char* WATER_CHAIN_NAME="-";
+
+
+namespace {
+
+bool copy_atoms(ResidueView src_res, ResidueHandle dst_res, XCSEditor& edi, 
+                const geom::Mat4 transform) {
+  bool needs_adjustment = false;
+  for (AtomViewList::const_iterator 
+       i = src_res.GetAtomList().begin(), 
+       e = src_res.GetAtomList().end(); i != e; ++i) {
+
+    AtomHandle new_atom = edi.InsertAtom(dst_res, i->GetName(), 
+                                         geom::Vec3(transform*i->GetPos()),
+                                         i->GetName(), i->GetOccupancy(), 
+                                         i->GetBFactor(), i->IsHetAtom());
+    geom::Vec3 pos = new_atom.GetPos();
+    for (int j = 0; j<3; ++j) {
+      needs_adjustment|= pos[j]<=-1000 || pos[j]>=10000;
+    }
+  }
+  return needs_adjustment;
+}
+
+}
+EntityHandle PDBize(EntityView asu, const geom::Mat4List& transforms, 
+                    seq::SequenceList seqres, int min_polymer_size, 
+                    bool shift_to_fit)
+{
+  EntityHandle pdb_bu = CreateEntity();
+  XCSEditor edi = pdb_bu.EditXCS(BUFFERED_EDIT);
+
+  ChainHandle ligand_chain;
+  ChainHandle water_chain;
+  const char* curr_chain_name = POLYPEPTIDE_CHAIN_NAMES;
+  bool needs_adjustment = false;
+  int last_rnum = 0;
+  std::map<ResidueHandle, ResidueHandle> dst_to_src_map;
+  for (geom::Mat4List::const_iterator i = transforms.begin(), 
+       e = transforms.end(); i != e; ++i) {
+    for (ChainViewList::const_iterator j = asu.GetChainList().begin(),
+         e2 =asu.GetChainList().end(); j != e2; ++j) {
+      ChainView chain = *j; 
+      int chain_length = chain.GetResidueCount();
+      if (chain_length < min_polymer_size && seqres.IsValid()) {
+        seq::SequenceHandle s = seqres.FindSequence(chain.GetName());
+        if (s.IsValid())
+          chain_length = s.GetLength();
+      }
+      if (chain.IsPolymer() && chain_length >= min_polymer_size) {
+        if (*curr_chain_name == 0) {
+          throw std::runtime_error("running out of chain names");
+        }
+        ChainHandle new_chain = edi.InsertChain(String(1, curr_chain_name[0]));
+        curr_chain_name++;
+        edi.SetChainDescription(new_chain, chain.GetDescription());
+        edi.SetChainType(new_chain, chain.GetType());
+        new_chain.SetStringProp("original_name", chain.GetName());
+        if (chain.HasProp("pdb_auth_chain_name"))
+          new_chain.SetStringProp("pdb_auth_chain_name",
+                                  chain.GetStringProp("pdb_auth_chain_name"));
+        
+        for (ResidueViewList::const_iterator k = chain.GetResidueList().begin(),
+             e3 = chain.GetResidueList().end(); k != e3; ++k) {
+          ResidueHandle new_res = edi.AppendResidue(new_chain, k->GetName(),
+                                                    k->GetNumber());
+          dst_to_src_map[new_res] = k->GetHandle();
+          needs_adjustment |= copy_atoms(*k, new_res, edi, *i);
+        }
+        continue;
+      }
+      if (chain.GetType() == CHAINTYPE_WATER) {
+        if (!water_chain) {
+          water_chain = edi.InsertChain(WATER_CHAIN_NAME);
+          edi.SetChainDescription(water_chain, chain.GetDescription());
+          edi.SetChainType(water_chain, chain.GetType());
+        }
+        for (ResidueViewList::const_iterator k = chain.GetResidueList().begin(),
+             e3 = chain.GetResidueList().end(); k != e3; ++k) {
+          ResidueHandle new_res = edi.AppendResidue(water_chain, k->GetName());
+          new_res.SetStringProp("type", StringFromChainType(chain.GetType()));
+          new_res.SetStringProp("description", chain.GetDescription());
+          dst_to_src_map[new_res] = k->GetHandle();
+          needs_adjustment |= copy_atoms(*k, new_res, edi, *i);
+        }
+        continue;
+      }
+      // deal with ligands...
+      if (!ligand_chain) {
+        ligand_chain = edi.InsertChain(LIGAND_CHAIN_NAME);
+        last_rnum = 0;
+      }
+      char ins_code = chain.GetResidueCount()>1 ? 'A' : '\0';
+     
+      for (ResidueViewList::const_iterator k = chain.GetResidueList().begin(),
+           e3 = chain.GetResidueList().end(); k != e3; ++k) {
+        ResidueHandle new_res = edi.AppendResidue(ligand_chain, k->GetName(),
+                                                  ResNum(last_rnum+1, ins_code));
+        new_res.SetStringProp("type", StringFromChainType(chain.GetType()));
+        new_res.SetStringProp("description", chain.GetDescription());
+        new_res.SetStringProp("original_name", chain.GetName());
+        if (chain.HasProp("pdb_auth_chain_name"))
+          new_res.SetStringProp("pdb_auth_chain_name", 
+                                chain.GetStringProp("pdb_auth_chain_name"));
+        dst_to_src_map[new_res] = k->GetHandle();
+        ins_code += 1;
+        needs_adjustment |= copy_atoms(*k, new_res, edi, *i);
+      }
+      last_rnum+=1;
+    }
+  }
+  if (needs_adjustment && shift_to_fit) {
+    geom::Vec3 min_coord = pdb_bu.GetBounds().GetMin();
+    geom::Mat4 tf(1, 0, 0, -999 - min_coord[0],
+                  0, 1, 0, -999 - min_coord[1],
+                  0, 0, 1, -999 - min_coord[2],
+                  0, 0, 0, 1);
+    edi.ApplyTransform(tf);
+  }
+  TransferConnectivity(pdb_bu, dst_to_src_map);
+  return pdb_bu;
+}
+
+}}}
diff --git a/modules/mol/alg/src/pdbize.hh b/modules/mol/alg/src/pdbize.hh
new file mode 100644
index 000000000..d4120de28
--- /dev/null
+++ b/modules/mol/alg/src/pdbize.hh
@@ -0,0 +1,40 @@
+//------------------------------------------------------------------------------
+// This file is part of the OpenStructure project <www.openstructure.org>
+//
+// Copyright (C) 2008-2011 by the OpenStructure authors
+//
+// This library is free software; you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 3.0 of the License, or (at your option)
+// any later version.
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+//------------------------------------------------------------------------------
+#ifndef OST_MOL_ALG_PDBIZE_HH
+#define OST_MOL_ALG_PDBIZE_HH
+
+#include <ost/mol/entity_view.hh>
+#include <ost/mol/entity_handle.hh>
+#include <ost/seq/sequence_list.hh>
+#include "module_config.hh"
+
+namespace ost { namespace mol { namespace alg {
+
+
+extern const char* POLYPEPTIDE_CHAIN_NAMES;
+extern const char* LIGAND_CHAIN_NAME;
+extern const char* WATER_CHAIN_NAME;
+
+mol::EntityHandle DLLEXPORT_OST_MOL_ALG
+PDBize(mol::EntityView asu, const geom::Mat4List& transforms, seq::SequenceList seqres, 
+       int min_polymer_size=10, bool shift_to_fit=true);
+
+}}}
+#endif
+
-- 
GitLab