From f5af65195d27252d0d2a55e60e9914e18e33386a Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Wed, 23 Aug 2023 18:43:19 +0200
Subject: [PATCH] OMF: remove functionality to generate biounits on the fly

The intention of this breaking change is to strictly concentrate
on coordinates. It makes limited sense to provide biounit support
without also providing SEQRES etc. So this should all go to some
external annotation system.
---
 modules/io/pymod/export_omf_io.cc |   4 +-
 modules/io/src/mol/omf.cc         | 215 ------------------------------
 modules/io/src/mol/omf.hh         |  32 ++---
 modules/io/tests/test_io_omf.py   |  67 ++--------
 4 files changed, 24 insertions(+), 294 deletions(-)

diff --git a/modules/io/pymod/export_omf_io.cc b/modules/io/pymod/export_omf_io.cc
index 974211d45..bb3e8169c 100644
--- a/modules/io/pymod/export_omf_io.cc
+++ b/modules/io/pymod/export_omf_io.cc
@@ -66,14 +66,14 @@ void export_omf_io() {
 
   class_<OMF, OMFPtr>("OMF",no_init)
     .def("FromEntity", &OMF::FromEntity, (arg("ent"), arg("options")=0)).staticmethod("FromEntity")
-    .def("FromMMCIF", &OMF::FromMMCIF, (arg("ent"), arg("mmcif_info"), arg("options")=0)).staticmethod("FromMMCIF")
     .def("FromFile", &OMF::FromFile).staticmethod("FromFile")
     .def("FromBytes", &wrap_from_bytes).staticmethod("FromBytes")
     .def("ToFile", &OMF::ToFile)
     .def("ToBytes", &wrap_to_bytes)
     .def("GetAU", &OMF::GetAU)
+    .def("GetEntity", &OMF::GetEntity)
     .def("GetAUChain", &OMF::GetAUChain)
-    .def("GetBU", &OMF::GetBU)
+    .def("GetEntityChain", &OMF::GetEntityChain)
     .def("GetName", &OMF::GetName)
     .def("GetChainNames", &wrap_get_chain_names)
     .def("GetPositions", &OMF::GetPositions, return_value_policy<reference_existing_object>(),(arg("cname")))
diff --git a/modules/io/src/mol/omf.cc b/modules/io/src/mol/omf.cc
index 35ee9ce2d..310857d4d 100644
--- a/modules/io/src/mol/omf.cc
+++ b/modules/io/src/mol/omf.cc
@@ -877,52 +877,6 @@ namespace{
     }
   }
 
-  // generates as many chain names as you want (potentially multiple characters)
-  struct ChainNameGenerator{
-    ChainNameGenerator() { 
-      chain_names = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz";
-      n_chain_names = chain_names.size();
-      indices.push_back(-1);
-    }
-
-    String Get() {
-      int idx = indices.size() - 1;
-      indices[idx] += 1;
-      bool more_digits = false;
-      while(idx >= 0) {
-        if(indices[idx] >= n_chain_names) {
-          indices[idx] = 0;
-          if(idx>0) {
-            indices[idx-1] += 1;
-            --idx;
-          } else {
-            more_digits = true;
-            break;
-          }
-        } else {
-          break;
-        }
-      }
-      if(more_digits) {
-        indices.insert(indices.begin(), 0);
-      }
-      String ch_name(indices.size(), 'X');
-      for(uint i = 0; i < indices.size(); ++i) {
-        ch_name[i] = chain_names[indices[i]];
-      }
-      return ch_name;
-    }
-
-    void Reset() {
-      indices.clear();
-      indices.push_back(-1);
-    }
-
-    String chain_names;
-    int n_chain_names;
-    std::vector<int> indices;
-  };
-
   // delta/runlength encodings/decodings
   void DeltaEncoding(const std::vector<int>& in, std::vector<int>& out) {
     out.clear();
@@ -1299,17 +1253,6 @@ namespace{
     }
   }
 
-  // dump and load vectors with BioUnitDefinition
-  void Load(std::istream& stream,
-            std::vector<ost::io::BioUnitDefinition>& vec) {
-    uint32_t size;
-    stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t));
-    vec.resize(size);
-    for(uint i = 0; i < size; ++i) {
-      vec[i].FromStream(stream);
-    }
-  }
-
   void Dump(std::ostream& stream,
             const std::vector<bool>& vec) {
     uint32_t size = vec.size();
@@ -1337,34 +1280,6 @@ namespace{
     }
   }
 
-  void Dump(std::ostream& stream,
-            const std::vector<ost::io::BioUnitDefinition>& vec) {
-    uint32_t size = vec.size();
-    stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t));
-    for(uint i = 0; i < size; ++i) {
-      vec[i].ToStream(stream);
-    }
-  }
-
-  // dump and load vectors with Mat4
-  void Load(std::istream& stream, std::vector<geom::Mat4>& vec) {
-    uint32_t size;
-    stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t));
-    vec.resize(size);
-    for(uint i = 0; i < size; ++i) {
-      stream.read(reinterpret_cast<char*>(vec[i].Data()),16*sizeof(Real));
-    }
-  }
-
-  void Dump(std::ostream& stream, const std::vector<geom::Mat4>& vec) {
-    uint32_t size = vec.size();
-    stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t));
-    for(uint i = 0; i < size; ++i) {
-      stream.write(reinterpret_cast<const char*>(vec[i].Data()),
-                   16*sizeof(Real));
-    }
-  }
-
   // knowledge based dump and load functions, i.e. awesome compressions
   void LoadIsHetatm(std::istream& stream, std::vector<bool>& vec) {
     std::vector<int> run_length_encoded;
@@ -1845,61 +1760,6 @@ void ResidueDefinition::_AddAtomRule(int a_idx, int anch_one_idx,
   sidechain_atom_rules.push_back(rule);
 }
 
-BioUnitDefinition::BioUnitDefinition(const ost::io::MMCifInfoBioUnit& bu) {
-
-  au_chains = bu.GetChainList();
-
-  const std::vector<std::pair<int, int> >& bu_ch_intvl =
-  bu.GetChainIntervalList();
-  for(auto it = bu_ch_intvl.begin(); it != bu_ch_intvl.end(); ++it) {
-    chain_intvl.push_back(it->first);      
-    chain_intvl.push_back(it->second);      
-  }
-
-  const std::vector<std::vector<MMCifInfoTransOpPtr> >& bu_op_list =
-  bu.GetOperations();
-  for(auto i = bu_op_list.begin(); i != bu_op_list.end(); ++i) {
-    std::vector<geom::Mat4> mat_list;
-    for(auto j = i->begin(); j != i->end(); ++j) {
-      geom::Mat4 m;
-      m.PasteRotation((*j)->GetMatrix());
-      m.PasteTranslation((*j)->GetVector());
-      mat_list.push_back(m);
-    }
-    operations.push_back(mat_list);
-  }
-
-  const std::vector<std::pair<int, int> >& bu_op_intvl =
-  bu.GetOperationsIntervalList();
-  for(auto it = bu_op_intvl.begin(); it != bu_op_intvl.end(); ++it) {
-    op_intvl.push_back(it->first);      
-    op_intvl.push_back(it->second);      
-  }
-}
-
-void BioUnitDefinition::ToStream(std::ostream& stream) const {
-  Dump(stream, au_chains);
-  Dump(stream, chain_intvl);
-  uint32_t size = operations.size();
-  stream.write(reinterpret_cast<char*>(&size), sizeof(uint32_t));
-  for(auto it = operations.begin(); it != operations.end(); ++it) {
-    Dump(stream, *it);
-  }
-  Dump(stream, op_intvl);
-}
-
-void BioUnitDefinition::FromStream(std::istream& stream) {
-  Load(stream, au_chains);
-  Load(stream, chain_intvl);
-  uint32_t size = 0;
-  stream.read(reinterpret_cast<char*>(&size), sizeof(uint32_t));
-  operations.resize(size);
-  for(uint i = 0; i < size; ++i) {
-    Load(stream, operations[i]);
-  }
-  Load(stream, op_intvl);
-}
-
 ChainData::ChainData(const ost::mol::ChainHandle& chain,
                      const std::vector<ResidueDefinition>& residue_definitions,
                      const std::unordered_map<unsigned long, int>& res_idx_map,
@@ -4934,18 +4794,6 @@ OMFPtr OMF::FromEntity(const ost::mol::EntityHandle& ent,
   return omf;
 }
 
-OMFPtr OMF::FromMMCIF(const ost::mol::EntityHandle& ent,
-                      const MMCifInfo& info,
-                      uint8_t options) {
-
-  OMFPtr p = OMF::FromEntity(ent, options);
-  const std::vector<MMCifInfoBioUnit>& biounits = info.GetBioUnits();
-  for(auto it = biounits.begin(); it != biounits.end(); ++it) {
-    p->biounit_definitions_.push_back(BioUnitDefinition(*it));
-  }
-  return p;
-}
-
 OMFPtr OMF::FromFile(const String& fn) {
   std::ifstream in_stream(fn.c_str(), std::ios::binary);
   if (!in_stream) {
@@ -5014,67 +4862,6 @@ ost::mol::EntityHandle OMF::GetAUChain(const String& name) const{
   return ent;
 }
 
-ost::mol::EntityHandle OMF::GetBU(int bu_idx) const{
-  if(bu_idx < 0 || bu_idx >= static_cast<int>(biounit_definitions_.size())) {
-    throw ost::Error("Invalid biounit idx");
-  }
-
-  const BioUnitDefinition& bu = biounit_definitions_[bu_idx];
-  ost::mol::EntityHandle ent = mol::CreateEntity();
-  std::stringstream ss;
-  ss << name_ << " " << bu_idx;
-  ent.SetName(ss.str());
-  ost::mol::XCSEditor ed = ent.EditXCS(mol::BUFFERED_EDIT);
-
-  std::vector<String> au_chain_names;
-  std::vector<geom::Mat4> transforms;
-
-  // The code below is pure magic and heavily inspired by
-  // the biounit buildup in modules/io/pymod/__init__.py
-  int n_intervals = bu.chain_intvl.size() / 2;
-  for(int intvl_idx = 0; intvl_idx < n_intervals; ++intvl_idx) {
-    std::vector<geom::Mat4> rts;
-    int op_start = bu.op_intvl[2*intvl_idx];
-    int op_end = bu.op_intvl[2*intvl_idx+1];
-    int n_intv_ops = op_end - op_start;
-    if(n_intv_ops) {
-      for(auto it = bu.operations[op_start].begin(); 
-          it != bu.operations[op_start].end(); ++it) {
-        rts.push_back(*it);
-      }
-      ++op_start;
-      while(op_start < op_end) {
-        std::vector<geom::Mat4> tmp_rts;
-        for(auto i = bu.operations[op_start].begin(); 
-            i != bu.operations[op_start].end(); ++i) {
-          for(auto j = rts.begin(); j != rts.end(); ++j) {
-            tmp_rts.push_back((*j)*(*i));
-          }
-        }
-        rts = tmp_rts;
-        ++op_start;
-      }
-    }
-    for(int ch_idx = bu.chain_intvl[2*intvl_idx]; 
-        ch_idx < bu.chain_intvl[2*intvl_idx+1]; ++ch_idx) {
-      for(auto it = rts.begin(); it != rts.end(); ++it) {
-        au_chain_names.push_back(bu.au_chains[ch_idx]);
-        transforms.push_back(*it);
-      }
-    }
-  }
-
-  ChainNameGenerator gen;
-  for(uint bu_ch_idx = 0; bu_ch_idx < au_chain_names.size(); ++bu_ch_idx) {
-    String bu_ch_name = gen.Get();
-    ost::mol::ChainHandle added_chain = ed.InsertChain(bu_ch_name);
-    this->FillChain(added_chain, ed, chain_data_.at(au_chain_names[bu_ch_idx]),
-                    transforms[bu_ch_idx]);
-  }
-
-  return ent;
-}
-
 void OMF::ToStream(std::ostream& stream) const {
 
   uint32_t magic_number = 42;
@@ -5099,7 +4886,6 @@ void OMF::ToStream(std::ostream& stream) const {
     Dump(stream, residue_definitions_);
   }
 
-  Dump(stream, biounit_definitions_);
   Dump(stream, chain_data_, residue_definitions_, OptionSet(LOSSY),
        OptionSet(AVG_BFACTORS), OptionSet(ROUND_BFACTORS), OptionSet(SKIP_SS),
        OptionSet(INFER_POS));
@@ -5143,7 +4929,6 @@ void OMF::FromStream(std::istream& stream) {
     Load(stream, residue_definitions_);
   }
 
-  Load(stream, biounit_definitions_);
   Load(stream, chain_data_, residue_definitions_, version_, OptionSet(LOSSY),
        OptionSet(AVG_BFACTORS), OptionSet(ROUND_BFACTORS), OptionSet(SKIP_SS),
        OptionSet(INFER_POS));
diff --git a/modules/io/src/mol/omf.hh b/modules/io/src/mol/omf.hh
index c912bcec3..aa5eb6c2d 100644
--- a/modules/io/src/mol/omf.hh
+++ b/modules/io/src/mol/omf.hh
@@ -33,11 +33,9 @@ namespace ost { namespace io {
 const int OMF_VERSION = 2;
 
 class ChainData;
-class BioUnitData;
 class OMF;
 typedef boost::shared_ptr<OMF> OMFPtr;
 typedef boost::shared_ptr<ChainData> ChainDataPtr;
-typedef boost::shared_ptr<BioUnitData> BioUnitDataPtr;
 
 struct SidechainAtomRule {
   int sidechain_atom_idx;
@@ -120,23 +118,6 @@ struct ResidueDefinition {
   std::vector<SidechainAtomRule> sidechain_atom_rules;
 };
 
-
-struct BioUnitDefinition {
-  BioUnitDefinition() { }
-
-  BioUnitDefinition(const ost::io::MMCifInfoBioUnit& bu);
-
-  void ToStream(std::ostream& stream) const;
-
-  void FromStream(std::istream& stream);
-
-  std::vector<String> au_chains;
-  std::vector<int> chain_intvl;
-  std::vector<std::vector<geom::Mat4> > operations;
-  std::vector<int> op_intvl;
-};
-
-
 struct ChainData {
 
   ChainData(): ch_name(""), chain_type(ost::mol::CHAINTYPE_UNKNOWN) { }
@@ -210,10 +191,6 @@ public:
   static OMFPtr FromEntity(const ost::mol::EntityHandle& ent,
                            uint8_t options = 0);
 
-  static OMFPtr FromMMCIF(const ost::mol::EntityHandle& ent,
-                          const MMCifInfo& info,
-                          uint8_t options = 0);
-
   static OMFPtr FromFile(const String& fn);
 
   static OMFPtr FromString(const String& s);
@@ -224,8 +201,16 @@ public:
 
   ost::mol::EntityHandle GetAU() const;
 
+  ost::mol::EntityHandle GetEntity() const {
+    return this->GetAU();
+  }
+
   ost::mol::EntityHandle GetAUChain(const String& name) const;
 
+  ost::mol::EntityHandle GetEntityChain(const String& name) const {
+    return this->GetAUChain(name);
+  }
+
   ost::mol::EntityHandle GetBU(int bu_idx) const;
 
   int GetVersion() const { return version_; }
@@ -261,7 +246,6 @@ private:
 
   String name_;
   std::vector<ResidueDefinition> residue_definitions_;
-  std::vector<BioUnitDefinition> biounit_definitions_;
   std::map<String, ChainDataPtr> chain_data_;
 
   // bond features - only for bonds that are inter-chain
diff --git a/modules/io/tests/test_io_omf.py b/modules/io/tests/test_io_omf.py
index 8d519c508..81c19f2e8 100644
--- a/modules/io/tests/test_io_omf.py
+++ b/modules/io/tests/test_io_omf.py
@@ -118,17 +118,16 @@ class TestOMF(unittest.TestCase):
         self.ent.SetName("This is a name 123")
 
     def test_AU(self):
-        omf = io.OMF.FromMMCIF(self.ent, self.info)
+        omf = io.OMF.FromEntity(self.ent)
         omf_bytes = omf.ToBytes()
         loaded_omf = io.OMF.FromBytes(omf_bytes)
         loaded_ent = loaded_omf.GetAU()
         self.assertTrue(compare_ent(self.ent, loaded_ent))
 
     def test_default_peplib(self):
-        omf = io.OMF.FromMMCIF(self.ent, self.info)
+        omf = io.OMF.FromEntity(self.ent)
         omf_bytes = omf.ToBytes()
-        omf_def_pep = io.OMF.FromMMCIF(self.ent, self.info,
-                                       io.OMFOption.DEFAULT_PEPLIB)
+        omf_def_pep = io.OMF.FromEntity(self.ent, io.OMFOption.DEFAULT_PEPLIB)
         omf_def_pep_bytes = omf_def_pep.ToBytes()
         loaded_omf_def_pep = io.OMF.FromBytes(omf_def_pep_bytes)
         loaded_ent = loaded_omf_def_pep.GetAU()
@@ -137,10 +136,9 @@ class TestOMF(unittest.TestCase):
         self.assertTrue(compare_ent(self.ent, loaded_ent))
 
     def test_lossy(self):
-        omf = io.OMF.FromMMCIF(self.ent, self.info)
+        omf = io.OMF.FromEntity(self.ent)
         omf_bytes = omf.ToBytes()
-        omf_lossy = io.OMF.FromMMCIF(self.ent, self.info,
-                                     io.OMFOption.LOSSY)
+        omf_lossy = io.OMF.FromEntity(self.ent, io.OMFOption.LOSSY)
         omf_lossy_bytes = omf_lossy.ToBytes()
         loaded_omf_lossy = io.OMF.FromBytes(omf_lossy_bytes)
         loaded_ent = loaded_omf_lossy.GetAU()
@@ -152,10 +150,9 @@ class TestOMF(unittest.TestCase):
                                     at_dist_thresh=max_dist))
 
     def test_avg_bfactors(self):
-        omf = io.OMF.FromMMCIF(self.ent, self.info)
+        omf = io.OMF.FromEntity(self.ent)
         omf_bytes = omf.ToBytes()
-        omf_avg_bfac = io.OMF.FromMMCIF(self.ent, self.info,
-                                        io.OMFOption.AVG_BFACTORS)
+        omf_avg_bfac = io.OMF.FromEntity(self.ent, io.OMFOption.AVG_BFACTORS)
         omf_avg_bfac_bytes = omf_avg_bfac.ToBytes()
         loaded_omf_avg_bfac = io.OMF.FromBytes(omf_avg_bfac_bytes)
         loaded_ent = loaded_omf_avg_bfac.GetAU()
@@ -174,10 +171,9 @@ class TestOMF(unittest.TestCase):
                 self.assertTrue(abs(a.b_factor - exp_bfac) < 0.008)
 
     def test_round_bfactors(self):
-        omf = io.OMF.FromMMCIF(self.ent, self.info)
+        omf = io.OMF.FromEntity(self.ent)
         omf_bytes = omf.ToBytes()
-        omf_round_bfac = io.OMF.FromMMCIF(self.ent, self.info,
-                                        io.OMFOption.ROUND_BFACTORS)
+        omf_round_bfac = io.OMF.FromEntity(self.ent, io.OMFOption.ROUND_BFACTORS)
         omf_round_bfac_bytes = omf_round_bfac.ToBytes()
         loaded_omf_round_bfac = io.OMF.FromBytes(omf_round_bfac_bytes)
         loaded_ent = loaded_omf_round_bfac.GetAU()
@@ -188,10 +184,9 @@ class TestOMF(unittest.TestCase):
                                     at_bfactor_thresh=0.5))
 
     def test_skip_ss(self):
-        omf = io.OMF.FromMMCIF(self.ent, self.info)
+        omf = io.OMF.FromEntity(self.ent)
         omf_bytes = omf.ToBytes()
-        omf_skip_ss = io.OMF.FromMMCIF(self.ent, self.info,
-                                          io.OMFOption.SKIP_SS)
+        omf_skip_ss = io.OMF.FromEntity(self.ent, io.OMFOption.SKIP_SS)
         omf_skip_ss_bytes = omf_skip_ss.ToBytes()
         loaded_omf_skip_ss = io.OMF.FromBytes(omf_skip_ss_bytes)
         loaded_ent = loaded_omf_skip_ss.GetAU()
@@ -201,10 +196,10 @@ class TestOMF(unittest.TestCase):
         self.assertTrue(compare_ent(self.ent, loaded_ent, skip_ss=True))
 
     def test_infer_pep_bonds(self):
-        omf = io.OMF.FromMMCIF(self.ent, self.info)
+        omf = io.OMF.FromEntity(self.ent)
         omf_bytes = omf.ToBytes()
-        omf_infer_pep_bonds = io.OMF.FromMMCIF(self.ent, self.info,
-                                               io.OMFOption.INFER_PEP_BONDS)
+        omf_infer_pep_bonds = io.OMF.FromEntity(self.ent,
+                                                io.OMFOption.INFER_PEP_BONDS)
         omf_infer_pep_bonds_bytes = omf_infer_pep_bonds.ToBytes()
         loaded_omf_infer_pep_bonds = io.OMF.FromBytes(omf_infer_pep_bonds_bytes)
         loaded_ent = loaded_omf_infer_pep_bonds.GetAU()
@@ -212,40 +207,6 @@ class TestOMF(unittest.TestCase):
         self.assertTrue(len(omf_infer_pep_bonds_bytes) < len(omf_bytes))
         self.assertTrue(compare_ent(self.ent, loaded_ent))
 
-    def test_multiple_BU(self):
-        ent, seqres, info = io.LoadMMCIF("testfiles/mmcif/3imj.cif.gz", 
-                                         seqres=True,
-                                         info=True)
-
-        omf = io.OMF.FromMMCIF(ent, info)
-        omf_bytes = omf.ToBytes()
-        omf_loaded = io.OMF.FromBytes(omf_bytes)
-
-        # there are quite some discrepancies between PDBize and OMF
-        # - chain names: PDBize has specific chain names for ligands and
-        #                water etc. OMF just iterates A, B, C, D, ...
-        # - skip_bonds: Thats qualified atom name based. PDBize used rnums
-        #               and insertion codes for waters...
-        # - skip_rnums: Again, insertion codes for waters...
-        self.assertTrue(compare_ent(info.GetBioUnits()[0].PDBize(ent),
-                                    omf_loaded.GetBU(0),
-                                    skip_cnames=True, skip_bonds=True,
-                                    skip_rnums=True, bu_idx = 0))
-
-        self.assertTrue(compare_ent(info.GetBioUnits()[1].PDBize(ent),
-                                    omf_loaded.GetBU(1),
-                                    skip_cnames=True, skip_bonds=True,
-                                    skip_rnums=True, bu_idx = 1))
-
-        # no check for the full guy... problem: PDBize throws all water
-        # molecules in the same chain, whereas OMF keeps them separate
-        # as in the chains from the assymetric unit... maybe needs some
-        # thinking on how to resolve discrepancies between PDBize and OMF
-        #self.assertTrue(compare_ent(omf_loaded.GetBU(2),
-        #                            info.GetBioUnits()[2].PDBize(ent),
-        #                            skip_cnames=True, skip_bonds=True,
-        #                            skip_rnums=True))
-
 if __name__== '__main__':
     from ost import testutils
     if testutils.DefaultCompoundLibIsSet():
-- 
GitLab