From 425e394de288477592eea0b01fc2b86d842f60de Mon Sep 17 00:00:00 2001
From: Xavier Robin <xavier.robin@unibas.ch>
Date: Fri, 17 Mar 2023 17:20:25 +0100
Subject: [PATCH] feat: SCHWED-5481 read gzipped SDF

---
 modules/io/doc/structure_formats.rst              |   2 +-
 modules/io/src/mol/entity_io_sdf_handler.cc       |   2 +-
 modules/io/src/mol/sdf_reader.cc                  |  10 ++++++++--
 modules/io/src/mol/sdf_reader.hh                  |   2 ++
 modules/io/tests/CMakeLists.txt                   |   1 +
 .../io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz | Bin 0 -> 505 bytes
 6 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 modules/io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz

diff --git a/modules/io/doc/structure_formats.rst b/modules/io/doc/structure_formats.rst
index a80133842..42aca8826 100644
--- a/modules/io/doc/structure_formats.rst
+++ b/modules/io/doc/structure_formats.rst
@@ -54,5 +54,5 @@ SDF - Structured Data File
 Chemical-data file format.
 
 *Recognized File Extensions*
-  .sdf
+  .sdf, .sdf.gz
   
diff --git a/modules/io/src/mol/entity_io_sdf_handler.cc b/modules/io/src/mol/entity_io_sdf_handler.cc
index a9dced835..a843d986c 100644
--- a/modules/io/src/mol/entity_io_sdf_handler.cc
+++ b/modules/io/src/mol/entity_io_sdf_handler.cc
@@ -69,7 +69,7 @@ bool sdf_handler_is_responsible_for(const boost::filesystem::path& loc,
   if(type=="auto") {
 	String match_suf_string=loc.string();
     std::transform(match_suf_string.begin(),match_suf_string.end(),match_suf_string.begin(),tolower);
-    if(detail::FilenameEndsWith(match_suf_string,".sdf")) {
+    if(detail::FilenameEndsWith(match_suf_string,".sdf") || detail::FilenameEndsWith(match_suf_string,".sdf.gz")) {
       return true;
     }
 
diff --git a/modules/io/src/mol/sdf_reader.cc b/modules/io/src/mol/sdf_reader.cc
index 064c14f29..7f3a7f560 100644
--- a/modules/io/src/mol/sdf_reader.cc
+++ b/modules/io/src/mol/sdf_reader.cc
@@ -21,7 +21,9 @@
  */
 
 #include <boost/algorithm/string.hpp>
+#include <boost/filesystem/convenience.hpp>
 #include <boost/format.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
 #include <boost/lexical_cast.hpp>
 #include <ost/mol/bond_handle.hh>
 #include <ost/conop/conop.hh>
@@ -58,7 +60,7 @@ void SDFReader::Import(mol::EntityHandle& ent)
 {
   String line;
   mol::XCSEditor editor=ent.EditXCS(mol::BUFFERED_EDIT);
-  while (std::getline(instream_,line)) {
+  while (std::getline(in_,line)) {
     ++line_num;
 
     // std::getline removes EOL character but may leave a DOS CR (\r) in Unix
@@ -87,7 +89,7 @@ void SDFReader::Import(mol::EntityHandle& ent)
         throw IOException(str(format(msg) % line_num));
       }
       String data_value="";
-      while(std::getline(instream_,line) && !boost::iequals(line, "")) {
+      while(std::getline(in_,line) && !boost::iequals(line, "")) {
         data_value.append(line);
       }
       curr_chain_.SetStringProp(data_header, data_value);
@@ -103,6 +105,10 @@ void SDFReader::Import(mol::EntityHandle& ent)
 
 void SDFReader::ClearState(const boost::filesystem::path& loc)
 {
+  if (boost::iequals(".gz", loc.extension().string())) {  // case-insensitive test of the final extension
+    in_.push(boost::iostreams::gzip_decompressor());  // filter first: inflate gzipped input on the fly
+  }
+  in_.push(instream_);  // the underlying stream is pushed last, as the chain's source device
   if(!infile_) throw IOException("could not open "+loc.string());
   curr_chain_=mol::ChainHandle();
   curr_residue_=mol::ResidueHandle();
diff --git a/modules/io/src/mol/sdf_reader.hh b/modules/io/src/mol/sdf_reader.hh
index e7a478b7a..04d05a2d6 100644
--- a/modules/io/src/mol/sdf_reader.hh
+++ b/modules/io/src/mol/sdf_reader.hh
@@ -22,6 +22,7 @@
 #ifndef OST_IO_SDF_READER_HH
 #define OST_IO_SDF_READER_HH
 
+#include <boost/iostreams/filtering_stream.hpp>
 #include <boost/filesystem/fstream.hpp>
 #include <ost/mol/chain_handle.hh>
 #include <ost/mol/residue_handle.hh>
@@ -61,6 +62,7 @@ private:
   int line_num;
   boost::filesystem::ifstream infile_;
   std::istream& instream_;
+  boost::iostreams::filtering_stream<boost::iostreams::input>  in_;
 };
 
 }}
diff --git a/modules/io/tests/CMakeLists.txt b/modules/io/tests/CMakeLists.txt
index 0af1dfd5b..fbfef7413 100644
--- a/modules/io/tests/CMakeLists.txt
+++ b/modules/io/tests/CMakeLists.txt
@@ -2,6 +2,7 @@ set(OST_IO_UNIT_TESTS
   test_io_pdb.py
   test_io_mmcif.py
   test_io_omf.py
+  test_io_sdf.py
   test_clustal.cc
   test_io_pdb.cc
   test_io_crd.cc
diff --git a/modules/io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz b/modules/io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..658c3b9f089ef67229a54e18c766302083b22b16
GIT binary patch
literal 505
zcmV<V0S5jbiwFp2Bokx+12$wecVBX0Zfh}LV{&X}E^}mN0F_lct{X87><#!1DpC{t
zQ>@=hnjo)8kvpW%{Sj&<TMm#Q&|+Z>L2Eo5k@Wr6#`FFC>(5{9`N;eIcE2^I#@ZVG
z_07!ogAwsQD1<=RY#@O^&O(&n6w$u4e{sCN51Qyn12I@yB^o2Js<yWS8it_2N!XEj
zsbQoe8UYIH10i_Fhni41Rl~VxgwY2Lff=0@CBU4+Voo5UQ4}+fRiP&|3fi3>G?J61
zQ@QkUMAIqyq>=2G8U@mRW^o<C&`Fc&GLV(+<cBnY?$csYy)1T6MmwUh(!es+1PV<W
zSK%X?I+D3)XzHq>oavN?c#c4aG?Qtu<YD`HO_JNBk*-%!)!Alwbm385%-7)nqx97`
zUNk3a0%{6v8Y{s%X^{PF7<^3gy5?F%eU4@rpLt`TqB)BS_BER1bmomg)KE=}W38<!
zs{5-q^k15>tgibhMB>XEQtU@X)h?E61J~v$Uvcdlq?4N}89X(QsGwY*S5#1UuAuE)
zLEpK8aYa6&f_>)-=02B=@R=(}^vo5ct)(DsEd^<7sm5B-_`TX8qthy)6~+E$H0nE-
vkH}{YytNeMt%d*arOve%PI;|wa6XlNYwhz#{i}Ye=YIPQ>#-PNb_W0eRoVB(

literal 0
HcmV?d00001

-- 
GitLab