From 425e394de288477592eea0b01fc2b86d842f60de Mon Sep 17 00:00:00 2001
From: Xavier Robin <xavier.robin@unibas.ch>
Date: Fri, 17 Mar 2023 17:20:25 +0100
Subject: [PATCH] feat: SCHWED-5481 read gzipped SDF

---
 modules/io/doc/structure_formats.rst              |   2 +-
 modules/io/src/mol/entity_io_sdf_handler.cc       |   2 +-
 modules/io/src/mol/sdf_reader.cc                  |  10 ++++++++--
 modules/io/src/mol/sdf_reader.hh                  |   2 ++
 modules/io/tests/CMakeLists.txt                   |   1 +
 .../io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz | Bin 0 -> 505 bytes
 6 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 modules/io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz

diff --git a/modules/io/doc/structure_formats.rst b/modules/io/doc/structure_formats.rst
index a80133842..42aca8826 100644
--- a/modules/io/doc/structure_formats.rst
+++ b/modules/io/doc/structure_formats.rst
@@ -54,5 +54,5 @@ SDF - Structured Data File
 Chemical-data file format.
 
 *Recognized File Extensions*
-  .sdf
+  .sdf, .sdf.gz
   
diff --git a/modules/io/src/mol/entity_io_sdf_handler.cc b/modules/io/src/mol/entity_io_sdf_handler.cc
index a9dced835..a843d986c 100644
--- a/modules/io/src/mol/entity_io_sdf_handler.cc
+++ b/modules/io/src/mol/entity_io_sdf_handler.cc
@@ -69,7 +69,7 @@ bool sdf_handler_is_responsible_for(const boost::filesystem::path& loc,
   if(type=="auto") {
 	String match_suf_string=loc.string();
     std::transform(match_suf_string.begin(),match_suf_string.end(),match_suf_string.begin(),tolower);
-    if(detail::FilenameEndsWith(match_suf_string,".sdf")) {
+    if(detail::FilenameEndsWith(match_suf_string,".sdf") || detail::FilenameEndsWith(match_suf_string,".sdf.gz")) {
       return true;
     }
 
diff --git a/modules/io/src/mol/sdf_reader.cc b/modules/io/src/mol/sdf_reader.cc
index 064c14f29..7f3a7f560 100644
--- a/modules/io/src/mol/sdf_reader.cc
+++ b/modules/io/src/mol/sdf_reader.cc
@@ -21,7 +21,9 @@
  */
 
 #include <boost/algorithm/string.hpp>
+#include <boost/filesystem/convenience.hpp>
 #include <boost/format.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
 #include <boost/lexical_cast.hpp>
 #include <ost/mol/bond_handle.hh>
 #include <ost/conop/conop.hh>
@@ -58,7 +60,7 @@ void SDFReader::Import(mol::EntityHandle& ent)
 {
   String line;
   mol::XCSEditor editor=ent.EditXCS(mol::BUFFERED_EDIT);
-  while (std::getline(instream_,line)) {
+  while (std::getline(in_,line)) {
     ++line_num;
 
     // std::getline removes EOL character but may leave a DOS CR (\r) in Unix
@@ -87,7 +89,7 @@ void SDFReader::Import(mol::EntityHandle& ent)
         throw IOException(str(format(msg) % line_num));
       }
       String data_value="";
-      while(std::getline(instream_,line) && !boost::iequals(line, "")) {
+      while(std::getline(in_,line) && !boost::iequals(line, "")) {
         data_value.append(line);
       }
       curr_chain_.SetStringProp(data_header, data_value);
@@ -103,6 +105,10 @@ void SDFReader::Import(mol::EntityHandle& ent)
 
 void SDFReader::ClearState(const boost::filesystem::path& loc)
 {
+  if (boost::iequals(".gz", boost::filesystem::extension(loc))) {
+    in_.push(boost::iostreams::gzip_decompressor());
+  }
+  in_.push(instream_);
   if(!infile_) throw IOException("could not open "+loc.string());
   curr_chain_=mol::ChainHandle();
   curr_residue_=mol::ResidueHandle();
diff --git a/modules/io/src/mol/sdf_reader.hh b/modules/io/src/mol/sdf_reader.hh
index e7a478b7a..04d05a2d6 100644
--- a/modules/io/src/mol/sdf_reader.hh
+++ b/modules/io/src/mol/sdf_reader.hh
@@ -22,6 +22,7 @@
 #ifndef OST_IO_SDF_READER_HH
 #define OST_IO_SDF_READER_HH
 
+#include <boost/iostreams/filtering_stream.hpp>
 #include <boost/filesystem/fstream.hpp>
 #include <ost/mol/chain_handle.hh>
 #include <ost/mol/residue_handle.hh>
@@ -61,6 +62,7 @@ private:
   int line_num;
   boost::filesystem::ifstream infile_;
   std::istream& instream_;
+  boost::iostreams::filtering_stream<boost::iostreams::input>  in_;
 };
 
 }}
diff --git a/modules/io/tests/CMakeLists.txt b/modules/io/tests/CMakeLists.txt
index 0af1dfd5b..fbfef7413 100644
--- a/modules/io/tests/CMakeLists.txt
+++ b/modules/io/tests/CMakeLists.txt
@@ -2,6 +2,7 @@ set(OST_IO_UNIT_TESTS
   test_io_pdb.py
   test_io_mmcif.py
   test_io_omf.py
+  test_io_sdf.py
   test_clustal.cc
   test_io_pdb.cc
   test_io_crd.cc
diff --git a/modules/io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz b/modules/io/tests/testfiles/sdf/6d5w_rank1_crlf.sdf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..658c3b9f089ef67229a54e18c766302083b22b16
GIT binary patch
literal 505
zcmb2|=HN(E5l&%XHcK%rk1tBh%QlQpF3L&MD^5vcm>O!mPTG*?4et*=tsvn)!E5e&
z&r(?LG0}R-`M-i$s?mHC6fPts@;GMO$V~k5_u8?~fB(KdAOH5}iQj+A_M0v{c1!Bt
z*RyB7H4A>U(`a$Hk)zP)aL#GU1CfjSw$vBy*~dTYg(tINM3kzO@`hF0wlhn$EjZwL
zV`AmX#3jm7j9jmo6)TRn&kC>%J-joc<sv(8qvTvs)dO=5C;2HX3KTJ(5V}y^gzMH^
zIg?3~mj&<qQs{D7^VgDzZ>CGJUaCtgls$OCbMi~02~*!bX*X$ncR49&uTEKjoAsiN
zmkt<RW#RIb3V*0P%WBfjjEq;S7R`A%rLCgRq1|M1=Ef(9zbkzw-Ci<rU3lQtn`xCL
z4+CS*Tz|;0_{&xEI1}|OrYx>pDeZ$+nG^ozNYwaz-t{cXwYD!!Vt!@vf<>0yT;EJ*
zJt=wC>v$!=w>x?5TCG*TSDSx{pS5w#y5E{Ek4{Smy>WNFsy8j0`T8@>_`<i+OD50O
zlC&}HU$G!&et1Yg`ML#fV;t}AY$|lMThvthoa>q0^xh9MwLQL^5nZx&sY0$UYi9P!
zV^J4>?7gM9__Ed_(WC#;OkP<}?>~{wyJxG&leMk?KP)}BRot&~ZGM5>)Y|N~pWXki
Tu3PoI?l<r2jS`7v><kP5RoVB(

literal 0
HcmV?d00001

-- 
GitLab