From 8ecb86356ca6ce9562e12bafcc87a7e4d031799c Mon Sep 17 00:00:00 2001
From: Marco Biasini <marco.biasini@unibas.ch>
Date: Thu, 8 Dec 2011 22:28:33 +0100
Subject: [PATCH] added two more input/output formats to the table class

1. We now read traditional CSV files. Since the files do not explicitly
   specify the column format, we have to guess the format of each column
   after importing.
2. For efficient serialization, table data can be pickled/unpickled
   directly.
---
 .gitignore                       |   3 +
 modules/base/pymod/table.py      | 170 +++++++++++++++++++++++++++----
 modules/base/tests/test_table.py |  57 +++++++++--
 3 files changed, 206 insertions(+), 24 deletions(-)

diff --git a/.gitignore b/.gitignore
index fd4689e6c..99daac5ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ pov_*test.inc
 *-out.pdb
 *-out.sdf
 *-out.crd
+*-out.pqr
 *-out.png
 CMakeLists.txt.user
 OpenStructure.cbp
@@ -54,4 +55,6 @@ Debug
 /deployment/win/create_archive.bat
 /install_manifest.txt
 *_out.csv
+*_out.tab
+*_out.pickle
 /modules/io/tests/temp_img.tmp
diff --git a/modules/base/pymod/table.py b/modules/base/pymod/table.py
index 8258ad316..11a9dd8ea 100644
--- a/modules/base/pymod/table.py
+++ b/modules/base/pymod/table.py
@@ -3,6 +3,7 @@ import re
 from ost import stutil
 import itertools
 import operator
+import cPickle
 from ost import LogError, LogWarning, LogInfo, LogVerbose
 
 def MakeTitle(col_name):
@@ -16,7 +17,11 @@ def IsStringLike(value):
     return True
   except:
     return False
-  
+
+def IsNullString(value):
+  # true for the case-insensitive markers of a missing value (or blank text)
+  return value.strip().upper() in ('', 'NULL', 'NONE', 'NA')
+
 def IsScalar(value):
   if IsStringLike(value):
     return True
@@ -28,6 +33,38 @@ def IsScalar(value):
   except:
     return True
 
+def GuessColumnType(iterator):
+  """
+  Guess the type shared by all values of iterator: 'bool', 'int', 'float' or
+  'string'. Null-like values (see IsNullString) are ignored; 'string' is
+  returned if nothing else is left or if no other type fits every value.
+  """
+  bool_names=('YES', 'NO', 'TRUE', 'FALSE')
+  candidates=set(['bool', 'int', 'float'])
+  seen_value=False
+  for ele in iterator:
+    text=str(ele).upper()
+    if IsNullString(text):
+      continue
+    seen_value=True
+    # drop every candidate type that cannot represent this value
+    for name, convert in (('int', int), ('float', float)):
+      if name in candidates:
+        try:
+          convert(text)
+        except ValueError:
+          candidates.discard(name)
+    if text not in bool_names:
+      candidates.discard('bool')
+    if not candidates:
+      return 'string'
+  if not seen_value:
+    return 'string'
+  # two candidates left means int and float both fit: prefer int
+  if len(candidates)==2:
+    return 'int'
+  return candidates.pop()
+
 class BinaryColExpr:
   def __init__(self, op, lhs, rhs):
     self.op=op
@@ -85,7 +122,7 @@ class TableCol:
     return BinaryColExpr(operator.div, self, rhs)
 
 
-class Table:
+class Table(object):
   """
   
   The table class provides convenient access to data in tabular form. An empty 
@@ -448,21 +485,7 @@ class Table:
     return filt_tab
 
   @staticmethod
-  def Load(stream_or_filename):
-    """
-    Load table from stream or file with given name. The file must contain a 
-    header line of the form
-    
-      col_name1[type1] <col_name2[type2]>...
-    
-    The types given in brackets must be one of the data types the :class:`Table` 
-    class understands. Each following line in the file then must contains exactly 
-    the same number of data items as listed in the header. The data items are 
-    automatically converted to the column format. Lines starting with a '#' and 
-    empty lines are ignored.
-    
-    :returns: A new :class:`Table` instance
-    """
+  def _LoadOST(stream_or_filename):
     fieldname_pattern=re.compile(r'(?P<name>[A-Za-z0-9_]+)(\[(?P<type>\w+)\])?')
     if not hasattr(stream_or_filename, 'read'):
       stream=open(stream_or_filename, 'r')
@@ -496,6 +519,89 @@ class Table:
       raise IOError("Cannot read table from empty stream")
     return tab
 
+  def _GuessColumnTypes(self):
+    # re-derive each column type from its values, then coerce every cell
+    for col_idx, col_name in enumerate(self.col_names):
+      self.col_types[col_idx]=GuessColumnType(self[col_name])
+    for row in self.rows:
+      row[:]=[self._Coerce(c, self.col_types[i]) for i, c in enumerate(row)]
+  @staticmethod
+  def _LoadCSV(stream_or_filename, sep):
+    # first CSV row is the header; column types are guessed after loading
+    owns_stream=not hasattr(stream_or_filename, 'read')
+    stream=stream_or_filename
+    if owns_stream:
+      stream=open(stream_or_filename, 'r')
+    try:
+      tab=None
+      for row in csv.reader(stream, delimiter=sep):
+        if tab is None:
+          tab=Table(row, 's'*len(row))
+        else:
+          tab.AddRow(row)
+    finally:
+      if owns_stream:  # streams handed in by the caller stay open
+        stream.close()
+    if tab is None:
+      raise IOError('trying to load table from empty CSV stream/file')
+    tab._GuessColumnTypes()
+    return tab
+
+  @staticmethod
+  def _LoadPickle(stream_or_filename):
+    # NOTE(review): unpickling can execute arbitrary code; trusted input only
+    stream=stream_or_filename
+    if not hasattr(stream, 'read'):
+      stream=open(stream_or_filename, 'rb')
+    return cPickle.load(stream)
+
+  @staticmethod
+  def Load(stream_or_filename, format='ost', sep=','):
+    """
+    Load table from stream or file with given name.
+
+    The following file formats are understood:
+
+     ost
+
+      This is an ost-specific, but still human readable file format. The file
+      (stream) must start with a header line of the form
+
+        col_name1[type1] <col_name2[type2]>...
+
+      The types given in brackets must be one of the data types the
+      :class:`Table` class understands. Each following line in the file then must
+      contain exactly the same number of data items as listed in the header. The
+      data items are automatically converted to the column format. Lines starting
+      with a '#' and empty lines are ignored.
+
+    pickle
+
+      Deserializes the table from a pickled byte stream (trusted sources only).
+
+    csv
+
+      Reads the table from a stream of values separated by sep. Since there is no
+      explicit type information in the csv file, the column types are guessed,
+      using the following simple rules:
+
+      * if all values are null (empty/NA/NULL/NONE), the type is set to string
+      * if all non-null values are convertible to int/float, that type is used
+      * if all non-null values are true/false/yes/no, the type is set to bool
+      * for all other cases, the column type is set to string
+
+    :returns: A new :class:`Table` instance
+    :raises ValueError: if format is not one of 'ost', 'csv' or 'pickle'
+    """
+    format=format.lower()
+    if format=='ost':
+      return Table._LoadOST(stream_or_filename)
+    if format=='csv':
+      return Table._LoadCSV(stream_or_filename, sep=sep)
+    if format=='pickle':
+      return Table._LoadPickle(stream_or_filename)
+    raise ValueError('unknown format "%s"' % format)
+
   def Sort(self, by, order='+'):
     """
     Performs an in-place sort of the table, based on column.
@@ -938,10 +1044,38 @@ class Table:
       raise
     
 
-  def Save(self, stream):
+  def Save(self, stream, format='ost', sep=','):
     """
     Save the table to stream or filename
     """
+    fmt=format.lower()
+    if fmt=='csv':
+      return self._SaveCSV(stream, sep=sep)
+    if fmt=='pickle':
+      return self._SavePickle(stream)
+    if fmt=='ost':
+      return self._SaveOST(stream)
+    raise ValueError('unknown format "%s"' % fmt)
+
+  def _SavePickle(self, stream):  # pickle this table to a stream or filename
+    if not hasattr(stream, 'write'):
+      stream=open(stream, 'wb')  # no write(): treat the argument as a filename
+    cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
+
+  def _SaveCSV(self, stream, sep):
+    if not hasattr(stream, 'write'):
+      stream=open(stream, 'wb')
+    # header row first, then the data rows with None cells written as 'NA'
+    writer=csv.writer(stream, delimiter=sep)
+    writer.writerow(['%s' % n for n in self.col_names])
+    for row in self.rows:
+      row=list(row)
+      for i, c in enumerate(row):
+        if c is None:
+          row[i]='NA'
+      writer.writerow(row)
+
+  def _SaveOST(self, stream):
     if hasattr(stream, 'write'):
       writer=csv.writer(stream, delimiter=' ')
     else:
diff --git a/modules/base/tests/test_table.py b/modules/base/tests/test_table.py
index e2a3060c6..42276a657 100644
--- a/modules/base/tests/test_table.py
+++ b/modules/base/tests/test_table.py
@@ -190,6 +190,14 @@ class TestTable(unittest.TestCase):
     self.CompareRowCount(tab, 0)
     self.assertRaises(ValueError, tab.GetColIndex, 'a')
     
+  def testGuessColumnType(self):
+    self.assertEqual(GuessColumnType(['1', '1.3', '2']), 'float')
+    self.assertEqual(GuessColumnType(['1', '1', '2']), 'int')
+    self.assertEqual(GuessColumnType(['NONE', '1', '1', '2']), 'int')
+    self.assertEqual(GuessColumnType(['NA', 'yes', 'No']), 'bool')
+    self.assertEqual(GuessColumnType(['NONE', '1', '1', 'a']), 'string')
+    self.assertEqual(GuessColumnType(['NONE', 'TRUE', 'False']), 'bool')
+    self.assertEqual(GuessColumnType(['NONE']), 'string')
   def testTableInitSingleColEmpty(self):
     '''
     empty table with one float column:
@@ -705,21 +713,21 @@ class TestTable(unittest.TestCase):
     tab.Sort('third', '+')
     self.CompareDataFromDict(tab, {'first': [None,'foo','x'], 'second': [9,None,3], 'third': [3.3,2.2,None]})
 
-  def testSaveLoadTable(self):
+  def testSaveLoadTableOST(self):
     tab = self.CreateTestTable()
     self.CompareDataFromDict(tab, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
     
     # write to disc
-    tab.Save("saveloadtable_filename_out.csv")
-    out_stream = open("saveloadtable_stream_out.csv", 'w')
+    tab.Save("saveloadtable_filename_out.tab")
+    out_stream = open("saveloadtable_stream_out.tab", 'w')
     tab.Save(out_stream)
     out_stream.close()
     
     # read from disc
-    in_stream = open("saveloadtable_stream_out.csv", 'r')
+    in_stream = open("saveloadtable_stream_out.tab", 'r')
     tab_loaded_stream = Table.Load(in_stream)
     in_stream.close()
-    tab_loaded_fname = Table.Load('saveloadtable_filename_out.csv')
+    tab_loaded_fname = Table.Load('saveloadtable_filename_out.tab')
     
     # check content
     self.CompareDataFromDict(tab_loaded_stream, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
@@ -727,10 +735,47 @@ class TestTable(unittest.TestCase):
     
     # check Errors for empty/non existing files
     self.assertRaises(IOError, Table.Load, 'nonexisting.file')
-    self.assertRaises(IOError, Table.Load, os.path.join('testfiles','emptytable.csv'))
+    self.assertRaises(IOError, Table.Load, os.path.join('testfiles','emptytable.tab'))
     in_stream = open(os.path.join('testfiles','emptytable.csv'), 'r')
     self.assertRaises(IOError, Table.Load, in_stream)
+  def testSaveLoadTableCSV(self):
+    tab = self.CreateTestTable()
+    self.CompareDataFromDict(tab, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
+
+    # write to disc (csv streams are opened in binary mode, as _SaveCSV does)
+    tab.Save("saveloadtable_filename_out.csv", format='csv')
+    out_stream = open("saveloadtable_stream_out.csv", 'wb')
+    tab.Save(out_stream, format='csv')
+    out_stream.close()
     
+    # read from disc
+    in_stream = open("saveloadtable_stream_out.csv", 'rb')
+    tab_loaded_stream = Table.Load(in_stream, format='csv')
+    in_stream.close()
+    tab_loaded_fname = Table.Load('saveloadtable_filename_out.csv', format='csv')
+
+    # check content
+    self.CompareDataFromDict(tab_loaded_stream, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
+    self.CompareDataFromDict(tab_loaded_fname, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
+  def testSaveLoadTablePickle(self):
+    '''pickle round trip: save by filename and by stream, then load both'''
+    tab = self.CreateTestTable()
+    self.CompareDataFromDict(tab, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
+    # save once by filename, once through a binary stream
+    tab.Save("saveloadtable_filename_out.pickle", format='pickle')
+    pickle_out = open("saveloadtable_stream_out.pickle", 'wb')
+    tab.Save(pickle_out, format='pickle')
+    pickle_out.close()
+
+    # load both files back
+    from_fname = Table.Load('saveloadtable_filename_out.pickle', format='pickle')
+    pickle_in = open("saveloadtable_stream_out.pickle", 'rb')
+    from_stream = Table.Load(pickle_in, format='pickle')
+    pickle_in.close()
+    # both copies must hold the original data
+    self.CompareDataFromDict(from_stream, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
+    self.CompareDataFromDict(from_fname, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]})
+
   def testMergeTable(self):
     '''
     Merge the following two tables:
-- 
GitLab