diff --git a/.gitignore b/.gitignore index fd4689e6cb74f35784d549a66b504c717d958196..99daac5ca8151642fcea430d5a36fa42a1f987e1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ pov_*test.inc *-out.pdb *-out.sdf *-out.crd +*-out.pqr *-out.png CMakeLists.txt.user OpenStructure.cbp @@ -54,4 +55,6 @@ Debug /deployment/win/create_archive.bat /install_manifest.txt *_out.csv +*_out.tab +*_out.pickle /modules/io/tests/temp_img.tmp diff --git a/modules/base/pymod/table.py b/modules/base/pymod/table.py index 8258ad31685fa817326a89bf0243e9fef3b22b58..11a9dd8eaa2553606bc076fd263a8ed4e1944352 100644 --- a/modules/base/pymod/table.py +++ b/modules/base/pymod/table.py @@ -3,6 +3,7 @@ import re from ost import stutil import itertools import operator +import cPickle from ost import LogError, LogWarning, LogInfo, LogVerbose def MakeTitle(col_name): @@ -16,7 +17,11 @@ def IsStringLike(value): return True except: return False - + +def IsNullString(value): + value=value.strip().upper() + return value in ('', 'NULL', 'NONE', 'NA') + def IsScalar(value): if IsStringLike(value): return True @@ -28,6 +33,38 @@ def IsScalar(value): except: return True +def GuessColumnType(iterator): + empty=True + possibilities=set(['bool', 'int', 'float']) + for ele in iterator: + str_ele=str(ele).upper() + if IsNullString(str_ele): + continue + empty=False + if 'int' in possibilities: + try: + int(str_ele) + except ValueError: + possibilities.remove('int') + + if 'float' in possibilities: + try: + float(str_ele) + except ValueError: + possibilities.remove('float') + if 'bool' in possibilities: + if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']): + possibilities.remove('bool') + + if len(possibilities)==0: + return 'string' + if len(possibilities)==2: + return 'int' + if empty: + return 'string' + # return the last element available + return possibilities.pop() + class BinaryColExpr: def __init__(self, op, lhs, rhs): self.op=op @@ -85,7 +122,7 @@ class TableCol: return 
BinaryColExpr(operator.div, self, rhs) -class Table: +class Table(object): """ The table class provides convenient access to data in tabular form. An empty @@ -448,21 +485,7 @@ class Table: return filt_tab @staticmethod - def Load(stream_or_filename): - """ - Load table from stream or file with given name. The file must contain a - header line of the form - - col_name1[type1] <col_name2[type2]>... - - The types given in brackets must be one of the data types the :class:`Table` - class understands. Each following line in the file then must contains exactly - the same number of data items as listed in the header. The data items are - automatically converted to the column format. Lines starting with a '#' and - empty lines are ignored. - - :returns: A new :class:`Table` instance - """ + def _LoadOST(stream_or_filename): fieldname_pattern=re.compile(r'(?P<name>[A-Za-z0-9_]+)(\[(?P<type>\w+)\])?') if not hasattr(stream_or_filename, 'read'): stream=open(stream_or_filename, 'r') @@ -496,6 +519,89 @@ class Table: raise IOError("Cannot read table from empty stream") return tab + def _GuessColumnTypes(self): + for col_idx in range(len(self.col_names)): + self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]]) + for row in self.rows: + for idx in range(len(row)): + row[idx]=self._Coerce(row[idx], self.col_types[idx]) + @staticmethod + def _LoadCSV(stream_or_filename, sep): + if not hasattr(stream_or_filename, 'read'): + stream=open(stream_or_filename, 'r') + else: + stream=stream_or_filename + reader=csv.reader(stream, delimiter=sep) + first=True + for row in reader: + if first: + header=row + types='s'*len(row) + tab=Table(header, types) + first=False + else: + tab.AddRow(row) + if first: + raise IOError('trying to load table from empty CSV stream/file') + + tab._GuessColumnTypes() + return tab + + @staticmethod + def _LoadPickle(stream_or_filename): + if not hasattr(stream_or_filename, 'read'): + stream=open(stream_or_filename, 'rb') + else: + 
stream=stream_or_filename + return cPickle.load(stream) + + @staticmethod + def Load(stream_or_filename, format='ost', sep=','): + """ + Load table from stream or file with given name. + + The following file formats are understood: + + ost + + This is an ost-specific, but still human readable file format. The file + (stream) must start with a header line of the form + + col_name1[type1] <col_name2[type2]>... + + The types given in brackets must be one of the data types the + :class:`Table` class understands. Each following line in the file then must + contain exactly the same number of data items as listed in the header. The + data items are automatically converted to the column format. Lines starting + with a '#' and empty lines are ignored. + + pickle + + Deserializes the table from a pickled byte stream + + csv + + Reads the table from a comma separated values stream. Since there is no + explicit type information in the csv file, the column types are guessed, + using the following simple rules: + + * if all values are either NA/NULL/NONE the type is set to string + * if all non-null values are convertible to float/int the type is set to + float/int + * if all non-null values are true/false/yes/no, the type is set to bool + * for all other cases, the column type is set to string + + :returns: A new :class:`Table` instance + """ + format=format.lower() + if format=='ost': + return Table._LoadOST(stream_or_filename) + if format=='csv': + return Table._LoadCSV(stream_or_filename, sep=sep) + if format=='pickle': + return Table._LoadPickle(stream_or_filename) + raise ValueError('unknown format "%s"' % format) + def Sort(self, by, order='+'): """ Performs an in-place sort of the table, based on column. 
@@ -938,10 +1044,38 @@ class Table: raise - def Save(self, stream): + def Save(self, stream, format='ost', sep=','): """ Save the table to stream or filename """ + format=format.lower() + if format=='ost': + return self._SaveOST(stream) + if format=='csv': + return self._SaveCSV(stream, sep=sep) + if format=='pickle': + return self._SavePickle(stream) + raise ValueError('unknown format "%s"' % format) + + def _SavePickle(self, stream): + if not hasattr(stream, 'write'): + stream=open(stream, 'wb') + cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL) + + def _SaveCSV(self, stream, sep): + if not hasattr(stream, 'write'): + stream=open(stream, 'wb') + + writer=csv.writer(stream, delimiter=sep) + writer.writerow(['%s' % n for n in self.col_names]) + for row in self.rows: + row=list(row) + for i, c in enumerate(row): + if c is None: + row[i]='NA' + writer.writerow(row) + + def _SaveOST(self, stream): if hasattr(stream, 'write'): writer=csv.writer(stream, delimiter=' ') else: diff --git a/modules/base/tests/test_table.py b/modules/base/tests/test_table.py index e2a3060c62ae341f9f822a6d76403bd29e0cf8e5..42276a6577b3f70b839baa64368129835f421831 100644 --- a/modules/base/tests/test_table.py +++ b/modules/base/tests/test_table.py @@ -190,6 +190,14 @@ class TestTable(unittest.TestCase): self.CompareRowCount(tab, 0) self.assertRaises(ValueError, tab.GetColIndex, 'a') + def testGuessColumnType(self): + self.assertEqual(GuessColumnType(['1', '1.3', '2']), 'float') + self.assertEqual(GuessColumnType(['1', '1', '2']), 'int') + self.assertEqual(GuessColumnType(['NONE', '1', '1', '2']), 'int') + self.assertEqual(GuessColumnType(['NA', '1', '1.5', '2']), 'float') + self.assertEqual(GuessColumnType(['NONE', '1', '1', 'a']), 'string') + self.assertEqual(GuessColumnType(['NONE', 'TRUE', 'False']), 'bool') + self.assertEqual(GuessColumnType(['NONE']), 'string') def testTableInitSingleColEmpty(self): ''' empty table with one float column: @@ -705,21 +713,21 @@ class 
TestTable(unittest.TestCase): tab.Sort('third', '+') self.CompareDataFromDict(tab, {'first': [None,'foo','x'], 'second': [9,None,3], 'third': [3.3,2.2,None]}) - def testSaveLoadTable(self): + def testSaveLoadTableOST(self): tab = self.CreateTestTable() self.CompareDataFromDict(tab, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) # write to disc - tab.Save("saveloadtable_filename_out.csv") - out_stream = open("saveloadtable_stream_out.csv", 'w') + tab.Save("saveloadtable_filename_out.tab") + out_stream = open("saveloadtable_stream_out.tab", 'w') tab.Save(out_stream) out_stream.close() # read from disc - in_stream = open("saveloadtable_stream_out.csv", 'r') + in_stream = open("saveloadtable_stream_out.tab", 'r') tab_loaded_stream = Table.Load(in_stream) in_stream.close() - tab_loaded_fname = Table.Load('saveloadtable_filename_out.csv') + tab_loaded_fname = Table.Load('saveloadtable_filename_out.tab') # check content self.CompareDataFromDict(tab_loaded_stream, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) @@ -727,10 +735,47 @@ class TestTable(unittest.TestCase): # check Errors for empty/non existing files self.assertRaises(IOError, Table.Load, 'nonexisting.file') - self.assertRaises(IOError, Table.Load, os.path.join('testfiles','emptytable.csv')) + self.assertRaises(IOError, Table.Load, os.path.join('testfiles','emptytable.tab')) in_stream = open(os.path.join('testfiles','emptytable.csv'), 'r') self.assertRaises(IOError, Table.Load, in_stream) + def testSaveLoadTableCSV(self): + tab = self.CreateTestTable() + self.CompareDataFromDict(tab, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) + + # write to disc + tab.Save("saveloadtable_filename_out.csv", format='csv') + out_stream = open("saveloadtable_stream_out.csv", 'w') + tab.Save(out_stream, format='csv') + out_stream.close() + # read from disc + in_stream = open("saveloadtable_stream_out.csv", 'r') + tab_loaded_stream = 
Table.Load(in_stream, format='csv') + in_stream.close() + tab_loaded_fname = Table.Load('saveloadtable_filename_out.csv', format='csv') + + # check content + self.CompareDataFromDict(tab_loaded_stream, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) + self.CompareDataFromDict(tab_loaded_fname, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) + def testSaveLoadTablePickle(self): + tab = self.CreateTestTable() + self.CompareDataFromDict(tab, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) + # write to disc + tab.Save("saveloadtable_filename_out.pickle", format='pickle') + out_stream = open("saveloadtable_stream_out.pickle", 'wb') + tab.Save(out_stream, format='pickle') + out_stream.close() + + # read from disc + in_stream = open("saveloadtable_stream_out.pickle", 'rb') + tab_loaded_stream = Table.Load(in_stream, format='pickle') + in_stream.close() + tab_loaded_fname = Table.Load('saveloadtable_filename_out.pickle', format='pickle') + + # check content + self.CompareDataFromDict(tab_loaded_stream, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) + self.CompareDataFromDict(tab_loaded_fname, {'first': ['x','foo',None], 'second': [3,None,9], 'third': [None,2.2,3.3]}) + def testMergeTable(self): ''' Merge the following two tables: