Something went wrong on our end
-
Gabriel Studer authored
e.g. direct comparison of a value from a column of type bool with a value from a column with type string raises an error. NaN and None handling has also been improved.
Gabriel Studer authorede.g. direct comparison of a value from a column of type bool with a value from a column with type string raises an error. NaN and None handling has also been improved.
table_selector.py 16.06 KiB
class TableSelector:
def __init__(self, col_types, col_names, query):
self.col_types=col_types
self.col_names=col_names
self.query=query
self.valid_operators=dict()
self.valid_operators['and']={'type':'boolean','precedence':5}
self.valid_operators['or']={'type':'boolean','precedence':6}
self.valid_operators['!']={'type':'comparison','precedence':4,'extension':'!='}
self.valid_operators['!=']={'type':'comparison','precedence':4,'extension':None}
self.valid_operators['=']={'type':'comparison','precedence':4,'extension':'=='}
self.valid_operators['==']={'type':'comparison','precedence':4,'extension':None}
self.valid_operators['<=']={'type':'comparison','precedence':3,'extension':None}
self.valid_operators['>=']={'type':'comparison','precedence':3,'extension':None}
self.valid_operators['>']={'type':'comparison','precedence':3,'extension':'>='}
self.valid_operators['<']={'type':'comparison','precedence':3,'extension':'<='}
self.valid_operators['+']={'type':'arithmetic','precedence':2}
self.valid_operators['-']={'type':'arithmetic','precedence':2}
self.valid_operators['/']={'type':'arithmetic','precedence':1}
self.valid_operators['*']={'type':'arithmetic','precedence':1}
self.valid_operators['(']={'type':'left_bracket','precedence':float('NaN')}
self.valid_operators['[']={'type':'left_bracket','precedence':float('NaN')}
self.valid_operators['{']={'type':'left_bracket','precedence':float('NaN')}
self.valid_operators[')']={'type':'right_bracket','precedence':float('NaN')}
self.valid_operators[']']={'type':'right_bracket','precedence':float('NaN')}
self.valid_operators['}']={'type':'right_bracket','precedence':float('NaN')}
self.split_expression=self._ExpressionLexer(self.query)
self.parsed_expression=self._ParseExpression(self.split_expression)
self.rpn_expression=self._ShuntingYard(self.parsed_expression)
self.tab_indices=list()
self.exp_indices=list()
#extract column indices
for i, exp in enumerate(self.rpn_expression):
if exp in self.col_names:
self.tab_indices.append(self._GetIndex(exp))
self.exp_indices.append(i)
def EvaluateRow(self,row):
for ti, ei in zip(self.tab_indices, self.exp_indices):
#check for NaN
if row[ti]!=row[ti]:
self.rpn_expression[ei]=None
else:
self.rpn_expression[ei] = row[ti]
if self._EvaluateRPN(list(self.rpn_expression)):
return True
return False
def _GetIndex(self, col):
if col not in self.col_names:
raise ValueError('Table Selector has no column named "%s"' % col)
return self.col_names.index(col)
def _EvaluateAnd(self, lhs, rhs):
return lhs==True and rhs==True
def _EvaluateOr(self, lhs, rhs):
return lhs==True or rhs==True
def _EvaluateEqual(self, lhs, rhs):
return lhs==rhs
def _EvaluateNonEqual(self, lhs, rhs):
return lhs!=rhs
def _EvaluateLower(self, lhs, rhs):
if lhs==None or rhs==None:
return False
return lhs<rhs
def _EvaluateGreater(self, lhs, rhs):
if lhs==None or rhs==None:
return False
return lhs>rhs
def _EvaluateLowerEqual(self, lhs, rhs):
if lhs==None or rhs==None:
return False
return lhs<=rhs
def _EvaluateGreaterEqual(self, lhs, rhs):
if lhs==None or rhs==None:
return False
return lhs>=rhs
def _EvaluateAdd(self, lhs, rhs):
if lhs==None or rhs==None:
return None
return lhs+rhs
def _EvaluateSubtract(self, lhs, rhs):
if lhs==None or rhs==None:
return None
return lhs-rhs
def _EvaluateMultiply(self, lhs, rhs):
if lhs==None or rhs==None:
return None
return lhs*rhs
def _EvaluateDivide(self, lhs, rhs):
if lhs==None or rhs==None:
return None
return lhs/rhs
def _EvaluateOperator(self, op, lhs, rhs):
#this function assumes, that all NaN values have been replaced by None!
if op=='+':
return self._EvaluateAdd(lhs, rhs)
elif op=='-':
return self._EvaluateSubtract(lhs, rhs)
elif op=='/':
return self._EvaluateDivide(lhs, rhs)
elif op=='*':
return self._EvaluateMultiply(lhs, rhs)
elif op=='and':
return self._EvaluateAnd(lhs, rhs)
elif op=='or':
return self._EvaluateOr(lhs, rhs)
elif op=='=' or op=='==':
return self._EvaluateEqual(lhs, rhs)
elif op=='!=' or op=='!':
return self._EvaluateNonEqual(lhs, rhs)
elif op=='<':
return self._EvaluateLower(lhs, rhs)
elif op=='>':
return self._EvaluateGreater(lhs, rhs)
elif op=='<=':
return self._EvaluateLowerEqual(lhs, rhs)
elif op=='>=':
return self._EvaluateGreaterEqual(lhs, rhs)
else:
raise ValueError('Unknown operator: '+op)
def _EvaluateRPN(self, RPNExp):
#Evaluates the reverse polish notation
stack=list()
while True:
if len(RPNExp)==0:
break
exp=RPNExp.pop(0)
if exp in self.valid_operators:
if len(stack)<2:
raise ValueError('Cannot evaluate operator on less than two operands!')
rhs=stack.pop()
lhs=stack.pop()
result=self._EvaluateOperator(exp, lhs, rhs)
if result==None:
return False
stack.append(result)
else:
stack.append(exp)
if len(stack)>1:
raise ValueError('Too many operands for given operators!')
return stack.pop()
def _ShuntingYard(self, split_expression):
#Creates the so called reverse polish notation out of the expression parser output.
#note, that there won't be parenthesis anymore and potential parenthesis
#mismatches get recognized.
#The shunting yard algorithm from dijkstra gets used.
output_stack=list()
operator_stack=list()
while True:
if len(split_expression)==0:
while True:
if len(operator_stack)==0:
break
if self.valid_operators[operator_stack[-1]]['type'] in ['left_bracket','right_bracket']:
raise ValueError('Parenthesis mismatch!')
output_stack.append(operator_stack.pop())
break
exp=split_expression.pop(0)
if exp in self.valid_operators:
if self.valid_operators[exp]['type']=='left_bracket':
operator_stack.append(exp)
continue
if exp in self.valid_operators:
if self.valid_operators[exp]['type'] == 'right_bracket':
while True:
if len(operator_stack)==0:
raise ValueError('Parenthesis mismatch!')
if self.valid_operators[operator_stack[-1]]['type']=='left_bracket':
operator_stack.pop()
break
output_stack.append(operator_stack.pop())
continue
if exp in self.valid_operators:
prec=self.valid_operators[exp]['precedence']
while len(operator_stack)>0:
if self.valid_operators[operator_stack[-1]]['type']=='left_bracket':
break
elif prec>=self.valid_operators[operator_stack[-1]]['precedence']:
output_stack.append(operator_stack.pop())
else:
break
operator_stack.append(exp)
continue
output_stack.append(exp)
return output_stack
def _ParseSubExpression(self, subexpression):
valid_types={'float':'numeric','int':'numeric','string':'string','bool':'bool'}
column_names=list()
column_types=list()
final_expression=list()
for item in subexpression:
if item in self.col_names:
column_names.append(item)
column_types.append(valid_types[self.col_types[self._GetIndex(item)]])
unique_type=list(set(column_types))
if len(unique_type)>1:
raise RuntimeError('Try to compare columns '+','.join(column_names)+' which have inconsistent types!')
if len(unique_type)==0:
raise RuntimeError('Try to evaluate subexpression '+' '.join(subexpression)+' that contains no valid column name of current table!')
for item in subexpression:
if item in self.valid_operators:
final_expression.append(item)
continue
if item in column_names:
final_expression.append(item)
continue
if unique_type[0]=='numeric':
if item in ['NaN','nan','None','none']:
final_expression.append(None)
continue
else:
try:
final_expression.append(float(item))
continue
except:
raise RuntimeError('Tried to cast '+item+' into numeric type to compare with column(s) '+','.join(column_names)+', but failed!')
elif unique_type[0]=='bool':
if item in ['None','none']:
final_expression.append(None)
continue
if item in ['true','True']:
final_expression.append(True)
continue
if item in ['false','False']:
final_expression.append(False)
continue
raise RuntimeError('Tried to cast '+item+' into boolean type to compare with column(s) '+','.join(column_names)+', but failed!')
elif unique_type[0]=='string':
final_expression.append(item)
return final_expression
def _ParseExpression(self, split_expression):
#check for problematic cases like 'a<=b<=c'. We don't know which operator to evaluate first
for i in range(len(split_expression)-3):
if (split_expression[i] in self.valid_operators) and (split_expression[i+2] in self.valid_operators):
if self.valid_operators[split_expression[i]]['precedence']==self.valid_operators[split_expression[i+2]]['precedence']:
raise ValueError('Cannot Evaluate '+' '.join(split_expression[i:i+3])+' since both operators have same precedence!')
#handle , operator
#replaces an expression like 'rnum=1,2,3' with '(rnum=1 or rnum=2 or rnum=3)'
temp_split_expression=list()
skips=0
for i in range(len(split_expression)):
if skips>0:
skips-=1
continue
if ',' in split_expression[i]:
if split_expression[max(0,i-1)] != '=' and split_expression[min(i+1,len(split_expression)-1)] != '=':
raise ValueError('Can evaluate \',\' operator only in combination with \"=\" in subexpression ',' '.join(split_expression[max(0,i-1):min(i+1,len(split_expression))]))
single_operands=split_expression[i].split(',')
if split_expression[max(0,i-1)]=='=':
if i-2<0:
raise ValueError('Cannot evaluate subexpression '+' '.join(split_expression[max(0,i-1):min(i+1,len(split_expression))])+' starting with an \'=\'')
main_operand=split_expression[i-2]
temp_split_expression.pop()
temp_split_expression.pop()
skips=0
else:
if i+2>len(split_expression)-1:
raise ValueError('Cannot evaluate subexpression '+' '.join(split_expression[max(0,i-1):min(i+1,len(split_expression))])+' ending with an \'=\'')
main_operand=split_expression[i+2]
skips=2
temp_expression=list(['('])
temp_expression+=' or '.join(['%s = %s'% (a,b) for (a,b) in zip(len(single_operands)*[main_operand],single_operands)]).split()
temp_expression.append(')')
temp_split_expression+=temp_expression
continue
temp_split_expression.append(split_expression[i])
split_expression=temp_split_expression
#handle ':' operator
#replaces an expression like 'col_a=x:y' with '(col_a>=x and col_a<=y)'
temp_split_expression=list()
skips=0
for i in range(len(split_expression)):
if skips>0:
skips-=1
continue
if ':' in split_expression[i]:
if split_expression[max(0,i-1)] != '=' and split_expression[min(i+1,len(split_expression)-1)] != '=':
raise ValueError('Can evaluate subexpression '+' '.join(split_expression[max(0,i-1):min(i+1,len(split_expression))])+' \':\' sign is only allowed in combination with \'=\'')
if len(split_expression[i].split(':')) != 2:
raise ValueError('Can operate \':\' operator only on 2 operands in subexpression '+' '.join(split_expression[max(0,i-1):min(i+1,len(split_expression))]))
lhs=split_expression[i].split(':')[0]
rhs=split_expression[i].split(':')[1]
template_expression=['(','','<=','','and','','<=','',')']
if split_expression[max(0,i-1)] == '=':
if i-2<0:
raise ValueError('Cannot evaluate subexpression '+' '.join(split_expression[max(0,i-1):min(i+1,len(split_expression))])+' starting with an \'=\'')
temp_split_expression.pop()
temp_split_expression.pop()
template_expression[3]=split_expression[i-2]
template_expression[5]=split_expression[i-2]
skips=0
else:
if i+2>len(split_expression)-1:
raise ValueError('Cannot evaluate subexpression '+' '.join(split_expression[max(0,i-1):min(i+1,len(split_expression))])+' ending with an \'=\'')
template_expression[3]=split_expression[i+2]
template_expression[5]=split_expression[i+2]
skips=2
template_expression[1]=lhs
template_expression[7]=rhs
temp_split_expression+=template_expression
continue
temp_split_expression.append(split_expression[i])
split_expression=temp_split_expression
#the whole thing is now split to pieces, we need to cast the types of the operands
#into the types of the columns, the operands are compared against.
final_expression=list()
subexpression=list()
for item in split_expression:
if item in self.valid_operators:
if self.valid_operators[item]['type'] in ['boolean','left_bracket','right_bracket']:
if len(subexpression)>0:
#figure out the column type(s) and parse the operands
final_expression+=self._ParseSubExpression(subexpression)
subexpression=list()
final_expression.append(item)
continue
subexpression.append(item)
if len(subexpression)>0:
final_expression+=self._ParseSubExpression(subexpression)
return final_expression
def _ExpressionLexer(self, expression):
#Reads token after token and searches for brackets and valid_operators
#everything, that doesn't match the above is assumed to be an operand
split_expression=list()
actual_position=0
eaten_stuff=''
while True:
if actual_position>=len(expression):
if len(eaten_stuff)>0:
split_expression.append(eaten_stuff)
return split_expression
token=expression[actual_position]
if token.isspace():
if len(eaten_stuff)>0:
split_expression.append(eaten_stuff)
eaten_stuff=''
actual_position+=1
continue
#note, that there is no check for boolean operators. They need to be clearly separated by spaces
#or brackets anyway, so they get added with the eaten stuff
if token in self.valid_operators:
if self.valid_operators[token]['type']=='left_bracket' or self.valid_operators[token]['type']=='right_bracket':
if len(eaten_stuff)>0:
split_expression.append(eaten_stuff)
eaten_stuff=''
split_expression.append(token)
actual_position+=1
continue
if self.valid_operators[token]['type']=='arithmetic':
if len(eaten_stuff)>0:
split_expression.append(eaten_stuff)
eaten_stuff=''
split_expression.append(token)
actual_position+=1
continue
if self.valid_operators[token]['type']=='comparison':
if len(eaten_stuff)>0:
split_expression.append(eaten_stuff)
eaten_stuff=''
if self.valid_operators[token]['extension']!=None:
if actual_position+len(self.valid_operators[token]['extension'])<len(expression):
if expression[actual_position:actual_position+len(self.valid_operators[token]['extension'])]==self.valid_operators[token]['extension']:
split_expression.append(self.valid_operators[token]['extension'])
actual_position+=len(self.valid_operators[token]['extension'])
continue
split_expression.append(token)
actual_position+=1
continue
eaten_stuff+=token
actual_position+=1