def ComputeMCC(self, score_col, class_col, score_dir='-',
               class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
  '''
  Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
  with the points classified into true positives, false positives, true
  negatives and false negatives according to a specified classification
  column (*class_col*).

  The datapoints in *score_col* and *class_col* are classified into
  positive and negative points. This can be done in two ways:

   - by using 'bool' columns which contain True for positives and False
     for negatives

   - by using 'float' or 'int' columns and specifying a cutoff value and the
     column's direction. This will generate the classification on the fly

     * if *class_dir*/*score_dir*=='-': values in the respective column
                                        that are less than or equal to
                                        *class_cutoff*/*score_cutoff* will be
                                        counted as positives
     * if *class_dir*/*score_dir*=='+': values in the respective column
                                        that are larger than or equal to
                                        *class_cutoff*/*score_cutoff* will be
                                        counted as positives

  The two possibilities can be used together, i.e. 'bool' type for one column
  and 'float'/'int' type and cutoff/direction for the other column.

  Rows in which the value of either column is None are ignored.

  :param score_col: name of the column holding the prediction
  :param class_col: name of the column holding the reference classification
  :param score_dir: '+' or '-', only used if *score_col* is numeric
  :param class_dir: '+' or '-', only used if *class_col* is numeric
  :param score_cutoff: cutoff for a numeric *score_col*
  :param class_cutoff: cutoff for a numeric *class_col*

  :returns: the MCC, or None if it is undefined because one of the sums
            (tp+fn), (tp+fp), (tn+fn), (tn+fp) is zero (this includes the
            case where no row could be evaluated)

  :raises: :class:`TypeError` if one of the columns is neither numeric nor
           bool, :class:`ValueError` if a direction is not '+' or '-'
  '''
  ALLOWED_DIR = ['+', '-']
  ALLOWED_TYPES = ('int', 'float', 'bool')

  def _IsPositive(value, col_type, direction, cutoff):
    # Single source of truth for the classification of one value. Negatives
    # are defined as "not positive", so a value can never end up positive in
    # one branch and negative in another.
    if col_type == 'bool':
      # bool columns carry the classification directly; direction and cutoff
      # are irrelevant for them
      return bool(value)
    if direction == '-':
      return value <= cutoff
    return value >= cutoff

  score_idx = self.GetColIndex(score_col)
  score_type = self.col_types[score_idx]
  if score_type not in ALLOWED_TYPES:
    raise TypeError("Score column must be numeric or bool type")

  class_idx = self.GetColIndex(class_col)
  class_type = self.col_types[class_idx]
  if class_type not in ALLOWED_TYPES:
    raise TypeError("Classifier column must be numeric or bool type")

  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
    raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

  tp = 0
  fp = 0
  fn = 0
  tn = 0

  for row in self.rows:
    class_val = row[class_idx]
    score_val = row[score_idx]
    # a missing value on either side cannot be classified: skip the row
    if class_val is None or score_val is None:
      continue
    actual = _IsPositive(class_val, class_type, class_dir, class_cutoff)
    predicted = _IsPositive(score_val, score_type, score_dir, score_cutoff)
    if actual:
      if predicted:
        tp += 1
      else:
        fn += 1
    elif predicted:
      fp += 1
    else:
      tn += 1

  # The MCC is undefined as soon as one of the four marginal sums vanishes;
  # report that as None instead of failing with a ZeroDivisionError.
  denominator = (tp+fn)*(tp+fp)*(tn+fn)*(tn+fp)
  if denominator == 0:
    return None
  return ((tp*tn)-(fp*fn)) / math.sqrt(denominator)
def testCalcMCC(self):
  '''
  ComputeMCC on numeric columns with cutoffs and on bool columns.

  The bool columns class_score/class_rmsd hold the classification of
  score/rmsd at cutoffs 1.0/2.0 with direction '-', so both ways of calling
  must give the same coefficient. Flipping both directions swaps positives
  and negatives on both sides and keeps the MCC; flipping only one of them
  changes its sign.
  '''
  tab = Table(['score', 'rmsd', 'class_rmsd', 'class_score'], 'ffbb',
              score=[2.64, 1.11, 2.17, 0.45, 0.15, 0.85, 1.13, 2.90,
                     0.50, 1.03, 1.46, 2.83, 1.15, 2.04, 0.67, 1.27,
                     2.22, 1.90, 0.68, 0.36, 1.04, 2.46, 0.91, 0.60],
              rmsd=[9.58, 1.61, 7.48, 0.29, 1.68, 3.52, 3.34, 8.17,
                    4.31, 2.85, 6.28, 8.78, 0.41, 6.29, 4.89, 7.30,
                    4.26, 3.51, 3.38, 0.04, 2.21, 0.24, 7.58, 8.40],
              class_rmsd=[False, True, False, True, True, False, False, False,
                          False, False, False, False, True, False, False, False,
                          False, False, False, True, False, True, False, False],
              class_score=[False, False, False, True, True, True, False, False,
                           True, False, False, False, False, False, True, False,
                           False, False, True, True, False, False, True, True])

  # assertAlmostEqual instead of the deprecated alias assertAlmostEquals,
  # which is no longer available in recent Python versions
  mcc = tab.ComputeMCC(score_col='score', score_dir='-', class_col='rmsd',
                       class_dir='-', score_cutoff=1.0, class_cutoff=2.0)
  self.assertAlmostEqual(mcc, 0.1490711984)
  mcc = tab.ComputeMCC(score_col='class_score', class_col='class_rmsd')
  self.assertAlmostEqual(mcc, 0.1490711984)
  mcc = tab.ComputeMCC(score_col='score', score_dir='+', class_col='rmsd',
                       class_dir='+', score_cutoff=1.0, class_cutoff=2.0)
  self.assertAlmostEqual(mcc, 0.1490711984)
  mcc = tab.ComputeMCC(score_col='score', score_dir='-', class_col='rmsd',
                       class_dir='+', score_cutoff=1.0, class_cutoff=2.0)
  self.assertAlmostEqual(mcc, -0.1490711984)
  mcc = tab.ComputeMCC(score_col='score', score_dir='+', class_col='rmsd',
                       class_dir='-', score_cutoff=1.0, class_cutoff=2.0)
  self.assertAlmostEqual(mcc, -0.1490711984)
def testCalcMCCPreclassified(self):
  '''
  ComputeMCC on three bool columns: one reference and two predictions.

  Each column has 81 entries, written nine per row below.
  '''
  tab = Table(['reference', 'prediction1', 'prediction2'], 'bbb',
              reference=[
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, True, False, False, True,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, True, False, True, False,
                False, True, False, False, True, False, False, False, False,
                False, False, False, False, False, False, True, False, False,
                True, False, False, False, False, False, False, False, False],
              prediction1=[
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, True, False, True, False, True, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, True, False, False,
                True, False, True, False, False, False, False, False, False],
              prediction2=[
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, True, False, False, True,
                False, False, False, False, False, False, False, False, False,
                False, False, False, False, False, True, False, True, False,
                False, True, False, True, True, False, False, False, False,
                False, False, False, False, False, False, True, False, False,
                True, False, False, False, True, False, False, False, False])

  # assertAlmostEqual instead of the deprecated alias assertAlmostEquals,
  # which is no longer available in recent Python versions
  mcc = tab.ComputeMCC(score_col='prediction1', class_col='reference')
  self.assertAlmostEqual(mcc, 0.538389277)
  mcc = tab.ComputeMCC(score_col='prediction2', class_col='reference')
  self.assertAlmostEqual(mcc, 0.882089673321)