From 3017ccdc826e40f9ab14e3e3029665bcd80c0d01 Mon Sep 17 00:00:00 2001
From: Tobias Schmidt <tobias.schmidt@unibas.ch>
Date: Tue, 24 Apr 2012 17:20:53 +0200
Subject: [PATCH] Table class: add function to compute Matthews correlation
 coefficient

---
 modules/base/pymod/table.py      | 68 ++++++++++++++++++++++++++++++++
 modules/base/tests/test_table.py | 28 +++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/modules/base/pymod/table.py b/modules/base/pymod/table.py
index 6cc878fd9..6541cf721 100644
--- a/modules/base/pymod/table.py
+++ b/modules/base/pymod/table.py
@@ -1613,6 +1613,74 @@ class Table(object):
     except ImportError:
       LogError("Function needs matplotlib, but I could not import it.")
       raise
+    
+  def ComputeMCC(self, score_col, class_col, score_dir='-',
+                 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
+    '''
+    Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
+    with the points classified into true positives, false positives, true
+    negatives and false negatives according to a specified classification
+    column (*class_col*).
+
+    The datapoints in *score_col* and *class_col* are classified into
+    positive and negative points. This can be done in two ways:
+
+     - by using 'bool' columns which contain True for positives and False
+       for negatives
+     - by using 'float' or 'int' columns and specifying a cutoff value and the
+       column's direction. This will generate the classification on the fly
+
+       * if *class_dir*/*score_dir*=='-': values less than or equal to
+         *class_cutoff*/*score_cutoff* are counted as positives
+       * if *class_dir*/*score_dir*=='+': values larger than or equal to
+         *class_cutoff*/*score_cutoff* are counted as positives
+
+    The two possibilities can be used together, i.e. 'bool' type for one column
+    and 'float'/'int' type and cutoff/direction for the other column.
+
+    Rows with None in either column are skipped; NaN never passes a cutoff and
+    counts as negative. Returns 0.0 if the MCC is undefined (a zero marginal
+    sum, e.g. empty table or only one class present). Raises TypeError for
+    columns that are not int/float/bool, ValueError for an invalid direction.
+    '''
+    ALLOWED_DIR = ['+','-']
+    ALLOWED_TYPES = ('int', 'float', 'bool')
+    score_idx = self.GetColIndex(score_col)
+    score_type = self.col_types[score_idx]
+    if score_type not in ALLOWED_TYPES:
+      raise TypeError("Score column must be numeric or bool type")
+    class_idx = self.GetColIndex(class_col)
+    class_type = self.col_types[class_idx]
+    if class_type not in ALLOWED_TYPES:
+      raise TypeError("Classifier column must be numeric or bool type")
+    if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
+      raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
+
+    def _IsPositive(val, col_type, direction, cutoff):
+      if col_type=='bool':  # bool columns are already classified
+        return val==True
+      return val<=cutoff if direction=='-' else val>=cutoff
+
+    tp = fp = fn = tn = 0
+    for row in self.rows:
+      class_val, score_val = row[class_idx], row[score_idx]
+      if class_val is None or score_val is None:
+        continue  # a missing value cannot be classified
+      class_pos = _IsPositive(class_val, class_type, class_dir, class_cutoff)
+      score_pos = _IsPositive(score_val, score_type, score_dir, score_cutoff)
+      if class_pos and score_pos:
+        tp += 1
+      elif class_pos:
+        fn += 1
+      elif score_pos:
+        fp += 1
+      else:
+        tn += 1
+    denom = (tp+fn)*(tp+fp)*(tn+fn)*(tn+fp)
+    if denom==0:
+      return 0.0  # MCC undefined; 0.0 (no correlation) is the usual convention
+    return ((tp*tn)-(fp*fn)) / math.sqrt(denom)
+    
 
   def IsEmpty(self, col_name=None, ignore_nan=True):
     '''
diff --git a/modules/base/tests/test_table.py b/modules/base/tests/test_table.py
index 7091106f5..265a66186 100644
--- a/modules/base/tests/test_table.py
+++ b/modules/base/tests/test_table.py
@@ -1142,6 +1142,34 @@ class TestTable(unittest.TestCase):
     auc = tab.ComputeROCAUC(score_col='score', score_dir='+', class_col='classific')
     self.assertAlmostEquals(auc, auc_ref)
 
+  def testCalcMCC(self):
+    # rmsd<=2.0 / score<=1.0 reproduce the bool columns class_rmsd / class_score
+    tab = Table(['score', 'rmsd', 'class_rmsd', 'class_score'], 'ffbb',
+                score=      [2.64, 1.11, 2.17, 0.45,0.15,0.85, 1.13, 2.90, 0.50, 1.03, 1.46, 2.83, 1.15, 2.04, 0.67, 1.27, 2.22, 1.90, 0.68, 0.36,1.04, 2.46, 0.91,0.60],
+                rmsd=[9.58,1.61,7.48,0.29,1.68,3.52,3.34,8.17,4.31,2.85,6.28,8.78,0.41,6.29,4.89,7.30,4.26,3.51,3.38,0.04,2.21,0.24,7.58,8.40],
+                class_rmsd= [False,True, False,True,True,False,False,False,False,False,False,False,True, False,False,False,False,False,False,True,False,True,False,False],
+                class_score=[False,False,False,True,True,True, False,False,True, False,False,False,False,False,True, False,False,False,True, True,False,False,True,True])
+
+    mcc_ref = 0.1490711984
+    # pre-classified bool columns need neither cutoff nor direction
+    mcc = tab.ComputeMCC(score_col='class_score', class_col='class_rmsd')
+    self.assertAlmostEquals(mcc, mcc_ref)
+    # flipping both directions keeps the MCC, flipping only one negates it
+    dir_signs = (('-','-',1), ('+','+',1), ('-','+',-1), ('+','-',-1))
+    for score_dir, class_dir, sign in dir_signs:
+      mcc = tab.ComputeMCC(score_col='score', score_dir=score_dir, class_col='rmsd', class_dir=class_dir, score_cutoff=1.0, class_cutoff=2.0)
+      self.assertAlmostEquals(mcc, sign*mcc_ref)
+
+  def testCalcMCCPreclassified(self):
+    tab = Table(['reference', 'prediction1', 'prediction2'],'bbb',
+                reference=  [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True,  False, False, True,  False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, True,  False, False, True,  False, False, False, False, False, False, False, False, False, False, True, False, False, True, False, False, False, False, False, False, False, False],
+                prediction1=[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True,  False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, True, False, True,  False, False, False, False, False, False],
+                prediction2=[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True,  False, False, True,  False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, True,  False, True,  True,  False, False, False, False, False, False, False, False, False, False, True, False, False, True, False, False, False, True,  False, False, False, False])
+    mcc = tab.ComputeMCC(score_col='prediction1', class_col='reference')
+    self.assertAlmostEquals(mcc, 0.538389277)
+    mcc = tab.ComputeMCC(score_col='prediction2', class_col='reference')
+    self.assertAlmostEquals(mcc, 0.882089673321)
+
   def testTableAsNumpyMatrix(self):
 
     '''
-- 
GitLab