Package rdkit :: Package ML :: Package Data :: Module Stats
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.Data.Stats

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2001-2008  Greg Landrum and Rational Discovery LLC 
  4  #  All Rights Reserved 
  5  # 
  6  """ various statistical operations on data 
  7   
  8  """ 
  9  import numpy 
 10  import math 
 11   
 12   
def StandardizeMatrix(mat):
    """ standardizes the columns of a matrix

    This is the standard *subtract off the average and divide by the deviation*
    standardization function.

    **Arguments**

      - mat: a numpy array; it is centered in place, so it should hold
        floating point values

    **Returns**

      a new numpy array of doubles with the standardized values. Columns
      whose deviation is zero are returned as zeros.

    **Notes**

      - in addition to being returned, _mat_ is modified in place, so **beware**
        (after the call _mat_ holds the centered, but unscaled, values)

    """
    nObjs = len(mat)
    avgs = numpy.sum(mat, 0) / float(nObjs)
    mat -= avgs
    if nObjs > 1:
        # numpy.sqrt works element-wise; math.sqrt only accepts scalars and
        # so fails as soon as the matrix has more than one column
        devs = numpy.sqrt(numpy.sum(mat * mat, 0) / float(nObjs - 1))
    else:
        # the sample deviation is undefined for a single object
        devs = numpy.zeros(numpy.shape(avgs), 'd')

    # numpy signals division by zero with a warning rather than an
    # exception, so the zero-deviation columns are masked out explicitly
    # and keep the zeros they were initialized with
    newMat = numpy.zeros(mat.shape, 'd')
    numpy.divide(mat, devs, out=newMat, where=(devs != 0.0))
    return newMat
40 41
def FormCovarianceMatrix(mat):
    """ builds and returns the covariance matrix of a set of points

    The rows of _mat_ are treated as the points. The average row is
    subtracted from every row of _mat_ (for a 2D numpy array this changes
    _mat_ itself), then the product of the transposed matrix with the
    matrix is divided by the number of rows minus one.

    """
    nRows = mat.shape[0]
    centroid = sum(mat)
    centroid /= float(nRows)
    for pt in mat:
        pt -= centroid
    scatter = numpy.dot(numpy.transpose(mat), mat)
    return scatter / (nRows - 1)
52 53
def FormCorrelationMatrix(mat):
    """ form and return the correlation matrix

    **Arguments**

      - mat: a 2D numpy array (or nested sequence); each row is an
        observation and each column a variable

    **Returns**

      an nVars x nVars numpy array of doubles where element (i, j) is

        [N*Sxy - Sx*Sy] / sqrt([N*Sxx - Sx^2] * [N*Syy - Sy^2])

      for columns i and j. Elements involving a column without any
      spread are set to 0.

    """
    data = numpy.asarray(mat, dtype='d')
    nVars = len(data[0])
    N = len(data)

    # the per-column sums are needed for every pair, so form them once
    colSums = data.sum(axis=0)
    # N*Sxx - Sx^2 for every column
    spread = N * (data * data).sum(axis=0) - colSums ** 2
    # N*Sxy - Sx*Sy for every pair of columns
    cross = (N * numpy.dot(numpy.transpose(data), data) -
             numpy.outer(colSums, colSums))

    res = numpy.zeros((nVars, nVars), 'd')
    # a spread that is zero (or slightly negative because of rounding)
    # marks a constant column; written as a negated "<=" so that NaNs in
    # the data still propagate into the result instead of becoming 0
    usable = ~(spread <= 0.0)
    pairOk = usable[:, None] & usable[None, :]
    denom = numpy.sqrt(numpy.outer(spread, spread)[pairOk])
    res[pairOk] = cross[pairOk] / denom
    return res
79 80
def PrincipalComponents(mat, reverseOrder=1):
    """ do a principal components analysis

    The analysis is carried out on the matrix returned by
    FormCorrelationMatrix(mat).

    **Arguments**

      - mat: the data matrix, passed straight to FormCorrelationMatrix

      - reverseOrder: if true (the default) the results are sorted by
        decreasing eigenvalue, otherwise by increasing eigenvalue

    **Returns**

      a 2-tuple:

        1) a numpy array with the sorted eigenvalues

        2) a numpy array whose row i is the eigenvector that belongs to
           eigenvalue i

    """
    corrMat = FormCorrelationMatrix(mat)

    eigenVals, eigenVects = numpy.linalg.eig(corrMat)
    # Take the 'real' component, if it exists as its own attribute
    eigenVals = getattr(eigenVals, "real", eigenVals)
    eigenVects = getattr(eigenVects, "real", eigenVects)

    # and now sort:
    ptOrder = numpy.argsort(eigenVals)
    if reverseOrder:
        ptOrder = ptOrder[::-1]
    eigenVals = eigenVals[ptOrder]
    # numpy.linalg.eig stores the eigenvector for eigenVals[i] in *column* i,
    # so transpose before reordering to hand back one eigenvector per row
    eigenVects = numpy.transpose(eigenVects)[ptOrder]
    return eigenVals, eigenVects
99 100
def TransformPoints(tFormMat, pts):
    """ transforms a set of points using tFormMat

    The points are first translated so that their average sits at the
    origin, then each one is multiplied by _tFormMat_.

    **Arguments**

      - tFormMat: a numpy array

      - pts: a list of numpy arrays (or a 2D array)

    **Returns**

      a list of numpy arrays (an empty list if no points are provided)

    """
    pts = numpy.array(pts)
    nPts = len(pts)
    if nPts == 0:
        # nothing to average over; avoids dividing by zero below
        return []
    avgP = numpy.sum(pts, axis=0) / float(nPts)
    centered = pts - avgP
    return [numpy.dot(tFormMat, pt) for pt in centered]
124 125
def MeanAndDev(vect, sampleSD=1):
    """ returns the mean and standard deviation of a vector

    **Arguments**

      - vect: the values, converted to a numpy array of doubles

      - sampleSD: if true the sum of squared deviations is divided by
        n - 1, otherwise by n

    **Returns**

      a 2-tuple (mean, deviation); (0., 0.) for an empty vector, and a
      deviation of 0 for a single value

    """
    values = numpy.array(vect, 'd')
    count = values.shape[0]
    if count <= 0:
        return 0., 0.

    mean = sum(values) / count
    if count == 1:
        # a single value has no spread
        return mean, 0

    resid = values - mean
    divisor = (count - 1) if sampleSD else count
    return mean, numpy.sqrt(sum(resid * resid) / divisor)
143 144
def R2(orig, residSum):
    """ returns the R2 value for a set of predictions

    **Arguments**

      - orig: the original (observed) values

      - residSum: the sum of the squared residuals of the predictions

    **Returns**

      1 - residSum / (sum of squared deviations of _orig_ from its mean),
      or 0. if _orig_ is empty

    **Notes**

      - there is no guard against _orig_ having zero variance

    """

    # FIX: this just is not right
    #
    #  A correct formulation of this (from Excel) for 2 variables is:
    #   r2 = [n*(Sxy) - (Sx)(Sy)]^2 / ([n*(Sx2) - (Sx)^2]*[n*(Sy2) - (Sy)^2])
    #
    #

    vect = numpy.array(orig)
    n = vect.shape[0]
    if n <= 0:
        # return a scalar, like every other path through the function
        return 0.
    oMean = sum(vect) / n
    v = vect - oMean
    oVar = sum(v * v)
    return 1. - residSum / oVar
# Lookup data used by GetConfidenceInterval.
#
# tConfs maps a confidence level (the keys look like two-sided levels in
# percent, e.g. 95 <-> a one-tail value of 0.025) to the index of the
# matching column in each tTable row.
#
# Each tTable row starts with the value that GetConfidenceInterval compares
# against the degrees of freedom (n - 1); the remaining entries are the
# tabulated values for the tail probabilities listed in the header below.
# Note that the rows are not evenly spaced: above 30 only selected values
# are tabulated and the table stops at 200.

#    One Tail   0.10    0.05    0.025   0.01    0.005   0.001  0.0005
tConfs = {80: 1, 90: 2, 95: 3, 98: 4, 99: 5, 99.8: 6, 99.9: 7}
tTable = [
    [1, 3.078, 6.314, 12.71, 31.82, 63.66, 318.30, 637],
    [2, 1.886, 2.920, 4.303, 6.965, 9.925, 22.330, 31.6],
    [3, 1.638, 2.353, 3.182, 4.541, 5.841, 10.210, 12.92],
    [4, 1.533, 2.132, 2.776, 3.747, 4.604, 7.173, 8.610],
    [5, 1.476, 2.015, 2.571, 3.365, 4.032, 5.893, 6.869],
    [6, 1.440, 1.943, 2.447, 3.143, 3.707, 5.208, 5.959],
    [7, 1.415, 1.895, 2.365, 2.998, 3.499, 4.785, 5.408],
    [8, 1.397, 1.860, 2.306, 2.896, 3.355, 4.501, 5.041],
    [9, 1.383, 1.833, 2.262, 2.821, 3.250, 4.297, 4.781],
    [10, 1.372, 1.812, 2.228, 2.764, 3.169, 4.144, 4.587],
    [11, 1.363, 1.796, 2.201, 2.718, 3.106, 4.025, 4.437],
    [12, 1.356, 1.782, 2.179, 2.681, 3.055, 3.930, 4.318],
    [13, 1.350, 1.771, 2.160, 2.650, 3.012, 3.852, 4.221],
    [14, 1.345, 1.761, 2.145, 2.624, 2.977, 3.787, 4.140],
    [15, 1.341, 1.753, 2.131, 2.602, 2.947, 3.733, 4.073],
    [16, 1.337, 1.746, 2.120, 2.583, 2.921, 3.686, 4.015],
    [17, 1.333, 1.740, 2.110, 2.567, 2.898, 3.646, 3.965],
    [18, 1.330, 1.734, 2.101, 2.552, 2.878, 3.610, 3.922],
    [19, 1.328, 1.729, 2.093, 2.539, 2.861, 3.579, 3.883],
    [20, 1.325, 1.725, 2.086, 2.528, 2.845, 3.552, 3.850],
    [21, 1.323, 1.721, 2.080, 2.518, 2.831, 3.527, 3.819],
    [22, 1.321, 1.717, 2.074, 2.508, 2.819, 3.505, 3.792],
    [23, 1.319, 1.714, 2.069, 2.500, 2.807, 3.485, 3.768],
    [24, 1.318, 1.711, 2.064, 2.492, 2.797, 3.467, 3.745],
    [25, 1.316, 1.708, 2.060, 2.485, 2.787, 3.450, 3.725],
    [26, 1.315, 1.706, 2.056, 2.479, 2.779, 3.435, 3.707],
    [27, 1.314, 1.703, 2.052, 2.473, 2.771, 3.421, 3.690],
    [28, 1.313, 1.701, 2.048, 2.467, 2.763, 3.408, 3.674],
    [29, 1.311, 1.699, 2.045, 2.462, 2.756, 3.396, 3.659],
    [30, 1.310, 1.697, 2.042, 2.457, 2.750, 3.385, 3.646],
    [32, 1.309, 1.694, 2.037, 2.449, 2.738, 3.365, 3.622],
    [34, 1.307, 1.691, 2.032, 2.441, 2.728, 3.348, 3.601],
    [36, 1.306, 1.688, 2.028, 2.434, 2.719, 3.333, 3.582],
    [38, 1.304, 1.686, 2.024, 2.429, 2.712, 3.319, 3.566],
    [40, 1.303, 1.684, 2.021, 2.423, 2.704, 3.307, 3.551],
    [42, 1.302, 1.682, 2.018, 2.418, 2.698, 3.296, 3.538],
    [44, 1.301, 1.680, 2.015, 2.414, 2.692, 3.286, 3.526],
    [46, 1.300, 1.679, 2.013, 2.410, 2.687, 3.277, 3.515],
    [48, 1.299, 1.677, 2.011, 2.407, 2.682, 3.269, 3.505],
    [50, 1.299, 1.676, 2.009, 2.403, 2.678, 3.261, 3.496],
    [55, 1.297, 1.673, 2.004, 2.396, 2.668, 3.245, 3.476],
    [60, 1.296, 1.671, 2.000, 2.390, 2.660, 3.232, 3.460],
    [65, 1.295, 1.669, 1.997, 2.385, 2.654, 3.220, 3.447],
    [70, 1.294, 1.667, 1.994, 2.381, 2.648, 3.211, 3.435],
    [80, 1.292, 1.664, 1.990, 2.374, 2.639, 3.195, 3.416],
    [100, 1.290, 1.660, 1.984, 2.364, 2.626, 3.174, 3.390],
    [150, 1.287, 1.655, 1.976, 2.351, 2.609, 3.145, 3.357],
    [200, 1.286, 1.653, 1.972, 2.345, 2.601, 3.131, 3.340]
]
def GetConfidenceInterval(sd, n, level=95):
    """ returns a tabulated t value multiplied by the standard error

    **Arguments**

      - sd: the standard deviation

      - n: the number of points; the degrees of freedom are n - 1

      - level: the confidence level, which must be one of the keys of the
        module-level _tConfs_ dictionary

    **Returns**

      t * sd / sqrt(n), where t is taken from the first row of the
      module-level _tTable_ whose leading entry is not smaller than the
      degrees of freedom (the last row is used if there is no such row)

    """
    column = tConfs[level]
    degreesOfFreedom = n - 1
    stdErr = sd / numpy.sqrt(n)

    # fall back to the final row when the table runs out
    tRow = tTable[-1]
    for candidate in tTable:
        if not candidate[0] < degreesOfFreedom:
            tRow = candidate
            break
    return tRow[column] * stdErr
230