Package rdkit :: Package ML :: Package Data :: Module Quantize
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Data.Quantize

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2001-2008  Greg Landrum and Rational Discovery LLC 
  4  #   All Rights Reserved 
  5  # 
  6  """ Automatic search for quantization bounds 
  7   
  8  This uses the expected informational gain to determine where quantization bounds should 
  9  lie. 
 10   
 11  **Notes**: 
 12   
 13    - bounds are less than, so if the bounds are [1.,2.], 
 14      [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2] 
 15   
 16  """ 
 17  from __future__ import print_function 
 18  import numpy 
 19  from rdkit.ML.InfoTheory import entropy 
 20  from rdkit.six.moves import zip, map, range 
 21  try: 
 22    from rdkit.ML.Data import cQuantize 
 23  except ImportError: 
 24    hascQuantize = 0 
 25  else: 
 26    hascQuantize = 1 
 27   
 28  _float_tol = 1e-8 
 29   
 30   
31 -def feq(v1, v2, tol=_float_tol):
32 """ floating point equality with a tolerance factor 33 34 **Arguments** 35 36 - v1: a float 37 38 - v2: a float 39 40 - tol: the tolerance for comparison 41 42 **Returns** 43 44 0 or 1 45 46 """ 47 return abs(v1 - v2) < tol
48 49
50 -def FindVarQuantBound(vals, results, nPossibleRes):
51 """ Uses FindVarMultQuantBounds, only here for historic reasons 52 """ 53 bounds, gain = FindVarMultQuantBounds(vals, 1, results, nPossibleRes) 54 return (bounds[0], gain)
55 56
57 -def _GenVarTable(vals, cuts, starts, results, nPossibleRes):
58 """ Primarily intended for internal use 59 60 constructs a variable table for the data passed in 61 The table for a given variable records the number of times each possible value 62 of that variable appears for each possible result of the function. 63 64 **Arguments** 65 66 - vals: a 1D Numeric array with the values of the variables 67 68 - cuts: a list with the indices of the quantization bounds 69 (indices are into _starts_ ) 70 71 - starts: a list of potential starting points for quantization bounds 72 73 - results: a 1D Numeric array of integer result codes 74 75 - nPossibleRes: an integer with the number of possible result codes 76 77 **Returns** 78 79 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes 80 81 **Notes** 82 83 - _vals_ should be sorted! 84 85 """ 86 nVals = len(cuts) + 1 87 varTable = numpy.zeros((nVals, nPossibleRes), 'i') 88 idx = 0 89 for i in range(nVals - 1): 90 cut = cuts[i] 91 while idx < starts[cut]: 92 varTable[i, results[idx]] += 1 93 idx += 1 94 while idx < len(vals): 95 varTable[-1, results[idx]] += 1 96 idx += 1 97 return varTable
98 99
100 -def _PyRecurseOnBounds(vals, cuts, which, starts, results, nPossibleRes, varTable=None):
101 """ Primarily intended for internal use 102 103 Recursively finds the best quantization boundaries 104 105 **Arguments** 106 107 - vals: a 1D Numeric array with the values of the variables, 108 this should be sorted 109 110 - cuts: a list with the indices of the quantization bounds 111 (indices are into _starts_ ) 112 113 - which: an integer indicating which bound is being adjusted here 114 (and index into _cuts_ ) 115 116 - starts: a list of potential starting points for quantization bounds 117 118 - results: a 1D Numeric array of integer result codes 119 120 - nPossibleRes: an integer with the number of possible result codes 121 122 **Returns** 123 124 - a 2-tuple containing: 125 126 1) the best information gain found so far 127 128 2) a list of the quantization bound indices ( _cuts_ for the best case) 129 130 **Notes** 131 132 - this is not even remotely efficient, which is why a C replacement 133 was written 134 135 """ 136 nBounds = len(cuts) 137 maxGain = -1e6 138 bestCuts = None 139 highestCutHere = len(starts) - nBounds + which 140 if varTable is None: 141 varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes) 142 while cuts[which] <= highestCutHere: 143 varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes) 144 gainHere = entropy.InfoGain(varTable) 145 if gainHere > maxGain: 146 maxGain = gainHere 147 bestCuts = cuts[:] 148 # recurse on the next vars if needed 149 if which < nBounds - 1: 150 gainHere, cutsHere = _RecurseOnBounds(vals, cuts[:], which + 1, starts, results, nPossibleRes, 151 varTable=varTable) 152 if gainHere > maxGain: 153 maxGain = gainHere 154 bestCuts = cutsHere 155 # update this cut 156 cuts[which] += 1 157 for i in range(which + 1, nBounds): 158 if cuts[i] == cuts[i - 1]: 159 cuts[i] += 1 160 161 return maxGain, bestCuts
162 163
164 -def _NewPyRecurseOnBounds(vals, cuts, which, starts, results, nPossibleRes, varTable=None):
165 """ Primarily intended for internal use 166 167 Recursively finds the best quantization boundaries 168 169 **Arguments** 170 171 - vals: a 1D Numeric array with the values of the variables, 172 this should be sorted 173 174 - cuts: a list with the indices of the quantization bounds 175 (indices are into _starts_ ) 176 177 - which: an integer indicating which bound is being adjusted here 178 (and index into _cuts_ ) 179 180 - starts: a list of potential starting points for quantization bounds 181 182 - results: a 1D Numeric array of integer result codes 183 184 - nPossibleRes: an integer with the number of possible result codes 185 186 **Returns** 187 188 - a 2-tuple containing: 189 190 1) the best information gain found so far 191 192 2) a list of the quantization bound indices ( _cuts_ for the best case) 193 194 **Notes** 195 196 - this is not even remotely efficient, which is why a C replacement 197 was written 198 199 """ 200 nBounds = len(cuts) 201 maxGain = -1e6 202 bestCuts = None 203 highestCutHere = len(starts) - nBounds + which 204 if varTable is None: 205 varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes) 206 while cuts[which] <= highestCutHere: 207 gainHere = entropy.InfoGain(varTable) 208 if gainHere > maxGain: 209 maxGain = gainHere 210 bestCuts = cuts[:] 211 # recurse on the next vars if needed 212 if which < nBounds - 1: 213 gainHere, cutsHere = _RecurseOnBounds(vals, cuts[:], which + 1, starts, results, nPossibleRes, 214 varTable=None) 215 if gainHere > maxGain: 216 maxGain = gainHere 217 bestCuts = cutsHere 218 # update this cut 219 oldCut = cuts[which] 220 cuts[which] += 1 221 bot = starts[oldCut] 222 if oldCut + 1 < len(starts): 223 top = starts[oldCut + 1] 224 else: 225 top = starts[-1] 226 for i in range(bot, top): 227 v = results[i] 228 varTable[which, v] += 1 229 varTable[which + 1, v] -= 1 230 for i in range(which + 1, nBounds): 231 if cuts[i] == cuts[i - 1]: 232 cuts[i] += 1 233 234 return maxGain, bestCuts
235 236
237 -def _NewPyFindStartPoints(sortVals, sortResults, nData):
238 # -------------------------------- 239 # 240 # find all possible dividing points 241 # 242 # There are a couple requirements for a dividing point: 243 # 1) the dependent variable (descriptor) must change across it, 244 # 2) the result score must change across it 245 # 246 # So, in the list [(0,0),(1,0),(1,1),(2,1)]: 247 # we should divide before (1,0) and (2,1) 248 # 249 # -------------------------------- 250 startNext = [] 251 tol = 1e-8 252 blockAct = sortResults[0] 253 lastBlockAct = None 254 lastDiv = None 255 i = 1 256 while i < nData: 257 # move to the end of this block: 258 while i < nData and sortVals[i] - sortVals[i - 1] <= tol: 259 if sortResults[i] != blockAct: 260 # this block is heterogeneous 261 blockAct = -1 262 i += 1 263 if lastBlockAct is None: 264 # first time through: 265 lastBlockAct = blockAct 266 lastDiv = i 267 else: 268 if blockAct == -1 or lastBlockAct == -1 or blockAct != lastBlockAct: 269 startNext.append(lastDiv) 270 lastDiv = i 271 lastBlockAct = blockAct 272 else: 273 lastDiv = i 274 if i < nData: 275 blockAct = sortResults[i] 276 i += 1 277 # catch the case that the last point also sets a bin: 278 if blockAct != lastBlockAct: 279 startNext.append(lastDiv) 280 return startNext
281 282
283 -def FindVarMultQuantBounds(vals, nBounds, results, nPossibleRes):
284 """ finds multiple quantization bounds for a single variable 285 286 **Arguments** 287 288 - vals: sequence of variable values (assumed to be floats) 289 290 - nBounds: the number of quantization bounds to find 291 292 - results: a list of result codes (should be integers) 293 294 - nPossibleRes: an integer with the number of possible values of the 295 result variable 296 297 **Returns** 298 299 - a 2-tuple containing: 300 301 1) a list of the quantization bounds (floats) 302 303 2) the information gain associated with this quantization 304 305 306 """ 307 assert len(vals) == len(results), 'vals/results length mismatch' 308 309 nData = len(vals) 310 if nData == 0: 311 return [], -1e8 312 313 # sort the variable values: 314 svs = list(zip(vals, results)) 315 svs.sort() 316 sortVals, sortResults = zip(*svs) 317 startNext = _FindStartPoints(sortVals, sortResults, nData) 318 if not len(startNext): 319 return [0], 0.0 320 if len(startNext) < nBounds: 321 nBounds = len(startNext) - 1 322 if nBounds == 0: 323 nBounds = 1 324 initCuts = list(range(nBounds)) 325 maxGain, bestCuts = _RecurseOnBounds(sortVals, initCuts, 0, startNext, sortResults, nPossibleRes) 326 quantBounds = [] 327 nVs = len(sortVals) 328 for cut in bestCuts: 329 idx = startNext[cut] 330 if idx == nVs: 331 quantBounds.append(sortVals[-1]) 332 elif idx == 0: 333 quantBounds.append(sortVals[idx]) 334 else: 335 quantBounds.append((sortVals[idx] + sortVals[idx - 1]) / 2.) 336 337 return quantBounds, maxGain
338 339 # hascQuantize=0 340 if hascQuantize: 341 _RecurseOnBounds = cQuantize._RecurseOnBounds 342 _FindStartPoints = cQuantize._FindStartPoints 343 else: 344 _RecurseOnBounds = _NewPyRecurseOnBounds 345 _FindStartPoints = _NewPyFindStartPoints 346 347 if __name__ == '__main__': 348 if 1: 349 d = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 0), (1.6, 1), (2., 1), (2.1, 0), (2.1, 0), 350 (2.1, 0), (2.2, 1), (2.3, 0)] 351 varValues = list(map(lambda x: x[0], d)) 352 resCodes = list(map(lambda x: x[1], d)) 353 nPossibleRes = 2 354 res = FindVarMultQuantBounds(varValues, 2, resCodes, nPossibleRes) 355 print('RES:', res) 356 target = ([1.3, 2.05], .34707) 357 else: 358 d = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 0), (1.6, 1), (2., 1), (2.1, 0), (2.1, 0), 359 (2.1, 0), (2.2, 1), (2.3, 0)] 360 varValues = list(map(lambda x: x[0], d)) 361 resCodes = list(map(lambda x: x[1], d)) 362 nPossibleRes = 2 363 res = FindVarMultQuantBounds(varValues, 1, resCodes, nPossibleRes) 364 print(res) 365 # sys.exit(1) 366 d = [(1.4, 1), (1.4, 0)] 367 368 varValues = list(map(lambda x: x[0], d)) 369 resCodes = list(map(lambda x: x[1], d)) 370 nPossibleRes = 2 371 res = FindVarMultQuantBounds(varValues, 1, resCodes, nPossibleRes) 372 print(res) 373 374 d = [(1.4, 0), (1.4, 0), (1.6, 1)] 375 varValues = list(map(lambda x: x[0], d)) 376 resCodes = list(map(lambda x: x[1], d)) 377 nPossibleRes = 2 378 res = FindVarMultQuantBounds(varValues, 2, resCodes, nPossibleRes) 379 print(res) 380